Skip to content

Commit e0fbb86

Browse files
authored
Merge pull request #519 from sgl-project/patch-tool-use
[Bug fix]: only rank0 load dataset
2 parents 1f4e12a + c3fb09f commit e0fbb86

1 file changed

Lines changed: 13 additions & 9 deletions

File tree

scripts/prepare_hidden_states.py

Lines changed: 13 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -613,15 +613,19 @@ def main():
613 613
assert os.path.exists(
614 614
args.data_path
615 615
), f"Dataset path {args.data_path} does not exist"
616-
dataset = Dataset.from_generator(
617-
generator=safe_conversations_generator,
618-
gen_kwargs={"file_path": args.data_path},
619-
cache_dir=os.path.join(
620-
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
621-
"cache",
622-
"hf_dataset",
623-
),
624-
)
616+
617+
with rank_0_priority():
618+
print_with_rank("Loading/building dataset cache...")
619+
dataset = Dataset.from_generator(
620+
generator=safe_conversations_generator,
621+
gen_kwargs={"file_path": args.data_path},
622+
cache_dir=os.path.join(
623+
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
624+
"cache",
625+
"hf_dataset",
626+
),
627+
num_proc=min(args.build_dataset_num_proc, 32),
628+
)
625 629
if args.num_samples is not None:
626 630
dataset = dataset.select(range(args.num_samples))
627 631
# Tokenizer and cache key

0 commit comments

Comments
 (0)