File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -613,15 +613,19 @@ def main():
613613 assert os .path .exists (
614614 args .data_path
615615 ), f"Dataset path { args .data_path } does not exist"
616- dataset = Dataset .from_generator (
617- generator = safe_conversations_generator ,
618- gen_kwargs = {"file_path" : args .data_path },
619- cache_dir = os .path .join (
620- os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))),
621- "cache" ,
622- "hf_dataset" ,
623- ),
624- )
616+
617+ with rank_0_priority ():
618+ print_with_rank ("Loading/building dataset cache..." )
619+ dataset = Dataset .from_generator (
620+ generator = safe_conversations_generator ,
621+ gen_kwargs = {"file_path" : args .data_path },
622+ cache_dir = os .path .join (
623+ os .path .dirname (os .path .dirname (os .path .abspath (__file__ ))),
624+ "cache" ,
625+ "hf_dataset" ,
626+ ),
627+ num_proc = min (args .build_dataset_num_proc , 32 ),
628+ )
625629 if args .num_samples is not None :
626630 dataset = dataset .select (range (args .num_samples ))
627631 # Tokenizer and cache key
You can’t perform that action at this time.
0 commit comments