Skip to content

Commit

Permalink
Fixes
Browse files: browse the repository at this point in the history
Signed-off-by: Hemil Desai <[email protected]>
  • Loading branch information
hemildesai committed Sep 30, 2024
1 parent fd81129 commit 46e3221
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions examples/llm/slimpajama/data/preprocess.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,7 +50,7 @@ def preprocess_data(
else:
num_tasks = 1
task_id = 0
- shards_to_extract = get_shard_list(data_dir, num_tasks, extension="*.jsonl")
+ shards_to_extract = get_shard_list(data_dir, num_tasks, extension="concatenated*.jsonl")
shard_files = shards_to_extract[task_id]
cmd = [
"python",
Expand All @@ -71,6 +71,7 @@ def preprocess_data(
f"--dataset-impl={dataset_impl}",
f"--tokenizer-library={tokenizer_library}",
f"--tokenizer-type={tokenizer_type}",
+ f"--workers={multiprocessing.cpu_count()}",
]

if vocab_file_path and merges_file_path:
Expand All @@ -85,5 +86,5 @@ def preprocess_data(
final_cmd += extra_args
final_cmds.append((final_cmd, task_id))

- with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
- pool.map(execute_cmd, final_cmds)
+ for cmd in final_cmds:
+ execute_cmd(cmd)

0 comments on commit 46e3221

Please sign in to comment.