
Commit

Fixes
Signed-off-by: Hemil Desai <[email protected]>
hemildesai committed Oct 4, 2024
1 parent a223e42 commit 645e9e2
Showing 2 changed files with 27 additions and 10 deletions.
23 changes: 15 additions & 8 deletions examples/llm/slimpajama/data/preprocess.py
@@ -34,9 +34,10 @@ def execute_cmd(cmd_tuple: tuple):
 def preprocess_data(
     data_dir: str,
     output_dir: str,
-    dataset_impl: str = "mmap",
-    tokenizer_type: str = "GPT2BPETokenizer",
-    tokenizer_library: str = "megatron",
+    dataset_impl: str = "",
+    tokenizer_type: str = "",
+    tokenizer_library: str = "sentencepiece",
+    tokenizer_model: str = "",
     vocab_file_path: Optional[str] = None,
     merges_file_path: Optional[str] = None,
     num_tasks: Optional[int] = None,
@@ -68,19 +69,25 @@ def preprocess_data(
     flags = [
         f"--input={split}",
         f"--output-prefix={output_arg}",
-        f"--dataset-impl={dataset_impl}",
         f"--tokenizer-library={tokenizer_library}",
-        f"--tokenizer-type={tokenizer_type}",
+        f"--tokenizer-type={tokenizer_type}" if tokenizer_type else f"--tokenizer-model={tokenizer_model}",
         f"--workers={multiprocessing.cpu_count()}",
         "--log-interval=100000",
         "--apply-ftfy",
     ]

-    if vocab_file_path and merges_file_path:
+    if dataset_impl:
+        flags += [f"--dataset-impl={dataset_impl}"]
+
+    if vocab_file_path:
         flags += [
-            f"--vocab={vocab_file_path}",
-            f"--merge-file={merges_file_path}",
+            f"--vocab-file={vocab_file_path}",
             "--append-eod",
         ]

+    if merges_file_path:
+        flags += [f"--merges-file={merges_file_path}"]
+
     final_cmd = cmd + flags
     if extra_args:
         final_cmd += extra_args
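With this change, --tokenizer-type is only passed when tokenizer_type is set (otherwise --tokenizer-model is used), and --dataset-impl and --merges-file are emitted only when their arguments are non-empty. As a rough sketch of what flags resolves to for a SentencePiece run like the one configured in data_pipeline.py below (input/output paths and the worker count are illustrative, not taken from this commit):

    flags = [
        "--input=/data/slimpajama/train/chunk1.jsonl",
        "--output-prefix=/data/slimpajama_megatron/train_chunk1",
        "--tokenizer-library=sentencepiece",
        "--tokenizer-model=/data/tokenizer/tokenizer.model",
        "--workers=32",
        "--log-interval=100000",
        "--apply-ftfy",
        "--vocab-file=/data/tokenizer/tokenizer.vocab",
        "--append-eod",
    ]
    # No --dataset-impl or --merges-file entries, because dataset_impl and
    # merges_file_path are left empty in that call.

A GPT2 BPE run would instead set tokenizer_type (so --tokenizer-type is emitted) and pass both vocab_file_path and merges_file_path.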
14 changes: 12 additions & 2 deletions examples/llm/slimpajama/data_pipeline.py
@@ -19,6 +19,7 @@ def slurm_executor(
     custom_env_vars: Optional[dict[str, str]] = None,
     container_image: str = "nvcr.io/nvidia/nemo:dev",
     retries: int = 0,
+    ssh_key_file_path: Optional[str] = None,
 ) -> run.SlurmExecutor:
     if not (user and host and remote_job_dir and account and partition and nodes and tasks_per_node):
         raise RuntimeError(
@@ -29,6 +30,7 @@
     if custom_mounts:
         mounts.extend(custom_mounts)

+    # Required to run on CPU nodes
     env_vars = {"NVIDIA_VISIBLE_DEVICES": "void"}
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -40,6 +42,7 @@
             user=user,
             host=host,
             job_dir=remote_job_dir,
+            identity=ssh_key_file_path,
         ),
         nodes=nodes,
         ntasks_per_node=tasks_per_node,
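The new ssh_key_file_path argument is forwarded to the tunnel's identity field, so the executor can authenticate against the cluster with a specific SSH key. A minimal sketch of a call site (every value below is a placeholder, not taken from this repository):

    executor = slurm_executor(
        user="jdoe",
        host="login.my-cluster.example.com",
        remote_job_dir="/home/jdoe/nemo-run",
        account="my_account",
        partition="cpu",
        nodes=1,
        tasks_per_node=1,
        ssh_key_file_path="/home/jdoe/.ssh/id_ed25519",
    )

When ssh_key_file_path is left as None, identity is None and the connection presumably falls back to the default SSH key resolution.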
@@ -83,13 +86,20 @@ def run_data_pipeline():
         )

         # Use NeMo image for the remaining tasks
-        executor.container_image = "nvcr.io/nvidia/nemo:dev"
+        executor.container_image = "nvcr.io/nvidia/nemo:nightly"
         exp.add(run.Partial(run_extraction, data_dir="/data/slimpajama"), executor=executor)

         # examples/llm/slimpajama is automatically mounted to /nemo_run/code
         exp.add(run.Script("/nemo_run/code/data/concat.sh", args=["/data/slimpajama/train", "1"]), executor=executor)
         exp.add(
-            run.Partial(preprocess_data, data_dir="/data/slimpajama", output_dir="/data/slimpajama_megatron"),
+            run.Partial(
+                preprocess_data,
+                data_dir="/data/slimpajama",
+                output_dir="/data/slimpajama_megatron",
+                tokenizer_model="/data/tokenizer/tokenizer.model",
+                tokenizer_library="sentencepiece",
+                vocab_file_path="/data/tokenizer/tokenizer.vocab",
+            ),
             executor=executor,
         )

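Since the preprocess_data defaults changed (empty dataset_impl and tokenizer_type, SentencePiece tokenizer library), a pipeline that still wants the previous GPT2 BPE behavior now has to pass the old values explicitly. A sketch of such a call, with hypothetical vocab/merges paths:

    exp.add(
        run.Partial(
            preprocess_data,
            data_dir="/data/slimpajama",
            output_dir="/data/slimpajama_megatron",
            dataset_impl="mmap",
            tokenizer_type="GPT2BPETokenizer",
            tokenizer_library="megatron",
            vocab_file_path="/data/gpt2/vocab.json",
            merges_file_path="/data/gpt2/merges.txt",
        ),
        executor=executor,
    )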
