From 645e9e2e0bb11a896d06e61a380f78c87ae7cf59 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Wed, 2 Oct 2024 12:13:38 -0700
Subject: [PATCH] Fixes

Signed-off-by: Hemil Desai
---
 examples/llm/slimpajama/data/preprocess.py | 23 ++++++++++++++--------
 examples/llm/slimpajama/data_pipeline.py   | 14 +++++++++++--
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/examples/llm/slimpajama/data/preprocess.py b/examples/llm/slimpajama/data/preprocess.py
index 7d299bce4cc7..a7278e987355 100644
--- a/examples/llm/slimpajama/data/preprocess.py
+++ b/examples/llm/slimpajama/data/preprocess.py
@@ -34,9 +34,10 @@ def execute_cmd(cmd_tuple: tuple):
 def preprocess_data(
     data_dir: str,
     output_dir: str,
-    dataset_impl: str = "mmap",
-    tokenizer_type: str = "GPT2BPETokenizer",
-    tokenizer_library: str = "megatron",
+    dataset_impl: str = "",
+    tokenizer_type: str = "",
+    tokenizer_library: str = "sentencepiece",
+    tokenizer_model: str = "",
     vocab_file_path: Optional[str] = None,
     merges_file_path: Optional[str] = None,
     num_tasks: Optional[int] = None,
@@ -68,19 +69,25 @@ def preprocess_data(
     flags = [
         f"--input={split}",
         f"--output-prefix={output_arg}",
-        f"--dataset-impl={dataset_impl}",
         f"--tokenizer-library={tokenizer_library}",
-        f"--tokenizer-type={tokenizer_type}",
+        f"--tokenizer-type={tokenizer_type}" if tokenizer_type else f"--tokenizer-model={tokenizer_model}",
         f"--workers={multiprocessing.cpu_count()}",
+        "--log-interval=100000",
+        "--apply-ftfy",
     ]
 
-    if vocab_file_path and merges_file_path:
+    if dataset_impl:
+        flags += [f"--dataset-impl={dataset_impl}"]
+
+    if vocab_file_path:
         flags += [
-            f"--vocab={vocab_file_path}",
-            f"--merge-file={merges_file_path}",
+            f"--vocab-file={vocab_file_path}",
             "--append-eod",
         ]
 
+    if merges_file_path:
+        flags += [f"--merges-file={merges_file_path}"]
+
     final_cmd = cmd + flags
     if extra_args:
         final_cmd += extra_args
diff --git a/examples/llm/slimpajama/data_pipeline.py b/examples/llm/slimpajama/data_pipeline.py
index 94e68427c749..6493e45406d8 100644
--- a/examples/llm/slimpajama/data_pipeline.py
+++ b/examples/llm/slimpajama/data_pipeline.py
@@ -19,6 +19,7 @@ def slurm_executor(
     custom_env_vars: Optional[dict[str, str]] = None,
     container_image: str = "nvcr.io/nvidia/nemo:dev",
     retries: int = 0,
+    ssh_key_file_path: Optional[str] = None,
 ) -> run.SlurmExecutor:
     if not (user and host and remote_job_dir and account and partition and nodes and tasks_per_node):
         raise RuntimeError(
@@ -29,6 +30,7 @@ def slurm_executor(
     if custom_mounts:
         mounts.extend(custom_mounts)
 
+    # Required to run on CPU nodes
     env_vars = {"NVIDIA_VISIBLE_DEVICES": "void"}
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -40,6 +42,7 @@ def slurm_executor(
             user=user,
             host=host,
             job_dir=remote_job_dir,
+            identity=ssh_key_file_path,
         ),
         nodes=nodes,
         ntasks_per_node=tasks_per_node,
@@ -83,13 +86,20 @@ def run_data_pipeline():
     )
 
     # Use NeMo image for the remaining tasks
-    executor.container_image = "nvcr.io/nvidia/nemo:dev"
+    executor.container_image = "nvcr.io/nvidia/nemo:nightly"
     exp.add(run.Partial(run_extraction, data_dir="/data/slimpajama"), executor=executor)
     # examples/llm/slimpajama is automatically mounted to /nemo_run/code
     exp.add(run.Script("/nemo_run/code/data/concat.sh", args=["/data/slimpajama/train", "1"]), executor=executor)
     exp.add(
-        run.Partial(preprocess_data, data_dir="/data/slimpajama", output_dir="/data/slimpajama_megatron"),
+        run.Partial(
+            preprocess_data,
+            data_dir="/data/slimpajama",
+            output_dir="/data/slimpajama_megatron",
+            tokenizer_model="/data/tokenizer/tokenizer.model",
+            tokenizer_library="sentencepiece",
+            vocab_file_path="/data/tokenizer/tokenizer.vocab",
+        ),
         executor=executor,
     )