From 645e9e2e0bb11a896d06e61a380f78c87ae7cf59 Mon Sep 17 00:00:00 2001
From: Hemil Desai
Date: Wed, 2 Oct 2024 12:13:38 -0700
Subject: [PATCH] Fixes

Signed-off-by: Hemil Desai
---
 examples/llm/slimpajama/data/preprocess.py | 23 ++++++++++++++--------
 examples/llm/slimpajama/data_pipeline.py   | 14 +++++++++++--
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/examples/llm/slimpajama/data/preprocess.py b/examples/llm/slimpajama/data/preprocess.py
index 7d299bce4cc7..a7278e987355 100644
--- a/examples/llm/slimpajama/data/preprocess.py
+++ b/examples/llm/slimpajama/data/preprocess.py
@@ -34,9 +34,10 @@ def execute_cmd(cmd_tuple: tuple):
 def preprocess_data(
     data_dir: str,
     output_dir: str,
-    dataset_impl: str = "mmap",
-    tokenizer_type: str = "GPT2BPETokenizer",
-    tokenizer_library: str = "megatron",
+    dataset_impl: str = "",
+    tokenizer_type: str = "",
+    tokenizer_library: str = "sentencepiece",
+    tokenizer_model: str = "",
     vocab_file_path: Optional[str] = None,
     merges_file_path: Optional[str] = None,
     num_tasks: Optional[int] = None,
@@ -68,19 +69,25 @@ def preprocess_data(
     flags = [
         f"--input={split}",
         f"--output-prefix={output_arg}",
-        f"--dataset-impl={dataset_impl}",
         f"--tokenizer-library={tokenizer_library}",
-        f"--tokenizer-type={tokenizer_type}",
+        f"--tokenizer-type={tokenizer_type}" if tokenizer_type else f"--tokenizer-model={tokenizer_model}",
         f"--workers={multiprocessing.cpu_count()}",
+        "--log-interval=100000",
+        "--apply-ftfy",
     ]
 
-    if vocab_file_path and merges_file_path:
+    if dataset_impl:
+        flags += [f"--dataset-impl={dataset_impl}"]
+
+    if vocab_file_path:
         flags += [
-            f"--vocab={vocab_file_path}",
-            f"--merge-file={merges_file_path}",
+            f"--vocab-file={vocab_file_path}",
             "--append-eod",
         ]
 
+    if merges_file_path:
+        flags += [f"--merges-file={merges_file_path}"]
+
     final_cmd = cmd + flags
     if extra_args:
         final_cmd += extra_args
diff --git a/examples/llm/slimpajama/data_pipeline.py b/examples/llm/slimpajama/data_pipeline.py
index 94e68427c749..6493e45406d8 100644
--- a/examples/llm/slimpajama/data_pipeline.py
+++ b/examples/llm/slimpajama/data_pipeline.py
@@ -19,6 +19,7 @@ def slurm_executor(
     custom_env_vars: Optional[dict[str, str]] = None,
     container_image: str = "nvcr.io/nvidia/nemo:dev",
     retries: int = 0,
+    ssh_key_file_path: Optional[str] = None,
 ) -> run.SlurmExecutor:
     if not (user and host and remote_job_dir and account and partition and nodes and tasks_per_node):
         raise RuntimeError(
@@ -29,6 +30,7 @@ def slurm_executor(
     if custom_mounts:
         mounts.extend(custom_mounts)
 
+    # Required to run on CPU nodes
     env_vars = {"NVIDIA_VISIBLE_DEVICES": "void"}
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -40,6 +42,7 @@ def slurm_executor(
             user=user,
             host=host,
             job_dir=remote_job_dir,
+            identity=ssh_key_file_path,
         ),
         nodes=nodes,
         ntasks_per_node=tasks_per_node,
@@ -83,13 +86,20 @@ def run_data_pipeline():
     )
 
     # Use NeMo image for the remaining tasks
-    executor.container_image = "nvcr.io/nvidia/nemo:dev"
+    executor.container_image = "nvcr.io/nvidia/nemo:nightly"
     exp.add(run.Partial(run_extraction, data_dir="/data/slimpajama"), executor=executor)
     # examples/llm/slimpajama is automatically mounted to /nemo_run/code
     exp.add(run.Script("/nemo_run/code/data/concat.sh", args=["/data/slimpajama/train", "1"]), executor=executor)
     exp.add(
-        run.Partial(preprocess_data, data_dir="/data/slimpajama", output_dir="/data/slimpajama_megatron"),
+        run.Partial(
+            preprocess_data,
+            data_dir="/data/slimpajama",
+            output_dir="/data/slimpajama_megatron",
+            tokenizer_model="/data/tokenizer/tokenizer.model",
+            tokenizer_library="sentencepiece",
+            vocab_file_path="/data/tokenizer/tokenizer.vocab",
+        ),
         executor=executor,
     )