
Commit

fix based on reviews
Signed-off-by: Vivian Chen <[email protected]>
xuanzic committed Aug 2, 2024
1 parent fc11129 commit 9415477
Showing 4 changed files with 40 additions and 15 deletions.
4 changes: 1 addition & 3 deletions docs/source/multimodal/mllm/sequence_packing.rst
@@ -103,15 +103,13 @@ To train with packed sequences, modify four items in the SFT/PEFT config file.

.. code-block:: bash
++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.crop_size=[224,224]
++model.data.packed_sequence=True
2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached:

.. code-block:: bash
++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.crop_size=[336,336]
4. Adjust batch sizes:
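The overrides above are Hydra-style dot-list overrides of the YAML config (the ++ prefix force-adds the key), and the model-side change later in this commit is what consumes them: when packed_sequence is enabled, the model reads data_path and crop_size, and asserts that the micro batch size is 1. A minimal sketch, assuming only the omegaconf package and placeholder paths, of what the overrides resolve to:

# Minimal sketch: how the "++model.data..." overrides above map onto nested config
# fields. Paths are placeholders; only omegaconf (which NeMo configs build on) is used.
from omegaconf import OmegaConf

overrides = OmegaConf.from_dotlist(
    [
        "model.data.packed_sequence=True",
        "model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset",
        "model.data.crop_size=[336,336]",
    ]
)
print(overrides.model.data.packed_sequence)  # True
print(overrides.model.data.crop_size)        # [336, 336]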
13 changes: 9 additions & 4 deletions examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml
@@ -186,8 +186,15 @@ model:
packed_sequence: False
num_workers: 8
dataloader_type: cyclic
data_path:
# could be a path to a single file or a list of files for data blending like below
data_path:
# This can be either a single string pointing to a data path or a list of data paths for data blending, as in the example entries below.
# When using a blendable dataset, be aware of the following:
# - The sampling of data across datasets depends on both the relative sizes of the datasets and the concat_sampling_probabilities.
# - For example, if there are two datasets with lengths of 100 and 10, and the sampling probabilities are set to 0.5 for each,
# then 55 samples would be taken from the dataset of length 100 and 55 from the dataset of length 10 (with repetition).
# - This means not all data might be seen in one epoch, and smaller datasets may need to be repeated to match the number of samples.
# Please adjust your concat_sampling_probabilities accordingly to ensure balanced and effective training.

# - /path/to/json
# - /path/to/json
global_batch_size: ${model.global_batch_size}
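To make the example in these comments concrete: the blend is sized by the combined length of all datasets, and each dataset contributes its sampling probability's share of that total, repeating examples when a dataset is smaller than its share. A small hypothetical sketch of that arithmetic (not the NeMo implementation):

# Hypothetical sketch of the sampling arithmetic described in the comments above;
# not NeMo code, just the bookkeeping behind the 100/10 example.
def blended_sample_counts(dataset_sizes, sampling_probabilities):
    assert len(dataset_sizes) == len(sampling_probabilities)
    assert abs(sum(sampling_probabilities) - 1.0) < 1e-6, "probabilities must sum to 1"
    total = sum(dataset_sizes)  # the blend spans the combined number of examples
    # Each dataset contributes roughly probability * total samples; a dataset smaller
    # than its share is sampled with repetition, a larger one is only partially seen.
    return [round(p * total) for p in sampling_probabilities]

print(blended_sample_counts([100, 10], [0.5, 0.5]))  # [55, 55], matching the comment above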
@@ -198,11 +205,9 @@
lazy_preprocess: True
is_multimodal: True
media_type: image # currently supported: image
num_frames: -1
sep_image_conv_front: False
conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py`
image_folder: null
video_folder: null
image_aspect_ratio: 'square'

# Nsys profiling options
@@ -1321,16 +1321,36 @@ def build_train_valid_test_datasets_blend(self):

def build_train_valid_test_datasets(self):
logging.info('Building Neva datasets.')
if isinstance(self.cfg.data.data_path, ListConfig) and self.cfg.data.get('concat_sampling_probabilities'):
return self.build_train_valid_test_datasets_blend()

if isinstance(self.cfg.data.data_path, (list, ListConfig)):
if len(self.cfg.data.data_path) > 1:
# Only consider data blending if there are multiple dataset paths
if self.cfg.data.get('concat_sampling_probabilities') is None:
logging.warning("No sampling probabilities provided. Defaulting to uniform sampling.")
self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len(self.cfg.data.data_path)
elif sum(self.cfg.data.concat_sampling_probabilities) != 1:
raise ValueError("Concat_sampling_probabilities must sum up to 1.")
return self.build_train_valid_test_datasets_blend()
elif len(self.cfg.data.data_path) == 1:
if self.cfg.data.concat_sampling_probabilities is not None:
logging.warning("Using sampling probabilities with a single dataset has no effect. Defaulting to None and not using blend dataset.")
self.cfg.data.concat_sampling_probabilities = None
self.cfg.data.data_path = self.cfg.data.data_path[0]
else:
raise ValueError("data_path must contain at least one valid path.")
elif isinstance(self.cfg.data.data_path, str):
pass
else:
raise TypeError("data_path must be a list of paths or a single string")

if self.cfg.data.get("packed_sequence", False):
assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence"

self._train_ds = NevaPackedSeqDatatset(
self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
)
self._validation_ds = NevaPackedSeqDatatset(
self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
)
else:
ds_dict = make_supervised_data_module(
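For a quick read of the branching above, here is a standalone sketch (hypothetical paths, not part of the commit) of what each accepted data_path shape resolves to:

# Standalone sketch mirroring the data_path handling above, so the accepted config
# shapes are easy to see at a glance; paths are placeholders.
def normalize_data_path(data_path, concat_sampling_probabilities=None):
    if isinstance(data_path, (list, tuple)):
        if len(data_path) > 1:
            # Multiple datasets -> blending; default to uniform sampling when unset.
            if concat_sampling_probabilities is None:
                concat_sampling_probabilities = [1 / len(data_path)] * len(data_path)
            return list(data_path), concat_sampling_probabilities
        if len(data_path) == 1:
            # A single-entry list collapses to that entry; probabilities are ignored.
            return data_path[0], None
        raise ValueError("data_path must contain at least one valid path.")
    if isinstance(data_path, str):
        return data_path, None
    raise TypeError("data_path must be a list of paths or a single string")

print(normalize_data_path(["/data/a.json", "/data/b.json"]))  # blend, probabilities [0.5, 0.5]
print(normalize_data_path(["/data/a.json"], [1.0]))           # ('/data/a.json', None)
print(normalize_data_path("/data/a.json"))                    # ('/data/a.json', None)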
@@ -74,12 +74,14 @@ def __len__(self):
def __getitem__(self, idx):
dataset_idx = self.dataset_index[idx]
sample_idx = self.dataset_sample_index[idx]
dataset_size = len(self.datasets[dataset_idx])
# Ensure the sample index doesn't exceed the dataset size
# The original build_index function does not handle this edge case properly
sample_idx = sample_idx % len(self.datasets[dataset_idx])
data = self.datasets[dataset_idx][sample_idx]
if sample_idx >= dataset_size:
logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. Reusing existing examples.")
sample_idx = sample_idx % dataset_size
logging.warning(f"Reusing index {sample_idx} for dataset {dataset_idx}.")

return data
return self.datasets[dataset_idx][sample_idx]

def create_data_mmap(self):
for dataset in self.datasets:
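The warning-and-wrap logic added in __getitem__ above is just a modulo fold of the sample index back into range; a tiny illustration with hypothetical numbers:

# Tiny illustration (hypothetical sizes) of the wrap-around applied in __getitem__ above:
# an out-of-range sample index is folded back into the dataset instead of raising IndexError.
dataset_size = 10   # assume the selected dataset holds 10 examples
sample_idx = 13     # index produced by the blend's index mapping
if sample_idx >= dataset_size:
    sample_idx = sample_idx % dataset_size  # 13 % 10 -> 3, so an existing example is reused
print(sample_idx)  # 3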
