diff --git a/docs/source/multimodal/mllm/sequence_packing.rst b/docs/source/multimodal/mllm/sequence_packing.rst
index b061ee1d89c6..c5587e3f7173 100644
--- a/docs/source/multimodal/mllm/sequence_packing.rst
+++ b/docs/source/multimodal/mllm/sequence_packing.rst
@@ -103,15 +103,13 @@ To train with packed sequences, modify four items in the SFT/PEFT config file.
 
 .. code-block:: bash
 
-  ++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
-  ++model.data.crop_size=[224,224]
   ++model.data.packed_sequence=True
 
 2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached:
 
 .. code-block:: bash
 
-  ++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
+  ++model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
   ++model.data.crop_size=[336,336]
 
 4. Adjust batch sizes:
diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml
index 523a65fd95a7..3362f48032fc 100644
--- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml
+++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml
@@ -186,8 +186,15 @@ model:
     packed_sequence: False
     num_workers: 8
     dataloader_type: cyclic
-    data_path: 
-    # could be a path to a single file or a list of files for data blending like below
+    data_path:
+    # This configuration can either be a single string pointing to a data path, or a list of data paths for data blending.
+    # When using a blendable dataset, be aware of the following:
+    # - The sampling of data across datasets depends on both the relative sizes of the datasets and the concat_sampling_probabilities.
+    # - For example, if there are two datasets with lengths of 100 and 10, and the sampling probabilities are set to 0.5 for each,
+    #   then 55 samples would be taken from the dataset of length 100 and 55 from the dataset of length 10 (with repetition).
+    # - This means not all data might be seen in one epoch, and smaller datasets may need to be repeated to match the number of samples.
+    # Please adjust your concat_sampling_probabilities accordingly to ensure balanced and effective training.
+    # - /path/to/json
     # - /path/to/json
     global_batch_size: ${model.global_batch_size}
@@ -198,11 +205,9 @@ model:
     lazy_preprocess: True
     is_multimodal: True
     media_type: image # currently supported: image
-    num_frames: -1
     sep_image_conv_front: False
     conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py`
     image_folder: null
-    video_folder: null
     image_aspect_ratio: 'square'
 
   # Nsys profiling options
diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
index c73ef2af2c5e..c0103d61eb2e 100644
--- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
+++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py
@@ -1321,16 +1321,36 @@ def build_train_valid_test_datasets_blend(self):
 
     def build_train_valid_test_datasets(self):
         logging.info('Building Neva datasets.')
-        if isinstance(self.cfg.data.data_path, ListConfig) and self.cfg.data.get('concat_sampling_probabilities'):
-            return self.build_train_valid_test_datasets_blend()
+
+        if isinstance(self.cfg.data.data_path, (list, ListConfig)):
+            if len(self.cfg.data.data_path) > 1:
+                # Only consider data blending if there are multiple dataset paths
+                if self.cfg.data.get('concat_sampling_probabilities') is None:
+                    logging.warning("No sampling probabilities provided. Defaulting to uniform sampling.")
+                    self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len(self.cfg.data.data_path)
+                elif sum(self.cfg.data.concat_sampling_probabilities) != 1:
+                    raise ValueError("Concat_sampling_probabilities must sum up to 1.")
+                return self.build_train_valid_test_datasets_blend()
+            elif len(self.cfg.data.data_path) == 1:
+                if self.cfg.data.concat_sampling_probabilities is not None:
+                    logging.warning("Using sampling probabilities with a single dataset has no effect. Defaulting to None and not using blend dataset.")
+                    self.cfg.data.concat_sampling_probabilities = None
+                self.cfg.data.data_path = self.cfg.data.data_path[0]
+            else:
+                raise ValueError("data_path must contain at least one valid path.")
+        elif isinstance(self.cfg.data.data_path, str):
+            pass
+        else:
+            raise TypeError("data_path must be a list of paths or a single string")
 
         if self.cfg.data.get("packed_sequence", False):
             assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence"
+
             self._train_ds = NevaPackedSeqDatatset(
-                self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
+                self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
             )
             self._validation_ds = NevaPackedSeqDatatset(
-                self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
+                self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
             )
         else:
             ds_dict = make_supervised_data_module(
diff --git a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py
index 47251d93ae6b..39b64ae89865 100644
--- a/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py
+++ b/nemo/collections/nlp/data/language_modeling/megatron/blendable_dataset.py
@@ -74,12 +74,14 @@ def __len__(self):
     def __getitem__(self, idx):
         dataset_idx = self.dataset_index[idx]
         sample_idx = self.dataset_sample_index[idx]
+        dataset_size = len(self.datasets[dataset_idx])
         # Ensure the sample index doesn't exceed the dataset size
-        # original build_index function does not handle the extreme case properly
-        sample_idx = sample_idx % len(self.datasets[dataset_idx])
-        data = self.datasets[dataset_idx][sample_idx]
+        if sample_idx >= dataset_size:
+            logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. Reusing existing examples.")
+            sample_idx = sample_idx % dataset_size
+            logging.warning(f"Reusing index {sample_idx} for dataset {dataset_idx}.")
 
-        return data
+        return self.datasets[dataset_idx][sample_idx]
 
     def create_data_mmap(self):
         for dataset in self.datasets:
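
For reference, the sampling behaviour described by the new ``neva_config.yaml`` comments and the wrap-around added to ``BlendableDataset.__getitem__`` can be sketched as follows. This is an illustrative standalone snippet, not part of the patch: the dataset sizes and probabilities come from the example in the config comments, and it assumes the blended dataset requests a total number of samples equal to the combined dataset lengths.

.. code-block:: python

   # Hypothetical illustration only; values follow the example in the config comments.
   dataset_sizes = [100, 10]                   # lengths of the two blended datasets
   concat_sampling_probabilities = [0.5, 0.5]  # uniform, as defaulted when none are provided

   # Assumption: the blended dataset draws as many samples as the datasets hold in total.
   total_samples = sum(dataset_sizes)  # 110

   for dataset_idx, prob in enumerate(concat_sampling_probabilities):
       requested = int(prob * total_samples)  # 55 samples requested from each dataset
       size = dataset_sizes[dataset_idx]
       print(f"dataset {dataset_idx}: {requested} samples requested, {size} available")
       for sample_idx in range(requested):
           if sample_idx >= size:
               # Same wrap-around as the patched __getitem__: an out-of-range index
               # is mapped back into range, reusing existing examples.
               sample_idx = sample_idx % size
           # the real code would now fetch self.datasets[dataset_idx][sample_idx]

With 0.5/0.5 probabilities the length-10 dataset is asked for 55 samples, so indices 10 through 54 wrap back into valid range; this is the repetition the config comments warn about.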