
Commit

fix based on reviews
Signed-off-by: Vivian Chen <[email protected]>
xuanzic committed Aug 2, 2024
1 parent fc11129 commit 9415477
Showing 4 changed files with 40 additions and 15 deletions.
4 changes: 1 addition & 3 deletions docs/source/multimodal/mllm/sequence_packing.rst
@@ -103,15 +103,13 @@ To train with packed sequences, modify four items in the SFT/PEFT config file.

.. code-block:: bash
++model.data.data_prefix=/lustre/fsw/coreai_dlalgo_genai/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.crop_size=[224,224]
++model.data.packed_sequence=True
2. Use the new dataset file instead of the original JSONL file and ensure the crop sizes are correctly specified since images are now cached:

.. code-block:: bash
++model.data.data_prefix=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset
++model.data.crop_size=[336,336]
4. Adjust batch sizes:
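The overrides above are Hydra-style dot-list overrides of the YAML config (the ++ prefix force-adds the key), and the model-side change later in this commit is what consumes them: when packed_sequence is enabled, the model reads data_path and crop_size, and asserts that the micro batch size is 1. A minimal sketch, assuming only the omegaconf package and placeholder paths, of what the overrides resolve to:

# Minimal sketch: how the "++model.data..." overrides above map onto nested config
# fields. Paths are placeholders; only omegaconf (which NeMo configs build on) is used.
from omegaconf import OmegaConf

overrides = OmegaConf.from_dotlist(
    [
        "model.data.packed_sequence=True",
        "model.data.data_path=/path/to/datasets/LLaVA-Instruct-150K/packed_seq_12288_336_v1/packed_seq_dataset",
        "model.data.crop_size=[336,336]",
    ]
)
print(overrides.model.data.packed_sequence)  # True
print(overrides.model.data.crop_size)        # [336, 336]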
13 changes: 9 additions & 4 deletions examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml
@@ -186,8 +186,15 @@ model:
packed_sequence: False
num_workers: 8
dataloader_type: cyclic
data_path:
# could be a path to a single file or a list of files for data blending like below
data_path:
# This can be either a single string pointing to a data path or a list of data paths for data blending, as in the example entries below.
# When using a blendable dataset, be aware of the following:
# - The sampling of data across datasets depends on both the relative sizes of the datasets and the concat_sampling_probabilities.
# - For example, if there are two datasets with lengths of 100 and 10, and the sampling probabilities are set to 0.5 for each,
# then 55 samples would be taken from the dataset of length 100 and 55 from the dataset of length 10 (with repetition).
# - This means not all data might be seen in one epoch, and smaller datasets may need to be repeated to match the number of samples.
# Please adjust your concat_sampling_probabilities accordingly to ensure balanced and effective training.

# - /path/to/json
# - /path/to/json
global_batch_size: ${model.global_batch_size}
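To make the example in these comments concrete: the blend is sized by the combined length of all datasets, and each dataset contributes its sampling probability's share of that total, repeating examples when a dataset is smaller than its share. A small hypothetical sketch of that arithmetic (not the NeMo implementation):

# Hypothetical sketch of the sampling arithmetic described in the comments above;
# not NeMo code, just the bookkeeping behind the 100/10 example.
def blended_sample_counts(dataset_sizes, sampling_probabilities):
    assert len(dataset_sizes) == len(sampling_probabilities)
    assert abs(sum(sampling_probabilities) - 1.0) < 1e-6, "probabilities must sum to 1"
    total = sum(dataset_sizes)  # the blend spans the combined number of examples
    # Each dataset contributes roughly probability * total samples; a dataset smaller
    # than its share is sampled with repetition, a larger one is only partially seen.
    return [round(p * total) for p in sampling_probabilities]

print(blended_sample_counts([100, 10], [0.5, 0.5]))  # [55, 55], matching the comment above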
@@ -198,11 +205,9 @@
lazy_preprocess: True
is_multimodal: True
media_type: image # currently supported: image
num_frames: -1
sep_image_conv_front: False
conv_template: ${model.mm_cfg.llm.model_type} # check `nemo/collections/multimodal/data/neva/conversation.py`
image_folder: null
video_folder: null
image_aspect_ratio: 'square'

# Nsys profiling options
@@ -1321,16 +1321,36 @@ def build_train_valid_test_datasets_blend(self):

def build_train_valid_test_datasets(self):
logging.info('Building Neva datasets.')
if isinstance(self.cfg.data.data_path, ListConfig) and self.cfg.data.get('concat_sampling_probabilities'):
return self.build_train_valid_test_datasets_blend()

if isinstance(self.cfg.data.data_path, (list, ListConfig)):
if len(self.cfg.data.data_path) > 1:
# Only consider data blending if there are multiple dataset paths
if self.cfg.data.get('concat_sampling_probabilities') is None:
logging.warning("No sampling probabilities provided. Defaulting to uniform sampling.")
self.cfg.data.concat_sampling_probabilities = [1 / len(self.cfg.data.data_path)] * len(self.cfg.data.data_path)
elif sum(self.cfg.data.concat_sampling_probabilities) != 1:
raise ValueError("Concat_sampling_probabilities must sum up to 1.")
return self.build_train_valid_test_datasets_blend()
elif len(self.cfg.data.data_path) == 1:
if self.cfg.data.concat_sampling_probabilities is not None:
logging.warning("Using sampling probabilities with a single dataset has no effect. Defaulting to None and not using blend dataset.")
self.cfg.data.concat_sampling_probabilities = None
self.cfg.data.data_path = self.cfg.data.data_path[0]
else:
raise ValueError("data_path must contain at least one valid path.")
elif isinstance(self.cfg.data.data_path, str):
pass
else:
raise TypeError("data_path must be a list of paths or a single string")

if self.cfg.data.get("packed_sequence", False):
assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence"

self._train_ds = NevaPackedSeqDatatset(
self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
)
self._validation_ds = NevaPackedSeqDatatset(
self.cfg.data.data_prefix, self.cfg.mm_cfg.vision_encoder.get("crop_size")
self.cfg.data.data_path, self.cfg.mm_cfg.vision_encoder.get("crop_size")
)
else:
ds_dict = make_supervised_data_module(
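For a quick read of the branching above, here is a standalone sketch (hypothetical paths, not part of the commit) of what each accepted data_path shape resolves to:

# Standalone sketch mirroring the data_path handling above, so the accepted config
# shapes are easy to see at a glance; paths are placeholders.
def normalize_data_path(data_path, concat_sampling_probabilities=None):
    if isinstance(data_path, (list, tuple)):
        if len(data_path) > 1:
            # Multiple datasets -> blending; default to uniform sampling when unset.
            if concat_sampling_probabilities is None:
                concat_sampling_probabilities = [1 / len(data_path)] * len(data_path)
            return list(data_path), concat_sampling_probabilities
        if len(data_path) == 1:
            # A single-entry list collapses to that entry; probabilities are ignored.
            return data_path[0], None
        raise ValueError("data_path must contain at least one valid path.")
    if isinstance(data_path, str):
        return data_path, None
    raise TypeError("data_path must be a list of paths or a single string")

print(normalize_data_path(["/data/a.json", "/data/b.json"]))  # blend, probabilities [0.5, 0.5]
print(normalize_data_path(["/data/a.json"], [1.0]))           # ('/data/a.json', None)
print(normalize_data_path("/data/a.json"))                    # ('/data/a.json', None)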
@@ -74,12 +74,14 @@ def __len__(self):
def __getitem__(self, idx):
dataset_idx = self.dataset_index[idx]
sample_idx = self.dataset_sample_index[idx]
dataset_size = len(self.datasets[dataset_idx])
# Ensure the sample index doesn't exceed the dataset size
# The original build_index function does not handle this edge case properly
sample_idx = sample_idx % len(self.datasets[dataset_idx])
data = self.datasets[dataset_idx][sample_idx]
if sample_idx >= dataset_size:
logging.warning(f"Index {sample_idx} out of bounds for dataset {dataset_idx}. Reusing existing examples.")
sample_idx = sample_idx % dataset_size
logging.warning(f"Reusing index {sample_idx} for dataset {dataset_idx}.")

return data
return self.datasets[dataset_idx][sample_idx]

def create_data_mmap(self):
for dataset in self.datasets:
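The warning-and-wrap logic added in __getitem__ above is just a modulo fold of the sample index back into range; a tiny illustration with hypothetical numbers:

# Tiny illustration (hypothetical sizes) of the wrap-around applied in __getitem__ above:
# an out-of-range sample index is folded back into the dataset instead of raising IndexError.
dataset_size = 10   # assume the selected dataset holds 10 examples
sample_idx = 13     # index produced by the blend's index mapping
if sample_idx >= dataset_size:
    sample_idx = sample_idx % dataset_size  # 13 % 10 -> 3, so an existing example is reused
print(sample_idx)  # 3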
