Apply isort and black reformatting
Signed-off-by: xuanzic <[email protected]>
xuanzic committed Aug 1, 2024
1 parent: 8af0b31 · commit: 4161292
Showing 3 changed files with 34 additions and 37 deletions.
4 changes: 2 additions & 2 deletions nemo/collections/multimodal/data/neva/neva_dataset.py
@@ -1233,7 +1233,7 @@ class DataCollatorForSupervisedDataset(object):
tokenizer: transformers.PreTrainedTokenizer

def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
- packed_sequence = "cu_seqlens" in instances[0]
+ packed_sequence = "cu_seqlens" in instances[0]
max_len = max(instance['tokens'].shape[0] for instance in instances)
max_len = (max_len - 1) // 64 * 64 + 64
for instance in instances:
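
The collator above pads each batch to a length rounded up to the next multiple of 64 via `(max_len - 1) // 64 * 64 + 64`, presumably to keep padded shapes aligned for efficient kernels. A minimal sketch of that rounding, with made-up lengths:

    # Hypothetical token lengths for one batch (illustration only).
    lengths = [37, 120, 65]

    max_len = max(lengths)                   # 120
    max_len = (max_len - 1) // 64 * 64 + 64  # 128: smallest multiple of 64 >= 120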
@@ -1354,7 +1354,7 @@ def make_supervised_data_module(tokenizer, image_processor, model_cfg, data_file
)

return dict(train_dataset=train_dataset, eval_dataset=train_dataset)


class NevaPackedSeqDatatset(Dataset):
def __init__(self, data_path: str, crop_size: Tuple[int, int] = (224, 224)):
[Second changed file; path not shown]
@@ -33,15 +33,15 @@
NevaPackedSeqDatatset,
make_supervised_data_module,
)
- from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import (
-     get_datasets_weights_and_num_samples,
- )
- from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset
from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import (
CLIPVisionTransformer,
MegatronCLIPModel,
)
from nemo.collections.multimodal.parts.utils import create_image_processor, load_nemo_model_weights
+ from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import (
+     get_datasets_weights_and_num_samples,
+ )
+ from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset
from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import MegatronPretrainingSampler
from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel, get_specs
@@ -1238,7 +1238,7 @@ def setup(self, stage=None):

if self.cfg.get('transformer_engine', False):
self.setup_transformer_engine_tp_groups()

def build_train_valid_test_datasets_blend(self):
logging.info('Building Blending Neva datasets.')

@@ -1255,7 +1255,9 @@ def build_train_valid_test_datasets_blend(self):
if data_cfg.get('concat_sampling_probabilities') is None or not isinstance(
data_cfg.concat_sampling_probabilities, ListConfig
):
raise ValueError("concat_sampling_probabilities must be a ListConfig with the same number of entries as data_path.")
+ raise ValueError(
+     "concat_sampling_probabilities must be a ListConfig with the same number of entries as data_path."
+ )

if len(data_cfg.concat_sampling_probabilities) != len(data_cfg.data_path):
raise ValueError(
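
The checks above require `data_path` and `concat_sampling_probabilities` to be parallel lists in the data config. A minimal sketch of a config that would pass this validation, with made-up file names (not taken from the commit):

    from omegaconf import ListConfig, OmegaConf

    # Hypothetical blend config: two data files sampled with 70/30 probability.
    data_cfg = OmegaConf.create(
        {
            "data_path": ["dataset_a.json", "dataset_b.json"],   # made-up paths
            "concat_sampling_probabilities": [0.7, 0.3],
            "global_batch_size": 32,
        }
    )

    assert isinstance(data_cfg.concat_sampling_probabilities, ListConfig)
    assert len(data_cfg.concat_sampling_probabilities) == len(data_cfg.data_path)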
@@ -1265,17 +1267,15 @@ def build_train_valid_test_datasets_blend(self):

for data_file in data_cfg.data_path:
if is_packed_sequence:
- train_dataset = NevaPackedSeqDatatset(
-     data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size")
- )
- valid_dataset = NevaPackedSeqDatatset(
-     data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size")
- )
+ train_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
+ valid_dataset = NevaPackedSeqDatatset(data_file, self.cfg.mm_cfg.vision_encoder.get("crop_size"))
else:
ds_dict = make_supervised_data_module(
tokenizer=self.tokenizer,
image_processor=(
- self.model.module.image_processor if hasattr(self.model, "module") else self.model.image_processor
+ self.model.module.image_processor
+ if hasattr(self.model, "module")
+ else self.model.image_processor
),
model_cfg=self.cfg,
data_file=data_file,
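
The ternary that black reflowed above is the common pattern for reaching an attribute on a model that may be wrapped (e.g. by a distributed wrapper exposing `.module`). A small illustrative sketch, not tied to NeMo's classes:

    def unwrap(model):
        # If the model is wrapped (wrapper exposes `.module`), reach through it;
        # otherwise use the model directly.
        return model.module if hasattr(model, "module") else model

    # image_processor = unwrap(self.model).image_processor  # equivalent intent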
@@ -1285,37 +1285,35 @@ def build_train_valid_test_datasets_blend(self):

train_datasets.append(train_dataset)
valid_datasets.append(valid_dataset)

# Create BlendableDataset for training
if self.trainer.max_steps is None or self.trainer.max_steps <= 0:
raise ValueError(f'Trainer max_steps must be set to a positive integer. Found {self.trainer.max_steps}')

num_train_samples = self.trainer.max_steps * data_cfg.global_batch_size
_, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(
- data_prefix=[weight for pair in zip(data_cfg.concat_sampling_probabilities, data_cfg.data_path) for weight in pair],
- num_samples=[num_train_samples]
+ data_prefix=[
+     weight for pair in zip(data_cfg.concat_sampling_probabilities, data_cfg.data_path) for weight in pair
+ ],
+ num_samples=[num_train_samples],
)
num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset])

logging.info(f"Number of train datasets: {len(train_datasets)}")
logging.info(f"Lengths of train datasets: {[len(ds) for ds in train_datasets]}")
logging.info(f"Number of train datasets after blending: {num_train_samples_after_blend}")

if is_packed_sequence:
num_train_samples_after_blend = sum([len(ds) for ds in train_datasets])

self._train_ds = BlendableDataset(
- datasets=train_datasets,
- weights=data_cfg.concat_sampling_probabilities,
- size=num_train_samples_after_blend
- )
+ datasets=train_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend
+ )

self._validation_ds = BlendableDataset(
- datasets=valid_datasets,
- weights=data_cfg.concat_sampling_probabilities,
- size=num_train_samples_after_blend
- )
+ datasets=valid_datasets, weights=data_cfg.concat_sampling_probabilities, size=num_train_samples_after_blend
+ )

logging.info(f'Length of train dataset: {len(self._train_ds)}')
logging.info(f'Length of validation dataset: {len(self._validation_ds)}')
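
In the hunk above, the total number of training samples is taken as `max_steps * global_batch_size` and then split across the blended datasets roughly in proportion to the sampling probabilities (via `get_datasets_weights_and_num_samples`). A rough sketch of that bookkeeping with assumed values, not the Megatron implementation itself:

    import math

    max_steps = 1000            # assumed trainer.max_steps
    global_batch_size = 32      # assumed data_cfg.global_batch_size
    probs = [0.7, 0.3]          # assumed concat_sampling_probabilities

    num_train_samples = max_steps * global_batch_size                  # 32000
    # Per-dataset request, proportional to its sampling probability.
    per_dataset = [math.ceil(p * num_train_samples) for p in probs]    # [22400, 9600]
    num_train_samples_after_blend = sum(per_dataset)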

@@ -1325,7 +1323,7 @@ def build_train_valid_test_datasets(self):
logging.info('Building Neva datasets.')
if isinstance(self.cfg.data.data_path, ListConfig) and self.cfg.data.get('concat_sampling_probabilities'):
return self.build_train_valid_test_datasets_blend()

if self.cfg.data.get("packed_sequence", False):
assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence"
self._train_ds = NevaPackedSeqDatatset(
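
Packed-sequence mode typically concatenates several samples into a single row and records their boundaries as cumulative sequence lengths (`cu_seqlens`), which is why the branch above insists on a micro batch size of 1. A small sketch of how such boundaries are usually built; the shapes are illustrative, not NeVA's actual on-disk format:

    import numpy as np

    # Three samples of different lengths packed back to back into one row.
    seq_lens = np.array([5, 3, 7])
    cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])
    # cu_seqlens == [0, 5, 8, 15]; sample i spans tokens cu_seqlens[i]:cu_seqlens[i + 1]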
@@ -1385,7 +1383,7 @@ def build_pretraining_data_loader(
raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"')
else:
raise ValueError('cfg.data.dataloader_type not found. Must be "single" or "cyclic"')

collate_func = DataCollatorForSupervisedDataset(self.cfg, self.tokenizer)
return torch.utils.data.DataLoader(
dataset,
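
The loader above hands batching over to `DataCollatorForSupervisedDataset` through `collate_fn`. A generic sketch of the same wiring with a stand-in collator (the real collator also handles labels, padding rules and packed sequences):

    import torch
    from torch.utils.data import DataLoader, Dataset

    class ToyDataset(Dataset):
        def __init__(self, n=8):
            self.data = [torch.arange(i + 1) for i in range(n)]  # variable-length samples

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return {"tokens": self.data[idx]}

    def collate(instances):
        # Pad every sample in the micro-batch to the longest one (stand-in collator).
        max_len = max(inst["tokens"].shape[0] for inst in instances)
        tokens = torch.stack(
            [torch.nn.functional.pad(inst["tokens"], (0, max_len - inst["tokens"].shape[0])) for inst in instances]
        )
        return {"tokens": tokens}

    loader = DataLoader(ToyDataset(), batch_size=4, collate_fn=collate)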
[Third changed file; path not shown]
@@ -42,7 +42,7 @@ def __init__(self, datasets, weights, size):
assert num_datasets < 255
self.dataset_index = np.zeros(self.size, dtype=np.uint8)
self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

app_state = AppState()
try:
if app_state.local_rank == 0:
@@ -78,9 +78,8 @@ def __getitem__(self, idx):
# original build_index function does not handle the extreme case properly
sample_idx = sample_idx % len(self.datasets[dataset_idx])
data = self.datasets[dataset_idx][sample_idx]

return data


def create_data_mmap(self):
for dataset in self.datasets:
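
The `sample_idx % len(self.datasets[dataset_idx])` line above guards against an index running past the end of the chosen dataset (the "extreme case" noted in the comment) by wrapping around instead of raising. A tiny illustration with made-up sizes:

    dataset_len = 1000              # hypothetical dataset size
    sample_idx = 1003               # index produced by the blend bookkeeping
    sample_idx = sample_idx % dataset_len
    # sample_idx == 3: wraps back to the start of the dataset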
@@ -91,7 +90,7 @@ class MemoryEfficientBlendableDataset(torch.utils.data.Dataset):
"""
A BlendableDataset implementation that uses less memory than the original implementation.
Indices are computed algorithmically instead of storing them in memory.
To test call: MemoryEfficientBlendableDataset.test_index_blending()
"""

