Commit

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Jan 31, 2024
1 parent 2f21d8d commit 1129b2e
Showing 8 changed files with 108 additions and 103 deletions.
2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/megatron_retro_eval.py
@@ -141,4 +141,4 @@ def dummy():


if __name__ == '__main__':
main()
main()
141 changes: 70 additions & 71 deletions nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py
@@ -33,35 +33,29 @@

try:
from megatron.core import mpu, tensor_parallel

Code scanning (CodeQL) notice: Unused import. Import of 'tensor_parallel' is not used.
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset

# from megatron.core.models.retro.data.query.retro_dataset import RetroDataset as MCoreRETRODataset
# from megatron.core.models.retro.data.db.utils import get_merged_train_dataset as get_db_dataset
# from megatron.core.models.retro.data.query.chunk_dataset import get_chunk_dataset_map
# from megatron.core.models.retro.data.utils import BlockPathMap
# from megatron.core.models.retro.data.query.utils import get_neighbor_dir
from megatron.core.models.retro.data.config import RetroGPTDatasets
from megatron.core.models.retro.model import RetroConfig
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig
from megatron.core.datasets.gpt_dataset import GPTDataset
from megatron.core.models.retro.data.query import get_retro_datasets
from megatron.core.models.retro.model import RetroConfig
from megatron.utils import get_ltor_masks_and_position_ids


HAVE_MEGATRON_CORE = True

except (ImportError, ModuleNotFoundError):

HAVE_MEGATRON_CORE = False


class RETRODataset(Dataset):
def __init__(
self,
cfg,
retro_config: RetroConfig,
tokenizer,
mcore_retro_dataset,
number_samples_with_neighbors
):
def __init__(self, cfg, retro_config: RetroConfig, tokenizer, mcore_retro_dataset, number_samples_with_neighbors):
super().__init__()

self.reset_position_ids = cfg.data.get('reset_position_ids', False)
@@ -79,7 +73,7 @@ def __init__(

def __len__(self):
return len(self.mcore_retro_dataset.chunk_dataset.sample_dataset)

def _get_text(self, idx: int):
return self.mcore_retro_dataset[idx]

@@ -92,33 +86,30 @@ def __getitem__(self, idx):

# Unpack
tokens_ = torch.from_numpy(sample['text'])
tokens_ = tokens_.long() # size should be [seq_length]
tokens_ = tokens_.long() # size should be [seq_length]
labels = tokens_[1:].contiguous()
tokens = tokens_[:-1].contiguous()
neighbor_tokens = torch.from_numpy(sample['neighbor_tokens'])
neighbor_tokens = neighbor_tokens.long() # size should be [l, k, r]
neighbor_tokens = neighbor_tokens.long() # size should be [l, k, r]

# note: [l, k, r] => [l*k, r]
# note: 2x == neighbor, continuation
neighbor_tokens = neighbor_tokens \
.view(-1, self.retro_config.retro_retrieved_length).long()
neighbor_tokens = neighbor_tokens.view(-1, self.retro_config.retro_retrieved_length).long()

# Get the masks and postition ids for tokens and neighbor_tokens
tokens = torch.unsqueeze(tokens, 0) # get_ltor_masks_and_position_ids takes as input tokens arguments as a batch (2D tensor), so need to convert tokens from 1D to 2D
tokens = torch.unsqueeze(
tokens, 0
) # get_ltor_masks_and_position_ids takes as input tokens arguments as a batch (2D tensor), so need to convert tokens from 1D to 2D
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
tokens,
self.eos_id,
self.reset_position_ids,
self.reset_attention_mask,
self.eod_mask_loss)
tokens, attention_mask, loss_mask, position_ids = tokens[0], attention_mask[0], loss_mask[0], position_ids[0]
_, _, neighbor_position_ids = get_ltor_masks_and_position_ids( # neighbor_tokens is already a 2D array
neighbor_tokens,
self.eos_id,
self.reset_position_ids,
self.reset_attention_mask,
self.eod_mask_loss)
neighbor_attention_mask = torch.zeros([1, 1]) # just a dummy values, since the batch neighbor_attention_mask will be set to None in megatron_retro_model.py following Lawrence's implementation
tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss
)
tokens, attention_mask, loss_mask, position_ids = tokens[0], attention_mask[0], loss_mask[0], position_ids[0]
_, _, neighbor_position_ids = get_ltor_masks_and_position_ids( # neighbor_tokens is already a 2D array
neighbor_tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss
)
neighbor_attention_mask = torch.zeros(
[1, 1]
) # just a dummy values, since the batch neighbor_attention_mask will be set to None in megatron_retro_model.py following Lawrence's implementation

# # DEBUGGING
# if torch.distributed.get_rank() == 0:
@@ -140,8 +131,6 @@ def __getitem__(self, idx):
# print("========================================================== \n\n\n")
# # exit()



return {
'tokens': tokens,
'labels': labels,
@@ -150,15 +139,12 @@
'position_ids': position_ids,
'context_input_ids': neighbor_tokens,
'context_attention_mask': neighbor_attention_mask,
'context_position_ids': neighbor_position_ids
'context_position_ids': neighbor_position_ids,
}


def build_train_valid_test_datasets(
cfg,
retro_config: RetroConfig,
train_valid_test_num_samples,
seq_length,
tokenizer,
cfg, retro_config: RetroConfig, train_valid_test_num_samples, seq_length, tokenizer,
):

# gpt dataset
@@ -183,30 +169,42 @@ def build_train_valid_test_datasets(
# print(stop_here)

Code scanning (CodeQL) notice: Commented-out code. This comment appears to contain commented-out code.

retro_train_ds, retro_valid_ds, retro_test_ds = get_retro_datasets(
config=retro_config,
gpt_datasets=gpt_datasets,
sample_length=seq_length,
eod_token_id=tokenizer.eos_id,
config=retro_config, gpt_datasets=gpt_datasets, sample_length=seq_length, eod_token_id=tokenizer.eos_id,
)

train_ds = RETRODataset(
cfg = cfg,
retro_config = retro_config,
tokenizer = tokenizer,
mcore_retro_dataset = retro_train_ds,
number_samples_with_neighbors=train_valid_test_num_samples[0]) if retro_train_ds else None
valid_ds = RETRODataset(
cfg = cfg,
retro_config = retro_config,
tokenizer = tokenizer,
mcore_retro_dataset = retro_valid_ds,
number_samples_with_neighbors=train_valid_test_num_samples[1]) if retro_valid_ds else None
test_ds = RETRODataset(
cfg = cfg,
retro_config = retro_config,
tokenizer = tokenizer,
mcore_retro_dataset = retro_test_ds,
number_samples_with_neighbors=train_valid_test_num_samples[2]) if retro_test_ds else None
train_ds = (
RETRODataset(
cfg=cfg,
retro_config=retro_config,
tokenizer=tokenizer,
mcore_retro_dataset=retro_train_ds,
number_samples_with_neighbors=train_valid_test_num_samples[0],
)
if retro_train_ds
else None
)
valid_ds = (
RETRODataset(
cfg=cfg,
retro_config=retro_config,
tokenizer=tokenizer,
mcore_retro_dataset=retro_valid_ds,
number_samples_with_neighbors=train_valid_test_num_samples[1],
)
if retro_valid_ds
else None
)
test_ds = (
RETRODataset(
cfg=cfg,
retro_config=retro_config,
tokenizer=tokenizer,
mcore_retro_dataset=retro_test_ds,
number_samples_with_neighbors=train_valid_test_num_samples[2],
)
if retro_test_ds
else None
)

return train_ds, valid_ds, test_ds

@@ -220,7 +218,9 @@ def gpt_train_valid_test_datasets_provider(cfg, train_val_test_num_samples):
"""

def is_dataset_built_on_rank():
return (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()) and mpu.get_tensor_model_parallel_rank() == 0
return (
mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()
) and mpu.get_tensor_model_parallel_rank() == 0

def core_gpt_dataset_config_from_args(cfg): # need to make sure all args are the overriden written
# Implemented from core_gpt_dataset_config_from_args in M-LM/pretrain_gpt.py
@@ -229,23 +229,22 @@ def core_gpt_dataset_config_from_args(cfg): # need to make sure all args are th
random_seed=cfg.seed,
sequence_length=cfg.data.seq_length,
blend=cfg.data.data_prefix,
blend_per_split=[None, None, None], # no corresponding argument in Nemo, set to None, train/valid/test is constructed from data_path and split
blend_per_split=[
None,
None,
None,
], # no corresponding argument in Nemo, set to None, train/valid/test is constructed from data_path and split
split=cfg.data.splits_string,
path_to_cache=None, # no corresponding argument in Nemo, set to None, train/valid/test is constructed from data_path and split
path_to_cache=None, # no corresponding argument in Nemo, set to None, train/valid/test is constructed from data_path and split
return_document_ids=False,
)

print("> building train, validation, and test datasets for GPT ...")

train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
GPTDataset,
train_val_test_num_samples,
core_gpt_dataset_config_from_args(cfg)
GPTDataset, train_val_test_num_samples, core_gpt_dataset_config_from_args(cfg)
).build()

print("> finished creating GPT datasets ...")



return train_ds, valid_ds, test_ds
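
As a rough illustration (not part of this diff), the reformatted build_train_valid_test_datasets above could be invoked from a training script along these lines; cfg, retro_config, and tokenizer are assumed to come from the surrounding NeMo setup, and the sample counts and sequence length below are placeholder values.

from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import (
    build_train_valid_test_datasets,
)

# Assumed to exist already: cfg (NeMo model config), retro_config (megatron.core
# RetroConfig), and tokenizer (NeMo tokenizer exposing eos_id); the numbers below
# are illustrative placeholders, not values taken from this commit.
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
    cfg=cfg,
    retro_config=retro_config,
    train_valid_test_num_samples=[10000, 1000, 1000],
    seq_length=2048,
    tokenizer=tokenizer,
)

# Each non-None split is a RETRODataset; items are dicts with the keys returned by
# __getitem__ above ('tokens', 'labels', 'position_ids', 'context_input_ids', ...).
if train_ds is not None:
    sample = train_ds[0]
    print(sorted(sample.keys()))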

1 change: 1 addition & 0 deletions nemo/collections/nlp/models/language_modeling/__init__.py
@@ -17,5 +17,6 @@
MegatronGPTPromptLearningModel,
)
from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel

# from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel
from nemo.collections.nlp.models.language_modeling.transformer_lm_model import TransformerLMModel
@@ -1369,7 +1369,7 @@ def setup_test_data(self, cfg):
self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples)
else:
self._test_dl = None

def generate(
self,
inputs: Union[List[str], torch.Tensor, List[dict]],
