Merge pull request #2 from TJ-Solergibert/multilingual_nanoset
Multilingual Nanoset
negar-foroutan authored Jul 18, 2024
2 parents 35c43f7 + eed7bce commit da50231
Showing 6 changed files with 500 additions and 7 deletions.
134 changes: 134 additions & 0 deletions examples/config_multilingual_nanoset.yaml
@@ -0,0 +1,134 @@
checkpoints:
  checkpoint_interval: 1000
  checkpoints_path: checkpoints/
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_initial_state: false
data_stages:
- data:
    dataset:
      training_folder: datasets/c4-es/train
      validation_folder: datasets/c4-es/validation
      lang_to_ids:
        es: 128002
    num_loading_workers: 1
    seed: 42
  name: General purpose training (Single dataset)
  start_training_step: 1
- data:
    dataset:
      training_folder:
      - datasets/c4-es/train
      - datasets/c4-en/train
      - datasets/c4-fr/train
      validation_folder:
      - datasets/c4-es/validation
      - datasets/c4-en/validation
      - datasets/c4-fr/validation
      lang_to_ids:
        es: 128002
        en: 128003
        fr: 128004
    num_loading_workers: 1
    seed: 42
  name: Second purpose training (> 1 dataset)
  start_training_step: 15
- data:
    dataset:
      training_folder:
        datasets/c4-es/train: 0.6
        datasets/c4-en/train: 0.3
        datasets/c4-fr/train: 0.1
      validation_folder:
      - datasets/c4-es/validation
      - datasets/c4-en/validation
      - datasets/c4-fr/validation
      lang_to_ids:
        es: 128002
        en: 128003
        fr: 128004

    num_loading_workers: 1
    seed: 42
  name: Third purpose training (Blended dataset)
  start_training_step: 25
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: Nanoset
  run: llama
  seed: 42
  step: null
lighteval: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 512
    initializer_range: 0.02
    intermediate_size: 512
    is_llama_config: true
    max_position_embeddings: 1024
    num_hidden_layers: 2
    num_attention_heads: 32
    num_key_value_heads: 8
    pad_token_id: null
    pretraining_tp: 1
    rope_interleaved: false
    rope_theta: 500000.0
    rms_norm_eps: 1.0e-06
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 128256
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_starting_step: null
    lr_decay_steps: 98
    lr_decay_style: cosine
    lr_warmup_steps: 2
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  tp: 1
  tp_linear_async_communication: false
  tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 10
  micro_batch_size: 4
  sequence_length: 1024
  train_steps: 200
  val_check_interval: -1
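The three data_stages above exercise the three accepted forms of training_folder: a single folder, a list of folders, and a folder-to-weight mapping. As a standalone, hedged sketch (not part of the diff; the helper name is hypothetical), the snippet below mirrors how the new MultilingualNanosetDatasetsArgs.__post_init__ added in src/nanotron/config/config.py further down normalizes each form into a folder list plus optional weights.

# Illustration only: mirrors the normalization performed by
# MultilingualNanosetDatasetsArgs.__post_init__ in this PR. The helper name is hypothetical.
from typing import Dict, List, Optional, Tuple, Union


def normalize_training_folder(
    training_folder: Union[str, List[str], Dict[str, float]]
) -> Tuple[List[str], Optional[List[float]]]:
    if isinstance(training_folder, str):  # Case 1: a single dataset folder
        return [training_folder], [1]
    if isinstance(training_folder, list):  # Case 2: several folders, no explicit weights
        return training_folder, None  # None -> consume all samples from every folder
    if isinstance(training_folder, dict):  # Case 3: folder -> weight mapping (blended dataset)
        return list(training_folder.keys()), list(training_folder.values())
    raise TypeError(f"Unsupported training_folder type: {type(training_folder)}")


# The weighted third stage from the config above:
folders, weights = normalize_training_folder(
    {"datasets/c4-es/train": 0.6, "datasets/c4-en/train": 0.3, "datasets/c4-fr/train": 0.1}
)
print(folders)  # ['datasets/c4-es/train', 'datasets/c4-en/train', 'datasets/c4-fr/train']
print(weights)  # [0.6, 0.3, 0.1]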
118 changes: 115 additions & 3 deletions run_train.py
@@ -12,7 +12,13 @@

import numpy as np
from nanotron import logging
from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs
from nanotron.config import (
    DataArgs,
    DatasetStageArgs,
    MultilingualNanosetDatasetsArgs,
    NanosetDatasetsArgs,
    PretrainDatasetsArgs,
)
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.dataloader import (
    clm_process,
@@ -171,13 +177,94 @@ def get_dataloader_from_data_stage(
            dataloader_drop_last=True,
        )

        return train_dataloader
    # Case 4: MultilingualNanosets
    elif isinstance(data.dataset, MultilingualNanosetDatasetsArgs):
        # Get tokenizer cardinality
        tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path)
        token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
        del tokenizer
        # Create Nanoset
        from nanotron.data.multilingual_nanoset import MultilingualNanoset

        with main_rank_first(trainer.parallel_context.world_pg):
            train_dataset = MultilingualNanoset(
                dataset_folders=data.dataset.training_folder,
                dataset_weights=data.dataset.dataset_weights,
                sequence_length=trainer.sequence_length,
                token_size=token_size,
                train_split_num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size,
                dataset_tokens=data.dataset.dataset_tokens,
                random_seed=data.seed,
            )

        # Prepare dataloader
        train_dataloader = build_nanoset_dataloader(
            train_dataset,
            trainer.sequence_length,
            parallel_context=trainer.parallel_context,
            input_pp_rank=input_pp_rank,
            output_pp_rank=output_pp_rank,
            micro_batch_size=trainer.micro_batch_size,
            consumed_train_samples=consumed_train_samples,
            dataloader_num_workers=data.num_loading_workers,
            dataloader_drop_last=True,
        )

        return train_dataloader
    else:
        raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}")

    return dataloader


def get_valid_dataloader_from_data_stage(
    trainer: DistributedTrainer,
    data: DataArgs,
    # consumed_train_samples: int, We will never use this because in each valid iteration we consume all the samples
):

    # First, we need to know which ranks to feed the dataloader to
    input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model)

    # Only support Validation with MultilingualNanosets
    if isinstance(data.dataset, MultilingualNanosetDatasetsArgs):
        # Get tokenizer cardinality
        tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path)
        token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
        del tokenizer
        # Create Multilingual Nanoset
        from nanotron.data.multilingual_nanoset import MultilingualNanoset

        with main_rank_first(trainer.parallel_context.world_pg):
            valid_dataset = MultilingualNanoset(
                dataset_folders=data.dataset.validation_folder,
                sequence_length=trainer.sequence_length,
                token_size=token_size,
                dataset_tokens=data.dataset.dataset_tokens,
                is_valid=True,
                random_seed=data.seed,
            )

        # Prepare dataloader
        valid_dataloader = build_nanoset_dataloader(
            valid_dataset,
            trainer.sequence_length,
            parallel_context=trainer.parallel_context,
            input_pp_rank=input_pp_rank,
            output_pp_rank=output_pp_rank,
            micro_batch_size=trainer.micro_batch_size,
            dataloader_num_workers=data.num_loading_workers,
            dataloader_drop_last=True,
        )

        return valid_dataloader
    else:
        raise ValueError(
            f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}. Validation is currently just supported for MultilingualNanoset"
        )


def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
    dataloaders = {}

@@ -219,6 +306,30 @@ def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
    return dataloaders


def get_valid_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
    dataloaders = {}

    for stage_idx, stage in enumerate(trainer.config.data_stages):
        # NOTE: we only create the dataloader for the first stage,
        # then we lazy initialize the dataloader for the other stages
        stage = cast(DatasetStageArgs, stage)

        log_rank(
            f"[Validation Plan] Stage {stage.name} has {len(stage.data.dataset.validation_folder)} folders with samples in the validation set",
            logger=logger,
            level=logging.INFO,
            rank=0,
        )

        dataloader = (
            get_valid_dataloader_from_data_stage(trainer, stage.data)
            if stage_idx == 0
            else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data)
        )
        dataloaders[stage.name] = dataloader
    return dataloaders


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
@@ -231,7 +342,8 @@ def get_args():

    # Load trainer and data
    trainer = DistributedTrainer(config_file)
    dataloader = get_dataloader(trainer)
    train_dataloader = get_dataloader(trainer)
    valid_dataloader = get_valid_dataloader(trainer)

    # Train
    trainer.train(dataloader)
    trainer.train(train_dataloader, valid_dataloader)
29 changes: 28 additions & 1 deletion src/nanotron/config/config.py
@@ -107,11 +107,38 @@ def __post_init__(self):
            self.dataset_weights = list(tmp_dataset_folder.values())


@dataclass
class MultilingualNanosetDatasetsArgs:
    training_folder: Union[str, dict, List[str]]
    validation_folder: Union[str, List[str]]
    lang_to_ids: dict  # Mapping from the previously defined folders to tokens. Respect the order

    def __post_init__(self):
        if isinstance(self.training_folder, str):  # Case 1: 1 Dataset folder
            self.training_folder = [self.training_folder]
            self.validation_folder = [self.validation_folder]
            self.dataset_weights = [1]
        elif isinstance(self.training_folder, List):  # Case 2: > 1 Dataset folder
            self.dataset_weights = None  # Set to None so we consume all the samples randomly
        elif isinstance(self.training_folder, dict):  # Case 3: dict with > 1 training_folder and weights
            tmp_training_folder = self.training_folder.copy()
            self.training_folder = list(tmp_training_folder.keys())
            self.dataset_weights = list(tmp_training_folder.values())

        self.dataset_tokens = list(self.lang_to_ids.values())
        assert len(self.training_folder) == len(
            self.validation_folder
        ), f"The sizes of training_folder and validation_folder mismatch ({len(self.training_folder)} vs {len(self.validation_folder)})"
        assert len(self.training_folder) == len(
            self.dataset_tokens
        ), f"The sizes of training_folder and lang_to_ids mismatch ({len(self.training_folder)} vs {len(self.dataset_tokens)})"


@dataclass
class DataArgs:
"""Arguments related to the data and data files processing"""

dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs]
dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs, MultilingualNanosetDatasetsArgs]
seed: Optional[int]
num_loading_workers: Optional[int] = 1
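
As a quick usage sketch of the new dataclass (illustration only; assumes this branch is installed so MultilingualNanosetDatasetsArgs is importable from nanotron.config, as run_train.py does above), instantiating it with the weighted third stage from the example config yields normalized folders, weights, and per-language dataset_tokens:

# Illustration only: exercises MultilingualNanosetDatasetsArgs exactly as configured
# in the third data stage of examples/config_multilingual_nanoset.yaml.
from nanotron.config import MultilingualNanosetDatasetsArgs

args = MultilingualNanosetDatasetsArgs(
    training_folder={
        "datasets/c4-es/train": 0.6,
        "datasets/c4-en/train": 0.3,
        "datasets/c4-fr/train": 0.1,
    },
    validation_folder=[
        "datasets/c4-es/validation",
        "datasets/c4-en/validation",
        "datasets/c4-fr/validation",
    ],
    lang_to_ids={"es": 128002, "en": 128003, "fr": 128004},
)

print(args.training_folder)  # ['datasets/c4-es/train', 'datasets/c4-en/train', 'datasets/c4-fr/train']
print(args.dataset_weights)  # [0.6, 0.3, 0.1]
print(args.dataset_tokens)   # [128002, 128003, 128004]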
