NeMo 2.0 configs adaptation
Signed-off-by: dimapihtar <[email protected]>
dimapihtar committed Aug 27, 2024
1 parent cedf70c commit 333bc51
Showing 58 changed files with 138 additions and 1,555 deletions.
@@ -1,4 +1,4 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
124 changes: 4 additions & 120 deletions nemo/collections/llm/tools/auto_configurator/autoconfig/base_config.py
@@ -18,8 +18,6 @@
import os
from typing import Tuple

import omegaconf
import yaml
from autoconfig import utils


@@ -511,17 +509,10 @@ def _gbs_tp_pp_bert_40gb(model_size_in_b: float, seq_length: int) -> Tuple[int,


def generate_base_config(
model_size_in_b: float,
nodes: int,
gpus_per_node: int,
gpu_memory_gb: int,
max_training_days: float,
num_tokens_in_b: int,
vocab_size: int,
seq_length: int,
custom_cfg: str,
model_name: str,
cfg: omegaconf.dictconfig.DictConfig,
model_version: int,
model_size_in_b: int,
cfg: dict,
):
"""
Generates base config dictionary for a given model name and size.
@@ -535,114 +526,7 @@ def generate_base_config(
:return: base config object for the given model.
:rtype: dict
"""
base_cfg = utils.generic_base_config(cfg=cfg, custom_cfg=custom_cfg, model_name=model_name)

# GBS: global batch size
if custom_cfg is None:
gbs, tp, pp, cp, ep = _calculate_gbs_tp_pp(
model_size_in_b=model_size_in_b,
gpu_memory_gb=gpu_memory_gb,
model_name=model_name,
seq_length=seq_length,
)
else:
gbs = base_cfg["model"]["global_batch_size"]
tp = base_cfg["model"]["tensor_model_parallel_size"]
pp = base_cfg["model"]["pipeline_model_parallel_size"]
default_cp = None if model_name in ["bert", "t5", "mt5"] else 1
default_ep = None if model_name in ["bert", "t5", "mt5"] else 1
cp = base_cfg["model"].get("context_parallel_size", default_cp)
ep = base_cfg["model"].get("expert_model_parallel_size", default_ep)
# RUN
base_cfg["run"]["name"] = f"{model_name}_{model_size_in_b}b"
base_cfg["run"]["results_dir"] = "${base_results_dir}/${.name}"
int_days = int(max_training_days)
int_hours = int(24 * (max_training_days - int(max_training_days)))
base_cfg["run"]["time_limit"] = f"{int_days}-{int_hours:02d}:00:00"

# TRAINER
base_cfg["trainer"]["num_nodes"] = nodes
base_cfg["trainer"]["precision"] = "bf16"
base_cfg["trainer"]["max_steps"] = int((num_tokens_in_b * 1e9) / (seq_length * gbs))
if int_hours == 0:
int_days -= 1
int_hours = 23
else:
int_hours -= 1
base_cfg["trainer"]["max_time"] = f"{int_days}:{int_hours:02d}:30:00"

# EXP_MANAGER
wandb_cfg = cfg.get("wandb")
enable = wandb_cfg.get("enable")
project = wandb_cfg.get("project")
if enable:
base_cfg["exp_manager"]["create_wandb_logger"] = bool(enable)
base_cfg["exp_manager"]["wandb_logger_kwargs"]["project"] = project

# MODEL
if custom_cfg is None:
layers, hs, att_h, ffn, kv, lr = utils.calculate_model_size_params(
model_size_in_b=model_size_in_b,
vocab_size=vocab_size,
seq_length=seq_length,
model_name=model_name,
)
if model_name == "gpt3":
base_cfg["model"]["num_layers"] = int(layers)
base_cfg["model"]["global_batch_size"] = int(gbs)
base_cfg["model"]["hidden_size"] = int(hs)
base_cfg["model"]["num_attention_heads"] = int(att_h)
base_cfg["model"]["encoder_seq_length"] = seq_length
base_cfg["model"]["max_position_embeddings"] = seq_length
base_cfg["model"]["data"]["seq_length"] = seq_length
if ffn is not None:
base_cfg["model"]["ffn_hidden_size"] = int(ffn)
if kv is not None:
base_cfg["model"]["kv_channels"] = int(kv)
base_cfg["model"]["init_method_std"] = round(0.64 / math.sqrt(hs), 6)
base_cfg["model"]["optim"]["sched"]["warmup_steps"] = int(0.0015 * base_cfg["trainer"]["max_steps"])
base_cfg["model"]["optim"]["sched"]["constant_steps"] = int(0.166 * base_cfg["trainer"]["max_steps"])
if model_size_in_b <= 13.0:
base_cfg["model"]["sequence_parallel"] = False
elif model_name == "bert":
base_cfg["model"]["global_batch_size"] = int(gbs)
base_cfg["model"]["num_layers"] = int(layers)
base_cfg["model"]["hidden_size"] = int(hs)
base_cfg["model"]["num_attention_heads"] = int(att_h)
if ffn is not None:
base_cfg["model"]["ffn_hidden_size"] = int(ffn)
if kv is not None:
base_cfg["model"]["kv_channels"] = int(kv)
base_cfg["model"]["init_method_std"] = round(0.64 / math.sqrt(hs), 6)
base_cfg["model"]["optim"]["sched"]["warmup_steps"] = int(0.0015 * base_cfg["trainer"]["max_steps"])
base_cfg["model"]["optim"]["sched"]["constant_steps"] = int(0.166 * base_cfg["trainer"]["max_steps"])
if model_size_in_b <= 13.0:
base_cfg["model"]["sequence_parallel"] = False
else:
base_cfg["model"]["global_batch_size"] = int(gbs)
base_cfg["model"]["encoder"]["num_layers"] = int(layers)
base_cfg["model"]["decoder"]["num_layers"] = int(layers)
base_cfg["model"]["encoder"]["hidden_size"] = int(hs)
base_cfg["model"]["decoder"]["hidden_size"] = int(hs)
base_cfg["model"]["encoder"]["num_attention_heads"] = int(att_h)
base_cfg["model"]["decoder"]["num_attention_heads"] = int(att_h)
if ffn is not None:
base_cfg["model"]["encoder"]["ffn_hidden_size"] = int(ffn)
base_cfg["model"]["decoder"]["ffn_hidden_size"] = int(ffn)
if kv is not None:
base_cfg["model"]["encoder"]["kv_channels"] = int(kv)
base_cfg["model"]["decoder"]["kv_channels"] = int(kv)
base_cfg["model"]["init_method_std"] = 0.015
base_cfg["model"]["optim"]["sched"]["warmup_ratio"] = 0.01

base_cfg["model"]["optim"]["lr"] = lr
base_cfg["model"]["optim"]["sched"]["min_lr"] = round(lr * 0.1, 8)
base_cfg = utils.generic_base_config(model_name=model_name, model_version=model_version, model_size_in_b=model_size_in_b, cfg=cfg)

if cfg.get("cluster_type") == "bcp":
index_map_dir = os.path.join(cfg.get("base_results_dir"), "data_index_files")
os.makedirs(index_map_dir, exist_ok=True)
base_cfg["model"]["data"]["index_mapping_dir"] = index_map_dir

with open(f"{cfg.search_config.train_settings.logs}/base_cfg_{model_size_in_b}b.yaml", "w") as f:
yaml.dump(base_cfg, f)
return base_cfg
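
For orientation, here is a sketch (not part of the commit) of how the slimmed-down generate_base_config is invoked after this change. Only the four keyword arguments come from the new signature above; the concrete cfg keys and values are illustrative assumptions, with cluster_type and base_results_dir taken from the BCP branch kept in the function. The runner shown next reads the same kind of plain cfg dict.

# Illustrative only -- not from the commit. The four keyword arguments match
# the new signature; the cfg keys/values below are example assumptions.
from autoconfig.base_config import generate_base_config

cfg = {
    "cluster_type": "bcp",           # optional; enables the index_mapping_dir setup above
    "base_results_dir": "/results",  # used to build the data_index_files directory under BCP
}

base_cfg = generate_base_config(
    model_name="gpt3",   # assumed to be a supported model type
    model_version=3,
    model_size_in_b=5,
    cfg=cfg,
)
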
@@ -15,9 +15,7 @@
import os
from typing import Optional

import omegaconf
from autoconfig.base_config import calculate_model_size, generate_base_config
from autoconfig.inference_sweep import search_inference_config
from autoconfig.training_config import search_training_config

SUPPORTED_MODELS = [
@@ -44,26 +42,25 @@ def search_config(cfg: dict):
"""

# Read config
nodes = cfg.get("num_nodes")
gpus_per_node = cfg.get("gpus_per_node")
gpu_memory_gb = cfg.get("gpu_memory_gb")
max_training_days = cfg.get("max_training_days")
max_minutes_per_run = cfg.get("max_minutes_per_run")
num_nodes = cfg.get("num_nodes")
gpus_per_node = cfg.get("gpus_per_node", 8)
gpu_memory_gb = cfg.get("gpu_memory_gb", 80)
max_training_days = cfg.get("max_training_days", 2)
max_minutes_per_run = cfg.get("max_minutes_per_run", 30)
model_name = cfg.get("model_type")
model_version = cfg.get("model_version")
model_size_in_b = cfg.get("model_size_in_b")
vocab_size = cfg.get("vocab_size")
tflops_per_gpu = cfg.get("tflops_per_gpu")
num_tokens_in_b = cfg.get("num_tokens_in_b")
seq_length = cfg.get("seq_length")
vocab_size = cfg.get("vocab_size", 32000)
tflops_per_gpu = cfg.get("tflops_per_gpu", 140)
num_tokens_in_b = cfg.get("num_tokens_in_b", 300)
seq_length = cfg.get("seq_length", 2048)
log_dir = cfg.get("log_dir")
custom_cfg = None

print(cfg)
print(model_name)
assert model_name in SUPPORTED_MODELS, f"model must be set to one of {SUPPORTED_MODELS}/<model_size>"

gpu_count = nodes * gpus_per_node
assert isinstance(gpu_count, int) and gpu_count > 0, "nodes * gpus_per_node must be an int larger than zero."
gpu_count = num_nodes * gpus_per_node
assert isinstance(gpu_count, int) and gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero."
assert isinstance(gpu_memory_gb, int) and gpu_memory_gb in (
40,
80,
Expand All @@ -72,11 +69,6 @@ def search_config(cfg: dict):
isinstance(max_minutes_per_run, int) and max_minutes_per_run >= 10
), "max_minutes_per_run must be an int and be at least 10 minutes."

os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.join(log_dir, "candidate_configs"), exist_ok=True)
os.makedirs(os.path.join(log_dir, "training_logs"), exist_ok=True)
os.makedirs(os.path.join(log_dir, "final_result"), exist_ok=True)

# Calculate model size
model_size_in_b = calculate_model_size(
gpu_count=gpu_count,
@@ -90,26 +82,13 @@

# Generate base config for the given model size
base_cfg = generate_base_config(
model_name=model_name,
model_version=model_version,
model_size_in_b=model_size_in_b,
nodes=nodes,
gpus_per_node=gpus_per_node,
gpu_memory_gb=gpu_memory_gb,
max_training_days=max_training_days,
num_tokens_in_b=num_tokens_in_b,
vocab_size=vocab_size,
seq_length=seq_length,
custom_cfg=custom_cfg,
cfg=cfg,
model_name=model_name,
)

# Launch grid search for training constraints
if cfg.get("run_training_hp_search"):
search_training_config(base_cfg, model_size_in_b, model_name, hydra_args, cfg)
configs = search_training_config(base_cfg, cfg, model_size_in_b, model_name)

# Launch grid search for inference constraints
if cfg.get("run_inference_hp_search"):
search_inference_config(
base_cfg=base_cfg,
cfg=cfg,
)
return configs
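
To make the new dict-based interface concrete, here is a sketch of a cfg that satisfies the reads and assertions in the updated search_config. Only the key names and the stated fallback defaults come from this diff; every value (and the assumption that "gpt3" is in SUPPORTED_MODELS) is illustrative.

# Illustrative only -- not from the commit. Keys mirror the cfg.get(...) calls
# and assertions in the updated search_config; values are example assumptions.
cfg = {
    "model_type": "gpt3",            # assumed to be in SUPPORTED_MODELS
    "model_version": 3,
    "model_size_in_b": 5,
    "num_nodes": 16,
    "gpus_per_node": 8,              # falls back to 8 if omitted
    "gpu_memory_gb": 80,             # asserted to be 40 or 80
    "max_training_days": 2,          # falls back to 2
    "max_minutes_per_run": 30,       # asserted to be an int >= 10
    "vocab_size": 32000,             # falls back to 32000
    "tflops_per_gpu": 140,           # falls back to 140
    "num_tokens_in_b": 300,          # falls back to 300
    "seq_length": 2048,              # falls back to 2048
    "log_dir": "/results/auto_conf",
    "run_training_hp_search": True,  # needed so `configs` is populated before return
    "run_inference_hp_search": False,
}

configs = search_config(cfg)
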