diff --git a/config_hub/finetune/falcon-7b/lora.yaml b/config_hub/finetune/falcon-7b/lora.yaml
index eab0954182..c45b0fed94 100644
--- a/config_hub/finetune/falcon-7b/lora.yaml
+++ b/config_hub/finetune/falcon-7b/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/falcon-7b/qlora.yaml b/config_hub/finetune/falcon-7b/qlora.yaml
index dfc5377bd8..33ab9d9fc3 100644
--- a/config_hub/finetune/falcon-7b/qlora.yaml
+++ b/config_hub/finetune/falcon-7b/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/gemma-2b/full.yaml b/config_hub/finetune/gemma-2b/full.yaml
index 77f20658ca..879f1afee9 100644
--- a/config_hub/finetune/gemma-2b/full.yaml
+++ b/config_hub/finetune/gemma-2b/full.yaml
@@ -85,6 +85,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/gemma-2b/lora.yaml b/config_hub/finetune/gemma-2b/lora.yaml
index c9f912a47c..91af82800d 100644
--- a/config_hub/finetune/gemma-2b/lora.yaml
+++ b/config_hub/finetune/gemma-2b/lora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/gemma-2b/qlora.yaml b/config_hub/finetune/gemma-2b/qlora.yaml
index dc15fe90d3..159ae2cc86 100644
--- a/config_hub/finetune/gemma-2b/qlora.yaml
+++ b/config_hub/finetune/gemma-2b/qlora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/gemma-7b/lora.yaml b/config_hub/finetune/gemma-7b/lora.yaml
index d7d56f5b5c..59120c5d0b 100644
--- a/config_hub/finetune/gemma-7b/lora.yaml
+++ b/config_hub/finetune/gemma-7b/lora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/gemma-7b/qlora.yaml b/config_hub/finetune/gemma-7b/qlora.yaml
index 7d4a2c634c..556fba0cf5 100644
--- a/config_hub/finetune/gemma-7b/qlora.yaml
+++ b/config_hub/finetune/gemma-7b/qlora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-2-7b/full.yaml b/config_hub/finetune/llama-2-7b/full.yaml
index 10e439b2de..99de788c74 100644
--- a/config_hub/finetune/llama-2-7b/full.yaml
+++ b/config_hub/finetune/llama-2-7b/full.yaml
@@ -88,6 +88,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-2-7b/lora.yaml b/config_hub/finetune/llama-2-7b/lora.yaml
index 91f326757a..594b2f924d 100644
--- a/config_hub/finetune/llama-2-7b/lora.yaml
+++ b/config_hub/finetune/llama-2-7b/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-2-7b/qlora.yaml b/config_hub/finetune/llama-2-7b/qlora.yaml
index a3b7cb8dde..106b9422f4 100644
--- a/config_hub/finetune/llama-2-7b/qlora.yaml
+++ b/config_hub/finetune/llama-2-7b/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-3-8b/full.yaml b/config_hub/finetune/llama-3-8b/full.yaml
index 11aebcb155..e06d037710 100644
--- a/config_hub/finetune/llama-3-8b/full.yaml
+++ b/config_hub/finetune/llama-3-8b/full.yaml
@@ -88,6 +88,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-3-8b/lora.yaml b/config_hub/finetune/llama-3-8b/lora.yaml
index 700a3b62f4..1d874a0690 100644
--- a/config_hub/finetune/llama-3-8b/lora.yaml
+++ b/config_hub/finetune/llama-3-8b/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/llama-3-8b/qlora.yaml b/config_hub/finetune/llama-3-8b/qlora.yaml
index 1da95eaac5..33a0fc98be 100644
--- a/config_hub/finetune/llama-3-8b/qlora.yaml
+++ b/config_hub/finetune/llama-3-8b/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/mistral-7b-v0.2/lora.yaml b/config_hub/finetune/mistral-7b-v0.2/lora.yaml
index aad8f7c986..f56e34c525 100644
--- a/config_hub/finetune/mistral-7b-v0.2/lora.yaml
+++ b/config_hub/finetune/mistral-7b-v0.2/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml
index e2f5c3aafc..b648b24d72 100644
--- a/config_hub/finetune/mistral-7b-v0.2/qlora.yaml
+++ b/config_hub/finetune/mistral-7b-v0.2/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/mistral-7b/lora.yaml b/config_hub/finetune/mistral-7b/lora.yaml
index adfed6b08d..e991ec424e 100644
--- a/config_hub/finetune/mistral-7b/lora.yaml
+++ b/config_hub/finetune/mistral-7b/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/mistral-7b/qlora.yaml b/config_hub/finetune/mistral-7b/qlora.yaml
index 7972048f46..e43b745bb8 100644
--- a/config_hub/finetune/mistral-7b/qlora.yaml
+++ b/config_hub/finetune/mistral-7b/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml
index 65040a393e..5b302a48ac 100644
--- a/config_hub/finetune/phi-2/full.yaml
+++ b/config_hub/finetune/phi-2/full.yaml
@@ -88,6 +88,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/phi-2/lora.yaml b/config_hub/finetune/phi-2/lora.yaml
index a3f348c8b2..2571bc02d0 100644
--- a/config_hub/finetune/phi-2/lora.yaml
+++ b/config_hub/finetune/phi-2/lora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/phi-2/qlora.yaml b/config_hub/finetune/phi-2/qlora.yaml
index aa2c36d40a..d48d910939 100644
--- a/config_hub/finetune/phi-2/qlora.yaml
+++ b/config_hub/finetune/phi-2/qlora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml
index bd68af8714..c196fcc017 100644
--- a/config_hub/finetune/stablelm-base-alpha-3b/full.yaml
+++ b/config_hub/finetune/stablelm-base-alpha-3b/full.yaml
@@ -85,6 +85,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml
index e674cc8419..6e52ea2175 100644
--- a/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml
+++ b/config_hub/finetune/stablelm-base-alpha-3b/lora.yaml
@@ -114,6 +114,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml
index 27b579cbd8..ebd2f098eb 100644
--- a/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml
+++ b/config_hub/finetune/stablelm-base-alpha-3b/qlora.yaml
@@ -116,6 +116,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/tiny-llama/full.yaml b/config_hub/finetune/tiny-llama/full.yaml
index 4bc09e460b..fe1d1ef99d 100644
--- a/config_hub/finetune/tiny-llama/full.yaml
+++ b/config_hub/finetune/tiny-llama/full.yaml
@@ -85,6 +85,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/tiny-llama/lora.yaml b/config_hub/finetune/tiny-llama/lora.yaml
index 4991900954..c42ff28ff3 100644
--- a/config_hub/finetune/tiny-llama/lora.yaml
+++ b/config_hub/finetune/tiny-llama/lora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/finetune/tiny-llama/qlora.yaml b/config_hub/finetune/tiny-llama/qlora.yaml
index 1e8cf20b8a..7e80e4d0ca 100644
--- a/config_hub/finetune/tiny-llama/qlora.yaml
+++ b/config_hub/finetune/tiny-llama/qlora.yaml
@@ -115,6 +115,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
 logger_name: csv
 
diff --git a/config_hub/pretrain/debug.yaml b/config_hub/pretrain/debug.yaml
index bbe2fee2cc..e89dda3cc9 100644
--- a/config_hub/pretrain/debug.yaml
+++ b/config_hub/pretrain/debug.yaml
@@ -88,6 +88,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
 
diff --git a/config_hub/pretrain/tinyllama.yaml b/config_hub/pretrain/tinyllama.yaml
index a47bd946f3..e2418a5b17 100644
--- a/config_hub/pretrain/tinyllama.yaml
+++ b/config_hub/pretrain/tinyllama.yaml
@@ -88,6 +88,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
 
diff --git a/config_hub/pretrain/tinystories.yaml b/config_hub/pretrain/tinystories.yaml
index 8ef1232862..8ed53a09d7 100644
--- a/config_hub/pretrain/tinystories.yaml
+++ b/config_hub/pretrain/tinystories.yaml
@@ -104,6 +104,9 @@ eval:
   # Number of iterations (type: int, default: 100)
   max_iters: 100
 
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
 # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
 devices: auto
 
diff --git a/litgpt/args.py b/litgpt/args.py
index b227ffe3f6..7e277fe9e6 100644
--- a/litgpt/args.py
+++ b/litgpt/args.py
@@ -79,3 +79,5 @@ class EvalArgs:
     """Number of tokens to generate"""
     max_iters: int = 100
     """Number of iterations"""
+    initial_validation: bool = False
+    """Whether to evaluate on the validation set at the beginning of the training"""
diff --git a/litgpt/finetune/adapter.py b/litgpt/finetune/adapter.py
index be21af318d..313d0ea8e7 100644
--- a/litgpt/finetune/adapter.py
+++ b/litgpt/finetune/adapter.py
@@ -220,7 +220,12 @@ def fit(
             f" {model.max_seq_length} and context length is {model.config.block_size}"
         )
 
-    validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+    if eval.initial_validation:
+        val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader)))
+        val_loss = f"{val_loss:.3f}"
+    else:
+        validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+        val_loss = "n/a"
 
     train_iterator = CycleIterator(train_dataloader)
     throughput = ThroughputMonitor(fabric, window_size=50)
@@ -232,7 +237,6 @@
     iter_num = 0
     total_lengths = 0
     total_t0 = time.perf_counter()
-    val_loss = "n/a"
 
     while step_count < max_steps and train_iterator.epoch < train.epochs:
         iter_num += 1
diff --git a/litgpt/finetune/adapter_v2.py b/litgpt/finetune/adapter_v2.py
index f354decfd0..39b2a2d0e2 100644
--- a/litgpt/finetune/adapter_v2.py
+++ b/litgpt/finetune/adapter_v2.py
@@ -220,7 +220,12 @@ def fit(
             f" {model.max_seq_length} and context length is {model.config.block_size}"
         )
 
-    validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+    if eval.initial_validation:
+        val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader)))
+        val_loss = f"{val_loss:.3f}"
+    else:
+        validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+        val_loss = "n/a"
 
     train_iterator = CycleIterator(train_dataloader)
     throughput = ThroughputMonitor(fabric, window_size=50)
@@ -232,7 +237,6 @@
     iter_num = 0
     total_lengths = 0
     total_t0 = time.perf_counter()
-    val_loss = "n/a"
 
     while step_count < max_steps and train_iterator.epoch < train.epochs:
         iter_num += 1
diff --git a/litgpt/finetune/full.py b/litgpt/finetune/full.py
index 23de9b622c..01db855189 100644
--- a/litgpt/finetune/full.py
+++ b/litgpt/finetune/full.py
@@ -194,7 +194,13 @@ def fit(
             f" {model.max_seq_length} and context length is {model.config.block_size}"
         )
 
-    validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+    if eval.initial_validation:
+        val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader)))
+        val_loss = f"{val_loss:.3f}"
+    else:
+        validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+        val_loss = "n/a"
+
     initial_iter = state["iter_num"]
     max_steps = train.max_steps or float("inf")
     train_iterator = CycleIterator(train_dataloader)
@@ -216,7 +222,6 @@
         fabric.device
     )
     fabric.barrier()
-    val_loss = "n/a"
 
     while state["step_count"] < max_steps and train_iterator.epoch < train.epochs:
         state["iter_num"] += 1
diff --git a/litgpt/finetune/lora.py b/litgpt/finetune/lora.py
index 39e805befe..ae48bbc8fe 100644
--- a/litgpt/finetune/lora.py
+++ b/litgpt/finetune/lora.py
@@ -251,7 +251,12 @@ def fit(
             f" {model.max_seq_length} and context length is {model.config.block_size}"
         )
 
-    validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+    if eval.initial_validation:
+        val_loss = validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=len(val_dataloader)))
+        val_loss = f"{val_loss:.3f}"
+    else:
+        validate(fabric, model, val_dataloader, dataclasses.replace(eval, max_iters=2))  # sanity check
+        val_loss = "n/a"
 
     train_iterator = CycleIterator(train_dataloader)
     throughput = ThroughputMonitor(fabric, window_size=50)
@@ -263,7 +268,6 @@
     iter_num = 0
     total_lengths = 0
     total_t0 = time.perf_counter()
-    val_loss = "n/a"
 
     while step_count < max_steps and train_iterator.epoch < train.epochs:
         iter_num += 1
diff --git a/litgpt/pretrain.py b/litgpt/pretrain.py
index 3a763116a0..d5014dc022 100644
--- a/litgpt/pretrain.py
+++ b/litgpt/pretrain.py
@@ -228,7 +228,13 @@
     model = state["model"]
     optimizer = state["optimizer"]
 
-    validate(fabric, model, val_dataloader, max_iters=2)  # sanity check
+    if eval.initial_validation:
+        val_loss = validate(fabric, model, val_dataloader, max_iters=eval.max_iters)
+        val_loss = f"{val_loss:.3f}"
+    else:
+        validate(fabric, model, val_dataloader, max_iters=2)  # sanity check
+        val_loss = "n/a"
+
     throughput = ThroughputMonitor(fabric, window_size=5)
 
     with torch.device("meta"):
@@ -252,7 +258,6 @@
     )
     fabric.barrier()
     total_t0 = time.perf_counter()
-    val_loss = "n/a"
 
     warmup_iters = train.warmup_iters(devices, max_iters, train_dataloader)
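
What the new flag changes in practice: with `initial_validation: true`, `fit()` runs a full pass over the validation set before the first optimizer step, so the first reported `val_loss` is a real value; with the default `false`, only the two-batch sanity check runs and `val_loss` stays "n/a" until the first scheduled evaluation. A minimal sketch of setting the flag programmatically follows; `EvalArgs` and its fields are taken from `litgpt/args.py` as patched above, and the numeric values mirror the `config_hub` defaults.

# Sketch: equivalent to flipping `initial_validation: false` to `true`
# under the `eval:` section of any of the config_hub YAML files above.
from litgpt.args import EvalArgs

eval_args = EvalArgs(
    interval=100,             # optimizer steps between evaluation calls
    max_new_tokens=100,       # tokens to generate during evaluation
    max_iters=100,            # validation batches per evaluation run
    initial_validation=True,  # full validation pass before training starts
)

With litgpt's jsonargparse-based CLI, the dotted override `--eval.initial_validation true` should have the same effect; that flag spelling is inferred from the dataclass field rather than shown in the diff.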