Performance mode (#10926) and gpt3 175b cli (#10985) #11021

Merged · 2 commits · Oct 24, 2024
2 changes: 2 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -14,6 +14,7 @@


from nemo.collections.llm.recipes import (
gpt3_175b,
llama3_8b,
llama3_8b_16k,
llama3_8b_64k,
@@ -61,6 +62,7 @@
"nemotron4_22b_16k",
"nemotron4_22b_64k",
"nemotron4_340b",
"gpt3_175b",
"adam",
"default_log",
"default_resume",
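
With gpt3_175b now exported from the recipes package, a minimal usage sketch looks like the following (assuming a build that includes this PR; the node count is illustrative, taken from the examples elsewhere in this diff):

from nemo.collections.llm import recipes

# gpt3_175b is now listed in __all__, so it is reachable from the recipes namespace.
recipe = recipes.gpt3_175b.pretrain_recipe(
    name="gpt3_175b_pretrain",
    num_nodes=64,          # illustrative value
    num_gpus_per_node=8,
)
print(recipe)
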
52 changes: 22 additions & 30 deletions nemo/collections/llm/recipes/gpt3_175b.py
@@ -142,7 +142,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for GPT3 175B model.
@@ -155,6 +160,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.

Returns:
@@ -172,7 +178,7 @@
Note:
This recipe is optimized for the large 175B model and requires significant computational resources.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -186,49 +192,35 @@
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for GPT3 175B model.

This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

Returns:
run.Partial: Partial configuration for performance-optimized pre-training.

Examples:
CLI usage:
$ nemo llm pretrain --factory "gpt3_175b.pretrain_recipe_performance(num_nodes=64, name='perf_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe_performance(name="gpt3_175b_perf", num_nodes=64)
>>> print(recipe)

Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
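
After this change the separate pretrain_recipe_performance factory is gone. A minimal sketch of the two equivalent call paths, using the function names from the diff above (argument values are illustrative, not taken from the PR):

from nemo.collections.llm.recipes import gpt3_175b

# Single factory with the new flag.
perf_recipe = gpt3_175b.pretrain_recipe(
    name="gpt3_175b_perf",
    num_nodes=64,
    performance_mode=True,
)

# Equivalent manual path: build the base recipe, then apply the optimizations.
base_recipe = gpt3_175b.pretrain_recipe(name="gpt3_175b_base", num_nodes=64)
perf_recipe_manual = gpt3_175b.pretrain_performance_optimizations(base_recipe)
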
52 changes: 22 additions & 30 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -144,7 +144,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3.1 405B model.
@@ -157,6 +162,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.

Returns:
@@ -174,7 +180,7 @@
Note:
This recipe is optimized for the large 405B model and requires significant computational resources.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -188,49 +194,35 @@
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3.1 405B model.

This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

Returns:
run.Partial: Partial configuration for performance-optimized pre-training.

Examples:
CLI usage:
$ nemo llm pretrain --factory "llama31_405b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama31_405b_perf", num_nodes=4)
>>> print(recipe)

Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
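
The diff truncates the callback block below run.Config(. A hedged sketch of what pretrain_performance_optimizations appends, using only the two flags documented in the comments above; the import path is an assumption, and the actual PR may pass additional MegatronCommOverlapCallback fields:

import nemo_run as run
from nemo.collections.llm.recipes import llama31_405b
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback

recipe = llama31_405b.pretrain_recipe(name="llama31_405b_perf", num_nodes=4)
recipe.trainer.callbacks.append(
    run.Config(
        MegatronCommOverlapCallback,
        # Overlap param all-gather of the first bucket with the optimizer step.
        overlap_param_gather_with_optimizer_step=True,
        # All PP stages launch param all-gathers simultaneously.
        align_param_gather=True,
    )
)
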
51 changes: 24 additions & 27 deletions nemo/collections/llm/recipes/llama3_70b.py
@@ -13,7 +13,7 @@
# limitations under the License.


from typing import Optional
from typing import Callable, Optional

import nemo_run as run
import pytorch_lightning as pl
@@ -142,7 +142,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 70B model.
@@ -155,6 +160,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.

Returns:
@@ -172,7 +178,8 @@
Note:
This recipe is optimized for the large 70B model and requires significant computational resources.
"""
return run.Partial(

recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -186,45 +193,35 @@
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 4, num_gpus_per_node: int = 8, fn=pretrain
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3 70B model.

This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

Returns:
run.Partial: Partial configuration for performance-optimized pre-training.

Examples:
CLI usage:
$ nemo llm pretrain --factory "llama3_70b.pretrain_recipe_performance(num_nodes=4, name='perf_pretrain')"

Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama3_70b_perf", num_nodes=4)
>>> print(recipe)

Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically by MegatronCommOverlapCallback
# They are added here for user's knowledge
# overlap_param_gather_with_optimizer_step- If true, overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else each PP stage launches independently as needed
# 'overlap_param_gather_with_optimizer_step' and 'align_param_gather' params are set automatically
# by MegatronCommOverlapCallback. They are added here for user's knowledge.
# overlap_param_gather_with_optimizer_step- Overlap param all-gather of first bucket with optimizer step.
# align_param_gather- If true, all PP stages launch param all-gathers simultaneously, else
# each PP stage launches independently as needed.

recipe.trainer.callbacks.append(
run.Config(
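
A quick sanity check (a sketch, not from the PR) that performance_mode really appends an extra callback relative to the default recipe; it assumes trainer.callbacks is a plain list, as the append call in this diff implies:

from nemo.collections.llm.recipes import llama3_70b

default_recipe = llama3_70b.pretrain_recipe(num_nodes=4)
perf_recipe = llama3_70b.pretrain_recipe(num_nodes=4, performance_mode=True)

# Expect the performance recipe to carry one more callback config.
print(len(default_recipe.trainer.callbacks), len(perf_recipe.trainer.callbacks))
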
43 changes: 17 additions & 26 deletions nemo/collections/llm/recipes/llama3_8b.py
@@ -143,7 +143,12 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
performance_mode: bool = False,
fn: Callable = pretrain,
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 8B model.
@@ -156,6 +161,7 @@ def pretrain_recipe(
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
performance_mode (bool): If true, enables optimizations for maximum performance.
fn (Callable): The pre-training function to use.

Returns:
@@ -174,7 +180,7 @@
For more details on pre-training LLMs with NeMo, see the pre-training
guide in the `examples/llm/pretrain/` directory.
"""
return run.Partial(
recipe = run.Partial(
fn,
model=model(),
trainer=trainer(
@@ -188,44 +194,29 @@
resume=default_resume(),
)

if performance_mode:
recipe = pretrain_performance_optimizations(recipe)

@run.cli.factory(target=pretrain, name=NAME + "_performance")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
return recipe


def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
"""
Create a performance-optimized pre-training recipe for Llama3 8B model.

This recipe enables performance optimizations that may not be suitable for all use cases.
This method enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.

Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
recipe (run.Partial): Base pre-train recipe to which performance optimizations will be added

Returns:
run.Partial: Partial configuration for performance-optimized pre-training.

Examples:
$ nemo llm pretrain --factory llama3_8b_optimized

Python API usage:
>>> recipe = pretrain_recipe_performance(name="llama3_8b_perf", num_nodes=4)
>>> print(recipe)

Note:
Use this recipe with caution and only when you need maximum performance.
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
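
With the *_performance factories removed, the optimized run is presumably requested through the single factory, following the quoting style of the removed docstring examples; whether the CLI forwards performance_mode this way is an assumption, not confirmed by this diff:

$ nemo llm pretrain --factory "llama3_8b.pretrain_recipe(num_nodes=4, name='perf_pretrain', performance_mode=True)"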