Add deploy and REST API support to NeMo 2.0 (NVIDIA#9834) (NVIDIA#10002)
* Add deploy method to NeMo 2.0

* Add openai_format_response arg and store_args_to_json method

* Change default Triton port in deploy task in NeMo 2.0

* Add logging statements and minor fixes

* Apply isort and black reformatting

* Add import guard for deploy in NeMo 2.0

* Add newline at end of file

* Add additional import guards

* Add import guard for TRTLLM only

* Add trt_llm_supported variable

* Add import guard for uvicorn

* Apply isort and black reformatting

* Remove uvicorn import from outside the try/except block

---------

Signed-off-by: Abhishree <[email protected]>
Signed-off-by: athitten <[email protected]>
Signed-off-by: Abhishree Thittenamane <[email protected]>
Co-authored-by: Abhishree Thittenamane <[email protected]>
Co-authored-by: athitten <[email protected]>
3 people authored and Mengdi Wang committed Aug 23, 2024
1 parent 6abb1a2 commit 8f3ef8c
Showing 3 changed files with 180 additions and 2 deletions.
11 changes: 11 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -52,6 +52,13 @@
    gpt_forward_step,
)
from nemo.collections.llm.recipes import *  # noqa
from nemo.utils import logging

try:
    from nemo.collections.llm.api import deploy
except ImportError as error:
    deploy = None
    logging.warning(f"The deploy module could not be imported: {error}")

__all__ = [
    "MockDataModule",
@@ -106,3 +113,7 @@
    "dolly",
    "peft",
]

# add 'deploy' to __all__ if it was successfully imported
if deploy is not None:
    __all__.append("deploy")
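With this guard, llm.deploy resolves to None whenever the optional deployment dependencies are missing, so callers can check availability before use. A minimal sketch of such a check (illustrative only, not part of this commit):

from nemo.collections import llm

# Sketch only: bail out early if the optional deploy dependencies are not installed.
if llm.deploy is None:
    raise RuntimeError("llm.deploy is unavailable; install the deployment extras (TensorRT-LLM, Triton).")
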
167 changes: 167 additions & 0 deletions nemo/collections/llm/api.py
@@ -1,3 +1,5 @@
import json
import os
from copy import deepcopy
from pathlib import Path
from typing import Any, Callable, Optional, Union
@@ -6,10 +8,24 @@
from typing_extensions import Annotated

from nemo.collections.llm.utils import Config, task
from nemo.deploy import DeployPyTriton
from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io
from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform
from nemo.utils import logging

trt_llm_supported = True
try:
    from nemo.export.tensorrt_llm import TensorRTLLM
except ImportError as error:
    logging.warning(f"TensorRTLLM could not be imported from nemo.export: {error}")
    trt_llm_supported = False

uvicorn_supported = True
try:
    import uvicorn
except ImportError as error:
    logging.warning(f"uvicorn could not be imported: {error}")
    uvicorn_supported = False

TokenizerType = Any

@@ -225,6 +241,157 @@ def validate(
    return app_state.exp_dir


def get_trtllm_deployable(
    nemo_checkpoint,
    model_type,
    triton_model_repository,
    num_gpus,
    tensor_parallelism_size,
    pipeline_parallelism_size,
    max_input_len,
    max_output_len,
    max_batch_size,
    dtype,
):
    if triton_model_repository is None:
        trt_llm_path = "/tmp/trt_llm_model_dir/"
        Path(trt_llm_path).mkdir(parents=True, exist_ok=True)
    else:
        trt_llm_path = triton_model_repository

    if nemo_checkpoint is None and triton_model_repository is None:
        raise ValueError(
            "The provided model repository is not a valid TensorRT-LLM model "
            "directory. Please provide a --nemo_checkpoint or a TensorRT-LLM engine."
        )

    if nemo_checkpoint is None and not os.path.isdir(triton_model_repository):
        raise ValueError(
            "The provided model repository is not a valid TensorRT-LLM model "
            "directory. Please provide a --nemo_checkpoint or a valid TensorRT-LLM engine."
        )

    if nemo_checkpoint is not None and model_type is None:
        raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.")

    if not trt_llm_supported:
        raise ValueError("TensorRT-LLM engine is not supported in this environment.")
    trt_llm_exporter = TensorRTLLM(
        model_dir=trt_llm_path,
        load_model=(nemo_checkpoint is None),
    )

    if nemo_checkpoint is not None:
        try:
            logging.info("Exporting the NeMo checkpoint to TensorRT-LLM.")
            trt_llm_exporter.export(
                nemo_checkpoint_path=nemo_checkpoint,
                model_type=model_type,
                n_gpus=num_gpus,
                tensor_parallelism_size=tensor_parallelism_size,
                pipeline_parallelism_size=pipeline_parallelism_size,
                max_input_len=max_input_len,
                max_output_len=max_output_len,
                max_batch_size=max_batch_size,
                dtype=dtype,
            )
        except Exception as error:
            raise RuntimeError("An error occurred during the model export. Error message: " + str(error))

    return trt_llm_exporter


def store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response):
    args_dict = {
        "triton_service_ip": triton_http_address,
        "triton_service_port": triton_port,
        "triton_request_timeout": triton_request_timeout,
        "openai_format_response": openai_format_response,
    }
    with open("nemo/deploy/service/config.json", "w") as f:
        json.dump(args_dict, f)
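
The REST side is expected to read these settings back from config.json. The snippet below is an illustrative sketch of such a reader, not the actual rest_model_api.py implementation; the path simply mirrors the hard-coded one above:

import json

# Illustrative only: load the settings written by store_args_to_json().
with open("nemo/deploy/service/config.json") as f:
    service_config = json.load(f)

triton_url = f"{service_config['triton_service_ip']}:{service_config['triton_service_port']}"
timeout = service_config["triton_request_timeout"]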


@task(namespace="llm")
def deploy(
nemo_checkpoint: Path = None,
model_type: str = "llama",
triton_model_name: str = "xxx",
triton_model_version: Optional[int] = 1,
triton_port: int = 8080,
triton_http_address: str = "0.0.0.0",
triton_request_timeout: int = 60,
triton_model_repository: Path = None,
num_gpus: int = 1,
tensor_parallelism_size: int = 1,
pipeline_parallelism_size: int = 1,
dtype: str = "bfloat16",
max_input_len: int = 256,
max_output_len: int = 256,
max_batch_size: int = 8,
start_rest_service: bool = False,
rest_service_http_address: str = "0.0.0.0",
rest_service_port: int = 8000,
openai_format_response: bool = False,
):
if start_rest_service:
if triton_port == rest_service_port:
logging.error("REST service port and Triton server port cannot use the same port.")
return
# Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py
store_args_to_json(triton_http_address, triton_port, triton_request_timeout, openai_format_response)

triton_deployable = get_trtllm_deployable(
nemo_checkpoint,
model_type,
triton_model_repository,
num_gpus,
tensor_parallelism_size,
pipeline_parallelism_size,
max_input_len,
max_output_len,
max_batch_size,
dtype,
)

try:
nm = DeployPyTriton(
model=triton_deployable,
triton_model_name=triton_model_name,
triton_model_version=triton_model_version,
max_batch_size=max_batch_size,
port=triton_port,
address=triton_http_address,
)

logging.info("Triton deploy function will be called.")
nm.deploy()
except Exception as error:
logging.error("Error message has occurred during deploy function. Error message: " + str(error))
return

try:
logging.info("Model serving on Triton is will be started.")
if start_rest_service and uvicorn_supported:
try:
logging.info("REST service will be started.")
uvicorn.run(
'nemo.deploy.service.rest_model_api:app',
host=rest_service_http_address,
port=rest_service_port,
reload=True,
)
except Exception as error:
logging.error("Error message has occurred during REST service start. Error message: " + str(error))
nm.serve()
except Exception as error:
logging.error("Error message has occurred during deploy function. Error message: " + str(error))
return

logging.info("Model serving will be stopped.")
nm.stop()


@task(name="import", namespace="llm")
def import_ckpt(
model: pl.LightningModule,
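
Taken together, the new task can be driven from Python once NeMo is installed with the TensorRT-LLM and uvicorn extras. The call below is a hedged end-to-end sketch; the checkpoint path, model name and ports are placeholders, not values from this commit:

from nemo.collections.llm import deploy

# Placeholder arguments; adjust the checkpoint path, model type and ports for your setup.
deploy(
    nemo_checkpoint="/workspace/checkpoints/llama.nemo",
    model_type="llama",
    triton_model_name="llama_trtllm",
    triton_port=8080,
    start_rest_service=True,
    rest_service_port=8000,
    openai_format_response=True,
)
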
4 changes: 2 additions & 2 deletions scripts/deploy/nlp/deploy_triton.py
@@ -185,8 +185,8 @@ def get_args(argv):
 parser.add_argument(
     "-srs",
     "--start_rest_service",
-    default="False",
-    type=str,
+    default=False,
+    type=bool,
     help="Starts the REST service for OpenAI API support",
 )
 parser.add_argument(
