From b5094e722ec26c602dd929ddbbecda9cffed2a4a Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 7 Mar 2024 17:16:39 -0800 Subject: [PATCH 1/7] Add Langchain connector for TensorRT-LLM API --- libs/trt/docs/trtllmapi.ipynb | 91 +++++++ libs/trt/langchain_nvidia_trt/__init__.py | 4 +- libs/trt/langchain_nvidia_trt/llms.py | 282 +++++++++++++++++++++- libs/trt/langchain_nvidia_trt/utils.py | 96 ++++++++ 4 files changed, 469 insertions(+), 4 deletions(-) create mode 100644 libs/trt/docs/trtllmapi.ipynb create mode 100644 libs/trt/langchain_nvidia_trt/utils.py diff --git a/libs/trt/docs/trtllmapi.ipynb b/libs/trt/docs/trtllmapi.ipynb new file mode 100644 index 00000000..0f0881b5 --- /dev/null +++ b/libs/trt/docs/trtllmapi.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf9d3415-fe08-4cc0-bbb8-b582cb01e754", + "metadata": {}, + "source": [ + "# Nvidia TensorRT-LLM\n", + "TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.\n", + "
\n", + "[TensorRT-LLM Github](https://github.com/NVIDIA/TensorRT-LLM)\n", + "\n", + "## TensorRT-LLM Environment Setup\n", + "Since TensorRT-LLM is a SDK for interacting with local models in process there are a few environment steps that must be followed to ensure that the TensorRT-LLM setup can be used.\n", + "
\n", + "1. Nvidia Cuda 12.2 or higher is currently required to run TensorRT-LLM\n", + "2. Install `tensorrt_llm` via pip with `pip install tensorrt_llm==0.8.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/`\n", + "3. For this example we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama)
\n", + " * The following files will be created from following the step above\n", + " * `rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded\n", + " * `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine\n", + "5. `mkdir model`\n", + "6. Move all of the files mentioned above to the model directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1af02aa0-be5a-4121-9853-7af277c257c4", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain-nvidia-trt" + ] + }, + { + "cell_type": "markdown", + "id": "49e836a3-be37-416a-8af6-5e53562789e4", + "metadata": {}, + "source": [ + "## Create the TrtLlmAPI instance\n", + "Call `invoke` with a prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908ac6ac-b728-47d2-ae41-11a5cf4c1057", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_nvidia_trt.llms import TrtLlmAPI\n", + "from langchain_core.prompts import PromptTemplate\n", + "\n", + "template = \"\"\"Question: {question}\n", + "\n", + "Answer: Let's think step by step.\"\"\"\n", + "\n", + "prompt = PromptTemplate.from_template(template)\n", + "\n", + "llm = TrtLlmAPI(\n", + " model_path=\"./model\",\n", + " tokenizer_dir=\"meta-llama/Llama-2-7b-chat\",\n", + ")\n", + "chain = prompt | llm\n", + "print(chain.invoke({\"question\": \"What is important about Half Life 2 RTX?\"}))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/trt/langchain_nvidia_trt/__init__.py b/libs/trt/langchain_nvidia_trt/__init__.py index 5d89779e..d666a969 100644 --- a/libs/trt/langchain_nvidia_trt/__init__.py +++ b/libs/trt/langchain_nvidia_trt/__init__.py @@ -1,3 +1,3 @@ -from langchain_nvidia_trt.llms import TritonTensorRTLLM +from langchain_nvidia_trt.llms import (TritonTensorRTLLM, TrtLlmAPI) -__all__ = ["TritonTensorRTLLM"] +__all__ = ["TritonTensorRTLLM", "TrtLlmAPI"] diff --git a/libs/trt/langchain_nvidia_trt/llms.py b/libs/trt/langchain_nvidia_trt/llms.py index 0ea1fca1..67defe07 100644 --- a/libs/trt/langchain_nvidia_trt/llms.py +++ b/libs/trt/langchain_nvidia_trt/llms.py @@ -5,7 +5,7 @@ import random import time from functools import partial -from typing import Any, Dict, Iterator, List, Optional, Sequence, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union import google.protobuf.json_format import numpy as np @@ -13,10 +13,18 @@ from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models import BaseLLM from langchain_core.outputs import Generation, GenerationChunk, LLMResult -from langchain_core.pydantic_v1 import Field, root_validator +from langchain_core.pydantic_v1 import Field, root_validator, PrivateAttr from tritonclient.grpc.service_pb2 import ModelInferResponse from tritonclient.utils import np_to_triton_dtype +import gc +import torch +import tensorrt_llm +import uuid +from langchain_core.callbacks import CallbackManager +from .utils import 
(DEFAULT_HF_MODEL_DIRS, DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, load_tokenizer, read_model_name, default_prompt_template) +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner, ModelRunnerCpp +from tensorrt_llm.logger import logger class TritonTensorRTError(Exception): """Base exception for TritonTensorRT.""" @@ -405,3 +413,273 @@ def __next__(self) -> str: ) raise StopIteration() return val + +class TrtLlmAPI(BaseLLM): + model_path: Optional[str] = Field( + description="The path to the trt engine." + ) + tokenizer_dir: Optional[str] = Field( + description="The path to the trt engine." + ) + temperature: float = Field( + default=0.1, description="The temperature to use for sampling." + ) + max_new_tokens: int = Field( + default=DEFAULT_NUM_OUTPUTS, description="The maximum number of tokens to generate." + ) + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, description="The maximum number of context tokens for the model." + ) + generate_kwargs: Dict[str, Any] = Field( + default_factory=dict, description="Kwargs used for generation." + ) + model_kwargs: Dict[str, Any] = Field( + default_factory=dict, description="Kwargs used for model initialization." + ) + verbose: bool = Field(default=False, description="Whether to print verbose output.") + + _model: Any = PrivateAttr() + _model_name = PrivateAttr() + _model_version = PrivateAttr() + _model_config: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + _pad_id:Any = PrivateAttr() + _end_id: Any = PrivateAttr() + _max_new_tokens = PrivateAttr() + _max_input_tokens = PrivateAttr() + _sampling_config = PrivateAttr() + _debug_mode = PrivateAttr() + _add_special_tokens = PrivateAttr() + _verbose = PrivateAttr() + _generate_kwargs = PrivateAttr() + + def _init_attr( + self, + model_path: Optional[str] = None, + tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + temperature: float = 0.1, + max_new_tokens: int = DEFAULT_NUM_OUTPUTS, + context_window: int = DEFAULT_CONTEXT_WINDOW, + messages_to_prompt: Optional[Callable] = None, + completion_to_prompt: Optional[Callable] = None, + prompt_template = None, + callback_manager: Optional[CallbackManager] = None, + generate_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + use_py_session = True, + add_special_tokens = False, + trtLlm_debug_mode = False, + verbose: bool = False + ) -> None: + runtime_rank = tensorrt_llm.mpi_rank() + self._model_name, self._model_version = read_model_name(model_path) + if tokenizer_dir is None: + logger.warning( + "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." 
+ ) + + if self._model_name == "GemmaForCausalLM": + tokenizer_dir = 'gpt2' + else: + tokenizer_dir = DEFAULT_HF_MODEL_DIRS[self._model_name] + + self._max_input_tokens=context_window + self._add_special_tokens=add_special_tokens + self._verbose = verbose + model_kwargs = model_kwargs or {} + model_kwargs.update({"n_ctx": context_window, "verbose": verbose}) + + self._tokenizer, self._pad_id, self._end_id = load_tokenizer( + tokenizer_dir=tokenizer_dir, + vocab_file=vocab_file, + model_name=self._model_name, + model_version=self._model_version, + #tokenizer_type=args.tokenizer_type, + ) + + runner_cls = ModelRunner if use_py_session else ModelRunnerCpp + if verbose: + print(f"[ChatRTX] Trt-llm mode debug mode: {trtLlm_debug_mode}") + + runner_kwargs = dict(engine_dir=model_path, + rank=runtime_rank, + debug_mode=trtLlm_debug_mode, + lora_ckpt_source='hf') + + if not use_py_session: + runner_kwargs.update(free_gpu_memory_fraction = 0.5) + + self._model = runner_cls.from_dir(**runner_kwargs) + self._generate_kwargs = generate_kwargs or {} + self._generate_kwargs.update( + {"temperature": temperature, "max_tokens": max_new_tokens} + ) + + self._max_new_tokens = max_new_tokens + if prompt_template == None: + messages_to_prompt = None + completion_to_prompt = None + + @property + def _llm_type(self) -> str: + """Return type of LLM.""" + return "nvidia-trt-llm-api" + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + """Run the LLM on the given prompt and input.""" + # TODO: add caching here. + generations = [] + for prompt in prompts: + text = ( + self.complete_call(prompt, stop=stop, run_manager=run_manager, **kwargs) + ) + generations.append([Generation(text=text)]) + return LLMResult(generations=generations) + + def complete_call( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + self._init_attr(model_path=self.model_path, tokenizer_dir=self.tokenizer_dir, verbose=self.verbose) + self._generate_kwargs.update({"stream": False}) + is_formatted = kwargs.pop("formatted", False) + if not is_formatted: + prompt = default_prompt_template(prompt) + + if self._verbose: + print(f"[ChatRTX] Context send to LLM \n: {prompt}") + + input_text = [prompt] + batch_input_ids = self.parse_input( + tokenizer=self._tokenizer, + input_text=input_text, + prompt_template=None, + input_file=None, + add_special_tokens=self._add_special_tokens, + max_input_length=self._max_input_tokens, + pad_id=self._pad_id, + num_prepend_vtokens=None, + model_name= self._model_name, + model_version=self._model_version) + input_lengths = [x.size(0) for x in batch_input_ids] + + if self._verbose: + print(f"[ChatRTX] Number of token : {input_lengths[0]}") + + with torch.no_grad(): + outputs = self._model.generate( + batch_input_ids, + max_new_tokens=self._max_new_tokens, + max_attention_window_size=4096, + #sink_token_length=None, + end_id=self._end_id, + pad_id=self._pad_id, + temperature=1.0, + top_k=1, + top_p=0, + num_beams=1, + length_penalty=1.0, + early_stopping=False, + repetition_penalty=1.0, + presence_penalty=0.0, + frequency_penalty=0.0, + stop_words_list=None, + bad_words_list=None, + lora_uids=None, + prompt_table_path=None, + prompt_tasks=None, + streaming=False, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + + output_ids = outputs['output_ids'] + sequence_lengths = 
outputs['sequence_lengths'] + output_txt, output_token_ids = self.print_output(self._tokenizer, + output_ids, + input_lengths, + sequence_lengths) + # call garbage collected after inference + torch.cuda.empty_cache() + gc.collect() + return output_txt + + def parse_input(self, + tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=False, + max_input_length=4096, + pad_id=None, + num_prepend_vtokens=[], + model_name=None, + model_version=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + + if num_prepend_vtokens: + assert len(num_prepend_vtokens) == len(batch_input_ids) + base_vocab_size = tokenizer.vocab_size - len( + tokenizer.special_tokens_map.get('additional_special_tokens', [])) + for i, length in enumerate(num_prepend_vtokens): + batch_input_ids[i] = list( + range(base_vocab_size, + base_vocab_size + length)) + batch_input_ids[i] + + if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': + for ids in batch_input_ids: + ids.append(tokenizer.sop_token_id) + + batch_input_ids = [ + torch.tensor(x, dtype=torch.int32) for x in batch_input_ids + ] + + return batch_input_ids + + def print_output(self, + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=None, + output_npy=None, + context_logits=None, + generation_logits=None, + output_logits_npy=None): + output_text = "" + batch_size, num_beams, _ = output_ids.size() + if output_csv is None and output_npy is None: + for batch_idx in range(batch_size): + inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( + ) + for beam in range(num_beams): + output_begin = input_lengths[batch_idx] + output_end = sequence_lengths[batch_idx][beam] + outputs = output_ids[batch_idx][beam][ + output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + return output_text, output_ids diff --git a/libs/trt/langchain_nvidia_trt/utils.py b/libs/trt/langchain_nvidia_trt/utils.py new file mode 100644 index 00000000..173732a1 --- /dev/null +++ b/libs/trt/langchain_nvidia_trt/utils.py @@ -0,0 +1,96 @@ +import json +from pathlib import Path +from typing import Optional +from transformers import AutoTokenizer, T5Tokenizer +import tensorrt_llm + +# TODO(enweiz): Update for refactered models +DEFAULT_HF_MODEL_DIRS = { + 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', + 'BloomForCausalLM': 'bigscience/bloom-560m', + 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', + 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', + 'gpt': 'gpt2-medium', + 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', + 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', + 'internlm': 'internlm/internlm-chat-7b', + 'llama': 'meta-llama/Llama-2-7b-hf', + 'mpt': 'mosaicml/mpt-7b', + 'PhiForCausalLM': 'microsoft/phi-2', + 'OPTForCausalLM': 'facebook/opt-350m', + 'qwen': 'Qwen/Qwen-7B', +} +DEFAULT_CONTEXT_WINDOW = 2048 +DEFAULT_NUM_OUTPUTS = 256 + +def read_model_name(engine_dir: str): + engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + + if engine_version is None: + return 
config['builder_config']['name'], None + + model_arch = config['pretrained_config']['architecture'] + model_version = None + if model_arch == 'ChatGLMForCausalLM': + model_version = config['pretrained_config']['chatglm_version'] + return model_arch, model_version + +def load_tokenizer(tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + model_name: str = 'gpt', + model_version: Optional[str] = None, + tokenizer_type: Optional[str] = None): + if vocab_file is None: + use_fast = True + if tokenizer_type is not None and tokenizer_type == "llama": + use_fast = False + # Should set both padding_side and truncation_side to be 'left' + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + padding_side='left', + truncation_side='left', + trust_remote_code=True, + tokenizer_type=tokenizer_type, + use_fast=use_fast) + else: + # For gpt-next, directly load from tokenizer.model + tokenizer = T5Tokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False) + + if model_name == 'qwen': + with open(Path(tokenizer_dir) / "generation_config.json") as f: + gen_config = json.load(f) + chat_format = gen_config['chat_format'] + if chat_format == 'raw': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif chat_format == 'chatml': + pad_id = tokenizer.im_end_id + end_id = tokenizer.im_end_id + else: + raise Exception(f"unknown chat format: {chat_format}") + elif model_name == 'ChatGLMForCausalLM' and model_version == 'glm': + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eop_token_id + elif model_name == 'GemmaForCausalLM': + tokenizer.eos_token_id = tokenizer.sp_model.eos_id() + tokenizer.bos_token_id = tokenizer.sp_model.bos_id() + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eos_token_id + else: + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eos_token_id + + return tokenizer, pad_id, end_id + +def default_prompt_template(query: str): + text_qa_template_str = "[INST] {query_str} [/INST]" + formatted_str = text_qa_template_str.format(query_str=query) + return formatted_str From ae6569dce8e2b0f382b718944c688bbd370b0329 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 7 Mar 2024 17:16:39 -0800 Subject: [PATCH 2/7] Add Langchain connector for TensorRT-LLM API - Tested support for Mistral and Llama2 with TRT-LLM v0.9.0 --- libs/trt/docs/trtllmapi.ipynb | 149 +++++++++++++ libs/trt/langchain_nvidia_trt/__init__.py | 4 +- libs/trt/langchain_nvidia_trt/llms.py | 252 +++++++++++++++++++++- libs/trt/langchain_nvidia_trt/utils.py | 75 +++++++ libs/trt/pyproject.toml | 6 +- 5 files changed, 479 insertions(+), 7 deletions(-) create mode 100644 libs/trt/docs/trtllmapi.ipynb create mode 100644 libs/trt/langchain_nvidia_trt/utils.py diff --git a/libs/trt/docs/trtllmapi.ipynb b/libs/trt/docs/trtllmapi.ipynb new file mode 100644 index 00000000..0f4a1fab --- /dev/null +++ b/libs/trt/docs/trtllmapi.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf9d3415-fe08-4cc0-bbb8-b582cb01e754", + "metadata": {}, + "source": [ + "# NVIDIA TensorRT-LLM\n", + "TensorRT-LLM provides users with an easy-to-use Python API to define Large Language Models (LLMs) and build TensorRT engines that contain state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.\n", + "
\n", + "[TensorRT-LLM Github](https://github.com/NVIDIA/TensorRT-LLM)\n", + "\n", + "## TensorRT-LLM environment setup\n", + "Since TensorRT-LLM is a SDK for interacting with local models in process there are a few environment steps that must be followed to ensure that the TensorRT-LLM setup can be used.\n", + "
\n", + "1. Install `tensorrt_llm` following the instruction on [TensorRT-LLM Github](https://github.com/NVIDIA/TensorRT-LLM). Llama2 and Mistral models are supported with this connector. The following steps are shown for Llama2\n", + "2. Ensure you have access to the Llama 2 [repository on huggingface](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)" + ] + }, + { + "cell_type": "markdown", + "id": "585c521c-d2ab-407c-b3fb-768c1afd0922", + "metadata": {}, + "source": [ + "## Langchain-nvidia-trt setup\n", + "To install from source:\n", + "1. `cd langchain-nvidia/libs/trt`\n", + "2. `!pip install -e .`" + ] + }, + { + "cell_type": "markdown", + "id": "01e68c59-1245-4ab3-a7c8-0ba54619f873", + "metadata": {}, + "source": [ + "## Use TensorRT-LLM to create engine files for the model\n", + "Llama2 and Mistral models are supported. The following steps are shown for Llama2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeeb384f-effa-4d1a-9bf0-3bf748daeb73", + "metadata": {}, + "outputs": [], + "source": [ + "from tensorrt_llm import LLM, ModelConfig\n", + "from huggingface_hub import snapshot_download\n", + "\n", + "#Download the Llama2 model\n", + "model_dir = snapshot_download(repo_id=\"meta-llama/Llama-2-7b-chat-hf\",token=\"\")\n", + "\n", + "# Load the model via LLM and save the .engine file\n", + "# Please restart the kernel after saving the .engine file\n", + "# to prevent OOM errors with the torch and engine loaded\n", + "config = ModelConfig(model_dir=model_dir)\n", + "llm = LLM(config)\n", + "llm.save(\"./model\")\n", + "#Plug this path to the TrtLlmAPI" + ] + }, + { + "cell_type": "markdown", + "id": "88f77f91-fced-408b-95ff-c9b604b9c188", + "metadata": {}, + "source": [ + "### Building engine files for Windows users\n", + "Instead of using the steps above, build the engine files using the following steps:\n", + "1. Clone the [TensorRT-LLM Github](https://github.com/NVIDIA/TensorRT-LLM) repository\n", + "2. Change directory to [examples/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) for LLama models\n", + "3. Convert model to checkpoint format and build the engine using the following commands" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0de70883-215a-40aa-a12b-e684536dd495", + "metadata": {}, + "outputs": [], + "source": [ + "# Build the LLaMA 7B model using a single GPU and FP16.\n", + "python convert_checkpoint.py --model_dir ./tmp/llama/7B/ \\\n", + " --output_dir ./tllm_checkpoint_1gpu_fp16 \\\n", + " --dtype float16\n", + "\n", + "trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp16 \\\n", + " --output_dir ./tmp/llama/7B/trt_engines/fp16/1-gpu \\\n", + " --gemm_plugin float16 \\\n", + " --context_fmha disable \\\n", + " --context_fmha_fp32_acc enable" + ] + }, + { + "cell_type": "markdown", + "id": "49e836a3-be37-416a-8af6-5e53562789e4", + "metadata": {}, + "source": [ + "## Create the TrtLlmAPI instance\n", + "When setting up an LLM object, provide the model directory where the engine built is placed, tokenizer path to the cloned huggingface repository and temperature to specify the desired deterministic nature of the responses. Call `invoke` with a prompt. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908ac6ac-b728-47d2-ae41-11a5cf4c1057", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_nvidia_trt.llms import TrtLlmAPI\n", + "from langchain_core.prompts import PromptTemplate\n", + "\n", + "template = \"\"\"Question: {question}\n", + "\n", + "Answer: Let's think step by step.\"\"\"\n", + "\n", + "prompt = PromptTemplate.from_template(template)\n", + "\n", + "llm = TrtLlmAPI(\n", + " model_path=\"./model\",\n", + " tokenizer_dir=\"./model\",\n", + " temperature=1.0\n", + ")\n", + "chain = prompt | llm\n", + "print(chain.invoke({\"question\": \"Who is Paul Graham?\"}))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/trt/langchain_nvidia_trt/__init__.py b/libs/trt/langchain_nvidia_trt/__init__.py index 5d89779e..d666a969 100644 --- a/libs/trt/langchain_nvidia_trt/__init__.py +++ b/libs/trt/langchain_nvidia_trt/__init__.py @@ -1,3 +1,3 @@ -from langchain_nvidia_trt.llms import TritonTensorRTLLM +from langchain_nvidia_trt.llms import (TritonTensorRTLLM, TrtLlmAPI) -__all__ = ["TritonTensorRTLLM"] +__all__ = ["TritonTensorRTLLM", "TrtLlmAPI"] diff --git a/libs/trt/langchain_nvidia_trt/llms.py b/libs/trt/langchain_nvidia_trt/llms.py index 0ea1fca1..12e24f95 100644 --- a/libs/trt/langchain_nvidia_trt/llms.py +++ b/libs/trt/langchain_nvidia_trt/llms.py @@ -5,7 +5,7 @@ import random import time from functools import partial -from typing import Any, Dict, Iterator, List, Optional, Sequence, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union import google.protobuf.json_format import numpy as np @@ -13,10 +13,18 @@ from langchain_core.callbacks import CallbackManagerForLLMRun from langchain_core.language_models import BaseLLM from langchain_core.outputs import Generation, GenerationChunk, LLMResult -from langchain_core.pydantic_v1 import Field, root_validator +from langchain_core.pydantic_v1 import Field, root_validator, PrivateAttr from tritonclient.grpc.service_pb2 import ModelInferResponse from tritonclient.utils import np_to_triton_dtype +import gc +import torch +import tensorrt_llm +import uuid +from langchain_core.callbacks import CallbackManager +from .utils import (DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, load_tokenizer, read_model_name) +from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner, ModelRunnerCpp +from tensorrt_llm.logger import logger class TritonTensorRTError(Exception): """Base exception for TritonTensorRT.""" @@ -405,3 +413,243 @@ def __next__(self) -> str: ) raise StopIteration() return val + +class TrtLlmAPI(BaseLLM): + model_path: Optional[str] = Field( + description="The path to the trt engine." + ) + tokenizer_dir: Optional[str] = Field( + description="The path to the trt engine." + ) + temperature: float = Field( + default=0.1, description="The temperature to use for sampling." + ) + max_new_tokens: int = Field( + default=DEFAULT_NUM_OUTPUTS, description="The maximum number of tokens to generate." 
+ ) + context_window: int = Field( + default=DEFAULT_CONTEXT_WINDOW, description="The maximum number of context tokens for the model." + ) + verbose: bool = Field(default=False, description="Whether to print verbose output.") + + _model: Any = PrivateAttr() + _model_name = PrivateAttr() + _model_version = PrivateAttr() + _model_config: Any = PrivateAttr() + _tokenizer: Any = PrivateAttr() + _pad_id:Any = PrivateAttr() + _end_id: Any = PrivateAttr() + _max_new_tokens = PrivateAttr() + _max_input_tokens = PrivateAttr() + _sampling_config = PrivateAttr() + _debug_mode = PrivateAttr() + _add_special_tokens = PrivateAttr() + _verbose = PrivateAttr() + + def _init_attr( + self, + model_path: Optional[str] = None, + tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + temperature: float = 0.1, + max_new_tokens: int = DEFAULT_NUM_OUTPUTS, + context_window: int = DEFAULT_CONTEXT_WINDOW, + callback_manager: Optional[CallbackManager] = None, + use_py_session = True, + add_special_tokens = False, + trtLlm_debug_mode = False, + verbose: bool = False + ) -> None: + runtime_rank = tensorrt_llm.mpi_rank() + self._model_name, self._model_version = read_model_name(model_path) + if tokenizer_dir is None: + logger.error( + "tokenizer_dir is not specified." + ) + + self._max_input_tokens=context_window + self._add_special_tokens=add_special_tokens + self._verbose = verbose + + self._tokenizer, self._pad_id, self._end_id = load_tokenizer( + tokenizer_dir=tokenizer_dir, + vocab_file=vocab_file, + model_name=self._model_name, + model_version=self._model_version, + ) + + runner_cls = ModelRunner if use_py_session else ModelRunnerCpp + if verbose: + logger.info(f"Trt-llm mode debug mode: {trtLlm_debug_mode}") + + runner_kwargs = dict(engine_dir=model_path, + rank=runtime_rank, + debug_mode=trtLlm_debug_mode, + lora_ckpt_source='hf') + + if not use_py_session: + runner_kwargs.update(free_gpu_memory_fraction = 0.5) + + self._model = runner_cls.from_dir(**runner_kwargs) + + self._max_new_tokens = max_new_tokens + + @property + def _llm_type(self) -> str: + """Return type of LLM.""" + return "nvidia-trt-llm-api" + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + """Run the LLM on the given prompt and input.""" + generations = [] + for prompt in prompts: + text = ( + self.complete_call(prompt, stop=stop, run_manager=run_manager, **kwargs) + ) + generations.append([Generation(text=text)]) + return LLMResult(generations=generations) + + def complete_call( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + self._init_attr(model_path=self.model_path, + tokenizer_dir=self.tokenizer_dir, + verbose=self.verbose, + temperature=self.temperature) + + if self._verbose: + logger.info(f"Context send to LLM \n: {prompt}") + + input_text = [prompt] + batch_input_ids = self.parse_input( + tokenizer=self._tokenizer, + input_text=input_text, + prompt_template=None, + input_file=None, + add_special_tokens=self._add_special_tokens, + max_input_length=self._max_input_tokens, + pad_id=self._pad_id, + num_prepend_vtokens=None, + model_name= self._model_name, + model_version=self._model_version) + input_lengths = [x.size(0) for x in batch_input_ids] + + if self._verbose: + logger.info(f"Number of token : {input_lengths[0]}") + + with torch.no_grad(): + outputs = self._model.generate( + 
batch_input_ids, + max_new_tokens=self._max_new_tokens, + max_attention_window_size=4096, + end_id=self._end_id, + pad_id=self._pad_id, + temperature=self.temperature, + top_k=1, + top_p=0, + num_beams=1, + length_penalty=1.0, + early_stopping=False, + repetition_penalty=1.0, + presence_penalty=0.0, + frequency_penalty=0.0, + stop_words_list=None, + bad_words_list=None, + lora_uids=None, + prompt_table_path=None, + prompt_tasks=None, + streaming=False, + output_sequence_lengths=True, + return_dict=True) + torch.cuda.synchronize() + + output_ids = outputs['output_ids'] + sequence_lengths = outputs['sequence_lengths'] + output_txt, output_token_ids = self.print_output(self._tokenizer, + output_ids, + input_lengths, + sequence_lengths) + # call garbage collected after inference + torch.cuda.empty_cache() + gc.collect() + return output_txt + + def parse_input(self, + tokenizer, + input_text=None, + prompt_template=None, + input_file=None, + add_special_tokens=False, + max_input_length=4096, + pad_id=None, + num_prepend_vtokens=[], + model_name=None, + model_version=None): + if pad_id is None: + pad_id = tokenizer.pad_token_id + + batch_input_ids = [] + if input_file is None: + for curr_text in input_text: + if prompt_template is not None: + curr_text = prompt_template.format(input_text=curr_text) + input_ids = tokenizer.encode(curr_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length) + batch_input_ids.append(input_ids) + + if num_prepend_vtokens: + assert len(num_prepend_vtokens) == len(batch_input_ids) + base_vocab_size = tokenizer.vocab_size - len( + tokenizer.special_tokens_map.get('additional_special_tokens', [])) + for i, length in enumerate(num_prepend_vtokens): + batch_input_ids[i] = list( + range(base_vocab_size, + base_vocab_size + length)) + batch_input_ids[i] + + if model_name == 'ChatGLMForCausalLM' and model_version == 'glm': + for ids in batch_input_ids: + ids.append(tokenizer.sop_token_id) + + batch_input_ids = [ + torch.tensor(x, dtype=torch.int32) for x in batch_input_ids + ] + + return batch_input_ids + + def print_output(self, + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=None, + output_npy=None, + context_logits=None, + generation_logits=None, + output_logits_npy=None): + output_text = "" + batch_size, num_beams, _ = output_ids.size() + if output_csv is None and output_npy is None: + for batch_idx in range(batch_size): + inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( + ) + for beam in range(num_beams): + output_begin = input_lengths[batch_idx] + output_end = sequence_lengths[batch_idx][beam] + outputs = output_ids[batch_idx][beam][ + output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + return output_text, output_ids diff --git a/libs/trt/langchain_nvidia_trt/utils.py b/libs/trt/langchain_nvidia_trt/utils.py new file mode 100644 index 00000000..dd29c7b0 --- /dev/null +++ b/libs/trt/langchain_nvidia_trt/utils.py @@ -0,0 +1,75 @@ +import json +from pathlib import Path +from typing import Optional +from transformers import AutoTokenizer, T5Tokenizer +import tensorrt_llm + +DEFAULT_CONTEXT_WINDOW = 2048 +DEFAULT_NUM_OUTPUTS = 256 + +def read_model_name(engine_dir: str): + engine_version = tensorrt_llm.builder.get_engine_version(engine_dir) + + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + + if engine_version is None: + return config['builder_config']['name'], None 
+ + model_arch = config['pretrained_config']['architecture'] + model_version = None + if model_arch == 'ChatGLMForCausalLM': + model_version = config['pretrained_config']['chatglm_version'] + return model_arch, model_version + +def load_tokenizer(tokenizer_dir: Optional[str] = None, + vocab_file: Optional[str] = None, + model_name: str = 'gpt', + model_version: Optional[str] = None, + tokenizer_type: Optional[str] = None): + if vocab_file is None: + use_fast = True + if tokenizer_type is not None and tokenizer_type == "llama": + use_fast = False + # Should set both padding_side and truncation_side to be 'left' + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + padding_side='left', + truncation_side='left', + trust_remote_code=True, + tokenizer_type=tokenizer_type, + use_fast=use_fast) + else: + # For gpt-next, directly load from tokenizer.model + tokenizer = T5Tokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False) + + if model_name == 'qwen': + with open(Path(tokenizer_dir) / "generation_config.json") as f: + gen_config = json.load(f) + chat_format = gen_config['chat_format'] + if chat_format == 'raw': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif chat_format == 'chatml': + pad_id = tokenizer.im_end_id + end_id = tokenizer.im_end_id + else: + raise Exception(f"unknown chat format: {chat_format}") + elif model_name == 'ChatGLMForCausalLM' and model_version == 'glm': + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eop_token_id + elif model_name == 'GemmaForCausalLM': + tokenizer.eos_token_id = tokenizer.sp_model.eos_id() + tokenizer.bos_token_id = tokenizer.sp_model.bos_id() + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eos_token_id + else: + if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id + pad_id = tokenizer.pad_token_id + end_id = tokenizer.eos_token_id + + return tokenizer, pad_id, end_id diff --git a/libs/trt/pyproject.toml b/libs/trt/pyproject.toml index 9846299c..fe3f9d20 100644 --- a/libs/trt/pyproject.toml +++ b/libs/trt/pyproject.toml @@ -28,7 +28,7 @@ pytest-mock = "^3.10.0" syrupy = "^4.0.2" pytest-watcher = "^0.3.4" pytest-asyncio = "^0.21.1" -langchain-core = { path = "../../core", develop = true } +langchain-core = "^0.1.30" [tool.poetry.group.codespell] optional = true @@ -49,13 +49,13 @@ ruff = "^0.1.5" [tool.poetry.group.typing.dependencies] mypy = "^0.991" -langchain-core = { path = "../../core", develop = true } +langchain-core = "^0.1.30" [tool.poetry.group.dev] optional = true [tool.poetry.group.dev.dependencies] -langchain-core = { path = "../../core", develop = true } +langchain-core = "^0.1.30" [tool.ruff.lint] select = [ From 2972204d36f0bfe81e213633b81bdf632c133c3e Mon Sep 17 00:00:00 2001 From: araza008 <159492532+araza008@users.noreply.github.com> Date: Wed, 1 May 2024 22:31:57 -0500 Subject: [PATCH 3/7] Update pyproject.toml to remove whitespace --- libs/trt/pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/libs/trt/pyproject.toml b/libs/trt/pyproject.toml index dd8260fb..fe3f9d20 100644 --- a/libs/trt/pyproject.toml +++ b/libs/trt/pyproject.toml @@ -30,7 +30,6 @@ pytest-watcher = "^0.3.4" pytest-asyncio = "^0.21.1" langchain-core = "^0.1.30" - [tool.poetry.group.codespell] optional = true @@ -52,14 +51,12 @@ ruff = "^0.1.5" mypy = "^0.991" langchain-core = "^0.1.30" - [tool.poetry.group.dev] optional = true [tool.poetry.group.dev.dependencies] langchain-core = "^0.1.30" - [tool.ruff.lint] 
select = [ "E", # pycodestyle From 17fb707e1d26ff70f71b4ff860e756f2d06fcf71 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 May 2024 12:29:41 -0700 Subject: [PATCH 4/7] Update poetry.lock to reflect pyproject.toml changes --- libs/trt/poetry.lock | 125 +++++++++++++++++++++++++------------------ 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/libs/trt/poetry.lock b/libs/trt/poetry.lock index d6f5f038..5027b7c9 100644 --- a/libs/trt/poetry.lock +++ b/libs/trt/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -14,28 +14,6 @@ files = [ [package.dependencies] typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} -[[package]] -name = "anyio" -version = "4.2.0" -description = "High level compatibility layer for multiple asynchronous event loop implementations" -optional = false -python-versions = ">=3.8" -files = [ - {file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"}, - {file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"}, -] - -[package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} -idna = ">=2.8" -sniffio = ">=1.1" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} - -[package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (>=0.23)"] - [[package]] name = "astroid" version = "3.0.3" @@ -391,42 +369,39 @@ files = [ [[package]] name = "langchain-core" -version = "0.1.23" +version = "0.1.51" description = "Building applications with LLMs through composability" optional = false -python-versions = ">=3.8.1,<4.0" -files = [] -develop = true +python-versions = "<4.0,>=3.8.1" +files = [ + {file = "langchain_core-0.1.51-py3-none-any.whl", hash = "sha256:3058bdb04d43a8eaae2e249365fe2e8d0356a09c7b2c1afa1a8100f8888da4fa"}, + {file = "langchain_core-0.1.51.tar.gz", hash = "sha256:f7ea116f939be9e74c385baf95d6c84cd7a402b59c2c1893fc054bf98abbefc2"}, +] [package.dependencies] -anyio = ">=3,<5" -jsonpatch = "^1.33" -langsmith = "^0.0.87" -packaging = "^23.2" +jsonpatch = ">=1.33,<2.0" +langsmith = ">=0.1.0,<0.2.0" +packaging = ">=23.2,<24.0" pydantic = ">=1,<3" PyYAML = ">=5.3" -requests = "^2" -tenacity = "^8.1.0" +tenacity = ">=8.1.0,<9.0.0" [package.extras] extended-testing = ["jinja2 (>=3,<4)"] -[package.source] -type = "directory" -url = "../../core" - [[package]] name = "langsmith" -version = "0.0.87" +version = "0.1.54" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false -python-versions = ">=3.8.1,<4.0" +python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.0.87-py3-none-any.whl", hash = "sha256:8903d3811b9fc89eb18f5961c8e6935fbd2d0f119884fbf30dc70b8f8f4121fc"}, - {file = "langsmith-0.0.87.tar.gz", hash = "sha256:36c4cc47e5b54be57d038036a30fb19ce6e4c73048cd7a464b8f25b459694d34"}, + {file = "langsmith-0.1.54-py3-none-any.whl", hash = "sha256:e8ba2758dbdff0fccb35337c28a5ab641dd980b22e178d390b72a15c9ae9caff"}, + {file = "langsmith-0.1.54.tar.gz", hash = "sha256:86f5a90e48303de897f37a893f8bb635eabdaf23e674099e8bc0f2e9ca2f8faf"}, ] [package.dependencies] +orjson = ">=3.9.14,<4.0.0" pydantic = ">=1,<3" requests = ">=2,<3" @@ -553,6 +528,61 @@ files = [ {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, ] +[[package]] +name = "orjson" +version = "3.10.3" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.8" +files = [ + {file = "orjson-3.10.3-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9fb6c3f9f5490a3eb4ddd46fc1b6eadb0d6fc16fb3f07320149c3286a1409dd8"}, + {file = "orjson-3.10.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:252124b198662eee80428f1af8c63f7ff077c88723fe206a25df8dc57a57b1fa"}, + {file = "orjson-3.10.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f3e87733823089a338ef9bbf363ef4de45e5c599a9bf50a7a9b82e86d0228da"}, + {file = "orjson-3.10.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c8334c0d87103bb9fbbe59b78129f1f40d1d1e8355bbed2ca71853af15fa4ed3"}, + {file = "orjson-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1952c03439e4dce23482ac846e7961f9d4ec62086eb98ae76d97bd41d72644d7"}, + {file = "orjson-3.10.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c0403ed9c706dcd2809f1600ed18f4aae50be263bd7112e54b50e2c2bc3ebd6d"}, + {file = "orjson-3.10.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:382e52aa4270a037d41f325e7d1dfa395b7de0c367800b6f337d8157367bf3a7"}, + {file = "orjson-3.10.3-cp310-none-win32.whl", hash = "sha256:be2aab54313752c04f2cbaab4515291ef5af8c2256ce22abc007f89f42f49109"}, + {file = "orjson-3.10.3-cp310-none-win_amd64.whl", hash = "sha256:416b195f78ae461601893f482287cee1e3059ec49b4f99479aedf22a20b1098b"}, + {file = "orjson-3.10.3-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:73100d9abbbe730331f2242c1fc0bcb46a3ea3b4ae3348847e5a141265479700"}, + {file = "orjson-3.10.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:544a12eee96e3ab828dbfcb4d5a0023aa971b27143a1d35dc214c176fdfb29b3"}, + {file = "orjson-3.10.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:520de5e2ef0b4ae546bea25129d6c7c74edb43fc6cf5213f511a927f2b28148b"}, + {file = "orjson-3.10.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccaa0a401fc02e8828a5bedfd80f8cd389d24f65e5ca3954d72c6582495b4bcf"}, + {file = "orjson-3.10.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a7bc9e8bc11bac40f905640acd41cbeaa87209e7e1f57ade386da658092dc16"}, + {file = "orjson-3.10.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3582b34b70543a1ed6944aca75e219e1192661a63da4d039d088a09c67543b08"}, + {file = "orjson-3.10.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:1c23dfa91481de880890d17aa7b91d586a4746a4c2aa9a145bebdbaf233768d5"}, + {file = "orjson-3.10.3-cp311-none-win32.whl", hash = "sha256:1770e2a0eae728b050705206d84eda8b074b65ee835e7f85c919f5705b006c9b"}, + {file = "orjson-3.10.3-cp311-none-win_amd64.whl", hash = "sha256:93433b3c1f852660eb5abdc1f4dd0ced2be031ba30900433223b28ee0140cde5"}, + {file = "orjson-3.10.3-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:a39aa73e53bec8d410875683bfa3a8edf61e5a1c7bb4014f65f81d36467ea098"}, + {file = "orjson-3.10.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0943a96b3fa09bee1afdfccc2cb236c9c64715afa375b2af296c73d91c23eab2"}, + {file = "orjson-3.10.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e852baafceff8da3c9defae29414cc8513a1586ad93e45f27b89a639c68e8176"}, + {file = "orjson-3.10.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18566beb5acd76f3769c1d1a7ec06cdb81edc4d55d2765fb677e3eaa10fa99e0"}, + {file = "orjson-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bd2218d5a3aa43060efe649ec564ebedec8ce6ae0a43654b81376216d5ebd42"}, + {file = "orjson-3.10.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:cf20465e74c6e17a104ecf01bf8cd3b7b252565b4ccee4548f18b012ff2f8069"}, + {file = "orjson-3.10.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ba7f67aa7f983c4345eeda16054a4677289011a478ca947cd69c0a86ea45e534"}, + {file = "orjson-3.10.3-cp312-none-win32.whl", hash = "sha256:17e0713fc159abc261eea0f4feda611d32eabc35708b74bef6ad44f6c78d5ea0"}, + {file = "orjson-3.10.3-cp312-none-win_amd64.whl", hash = "sha256:4c895383b1ec42b017dd2c75ae8a5b862fc489006afde06f14afbdd0309b2af0"}, + {file = "orjson-3.10.3-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:be2719e5041e9fb76c8c2c06b9600fe8e8584e6980061ff88dcbc2691a16d20d"}, + {file = "orjson-3.10.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0175a5798bdc878956099f5c54b9837cb62cfbf5d0b86ba6d77e43861bcec2"}, + {file = "orjson-3.10.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:978be58a68ade24f1af7758626806e13cff7748a677faf95fbb298359aa1e20d"}, + {file = "orjson-3.10.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16bda83b5c61586f6f788333d3cf3ed19015e3b9019188c56983b5a299210eb5"}, + {file = "orjson-3.10.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ad1f26bea425041e0a1adad34630c4825a9e3adec49079b1fb6ac8d36f8b754"}, + {file = "orjson-3.10.3-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9e253498bee561fe85d6325ba55ff2ff08fb5e7184cd6a4d7754133bd19c9195"}, + {file = "orjson-3.10.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0a62f9968bab8a676a164263e485f30a0b748255ee2f4ae49a0224be95f4532b"}, + {file = "orjson-3.10.3-cp38-none-win32.whl", hash = "sha256:8d0b84403d287d4bfa9bf7d1dc298d5c1c5d9f444f3737929a66f2fe4fb8f134"}, + {file = "orjson-3.10.3-cp38-none-win_amd64.whl", hash = "sha256:8bc7a4df90da5d535e18157220d7915780d07198b54f4de0110eca6b6c11e290"}, + {file = "orjson-3.10.3-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:9059d15c30e675a58fdcd6f95465c1522b8426e092de9fff20edebfdc15e1cb0"}, + {file = "orjson-3.10.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d40c7f7938c9c2b934b297412c067936d0b54e4b8ab916fd1a9eb8f54c02294"}, + {file = 
"orjson-3.10.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4a654ec1de8fdaae1d80d55cee65893cb06494e124681ab335218be6a0691e7"}, + {file = "orjson-3.10.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:831c6ef73f9aa53c5f40ae8f949ff7681b38eaddb6904aab89dca4d85099cb78"}, + {file = "orjson-3.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99b880d7e34542db89f48d14ddecbd26f06838b12427d5a25d71baceb5ba119d"}, + {file = "orjson-3.10.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2e5e176c994ce4bd434d7aafb9ecc893c15f347d3d2bbd8e7ce0b63071c52e25"}, + {file = "orjson-3.10.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b69a58a37dab856491bf2d3bbf259775fdce262b727f96aafbda359cb1d114d8"}, + {file = "orjson-3.10.3-cp39-none-win32.whl", hash = "sha256:b8d4d1a6868cde356f1402c8faeb50d62cee765a1f7ffcfd6de732ab0581e063"}, + {file = "orjson-3.10.3-cp39-none-win_amd64.whl", hash = "sha256:5102f50c5fc46d94f2033fe00d392588564378260d64377aec702f21a7a22912"}, + {file = "orjson-3.10.3.tar.gz", hash = "sha256:2b166507acae7ba2f7c315dcf185a9111ad5e992ac81f2d507aac39193c2c818"}, +] + [[package]] name = "packaging" version = "23.2" @@ -1050,17 +1080,6 @@ files = [ {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, ] -[[package]] -name = "sniffio" -version = "1.3.0" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -files = [ - {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, - {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, -] - [[package]] name = "syrupy" version = "4.6.1" @@ -1220,4 +1239,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "0e732a2a4dbf0fcdd6dfb5d3764f630df29bb57c06f487c4d4f32dfeb2d11660" +content-hash = "8932071a5e0a441d7f51044c769c6cdf25a4ee2608658b2644c226df187a7d80" From cbfc6e730e6d6e93df96e20bfea5ac34945681b3 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 May 2024 12:50:51 -0700 Subject: [PATCH 5/7] Resolving misc errors pertaining to unused imports --- libs/trt/langchain_nvidia_trt/llms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/trt/langchain_nvidia_trt/llms.py b/libs/trt/langchain_nvidia_trt/llms.py index 12e24f95..fce61ef9 100644 --- a/libs/trt/langchain_nvidia_trt/llms.py +++ b/libs/trt/langchain_nvidia_trt/llms.py @@ -5,7 +5,7 @@ import random import time from functools import partial -from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Union +from typing import Any, Dict, Iterator, List, Optional, Sequence, Union import google.protobuf.json_format import numpy as np From 52af721b366e051b365b2454e35c82ad60795bb2 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 May 2024 15:08:40 -0700 Subject: [PATCH 6/7] Resolve lint errors reported --- libs/trt/langchain_nvidia_trt/__init__.py | 2 +- libs/trt/langchain_nvidia_trt/llms.py | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/libs/trt/langchain_nvidia_trt/__init__.py b/libs/trt/langchain_nvidia_trt/__init__.py index d666a969..01265a64 100644 --- a/libs/trt/langchain_nvidia_trt/__init__.py +++ b/libs/trt/langchain_nvidia_trt/__init__.py @@ -1,3 +1,3 @@ -from langchain_nvidia_trt.llms import (TritonTensorRTLLM, 
TrtLlmAPI) +from langchain_nvidia_trt.llms import TritonTensorRTLLM, TrtLlmAPI __all__ = ["TritonTensorRTLLM", "TrtLlmAPI"] diff --git a/libs/trt/langchain_nvidia_trt/llms.py b/libs/trt/langchain_nvidia_trt/llms.py index fce61ef9..f0553097 100644 --- a/libs/trt/langchain_nvidia_trt/llms.py +++ b/libs/trt/langchain_nvidia_trt/llms.py @@ -20,10 +20,14 @@ import gc import torch import tensorrt_llm -import uuid from langchain_core.callbacks import CallbackManager -from .utils import (DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, load_tokenizer, read_model_name) -from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner, ModelRunnerCpp +from .utils import ( + DEFAULT_CONTEXT_WINDOW, + DEFAULT_NUM_OUTPUTS, + load_tokenizer, + read_model_name + ) +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp from tensorrt_llm.logger import logger class TritonTensorRTError(Exception): @@ -425,12 +429,17 @@ class TrtLlmAPI(BaseLLM): default=0.1, description="The temperature to use for sampling." ) max_new_tokens: int = Field( - default=DEFAULT_NUM_OUTPUTS, description="The maximum number of tokens to generate." + default=DEFAULT_NUM_OUTPUTS, + description="The maximum number of tokens to generate." ) context_window: int = Field( - default=DEFAULT_CONTEXT_WINDOW, description="The maximum number of context tokens for the model." + default=DEFAULT_CONTEXT_WINDOW, + description="The maximum number of context tokens for the model." + ) + verbose: bool = Field( + default=False, + description="Whether to print verbose output." ) - verbose: bool = Field(default=False, description="Whether to print verbose output.") _model: Any = PrivateAttr() _model_name = PrivateAttr() @@ -642,8 +651,6 @@ def print_output(self, batch_size, num_beams, _ = output_ids.size() if output_csv is None and output_npy is None: for batch_idx in range(batch_size): - inputs = output_ids[batch_idx][0][:input_lengths[batch_idx]].tolist( - ) for beam in range(num_beams): output_begin = input_lengths[batch_idx] output_end = sequence_lengths[batch_idx][beam] From 28f97d1d9b0637ccc948bfcd2a5b26814004db8e Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 May 2024 15:48:37 -0700 Subject: [PATCH 7/7] Resolve import format related errors caught by lint --- libs/trt/langchain_nvidia_trt/llms.py | 20 ++++++++++---------- libs/trt/langchain_nvidia_trt/utils.py | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/libs/trt/langchain_nvidia_trt/llms.py b/libs/trt/langchain_nvidia_trt/llms.py index f0553097..de5340c5 100644 --- a/libs/trt/langchain_nvidia_trt/llms.py +++ b/libs/trt/langchain_nvidia_trt/llms.py @@ -1,5 +1,6 @@ from __future__ import annotations +import gc import json import queue import random @@ -9,26 +10,25 @@ import google.protobuf.json_format import numpy as np +import tensorrt_llm +import torch import tritonclient.grpc as grpcclient -from langchain_core.callbacks import CallbackManagerForLLMRun +from langchain_core.callbacks import CallbackManager, CallbackManagerForLLMRun from langchain_core.language_models import BaseLLM from langchain_core.outputs import Generation, GenerationChunk, LLMResult -from langchain_core.pydantic_v1 import Field, root_validator, PrivateAttr +from langchain_core.pydantic_v1 import Field, PrivateAttr, root_validator +from tensorrt_llm.logger import logger +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp from tritonclient.grpc.service_pb2 import ModelInferResponse from tritonclient.utils import np_to_triton_dtype -import gc -import torch -import tensorrt_llm -from 
langchain_core.callbacks import CallbackManager from .utils import ( DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS, load_tokenizer, - read_model_name - ) -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp -from tensorrt_llm.logger import logger + read_model_name, +) + class TritonTensorRTError(Exception): """Base exception for TritonTensorRT.""" diff --git a/libs/trt/langchain_nvidia_trt/utils.py b/libs/trt/langchain_nvidia_trt/utils.py index dd29c7b0..8202c33e 100644 --- a/libs/trt/langchain_nvidia_trt/utils.py +++ b/libs/trt/langchain_nvidia_trt/utils.py @@ -1,8 +1,9 @@ import json from pathlib import Path from typing import Optional -from transformers import AutoTokenizer, T5Tokenizer + import tensorrt_llm +from transformers import AutoTokenizer, T5Tokenizer DEFAULT_CONTEXT_WINDOW = 2048 DEFAULT_NUM_OUTPUTS = 256
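
For quick reference, a minimal usage sketch of the `TrtLlmAPI` connector introduced by this series, condensed from `docs/trtllmapi.ipynb`. The `./model` path is a placeholder and assumes a TensorRT-LLM engine and its tokenizer files have already been saved there as shown in the notebook.

# Minimal sketch: prompt a locally built TensorRT-LLM engine through the
# TrtLlmAPI connector. Paths below are placeholders.
from langchain_core.prompts import PromptTemplate

from langchain_nvidia_trt.llms import TrtLlmAPI

template = """Question: {question}

Answer: Let's think step by step."""
prompt = PromptTemplate.from_template(template)

# model_path is the directory holding the built engine files;
# tokenizer_dir points at the matching tokenizer (here, the same directory).
llm = TrtLlmAPI(
    model_path="./model",
    tokenizer_dir="./model",
    temperature=1.0,
)

chain = prompt | llm
print(chain.invoke({"question": "Who is Paul Graham?"}))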