diff --git a/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb new file mode 100644 index 00000000..c301fb8e --- /dev/null +++ b/libs/ai-endpoints/docs/llms/nvidia_ai_endpoints.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NVIDIA NIMs\n", + "\n", + ":::caution\n", + "You are currently on a page documenting the use of models as [text completion models](/docs/concepts/#llms).\n", + "Many popular models are [chat completion models](/docs/concepts/#chat-models).\n", + "\n", + "To use chat completion models, use [ChatNVIDIA](/docs/integrations/chat/nvidia_ai_endpoints/) instead.\n", + ":::\n", + "\n", + "The `langchain-nvidia-ai-endpoints` package contains LangChain integrations for building applications with models on \n", + "NVIDIA NIM inference microservice. NIM supports models across domains like chat, completion, embedding, and re-ranking models \n", + "from the community as well as NVIDIA. These models are optimized by NVIDIA to deliver the best performance on NVIDIA \n", + "accelerated infrastructure and deployed as NIMs, easy-to-use, prebuilt containers that deploy anywhere using a single \n", + "command on NVIDIA accelerated infrastructure.\n", + "\n", + "NVIDIA hosted deployments of NIMs are available to test on the [NVIDIA API catalog](https://build.nvidia.com/). After testing, \n", + "NIMs can be exported from NVIDIA’s API catalog using the NVIDIA AI Enterprise license and run on-premises or in the cloud, \n", + "giving enterprises ownership and full control of their IP and AI application.\n", + "\n", + "NIMs are packaged as container images on a per model basis and are distributed as NGC container images through the NVIDIA NGC Catalog. 
\n", + "At their core, NIMs provide easy, consistent, and familiar APIs for running inference on an AI model.\n", + "\n", + "This example goes over how to use LangChain to interact with NVIDIA-supported completion models via the `NVIDIA` class.\n", + "\n", + "For more information on accessing the completion models through this API, check out the [NVIDIA](https://python.langchain.com/docs/integrations/llms/nvidia_ai_endpoints/) documentation.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%pip install -qU langchain-nvidia-ai-endpoints" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "**To get started:**\n", + "\n", + "1. Create a free account with [NVIDIA](https://build.nvidia.com/), which hosts NVIDIA AI Foundation models.\n", + "\n", + "2. Click on your model of choice.\n", + "\n", + "3. Under `Input` select the `Python` tab, and click `Get API Key`. Then click `Generate Key`.\n", + "\n", + "4. Copy and save the generated key as `NVIDIA_API_KEY`. From there, you should have access to the endpoints." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "# del os.environ['NVIDIA_API_KEY'] ## delete key and reset\n", + "if os.environ.get(\"NVIDIA_API_KEY\", \"\").startswith(\"nvapi-\"):\n", + " print(\"Valid NVIDIA_API_KEY already in environment. Delete to reset\")\n", + "else:\n", + " candidate_api_key = getpass(\"NVAPI Key (starts with nvapi-): \")\n", + " assert candidate_api_key.startswith(\"nvapi-\"), f\"{candidate_api_key[:5]}... 
is not a valid key\"\n", + " os.environ[\"NVIDIA_API_KEY\"] = candidate_api_key" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "See [LLM](/docs/how_to#llms) for full functionality." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_nvidia_ai_endpoints import NVIDIA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = NVIDIA().bind(max_tokens=256)\n", + "llm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"# Function that does quicksort written in Rust without comments:\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(llm.invoke(prompt))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for chunk in llm.stream(prompt):\n", + " print(chunk, end=\"\", flush=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm.batch([prompt])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.ainvoke(prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async for chunk in llm.astream(prompt):\n", + " print(chunk, end=\"\", flush=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await llm.abatch([prompt])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async for chunk in llm.astream_log(prompt):\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "response = llm.invoke(\n", + " \"X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1) #Train a logistic regression model, predict the labels on the test set and compute the accuracy score\"\n", + ")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain-nvidia-ai-endpoints-m0-Y4aGr-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py index d5796e3c..bfda8d36 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/__init__.py @@ -42,6 +42,14 @@ from langchain_nvidia_ai_endpoints._statics import Model, register_model from langchain_nvidia_ai_endpoints.chat_models import ChatNVIDIA from langchain_nvidia_ai_endpoints.embeddings import NVIDIAEmbeddings +from langchain_nvidia_ai_endpoints.llm import NVIDIA from langchain_nvidia_ai_endpoints.reranking import NVIDIARerank -__all__ = ["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", "register_model", "Model"] +__all__ = [ + "ChatNVIDIA", + "NVIDIA", + "NVIDIAEmbeddings", + "NVIDIARerank", + "register_model", + "Model", +] diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py index 52e29679..a2992523 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_statics.py @@ -10,8 +10,8 @@ class Model(BaseModel): Model information. 
id: unique identifier for the model, passed as model parameter for requests - model_type: API type (chat, vlm, embedding, ranking, completion) - client: client name, e.g. ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank + model_type: API type (chat, vlm, embedding, ranking, completions) + client: client name, e.g. ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, NVIDIA endpoint: custom endpoint for the model aliases: list of aliases for the model supports_tools: whether the model supports tool calling @@ -23,9 +23,11 @@ class Model(BaseModel): id: str # why do we have a model_type? because ChatNVIDIA can speak both chat and vlm. model_type: Optional[ - Literal["chat", "vlm", "embedding", "ranking", "completion", "qa"] + Literal["chat", "vlm", "embedding", "ranking", "completions", "qa"] + ] = None + client: Optional[ + Literal["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", "NVIDIA"] ] = None - client: Optional[Literal["ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank"]] = None endpoint: Optional[str] = None aliases: Optional[list] = None supports_tools: Optional[bool] = False @@ -42,6 +44,7 @@ def validate_client(cls, client: str, values: dict) -> str: "ChatNVIDIA": ("chat", "vlm", "qa"), "NVIDIAEmbeddings": ("embedding",), "NVIDIARerank": ("ranking",), + "NVIDIA": ("completions",), } model_type = values.get("model_type") if model_type not in supported[client]: @@ -491,14 +494,18 @@ def validate_client(cls, client: str, values: dict) -> str: ), } -# COMPLETION_MODEL_TABLE = { -# "mistralai/mixtral-8x22b-v0.1": Model( -# id="mistralai/mixtral-8x22b-v0.1", -# model_type="completion", -# client="NVIDIA", -# aliases=["ai-mixtral-8x22b"], -# ), -# } +COMPLETION_MODEL_TABLE = { + "bigcode/starcoder2-7b": Model( + id="bigcode/starcoder2-7b", + model_type="completions", + client="NVIDIA", + ), + "bigcode/starcoder2-15b": Model( + id="bigcode/starcoder2-15b", + model_type="completions", + client="NVIDIA", + ), +} OPENAI_MODEL_TABLE = { @@ -518,6 +525,7 @@ def validate_client(cls, 
client: str, values: dict) -> str: **VLM_MODEL_TABLE, **EMBEDDING_MODEL_TABLE, **RANKING_MODEL_TABLE, + **COMPLETION_MODEL_TABLE, } if "_INCLUDE_OPENAI" in os.environ: diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py new file mode 100644 index 00000000..94d23cd4 --- /dev/null +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/llm.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import os +import warnings +from typing import Any, Dict, Iterator, List, Optional + +from langchain_core.callbacks.manager import CallbackManagerForLLMRun +from langchain_core.language_models.llms import LLM +from langchain_core.outputs import GenerationChunk +from langchain_core.pydantic_v1 import Field, PrivateAttr, root_validator + +from langchain_nvidia_ai_endpoints._common import _NVIDIAClient +from langchain_nvidia_ai_endpoints._statics import Model + + +class NVIDIA(LLM): + """ + LangChain LLM that uses the Completions API with NVIDIA NIMs. 
+ """ + + class Config: + validate_assignment = True + + _client: _NVIDIAClient = PrivateAttr(_NVIDIAClient) + _default_model_name: str = "bigcode/starcoder2-7b" + _default_base_url: str = "https://integrate.api.nvidia.com/v1" + base_url: str = Field( + description="Base url for model listing and invocation", + ) + model: Optional[str] = Field(description="The model to use for completions.") + + _base_url_var = "NVIDIA_BASE_URL" + + _init_args: Dict[str, Any] = PrivateAttr() + """Stashed arguments given to the constructor that can be passed to + the Completions API endpoint.""" + + @root_validator(pre=True) + def _validate_base_url(cls, values: Dict[str, Any]) -> Dict[str, Any]: + values["base_url"] = ( + values.get(cls._base_url_var.lower()) + or values.get("base_url") + or os.getenv(cls._base_url_var) + or cls._default_base_url + ) + return values + + def __check_kwargs(self, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """ + Check kwargs, warn for unknown keys, and return a copy of the recognized keys. + """ + completions_arguments = { + "frequency_penalty", + "max_tokens", + "presence_penalty", + "seed", + "stop", + "temperature", + "top_p", + "best_of", + "echo", + "logit_bias", + "logprobs", + "n", + "suffix", + "user", + "stream", + } + + recognized_kwargs = { + k: v for k, v in kwargs.items() if k in completions_arguments + } + unrecognized_kwargs = set(kwargs) - completions_arguments + if len(unrecognized_kwargs) > 0: + warnings.warn(f"Unrecognized, ignored arguments: {unrecognized_kwargs}") + + return recognized_kwargs + + def __init__(self, **kwargs: Any): + """ + Create a new NVIDIA LLM for Completions APIs. + + This class provides access to a NVIDIA NIM for completions. By default, it + connects to a hosted NIM, but can be configured to connect to a local NIM + using the `base_url` parameter. An API key is required to connect to the + hosted NIM. + + Args: + model (str): The model to use for completions. 
+ nvidia_api_key (str): The API key to use for connecting to the hosted NIM. + api_key (str): Alternative to nvidia_api_key. + base_url (str): The base URL of the NIM to connect to. + + API Key: + - The recommended way to provide the API key is through the `NVIDIA_API_KEY` + environment variable. + + Additional arguments that can be passed to the Completions API: + - max_tokens (int): The maximum number of tokens to generate. + - stop (str or List[str]): The stop sequence to use for generating completions. + - temperature (float): The temperature to use for generating completions. + - top_p (float): The top-p value to use for generating completions. + - frequency_penalty (float): The frequency penalty to apply to the completion. + - presence_penalty (float): The presence penalty to apply to the completion. + - seed (int): The seed to use for generating completions. + - best_of (int): The number of completions to generate and return the best of. + - echo (bool): Whether to echo the prompt in the completion. + - logit_bias (Dict[str, float]): The logit bias to apply to the completion. + - logprobs (int): The number of logprobs to return. + - n (int): The number of completions to generate. + - suffix (str): The suffix to use for generating completions. + - user (str): The user ID to use for generating completions. + + These additional arguments can also be passed with `bind()`, e.g. + `NVIDIA().bind(max_tokens=512)`, or passed directly to `invoke()` or `stream()`, + e.g. `NVIDIA().invoke("prompt", max_tokens=512)`. 
+ """ + super().__init__(**kwargs) + self._client = _NVIDIAClient( + base_url=self.base_url, + model_name=self.model, + default_hosted_model_name=self._default_model_name, + api_key=kwargs.pop("nvidia_api_key", kwargs.pop("api_key", None)), + infer_path="{base_url}/completions", + cls=self.__class__.__name__, + ) + # todo: only store the model in one place + # the model may be updated to a newer name during initialization + self.model = self._client.model_name + + # stash all additional args that can be passed to the Completions API, + # but first make sure we pull out any args that are processed elsewhere. + for key in [ + "model", + "nvidia_base_url", + "base_url", + ]: + if key in kwargs: + del kwargs[key] + self._init_args = self.__check_kwargs(kwargs) + + @property + def available_models(self) -> List[Model]: + """ + Get a list of available models that work with NVIDIA. + """ + return self._client.get_available_models(self.__class__.__name__) + + @classmethod + def get_available_models( + cls, + **kwargs: Any, + ) -> List[Model]: + """ + Get a list of available models that work with the Completions API. + """ + return cls(**kwargs).available_models + + @property + def _llm_type(self) -> str: + """ + Get the type of language model used by this LLM. + Used for logging purposes only. 
+ """ + return "NVIDIA" + + def _call( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> str: + payload: Dict[str, Any] = { + "model": self.model, + "prompt": prompt, + **self._init_args, + **self.__check_kwargs(kwargs), + } + if stop: + payload["stop"] = stop + + if payload.get("stream", False): + warnings.warn("stream set to true for non-streaming call, ignoring") + del payload["stream"] + + response = self._client.get_req(payload=payload) + response.raise_for_status() + + # todo: handle response's usage and system_fingerprint + + choices = response.json()["choices"] + # todo: write a test for this by setting n > 1 on the request + # aug 2024: n > 1 is not supported by endpoints + if len(choices) > 1: + warnings.warn( + f"Multiple choices in response, returning only the first: {choices}" + ) + + return choices[0]["text"] + + def _stream( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Iterator[GenerationChunk]: + payload: Dict[str, Any] = { + "model": self.model, + "prompt": prompt, + "stream": True, + **self._init_args, + **self.__check_kwargs(kwargs), + } + if stop: + payload["stop"] = stop + + # we construct payload w/ **kwargs positioned to override stream=True, + # this lets us know if a user passed stream=False + if not payload.get("stream", True): + warnings.warn("stream set to false for streaming call, ignoring") + payload["stream"] = True + + for chunk in self._client.get_req_stream(payload=payload): + content = chunk["content"] + generation = GenerationChunk(text=content) + if run_manager: # todo: add tests for run_manager + run_manager.on_llm_new_token(content, chunk=generation) + yield generation diff --git a/libs/ai-endpoints/tests/integration_tests/conftest.py b/libs/ai-endpoints/tests/integration_tests/conftest.py index f1dd58f6..e89d2a7d 100644 --- 
a/libs/ai-endpoints/tests/integration_tests/conftest.py +++ b/libs/ai-endpoints/tests/integration_tests/conftest.py @@ -3,7 +3,12 @@ import pytest from langchain_core.documents import Document -from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank +from langchain_nvidia_ai_endpoints import ( + NVIDIA, + ChatNVIDIA, + NVIDIAEmbeddings, + NVIDIARerank, +) from langchain_nvidia_ai_endpoints._statics import MODEL_TABLE, Model @@ -39,6 +44,12 @@ def pytest_addoption(parser: pytest.Parser) -> None: nargs="+", help="Run tests for a specific qa model or list of models", ) + parser.addoption( + "--completions-model-id", + action="store", + nargs="+", + help="Run tests for a specific completions model or list of models", + ) parser.addoption( "--embedding-model-id", action="store", @@ -98,6 +109,18 @@ def get_all_known_models() -> List[Model]: ] metafunc.parametrize("tool_model", models, ids=models) + if "completions_model" in metafunc.fixturenames: + models = [NVIDIA._default_model_name] + if model_list := metafunc.config.getoption("completions_model_id"): + models = model_list + if metafunc.config.getoption("all_models"): + models = [ + model.id + for model in NVIDIA(**mode).available_models + if model.model_type == "completions" + ] + metafunc.parametrize("completions_model", models, ids=models) + if "structured_model" in metafunc.fixturenames: models = ["meta/llama-3.1-8b-instruct"] if model_list := metafunc.config.getoption("structured_model_id"): @@ -163,6 +186,7 @@ def mode(request: pytest.FixtureRequest) -> dict: ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, + NVIDIA, ] ) def public_class(request: pytest.FixtureRequest) -> type: @@ -180,5 +204,7 @@ def _contact_service(instance: Any) -> None: instance.compress_documents( documents=[Document(page_content="World")], query="Hello" ) + elif isinstance(instance, NVIDIA): + instance.invoke("Hello") return _contact_service diff --git a/libs/ai-endpoints/tests/integration_tests/test_base_url.py 
b/libs/ai-endpoints/tests/integration_tests/test_base_url.py index 3c821623..7c522c92 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_base_url.py +++ b/libs/ai-endpoints/tests/integration_tests/test_base_url.py @@ -13,6 +13,7 @@ def mock_endpoints(requests_mock: Mocker) -> None: "/v1/embeddings", "/v1/chat/completions", "/v1/ranking", + "/v1/completions", ]: requests_mock.post( re.compile(f".*{endpoint}"), diff --git a/libs/ai-endpoints/tests/integration_tests/test_completions_models.py b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py new file mode 100644 index 00000000..f7d1308a --- /dev/null +++ b/libs/ai-endpoints/tests/integration_tests/test_completions_models.py @@ -0,0 +1,150 @@ +# https://platform.openai.com/docs/api-reference/completions/create +# POST https://.../v1/completions +# model: str -- The ID of the model to use for completion. +# prompt: str | Array[str] -- The prompt(s) to generate completions for. +# best_of: Optional[int] (default: 1) -- An integer representing the number +# of completions to generate and score. +# The API will return the best completion +# of the group. +# echo: Optional[bool] (default: False) -- Whether to echo the prompt in addition +# to the completion. +# frequency_penalty: Optional[float] (default: 0.0) -- Float that penalizes new +# tokens. Range -2.0 to 2.0. +# logit_bias: Optional[Dict[str, float]] -- Dict containing token to logit bias. +# logprobs: Optional[int] (default: None) -- Integer representing the number of +# logprobs to return. 0 means no logprobs. +# Max value is 5. +# max_tokens: Optional[int] (default: 16) -- Integer representing the maximum number +# of tokens to generate. +# n: Optional[int] (default: 1) -- Integer representing the number of completions +# to generate. +# presence_penalty: Optional[float] (default: 0.0) -- Float that penalizes new tokens +# based on whether they appear in +# the text so far. Range -2.0 to +# 2.0. 
+# seed: Optional[int] (default: None) -- Integer seed that attempts to make the +# completions deterministic. +# stop: Optional[str|Array[str]] -- Token at which to stop generating completions. +# Up to 4 sequences. +# stream: Optional[bool] (default: False) -- Whether to stream back partial progress. +# stream_options: Optional[Dict["include_usage": bool]] -- Dict containing stream +# options. +# suffix: Optional[str] -- Suffix to add to the completion. +# temperature: Optional[float] (default: 1.0) -- Sampling temperature, between 0 and 2. +# top_p: Optional[float] (default: 1.0) -- Alternative to temperature sampling. +# user: Optional[str] -- User ID to associate with the request. +# +# Returns: +# id: str -- The ID of the completion. +# object: str -- Always "text_completion". +# created: int -- Unix timestamp of when the completion was created. +# model: str -- The ID of the model used to generate the completion. +# choices: List[{"finish_reason": "stop"|"length"|"content_filter", +# "index": int, +# "text": str, +# "logprobs": Optional[{"text_offset": array, +# "token_logprobs": array, +# "tokens": array, +# "top_logprobs": array}]}] -- +# List of completions generated by the model. +# usage: {"completion_tokens": int, +# "prompt_tokens": int, +# "total_tokens": int} -- Usage statistics for the model. +# system_fingerprint: str -- System fingerprint of the model used to generate +# the completion. 
+ + +from typing import Any, Callable, Tuple + +import pytest + +from langchain_nvidia_ai_endpoints import NVIDIA + + +def invoke(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + return llm.invoke(prompt, **kwargs), 1 + + +def stream(llm: NVIDIA, prompt: str, **kwargs: Any) -> Tuple[str, int]: + response = "" + count = 0 + for chunk in llm.stream(prompt, **kwargs): + response += chunk + count += 1 + return response, count + + +@pytest.mark.parametrize( + "func, count", [(invoke, 0), (stream, 1)], ids=["invoke", "stream"] +) +def test_basic(completions_model: str, mode: dict, func: Callable, count: int) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, cnt = func(llm, "Hello, my name is") + assert isinstance(response, str) + assert cnt > count, "Should have received more chunks" + + +@pytest.mark.parametrize( + "param, value", + [ + ("frequency_penalty", 0.5), + ("max_tokens", 32), + ("presence_penalty", 0.5), + ("seed", 1234), + ("stop", "Hello"), + ("temperature", 0.5), + ("top_p", 0.5), + ], +) +@pytest.mark.parametrize("func", [invoke, stream], ids=["invoke", "stream"]) +def test_params( + completions_model: str, mode: dict, param: str, value: Any, func: Callable +) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, _ = func(llm, "Hello, my name is", **{param: value}) + assert isinstance(response, str) + + +@pytest.mark.parametrize( + "param, value", + [ + ("best_of", 5), + ("echo", True), + ("logit_bias", {"hello": 1.0}), + ("logprobs", 2), + ("n", 2), + ("suffix", "Hello"), + ("user", "1234"), + ], +) +@pytest.mark.parametrize("func", [invoke, stream], ids=["invoke", "stream"]) +@pytest.mark.xfail(reason="Not consistently implemented") +def test_params_incomplete( + completions_model: str, mode: dict, param: str, value: Any, func: Callable +) -> None: + llm = NVIDIA(model=completions_model, **mode) + response, _ = func(llm, "Hello, my name is", **{param: value}) + assert isinstance(response, str) + + +def 
test_invoke_with_stream_true(completions_model: str, mode: dict) -> None: + llm = NVIDIA(model=completions_model, **mode) + with pytest.warns(UserWarning) as record: + response = llm.invoke("Hello, my name is", stream=True) + assert isinstance(response, str) + assert len(record) == 1 + assert "stream set to true" in str(record[0].message) + assert "ignoring" in str(record[0].message) + + +def test_stream_with_stream_false(completions_model: str, mode: dict) -> None: + llm = NVIDIA(model=completions_model, **mode) + with pytest.warns(UserWarning) as record: + response = next(llm.stream("Hello, my name is", stream=False)) + assert isinstance(response, str) + assert len(record) == 1 + assert "stream set to false" in str(record[0].message) + assert "ignoring" in str(record[0].message) + + +# todo: check stream_options diff --git a/libs/ai-endpoints/tests/integration_tests/test_register_model.py b/libs/ai-endpoints/tests/integration_tests/test_register_model.py index 6488aee3..678c67fd 100644 --- a/libs/ai-endpoints/tests/integration_tests/test_register_model.py +++ b/libs/ai-endpoints/tests/integration_tests/test_register_model.py @@ -8,6 +8,7 @@ Model, NVIDIAEmbeddings, NVIDIARerank, + NVIDIA, register_model, ) @@ -34,6 +35,11 @@ "nv-rerank-qa-mistral-4b:1", "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0bf77f50-5c35-4488-8e7a-f49bb1974af6", ), + ( + NVIDIA, + "bigcode/starcoder2-7b", + "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/dd7b01e7-732d-4da5-8e8d-315f79165a23", + ), ], ) def test_registered_model_functional( diff --git a/libs/ai-endpoints/tests/unit_tests/conftest.py b/libs/ai-endpoints/tests/unit_tests/conftest.py index f0790214..4288819e 100644 --- a/libs/ai-endpoints/tests/unit_tests/conftest.py +++ b/libs/ai-endpoints/tests/unit_tests/conftest.py @@ -3,7 +3,12 @@ import pytest import requests_mock -from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank +from langchain_nvidia_ai_endpoints import ( + NVIDIA, + 
ChatNVIDIA, + NVIDIAEmbeddings, + NVIDIARerank, +) @pytest.fixture( @@ -11,6 +16,7 @@ ChatNVIDIA, NVIDIAEmbeddings, NVIDIARerank, + NVIDIA, ] ) def public_class(request: pytest.FixtureRequest) -> type: diff --git a/libs/ai-endpoints/tests/unit_tests/test_completions_models.py b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py new file mode 100644 index 00000000..34de4c7c --- /dev/null +++ b/libs/ai-endpoints/tests/unit_tests/test_completions_models.py @@ -0,0 +1,149 @@ +import json +from functools import reduce +from operator import add +from typing import Any, Callable, List + +import pytest +import requests_mock + +from langchain_nvidia_ai_endpoints import NVIDIA + + +def invoke(llm: NVIDIA, prompt: str, **kwargs: Any) -> str: + return llm.invoke(prompt, **kwargs) + + +def stream(llm: NVIDIA, prompt: str, **kwargs: Any) -> str: + return reduce(add, llm.stream(prompt, **kwargs)) + + +mock_response = { + "id": "ID", + "object": "text_completion", + "created": 1234567890, + "model": "BOGUS", + "choices": [ + { + "index": 0, + "text": "COMPLETION", + } + ], + "usage": {"prompt_tokens": 7, "total_tokens": 207, "completion_tokens": 200}, +} + + +@pytest.fixture(scope="function") +def mock_v1_completions_invoke( + requests_mock: requests_mock.Mocker, +) -> requests_mock.Mocker: + requests_mock.post( + "https://integrate.api.nvidia.com/v1/completions", + json=mock_response, + ) + return requests_mock + + +@pytest.fixture(scope="function") +def mock_v1_completions_stream( + requests_mock: requests_mock.Mocker, +) -> requests_mock.Mocker: + requests_mock.post( + "https://integrate.api.nvidia.com/v1/completions", + text="\n\n".join( + [ + f"data: {json.dumps(mock_response)}", + "data: [DONE]", + ] + ), + ) + return requests_mock + + +@pytest.mark.parametrize( + "param, value", + [ + ("frequency_penalty", [0.25, 0.5, 0.75]), + ("max_tokens", [2, 32, 512]), + ("presence_penalty", [0.25, 0.5, 0.75]), + ("seed", [1, 1234, 4321]), + ("stop", ["Hello", "There", 
"World"]), + ("temperature", [0, 0.5, 1]), + ("top_p", [0, 0.5, 1]), + ("best_of", [1, 5, 10]), + ("echo", [True, False, True]), + ("logit_bias", [{"hello": 1.0}, {"there": 1.0}, {"world": 1.0}]), + ("logprobs", [1, 2, 3]), + ("n", [1, 2, 3]), + ("suffix", ["Hello", "There", "World"]), + ("user", ["Bob", "Alice", "Eve"]), + ], +) +@pytest.mark.parametrize( + "func, mock_name", + [(invoke, "mock_v1_completions_invoke"), (stream, "mock_v1_completions_stream")], + ids=["invoke", "stream"], +) +def test_params( + param: str, + value: List[Any], + func: Callable, + mock_name: str, + request: pytest.FixtureRequest, +) -> None: + """ + This tests the following... + - priority order (init -> bind -> infer) + - param passed to init, bind, invoke / stream + ...for each known Completion API param. + """ + + mock = request.getfixturevalue(mock_name) + + init, bind, infer = value + + llm = NVIDIA(api_key="BOGUS", **{param: init}) + func(llm, "IGNORED") + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == init + + bound_llm = llm.bind(**{param: bind}) + func(bound_llm, "IGNORED") + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == bind + + func(bound_llm, "IGNORED", **{param: infer}) + request_payload = mock.last_request.json() + assert param in request_payload + assert request_payload[param] == infer + + +@pytest.mark.parametrize( + "func, mock_name", + [(invoke, "mock_v1_completions_invoke"), (stream, "mock_v1_completions_stream")], + ids=["invoke", "stream"], +) +def test_params_unknown( + func: Callable, + mock_name: str, + request: pytest.FixtureRequest, +) -> None: + request.getfixturevalue(mock_name) + + with pytest.warns(UserWarning) as record: + llm = NVIDIA(api_key="BOGUS", init_unknown="INIT") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'init_unknown'}" in str(record[0].message) + + with pytest.warns(UserWarning) as 
record: + func(llm, "IGNORED", arg_unknown="ARG") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'arg_unknown'}" in str(record[0].message) + + bound_llm = llm.bind(bind_unknown="BIND") + + with pytest.warns(UserWarning) as record: + func(bound_llm, "IGNORED") + assert len(record) == 1 + assert "Unrecognized, ignored arguments: {'bind_unknown'}" in str(record[0].message) diff --git a/libs/ai-endpoints/tests/unit_tests/test_imports.py b/libs/ai-endpoints/tests/unit_tests/test_imports.py index e72c2c6c..200bbea4 100644 --- a/libs/ai-endpoints/tests/unit_tests/test_imports.py +++ b/libs/ai-endpoints/tests/unit_tests/test_imports.py @@ -4,6 +4,7 @@ "ChatNVIDIA", "NVIDIAEmbeddings", "NVIDIARerank", + "NVIDIA", "register_model", "Model", ] diff --git a/libs/ai-endpoints/tests/unit_tests/test_register_model.py b/libs/ai-endpoints/tests/unit_tests/test_register_model.py index d42bdee5..482d40dc 100644 --- a/libs/ai-endpoints/tests/unit_tests/test_register_model.py +++ b/libs/ai-endpoints/tests/unit_tests/test_register_model.py @@ -3,6 +3,7 @@ import pytest from langchain_nvidia_ai_endpoints import ( + NVIDIA, ChatNVIDIA, Model, NVIDIAEmbeddings, @@ -16,12 +17,19 @@ [ ("chat", "NVIDIAEmbeddings"), ("chat", "NVIDIARerank"), + ("chat", "NVIDIA"), ("vlm", "NVIDIAEmbeddings"), ("vlm", "NVIDIARerank"), + ("vlm", "NVIDIA"), ("embeddings", "ChatNVIDIA"), ("embeddings", "NVIDIARerank"), + ("embeddings", "NVIDIA"), ("ranking", "ChatNVIDIA"), ("ranking", "NVIDIAEmbeddings"), + ("ranking", "NVIDIA"), + ("completions", "ChatNVIDIA"), + ("completions", "NVIDIAEmbeddings"), + ("completions", "NVIDIARerank"), ], ) def test_mismatched_type_client(model_type: str, client: str) -> None: @@ -53,6 +61,7 @@ def test_registered_model_usable(public_class: type) -> None: "ChatNVIDIA": "chat", "NVIDIAEmbeddings": "embedding", "NVIDIARerank": "ranking", + "NVIDIA": "completions", }[public_class.__name__] with warnings.catch_warnings(): warnings.simplefilter("error") @@ -112,21 
+121,38 @@ def test_registered_model_is_available() -> None: endpoint="BOGUS", ) ) + register_model( + Model( + id="test/completions", + model_type="completions", + client="NVIDIA", + endpoint="BOGUS", + ) + ) chat_models = ChatNVIDIA.get_available_models(api_key="BOGUS") embedding_models = NVIDIAEmbeddings.get_available_models(api_key="BOGUS") ranking_models = NVIDIARerank.get_available_models(api_key="BOGUS") + completions_models = NVIDIA.get_available_models(api_key="BOGUS") assert "test/chat" in [model.id for model in chat_models] assert "test/chat" not in [model.id for model in embedding_models] assert "test/chat" not in [model.id for model in ranking_models] + assert "test/chat" not in [model.id for model in completions_models] assert "test/embedding" not in [model.id for model in chat_models] assert "test/embedding" in [model.id for model in embedding_models] assert "test/embedding" not in [model.id for model in ranking_models] + assert "test/embedding" not in [model.id for model in completions_models] assert "test/rerank" not in [model.id for model in chat_models] assert "test/rerank" not in [model.id for model in embedding_models] assert "test/rerank" in [model.id for model in ranking_models] + assert "test/rerank" not in [model.id for model in completions_models] + + assert "test/completions" not in [model.id for model in chat_models] + assert "test/completions" not in [model.id for model in embedding_models] + assert "test/completions" not in [model.id for model in ranking_models] + assert "test/completions" in [model.id for model in completions_models] def test_registered_model_without_client_is_not_listed(public_class: type) -> None: