From 103b7987be3c7ecaf273dc13429b455e421c257b Mon Sep 17 00:00:00 2001 From: Matthew Farrellee Date: Wed, 28 Aug 2024 14:59:13 -0400 Subject: [PATCH 1/3] fix stream collection --- .../langchain_nvidia_ai_endpoints/_common.py | 2 +- .../tests/integration_tests/test_streaming.py | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 libs/ai-endpoints/tests/integration_tests/test_streaming.py diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py index 2bde648a..454b1d7d 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py @@ -523,7 +523,7 @@ def get_req_stream( } response = self.get_session_fn().post( - **self.__add_authorization(self.last_inputs) + stream=True, **self.__add_authorization(self.last_inputs) ) self._try_raise(response) call = self.copy() diff --git a/libs/ai-endpoints/tests/integration_tests/test_streaming.py b/libs/ai-endpoints/tests/integration_tests/test_streaming.py new file mode 100644 index 00000000..158f81a2 --- /dev/null +++ b/libs/ai-endpoints/tests/integration_tests/test_streaming.py @@ -0,0 +1,27 @@ +import time + +from langchain_nvidia_ai_endpoints import ChatNVIDIA + + +def test_ttft(chat_model: str, mode: dict) -> None: + # we had an issue where streaming took a long time to start. the issue + # was all streamed results were collected before yielding them to the + # user. this test tries to detect the incorrect behavior. + # + # warning: + # - this can false positive if the model itself is slow to start + # - this can false negative if there is a delay after the first chunk + # + # potential mitigation for false negative is to check mean & stdev and + # filter outliers. 
+ # + # credit to Pouyan Rezakhani for finding this issue + llm = ChatNVIDIA(model=chat_model, **mode) + chunk_times = [time.time()] + for chunk in llm.stream("Count to 1000 by 2s, e.g. 2 4 6 8 ...", max_tokens=512): + chunk_times.append(time.time()) + ttft = chunk_times[1] - chunk_times[0] + total_time = chunk_times[-1] - chunk_times[0] + assert ttft < ( + total_time / 2 + ), "potential streaming issue, TTFT should be less than half of the total time" From fc6b913823319c5f2a5dbcb83d9e3fc032d5e837 Mon Sep 17 00:00:00 2001 From: raspawar Date: Thu, 29 Aug 2024 11:59:54 +0530 Subject: [PATCH 2/3] fix failing test cases for base url --- libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py index 2bde648a..ae60d137 100644 --- a/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py +++ b/libs/ai-endpoints/langchain_nvidia_ai_endpoints/_common.py @@ -17,7 +17,7 @@ Tuple, Union, ) -from urllib.parse import urlparse +from urllib.parse import urlparse, urlunparse import requests from langchain_core.pydantic_v1 import ( @@ -138,7 +138,9 @@ def _preprocess_args(cls, values: Dict[str, Any]) -> Dict[str, Any]: ): warnings.warn(f"Using {base_url}, ignoring the rest") - values["base_url"] = base_url + values["base_url"] = base_url = urlunparse( + (parsed.scheme, parsed.netloc, "v1", None, None, None) + ) values["infer_path"] = values["infer_path"].format(base_url=base_url) return values From 35c1488a98df198581fff9b57bb73dbc9e11f3a9 Mon Sep 17 00:00:00 2001 From: raspawar Date: Thu, 29 Aug 2024 12:25:06 +0530 Subject: [PATCH 3/3] bump the version --- libs/ai-endpoints/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/ai-endpoints/pyproject.toml b/libs/ai-endpoints/pyproject.toml index ee94d7f9..09206860 100644 --- 
a/libs/ai-endpoints/pyproject.toml +++ b/libs/ai-endpoints/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-nvidia-ai-endpoints" -version = "0.2.1" +version = "0.2.2" description = "An integration package connecting NVIDIA AI Endpoints and LangChain" authors = [] readme = "README.md"