Merge branch 'main' into feature/chroma-query-text-retriever-filters
masci authored May 10, 2024
2 parents 7b2ea59 + c29db9c commit 16ac556
Showing 76 changed files with 1,005 additions and 327 deletions.
32 changes: 29 additions & 3 deletions .github/workflows/CI_readme_sync.yml
@@ -4,6 +4,7 @@ on:
push:
tags:
- "**-v[0-9].[0-9]+.[0-9]+"

workflow_dispatch: # Activate this workflow manually
inputs:
tag:
@@ -16,8 +17,30 @@ env:
TAG: ${{ inputs.tag || github.ref_name }}

jobs:
get-versions:
runs-on: ubuntu-latest
outputs:
versions: ${{ steps.version_finder.outputs.versions }}
steps:
- name: Get Haystack Docs versions
id: version_finder
run: |
curl -s https://dash.readme.com/api/v1/version --header 'authorization: Basic ${{ secrets.README_API_KEY }}' > out
VERSIONS=$(jq '[ .[] | select(.version | startswith("2."))| .version ]' out)
{
echo 'versions<<EOF'
echo $VERSIONS
echo EOF
} >> "$GITHUB_OUTPUT"
sync:
runs-on: ubuntu-latest
needs: get-versions
strategy:
fail-fast: false
max-parallel: 1
matrix:
hs-docs-version: ${{ fromJSON(needs.get-versions.outputs.versions) }}
steps:
- name: Checkout this repo
uses: actions/checkout@v4
@@ -39,7 +62,7 @@ jobs:
import os
project_path = os.environ["TAG"].rsplit("-", maxsplit=1)[0]
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
print(f'project_path={project_path}', file=f)
print(f'project_path={project_path}', file=f)
- name: Generate docs
working-directory: ${{ steps.pathfinder.outputs.project_path }}
@@ -48,13 +71,16 @@
# from Readme.io as we need them to associate the slug
# in config files with their id.
README_API_KEY: ${{ secrets.README_API_KEY }}
# The same category has a different id on different readme docs versions.
# This is the docs version on readme that we'll use to get the category id.
PYDOC_TOOLS_HAYSTACK_DOC_VERSION: ${{ matrix.hs-docs-version }}
run: |
hatch run docs
mkdir tmp
find . -name "_readme_*.md" -exec cp "{}" tmp \;
ls tmp
- name: Sync API docs
- name: Sync API docs with Haystack docs version ${{ matrix.hs-docs-version }}
uses: readmeio/rdme@v8
with:
rdme: docs ${{ steps.pathfinder.outputs.project_path }}/tmp --key=${{ secrets.README_API_KEY }} --version=2.0
rdme: docs ${{ steps.pathfinder.outputs.project_path }}/tmp --key=${{ secrets.README_API_KEY }} --version=${{ matrix.hs-docs-version }}
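
For reference, the new get-versions job filters ReadMe doc versions with jq before feeding them into the sync matrix. A minimal Python sketch of the same filtering, assuming the ReadMe /version endpoint returns a list of objects with a "version" field (illustrative only, not one of the changed files):

import json

# Hypothetical response shape for the ReadMe /version endpoint.
raw = '[{"version": "1.26"}, {"version": "2.0"}, {"version": "2.1"}]'

# Mirrors: jq '[ .[] | select(.version | startswith("2."))| .version ]'
versions = [item["version"] for item in json.loads(raw) if item["version"].startswith("2.")]

print(versions)  # ['2.0', '2.1'] -- this JSON array is what fromJSON consumes in the matrix
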
3 changes: 2 additions & 1 deletion .github/workflows/nvidia.yml
@@ -22,6 +22,7 @@ env:
PYTHONUNBUFFERED: "1"
FORCE_COLOR: "1"
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
NVIDIA_CATALOG_API_KEY: ${{ secrets.NVIDIA_CATALOG_API_KEY }}

jobs:
run:
@@ -73,7 +74,7 @@ jobs:
uses: ./.github/actions/send_failure
with:
title: |
core-integrations failure:
core-integrations failure:
${{ (steps.tests.conclusion == 'nightly-haystack-main') && 'nightly-haystack-main' || 'tests' }}
- ${{ github.workflow }}
api-key: ${{ secrets.CORE_DATADOG_API_KEY }}
2 changes: 1 addition & 1 deletion README.md
@@ -51,5 +51,5 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta
| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
| [ragas-haystack](integrations/ragas/) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/ragas-haystack.svg)](https://pypi.org/project/ragas-haystack) | [![Test / ragas](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ragas.yml) |
| [unstructured-fileconverter-haystack](integrations/unstructured/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured.yml) |
| [uptrain-haystack](https://github.com/deepset-ai/haystack-core-integrations/tree/staging/integrations/uptrain) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/uptrain-haystack.svg)](https://pypi.org/project/uptrain-haystack) | Staged |
| [uptrain-haystack](https://github.com/deepset-ai/haystack-core-integrations/tree/staging/integrations/uptrain) | Evaluator | [![PyPI - Version](https://img.shields.io/pypi/v/uptrain-haystack.svg)](https://pypi.org/project/uptrain-haystack) | [Staged](https://docs.haystack.deepset.ai/docs/breaking-change-policy#discontinuing-an-integration) |
| [weaviate-haystack](integrations/weaviate/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/weaviate-haystack.svg)](https://pypi.org/project/weaviate-haystack) | [![Test / weaviate](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/weaviate.yml) |
2 changes: 1 addition & 1 deletion integrations/amazon_bedrock/pydoc/config.yml
@@ -20,7 +20,7 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Amazon Bedrock integration for Haystack
category_slug: integrations-api
title: Amazon Bedrock
2 changes: 1 addition & 1 deletion integrations/amazon_sagemaker/pydoc/config.yml
@@ -14,7 +14,7 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Amazon Sagemaker integration for Haystack
category_slug: integrations-api
title: Amazon Sagemaker
4 changes: 2 additions & 2 deletions integrations/anthropic/pydoc/config.yml
@@ -15,12 +15,12 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Anthropic integration for Haystack
category_slug: integrations-api
title: Anthropic
slug: integrations-anthropic
order: 22
order: 23
markdown:
descriptive_class_title: false
descriptive_module_title: true
2 changes: 1 addition & 1 deletion integrations/astra/pydoc/config.yml
@@ -16,7 +16,7 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Astra integration for Haystack
category_slug: integrations-api
title: Astra
18 changes: 9 additions & 9 deletions integrations/astra/pyproject.toml
@@ -80,12 +80,12 @@ dependencies = [
[tool.hatch.envs.lint.scripts]
typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}"
style = [
"ruff {args:.}",
"ruff check {args:.}",
"black --check --diff {args:.}",
]
fmt = [
"black {args:.}",
"ruff --fix {args:.}",
"ruff check --fix {args:.}",
"style",
]
all = [
@@ -104,7 +104,7 @@ skip-string-normalization = true
[tool.ruff]
target-version = "py38"
line-length = 120
select = [
lint.select = [
"A",
"ARG",
"B",
@@ -131,7 +131,7 @@ select = [
"W",
"YTT",
]
ignore = [
lint.ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# Allow boolean positional values in function calls, like `dict.get(... True)`
@@ -141,19 +141,19 @@ ignore = [
# Ignore complexity
"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
]
unfixable = [
lint.unfixable = [
# Don't touch unused imports
"F401",
]
exclude = ["example"]
lint.exclude = ["example"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
known-first-party = ["haystack_integrations"]

[tool.ruff.flake8-tidy-imports]
[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "parents"

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]

10 changes: 8 additions & 2 deletions integrations/astra/tests/test_document_store.py
@@ -14,7 +14,7 @@
from haystack_integrations.document_stores.astra import AstraDocumentStore


def test_namespace_init():
@pytest.fixture
def mock_auth(monkeypatch):
monkeypatch.setenv("ASTRA_DB_API_ENDPOINT", "http://example.com")
monkeypatch.setenv("ASTRA_DB_APPLICATION_TOKEN", "test_token")


def test_namespace_init(mock_auth): # noqa
with mock.patch("haystack_integrations.document_stores.astra.astra_client.AstraDB") as client:
AstraDocumentStore()
assert "namespace" in client.call_args.kwargs
@@ -25,7 +31,7 @@ def test_namespace_init():
assert client.call_args.kwargs["namespace"] == "foo"


def test_to_dict():
def test_to_dict(mock_auth): # noqa
with mock.patch("haystack_integrations.document_stores.astra.astra_client.AstraDB"):
ds = AstraDocumentStore()
result = ds.to_dict()
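
For reference, the mock_auth fixture added above relies on pytest's monkeypatch to scope the Astra credentials to each test. A self-contained sketch of that pattern with placeholder values (illustrative only, not one of the changed files):

import os

import pytest


@pytest.fixture
def mock_auth(monkeypatch):
    # monkeypatch restores the original environment after each test.
    monkeypatch.setenv("ASTRA_DB_API_ENDPOINT", "http://example.com")
    monkeypatch.setenv("ASTRA_DB_APPLICATION_TOKEN", "test_token")


def test_env_is_mocked(mock_auth):
    # Code under test that reads these variables sees the mocked values.
    assert os.environ["ASTRA_DB_APPLICATION_TOKEN"] == "test_token"
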
2 changes: 1 addition & 1 deletion integrations/chroma/pydoc/config.yml
@@ -17,7 +17,7 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Chroma integration for Haystack
category_slug: integrations-api
title: Chroma
2 changes: 1 addition & 1 deletion integrations/cohere/pydoc/config.yml
@@ -19,7 +19,7 @@ processors:
- type: smart
- type: crossref
renderer:
type: haystack_pydoc_tools.renderers.ReadmePreviewRenderer
type: haystack_pydoc_tools.renderers.ReadmeIntegrationRenderer
excerpt: Cohere integration for Haystack
category_slug: integrations-api
title: Cohere
@@ -42,7 +42,6 @@ def __init__(
api_base_url: str = "https://api.cohere.com",
truncate: str = "END",
use_async_client: bool = False,
max_retries: int = 3,
timeout: int = 120,
batch_size: int = 32,
progress_bar: bool = True,
@@ -67,7 +66,6 @@ def __init__(
If "NONE" is selected, when the input exceeds the maximum input token length an error will be returned.
:param use_async_client: flag to select the AsyncClient. It is recommended to use
AsyncClient for applications with many concurrent calls.
:param max_retries: maximal number of retries for requests.
:param timeout: request timeout in seconds.
:param batch_size: number of Documents to encode at once.
:param progress_bar: whether to show a progress bar or not. Can be helpful to disable in production deployments
@@ -82,7 +80,6 @@ def __init__(
self.api_base_url = api_base_url
self.truncate = truncate
self.use_async_client = use_async_client
self.max_retries = max_retries
self.timeout = timeout
self.batch_size = batch_size
self.progress_bar = progress_bar
@@ -104,7 +101,6 @@ def to_dict(self) -> Dict[str, Any]:
api_base_url=self.api_base_url,
truncate=self.truncate,
use_async_client=self.use_async_client,
max_retries=self.max_retries,
timeout=self.timeout,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
@@ -170,7 +166,6 @@ def run(self, documents: List[Document]):
cohere_client = AsyncClient(
api_key,
base_url=self.api_base_url,
max_retries=self.max_retries,
timeout=self.timeout,
client_name="haystack",
)
@@ -181,7 +176,6 @@ def run(self, documents: List[Document]):
cohere_client = Client(
api_key,
base_url=self.api_base_url,
max_retries=self.max_retries,
timeout=self.timeout,
client_name="haystack",
)
@@ -39,7 +39,6 @@ def __init__(
api_base_url: str = "https://api.cohere.com",
truncate: str = "END",
use_async_client: bool = False,
max_retries: int = 3,
timeout: int = 120,
):
"""
@@ -60,7 +59,6 @@
If "NONE" is selected, when the input exceeds the maximum input token length an error will be returned.
:param use_async_client: flag to select the AsyncClient. It is recommended to use
AsyncClient for applications with many concurrent calls.
:param max_retries: maximum number of retries for requests.
:param timeout: request timeout in seconds.
"""

@@ -70,7 +68,6 @@
self.api_base_url = api_base_url
self.truncate = truncate
self.use_async_client = use_async_client
self.max_retries = max_retries
self.timeout = timeout

def to_dict(self) -> Dict[str, Any]:
@@ -88,7 +85,6 @@ def to_dict(self) -> Dict[str, Any]:
api_base_url=self.api_base_url,
truncate=self.truncate,
use_async_client=self.use_async_client,
max_retries=self.max_retries,
timeout=self.timeout,
)

@@ -132,7 +128,6 @@ def run(self, text: str):
cohere_client = AsyncClient(
api_key,
base_url=self.api_base_url,
max_retries=self.max_retries,
timeout=self.timeout,
client_name="haystack",
)
@@ -143,7 +138,6 @@
cohere_client = Client(
api_key,
base_url=self.api_base_url,
max_retries=self.max_retries,
timeout=self.timeout,
client_name="haystack",
)
@@ -62,7 +62,7 @@ def get_response(
desc="Calculating embeddings",
):
batch = texts[i : i + batch_size]
response = cohere_client.embed(batch, model=model_name, input_type=input_type, truncate=truncate)
response = cohere_client.embed(texts=batch, model=model_name, input_type=input_type, truncate=truncate)
for emb in response.embeddings:
all_embeddings.append(emb)
if response.meta is not None:
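
For reference, the helper above embeds texts in fixed-size batches, and the change passes each batch via the texts= keyword. A simplified, self-contained sketch of that batching pattern with a stand-in embed function (illustrative only, not the Cohere SDK):

from typing import List


def fake_embed(texts: List[str]) -> List[List[float]]:
    # Stand-in for cohere_client.embed(texts=batch, ...); returns dummy vectors.
    return [[float(len(t))] for t in texts]


def embed_in_batches(texts: List[str], batch_size: int = 32) -> List[List[float]]:
    all_embeddings: List[List[float]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        # Passing the batch as a keyword argument mirrors the updated call in the diff.
        all_embeddings.extend(fake_embed(texts=batch))
    return all_embeddings


print(len(embed_in_batches(["hello"] * 70)))  # 70
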