From 3cb6d0d6743952c4b1ab9aadc59dd563f6a1075f Mon Sep 17 00:00:00 2001 From: Mouna Tayahi Date: Tue, 9 Jan 2024 13:38:57 +0100 Subject: [PATCH] Linter fixes --- .github/labeler.yml | 5 + .github/workflows/astra.yml | 3 + LICENSE | 201 ++++++++++++++++++ README.md | 79 +++++++ integrations/__init__.py | 0 integrations/astra/README.md | 135 ++++++------ integrations/astra/examples/example.py | 51 +++-- .../astra/examples/pipeline_example.py | 13 +- integrations/astra/pyproject.toml | 15 +- .../astra/src/astra_haystack/astra_client.py | 29 +-- .../src/astra_haystack/document_store.py | 71 ++++--- .../astra/src/astra_haystack/filters.py | 25 +-- .../astra/src/astra_haystack/retriever.py | 6 +- integrations/astra/tests/__init__.py | 3 - .../requirements.txt => requirements.txt | 0 15 files changed, 483 insertions(+), 153 deletions(-) create mode 100644 LICENSE create mode 100644 README.md delete mode 100644 integrations/__init__.py rename integrations/astra/requirements.txt => requirements.txt (100%) diff --git a/.github/labeler.yml b/.github/labeler.yml index 355e37231..319f7c726 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -4,6 +4,11 @@ integration:amazon-bedrock: - any-glob-to-any-file: "integrations/amazon_bedrock/**/*" - any-glob-to-any-file: ".github/workflows/amazon_bedrock.yml" +integration:astra: + - changed-files: + - any-glob-to-any-file: "integrations/astra/**/*" + - any-glob-to-any-file: ".github/workflows/astra.yml" + integration:chroma: - changed-files: - any-glob-to-any-file: "integrations/chroma/**/*" diff --git a/.github/workflows/astra.yml b/.github/workflows/astra.yml index d35b2bba8..b751550de 100644 --- a/.github/workflows/astra.yml +++ b/.github/workflows/astra.yml @@ -54,4 +54,7 @@ jobs: run: hatch run lint:all - name: Run tests + env: + ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }} + ASTRA_DB_ID: ${{ secrets.ASTRA_DB_ID }} run: hatch run cov \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..6134ab324 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 000000000..46517c76b --- /dev/null +++ b/README.md @@ -0,0 +1,79 @@ +# Haystack 2.x Core Integrations + +This repository contains integrations to extend the capabilities of [Haystack](https://github.com/deepset-ai/haystack) version 2.0 and +onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), see each integration's `README` file for details around installation, usage and support. + +## Contributing + +You will need `hatch` to work on or create new integrations. Run `pip install hatch` to install it. + +### Local development + +All the integrations are self contained, so the first step before working on one is to `cd` into the proper folder. 
+For example, to work on the Chroma Document Store, from the root of the repo: + +```sh +$ cd integrations/chroma +``` + +From there, you can run the tests with `hatch`, that will take care of setting up an isolated Python environment: + +```sh +hatch run test +``` + +Similarly, to run the linters: + +```sh +hatch run lint:all +``` + +### Create a new integration + +> Core integrations follow the naming convention `PREFIX-haystack`, where `PREFIX` can be the name of the technology +> you're integrating Haystack with. For example, a deepset integration would be named as `deepset-haystack`. + +To create a new integration, from the root of the repo change directory into `integrations`: + +```sh +cd integrations +``` + +From there, use `hatch` to create the scaffold of the new integration: + +```sh +$ hatch --config hatch.toml new -i +Project name: deepset-haystack +Description []: An example integration, this text can be edited later + +deepset-haystack +├── src +│ └── deepset_haystack +│ ├── __about__.py +│ └── __init__.py +├── tests +│ └── __init__.py +├── LICENSE.txt +├── README.md +└── pyproject.toml +``` + +## Inventory + +| Package | Type | PyPi Package | Status | +| ------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [amazon-bedrock-haystack](integrations/amazon-bedrock/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) | +| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | +| [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | +| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | +| [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / 
google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) | +| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | +| [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | +| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | +| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | +| [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) | +| [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | +| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | +| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | +| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI 
- Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) |
| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) |
diff --git a/integrations/__init__.py b/integrations/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/integrations/astra/README.md b/integrations/astra/README.md
index c72c525d2..1dd9138d8 100644
--- a/integrations/astra/README.md
+++ b/integrations/astra/README.md
@@ -1,91 +1,94 @@
-# Haystack 2.x Core Integrations
+[![test](https://github.com/deepset-ai/document-store/actions/workflows/test.yml/badge.svg)](https://github.com/deepset-ai/document-store/actions/workflows/test.yml)
 
-This repository contains integrations to extend the capabilities of [Haystack](https://github.com/deepset-ai/haystack) version 2.0 and
-onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), see each integration's `README` file for details around installation, usage and support.
+# Astra Store
 
-## Contributing
+## Installation
+Install the `astra-haystack` package locally to run the integration tests.
 
-You will need `hatch` to work on or create new integrations. Run `pip install hatch` to install it.
+Open in Gitpod:
+[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/Anant/astra-haystack/tree/main)
 
-### Local development
-
-All the integrations are self contained, so the first step before working on one is to `cd` into the proper folder.
-For example, to work on the Chroma Document Store, from the root of the repo:
-
-```sh
-$ cd integrations/chroma
+Switch the Python version to 3.9 (3.8+ is required, but 3.12 is not supported):
 ```
-
-From there, you can run the tests with `hatch`, that will take care of setting up an isolated Python environment:
-
-```sh
-hatch run test
+pyenv install 3.9
+pyenv local 3.9
 ```
 
-Similarly, to run the linters:
+Install the package locally:
+`pip install -e .`
+To execute the integration tests, set the required environment variables
+`ASTRA_DB_ID=`
+`ASTRA_DB_APPLICATION_TOKEN=`
+and execute
+`python examples/example.py`
 
-```sh
-hatch run lint:all
+Install the requirements:
+`pip install -r requirements.txt`
+
+Export the environment variables:
+```
+export KEYSPACE_NAME=
+export COLLECTION_NAME=
+export OPENAI_API_KEY=
+export ASTRA_DB_ID=
+export ASTRA_DB_REGION=
+export ASTRA_DB_APPLICATION_TOKEN=
 ```
 
+Run the Python examples:
+`python examples/example.py`
+or
+`python examples/pipeline_example.py`
+
 ## Usage
 
-This package includes Astra Document Store and Astra Retriever classes that integrate with Haystack 2.0, allowing you to easily perform document retrieval or RAG with Astra, and include those functions in Haystack pipelines.
+This package includes Astra Document Store and Astra Retriever classes that integrate with Haystack, allowing you to easily perform document retrieval or RAG with Astra and include those functions in Haystack pipelines.
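+
+As a quick smoke test, once a store has been created (as shown in the next section), you can
+write a document that carries a precomputed embedding and count it back. This is a minimal
+sketch: the zero vector is purely illustrative and stands in for a real embedding of length
+`embedding_dim` (384 in the examples below).
+
+```python
+from haystack import Document
+
+# The embedding length must match the embedding_dim the store was created with.
+doc = Document(content="Astra DB is a serverless vector database.", embedding=[0.0] * 384)
+
+document_store.write_documents([doc], policy=DuplicatePolicy.SKIP)
+print(document_store.count_documents())
+```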
 
 ### In order to use the Document Store directly:
 
 Import the Document Store:
 
 ```
 from astra_haystack.document_store import AstraDocumentStore
 from haystack.document_stores import DuplicatePolicy
 ```
 
-### Create a new integration
-
-> Core integrations follow the naming convention `PREFIX-haystack`, where `PREFIX` can be the name of the technology
-> you're integrating Haystack with. For example, a deepset integration would be named as `deepset-haystack`.
+Load the environment variables:
+```
+astra_id = os.getenv("ASTRA_DB_ID", "")
+astra_region = os.getenv("ASTRA_DB_REGION", "us-east1")
 
-To create a new integration, from the root of the repo change directory into `integrations`:
+astra_application_token = os.getenv("ASTRA_DB_APPLICATION_TOKEN", "")
+collection_name = os.getenv("COLLECTION_NAME", "haystack_vector_search")
+keyspace_name = os.getenv("KEYSPACE_NAME", "recommender_demo")
+```
 
-```sh
-cd integrations
+Create the Document Store object:
 ```
+document_store = AstraDocumentStore(
+    astra_id=astra_id,
+    astra_region=astra_region,
+    astra_collection=collection_name,
+    astra_keyspace=keyspace_name,
+    astra_application_token=astra_application_token,
+    duplicates_policy=DuplicatePolicy.SKIP,
+    embedding_dim=384,
+)
+```
+
+Then you can use document store functions like `count_documents` below:
+`document_store.count_documents()`
+
+### Using the Astra Retriever with Haystack Pipelines
+
+Create the Document Store object as above, then import and create the Pipeline:
 
-From there, use `hatch` to create the scaffold of the new integration:
-
-```sh
-$ hatch --config hatch.toml new -i
-Project name: deepset-haystack
-Description []: An example integration, this text can be edited later
-
-deepset-haystack
-├── src
-│   └── deepset_haystack
-│       ├── __about__.py
-│       └── __init__.py
-├── tests
-│   └── __init__.py
-├── LICENSE.txt
-├── README.md
-└── pyproject.toml
 ```
+from haystack import Pipeline
+pipeline = Pipeline()
+```
+
+Add your AstraRetriever to the pipeline:
+`pipeline.add_component(instance=AstraRetriever(document_store=document_store), name="retriever")`
 
-## Inventory
-
-| Package | Type | PyPI Package | Status |
-| ------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| [amazon-bedrock-haystack](integrations/amazon-bedrock/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) |
-| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | -| [cohere-haystack](integrations/cohere/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | -| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [google-ai-haystack](integrations/google_ai/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-ai-haystack.svg)](https://pypi.org/project/google-ai-haystack) | [![Test / google-ai](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_ai.yml) | -| [google-vertex-haystack](integrations/google_vertex/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/google-vertex-haystack.svg)](https://pypi.org/project/google-vertex-haystack) | [![Test / google-vertex](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/google_vertex.yml) | -| [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | -| [instructor-embedders-haystack](integrations/instructor_embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | -| [jina-haystack](integrations/jina/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | -| [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / 
llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) | -| [ollama-haystack](integrations/ollama/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/ollama-haystack.svg?color=orange)](https://pypi.org/project/ollama-haystack) | [![Test / ollama](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/ollama.yml) | -| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | -| [pinecone-haystack](integrations/pinecone/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/pinecone-haystack.svg?color=orange)](https://pypi.org/project/pinecone-haystack) | [![Test / pinecone](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/pinecone.yml) | -| [qdrant-haystack](integrations/qdrant/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/qdrant-haystack.svg?color=orange)](https://pypi.org/project/qdrant-haystack) | [![Test / qdrant](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/qdrant.yml) | -| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) | +Add other components and connect them as desired. 
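+For instance, a text embedder can feed the retriever so that query strings are embedded
+on the fly (a sketch, not part of this package: `SentenceTransformersTextEmbedder` is one
+Haystack embedder; any embedder whose output dimension matches the store's `embedding_dim`
+works):
+
+```python
+from haystack.components.embedders import SentenceTransformersTextEmbedder
+
+pipeline.add_component(instance=SentenceTransformersTextEmbedder(), name="embedder")
+# The embedder's output vector is wired into the retriever's query_embedding input.
+pipeline.connect("embedder.embedding", "retriever.query_embedding")
+```
+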
+Then run your pipeline:
+`pipeline.run(...)`
\ No newline at end of file
diff --git a/integrations/astra/examples/example.py b/integrations/astra/examples/example.py
index af2c12b4c..ac93f43ed 100644
--- a/integrations/astra/examples/example.py
+++ b/integrations/astra/examples/example.py
@@ -1,3 +1,4 @@
+import logging
 import os
 from pathlib import Path
@@ -12,9 +13,13 @@
 from astra_haystack.document_store import AstraDocumentStore
 from astra_haystack.retriever import AstraRetriever
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
 HERE = Path(__file__).resolve().parent
 file_paths = [HERE / "data" / Path(name) for name in os.listdir("integrations/astra/examples/data")]
-print(file_paths)
+logger.info(file_paths)
 
 astra_id = os.getenv("ASTRA_DB_ID", "")
 astra_region = os.getenv("ASTRA_DB_REGION", "us-east1")
@@ -67,15 +72,18 @@
 question = "This chapter introduces the manuals available with Vim"
 result = q.run({"embedder": {"text": question}, "retriever": {"top_k": 1}})
-print(result)
-
-print("count:")
-print(document_store.count_documents())
-assert document_store.count_documents() == 9
-
-print("filter:")
-print(
-    document_store.filter_documents(
+logger.info(result)
+
+ALL_DOCUMENTS_COUNT = 9
+documents_count = document_store.count_documents()
+logger.info("count:")
+logger.info(documents_count)
+if documents_count != ALL_DOCUMENTS_COUNT:
+    msg = f"count mismatch, expected {ALL_DOCUMENTS_COUNT} documents, got {documents_count}"
+    raise ValueError(msg)
+
+logger.info(
+    f"""filter results: {document_store.filter_documents(
         {
             "field": "meta",
             "operator": "==",
@@ -85,22 +93,27 @@
         },
     }
 )
+}"""
 )
-print("get_document_by_id")
-print(document_store.get_document_by_id("92ef055fbae55b2b0fc79d34cbf8a80b0ad7700ca526053223b0cc6d1351df10"))
-print("get_documents_by_ids")
-print(
-    document_store.get_documents_by_id(
+logger.info(
+    f"""get_document_by_id {document_store.get_document_by_id(
+        "92ef055fbae55b2b0fc79d34cbf8a80b0ad7700ca526053223b0cc6d1351df10")}"""
+)
+
+logger.info(
+    f"""get_documents_by_ids {document_store.get_documents_by_id(
         [
             "92ef055fbae55b2b0fc79d34cbf8a80b0ad7700ca526053223b0cc6d1351df10",
             "6f2450a51eaa3eeb9239d875402bcfe24b2d3534ff27f26c1f3fc8133b04e756",
         ]
-    )
+    )}"""
 )
 
 document_store.delete_documents(["92ef055fbae55b2b0fc79d34cbf8a80b0ad7700ca526053223b0cc6d1351df10"])
-print("count:")
-print(document_store.count_documents())
-assert document_store.count_documents() == 8
+documents_count = document_store.count_documents()
+logger.info(f"count: {documents_count}")
+if documents_count != ALL_DOCUMENTS_COUNT - 1:
+    msg = f"count mismatch, expected {ALL_DOCUMENTS_COUNT - 1} documents, got {documents_count}"
+    raise ValueError(msg)
diff --git a/integrations/astra/examples/pipeline_example.py b/integrations/astra/examples/pipeline_example.py
index 1fd49fd44..fb13c3d93 100644
--- a/integrations/astra/examples/pipeline_example.py
+++ b/integrations/astra/examples/pipeline_example.py
@@ -1,3 +1,4 @@
+import logging
 import os
 
 from haystack import Document, Pipeline
@@ -11,6 +12,9 @@
 from astra_haystack.document_store import AstraDocumentStore
 from astra_haystack.retriever import AstraRetriever
 
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
 # Create a RAG query pipeline
 prompt_template = """
                 Given these documents, answer the question.
@@ -48,10 +52,12 @@ documents = [ Document(content="There are over 7,000 languages spoken around the world today."), Document( - content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors." + content="Elephants have been observed to behave in a way that indicates" + " a high level of self-awareness, such as recognizing themselves in mirrors." ), Document( - content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves." + content="In certain parts of the world, like the Maldives, Puerto Rico, " + "and San Diego, you can witness the phenomenon of bioluminescent waves." ), ] p = Pipeline() @@ -97,4 +103,5 @@ "answer_builder": {"query": question}, } ) -print(result) + +logger.info(result) diff --git a/integrations/astra/pyproject.toml b/integrations/astra/pyproject.toml index aa891b257..b99449e03 100644 --- a/integrations/astra/pyproject.toml +++ b/integrations/astra/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ ] dependencies = [ "haystack-ai", + "pydantic", "typing_extensions", ] @@ -93,11 +94,6 @@ target-version = ["py37"] line-length = 120 skip-string-normalization = true -[tool.isort] -profile = 'black' -line_length = 79 -skip_gitignore = true - [tool.ruff] target-version = "py37" line-length = 120 @@ -142,6 +138,7 @@ unfixable = [ # Don't touch unused imports "F401", ] +exclude = ["example"] [tool.ruff.isort] known-first-party = ["astra_haystack"] @@ -157,10 +154,13 @@ ban-relative-imports = "all" source_pkgs = ["astra_haystack", "tests"] branch = true parallel = true +omit = [ + "example" +] [tool.coverage.paths] astra_haystack = ["src/astra_haystack", "*/astra-store/src/astra_haystack"] -tests = ["tests", "*/astra-store/tests"] +tests = ["tests"] [tool.coverage.report] exclude_lines = [ @@ -178,6 +178,9 @@ markers = [ [[tool.mypy.overrides]] module = [ + "astra_haystack.*", + "astra_client.*", + "pydantic.*", "haystack.*", "pytest.*" ] diff --git a/integrations/astra/src/astra_haystack/astra_client.py b/integrations/astra/src/astra_haystack/astra_client.py index 3f21ef628..ec0263a5a 100644 --- a/integrations/astra/src/astra_haystack/astra_client.py +++ b/integrations/astra/src/astra_haystack/astra_client.py @@ -10,7 +10,7 @@ @dataclass class Response: - id: str + document_id: str text: Optional[str] values: Optional[list] metadata: Optional[dict] @@ -80,13 +80,15 @@ def find_index(self): collection_embedding_dim = collection_name_matches[0]["options"]["vector"]["dimension"] if collection_embedding_dim != self.embedding_dim: - raise Exception( + msg = ( f"Collection vector dimension is not valid, expected {self.embedding_dim}, " f"found {collection_embedding_dim}" ) + raise Exception(msg) else: - raise Exception(f"status not in response: {response.text}") + msg = f"status not in response: {response.text}" + raise Exception(msg) return True @@ -107,9 +109,8 @@ def create_index(self): def query( self, vector: Optional[List[float]] = None, - filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, + query_filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, top_k: Optional[int] = None, - namespace: Optional[str] = None, include_metadata: Optional[bool] = None, include_values: Optional[bool] = None, ) -> QueryResponse: @@ -122,7 +123,7 @@ def query( being queried. Each `query()` request can contain only one of the parameters `queries`, `id` or `vector`... 
[optional] top_k (int): The number of results to return for each query. Must be an integer greater than 1. - filter (Dict[str, Union[str, float, int, bool, List, dict]): + query_filter (Dict[str, Union[str, float, int, bool, List, dict]): The filter to apply. You can use vector metadata to limit your search. [optional] include_metadata (bool): Indicates whether metadata is included in the response as well as the ids. If omitted the server will use the default value of False [optional] @@ -134,9 +135,9 @@ def query( """ # get vector data and scores if vector is None: - responses = self._query_without_vector(top_k, filter) + responses = self._query_without_vector(top_k, query_filter) else: - responses = self._query(vector, top_k, filter) + responses = self._query(vector, top_k, query_filter) # include_metadata means return all columns in the table (including text that got embedded) # include_values means return the vector of the embedding for the searched items @@ -158,7 +159,7 @@ def _format_query_response(responses, include_metadata, include_values): score = response.pop("$similarity", None) text = response.pop("content", None) values = response.pop("$vector", None) if include_values else [] - metadata = response if include_metadata else dict() # Add all remaining fields to the metadata + metadata = response if include_metadata else {} # Add all remaining fields to the metadata rsp = Response(_id, text, values, metadata, score) final_res.append(rsp) return QueryResponse(final_res) @@ -185,7 +186,7 @@ def find_documents(self, find_query): if "data" in response_dict and "documents" in response_dict["data"]: return response_dict["data"]["documents"] else: - logger.warning("No documents found", response_dict) + logger.warning(f"No documents found: {response_dict}") def get_documents(self, ids: List[str], batch_size: int = 20) -> QueryResponse: document_batch = [] @@ -253,14 +254,14 @@ def delete( self, ids: Optional[List[str]] = None, delete_all: Optional[bool] = None, - filter: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, + filters: Optional[Dict[str, Union[str, float, int, bool, List, dict]]] = None, ) -> int: if delete_all: - query = {"deleteMany": {}} + query = {"deleteMany": {}} # type: dict if ids is not None: query = {"deleteMany": {"filter": {"_id": {"$in": ids}}}} - if filter is not None: - query = {"deleteMany": {"filter": filter}} + if filters is not None: + query = {"deleteMany": {"filter": filters}} deletion_counter = 0 moredata = True diff --git a/integrations/astra/src/astra_haystack/document_store.py b/integrations/astra/src/astra_haystack/document_store.py index 6d1a887dd..a9a02c148 100644 --- a/integrations/astra/src/astra_haystack/document_store.py +++ b/integrations/astra/src/astra_haystack/document_store.py @@ -14,14 +14,15 @@ DuplicatePolicy, MissingDocumentError, ) -from pydantic import validate_arguments from astra_haystack.astra_client import AstraClient from astra_haystack.errors import AstraDocumentStoreFilterError from astra_haystack.filters import _convert_filters logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) + + +MAX_BATCH_SIZE = 20 def _batches(input_list, batch_size): @@ -35,7 +36,6 @@ class AstraDocumentStore: An AstraDocumentStore document store for Haystack. 
""" - @validate_arguments def __init__( self, astra_id: str, @@ -44,7 +44,7 @@ def __init__( astra_keyspace: str, astra_collection: str, embedding_dim: Optional[int] = 768, - duplicates_policy: Optional[DuplicatePolicy] = DuplicatePolicy.NONE, + duplicates_policy: DuplicatePolicy = DuplicatePolicy.NONE, similarity: str = "cosine", ): """ @@ -62,7 +62,8 @@ def __init__( :param similarity: The similarity function used to compare document vectors. :param duplicates_policy: Handle duplicate documents based on DuplicatePolicy parameter options. Parameter options : (SKIP, OVERWRITE, FAIL, NONE) - - `DuplicatePolicy.NONE`: Default policy, If a Document with the same id already exists, it is skipped and not written. + - `DuplicatePolicy.NONE`: Default policy, If a Document with the same id already exists, + it is skipped and not written. - `DuplicatePolicy.SKIP`: If a Document with the same id already exists, it is skipped and not written. - `DuplicatePolicy.OVERWRITE`: If a Document with the same id already exists, it is overwritten. - `DuplicatePolicy.FAIL`: If a Document with the same id already exists, an error is raised. @@ -107,7 +108,7 @@ def write_documents( self, documents: List[Document], index: Optional[str] = None, - batch_size: Optional[int] = 20, + batch_size: int = 20, policy: DuplicatePolicy = DuplicatePolicy.NONE, ): """ @@ -119,13 +120,19 @@ def write_documents( :param batch_size: Number of documents that are passed to bulk function at a time. :param policy: Handle duplicate documents based on DuplicatePolicy parameter options. Parameter options : (SKIP, OVERWRITE, FAIL, NONE) - - `DuplicatePolicy.NONE`: Default policy, If a Document with the same id already exists, it is skipped and not written. - - `DuplicatePolicy.SKIP`: If a Document with the same id already exists, it is skipped and not written. + - `DuplicatePolicy.NONE`: Default policy, If a Document with the same id already exists, + it is skipped and not written. + - `DuplicatePolicy.SKIP`: If a Document with the same id already exists, + it is skipped and not written. - `DuplicatePolicy.OVERWRITE`: If a Document with the same id already exists, it is overwritten. - `DuplicatePolicy.FAIL`: If a Document with the same id already exists, an error is raised. :return: int """ + if index is None and self.index is None: + msg = "No Astra client provided" + raise ValueError(msg) + if index is None: index = self.index @@ -135,12 +142,12 @@ def write_documents( else: policy = DuplicatePolicy.SKIP - if batch_size > 20: + if batch_size > MAX_BATCH_SIZE: logger.warning( f"batch_size set to {batch_size}, " f"but maximum batch_size for Astra when using the JSON API is 20. batch_size set to 20." ) - batch_size = 20 + batch_size = MAX_BATCH_SIZE def _convert_input_document(document: Union[dict, Document]): if isinstance(document, Document): @@ -148,21 +155,22 @@ def _convert_input_document(document: Union[dict, Document]): elif isinstance(document, dict): document_dict = document else: - raise ValueError(f"Unsupported type for documents, documents is of type {type(document)}.") + msg = f"Unsupported type for documents, documents is of type {type(document)}." 
+ raise ValueError(msg) if "id" in document_dict: if "_id" not in document_dict: document_dict["_id"] = document_dict.pop("id") elif "_id" in document_dict: - raise Exception( - f"Duplicate id definitions, both 'id' and '_id' present in document {document_dict}" - ) + msg = f"Duplicate id definitions, both 'id' and '_id' present in document {document_dict}" + raise Exception(msg) if "_id" in document_dict: if not isinstance(document_dict["_id"], str): - raise Exception( + msg = ( f"Document id {document_dict['_id']} is not a string, " f"but is of type {type(document_dict['_id'])}" ) + raise Exception(msg) if "dataframe" in document_dict and document_dict["dataframe"] is not None: document_dict["dataframe"] = document_dict.pop("dataframe").to_json() @@ -180,7 +188,8 @@ def _convert_input_document(document: Union[dict, Document]): response = self.index.find_documents({"filter": {"_id": doc["_id"]}}) if response: if policy == DuplicatePolicy.FAIL: - raise DuplicateDocumentError(f"ID '{doc['_id']}' already exists.") + msg = f"ID '{doc['_id']}' already exists." + raise DuplicateDocumentError(msg) duplicate_documents.append(doc) else: new_documents.append(doc) @@ -190,7 +199,7 @@ def _convert_input_document(document: Union[dict, Document]): if policy == DuplicatePolicy.SKIP: if len(new_documents) > 0: for batch in _batches(new_documents, batch_size): - inserted_ids = index.insert(batch) + inserted_ids = index.insert(batch) # type: ignore insertion_counter += len(inserted_ids) logger.info(f"write_documents inserted documents with id {inserted_ids}") else: @@ -199,7 +208,7 @@ def _convert_input_document(document: Union[dict, Document]): elif policy == DuplicatePolicy.OVERWRITE: if len(new_documents) > 0: for batch in _batches(new_documents, batch_size): - inserted_ids = index.insert(batch) + inserted_ids = index.insert(batch) # type: ignore insertion_counter += len(inserted_ids) logger.info(f"write_documents inserted documents with id {inserted_ids}") else: @@ -208,7 +217,7 @@ def _convert_input_document(document: Union[dict, Document]): if len(duplicate_documents) > 0: updated_ids = [] for duplicate_doc in duplicate_documents: - updated = index.update_document(duplicate_doc, "_id") + updated = index.update_document(duplicate_doc, "_id") # type: ignore if updated: updated_ids.append(duplicate_doc["_id"]) insertion_counter = insertion_counter + len(updated_ids) @@ -219,7 +228,7 @@ def _convert_input_document(document: Union[dict, Document]): elif policy == DuplicatePolicy.FAIL: if len(new_documents) > 0: for batch in _batches(new_documents, batch_size): - inserted_ids = index.insert(batch) + inserted_ids = index.insert(batch) # type: ignore insertion_counter = insertion_counter + len(inserted_ids) logger.info(f"write_documents inserted documents with id {inserted_ids}") else: @@ -264,13 +273,17 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc for vector in vectors: converted_filters = _convert_filters(filters) results = self.index.query( - vector=vector, filter=converted_filters, top_k=1000, include_values=True, include_metadata=True + vector=vector, + query_filter=converted_filters, + top_k=1000, + include_values=True, + include_metadata=True, ) documents.extend(self._get_result_to_documents(results)) else: converted_filters = _convert_filters(filters) results = self.index.query( - vector=vector, filter=converted_filters, top_k=1000, include_values=True, include_metadata=True + vector=vector, query_filter=converted_filters, top_k=1000, include_values=True, 
include_metadata=True ) documents = self._get_result_to_documents(results) return documents @@ -286,7 +299,7 @@ def _get_result_to_documents(results) -> List[Document]: df = None document = Document( content=match.text, - id=match.id, + id=match.document_id, embedding=match.values, dataframe=df, blob=match.metadata.pop("blob", None), @@ -313,7 +326,8 @@ def get_document_by_id(self, document_id: str) -> Document: document = self.index.get_documents(ids=[document_id]) ret = self._get_result_to_documents(document) if not ret: - raise MissingDocumentError(f"Document {document_id} does not exist") + msg = f"Document {document_id} does not exist" + raise MissingDocumentError(msg) return ret[0] def search( @@ -335,7 +349,7 @@ def search( self.index.query( vector=query_embedding, top_k=top_k, - filter=converted_filters, + query_filter=converted_filters, include_metadata=True, include_values=True, ) @@ -344,7 +358,7 @@ def search( return result - def delete_documents(self, document_ids: List[str] = None, delete_all: Optional[bool] = None) -> None: + def delete_documents(self, document_ids: Optional[List[str]] = None, delete_all: Optional[bool] = None) -> None: """ Deletes all documents with a matching document_ids from the document store. Fails with `MissingDocumentError` if no document with this id is present in the store. @@ -356,13 +370,14 @@ def delete_documents(self, document_ids: List[str] = None, delete_all: Optional[ deletion_counter = 0 if self.index.count_documents() > 0: if document_ids is not None: - for batch in _batches(document_ids, 20): + for batch in _batches(document_ids, MAX_BATCH_SIZE): deletion_counter += self.index.delete(ids=batch) else: deletion_counter = self.index.delete(delete_all=delete_all) logger.info(f"{deletion_counter} documents deleted") if document_ids is not None and deletion_counter == 0: - raise MissingDocumentError(f"Document {document_ids} does not exist") + msg = f"Document {document_ids} does not exist" + raise MissingDocumentError(msg) else: logger.info("No documents in document store") diff --git a/integrations/astra/src/astra_haystack/filters.py b/integrations/astra/src/astra_haystack/filters.py index 605d5ecaa..6b628486b 100644 --- a/integrations/astra/src/astra_haystack/filters.py +++ b/integrations/astra/src/astra_haystack/filters.py @@ -32,19 +32,19 @@ def _convert_filters(filters: Optional[Dict[str, Any]] = None) -> Optional[Dict[ else: if key == "id": filter_statements[key] = {"_id": value} - if key != "$in" and type(value) is list: + if key != "$in" and isinstance(value, list): filter_statements[key] = {"$in": value} + elif isinstance(value, pd.DataFrame): + filter_statements[key] = value.to_json() + elif isinstance(value, dict): + for dkey, dvalue in value.items(): + if dkey == "$in" and not isinstance(dvalue, list): + exception_message = f"$in operator must have `ARRAY`, got {dvalue} of type {type(dvalue)}" + raise FilterError(exception_message) + converted = {dkey: dvalue} + filter_statements[key] = converted else: - if type(value) is pd.DataFrame: - filter_statements[key] = value.to_json() - elif type(value) is dict: - for dkey, dvalue in value.items(): - if dkey == "$in" and type(dvalue) is not list: - raise FilterError(f"$in operator must have `ARRAY`, got {dvalue} of type {type(dvalue)}") - converted = {dkey: dvalue} - filter_statements[key] = converted - else: - filter_statements[key] = value + filter_statements[key] = value return filter_statements @@ -77,7 +77,8 @@ def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, 
Any]:
         if len(conditions) > 1:
             conditions = _normalize_ranges(conditions)
         if operator not in OPERATORS:
-            raise FilterError(f"Unknown operator {operator}")
+            msg = f"Unknown operator {operator}"
+            raise FilterError(msg)
         return {OPERATORS[operator]: conditions}
diff --git a/integrations/astra/src/astra_haystack/retriever.py b/integrations/astra/src/astra_haystack/retriever.py
index 22c8f2664..47304df2c 100644
--- a/integrations/astra/src/astra_haystack/retriever.py
+++ b/integrations/astra/src/astra_haystack/retriever.py
@@ -28,7 +28,8 @@ def __init__(self, document_store: AstraDocumentStore, filters: Optional[Dict[st
         self.document_store = document_store
 
         if not isinstance(document_store, AstraDocumentStore):
-            raise Exception("document_store must be an instance of AstraDocumentStore")
+            message = "document_store must be an instance of AstraDocumentStore"
+            raise Exception(message)
 
     @component.output_types(documents=List[Document])
     def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None):
 
         Args:
             query_embedding (List[float]): The embedding of the query
-            filters (Optional[Dict[str, Any]], optional): A dictionary with filters to narrow down the search space. Defaults to None.
+            filters (Optional[Dict[str, Any]], optional): A dictionary with filters to narrow down the search space.
+                Defaults to None.
             top_k (Optional[int], optional): The maximum number of documents to retrieve. Defaults to None.
         """
diff --git a/integrations/astra/tests/__init__.py b/integrations/astra/tests/__init__.py
index ad09dadb6..f5e799e88 100644
--- a/integrations/astra/tests/__init__.py
+++ b/integrations/astra/tests/__init__.py
@@ -1,6 +1,3 @@
 # SPDX-FileCopyrightText: 2023-present Anant Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
-import sys
-
-sys.path.append("../src/astra_haystack/")
diff --git a/integrations/astra/requirements.txt b/requirements.txt
similarity index 100%
rename from integrations/astra/requirements.txt
rename to requirements.txt