From 8df2edffb68340e4504af3e174de40f6f478c1e0 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 16 Nov 2023 09:30:23 +0100 Subject: [PATCH 01/36] add labeller workflow --- .github/workflows/labeler.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .github/workflows/labeler.yml diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 000000000..d3e9adbd9 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,15 @@ +name: "Labeler" +on: +- pull_request_target + +permissions: + contents: read + pull-requests: write + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v4 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" From 7d2b824ad27be1514a38a111d2d2a2480e12eaf8 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:05:42 +0100 Subject: [PATCH 02/36] Elasticsearch Document store - embedding retrieval (#52) * set scale_score default to False * unrelated: replace text w content * first implementation * test * fix some tests * make tests more robust; skip unsupported ones * rm unsupported test * ignore import-not-found * first chunk addressing PR feedback * improve tests --- .../elasticsearch/docker-compose.yml | 2 +- .../elasticsearch_haystack/document_store.py | 112 +++++++++++++++--- .../tests/test_bm25_retriever.py | 6 +- .../tests/test_document_store.py | 80 +++++++++++-- 4 files changed, 171 insertions(+), 29 deletions(-) diff --git a/document_stores/elasticsearch/docker-compose.yml b/document_stores/elasticsearch/docker-compose.yml index 6d21941b7..66dba73f5 100644 --- a/document_stores/elasticsearch/docker-compose.yml +++ b/document_stores/elasticsearch/docker-compose.yml @@ -1,6 +1,6 @@ services: elasticsearch: - image: "docker.elastic.co/elasticsearch/elasticsearch:8.10.0" + image: "docker.elastic.co/elasticsearch/elasticsearch:8.11.1" ports: - 9200:9200 restart: on-failure diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py index 740b54180..083918d71 100644 --- a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Dict, List, Mapping, Optional, Union +from typing import Any, Dict, List, Literal, Mapping, Optional, Union import numpy as np @@ -33,7 +33,14 @@ @document_store class ElasticsearchDocumentStore: - def __init__(self, *, hosts: Optional[Hosts] = None, index: str = "default", **kwargs): + def __init__( + self, + *, + hosts: Optional[Hosts] = None, + index: str = "default", + embedding_similarity_function: Literal["cosine", "dot_product", "l2_norm", "max_inner_product"] = "cosine", + **kwargs, + ): """ Creates a new ElasticsearchDocumentStore instance. @@ -45,19 +52,32 @@ def __init__(self, *, hosts: Optional[Hosts] = None, index: str = "default", **k :param hosts: List of hosts running the Elasticsearch client. Defaults to None :param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default" + :param embedding_similarity_function: The similarity function used to compare Documents embeddings. + Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created. 
+ To choose the most appropriate function, look for information about your embedding model. + To understand how document scores are computed, see the Elasticsearch documentation: + https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params :param **kwargs: Optional arguments that ``Elasticsearch`` takes. """ self._hosts = hosts self._client = Elasticsearch(hosts, **kwargs) self._index = index + self._embedding_similarity_function = embedding_similarity_function self._kwargs = kwargs # Check client connection, this will raise if not connected self._client.info() + # configure mapping for the embedding field + mappings = { + "properties": { + "embedding": {"type": "dense_vector", "index": True, "similarity": embedding_similarity_function} + } + } + # Create the index if it doesn't exist if not self._client.indices.exists(index=index): - self._client.indices.create(index=index) + self._client.indices.create(index=index, mappings=mappings) def to_dict(self) -> Dict[str, Any]: # This is not the best solution to serialise this class but is the fastest to implement. @@ -67,6 +87,7 @@ def to_dict(self) -> Dict[str, Any]: self, hosts=self._hosts, index=self._index, + embedding_similarity_function=self._embedding_similarity_function, **self._kwargs, ) @@ -80,6 +101,26 @@ def count_documents(self) -> int: """ return self._client.count(index=self._index)["count"] + def _search_documents(self, **kwargs) -> List[Document]: + """ + Calls the Elasticsearch client's search method and handles pagination. + """ + + documents: List[Document] = [] + from_ = 0 + # Handle pagination + while True: + res = self._client.search( + index=self._index, + from_=from_, + **kwargs, + ) + documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) + from_ = len(documents) + if from_ >= res["hits"]["total"]["value"]: + break + return documents + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: """ Returns the documents that match the filters provided. @@ -152,20 +193,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc :return: a list of Documents that match the given filters. """ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None - - documents: List[Document] = [] - from_ = 0 - # Handle pagination - while True: - res = self._client.search( - index=self._index, - query=query, - from_=from_, - ) - documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) - from_ = len(documents) - if from_ >= res["hits"]["total"]["value"]: - break + documents = self._search_documents(query=query) return documents def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None: @@ -306,3 +334,53 @@ def _bm25_retrieval( hit["_score"] = float(1 / (1 + np.exp(-np.asarray(hit["_score"] / BM25_SCALING_FACTOR)))) docs.append(self._deserialize_document(hit)) return docs + + def _embedding_retrieval( + self, + query_embedding: List[float], + *, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + num_candidates: Optional[int] = None, + ) -> List[Document]: + """ + Retrieves documents that are most similar to the query embedding using a vector similarity metric. + It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm. + + This method is not mean to be part of the public interface of + `ElasticsearchDocumentStore` nor called directly. 
+ `ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it. + + :param query_embedding: Embedding of the query. + :param filters: Filters applied to the retrieved Documents. Defaults to None. + Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. + :param top_k: Maximum number of Documents to return, defaults to 10 + :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. + Increasing this value will improve search accuracy at the cost of slower search speeds. + You can read more about it in the Elasticsearch documentation: + https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy + :raises ValueError: If `query_embedding` is an empty list + :return: List of Document that are most similar to `query_embedding` + """ + + if not query_embedding: + msg = "query_embedding must be a non-empty list of floats" + raise ValueError(msg) + + if not num_candidates: + num_candidates = top_k * 10 + + body: Dict[str, Any] = { + "knn": { + "field": "embedding", + "query_vector": query_embedding, + "k": top_k, + "num_candidates": num_candidates, + }, + } + + if filters: + body["knn"]["filter"] = _normalize_filters(filters) + + docs = self._search_documents(**body) + return docs diff --git a/document_stores/elasticsearch/tests/test_bm25_retriever.py b/document_stores/elasticsearch/tests/test_bm25_retriever.py index 86c5aac3a..9139368d9 100644 --- a/document_stores/elasticsearch/tests/test_bm25_retriever.py +++ b/document_stores/elasticsearch/tests/test_bm25_retriever.py @@ -27,7 +27,11 @@ def test_to_dict(_mock_elasticsearch_client): "type": "ElasticsearchBM25Retriever", "init_parameters": { "document_store": { - "init_parameters": {"hosts": "some fake host", "index": "default"}, + "init_parameters": { + "hosts": "some fake host", + "index": "default", + "embedding_similarity_function": "cosine", + }, "type": "ElasticsearchDocumentStore", }, "filters": {}, diff --git a/document_stores/elasticsearch/tests/test_document_store.py b/document_stores/elasticsearch/tests/test_document_store.py index 130da8340..11443546c 100644 --- a/document_stores/elasticsearch/tests/test_document_store.py +++ b/document_stores/elasticsearch/tests/test_document_store.py @@ -6,6 +6,7 @@ import pandas as pd import pytest +from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] from haystack.preview.dataclasses.document import Document from haystack.preview.document_stores.errors import DuplicateDocumentError from haystack.preview.document_stores.protocols import DuplicatePolicy @@ -30,7 +31,13 @@ def docstore(self, request): # Use a different index for each test so we can run them in parallel index = f"{request.node.name}" - store = ElasticsearchDocumentStore(hosts=hosts, index=index) + # this similarity function is rarely used in practice, but it is robust for test cases with fake embeddings + # in fact, it works fine with vectors like [0.0] * 768, while cosine similarity would raise an exception + embedding_similarity_function = "max_inner_product" + + store = ElasticsearchDocumentStore( + hosts=hosts, index=index, embedding_similarity_function=embedding_similarity_function + ) yield store store._client.options(ignore_status=[400, 404]).indices.delete(index=index) @@ -43,6 +50,7 @@ def test_to_dict(self, _mock_elasticsearch_client): "init_parameters": { "hosts": "some hosts", "index": 
"default", + "embedding_similarity_function": "cosine", }, } @@ -53,11 +61,13 @@ def test_from_dict(self, _mock_elasticsearch_client): "init_parameters": { "hosts": "some hosts", "index": "default", + "embedding_similarity_function": "cosine", }, } document_store = ElasticsearchDocumentStore.from_dict(data) assert document_store._hosts == "some hosts" assert document_store._index == "default" + assert document_store._embedding_similarity_function == "cosine" def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): docstore.write_documents( @@ -169,15 +179,6 @@ def test_in_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_ def test_in_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): pass - def test_ne_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - embedding = [0.0] * 768 - result = docstore.filter_documents(filters={"embedding": {"$ne": embedding}}) - assert self.contains_same_docs( - result, - [doc for doc in filterable_docs if doc.embedding is None or not embedding == doc.embedding], - ) - @pytest.mark.skip(reason="Not supported") def test_nin_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): pass @@ -186,6 +187,26 @@ def test_nin_filter_table(self, docstore: ElasticsearchDocumentStore, filterable def test_nin_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): pass + @pytest.mark.skip(reason="Not supported") + def test_eq_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): + """ + If the embedding field is a dense vector (as expected), raise the following error: + + elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', + "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") + """ + pass + + @pytest.mark.skip(reason="Not supported") + def test_ne_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): + """ + If the embedding field is a dense vector (as expected), raise the following error: + + elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', + "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") + """ + pass + def test_gt_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): docstore.write_documents(filterable_docs) result = docstore.filter_documents(filters={"page": {"$gt": "100"}}) @@ -231,3 +252,42 @@ def test_lte_filter_table(self, docstore: ElasticsearchDocumentStore, filterable docstore.write_documents(filterable_docs) result = docstore.filter_documents(filters={"dataframe": {"$lte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) assert self.contains_same_docs(result, [d for d in filterable_docs if d.dataframe is not None]) + + def test_embedding_retrieval(self, docstore: ElasticsearchDocumentStore): + docs = [ + Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), + Document(content="Not very similar document", embedding=[0.0, 0.8, 0.3, 0.9]), + ] + docstore.write_documents(docs) + results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={}) + assert len(results) == 2 + assert results[0].content == "Most similar document" + 
assert results[1].content == "2nd best document" + + def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStore): + docs = [ + Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), + Document( + content="Not very similar document with meta field", + embedding=[0.0, 0.8, 0.3, 0.9], + meta={"meta_field": "custom_value"}, + ), + ] + docstore.write_documents(docs) + + filters = {"meta_field": {"$eq": "custom_value"}} + results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters=filters) + assert len(results) == 1 + assert results[0].content == "Not very similar document with meta field" + + def test_embedding_retrieval_query_documents_different_embedding_sizes(self, docstore: ElasticsearchDocumentStore): + """ + Test that the retrieval fails if the query embedding and the documents have different embedding sizes. + """ + docs = [Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4])] + docstore.write_documents(docs) + + with pytest.raises(BadRequestError): + docstore._embedding_retrieval(query_embedding=[0.1, 0.1]) From 6babb9a4b3e35f46df9824a661b6b9a0d88a7d36 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:13:30 +0100 Subject: [PATCH 03/36] temporarily pin haystack-ai (#56) --- document_stores/elasticsearch/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/document_stores/elasticsearch/pyproject.toml b/document_stores/elasticsearch/pyproject.toml index 8861f188a..c54be02f2 100644 --- a/document_stores/elasticsearch/pyproject.toml +++ b/document_stores/elasticsearch/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ ] dependencies = [ # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" - "haystack-ai", + "haystack-ai==0.143.0", "elasticsearch>=8,<9", "typing_extensions", # This is not a direct dependency, but `haystack-ai` is missing it cause `canals` is missing it ] From 5ecacc57156d7f13a1d4df5db23a9c8a54735c68 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 16 Nov 2023 18:19:49 +0100 Subject: [PATCH 04/36] Elasticsearch Embedding Retriever (#54) * set scale_score default to False * unrelated: replace text w content * first implementation * test * fix some tests * make tests more robust; skip unsupported ones * rm unsupported test * ignore import-not-found * embedding retriever * tests * organize imports * first chunk addressing PR feedback * improve tests * add docstrings * more docstrings --- .../embedding_retriever.py | 80 +++++++++++++++++++ .../tests/test_embedding_retriever.py | 79 ++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py create mode 100644 document_stores/elasticsearch/tests/test_embedding_retriever.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py b/document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py new file mode 100644 index 000000000..3bb4576ec --- /dev/null +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict, List, Optional + +from haystack.preview import component, 
default_from_dict, default_to_dict +from haystack.preview.dataclasses import Document + +from elasticsearch_haystack.document_store import ElasticsearchDocumentStore + + +@component +class ElasticsearchEmbeddingRetriever: + """ + Uses a vector similarity metric to retrieve documents from the ElasticsearchDocumentStore. + + Needs to be connected to the ElasticsearchDocumentStore to run. + """ + + def __init__( + self, + *, + document_store: ElasticsearchDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + num_candidates: Optional[int] = None, + ): + """ + Create the ElasticsearchEmbeddingRetriever component. + + :param document_store: An instance of ElasticsearchDocumentStore. + :param filters: Filters applied to the retrieved Documents. Defaults to None. + Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. + :param top_k: Maximum number of Documents to return, defaults to 10 + :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. + Increasing this value will improve search accuracy at the cost of slower search speeds. + You can read more about it in the Elasticsearch documentation: + https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy + :raises ValueError: If `document_store` is not an instance of ElasticsearchDocumentStore. + """ + if not isinstance(document_store, ElasticsearchDocumentStore): + msg = "document_store must be an instance of ElasticsearchDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters or {} + self._top_k = top_k + self._num_candidates = num_candidates + + def to_dict(self) -> Dict[str, Any]: + return default_to_dict( + self, + filters=self._filters, + top_k=self._top_k, + num_candidates=self._num_candidates, + document_store=self._document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever": + data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query_embedding: List[float]): + """ + Retrieve documents using a vector similarity metric. + + :param query_embedding: Embedding of the query. + :return: List of Document similar to `query_embedding`. 
+ """ + docs = self._document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=self._filters, + top_k=self._top_k, + num_candidates=self._num_candidates, + ) + return {"documents": docs} diff --git a/document_stores/elasticsearch/tests/test_embedding_retriever.py b/document_stores/elasticsearch/tests/test_embedding_retriever.py new file mode 100644 index 000000000..b16e28830 --- /dev/null +++ b/document_stores/elasticsearch/tests/test_embedding_retriever.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import Mock, patch + +from haystack.preview.dataclasses import Document + +from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from elasticsearch_haystack.embedding_retriever import ElasticsearchEmbeddingRetriever + + +def test_init_default(): + mock_store = Mock(spec=ElasticsearchDocumentStore) + retriever = ElasticsearchEmbeddingRetriever(document_store=mock_store) + assert retriever._document_store == mock_store + assert retriever._filters == {} + assert retriever._top_k == 10 + assert retriever._num_candidates is None + + +@patch("elasticsearch_haystack.document_store.Elasticsearch") +def test_to_dict(_mock_elasticsearch_client): + document_store = ElasticsearchDocumentStore(hosts="some fake host") + retriever = ElasticsearchEmbeddingRetriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "ElasticsearchEmbeddingRetriever", + "init_parameters": { + "document_store": { + "init_parameters": { + "hosts": "some fake host", + "index": "default", + "embedding_similarity_function": "cosine", + }, + "type": "ElasticsearchDocumentStore", + }, + "filters": {}, + "top_k": 10, + "num_candidates": None, + }, + } + + +@patch("elasticsearch_haystack.document_store.Elasticsearch") +def test_from_dict(_mock_elasticsearch_client): + data = { + "type": "ElasticsearchEmbeddingRetriever", + "init_parameters": { + "document_store": { + "init_parameters": {"hosts": "some fake host", "index": "default"}, + "type": "ElasticsearchDocumentStore", + }, + "filters": {}, + "top_k": 10, + "num_candidates": None, + }, + } + retriever = ElasticsearchEmbeddingRetriever.from_dict(data) + assert retriever._document_store + assert retriever._filters == {} + assert retriever._top_k == 10 + assert retriever._num_candidates is None + + +def test_run(): + mock_store = Mock(spec=ElasticsearchDocumentStore) + mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] + retriever = ElasticsearchEmbeddingRetriever(document_store=mock_store) + res = retriever.run(query_embedding=[0.5, 0.7]) + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.5, 0.7], + filters={}, + top_k=10, + num_candidates=None, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + assert res["documents"][0].embedding == [0.1, 0.2] From 48c0d5fabb9a8aca5990ea8b1300ce6f01647c94 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 16 Nov 2023 18:45:28 +0100 Subject: [PATCH 05/36] [chroma] Update the code to account for the latest changes in haystack Document (#55) * update the code to the latest changes in haystack Document * pin haystack-ai * fix linter * do not add None as metadata --- document_stores/chroma/pyproject.toml | 2 +- .../src/chroma_haystack/document_store.py | 34 +++++++++++-------- .../chroma/tests/test_document_store.py | 2 +- 3 files 
changed, 21 insertions(+), 17 deletions(-) diff --git a/document_stores/chroma/pyproject.toml b/document_stores/chroma/pyproject.toml index d19461895..c1ab121a1 100644 --- a/document_stores/chroma/pyproject.toml +++ b/document_stores/chroma/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai", + "haystack-ai<0.144.0", "chromadb", ] diff --git a/document_stores/chroma/src/chroma_haystack/document_store.py b/document_stores/chroma/src/chroma_haystack/document_store.py index 30517ca5f..d67a2a36a 100644 --- a/document_stores/chroma/src/chroma_haystack/document_store.py +++ b/document_stores/chroma/src/chroma_haystack/document_store.py @@ -281,7 +281,8 @@ def _get_result_to_documents(self, result: GetResult) -> List[Document]: document_dict["content"] = result_documents[i] result_metadata = result.get("metadatas") - if result_metadata: + # Ensure metadata[i] is not None or don't add it to the document dict + if result_metadata and result_metadata[i]: document_dict["meta"] = result_metadata[i] result_embeddings = result.get("embeddings") @@ -296,26 +297,29 @@ def _query_result_to_documents(self, result: QueryResult) -> List[List[Document] """ Helper function to convert Chroma results into Haystack Documents """ - retval = [] - for i, answers in enumerate(result["documents"]): + retval: List[List[Document]] = [] + documents = result.get("documents") + if documents is None: + return retval + + for i, answers in enumerate(documents): converted_answers = [] for j in range(len(answers)): - # prepare metadata - metadata = result["metadatas"][i][j] - mime_type = metadata.pop("_mime_type") - - document_dict = { + document_dict: Dict[str, Any] = { "id": result["ids"][i][j], - "text": result["documents"][i][j].text, - "metadata": metadata, - "mime_type": mime_type, + "content": documents[i][j], } - if result["embeddings"][i][j]: - document_dict["embedding"] = np.array(result["embeddings"][i][j]) + # prepare metadata + if metadatas := result.get("metadatas"): + document_dict["metadata"] = dict(metadatas[i][j]) + document_dict["mime_type"] = document_dict["metadata"].pop("_mime_type") + + if embeddings := result.get("embeddings"): + document_dict["embedding"] = np.array(embeddings[i][j]) - if result["distances"][i][j]: - document_dict["score"] = result["distances"][i][j] + if distances := result.get("distances"): + document_dict["score"] = distances[i][j] converted_answers.append(Document.from_dict(document_dict)) retval.append(converted_answers) diff --git a/document_stores/chroma/tests/test_document_store.py b/document_stores/chroma/tests/test_document_store.py index 8d420be1a..ea204f6f6 100644 --- a/document_stores/chroma/tests/test_document_store.py +++ b/document_stores/chroma/tests/test_document_store.py @@ -64,7 +64,7 @@ def test_delete_not_empty_nonexisting(self, docstore: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ - doc = Document(text="test doc") + doc = Document(content="test doc") docstore.write_documents([doc]) docstore.delete_documents(["non_existing"]) From 1c6410e3f2f1c8758285df17f74e826c66050a42 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Mon, 20 Nov 2023 17:26:03 +0100 Subject: [PATCH 06/36] Elasticsearch - refactor `_search_documents` (#57) * set scale_score default to False * unrelated: replace text w content * first implementation * test * fix some tests * make tests more robust; skip 
unsupported ones * rm unsupported test * ignore import-not-found * first chunk addressing PR feedback * improve tests * use _search_documents also in bm25 retrieval * improve logic and tests * fix format * better format * Update document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * Update document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * remove wrong increment * move ruff ignore error --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../elasticsearch_haystack/document_store.py | 21 ++++++--- .../tests/test_document_store.py | 44 +++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py index 083918d71..4d1903e9f 100644 --- a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -106,6 +106,10 @@ def _search_documents(self, **kwargs) -> List[Document]: Calls the Elasticsearch client's search method and handles pagination. """ + top_k = kwargs.get("size") + if top_k is None and "knn" in kwargs and "k" in kwargs["knn"]: + top_k = kwargs["knn"]["k"] + documents: List[Document] = [] from_ = 0 # Handle pagination @@ -115,8 +119,12 @@ def _search_documents(self, **kwargs) -> List[Document]: from_=from_, **kwargs, ) + documents.extend(self._deserialize_document(hit) for hit in res["hits"]["hits"]) from_ = len(documents) + + if top_k is not None and from_ >= top_k: + break if from_ >= res["hits"]["total"]["value"]: break return documents @@ -326,14 +334,13 @@ def _bm25_retrieval( if filters: body["query"]["bool"]["filter"] = _normalize_filters(filters) - res = self._client.search(index=self._index, **body) + documents = self._search_documents(**body) - docs = [] - for hit in res["hits"]["hits"]: - if scale_score: - hit["_score"] = float(1 / (1 + np.exp(-np.asarray(hit["_score"] / BM25_SCALING_FACTOR)))) - docs.append(self._deserialize_document(hit)) - return docs + if scale_score: + for doc in documents: + doc.score = float(1 / (1 + np.exp(-np.asarray(doc.score / BM25_SCALING_FACTOR)))) + + return documents def _embedding_retrieval( self, diff --git a/document_stores/elasticsearch/tests/test_document_store.py b/document_stores/elasticsearch/tests/test_document_store.py index 11443546c..1e7b3f115 100644 --- a/document_stores/elasticsearch/tests/test_document_store.py +++ b/document_stores/elasticsearch/tests/test_document_store.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: 2023-present Silvano Cerza # # SPDX-License-Identifier: Apache-2.0 + +import random from typing import List from unittest.mock import patch @@ -92,6 +94,34 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): assert "functional" in res[1].content assert "functional" in res[2].content + def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + """ + Test that handling of pagination works as expected, when the matching documents are > 10. 
+ """ + docstore.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + Document(content="Java is an object oriented programming language"), + Document(content="Javascript is a programming language"), + Document(content="Typescript is a programming language"), + Document(content="C is a programming language"), + ] + ) + + res = docstore._bm25_retrieval("programming", top_k=11) + assert len(res) == 11 + assert all("programming" in doc.content for doc in res) + def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore): docstore.write_documents( [ @@ -282,6 +312,20 @@ def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStor assert len(results) == 1 assert results[0].content == "Not very similar document with meta field" + def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + """ + Test that handling of pagination works as expected, when the matching documents are > 10. + """ + + docs = [ + Document(content=f"Document {i}", embedding=[random.random() for _ in range(4)]) # noqa: S311 + for i in range(20) + ] + + docstore.write_documents(docs) + results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={}) + assert len(results) == 11 + def test_embedding_retrieval_query_documents_different_embedding_sizes(self, docstore: ElasticsearchDocumentStore): """ Test that the retrieval fails if the query embedding and the documents have different embedding sizes. From dfafb53791a785eb6deb58bdcd70ec07e698cf49 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Tue, 21 Nov 2023 16:21:15 +0100 Subject: [PATCH 07/36] Update README.md --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c4ea3fdb4..3511f588a 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,7 @@ -# Haystack 2.x additional resources +# Haystack 2.x Core Integrations This repository contains integrations to extend the capabilities of [Haystack](https://github.com/deepset-ai/haystack) version 2.0 and -onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), some of it on a best-effort -basis: see each folder's `README` file for details around installation, usage and support. - -This is the list of packages currently hosted in this repo. +onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), see each integration's `README` file for details around installation, usage and support. 
| Package | Type | PyPi Package | Status | | ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | From 6727367bcbdf5fe4050b166419fe5b9deda3baf4 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:42:24 +0100 Subject: [PATCH 08/36] Move `instructor-embedders` to `components/embedders` (#60) * move embedders to a subfolder * fix types in tests * fix README --- .github/workflows/components_instructor_embedders.yml | 6 +++--- README.md | 2 +- .../{ => embedders}/instructor-embedders/LICENSE.txt | 0 components/{ => embedders}/instructor-embedders/README.md | 0 .../instructor_embedders/__about__.py | 0 .../instructor-embedders/instructor_embedders/__init__.py | 0 .../instructor_embedders/embedding_backend/__init__.py | 0 .../embedding_backend/instructor_backend.py | 0 .../instructor_embedders/instructor_document_embedder.py | 2 +- .../instructor_embedders/instructor_text_embedder.py | 0 .../{ => embedders}/instructor-embedders/pyproject.toml | 0 .../instructor-embedders/tests/__init__.py | 0 .../instructor-embedders/tests/test_instructor_backend.py | 0 .../tests/test_instructor_document_embedder.py | 8 ++++---- .../tests/test_instructor_embedders.py | 0 .../tests/test_instructor_text_embedder.py | 8 ++++---- 16 files changed, 13 insertions(+), 13 deletions(-) rename components/{ => embedders}/instructor-embedders/LICENSE.txt (100%) rename components/{ => embedders}/instructor-embedders/README.md (100%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/__about__.py (100%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/__init__.py (100%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/embedding_backend/__init__.py (100%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py (100%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/instructor_document_embedder.py (99%) rename components/{ => embedders}/instructor-embedders/instructor_embedders/instructor_text_embedder.py (100%) rename components/{ => embedders}/instructor-embedders/pyproject.toml (100%) rename components/{ => embedders}/instructor-embedders/tests/__init__.py (100%) rename components/{ => embedders}/instructor-embedders/tests/test_instructor_backend.py (100%) rename components/{ => embedders}/instructor-embedders/tests/test_instructor_document_embedder.py (96%) rename components/{ => embedders}/instructor-embedders/tests/test_instructor_embedders.py (100%) rename components/{ => embedders}/instructor-embedders/tests/test_instructor_text_embedder.py (95%) diff --git a/.github/workflows/components_instructor_embedders.yml b/.github/workflows/components_instructor_embedders.yml index 95d957d19..c20363d51 100644 --- a/.github/workflows/components_instructor_embedders.yml +++ b/.github/workflows/components_instructor_embedders.yml @@ -5,12 +5,12 @@ on: - cron: "0 0 * * *" pull_request: paths: - - 'components/instructor-embedders/**' + - 
'components/embedders/instructor-embedders/**' - '.github/workflows/components_instructor_embedders.yml' defaults: run: - working-directory: components/instructor-embedders + working-directory: components/embedders/instructor-embedders jobs: test: @@ -27,7 +27,7 @@ jobs: - name: Ruff uses: chartboost/ruff-action@v1 with: - src: components/instructor-embedders + src: components/embedders/instructor-embedders - name: Install instructor-embedders run: | diff --git a/README.md b/README.md index 3511f588a..cdbb0e1ab 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai | ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [chroma-haystack](document_stores/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / Document Stores / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml) | | [elasticsearch-haystack](document_stores/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / Document Stores / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml) | -| [instructor-embedders-haystack](components/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml) | +| [instructor-embedders-haystack](components/embedders/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml) | ## Contributing diff --git a/components/instructor-embedders/LICENSE.txt b/components/embedders/instructor-embedders/LICENSE.txt similarity index 100% rename from components/instructor-embedders/LICENSE.txt rename to components/embedders/instructor-embedders/LICENSE.txt diff --git a/components/instructor-embedders/README.md b/components/embedders/instructor-embedders/README.md similarity index 100% rename from components/instructor-embedders/README.md rename to 
components/embedders/instructor-embedders/README.md diff --git a/components/instructor-embedders/instructor_embedders/__about__.py b/components/embedders/instructor-embedders/instructor_embedders/__about__.py similarity index 100% rename from components/instructor-embedders/instructor_embedders/__about__.py rename to components/embedders/instructor-embedders/instructor_embedders/__about__.py diff --git a/components/instructor-embedders/instructor_embedders/__init__.py b/components/embedders/instructor-embedders/instructor_embedders/__init__.py similarity index 100% rename from components/instructor-embedders/instructor_embedders/__init__.py rename to components/embedders/instructor-embedders/instructor_embedders/__init__.py diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py b/components/embedders/instructor-embedders/instructor_embedders/embedding_backend/__init__.py similarity index 100% rename from components/instructor-embedders/instructor_embedders/embedding_backend/__init__.py rename to components/embedders/instructor-embedders/instructor_embedders/embedding_backend/__init__.py diff --git a/components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/components/embedders/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py similarity index 100% rename from components/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py rename to components/embedders/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py diff --git a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/components/embedders/instructor-embedders/instructor_embedders/instructor_document_embedder.py similarity index 99% rename from components/instructor-embedders/instructor_embedders/instructor_document_embedder.py rename to components/embedders/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 083986385..91a8b38e2 100644 --- a/components/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/components/embedders/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -50,7 +50,7 @@ class InstructorDocumentEmbedder: ] result = doc_embedder.run(document_list) - print(f"Document Text: {result['documents'][0].text}") + print(f"Document Text: {result['documents'][0].content}") print(f"Document Embedding: {result['documents'][0].embedding}") print(f"Embedding Dimension: {len(result['documents'][0].embedding)}") """ # noqa: E501 diff --git a/components/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/components/embedders/instructor-embedders/instructor_embedders/instructor_text_embedder.py similarity index 100% rename from components/instructor-embedders/instructor_embedders/instructor_text_embedder.py rename to components/embedders/instructor-embedders/instructor_embedders/instructor_text_embedder.py diff --git a/components/instructor-embedders/pyproject.toml b/components/embedders/instructor-embedders/pyproject.toml similarity index 100% rename from components/instructor-embedders/pyproject.toml rename to components/embedders/instructor-embedders/pyproject.toml diff --git a/components/instructor-embedders/tests/__init__.py b/components/embedders/instructor-embedders/tests/__init__.py similarity index 100% rename from components/instructor-embedders/tests/__init__.py rename to components/embedders/instructor-embedders/tests/__init__.py 
diff --git a/components/instructor-embedders/tests/test_instructor_backend.py b/components/embedders/instructor-embedders/tests/test_instructor_backend.py similarity index 100% rename from components/instructor-embedders/tests/test_instructor_backend.py rename to components/embedders/instructor-embedders/tests/test_instructor_backend.py diff --git a/components/instructor-embedders/tests/test_instructor_document_embedder.py b/components/embedders/instructor-embedders/tests/test_instructor_document_embedder.py similarity index 96% rename from components/instructor-embedders/tests/test_instructor_document_embedder.py rename to components/embedders/instructor-embedders/tests/test_instructor_document_embedder.py index f65d81b5f..1b53c6c1b 100644 --- a/components/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/components/embedders/instructor-embedders/tests/test_instructor_document_embedder.py @@ -58,7 +58,7 @@ def test_to_dict(self): embedder = InstructorDocumentEmbedder(model_name_or_path="hkunlp/instructor-base") embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "InstructorDocumentEmbedder", + "type": "instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", @@ -90,7 +90,7 @@ def test_to_dict_with_custom_init_parameters(self): ) embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "InstructorDocumentEmbedder", + "type": "instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cuda", @@ -110,7 +110,7 @@ def test_from_dict(self): Test deserialization of InstructorDocumentEmbedder from a dictionary, using default initialization parameters. """ embedder_dict = { - "type": "InstructorDocumentEmbedder", + "type": "instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", @@ -140,7 +140,7 @@ def test_from_dict_with_custom_init_parameters(self): Test deserialization of InstructorDocumentEmbedder from a dictionary, using custom initialization parameters. 
""" embedder_dict = { - "type": "InstructorDocumentEmbedder", + "type": "instructor_embedders.instructor_document_embedder.InstructorDocumentEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cuda", diff --git a/components/instructor-embedders/tests/test_instructor_embedders.py b/components/embedders/instructor-embedders/tests/test_instructor_embedders.py similarity index 100% rename from components/instructor-embedders/tests/test_instructor_embedders.py rename to components/embedders/instructor-embedders/tests/test_instructor_embedders.py diff --git a/components/instructor-embedders/tests/test_instructor_text_embedder.py b/components/embedders/instructor-embedders/tests/test_instructor_text_embedder.py similarity index 95% rename from components/instructor-embedders/tests/test_instructor_text_embedder.py rename to components/embedders/instructor-embedders/tests/test_instructor_text_embedder.py index 4481fcb97..4dd1b13af 100644 --- a/components/instructor-embedders/tests/test_instructor_text_embedder.py +++ b/components/embedders/instructor-embedders/tests/test_instructor_text_embedder.py @@ -51,7 +51,7 @@ def test_to_dict(self): embedder = InstructorTextEmbedder(model_name_or_path="hkunlp/instructor-base") embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "InstructorTextEmbedder", + "type": "instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", @@ -79,7 +79,7 @@ def test_to_dict_with_custom_init_parameters(self): ) embedder_dict = embedder.to_dict() assert embedder_dict == { - "type": "InstructorTextEmbedder", + "type": "instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cuda", @@ -97,7 +97,7 @@ def test_from_dict(self): Test deserialization of InstructorTextEmbedder from a dictionary, using default initialization parameters. """ embedder_dict = { - "type": "InstructorTextEmbedder", + "type": "instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cpu", @@ -123,7 +123,7 @@ def test_from_dict_with_custom_init_parameters(self): Test deserialization of InstructorTextEmbedder from a dictionary, using custom initialization parameters. 
""" embedder_dict = { - "type": "InstructorTextEmbedder", + "type": "instructor_embedders.instructor_text_embedder.InstructorTextEmbedder", "init_parameters": { "model_name_or_path": "hkunlp/instructor-base", "device": "cuda", From c5da6012378a412773bc0c5d260a7465fd103756 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:25:41 +0100 Subject: [PATCH 09/36] [Elasticsearch Document Store] improve error handling in `write_documents` (#59) * improve error handling + tests * improve logic to support skip * Update document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --------- Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> --- .../elasticsearch_haystack/document_store.py | 36 +++++++++++++------ .../tests/test_document_store.py | 14 +++++++- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py index 4d1903e9f..d131cdf01 100644 --- a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -12,7 +12,7 @@ from haystack.preview import default_from_dict, default_to_dict from haystack.preview.dataclasses import Document from haystack.preview.document_stores.decorator import document_store -from haystack.preview.document_stores.errors import DuplicateDocumentError +from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.preview.document_stores.protocols import DuplicatePolicy from elasticsearch_haystack.filters import _normalize_filters @@ -214,7 +214,10 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D - skip: keep the existing document and ignore the new one. - overwrite: remove the old document and write the new one. - fail: an error is raised + + :raises ValueError: if 'documents' parameter is not a list of Document objects :raises DuplicateDocumentError: Exception trigger on duplicate document if `policy=DuplicatePolicy.FAIL` + :raises DocumentStoreError: Exception trigger on any other error when writing documents :return: None """ if len(documents) > 0: @@ -237,16 +240,27 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D index=self._index, raise_on_error=False, ) - if errors and policy == DuplicatePolicy.FAIL: - # TODO: Handle errors in a better way, we're assuming that all errors - # are related to duplicate documents but that could be very well be wrong. - - # mypy complains that `errors`` could be either `int` or a `list` of `dict`s. - # Since the type depends on the parameters passed to `helpers.bulk()`` we know - # for sure that it will be a `list`. - ids = ", ".join(e["create"]["_id"] for e in errors) # type: ignore[union-attr] - msg = f"IDs '{ids}' already exist in the document store." 
- raise DuplicateDocumentError(msg) + + if errors: + duplicate_errors_ids = [] + other_errors = [] + for e in errors: + error_type = e["create"]["error"]["type"] + if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception": + duplicate_errors_ids.append(e["create"]["_id"]) + elif policy == DuplicatePolicy.SKIP and error_type == "version_conflict_engine_exception": + # when the policy is skip, duplication errors are OK and we should not raise an exception + continue + else: + other_errors.append(e) + + if len(duplicate_errors_ids) > 0: + msg = f"IDs '{', '.join(duplicate_errors_ids)}' already exist in the document store." + raise DuplicateDocumentError(msg) + + if len(other_errors) > 0: + msg = f"Failed to write documents to Elasticsearch. Errors:\n{other_errors}" + raise DocumentStoreError(msg) def _deserialize_document(self, hit: Dict[str, Any]) -> Document: """ diff --git a/document_stores/elasticsearch/tests/test_document_store.py b/document_stores/elasticsearch/tests/test_document_store.py index 1e7b3f115..e71603126 100644 --- a/document_stores/elasticsearch/tests/test_document_store.py +++ b/document_stores/elasticsearch/tests/test_document_store.py @@ -10,7 +10,7 @@ import pytest from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] from haystack.preview.dataclasses.document import Document -from haystack.preview.document_stores.errors import DuplicateDocumentError +from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.preview.document_stores.protocols import DuplicatePolicy from haystack.preview.testing.document_store import DocumentStoreBaseTests @@ -335,3 +335,15 @@ def test_embedding_retrieval_query_documents_different_embedding_sizes(self, doc with pytest.raises(BadRequestError): docstore._embedding_retrieval(query_embedding=[0.1, 0.1]) + + def test_write_documents_different_embedding_sizes_fail(self, docstore: ElasticsearchDocumentStore): + """ + Test that write_documents fails if the documents have different embedding sizes. 
+ """ + docs = [ + Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]), + Document(content="Hello world", embedding=[0.1, 0.2]), + ] + + with pytest.raises(DocumentStoreError): + docstore.write_documents(docs) From 5ec0882f86dcede0de952dfdf5a7ba358e9f4cb5 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Thu, 23 Nov 2023 12:28:09 +0100 Subject: [PATCH 10/36] Add Unstructured file converter (#61) * first skeleton * add fileconverter * first tests * more tests * fix format * add workflow * fix workflow * change labeler * fix mypy * fix format * mypy * path to str * try to fix coverage path * another path to str * rm from_dict * add entry to general README * rm test_from_dict * try using services * rm leftover * mv file_converters to converters * update pyproject * update workflow --- .github/labeler.yml | 4 +- .../components_unstructured_fileconverter.yml | 59 +++++ README.md | 1 + .../unstructured_fileconverter/LICENSE | 201 ++++++++++++++++++ .../unstructured_fileconverter/README.md | 86 ++++++++ .../unstructured_fileconverter/pyproject.toml | 178 ++++++++++++++++ .../__about__.py | 4 + .../__init__.py | 6 + .../fileconverter.py | 173 +++++++++++++++ .../tests/samples/sample_pdf.pdf | Bin 0 -> 26093 bytes .../tests/test_fileconverter.py | 97 +++++++++ 11 files changed, 808 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/components_unstructured_fileconverter.yml create mode 100644 components/converters/unstructured_fileconverter/LICENSE create mode 100644 components/converters/unstructured_fileconverter/README.md create mode 100644 components/converters/unstructured_fileconverter/pyproject.toml create mode 100644 components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py create mode 100644 components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py create mode 100644 components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py create mode 100644 components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf create mode 100644 components/converters/unstructured_fileconverter/tests/test_fileconverter.py diff --git a/.github/labeler.yml b/.github/labeler.yml index f46dac561..f5bb7b448 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,6 +1,8 @@ # Integrations integration:instructor-embedders: -- components/instructor-embedders/**/* +- components/embedders/instructor-embedders/**/* +integration:unstructured-fileconverter: +- components/converters/unstructured_fileconverter/**/* integration:chroma: - document_stores/chroma/**/* integration:elasticsearch: diff --git a/.github/workflows/components_unstructured_fileconverter.yml b/.github/workflows/components_unstructured_fileconverter.yml new file mode 100644 index 000000000..f60573f79 --- /dev/null +++ b/.github/workflows/components_unstructured_fileconverter.yml @@ -0,0 +1,59 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / Components / unstructured-fileconverter + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "components/converters/unstructured_fileconverter/**" + - ".github/workflows/components_unstructured_fileconverter.yml" + +concurrency: + group: components_unstructured_fileconverter-${{ github.head_ref }} + cancel-in-progress: true + +env: + 
PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + services: + unstructured-api: + image: "quay.io/unstructured-io/unstructured-api:latest" + ports: + - 8000:8000 + options: >- + --health-cmd "curl --fail http://localhost:8000/healthcheck || exit 1" + --health-interval 10s + --health-timeout 1s + --health-retries 10 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + working-directory: components/converters/unstructured_fileconverter + if: matrix.python-version == '3.9' + run: hatch run lint:all + + - name: Run tests + working-directory: components/converters/unstructured_fileconverter + run: hatch run cov diff --git a/README.md b/README.md index cdbb0e1ab..475da2c7d 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai | [chroma-haystack](document_stores/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / Document Stores / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml) | | [elasticsearch-haystack](document_stores/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / Document Stores / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml) | | [instructor-embedders-haystack](components/embedders/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml) | +| [unstructured-fileconverter-haystack](components/converters/unstructured_fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured-fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml) ## Contributing diff --git a/components/converters/unstructured_fileconverter/LICENSE b/components/converters/unstructured_fileconverter/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ 
b/components/converters/unstructured_fileconverter/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/components/converters/unstructured_fileconverter/README.md b/components/converters/unstructured_fileconverter/README.md
new file mode 100644
index 000000000..18a826bca
--- /dev/null
+++ b/components/converters/unstructured_fileconverter/README.md
@@ -0,0 +1,86 @@
+# Unstructured FileConverter for Haystack
+
+
+
+Component for the Haystack (2.x) LLM framework to easily convert files and directories into Documents using the Unstructured API.
+
+**[Unstructured](https://unstructured-io.github.io/unstructured/index.html)** provides a series of tools to do **ETL for LLMs**. This component calls the Unstructured API that simply extracts text and other information from a vast range of file formats.
+**[Supported file types](https://unstructured-io.github.io/unstructured/api.html#supported-file-types)**.
+
+**[Haystack](https://github.com/deepset-ai/haystack)** is an **orchestration framework** to build customizable, production-ready **LLM applications**.
+Once your files are converted into Documents, you can start building RAG, question answering, semantic search applications and more.
+
+- [Installation](#installation)
+- [Usage](#usage)
+- [Configuration](#configuration)
+
+## Installation
+
+```bash
+pip install unstructured-fileconverter-haystack
+```
+
+### Hosted API
+If you plan to use the hosted version of the Unstructured API, you just need the **(free) Unstructured API key**. You can get it by signing up [here](https://unstructured.io/api-key).
+
+### Local API (Docker)
+If you want to run your own local instance of the Unstructured API, you need Docker and you can find instructions [here](https://unstructured-io.github.io/unstructured/api.html#using-docker-images).
+
+In short, this should work:
+```bash
+docker run -p 8000:8000 -d --rm --name unstructured-api quay.io/unstructured-io/unstructured-api:latest --port 8000 --host 0.0.0.0
+```
+
+## Usage
+
+### In isolation
+```python
+import os
+from unstructured_fileconverter_haystack import UnstructuredFileConverter
+
+os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY"
+
+converter = UnstructuredFileConverter()
+
+documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["documents"]
+
+```
+
+### In a Haystack Pipeline
+```python
+import os
+from haystack.preview import Pipeline
+from haystack.preview.components.writers import DocumentWriter
+from haystack.preview.document_stores import MemoryDocumentStore
+from unstructured_fileconverter_haystack import UnstructuredFileConverter
+
+os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY"
+
+document_store = MemoryDocumentStore()
+
+indexing = Pipeline()
+indexing.add_component("converter", UnstructuredFileConverter())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "writer")
+
+indexing.run({"converter": {"paths": ["a/file/path.pdf", "a/directory/path"]}})
+```
+
+## Configuration
+
+### Initialization parameters
+- `api_url`: URL of the Unstructured API. Defaults to the hosted version. If you run the API locally, you should specify this parameter.
+- `api_key`: API key for the Unstructured API (https://unstructured.io/#get-api-key). + If you run the API locally, it is not needed. + If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY. +- `document_creation_mode`: How to create Haystack Documents from the elements returned by Unstructured. + - `"one-doc-per-file"`: One Haystack Document per file. All elements are concatenated into one text field. + - `"one-doc-per-page"`: One Haystack Document per page. All elements on a page are concatenated into one text field. + - `"one-doc-per-element"`: One Haystack Document per element. Each element is converted to a Haystack Document + - `separator`: Separator between elements when concatenating them into one text field. +- `unstructured_kwargs`: Additional keyword arguments that are passed to the Unstructured API. They can be helpful to improve or speed up the conversion. See https://unstructured-io.github.io/unstructured/api.html#parameters. + +### `run` method +The method `run` just expects a list of paths (files or directories) in the `paths` parameter. + +If `paths` contains a directory, all files in the first level of the directory are converted. Subdirectories are ignored. diff --git a/components/converters/unstructured_fileconverter/pyproject.toml b/components/converters/unstructured_fileconverter/pyproject.toml new file mode 100644 index 000000000..faaba8d71 --- /dev/null +++ b/components/converters/unstructured_fileconverter/pyproject.toml @@ -0,0 +1,178 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "unstructured-fileconverter-haystack" +dynamic = ["version"] +description = 'Haystack 2.x component to convert files into Documents using the Unstructured API' +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" + "haystack-ai", + "unstructured", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/converters/unstructured_fileconverter#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/components/converters/unstructured_fileconverter" + +[tool.hatch.version] +path = "src/unstructured_fileconverter_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-xdist", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive 
{args:src/unstructured_fileconverter_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["unstructured_fileconverter_haystack"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["unstructured_fileconverter_haystack", "tests"] +branch = true +parallel = true +omit = [ + "src/unstructured_fileconverter/__about__.py", +] + +[tool.coverage.paths] +unstructured_fileconverter_haystack = ["src/unstructured_fileconverter_haystack", "*/unstructured-fileconverter-haystack/src/unstructured_fileconverter_haystack"] +tests = ["tests", "*/unstructured-fileconverter-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pytest.ini_options] +minversion = "6.0" +markers = [ + "unit: unit tests", + "integration: integration tests" +] + +[[tool.mypy.overrides]] +module = [ + "haystack.*", + "pytest.*" +] +ignore_missing_imports = true diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py new file mode 100644 index 000000000..7200d918c --- /dev/null +++ b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +__version__ = "0.0.4" diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py new file mode 100644 index 000000000..bcce95bea --- /dev/null +++ b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from unstructured_fileconverter_haystack.fileconverter import UnstructuredFileConverter + +__all__ = ["UnstructuredFileConverter"] diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py new file mode 100644 index 000000000..c8201d8da --- /dev/null +++ 
b/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py
@@ -0,0 +1,173 @@
+import logging
+import os
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from haystack.preview import Document, component, default_to_dict
+from tqdm import tqdm
+from unstructured.documents.elements import Element  # type: ignore[import]
+from unstructured.partition.api import partition_via_api  # type: ignore[import]
+
+logger = logging.getLogger(__name__)
+
+UNSTRUCTURED_HOSTED_API_URL = "https://api.unstructured.io/general/v0/general"
+
+
+@component
+class UnstructuredFileConverter:
+    """
+    Convert files to Haystack Documents using the Unstructured API (hosted or running locally).
+    """
+
+    def __init__(
+        self,
+        api_url: str = UNSTRUCTURED_HOSTED_API_URL,
+        api_key: Optional[str] = None,
+        document_creation_mode: Literal[
+            "one-doc-per-file", "one-doc-per-page", "one-doc-per-element"
+        ] = "one-doc-per-file",
+        separator: str = "\n\n",
+        unstructured_kwargs: Optional[Dict[str, Any]] = None,
+        progress_bar: bool = True,  # noqa: FBT001, FBT002
+    ):
+        """
+        :param api_url: URL of the Unstructured API. Defaults to the hosted version.
+            If you run the API locally, specify the URL of your local API (e.g. http://localhost:8000/general/v0/general).
+            See https://unstructured-io.github.io/unstructured/api.html#using-the-api-locally for more information.
+        :param api_key: API key for the Unstructured API (https://unstructured.io/#get-api-key).
+            If you run the API locally, it is not needed.
+            If you use the hosted version, it defaults to the environment variable UNSTRUCTURED_API_KEY.
+        :param document_creation_mode: How to create Haystack Documents from the elements returned by Unstructured.
+            - "one-doc-per-file": One Haystack Document per file. All elements are concatenated into one text field.
+            - "one-doc-per-page": One Haystack Document per page.
+              All elements on a page are concatenated into one text field.
+            - "one-doc-per-element": One Haystack Document per element.
+              Each element is converted to a Haystack Document.
+        :param separator: Separator between elements when concatenating them into one text field.
+        :param unstructured_kwargs: Additional keyword arguments that are passed to the Unstructured API.
+            See https://unstructured-io.github.io/unstructured/api.html.
+        :param progress_bar: Show a progress bar for the conversion. Defaults to True.
+        """
+
+        self.api_url = api_url
+        self.document_creation_mode = document_creation_mode
+        self.unstructured_kwargs = unstructured_kwargs or {}
+        self.separator = separator
+        self.progress_bar = progress_bar
+
+        is_hosted_api = api_url == UNSTRUCTURED_HOSTED_API_URL
+        if api_key is None and is_hosted_api:
+            try:
+                api_key = os.environ["UNSTRUCTURED_API_KEY"]
+            except KeyError as e:
+                msg = (
+                    "To use the hosted version of Unstructured, you need to set the environment variable "
+                    "UNSTRUCTURED_API_KEY (recommended) or explicitly pass the parameter api_key."
+                )
+                raise ValueError(msg) from e
+        self.api_key = api_key
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Serialize this component to a dictionary.
+ """ + + # do not serialize api_key + return default_to_dict( + self, + api_url=self.api_url, + document_creation_mode=self.document_creation_mode, + separator=self.separator, + unstructured_kwargs=self.unstructured_kwargs, + progress_bar=self.progress_bar, + ) + + @component.output_types(documents=List[Document]) + def run(self, paths: Union[List[str], List[os.PathLike]]): + """ + Convert files to Haystack Documents using the Unstructured API (hosted or running locally). + + :param paths: List of paths to convert. Paths can be files or directories. + If a path is a directory, all files in the directory are converted. Subdirectories are ignored. + """ + + unique_paths = {Path(path) for path in paths} + filepaths = {path for path in unique_paths if path.is_file()} + filepaths_in_directories = { + filepath for path in unique_paths if path.is_dir() for filepath in path.glob("*.*") if filepath.is_file() + } + + all_filepaths = filepaths.union(filepaths_in_directories) + + # currently, the files are converted sequentially to gently handle API failures + documents = [] + + for filepath in tqdm( + all_filepaths, desc="Converting files to Haystack Documents", disable=not self.progress_bar + ): + elements = self._partition_file_into_elements(filepath=filepath) + docs_for_file = self._create_documents( + filepath=filepath, + elements=elements, + document_creation_mode=self.document_creation_mode, + separator=self.separator, + ) + documents.extend(docs_for_file) + + return {"documents": documents} + + def _create_documents( + self, + filepath: Path, + elements: List[Element], + document_creation_mode: Literal["one-doc-per-file", "one-doc-per-page", "one-doc-per-element"], + separator: str, + ) -> List[Document]: + """ + Create Haystack Documents from the elements returned by Unstructured. + """ + docs = [] + + if document_creation_mode == "one-doc-per-file": + text = separator.join([str(el) for el in elements]) + docs = [Document(content=text, meta={"name": str(filepath)})] + + elif document_creation_mode == "one-doc-per-page": + texts_per_page: defaultdict[int, str] = defaultdict(str) + meta_per_page: defaultdict[int, dict] = defaultdict(dict) + for el in elements: + metadata = {"name": str(filepath)} + if hasattr(el, "metadata"): + metadata.update(el.metadata.to_dict()) + page_number = int(metadata.get("page_number", 1)) + + texts_per_page[page_number] += str(el) + separator + meta_per_page[page_number].update(metadata) + + docs = [Document(content=texts_per_page[page], meta=meta_per_page[page]) for page in texts_per_page.keys()] + + elif document_creation_mode == "one-doc-per-element": + for el in elements: + metadata = {"name": str(filepath)} + if hasattr(el, "metadata"): + metadata.update(el.metadata.to_dict()) + if hasattr(el, "category"): + metadata["category"] = el.category + doc = Document(content=str(el), meta=metadata) + docs.append(doc) + + return docs + + def _partition_file_into_elements(self, filepath: Path) -> List[Element]: + """ + Partition a file into elements using the Unstructured API. + """ + elements = [] + try: + elements = partition_via_api( + filename=str(filepath), api_url=self.api_url, api_key=self.api_key, **self.unstructured_kwargs + ) + except Exception as e: + logger.warning(f"Unstructured could not process file {filepath}. 
Error: {e}") + return elements diff --git a/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf b/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6384246e891c59abf174e7225ed7f793e814ed69 GIT binary patch literal 26093 zcmcG#1yo(jvM7oZBoH9DEeP)J?(PH&?oM!bcXti$?h@RC1P$))uE}4Jz0cYEopay4 z_l*C>8f(t(>e*G@)m2?Jt7{U=35ihC(=fsk_ikTo9~NF_PxSV~G6CoS)_P{JTwDNJ zDMKq`dlLXN5TpR06*0B2H?#%bEp+V-g$(tr4GaN1Jg|25wuZWvu+E^GQQu&@>E8LD zdW4=AKsSmnERz86%3$#z0=$jMf9eIOui`i6A;>%Yc6An3ym4}GV$bdq4FqD?5{Y3;Hv8|4kK_n=emd<(FKRtA4e30QjJ#LM(w?hJHv zFO9Fozc5M3#l{doE2nGxdbKsQvIj8!W|)GZowb9lzM&m}njl+9^i@r>Ar%Ml;{C$FJM?&B_##``=3wz zlNLSwUx@v&{`3s5&(X>NCq~QvA_P`I2>{Rv89JKk8!Ctj{P(hvk?5qP_ya9yyq&du z0+|p%e1+u-1NhDk@OwBvU%fe)i+6JU zUGhT0+sonRFsJ3rmn*5y18S>3b{pq@7@vB~wSdSy`9gHttActn<*2?tIqdp`j55-D z3x@Fy6hRfVbKS@YLiA1uZ2GC4J0+z^tiSf=gvQ&Tb&=|;9EsiRFRvIvRzV^t5Ln?E z7A{W`^lh+s#o9hhhF=iFR|(_{f%FV$IWS+!2se8xEp7^TC|ie;lSsC5u++vd+n6%L zTjf?#iPv+(A#y$?6R#QPxyK4DPsL~MLDxs+bCRR+O#bjpwN;CwGTL@D zkQ*ZPG~C*b1Z9vqODMkW^)lI00LTR-NZm^MC0?L~R&4JEJSkk*(UFB6XqGt*qt%k`8pm$MlWw**nxP{3X$7d59?jv+uCg-?#|! z%dx!&?-LA(eizGkKS5H2LE%@DBPhpx^}!yi<&CD_LpJmTfC=gu)WaVs8Ei`T?L0_> z_m3)0oHwG~BWps=sOiw?or~)dO=vs_jos+$*k>SA0ovVhZ}6-m8 zdz_&ujTkQ+cgh^k&}RCz^%Qha)e_>`^k8<~bHIHNhn0%efrW{sjFrUH&y1OBl{%EV zox003P-mhJR}Z1St-f2YMVF9rGOA=mTpPxoQ(kdgNIl7uw_M6yu3uEEYMd)mJBFee zOQXTc-}tFf(Mdp7l2(;gyHfr%hetlEJS*?3=vNJ|;m!%bf=(EJT31m{XwFyNOx@gd z{SPHErIQr7>bac#Z2AqKL(h3OHAt#LGQ?xWBZj0E=ciYcbXvyi#^KF(Oh}kYnOg_G zR4u8gm5Iz|v-fZfy63v2-BZGGhpC2Tpe>_mQyo(&QXf*`tC6cgs#?|^R6X=8^ro6? z>U#_Xnr-UWjukBL)Rz}iuR4A`_TWs_tXf8Em1!NnLAW7&BzdHV<_>ZWYDc)8w|69K z_4k-TI^0`t8aof)geG+(t)3DuT+L7$BOiM^mJ0`ueuh3xjbS_adA$k8HipK4CS2Ru zCTCiCF=12v@L^PXE@IcQ*Rpqcp8Lmh(<3FKBw~zc*g$+?+%DEQoK|pBrAmxSWn=8E z-l^GXArA*n2u}nLN{3@dsR!+y?lai4DA<)0q=xlrMafF;Zl9YDTTekw2xb#lnfw zi2c;yYRzg^j71HzhUG_*cCNNkx2#5Rhc1U6J(-P|og9oi9DfErbupiqBa{csR(x7) zmT}VTYFWSTIE=CcSc?5P8xa|aFvMQ93SF6&&z7(1&@rh!-RT!-7LZ?WrykQlu;{Zu zwK!fjsx3ELOd`~pyw)4p2;Gn+UWz?jRr64}Umm^jKW@*z$$xZObA02pw7lPPshXmv zruAc@VokGrd-USVMPsYVQO8GJjn6uWrcnk_bqqx{?Glw{qBDgDc?UjNQK>ua2X2kW z84Cl8b6;02y{14T!26(i@ZRIJc@5smnV0ih_USIIm-UPvemQ9QYV4)$C3Gxy9y2Z) zAeukW9px~?IX%Z(#2U}i9TD4Kz4>{-`g!eI&<&+@Uyd=i`3{1Ofc<=9;12&Z6Q1jb zOQ=W5KIwDTO~ZtGsCv%q(ridW9*e1)l-ugQ_f!V%7%nw$+AsI3@SBv>+F@hK@t85x z^fdM?C(jFk{($qa5cD7p6?>kRx~=aS6;%~0BR)x$TIbES#f$BaPs&N9#uemO}?A%&Ht$SmP4f1bIUGb(PsI?A5Y8LtgCjtXoN zr2Um}Un$!IIbJ+YDN-kLAX+Xu9)2Ib>{@-*cQz89G_@JK; z(}gpMrP0pnt^eiYzxv^?Zs@D`%|g%cr*Hh?#j{~x~8B&aT0ZHgd!%&APfvU)?z*Ey3q`GKGqqj6$Gh*v@FbZrKa{Al3V5#H$w zy{NvVb}LECBhz5vK7+SdkyXyHD`~;scXG{R_I#ReOb;d;Z}^gICZ1KCH&WVPP)DHg zi;4UsjB5b)1H~C19nO|}RgDR&$?Fyb}A%y2M@aMa! 
zgr9Av>W(=5cmeVTeLjNK#*n(fx18pbk$P>9n0QcaBx^1ZbU&v^c&mKpLvUe{f)5D- z>$34Hha5-W?#ww zcI$_$z7Om^`h7W^got`04$CkXyD3~gOE6$UxDlP`g+5cDcNeUmX-$RayfA`;s75&=P=vW?&D z_uwbQ;<6?O5v#Drk%<1dDuhd0H7gH4JT7m1@8fvDKbf?h4;8w=7N|5*yDl_dvu(vj zzjS^(1D22$f61@8A&nnfVqR=0!lJBV)NH1%=BYNhE6b}b7QzcJf2*O^crl*Xpcz}L zR98YlUVib3NI4>xH04y>OW^XBFVasaI+b-JozG9mpg=t(hK!Ly2xh}D{z$z3t#WIu zP!hRXK`euEC7PrE;}qKJDVFkz;786BIWV>J1hctt*j&WvlSaO*G$9#OC)jFS)80!6 zh>bc5jhx}YP#WtQmZUn!qrj6Ato?*(`yqV35D+tGvHAolua2r?!J<1`{PRp?afNkT z>85&r8H5!Z*!=};Th5-(087eCO--@DxKaUQZ1ffpLgsAn*iRb^W}t3bD`5)Hh7x1z z?}Ov?w+-yXZFmZ*6 za)ag^epd5acgK*yGKbHFyv3jDnxF#FTyExtZDSEtm5x zS<~4`V|HgEOIft&bCBZ+;5?N|`D4WtO$6~h9+a&qs>Ni-Yx`u$OP{-uwM7`kR`t4?QTQ!u2$gm)B*D%5~jpJ9TWr zR-HUv30h}&vlGklkcD!Ewoyn6G=8zdtfUXvmB2Mz@+qws!%wcWd$(LXL!B8_hf)eI zw5eY;^(zNQQFKOY*;v_$K?<|orSOfRlNj$O5x*=RpOvDl@`n1@@}gTxSnlxQ9YzxG zK6CJ(Q+K~f-8O!d>GYFzWp-uRvto9rt4Kz}PEC~pE&ww|m2JqRm?BI8LE6s9fwn72 zv={6f2yz_>TXr*tul$VE5aaUJECgQ9E$&gq7D`5MEWq#A8R?qCso^rJ3v$BHW>7wm z>CD$7Hw%BYHK$j7*nN99`OUa#Q`+4?%s)TF5#{y8GP0w6Lc0Mv9mWL(F`~ab^&|TPESU4 z5th`qnT8SJ@`TAobEXq0eRNnb-rn}It(7z5gHzq(n+}AIY5djV@WM+{Vy8hguXtmY zw^k}ME{TULKG@tp)cgqx5@lyiwBNJ+vQoQ4`n@aZa0yFhs>6~O8JKKT<{p`DV0DE+s& z40QhIHMsAf`yUGq%sFmY!gzFh{X3aNmKa2w#`f_hWH=&Vx-s%#^bjg9wM?m9&dfS# zj(G~`_uxNUB1ZI5?J>cv*T|lMWFCFTF{hI2=Ay$$O(c~~wpL!n3t(DpN0fxD9i0+9 zk0x2Bus735--WQvhTfZ<`KTPpXsY~Pvsiufr-&3=;S9v?yL^kuD5E@o%{V6s7?D1s z3#4hKZ##PFM*XmYLLz4qE?+HT<#;M%5YvT|K9W-%WpOF__54Z{U(TuIR~vJ6gmC=u zq0B`?^Ggyv-f|Zhg;Hj%S>X5w+P0`3#t6)mq}6>C(%Vd5Rub{KlpSj$vAZ@x*1T;x zBSZU{gtp+NH)Y=Q6pSYcHp)~8qg#*lB2X#5w6l2Wj?Z4gJS*fVRwl5xGKfWtShmMb zyYz9=>ZK5YMY@1|<8UR}#!o99%?4jk+h7GTRNv(*CmEkT70APcqmw-9WGp)~HthH2 zqI`qf4fo^iai_P!kDu=l_qi-U^kk^wS;YlQVozPfPJI8Pv4>{dt5=3BAb|H>ng^=X zDRP(>M%NGhLIs4LukA2a*4>LqS@(;jIO_i3LC(8QOFVwpQ1brIjM&(n75EVvm6g^y z9ZN!I?&i1;KEGyLs#b|PE!tL;gWC#fD3PCP5Cd=AF;S&`+^Sf-j(^yLoGhSyR8(9G z+sC^J$xs=;DP!yRTy^ulJ;3qC?{SR0e17}|>Zs4+_ir4Xe?%PVnb`i}#JmI*|BM3u zPhyVr|7Xzg)eZVf)DiaIL>*rOx_?9+U;F<_@c(((@g*wxC$T@nj=#f`|83kkLv7J& zQS8M^D2}JD6lO3QHcE1QN9*JX==?d9c|bxw zR3qdP12P+6mdWFAprm{m$uof~eKroqHF7iLRG5e_N4}4;ZSPQC@SY;|z>A{Sp~GCG zmfi5KtIQ+rBCTt?*Y*f-W~3R1 zn&z9wDy$I;B#jmQz{5>1>=TGNZ&!~uDFl}jNIz}i*0(Y>9cY$s)kp(K4Z2F?a|eBWSwFa&k~y7cLkKT8Jd2)tk{ z^T$UzCyu?^PFoyf=m$lLEFY_Jqxpta0(`Eh^oGHpQ(+=gp)hNx&BVZ!`t;`QZ+t_9 zi<(iP*Ow!Zb|VAV)WybpcA0+L^NQwyAlCYf&Lk|Y!Ar*s&`8NVrFerTJk?0n=nNM* zim99xs1USB<5bD2D7@Z#kG+g-k;3HN5{V%t$XmgQv9Y=d@7cE%DzZyvDApSZ^XZBq zzADwxNVE)8dipS5L_U4W68<@YwW>`&oD@{WlDZvqBdu=^{x9gFq#KWv`I3|XOs@GsNlGi3| zqH$hqVO0BZQ-ir-gfS(!_%8;#&w!3p#R0oWp8ddUnX^A9ZeoN24IWF=JwbRd$FE*_y=$^0+wA_w}c=b|CAQb0>?{d7YXiuY@B6c-rPU0EH zQ0xxs&Li*bBiDSNF==@hTx8)KXq4^;r6CEy?!8vBH9tv=gN1qQpJJdF7&Zi`(P`a0 zV#21=+}MPWw>vwx z+25Cl7`DoKSCi!wYTGI2@cK6^=j918Pr|s=1UuirDIhn<<+Z?OBXbF4EG}wSOIMW= z!V2S|s$;#igp^=R*5*}y14p=Y*-L?@Y)vC0MUzQjn~cNB*LNNcNx_V>9J&_TVo4TTu-J0pO@?o(wrB6zfY$0TjdKJ z28>7ouIGNHcVcWvw!H*u1*tbr6X7%>-+U7Vx;D)6EQ-rhmBBl7mrUTR8>QkKnQzlb znUC}s*wY*!xJ#O~JTN6FYx?c|_Fy$N(^~6Q+%c8Mq%&?tjrKXnrL_5%QJL~su=8@_{ za)r8CqfrnUq-h^@I9d~|y=1t(hi(>aK<>9ez)F4l`l9wQn?yk1F6T0a%(p^m81fp1 zDuEy4KPCf~WHd^15(F(g5IP#vzD}h+UH8^mHcs6z>St`GeLAGY?0kRj>{u1R))t&wY{w^Wec#XYba`>-393T^DjwY-4PY>vlvcMX-&eYvx0 zNFyneiS#)GgRv^nu|LIHboUWrnJH>69@R!GXP`KikjkEtSOK*PDQuR;6{q`L#whJ1 zf^)D**DbFz&U1?Hz_(hrc8yy3bNQRXw+q6HEz44p5vS_*J$BGRg0xc@8u$vg&PKS= z8Wy8>lnTzN`zHLDqMQs;Qw3TDMOrvcR%eUfY;cXP>z;)7I2U8A%L~a@tmJchHQL3E z(ug=En^KWwZ0it2Trm;gnn}%bLau5g7>d=<(%xFYBE+C3AH-J3cW*&}W1~DEAE8}U zOkQt(N1x54-&6ny5*(Y~s9qcue zPb=Hs8B>lBgyLV`n`Ak}cR7;U&mg0(CWd`%t}4^&Td<4MHzs!^7pz8c6iK=)>U>Jn 
zivzf)+a|-b%oa|u41Cm^UAM&iI%8?TczxZV%`*1%;MzY{A1(c9z6K8zDoJ6KUaPuf(UFzb_lZr;6I0;1zSF-#2CRZpB_cIfZHydt0d)?u8h2^3C8=v~cnS60D8JOAV|KZ~P z89jVCGXJ0W)C@1@^gtK-9|!sLuczt%JjwrWeCpT!f4aH~s#amh{xrfazj!MU>XkgH+ zFurk@*hnzB+e7ui*@33NB3WS*FI}Nbkh!{n%vC zs#AWG&_$il1_?2CLnI6NMU#9M^zRSodJm1IAQ!ae)(pq1YHpl&})Kl|eZ{LLaFJ;gpOl;6yHN_vZQBYx1E=d)p zrpNs7PW&}MRce)otY;or6cy}>54GHSSWtaW-PUUb49XxBzOkCob~GY@r3?C0;`vl3 zf%&8Ox-{t4XtUV**>$~@=Qt(MhW(1v!IUKwtQIn%q|971^vY&>25g@*betRjA?AvWy z(RK9=p|JKQ;7eC1~B8uzY5DgC87hM9sET0G$@w z_m3su)d^-y!$oN`teK{#gulSOGifDiS_(4Kiz;RJOjlF3R5h9x0@3?+Y0%H@g zY+bfYK*GmOV~R~Wd@^Obh7+8gFNNZgT(8I3qzVRZ6u|_wHpqAY-f>Zmm?fYfQ5+ur z7|&5^xdYf zPl$Ofh5r9)PhL0Dm(7))ft~eFd&2P7>u{zb2Cdf8wG4!)^l^{5M2@dNu!aV16&jKbZSpBVzn3qF1*5uV(ho z5i$J@(Z8y||9C{qe?#KFUp5dBvg`Cl*pKM>K={SDI#{68EL@VNaiI`XP$|0$-w3HPh6{->D!Cf%><{-0v{ zn|QzO82>9wuQma=qXV~qzsdLOrt_a-`kR2i?pyyUroTz}>(=+5V)~nizwVy@DW<>6 z_`e9|ORBZqKVP~2UGntH8=3>qqA&sZZfaonVoZQ&6ac?BEm->BTJXoxX#Pcdnf_Iy z1a<9oEv$`yr+C}_ncFR`YillUWn}%DR{c@|=A8hI;Y&t5(5U^*EW-X_)c$D`fx7ow z+x{FEt*V+j5cE6s8(8>9>dLMK}V>RSC1B4TQ5 zXD?`?Yx`noX{B`ks4xI+I<1PSfxXEqXIa@`=>SZxlYE&M9nf&I0sR2>*Z-e&MmAs_ z2xkP^YG!7Z|LPC>*S1$Yz&^jRzFb+D*Z?eS%m7xPpTtVX1YiN~J&tjBT)Z=ePEe^WmaY&ePHb1 zrOd?k&-wkYNftbesIFVgx?iTo|0GP;&P zS^Y00*h1GBsDMCGeCa6w{BQzRgPNV09ze~+Mh~E8XJZ8pjg=MnHH0`&Wli<@t&A-U zfkUC?x6^;|FWH%aU0(>kRHzw%n*pt$u8o+Xsj-PYEGzJM6sWj{mLGx7y?pwj`>GW{ z?~_*b_and+WBvVz7y!6twET|7uSx)50_qY_dz^pQfO5bL`#M5kkj0PzxW2$4DExJg z`Kwa@5tm{5?U(-X#SL&WEcC1_|NUnb+io!S=`#sCZeGXAw#5?lv?a%t!X*?lUlf#! zB@|4ni6j%LB^yQ(4Gdgjs-^WuL4!cVrx5l*9E}OsVN4SXQpWfMzZ<06rw$TIvC+K) zh)2ni#KG6evb4Cle#hySQssb1*lqo#o!>W+p3Y=9Hi|hq^Xs7Um)B{B+iB}<>uo!S z!*Wn$6^Uai`F!U(3UsC6cP}hb{ilu|)^ymxtoxoOj>@*1y^x1pQcT4L-1dh8%I)zM z^8S_;cE_!?FHb4$e&*{KR)bkC=oRbwq%zB7ULT)P9>za;T-ZFj9kbf}%0z*-E{htc z*|1wn^&^cO9KSXHY}tCVbzQhMo?zMJdAM*GIi0o3c-N-+(`s2e@`l9`eX3etY6xN< z8(&cHN8r8f6F7D_s@Fw#7A@0lN7pylv%Y!#G{lL3=f+FKpGO(zsoIxy3{g=nu87|` zL-QqJc4j=^u4c`0QC}0aEB>mr3{gXI$FzPknz^>xZ-e^L0A|M+jcY@?%6c=c9NU`O zPn`IYC=mlO{AvC3?iupbCisfpsKhVXDt4xCNyDRnBGtGvFviWptzP6Q>2K~4m$KtD z=aN~fSVrZ$UFCYZUeGNPP+%Vhg2vCm?*K%fKi~;T#=wVXDhdI!M1pI5fWOT!WAI!b zeQ5M?@(~553lSTp{a@NXTzDlgn+^4vjJ9B)E%s?0}U{(NzQqQ*8rWk@0HA8{=x? 
zEj<}j*T50o!z?kWyBm}{y7F+B7KmUtc>pk+ zUg$8bo}@6XD(uSuT|{TcIRExiWW2L<{ZbJ)n5>RbgCAebdcIZ;YRrYCe;u=3LYTWY zbOf8*{M7aR)8~EaI;V5lDw+M3=kJyNL}7k6zUkQ?qtS_czhukEVUrPYM>z#>(zH`^ z4U+`mSNWe2RRr;)PH5acfwP2sc7(+;LOsW98#LrF)Pqjzx%|^xOOi+I+&UglA;E;)SmDo&v=X1aYsjHN6%_US8I0I z{S6E#ngIbx4P96jSy(m8?AV5LIu7r15`=>ob-x1DHwLN^4z);!YA#AZpnWYeWi8(9 z$iq9M2Z98CmR^=x-9dwR%i^#1zq$>>BYu@ySS552e4`sRi`N_(tHHkj$=ZUx`}QQ0 zqryh?r0|*FRK-)-A>shF9qMGq@{ zC96lreungJICwuk)~eMQllF|e%B5m~`KZ}t1ocg#hPqaj`}9R)qE|Iq*+5z2LR*7y z>vzwr+^Nr<#r7wkdA8hV`#y=!k-hnk5uMt*BRdi>^xSrA3q$@lc~UP$rm)Y zoNjVnM@HFLA!W>fdM8~C;@Rk~yXs7!ydRRV%?H&cop%5);laL5l(y{W&H01d1?Rpb zw|m@~lPc<^zb*XtL5ZnsXiCwcqinJw5G=@JP)(Q?PnhC+|0lslxmT!n9^IpQ6 z_-)uV9vyERSN!Uu#=rmJAgbM}yVrjx z!{E7JF3Eiqge(gkQYig4Q|XA<{f<~H8QPD7=~K3&91IgeDkLuSG6V;B3-;BUJ+3{M ziGq7#qmDNn_6{9P&9HY8F!Ynq{nIU=A^B&WCfcWrqF#`Z6}&$@o<|y&p6mvl=JL(g zOl7*G?5Q@R%JNB65x`A#xKpnf(%<65HRt=&8*C06gf*lulbKkct_(O`+ayft(l}*s z{n)x@o6gwD$p5g{8rMGTbPgMDtuqqKf(m1@h&JkU0jQLcp1f<4-K0!5_VdIZNBQvG zMoxZg5Md+lWw4+ zwYLTEg{ugz)=tk+2Kk5Z5CaC4Ul&8Pq_+sa8(+NL5=hHid-Oem3H@wieuojADtLHE z>a9uhs|{pEE&ri5h_6qQa+;MQdoyp^v2s*CT$%Z6MNE_;xiSm<6)1EYoeGJUh877K zRs=2?x_}!UoL~B7)ZrnXrBwc%;6(jQ&^RjbRmFQjk!^gt4EA|U1ShC(92FN=Xt3>g zGwI*A4j1H)GLJ4&;%!@-{1)6-mRDC7*RLI~Zf;u_5l)Yfm)+RV7P;&5*KW}pq>c-2 zR0-aZg-ajz8?Y~i@rFg0W`>O11w(*@pZ6aa(P4$k+be=w`58?Xy|ZLz)W(?>3hP-X zUQcPD$_=6flEq+kIQ z!41s3rQZGxm{*T15aqbHg0O5?XUOSYmJyr*%T|H$1o0OC1N-h5R8U_uu?ehP0hCc_ z%1}LFemHnvv{>a8oMF+n45Zdv{iY8LpWZ1y@y_v7MxqA!c1aW{A7#imj9)q5!}~?qK>;{569PDDgPefB1F3qNMo0Y;+Tq2r6V#(wn4)_< zP)=y8iSnjF@8hN~Y7j0kSXVJvqds5$UoN!!oqj7&Gy*$mRR<=nX>{!vT!B=#h5-10 zX_`%b$YAa4CVqHIRW-fvj{R1?pLr7#Q5d29LiLCbC2#I4OsHrd`4!?w=7e4A;;>M- zhyt~2UBUWT3Mp5P@mEQdT&frp_tJ4guMzuO{kCz7pX_s#i&A}?<#b2!hFsN^D2RV` zCwEir`1+)q{NPkrn%ca4UvLTj7~yMZPQsaJ>+kBc`?UO7rFcuafhF$O`9>62i;@n% zY}{@VwFb=2P|pD!wHfw-3d;TG&M6rSEMC&hY|nC_8va)a+quTMgON zN#oo16)M_D1%c+WXB*Q#zlLsS@V0o%wjFt{qQmKg=VwA~@D7jUT{4ikhto;8)5u~`Po=pT!4|bvj(>BNFTk?+uJ?+5T z1ny7oZ#PXu&)MWed5Ub5?ot!;CkmqLrAp<>Ls0Ld%)IECPO!p*+%^!5-6qio2XH3L zn;0@mMpd5HhfnOCmk5Lc%t_3gC_Ipizlet44p^em9}nrA_XU>VdOSY3(=?uOo4~Y$ z5jvV=b0QVpZW~rnxv|qG*XrCbfrf=sv{mWRHr*0BIwNtm1>r9>Q0=kpkxXDtEN6@I?edTNy@B4#uE(h} z>Z%tyq=an!Afl+F4znd|NxUGl=4Hlto^240PGG!-sZI%D#PX&QOl^_bfmCdgB&x6X zw9+ic(R^Fw-W6)0-L_g*XGBbnK!vSc(>oT8ZWnnfk+MV+e!6d?h@Tvc z+8+D+d?I6(tB+1XlSg5X9}6$H3ztJOv%kjMH*aj~RFSR|^3I3w8dQqxXp3aWNmdH z-$1O_B3T?)Q-)z)zIk^G*aAS)3#6kpT*kzQs@hilpcej-ob{*>cm&LqM z41A8)<^6k6ttP8exWAwqhm>3FKsCC0m)m8>G_=^m?>C2iV61Gbg?DhqHjaa z&zhy!s?a0X83u)FPJ_-Rn<;ZGtRg7c;V!hXCoOZj?Slobc)LP@*=7OJQ$7P5lBDG+ z{9<_ft&HZ3?VAu=lZi>KtH%pV1Pv;Dj1t&1E=w(aB=3PeX$XcX>W2HD-S%C_)G`il zk89PP3VLkmD|E9?L z+NSTEQpmMmgj$-OYysH=e2Xz-&B5)$n2s#dI2GQ4ZXOc=9~vC#kn?v0xM@Kr)=i5O zH!9b&j(Q#mR_COQEoDj+)z`I7FmJw(kBe(AUowe6^Zog$cNS+q!*fN+P?pSQU^4gV?$S#>jDR>ykb@Ptg#^W!k6oPI%DwHPR(9P3 z_g3T&*8ilg$mDtnd6!B;j{Ma;&-DhSC;0k`bo}r`rKGQkXVOZ47H2-rEC{!^Qxj1&00B-6bmw{!fYHB1PF& z0lNIw6~Z3xiYs`-nX`gMT;#ZB^&Vv7BivIY?x)7GVIGKxIzDLfnuB-m1yNx6C7`6D z7Pc%gKE1COcXg~wA>x+Qx*;%&`2ejB4VhaVFQ8QB`MBqGg}ymPJyoA{Z{gXphl@Lm zTKlCqm-6)PmiD0q<%IhADU8-UeaNJ^*Nzghx{jH}K0vcNsGH2RJH;EqHDw!$iZTi*rbH_uL!d^xXLfW_0`|y^YT?|A0 zCq5DKSdT%HqaUb4jY4qM$^CbaJ8`jo(Edy-hp!D#P;prNH0Q!-q`@@z!Cyfso4i=X z{mLh&Id%kP9G#&cfz`-*dHLoVy3pU*U?j#T=U!s|3$m3KxjoI((2_Q{)aCkjcZ%dF zpY22+s;zD5J#lh3%o_R)ekfc6?S+=zNnhyc&Wi}IxTpFBaZit9#iyxmILd(H#saN= zC_{aCB>lZDdsRHYFF|~s0W~StrTFZHOxoYclzX>+lA_KcpR zoEqTB)-y1`Ul_zI%gsccxk1dm!HE#BuzB~6z6(maiXX*d0s@=q-xQ}`+lcxwvAX=U zxJ^w4z1OBxLqN%$wDds5%)W_!5d5q|qEK$~jmQIx!uXK-qpJP&Y{%uTZ>6!U(`ko9 z=OQETd_z8GiP{8uVN_zNmy&dzkyXmUue~Fxub(M2ju8u;aE@s*iX7Fxa34R{8p{4s 
z&Pv5;cc>;^jDEzIY*`kN8m=L5%Bd)oi%`Shi@z+gnTh2oGLT5%Zsrj8R-6s$$U!WY zz+@8qz)C|keG<*Az!?=vY?i}xA_ITT9A2t(s#3bt%xAqp|1^+-TTf*Vy5F@_lq?ID z@7<0DdD!r!LJ+AY>dis^M;Phr&yz;=Qw(dn=rw+NL6To>uJumsXKx(3nlFsTi>Md2 zZ;30p@TekL4D7xVVNXBK8q}J_nJ1`GA1NZT9PrGfkD$BA46G}XqVtRfV4JV5N|H}8R^~(IuzDV!OzykgIxFmQ>FdO{4qLGdyF5kk ztW*?Fj%P*PlUePjPv5_dB~-!+(Ux&hjSjkYdwvDM>+0?m$($YjWUF2gB)va%*c5wg z#x07SXEye^#52tOF};)5-a;mtT5?ar1#6R+z@p%s9Nk0u4s^B+QaSdNrkVxAjI-$N zta)GX$2#674kJ$2>#npzL+c7?qX||zKRi%AMEGoA8Juy?2lJ{E-c<*R8wgJ3M;!eF zye_mxnRrKa%I`UZ zMBe}^`k*>96o=I+cq&R28sDu`CWN6y=2)a)sZFx}S!7XmC-RFz4krWK!*BW2o*qVC zITLhfEqHj3TFt91D>{Kzp}do=^=J1P4q*f%`(c#}Z*^}$7*=@Wa;bF+dh>$s6(MrO z7Gc?2KYvw@wUbJefL91%&O`Q7(u;m6@>9@@5knRC0unI(22`VX9}yls>Y9gLB1$6A zV^t@JQFk&GUBvDV3tQ>kV*AWUK_SGaReZtTd?u^6w4A2|<16B&Y*+j7-Fqm}QQmpG z=Y$Yr?H@~r`+LNQgS9O5$ByXVy+VYOd9OO()#!xT)R;k=vfZ-!ltO^lS(#4`Wf*L@U!}id7nUW zgK;3b;t5ffq8!G%9_NgLa9(|bLC zSrT*iDt()r@!dh`q>EhgsF@=x#86Y~sbDHnJ|>?V5oW3e9#65)zTP3}c}R2w-bGDv zbEF6AmDFu57w~I`mfhdx=|7}c2aJO9dPD1A*uU}KDhnE=?SZd3mG8Oz4j!v?&_s}0 z2&E&)-=wdryXr%Ue!}hcxZBj3RqBAx=6(mF$$NRuxc5%OH*{RL%xKyHYd)A^?m7mI zYvzMxt|1W*aVfmIj^_n$Xd~MCS9f5-T=%mm{yQF*v$P=m_Z<*+ael`#-d|D@g6MZJ zZmdzO$lvPP5tSl8B3lKTu+`D(UNN^xp(G^{1y!!O91;69ZJZDM;=9{8#1=iB+x+|e-|}The{s!&pMwO0a488Yf{6Nw%V(2qwCH1@&bvSv6 zK$DZ30?s~yX<6V-Z9#V_ibRB4(**6A-+w1$BT z8dpD0A{WwlDi1iNp(4hxVCK~|>JX~iAz?cqSOqI^tGG!SaF0~;bdK}H7>p=1;!YEW z=b9A61N5?xF7w?Yp1D6P(EPw|U32;LcHFP_{hcWD4YCOi6C4LZ3yeLKJ?kFQgv-Pw z;DT$q))`c&C8uv!kc`N2a9zzNHgExL9uuCX*Tu4Zf_15IXSQ= ztC)HLKX(~ROvY~vR#828yjD9hd*-lj%^I&76MBpk^I|FB&jO}*@~G)Q=Y8T~!{T<# zb|Q<$M;PdD6@>gjVp5K&i-I=ayJ^eS06P3m!+)I&O>vt{2;l&HajwvV9&>KQ%^)HU z+@3dba)T}aB2k=jokvFN$Xmu+aTDn8+v1oVdP6` zs@CvNGDAiRv)3$&t)pmK0wNCuJ=T*XL}5Qt!%3na{=e$ZGb)N@+v5U9lH?>AWF#Zb z3^T(FFl3ROlLW~~&LBAns0c_91<3-E(|{sL5G9BQL^6^^K$7Gn2(RJjc^Bs1d*0XA zt3U0vYghHIuI}o!{=eP*KOf<4XAyiKsgAPYyKc{~pDklS%3$zDqOCYTzB|mGdge#x z7E)=$MP`4qoXIi9DPKhprhGM))bwsD^Zo471M_nBmBbMn1)5!xfk1(w>^=77?&Wbu zdfs>Zj#pBHjLK_=-_4TD|`)mz{ zno~QsF^Ch4RER(C*dqTDNKKMZs@sF#Z)o%V-IZn0(hMj%=M{TOEU_xp;uCX1vYjRy zmOUB={jo^peHTwx$XDT4q$-EcKdm+R*C0BcaJNdUhPxGMqJ|n-m`Z5Kdxjc?-?D9D zxNR<|;vZCn-P2YG;H-DMA;qtcnL&M6C6Xc#MVtGKjizB>TNWJbvpHuODr+uSNiiY!>fy4~X zonUrC9WCPt&8s}!^NbgLjY~X}a%XI--;gcLiNT)vWZK<-Y(D(G5G9`KZYkRAD~0>P ztb%)fsjKfWbS37Yhws+Q%!HII_4N*Iq1*sc60e%?M6R)jKFc(|c^n_cF!S<^Pl^gt z=>9^NZ$lM(i4*43cz#5U_z`jYRoyr5W7044ej=9%@}SMM+EnOPAFq$knu>Vsh=d0d zQ>jYJS|FN3==1EhUh=^Cin6PcBaD(S%IT%sZ@7z>F10BmI|=ruww})K_r(kh8X@{b z)~_k4YS6%E;}-e`OJDE)T+Dov*c|lPVBf_5z_4*Li1(pJ#>9Sqw_M>Shko2HV^g1l zzO(CgXLO)xZ;RA&+qxTukk<1T=%YnLWQWF^U1=}}e z>$x*xva)#{WBr)=2Fa{xdJMAdyyI2f+J~9g318nF(SLm58eP2{kx%A#XGWY5WiqF` zF8X7;eT)X7j1UR+mP3OaxcQ(?+mD}+x|a(J-yY5sI`vx4%HE*R=5L&l>11)u7-qZO z&pIR1U~%Q}6?MtOs>J45ZLwYisDFcLv5jN`g}G_{p`4pRxcf)e@Ng5?d8l82*VYY3 zBQ&P1-l0>Tq|6X9IA=w6)lhI>1Ca=+ViGwxDtYbsfPeT79yxMXy|L>&{`QW}ECFsv z$e!ZFVaU0QJ=1AUx97Y?(CMCamDBS*S!>(dYcjb#Q)}M-2Vfc*{Tc%OVG_{KMoNCI zCF8eki>~hIftwMD8-)|({6*@u-Ohh}$bRw`bWaIW%{nQMslM!&69TewSq&n)zEF|Q ztgl8E#%A|nNW+QSRYJihoEBps6S+wFwN3-{q z*RG7U<G#<1`T98*Lp>nGI^cAwbO(y*zCvWXa%_K}=0h8dL`!FqW~#_(ajex3`R zwm7K5$^9wtqLDQkv!kOw7b%Zr7P>bCY!m?n)Vw>6hYT3k+l zS~RNh(eUVa4|7QlC{2oA(^{!$Y{5=PB?Bfaib0w=Job+;`n+yN9a7F0WQDfoU*2x|!~UXG zee2HPA+dL7Z(LTi=<3}<+N0~M*V{P04yoia`g|ldc|!<^G5UyE-R8EI;Pm&JQrHyk zJbK&Vy!M(QW`=F`kU*E8$=hZ2k;~QrWJ+eAt@cZj3$sNhhac{x?g_(eq3xU<+9+Ge zblw${g~eT6A_?h6(f(j(AA(SW?ko;H4|%WvSKv-T_U&<>FEwQ*dle7UC!c?|m-s<> zyZ(o8(4Fe4Cz$xsC5Ll}cy=H4D;>&-TpP6pkD{H12FCJ_!eot;b1O48vngp=t~Z$( z&6jIP=*crVk9Lppan*NvNO&!A}OFAuyE4~qd)hdzcT-%Hzl 
zZUG9U!TI>DI!fXmG=u0&ektK#C%m!qzJ#0f-)^X*^>jdbI&aftit=+6j1?(m3srjq z_2to&?sd;Jszo~bgk(Zz_U+M|)P~Aiyf-;LFTf&LIs^}*y23^|uKGEWM z05`(T&&1>1^!xdvB+8_LwD|CIbfC;j+W6rJlQ1<5%TQfqA+IAO{5z8=|fd~$c$UU8JpZ8g=dPOjD%qky+@T5aTPsbZlf9hFCuMP0senb2r6mG>MI)jgF+B~EaL){tJ=;wPPQl2egUb~iRE8( zfq;hUZ(SfDmiu2>ARrS%LD->4KpZD5Ong=ib`t5gKJ=Fc3Q%YLivACEhGSKf-hnA1 zR`zJ!jmVXHzeP$Dws%OA2byjXYC$qRpj*oLsr2+z%AaD^_Sd(CM|Pxlr|u@s<}kv z@>jBsU|Wex390-(BD@jfhe|A65oYz;k-G2s1X*blq$%QGfyvK16Db&tE{5I?%W=)b z2_ulEnoGsN7OU+U49DHkZNj@%vW$i}CndZU==Sk1z8d z3S5`yswgvErwOdAyE!s+fHa*p=cz;L2alP%1aoN@C^+z&YS`F5GUk9+G~w{ct1CX! zw&OJMttcGVgzMy{^KDVtx)+e3gCEI9Khu>)Z&B!=w~gYMl7M)_Nzx$gmc?P=m6T-R z34ft|uR*7AzW5Bq%bLDUx#fzpYO_0~H6!+s^Hlpd2!BUUvzG1ZdpYUhGis$bUi(K< zm{#Z~Rr`MqXSw-@=^n|lhnuU_a?DKKVJPXdx@9H6JDM)zwDHpApVv~#{bPUdsjq?sqivQ`rn_8gG!W_k6H+3Y|`HXQ$ zLPq?}wZJK&^WCZKdhMbmXwxO7hURUFGRu4m7Kavow$~x*MAzO6T9J6s#>AfM&U?ni z^(gg{*WCEvvsz8vj_D06&O8CSS<-@$DjS~1TY)W^430!Q8bh=t4Ix%PYZ=b?`OZK8XdAx9&!)!3*+H0pTjNqsWm)V8Z7Vf(r?Td zrjAVt>|~8uTuYWB3G4n~#Y&w{p%q=?hSF?RRK8rOOFw*0w`2d_*f6n@nteSliw=nS zUZDo9*d~d&AN3x?l?z;aw={3dYj*U>FSy@(R>rpJ|2$`_Oui>#TK==QfIN1C;3S?k-CW4^vT%F#} z#9Yn(^HV7eb~&ab9ZR}mK-|+N8`)Z_RG`1rOQ7_{%{BJ48b6tjvdS*R;S~kmmIa}kB!iU+{Z2~39 zC14ur^1azZDid^4J)()-PqxJBgR;_}1D;vV)Tbxq+6FuzpZzcT2AtCpb%>}vYoQ+Ksxe_c!XS+BP zM#;Dc)KYdQ_o@X%#jGRDhO7^Dni~yQ1!iwSKfiN0kapeka&@#VU3H=~vEL*|>PmM0 zo0rLN%n1S&`ulBiit4iU0A$rX?q-fI4pt^EmewY>Z9QyEYyicym76;+07|$yTY6dm zzY5@SG_z3CV%G)Cw$4s~I9m_`7lg4J@ko2xIskP#QP-eIVIy944No&S4_|hmp#2F5 z>FvQwd{qYcs{p_$u=AYY9zfI+0}w{`xb{4ctOyhg6y!&M;b0^b0uwd>gSmkd@KbTN z{O>ORQd!^I&B_`Ozr*3gfTaD`!H$5#0ff++{Vy97hB&77j}LaIUp6rKSj>Og27#cC zk)AVt;A6nxjO`d8IBSC)W0I%+grEpuZ+FTDfkLp_LZL9>v*QE&;J_@NY6}L#k1@&9 zHX-1}ch&}nA8#Gb_z8ir#s`NVBblfD;0Od(A4nK>JYY+Rl@}xoJ022_6%T+95Lo#` zBCz5Ce#a2fsqsO;5aF|9hJb~z<_!XdBG2Xp0)`*ML1*G2j%kOpHela-)+UTNJ0B1* z@)%7$(-w7%2A#Db&*l^Y3aF zy%0Ed?jZ>5TtS4f))54l4(vXV2&_J!;Nvas>9IinO|FEFsfp8m!1_4;ZaUQ#Dg@j_ zXKhdfRv%C(3W?<>1O=d?Gx1<>)W7uM;bsPqA8seVP|>pWwE~_m>{m|!=@YyR_@p`^ z$9{cGDaZnNniv8Mg;^sImMC+$wGa$$jf9&+ktiszc&x3UR%TFf;(vDei??z203N0% S4>n+N3&DvwIj^b75&sVaB5i5_ literal 0 HcmV?d00001 diff --git a/components/converters/unstructured_fileconverter/tests/test_fileconverter.py b/components/converters/unstructured_fileconverter/tests/test_fileconverter.py new file mode 100644 index 000000000..07c7be1f4 --- /dev/null +++ b/components/converters/unstructured_fileconverter/tests/test_fileconverter.py @@ -0,0 +1,97 @@ +from pathlib import Path + +import pytest + +from unstructured_fileconverter_haystack import UnstructuredFileConverter + + +@pytest.fixture +def samples_path(): + return Path(__file__).parent / "samples" + + +class TestUnstructuredFileConverter: + def test_init_default(self): + converter = UnstructuredFileConverter(api_key="test-api-key") + assert converter.api_url == "https://api.unstructured.io/general/v0/general" + assert converter.api_key == "test-api-key" + assert converter.document_creation_mode == "one-doc-per-file" + assert converter.separator == "\n\n" + assert converter.unstructured_kwargs == {} + assert converter.progress_bar + + def test_init_with_parameters(self): + converter = UnstructuredFileConverter( + api_url="http://custom-url:8000/general", + document_creation_mode="one-doc-per-element", + separator="|", + unstructured_kwargs={"foo": "bar"}, + progress_bar=False, + ) + assert converter.api_url == "http://custom-url:8000/general" + assert converter.api_key is None + assert converter.document_creation_mode == "one-doc-per-element" + assert converter.separator == "|" + assert 
converter.unstructured_kwargs == {"foo": "bar"} + assert not converter.progress_bar + + def test_to_dict(self): + converter = UnstructuredFileConverter(api_key="test-api-key") + converter_dict = converter.to_dict() + + assert converter_dict == { + "type": "unstructured_fileconverter_haystack.fileconverter.UnstructuredFileConverter", + "init_parameters": { + "api_url": "https://api.unstructured.io/general/v0/general", + "document_creation_mode": "one-doc-per-file", + "separator": "\n\n", + "unstructured_kwargs": {}, + "progress_bar": True, + }, + } + + @pytest.mark.integration + def test_run_one_doc_per_file(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-file" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) == 1 + assert documents[0].meta == {"name": str(pdf_path)} + + @pytest.mark.integration + def test_run_one_doc_per_page(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-page" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) == 4 + for i, doc in enumerate(documents, start=1): + assert doc.meta["name"] == str(pdf_path) + assert doc.meta["page_number"] == i + + @pytest.mark.integration + def test_run_one_doc_per_element(self, samples_path): + pdf_path = samples_path / "sample_pdf.pdf" + + local_converter = UnstructuredFileConverter( + api_url="http://localhost:8000/general/v0/general", document_creation_mode="one-doc-per-element" + ) + + documents = local_converter.run([pdf_path])["documents"] + + assert len(documents) > 4 + for doc in documents: + assert doc.meta["name"] == str(pdf_path) + assert "page_number" in doc.meta + + # elements have a category attribute that is saved in the document meta + assert "category" in doc.meta From 1591c01ba9157a77df1a374ebdbbed5540cff3db Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Wed, 29 Nov 2023 16:04:21 +0100 Subject: [PATCH 11/36] Reorganize repository (#62) * move dirs * refactor workflows * fix error * workflows * unstructured --- .../{document_stores_chroma.yml => chroma.yml} | 10 +++++----- ...stores_elasticsearch.yml => elasticsearch.yml} | 14 +++++++------- ...tor_embedders.yml => instructor_embedders.yml} | 10 +++++----- .github/workflows/nodes_text2speech.yml | 4 ++-- ...nverter.yml => unstructured_fileconverter.yml} | 12 ++++++------ components/README.md | 6 ------ document_stores/hatch.toml | 12 ------------ .../chroma/.gitignore | 0 .../chroma}/LICENSE | 0 .../chroma/README.md | 0 .../chroma/example/data/usr_01.txt | 0 .../chroma/example/data/usr_02.txt | 0 .../chroma/example/data/usr_03.txt | 0 .../chroma/example/data/usr_04.txt | 0 .../chroma/example/data/usr_05.txt | 0 .../chroma/example/data/usr_06.txt | 0 .../chroma/example/data/usr_07.txt | 0 .../chroma/example/data/usr_08.txt | 0 .../chroma/example/data/usr_09.txt | 0 .../chroma/example/data/usr_10.txt | 0 .../chroma/example/data/usr_11.txt | 0 .../chroma/example/data/usr_12.txt | 0 .../chroma/example/data/usr_20.txt | 0 .../chroma/example/data/usr_21.txt | 0 .../chroma/example/data/usr_22.txt | 0 .../chroma/example/data/usr_23.txt | 0 .../chroma/example/data/usr_24.txt | 0 .../chroma/example/data/usr_25.txt | 0 
.../chroma/example/data/usr_26.txt | 0 .../chroma/example/data/usr_27.txt | 0 .../chroma/example/data/usr_28.txt | 0 .../chroma/example/data/usr_29.txt | 0 .../chroma/example/data/usr_30.txt | 0 .../chroma/example/data/usr_31.txt | 0 .../chroma/example/data/usr_32.txt | 0 .../chroma/example/data/usr_40.txt | 0 .../chroma/example/data/usr_41.txt | 0 .../chroma/example/data/usr_42.txt | 0 .../chroma/example/data/usr_43.txt | 0 .../chroma/example/data/usr_44.txt | 0 .../chroma/example/data/usr_45.txt | 0 .../chroma/example/data/usr_46.txt | 0 .../chroma/example/data/usr_50.txt | 0 .../chroma/example/data/usr_51.txt | 0 .../chroma/example/data/usr_52.txt | 0 .../chroma/example/data/usr_90.txt | 0 .../chroma/example/example.py | 0 .../chroma/pyproject.toml | 0 .../chroma/src/chroma_haystack/__about__.py | 0 .../chroma/src/chroma_haystack/__init__.py | 0 .../chroma/src/chroma_haystack/document_store.py | 0 .../chroma/src/chroma_haystack/errors.py | 0 .../chroma/src/chroma_haystack/retriever.py | 0 .../chroma/src/chroma_haystack/utils.py | 0 .../chroma/tests/__init__.py | 0 .../chroma/tests/test_document_store.py | 0 .../chroma/tests/test_retriever.py | 0 .../elasticsearch/.gitignore | 0 .../chroma => integrations/elasticsearch}/LICENSE | 0 .../elasticsearch/README.md | 0 .../elasticsearch/docker-compose.yml | 0 .../elasticsearch/pyproject.toml | 0 .../src/elasticsearch_haystack/__about__.py | 0 .../src/elasticsearch_haystack/__init__.py | 0 .../src/elasticsearch_haystack/bm25_retriever.py | 0 .../src/elasticsearch_haystack/document_store.py | 0 .../elasticsearch_haystack/embedding_retriever.py | 0 .../src/elasticsearch_haystack/filters.py | 0 .../elasticsearch/tests/__init__.py | 0 .../elasticsearch/tests/test_bm25_retriever.py | 0 .../elasticsearch/tests/test_document_store.py | 0 .../tests/test_embedding_retriever.py | 0 .../elasticsearch/tests/test_filters.py | 0 .../instructor-embedders/LICENSE.txt | 0 .../instructor-embedders/README.md | 0 .../instructor_embedders/__about__.py | 0 .../instructor_embedders/__init__.py | 0 .../embedding_backend/__init__.py | 0 .../embedding_backend/instructor_backend.py | 0 .../instructor_document_embedder.py | 0 .../instructor_text_embedder.py | 0 .../instructor-embedders/pyproject.toml | 0 .../instructor-embedders/tests/__init__.py | 0 .../tests/test_instructor_backend.py | 0 .../tests/test_instructor_document_embedder.py | 0 .../tests/test_instructor_embedders.py | 0 .../tests/test_instructor_text_embedder.py | 0 {nodes => integrations/nodes}/README.md | 0 {components => integrations/nodes}/hatch.toml | 0 .../nodes}/text2speech/LICENSE.txt | 0 .../nodes}/text2speech/README.md | 0 .../nodes}/text2speech/pyproject.toml | 0 .../nodes}/text2speech/tests/__init__.py | 0 .../nodes}/text2speech/tests/samples/answer.wav | Bin .../the context for this answer is here.wav | Bin .../this is the content of the document.wav | Bin .../nodes}/text2speech/tests/test_nodes.py | 0 .../nodes}/text2speech/text2speech/__about__.py | 0 .../nodes}/text2speech/text2speech/__init__.py | 0 .../text2speech/text2speech/answer_to_speech.py | 0 .../text2speech/text2speech/document_to_speech.py | 0 .../nodes}/text2speech/text2speech/errors.py | 0 .../text2speech/text2speech/utils/__init__.py | 0 .../text2speech/utils/text_to_speech.py | 0 .../unstructured/fileconverter}/LICENSE | 0 .../unstructured/fileconverter}/README.md | 0 .../unstructured/fileconverter}/pyproject.toml | 0 .../__about__.py | 0 .../__init__.py | 0 .../fileconverter.py | 0 
.../fileconverter}/tests/samples/sample_pdf.pdf | Bin .../fileconverter}/tests/test_fileconverter.py | 0 nodes/hatch.toml | 12 ------------ 113 files changed, 25 insertions(+), 55 deletions(-) rename .github/workflows/{document_stores_chroma.yml => chroma.yml} (84%) rename .github/workflows/{document_stores_elasticsearch.yml => elasticsearch.yml} (75%) rename .github/workflows/{components_instructor_embedders.yml => instructor_embedders.yml} (67%) rename .github/workflows/{components_unstructured_fileconverter.yml => unstructured_fileconverter.yml} (78%) delete mode 100644 components/README.md delete mode 100644 document_stores/hatch.toml rename {document_stores => integrations}/chroma/.gitignore (100%) rename {components/converters/unstructured_fileconverter => integrations/chroma}/LICENSE (100%) rename {document_stores => integrations}/chroma/README.md (100%) rename {document_stores => integrations}/chroma/example/data/usr_01.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_02.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_03.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_04.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_05.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_06.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_07.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_08.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_09.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_10.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_11.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_12.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_20.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_21.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_22.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_23.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_24.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_25.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_26.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_27.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_28.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_29.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_30.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_31.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_32.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_40.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_41.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_42.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_43.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_44.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_45.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_46.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_50.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_51.txt 
(100%) rename {document_stores => integrations}/chroma/example/data/usr_52.txt (100%) rename {document_stores => integrations}/chroma/example/data/usr_90.txt (100%) rename {document_stores => integrations}/chroma/example/example.py (100%) rename {document_stores => integrations}/chroma/pyproject.toml (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/__about__.py (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/__init__.py (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/document_store.py (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/errors.py (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/retriever.py (100%) rename {document_stores => integrations}/chroma/src/chroma_haystack/utils.py (100%) rename {document_stores => integrations}/chroma/tests/__init__.py (100%) rename {document_stores => integrations}/chroma/tests/test_document_store.py (100%) rename {document_stores => integrations}/chroma/tests/test_retriever.py (100%) rename {document_stores => integrations}/elasticsearch/.gitignore (100%) rename {document_stores/chroma => integrations/elasticsearch}/LICENSE (100%) rename {document_stores => integrations}/elasticsearch/README.md (100%) rename {document_stores => integrations}/elasticsearch/docker-compose.yml (100%) rename {document_stores => integrations}/elasticsearch/pyproject.toml (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/__about__.py (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/__init__.py (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/document_store.py (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py (100%) rename {document_stores => integrations}/elasticsearch/src/elasticsearch_haystack/filters.py (100%) rename {document_stores => integrations}/elasticsearch/tests/__init__.py (100%) rename {document_stores => integrations}/elasticsearch/tests/test_bm25_retriever.py (100%) rename {document_stores => integrations}/elasticsearch/tests/test_document_store.py (100%) rename {document_stores => integrations}/elasticsearch/tests/test_embedding_retriever.py (100%) rename {document_stores => integrations}/elasticsearch/tests/test_filters.py (100%) rename {components/embedders => integrations}/instructor-embedders/LICENSE.txt (100%) rename {components/embedders => integrations}/instructor-embedders/README.md (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/__about__.py (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/__init__.py (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/embedding_backend/__init__.py (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/instructor_document_embedder.py (100%) rename {components/embedders => integrations}/instructor-embedders/instructor_embedders/instructor_text_embedder.py (100%) rename {components/embedders => integrations}/instructor-embedders/pyproject.toml (100%) rename {components/embedders => 
integrations}/instructor-embedders/tests/__init__.py (100%) rename {components/embedders => integrations}/instructor-embedders/tests/test_instructor_backend.py (100%) rename {components/embedders => integrations}/instructor-embedders/tests/test_instructor_document_embedder.py (100%) rename {components/embedders => integrations}/instructor-embedders/tests/test_instructor_embedders.py (100%) rename {components/embedders => integrations}/instructor-embedders/tests/test_instructor_text_embedder.py (100%) rename {nodes => integrations/nodes}/README.md (100%) rename {components => integrations/nodes}/hatch.toml (100%) rename {nodes => integrations/nodes}/text2speech/LICENSE.txt (100%) rename {nodes => integrations/nodes}/text2speech/README.md (100%) rename {nodes => integrations/nodes}/text2speech/pyproject.toml (100%) rename {nodes => integrations/nodes}/text2speech/tests/__init__.py (100%) rename {nodes => integrations/nodes}/text2speech/tests/samples/answer.wav (100%) rename {nodes => integrations/nodes}/text2speech/tests/samples/the context for this answer is here.wav (100%) rename {nodes => integrations/nodes}/text2speech/tests/samples/this is the content of the document.wav (100%) rename {nodes => integrations/nodes}/text2speech/tests/test_nodes.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/__about__.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/__init__.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/answer_to_speech.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/document_to_speech.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/errors.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/utils/__init__.py (100%) rename {nodes => integrations/nodes}/text2speech/text2speech/utils/text_to_speech.py (100%) rename {document_stores/elasticsearch => integrations/unstructured/fileconverter}/LICENSE (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/README.md (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/pyproject.toml (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/src/unstructured_fileconverter_haystack/__about__.py (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/src/unstructured_fileconverter_haystack/__init__.py (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/src/unstructured_fileconverter_haystack/fileconverter.py (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/tests/samples/sample_pdf.pdf (100%) rename {components/converters/unstructured_fileconverter => integrations/unstructured/fileconverter}/tests/test_fileconverter.py (100%) delete mode 100644 nodes/hatch.toml diff --git a/.github/workflows/document_stores_chroma.yml b/.github/workflows/chroma.yml similarity index 84% rename from .github/workflows/document_stores_chroma.yml rename to .github/workflows/chroma.yml index 3068ea1fc..88020818e 100644 --- a/.github/workflows/document_stores_chroma.yml +++ b/.github/workflows/chroma.yml @@ -1,21 +1,21 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / Document Stores / 
chroma +name: Test / chroma on: schedule: - cron: "0 0 * * *" pull_request: paths: - - 'document_stores/chroma/**' - - '.github/workflows/document_stores_chroma.yml' + - 'integrations/chroma/**' + - '.github/workflows/chroma.yml' defaults: run: - working-directory: document_stores/chroma + working-directory: integrations/chroma concurrency: - group: document_stores_chroma-${{ github.head_ref }} + group: chroma-${{ github.head_ref }} cancel-in-progress: true env: diff --git a/.github/workflows/document_stores_elasticsearch.yml b/.github/workflows/elasticsearch.yml similarity index 75% rename from .github/workflows/document_stores_elasticsearch.yml rename to .github/workflows/elasticsearch.yml index 8da8d1e56..08254a58b 100644 --- a/.github/workflows/document_stores_elasticsearch.yml +++ b/.github/workflows/elasticsearch.yml @@ -1,17 +1,17 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / Document Stores / elasticsearch +name: Test / elasticsearch on: schedule: - cron: "0 0 * * *" pull_request: paths: - - "document_stores/elasticsearch/**" - - ".github/workflows/document_stores_elasticsearch.yml" + - "integrations/elasticsearch/**" + - ".github/workflows/elasticsearch.yml" concurrency: - group: document_stores_elasticsearch-${{ github.head_ref }} + group: elasticsearch-${{ github.head_ref }} cancel-in-progress: true env: @@ -40,14 +40,14 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: document_stores/elasticsearch + working-directory: integrations/elasticsearch if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run ElasticSearch container - working-directory: document_stores/elasticsearch + working-directory: integrations/elasticsearch run: docker-compose up -d - name: Run tests - working-directory: document_stores/elasticsearch + working-directory: integrations/elasticsearch run: hatch run cov diff --git a/.github/workflows/components_instructor_embedders.yml b/.github/workflows/instructor_embedders.yml similarity index 67% rename from .github/workflows/components_instructor_embedders.yml rename to .github/workflows/instructor_embedders.yml index c20363d51..293c6c142 100644 --- a/.github/workflows/components_instructor_embedders.yml +++ b/.github/workflows/instructor_embedders.yml @@ -1,16 +1,16 @@ -name: Test / Components / instructor-embedders +name: Test / instructor-embedders on: schedule: - cron: "0 0 * * *" pull_request: paths: - - 'components/embedders/instructor-embedders/**' - - '.github/workflows/components_instructor_embedders.yml' + - 'integrations/instructor-embedders/**' + - '.github/workflows/instructor_embedders.yml' defaults: run: - working-directory: components/embedders/instructor-embedders + working-directory: instructor-embedders jobs: test: @@ -27,7 +27,7 @@ jobs: - name: Ruff uses: chartboost/ruff-action@v1 with: - src: components/embedders/instructor-embedders + src: integrations/instructor-embedders - name: Install instructor-embedders run: | diff --git a/.github/workflows/nodes_text2speech.yml b/.github/workflows/nodes_text2speech.yml index 74329d53c..315215d0d 100644 --- a/.github/workflows/nodes_text2speech.yml +++ b/.github/workflows/nodes_text2speech.yml @@ -5,12 +5,12 @@ on: - cron: "0 0 * * *" pull_request: paths: - - 'nodes/text2speech/**' + - 'integrations/nodes/text2speech/**' - '.github/workflows/nodes_text2speech.yml' defaults: run: - working-directory: nodes/text2speech + 
working-directory: integrations/nodes/text2speech jobs: test: diff --git a/.github/workflows/components_unstructured_fileconverter.yml b/.github/workflows/unstructured_fileconverter.yml similarity index 78% rename from .github/workflows/components_unstructured_fileconverter.yml rename to .github/workflows/unstructured_fileconverter.yml index f60573f79..8d7ece048 100644 --- a/.github/workflows/components_unstructured_fileconverter.yml +++ b/.github/workflows/unstructured_fileconverter.yml @@ -1,17 +1,17 @@ # This workflow comes from https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / Components / unstructured-fileconverter +name: Test / unstructured / fileconverter on: schedule: - cron: "0 0 * * *" pull_request: paths: - - "components/converters/unstructured_fileconverter/**" - - ".github/workflows/components_unstructured_fileconverter.yml" + - "integrations/unstructured/fileconverter/**" + - ".github/workflows/unstructured_fileconverter.yml" concurrency: - group: components_unstructured_fileconverter-${{ github.head_ref }} + group: unstructured_fileconverter-${{ github.head_ref }} cancel-in-progress: true env: @@ -50,10 +50,10 @@ jobs: run: pip install --upgrade hatch - name: Lint - working-directory: components/converters/unstructured_fileconverter + working-directory: integrations/unstructured/fileconverter if: matrix.python-version == '3.9' run: hatch run lint:all - name: Run tests - working-directory: components/converters/unstructured_fileconverter + working-directory: integrations/unstructured/fileconverter run: hatch run cov diff --git a/components/README.md b/components/README.md deleted file mode 100644 index 7de46eb5d..000000000 --- a/components/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Components (for Haystack 2.x) - -To create a new package, run this command: -```sh -hatch --config hatch.toml new my_custom_component -``` diff --git a/document_stores/hatch.toml b/document_stores/hatch.toml deleted file mode 100644 index be4d67218..000000000 --- a/document_stores/hatch.toml +++ /dev/null @@ -1,12 +0,0 @@ -[template] -name = "deepset GmbH" -email = "info@deepset.ai" - -[template.licenses] -headers = true -default = [ - "Apache-2.0", -] - -[template.plugins.default] -src-layout = false diff --git a/document_stores/chroma/.gitignore b/integrations/chroma/.gitignore similarity index 100% rename from document_stores/chroma/.gitignore rename to integrations/chroma/.gitignore diff --git a/components/converters/unstructured_fileconverter/LICENSE b/integrations/chroma/LICENSE similarity index 100% rename from components/converters/unstructured_fileconverter/LICENSE rename to integrations/chroma/LICENSE diff --git a/document_stores/chroma/README.md b/integrations/chroma/README.md similarity index 100% rename from document_stores/chroma/README.md rename to integrations/chroma/README.md diff --git a/document_stores/chroma/example/data/usr_01.txt b/integrations/chroma/example/data/usr_01.txt similarity index 100% rename from document_stores/chroma/example/data/usr_01.txt rename to integrations/chroma/example/data/usr_01.txt diff --git a/document_stores/chroma/example/data/usr_02.txt b/integrations/chroma/example/data/usr_02.txt similarity index 100% rename from document_stores/chroma/example/data/usr_02.txt rename to integrations/chroma/example/data/usr_02.txt diff --git a/document_stores/chroma/example/data/usr_03.txt b/integrations/chroma/example/data/usr_03.txt similarity index 
100% rename from document_stores/chroma/example/data/usr_03.txt rename to integrations/chroma/example/data/usr_03.txt diff --git a/document_stores/chroma/example/data/usr_04.txt b/integrations/chroma/example/data/usr_04.txt similarity index 100% rename from document_stores/chroma/example/data/usr_04.txt rename to integrations/chroma/example/data/usr_04.txt diff --git a/document_stores/chroma/example/data/usr_05.txt b/integrations/chroma/example/data/usr_05.txt similarity index 100% rename from document_stores/chroma/example/data/usr_05.txt rename to integrations/chroma/example/data/usr_05.txt diff --git a/document_stores/chroma/example/data/usr_06.txt b/integrations/chroma/example/data/usr_06.txt similarity index 100% rename from document_stores/chroma/example/data/usr_06.txt rename to integrations/chroma/example/data/usr_06.txt diff --git a/document_stores/chroma/example/data/usr_07.txt b/integrations/chroma/example/data/usr_07.txt similarity index 100% rename from document_stores/chroma/example/data/usr_07.txt rename to integrations/chroma/example/data/usr_07.txt diff --git a/document_stores/chroma/example/data/usr_08.txt b/integrations/chroma/example/data/usr_08.txt similarity index 100% rename from document_stores/chroma/example/data/usr_08.txt rename to integrations/chroma/example/data/usr_08.txt diff --git a/document_stores/chroma/example/data/usr_09.txt b/integrations/chroma/example/data/usr_09.txt similarity index 100% rename from document_stores/chroma/example/data/usr_09.txt rename to integrations/chroma/example/data/usr_09.txt diff --git a/document_stores/chroma/example/data/usr_10.txt b/integrations/chroma/example/data/usr_10.txt similarity index 100% rename from document_stores/chroma/example/data/usr_10.txt rename to integrations/chroma/example/data/usr_10.txt diff --git a/document_stores/chroma/example/data/usr_11.txt b/integrations/chroma/example/data/usr_11.txt similarity index 100% rename from document_stores/chroma/example/data/usr_11.txt rename to integrations/chroma/example/data/usr_11.txt diff --git a/document_stores/chroma/example/data/usr_12.txt b/integrations/chroma/example/data/usr_12.txt similarity index 100% rename from document_stores/chroma/example/data/usr_12.txt rename to integrations/chroma/example/data/usr_12.txt diff --git a/document_stores/chroma/example/data/usr_20.txt b/integrations/chroma/example/data/usr_20.txt similarity index 100% rename from document_stores/chroma/example/data/usr_20.txt rename to integrations/chroma/example/data/usr_20.txt diff --git a/document_stores/chroma/example/data/usr_21.txt b/integrations/chroma/example/data/usr_21.txt similarity index 100% rename from document_stores/chroma/example/data/usr_21.txt rename to integrations/chroma/example/data/usr_21.txt diff --git a/document_stores/chroma/example/data/usr_22.txt b/integrations/chroma/example/data/usr_22.txt similarity index 100% rename from document_stores/chroma/example/data/usr_22.txt rename to integrations/chroma/example/data/usr_22.txt diff --git a/document_stores/chroma/example/data/usr_23.txt b/integrations/chroma/example/data/usr_23.txt similarity index 100% rename from document_stores/chroma/example/data/usr_23.txt rename to integrations/chroma/example/data/usr_23.txt diff --git a/document_stores/chroma/example/data/usr_24.txt b/integrations/chroma/example/data/usr_24.txt similarity index 100% rename from document_stores/chroma/example/data/usr_24.txt rename to integrations/chroma/example/data/usr_24.txt diff --git a/document_stores/chroma/example/data/usr_25.txt 
b/integrations/chroma/example/data/usr_25.txt similarity index 100% rename from document_stores/chroma/example/data/usr_25.txt rename to integrations/chroma/example/data/usr_25.txt diff --git a/document_stores/chroma/example/data/usr_26.txt b/integrations/chroma/example/data/usr_26.txt similarity index 100% rename from document_stores/chroma/example/data/usr_26.txt rename to integrations/chroma/example/data/usr_26.txt diff --git a/document_stores/chroma/example/data/usr_27.txt b/integrations/chroma/example/data/usr_27.txt similarity index 100% rename from document_stores/chroma/example/data/usr_27.txt rename to integrations/chroma/example/data/usr_27.txt diff --git a/document_stores/chroma/example/data/usr_28.txt b/integrations/chroma/example/data/usr_28.txt similarity index 100% rename from document_stores/chroma/example/data/usr_28.txt rename to integrations/chroma/example/data/usr_28.txt diff --git a/document_stores/chroma/example/data/usr_29.txt b/integrations/chroma/example/data/usr_29.txt similarity index 100% rename from document_stores/chroma/example/data/usr_29.txt rename to integrations/chroma/example/data/usr_29.txt diff --git a/document_stores/chroma/example/data/usr_30.txt b/integrations/chroma/example/data/usr_30.txt similarity index 100% rename from document_stores/chroma/example/data/usr_30.txt rename to integrations/chroma/example/data/usr_30.txt diff --git a/document_stores/chroma/example/data/usr_31.txt b/integrations/chroma/example/data/usr_31.txt similarity index 100% rename from document_stores/chroma/example/data/usr_31.txt rename to integrations/chroma/example/data/usr_31.txt diff --git a/document_stores/chroma/example/data/usr_32.txt b/integrations/chroma/example/data/usr_32.txt similarity index 100% rename from document_stores/chroma/example/data/usr_32.txt rename to integrations/chroma/example/data/usr_32.txt diff --git a/document_stores/chroma/example/data/usr_40.txt b/integrations/chroma/example/data/usr_40.txt similarity index 100% rename from document_stores/chroma/example/data/usr_40.txt rename to integrations/chroma/example/data/usr_40.txt diff --git a/document_stores/chroma/example/data/usr_41.txt b/integrations/chroma/example/data/usr_41.txt similarity index 100% rename from document_stores/chroma/example/data/usr_41.txt rename to integrations/chroma/example/data/usr_41.txt diff --git a/document_stores/chroma/example/data/usr_42.txt b/integrations/chroma/example/data/usr_42.txt similarity index 100% rename from document_stores/chroma/example/data/usr_42.txt rename to integrations/chroma/example/data/usr_42.txt diff --git a/document_stores/chroma/example/data/usr_43.txt b/integrations/chroma/example/data/usr_43.txt similarity index 100% rename from document_stores/chroma/example/data/usr_43.txt rename to integrations/chroma/example/data/usr_43.txt diff --git a/document_stores/chroma/example/data/usr_44.txt b/integrations/chroma/example/data/usr_44.txt similarity index 100% rename from document_stores/chroma/example/data/usr_44.txt rename to integrations/chroma/example/data/usr_44.txt diff --git a/document_stores/chroma/example/data/usr_45.txt b/integrations/chroma/example/data/usr_45.txt similarity index 100% rename from document_stores/chroma/example/data/usr_45.txt rename to integrations/chroma/example/data/usr_45.txt diff --git a/document_stores/chroma/example/data/usr_46.txt b/integrations/chroma/example/data/usr_46.txt similarity index 100% rename from document_stores/chroma/example/data/usr_46.txt rename to integrations/chroma/example/data/usr_46.txt 
diff --git a/document_stores/chroma/example/data/usr_50.txt b/integrations/chroma/example/data/usr_50.txt similarity index 100% rename from document_stores/chroma/example/data/usr_50.txt rename to integrations/chroma/example/data/usr_50.txt diff --git a/document_stores/chroma/example/data/usr_51.txt b/integrations/chroma/example/data/usr_51.txt similarity index 100% rename from document_stores/chroma/example/data/usr_51.txt rename to integrations/chroma/example/data/usr_51.txt diff --git a/document_stores/chroma/example/data/usr_52.txt b/integrations/chroma/example/data/usr_52.txt similarity index 100% rename from document_stores/chroma/example/data/usr_52.txt rename to integrations/chroma/example/data/usr_52.txt diff --git a/document_stores/chroma/example/data/usr_90.txt b/integrations/chroma/example/data/usr_90.txt similarity index 100% rename from document_stores/chroma/example/data/usr_90.txt rename to integrations/chroma/example/data/usr_90.txt diff --git a/document_stores/chroma/example/example.py b/integrations/chroma/example/example.py similarity index 100% rename from document_stores/chroma/example/example.py rename to integrations/chroma/example/example.py diff --git a/document_stores/chroma/pyproject.toml b/integrations/chroma/pyproject.toml similarity index 100% rename from document_stores/chroma/pyproject.toml rename to integrations/chroma/pyproject.toml diff --git a/document_stores/chroma/src/chroma_haystack/__about__.py b/integrations/chroma/src/chroma_haystack/__about__.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/__about__.py rename to integrations/chroma/src/chroma_haystack/__about__.py diff --git a/document_stores/chroma/src/chroma_haystack/__init__.py b/integrations/chroma/src/chroma_haystack/__init__.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/__init__.py rename to integrations/chroma/src/chroma_haystack/__init__.py diff --git a/document_stores/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/document_store.py rename to integrations/chroma/src/chroma_haystack/document_store.py diff --git a/document_stores/chroma/src/chroma_haystack/errors.py b/integrations/chroma/src/chroma_haystack/errors.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/errors.py rename to integrations/chroma/src/chroma_haystack/errors.py diff --git a/document_stores/chroma/src/chroma_haystack/retriever.py b/integrations/chroma/src/chroma_haystack/retriever.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/retriever.py rename to integrations/chroma/src/chroma_haystack/retriever.py diff --git a/document_stores/chroma/src/chroma_haystack/utils.py b/integrations/chroma/src/chroma_haystack/utils.py similarity index 100% rename from document_stores/chroma/src/chroma_haystack/utils.py rename to integrations/chroma/src/chroma_haystack/utils.py diff --git a/document_stores/chroma/tests/__init__.py b/integrations/chroma/tests/__init__.py similarity index 100% rename from document_stores/chroma/tests/__init__.py rename to integrations/chroma/tests/__init__.py diff --git a/document_stores/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py similarity index 100% rename from document_stores/chroma/tests/test_document_store.py rename to integrations/chroma/tests/test_document_store.py diff --git 
a/document_stores/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py similarity index 100% rename from document_stores/chroma/tests/test_retriever.py rename to integrations/chroma/tests/test_retriever.py diff --git a/document_stores/elasticsearch/.gitignore b/integrations/elasticsearch/.gitignore similarity index 100% rename from document_stores/elasticsearch/.gitignore rename to integrations/elasticsearch/.gitignore diff --git a/document_stores/chroma/LICENSE b/integrations/elasticsearch/LICENSE similarity index 100% rename from document_stores/chroma/LICENSE rename to integrations/elasticsearch/LICENSE diff --git a/document_stores/elasticsearch/README.md b/integrations/elasticsearch/README.md similarity index 100% rename from document_stores/elasticsearch/README.md rename to integrations/elasticsearch/README.md diff --git a/document_stores/elasticsearch/docker-compose.yml b/integrations/elasticsearch/docker-compose.yml similarity index 100% rename from document_stores/elasticsearch/docker-compose.yml rename to integrations/elasticsearch/docker-compose.yml diff --git a/document_stores/elasticsearch/pyproject.toml b/integrations/elasticsearch/pyproject.toml similarity index 100% rename from document_stores/elasticsearch/pyproject.toml rename to integrations/elasticsearch/pyproject.toml diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/__about__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/__about__.py rename to integrations/elasticsearch/src/elasticsearch_haystack/__about__.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/__init__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/__init__.py rename to integrations/elasticsearch/src/elasticsearch_haystack/__init__.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py rename to integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/document_store.py rename to integrations/elasticsearch/src/elasticsearch_haystack/document_store.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py rename to integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py diff --git a/document_stores/elasticsearch/src/elasticsearch_haystack/filters.py b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py similarity index 100% rename from document_stores/elasticsearch/src/elasticsearch_haystack/filters.py rename to integrations/elasticsearch/src/elasticsearch_haystack/filters.py diff --git a/document_stores/elasticsearch/tests/__init__.py b/integrations/elasticsearch/tests/__init__.py similarity index 100% rename from 
document_stores/elasticsearch/tests/__init__.py rename to integrations/elasticsearch/tests/__init__.py diff --git a/document_stores/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py similarity index 100% rename from document_stores/elasticsearch/tests/test_bm25_retriever.py rename to integrations/elasticsearch/tests/test_bm25_retriever.py diff --git a/document_stores/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py similarity index 100% rename from document_stores/elasticsearch/tests/test_document_store.py rename to integrations/elasticsearch/tests/test_document_store.py diff --git a/document_stores/elasticsearch/tests/test_embedding_retriever.py b/integrations/elasticsearch/tests/test_embedding_retriever.py similarity index 100% rename from document_stores/elasticsearch/tests/test_embedding_retriever.py rename to integrations/elasticsearch/tests/test_embedding_retriever.py diff --git a/document_stores/elasticsearch/tests/test_filters.py b/integrations/elasticsearch/tests/test_filters.py similarity index 100% rename from document_stores/elasticsearch/tests/test_filters.py rename to integrations/elasticsearch/tests/test_filters.py diff --git a/components/embedders/instructor-embedders/LICENSE.txt b/integrations/instructor-embedders/LICENSE.txt similarity index 100% rename from components/embedders/instructor-embedders/LICENSE.txt rename to integrations/instructor-embedders/LICENSE.txt diff --git a/components/embedders/instructor-embedders/README.md b/integrations/instructor-embedders/README.md similarity index 100% rename from components/embedders/instructor-embedders/README.md rename to integrations/instructor-embedders/README.md diff --git a/components/embedders/instructor-embedders/instructor_embedders/__about__.py b/integrations/instructor-embedders/instructor_embedders/__about__.py similarity index 100% rename from components/embedders/instructor-embedders/instructor_embedders/__about__.py rename to integrations/instructor-embedders/instructor_embedders/__about__.py diff --git a/components/embedders/instructor-embedders/instructor_embedders/__init__.py b/integrations/instructor-embedders/instructor_embedders/__init__.py similarity index 100% rename from components/embedders/instructor-embedders/instructor_embedders/__init__.py rename to integrations/instructor-embedders/instructor_embedders/__init__.py diff --git a/components/embedders/instructor-embedders/instructor_embedders/embedding_backend/__init__.py b/integrations/instructor-embedders/instructor_embedders/embedding_backend/__init__.py similarity index 100% rename from components/embedders/instructor-embedders/instructor_embedders/embedding_backend/__init__.py rename to integrations/instructor-embedders/instructor_embedders/embedding_backend/__init__.py diff --git a/components/embedders/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py similarity index 100% rename from components/embedders/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py rename to integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py diff --git a/components/embedders/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py similarity index 100% rename from 
components/embedders/instructor-embedders/instructor_embedders/instructor_document_embedder.py rename to integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py diff --git a/components/embedders/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py similarity index 100% rename from components/embedders/instructor-embedders/instructor_embedders/instructor_text_embedder.py rename to integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py diff --git a/components/embedders/instructor-embedders/pyproject.toml b/integrations/instructor-embedders/pyproject.toml similarity index 100% rename from components/embedders/instructor-embedders/pyproject.toml rename to integrations/instructor-embedders/pyproject.toml diff --git a/components/embedders/instructor-embedders/tests/__init__.py b/integrations/instructor-embedders/tests/__init__.py similarity index 100% rename from components/embedders/instructor-embedders/tests/__init__.py rename to integrations/instructor-embedders/tests/__init__.py diff --git a/components/embedders/instructor-embedders/tests/test_instructor_backend.py b/integrations/instructor-embedders/tests/test_instructor_backend.py similarity index 100% rename from components/embedders/instructor-embedders/tests/test_instructor_backend.py rename to integrations/instructor-embedders/tests/test_instructor_backend.py diff --git a/components/embedders/instructor-embedders/tests/test_instructor_document_embedder.py b/integrations/instructor-embedders/tests/test_instructor_document_embedder.py similarity index 100% rename from components/embedders/instructor-embedders/tests/test_instructor_document_embedder.py rename to integrations/instructor-embedders/tests/test_instructor_document_embedder.py diff --git a/components/embedders/instructor-embedders/tests/test_instructor_embedders.py b/integrations/instructor-embedders/tests/test_instructor_embedders.py similarity index 100% rename from components/embedders/instructor-embedders/tests/test_instructor_embedders.py rename to integrations/instructor-embedders/tests/test_instructor_embedders.py diff --git a/components/embedders/instructor-embedders/tests/test_instructor_text_embedder.py b/integrations/instructor-embedders/tests/test_instructor_text_embedder.py similarity index 100% rename from components/embedders/instructor-embedders/tests/test_instructor_text_embedder.py rename to integrations/instructor-embedders/tests/test_instructor_text_embedder.py diff --git a/nodes/README.md b/integrations/nodes/README.md similarity index 100% rename from nodes/README.md rename to integrations/nodes/README.md diff --git a/components/hatch.toml b/integrations/nodes/hatch.toml similarity index 100% rename from components/hatch.toml rename to integrations/nodes/hatch.toml diff --git a/nodes/text2speech/LICENSE.txt b/integrations/nodes/text2speech/LICENSE.txt similarity index 100% rename from nodes/text2speech/LICENSE.txt rename to integrations/nodes/text2speech/LICENSE.txt diff --git a/nodes/text2speech/README.md b/integrations/nodes/text2speech/README.md similarity index 100% rename from nodes/text2speech/README.md rename to integrations/nodes/text2speech/README.md diff --git a/nodes/text2speech/pyproject.toml b/integrations/nodes/text2speech/pyproject.toml similarity index 100% rename from nodes/text2speech/pyproject.toml rename to integrations/nodes/text2speech/pyproject.toml diff --git 
a/nodes/text2speech/tests/__init__.py b/integrations/nodes/text2speech/tests/__init__.py similarity index 100% rename from nodes/text2speech/tests/__init__.py rename to integrations/nodes/text2speech/tests/__init__.py diff --git a/nodes/text2speech/tests/samples/answer.wav b/integrations/nodes/text2speech/tests/samples/answer.wav similarity index 100% rename from nodes/text2speech/tests/samples/answer.wav rename to integrations/nodes/text2speech/tests/samples/answer.wav diff --git a/nodes/text2speech/tests/samples/the context for this answer is here.wav b/integrations/nodes/text2speech/tests/samples/the context for this answer is here.wav similarity index 100% rename from nodes/text2speech/tests/samples/the context for this answer is here.wav rename to integrations/nodes/text2speech/tests/samples/the context for this answer is here.wav diff --git a/nodes/text2speech/tests/samples/this is the content of the document.wav b/integrations/nodes/text2speech/tests/samples/this is the content of the document.wav similarity index 100% rename from nodes/text2speech/tests/samples/this is the content of the document.wav rename to integrations/nodes/text2speech/tests/samples/this is the content of the document.wav diff --git a/nodes/text2speech/tests/test_nodes.py b/integrations/nodes/text2speech/tests/test_nodes.py similarity index 100% rename from nodes/text2speech/tests/test_nodes.py rename to integrations/nodes/text2speech/tests/test_nodes.py diff --git a/nodes/text2speech/text2speech/__about__.py b/integrations/nodes/text2speech/text2speech/__about__.py similarity index 100% rename from nodes/text2speech/text2speech/__about__.py rename to integrations/nodes/text2speech/text2speech/__about__.py diff --git a/nodes/text2speech/text2speech/__init__.py b/integrations/nodes/text2speech/text2speech/__init__.py similarity index 100% rename from nodes/text2speech/text2speech/__init__.py rename to integrations/nodes/text2speech/text2speech/__init__.py diff --git a/nodes/text2speech/text2speech/answer_to_speech.py b/integrations/nodes/text2speech/text2speech/answer_to_speech.py similarity index 100% rename from nodes/text2speech/text2speech/answer_to_speech.py rename to integrations/nodes/text2speech/text2speech/answer_to_speech.py diff --git a/nodes/text2speech/text2speech/document_to_speech.py b/integrations/nodes/text2speech/text2speech/document_to_speech.py similarity index 100% rename from nodes/text2speech/text2speech/document_to_speech.py rename to integrations/nodes/text2speech/text2speech/document_to_speech.py diff --git a/nodes/text2speech/text2speech/errors.py b/integrations/nodes/text2speech/text2speech/errors.py similarity index 100% rename from nodes/text2speech/text2speech/errors.py rename to integrations/nodes/text2speech/text2speech/errors.py diff --git a/nodes/text2speech/text2speech/utils/__init__.py b/integrations/nodes/text2speech/text2speech/utils/__init__.py similarity index 100% rename from nodes/text2speech/text2speech/utils/__init__.py rename to integrations/nodes/text2speech/text2speech/utils/__init__.py diff --git a/nodes/text2speech/text2speech/utils/text_to_speech.py b/integrations/nodes/text2speech/text2speech/utils/text_to_speech.py similarity index 100% rename from nodes/text2speech/text2speech/utils/text_to_speech.py rename to integrations/nodes/text2speech/text2speech/utils/text_to_speech.py diff --git a/document_stores/elasticsearch/LICENSE b/integrations/unstructured/fileconverter/LICENSE similarity index 100% rename from document_stores/elasticsearch/LICENSE rename to 
integrations/unstructured/fileconverter/LICENSE diff --git a/components/converters/unstructured_fileconverter/README.md b/integrations/unstructured/fileconverter/README.md similarity index 100% rename from components/converters/unstructured_fileconverter/README.md rename to integrations/unstructured/fileconverter/README.md diff --git a/components/converters/unstructured_fileconverter/pyproject.toml b/integrations/unstructured/fileconverter/pyproject.toml similarity index 100% rename from components/converters/unstructured_fileconverter/pyproject.toml rename to integrations/unstructured/fileconverter/pyproject.toml diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__about__.py similarity index 100% rename from components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__about__.py rename to integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__about__.py diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py similarity index 100% rename from components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/__init__.py rename to integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/__init__.py diff --git a/components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py similarity index 100% rename from components/converters/unstructured_fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py rename to integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py diff --git a/components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf b/integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf similarity index 100% rename from components/converters/unstructured_fileconverter/tests/samples/sample_pdf.pdf rename to integrations/unstructured/fileconverter/tests/samples/sample_pdf.pdf diff --git a/components/converters/unstructured_fileconverter/tests/test_fileconverter.py b/integrations/unstructured/fileconverter/tests/test_fileconverter.py similarity index 100% rename from components/converters/unstructured_fileconverter/tests/test_fileconverter.py rename to integrations/unstructured/fileconverter/tests/test_fileconverter.py diff --git a/nodes/hatch.toml b/nodes/hatch.toml deleted file mode 100644 index be4d67218..000000000 --- a/nodes/hatch.toml +++ /dev/null @@ -1,12 +0,0 @@ -[template] -name = "deepset GmbH" -email = "info@deepset.ai" - -[template.licenses] -headers = true -default = [ - "Apache-2.0", -] - -[template.plugins.default] -src-layout = false From be898250e39ddb5954f3cfdef8e2b6093a4064c2 Mon Sep 17 00:00:00 2001 From: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Date: Wed, 29 Nov 2023 17:16:39 +0100 Subject: [PATCH 12/36] Update `ElasticSearchDocumentStore` to use latest `haystack-ai` version (#63) * Update haystack version * Update imports * Support new filters * Update ElasticSearchDocumentStore * Update tests * Fix corner cases when filter value is None * Convert legacy filters if used * Fix linting --- integrations/elasticsearch/pyproject.toml | 
4 +- .../elasticsearch_haystack/bm25_retriever.py | 4 +- .../elasticsearch_haystack/document_store.py | 105 +----- .../embedding_retriever.py | 4 +- .../src/elasticsearch_haystack/filters.py | 307 ++++++++++++------ .../tests/test_bm25_retriever.py | 10 +- .../tests/test_document_store.py | 229 ++++--------- .../tests/test_embedding_retriever.py | 10 +- .../elasticsearch/tests/test_filters.py | 258 +++++++-------- 9 files changed, 428 insertions(+), 503 deletions(-) diff --git a/integrations/elasticsearch/pyproject.toml b/integrations/elasticsearch/pyproject.toml index c54be02f2..f67e9cc35 100644 --- a/integrations/elasticsearch/pyproject.toml +++ b/integrations/elasticsearch/pyproject.toml @@ -24,10 +24,8 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - # we distribute the preview version of Haystack 2.0 under the package "haystack-ai" - "haystack-ai==0.143.0", + "haystack-ai", "elasticsearch>=8,<9", - "typing_extensions", # This is not a direct dependency, but `haystack-ai` is missing it cause `canals` is missing it ] [project.urls] diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py index 017860a9a..804e8db15 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional -from haystack.preview import component, default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py index d131cdf01..6dae14341 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -9,11 +9,10 @@ # There are no import stubs for elastic_transport and elasticsearch so mypy fails from elastic_transport import NodeConfig # type: ignore[import-not-found] from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found] -from haystack.preview import default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document -from haystack.preview.document_stores.decorator import document_store -from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.preview.document_stores.protocols import DuplicatePolicy +from haystack import default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy, document_store +from haystack.utils.filters import convert from elasticsearch_haystack.filters import _normalize_filters @@ -130,103 +129,29 @@ def _search_documents(self, **kwargs) -> List[Document]: return documents def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: - """ - Returns the documents that match the filters provided. - - Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical operator (`"$and"`, - `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `$ne`, `"$in"`, `$nin`, `"$gt"`, `"$gte"`, `"$lt"`, - `"$lte"`) or a metadata field name. - - Logical operator keys take a dictionary of metadata field names and/or logical operators as value. Metadata - field names take a dictionary of comparison operators as value. Comparison operator keys take a single value or - (in case of `"$in"`) a list of values as value. If no logical operator is provided, `"$and"` is used as default - operation. If no comparison operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used - as default operation. - - Example: - - ```python - filters = { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": {"$in": ["economy", "politics"]}, - "publisher": {"$eq": "nytimes"} - } - } - } - # or simpler using default operators - filters = { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": { - "genre": ["economy", "politics"], - "publisher": "nytimes" - } - } - ``` - - To use the same logical operator multiple times on the same level, logical operators can take a list of - dictionaries as value. - - Example: - - ```python - filters = { - "$or": [ - { - "$and": { - "Type": "News Paper", - "Date": { - "$lt": "2019-01-01" - } - } - }, - { - "$and": { - "Type": "Blog Post", - "Date": { - "$gte": "2019-01-01" - } - } - } - ] - } - ``` + if filters and "operator" not in filters and "conditions" not in filters: + filters = convert(filters) - :param filters: the filters to apply to the document list. - :return: a list of Documents that match the given filters. - """ query = {"bool": {"filter": _normalize_filters(filters)}} if filters else None documents = self._search_documents(query=query) return documents - def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.FAIL) -> None: + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: """ - Writes (or overwrites) documents into the store. - - :param documents: a list of documents. - :param policy: documents with the same ID count as duplicates. When duplicates are met, - the store can: - - skip: keep the existing document and ignore the new one. - - overwrite: remove the old document and write the new one. - - fail: an error is raised - - :raises ValueError: if 'documents' parameter is not a list of Document objects - :raises DuplicateDocumentError: Exception trigger on duplicate document if `policy=DuplicatePolicy.FAIL` - :raises DocumentStoreError: Exception trigger on any other error when writing documents - :return: None + Writes Documents to Elasticsearch. + If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the + same ID already exists in the document store. 
""" if len(documents) > 0: if not isinstance(documents[0], Document): msg = "param 'documents' must contain a list of objects of type Document" raise ValueError(msg) + if policy == DuplicatePolicy.NONE: + policy = DuplicatePolicy.FAIL + action = "index" if policy == DuplicatePolicy.OVERWRITE else "create" - _, errors = helpers.bulk( + documents_written, errors = helpers.bulk( client=self._client, actions=( { @@ -262,6 +187,8 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D msg = f"Failed to write documents to Elasticsearch. Errors:\n{other_errors}" raise DocumentStoreError(msg) + return documents_written + def _deserialize_document(self, hit: Dict[str, Any]) -> Document: """ Creates a Document from the search hit provided. diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py index 3bb4576ec..2aaba382d 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional -from haystack.preview import component, default_from_dict, default_to_dict -from haystack.preview.dataclasses import Document +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py index 78adae585..bb5b15311 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py @@ -1,123 +1,218 @@ -from typing import Any, Dict, List, Union +from datetime import datetime +from typing import Any, Dict, List -from haystack.preview.errors import FilterError +from haystack.errors import FilterError from pandas import DataFrame -def _normalize_filters(filters: Union[List[Dict], Dict], logical_condition="") -> Dict[str, Any]: +def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]: """ Converts Haystack filters in ElasticSearch compatible filters. 
""" - if not isinstance(filters, dict) and not isinstance(filters, list): - msg = "Filters must be either a dictionary or a list" + if not isinstance(filters, dict): + msg = "Filters must be a dictionary" raise FilterError(msg) - conditions = [] - if isinstance(filters, dict): - filters = [filters] - for filter_ in filters: - for operator, value in filter_.items(): - if operator in ["$not", "$and", "$or"]: - # Logical operators - conditions.append(_normalize_filters(value, operator)) - else: - # Comparison operators - conditions.extend(_parse_comparison(operator, value)) + if "field" in filters: + return {"bool": {"must": _parse_comparison_condition(filters)}} + return _parse_logical_condition(filters) + + +def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + + operator = condition["operator"] + conditions = [_parse_comparison_condition(c) for c in condition["conditions"]] if len(conditions) > 1: conditions = _normalize_ranges(conditions) + if operator == "AND": + return {"bool": {"must": conditions}} + elif operator == "OR": + return {"bool": {"should": conditions}} + elif operator == "NOT": + return {"bool": {"must_not": [{"bool": {"must": conditions}}]}} else: - # mypy is complaining we're assigning a dict to a list of dicts. - # We're ok with this as we're returning right after this. - conditions = conditions[0] # type: ignore[assignment] + msg = f"Unknown logical operator '{operator}'" + raise FilterError(msg) - if logical_condition == "$not": - return {"bool": {"must_not": conditions}} - elif logical_condition == "$or": - return {"bool": {"should": conditions}} - # If no logical condition is specified we default to "$and" - return {"bool": {"must": conditions}} - - -def _parse_comparison(field: str, comparison: Union[Dict, List, str, float]) -> List: - result: List[Dict[str, Any]] = [] - if isinstance(comparison, dict): - for comparator, val in comparison.items(): - if isinstance(val, DataFrame): - # Ruff is complaining we're overriding the loop variable `var` - # but we actually want to override it. So we ignore the error. 
- val = val.to_json() # noqa: PLW2901 - if comparator == "$eq": - if isinstance(val, list): - result.append( - { - "terms_set": { - field: { - "terms": val, - "minimum_should_match_script": { - "source": f"Math.max(params.num_terms, doc['{field}'].size())" - }, - } - } - } - ) - result.append({"term": {field: val}}) - elif comparator == "$ne": - if isinstance(val, list): - result.append({"bool": {"must_not": {"terms": {field: val}}}}) - else: - result.append( - {"bool": {"must_not": {"match": {field: {"query": val, "minimum_should_match": "100%"}}}}} - ) - elif comparator == "$in": - if not isinstance(val, list): - msg = f"{field}'s value must be a list when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"terms": {field: val}}) - elif comparator == "$nin": - if not isinstance(val, list): - msg = f"{field}'s value must be a list when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"bool": {"must_not": {"terms": {field: val}}}}) - elif comparator in ["$gt", "$gte", "$lt", "$lte"]: - if not isinstance(val, str) and not isinstance(val, int) and not isinstance(val, float): - msg = f"{field}'s value must be 'str', 'int', 'float' types when using '{comparator}' comparator" - raise FilterError(msg) - result.append({"range": {field: {comparator[1:]: val}}}) - elif comparator in ["$not", "$or"]: - if isinstance(val, list): - # This handles corner cases like this: - # `{"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}}` - # If we don't handle it like this we'd lose the "name" field and the - # generated query would be wrong and return unexpected results. - comparisons = [_parse_comparison(field, v)[0] for v in val] - if comparator == "$not": - result.append({"bool": {"must_not": comparisons}}) - elif comparator == "$or": - result.append({"bool": {"should": comparisons}}) - else: - result.append(_normalize_filters(val, comparator)) - elif comparator == "$and" and isinstance(val, list): - # We're assuming there are no duplicate items in the list - flat_filters = {k: v for d in val for k, v in d.items()} - result.extend(_parse_comparison(field, flat_filters)) - elif comparator == "$and": - result.append(_normalize_filters({field: val}, comparator)) - else: - msg = f"Unknown comparator '{comparator}'" - raise FilterError(msg) - elif isinstance(comparison, list): - result.append({"terms": {field: comparison}}) - elif isinstance(comparison, DataFrame): - result.append({"match": {field: {"query": comparison.to_json(), "minimum_should_match": "100%"}}}) - elif isinstance(comparison, str): - # We can't use "term" for text fields as ElasticSearch changes the value of text. - # More info here: - # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-term-query.html#query-dsl-term-query - result.append({"match": {field: {"query": comparison, "minimum_should_match": "100%"}}}) - else: - result.append({"term": {field: comparison}}) - return result +def _equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"bool": {"must_not": {"exists": {"field": field}}}} + + if isinstance(value, list): + return { + "terms_set": { + field: { + "terms": value, + "minimum_should_match_script": {"source": f"Math.max(params.num_terms, doc['{field}'].size())"}, + } + } + } + if field in ["text", "dataframe"]: + # We want to fully match the text field. 
+ return {"match": {field: {"query": value, "minimum_should_match": "100%"}}} + return {"term": {field: value}} + + +def _not_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"exists": {"field": field}} + + if isinstance(value, list): + return {"bool": {"must_not": {"terms": {field: value}}}} + if field in ["text", "dataframe"]: + # We want to fully match the text field. + return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}} + + return {"bool": {"must_not": {"term": {field: value}}}} + + +def _greater_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gt": value}}} + + +def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gte": value}}} + + +def _less_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." 
+ ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lt": value}}} + + +def _less_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lte": value}}} + + +def _in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"terms": {field: value}} + + +def _not_in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"bool": {"must_not": {"terms": {field: value}}}} + + +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, +} + + +def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "field" not in condition: + # 'field' key is only found in comparison dictionaries. + # We assume this is a logic dictionary since it's not present. + return _parse_logical_condition(condition) + field: str = condition["field"] + + if field.startswith("meta."): + # Remove the "meta." prefix if present. + # Documents are flattened when using the ElasticSearchDocumentStore + # so we don't need to specify the "meta." prefix. + # Instead of raising an error we handle it gracefully. 
+ field = field[5:] + + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + operator: str = condition["operator"] + value: Any = condition["value"] + if isinstance(value, DataFrame): + value = value.to_json() + + return COMPARISON_OPERATORS[operator](field, value) def _normalize_ranges(conditions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/integrations/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py index 9139368d9..8f19c8897 100644 --- a/integrations/elasticsearch/tests/test_bm25_retriever.py +++ b/integrations/elasticsearch/tests/test_bm25_retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch -from haystack.preview.dataclasses import Document +from haystack.dataclasses import Document from elasticsearch_haystack.bm25_retriever import ElasticsearchBM25Retriever from elasticsearch_haystack.document_store import ElasticsearchDocumentStore @@ -24,7 +24,7 @@ def test_to_dict(_mock_elasticsearch_client): retriever = ElasticsearchBM25Retriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "ElasticsearchBM25Retriever", + "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +32,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", @@ -45,11 +45,11 @@ def test_to_dict(_mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): data = { - "type": "ElasticsearchBM25Retriever", + "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index e71603126..d6428e762 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -6,13 +6,12 @@ from typing import List from unittest.mock import patch -import pandas as pd import pytest from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] -from haystack.preview.dataclasses.document import Document -from haystack.preview.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.preview.document_stores.protocols import DuplicatePolicy -from haystack.preview.testing.document_store import DocumentStoreBaseTests +from haystack.dataclasses.document import Document +from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.protocols import DuplicatePolicy +from haystack.testing.document_store import DocumentStoreBaseTests from elasticsearch_haystack.document_store import ElasticsearchDocumentStore @@ -24,7 +23,7 @@ class TestDocumentStore(DocumentStoreBaseTests): """ 
@pytest.fixture - def docstore(self, request): + def document_store(self, request): """ This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. @@ -43,12 +42,37 @@ def docstore(self, request): yield store store._client.options(ignore_status=[400, 404]).indices.delete(index=index) + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + The ElasticSearchDocumentStore.filter_documents() method returns a Documents with their score set. + We don't want to compare the score, so we set it to None before comparing the documents. + """ + received_meta = [] + for doc in received: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + received_meta.append(r) + + expected_meta = [] + for doc in expected: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + expected_meta.append(r) + for doc in received: + doc.score = None + + super().assert_documents_are_equal(received, expected) + @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_to_dict(self, _mock_elasticsearch_client): document_store = ElasticsearchDocumentStore(hosts="some hosts") res = document_store.to_dict() assert res == { - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", @@ -59,7 +83,7 @@ def test_to_dict(self, _mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(self, _mock_elasticsearch_client): data = { - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", @@ -71,8 +95,14 @@ def test_from_dict(self, _mock_elasticsearch_client): assert document_store._index == "default" assert document_store._embedding_similarity_function == "cosine" - def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): - docstore.write_documents( + def test_write_documents(self, document_store: ElasticsearchDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, DuplicatePolicy.FAIL) + + def test_bm25_retrieval(self, document_store: ElasticsearchDocumentStore): + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -88,17 +118,17 @@ def test_bm25_retrieval(self, docstore: ElasticsearchDocumentStore): ] ) - res = docstore._bm25_retrieval("functional", top_k=3) + res = document_store._bm25_retrieval("functional", top_k=3) assert len(res) == 3 assert "functional" in res[0].content assert "functional" in res[1].content assert "functional" in res[2].content - def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + def test_bm25_retrieval_pagination(self, document_store: ElasticsearchDocumentStore): """ Test that handling of pagination works as expected, when the matching documents are > 10. 
""" - docstore.write_documents( + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -118,12 +148,12 @@ def test_bm25_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): ] ) - res = docstore._bm25_retrieval("programming", top_k=11) + res = document_store._bm25_retrieval("programming", top_k=11) assert len(res) == 11 assert all("programming" in doc.content for doc in res) - def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStore): - docstore.write_documents( + def test_bm25_retrieval_with_fuzziness(self, document_store: ElasticsearchDocumentStore): + document_store.write_documents( [ Document(content="Haskell is a functional programming language"), Document(content="Lisp is a functional programming language"), @@ -141,161 +171,30 @@ def test_bm25_retrieval_with_fuzziness(self, docstore: ElasticsearchDocumentStor query_with_typo = "functinal" # Query without fuzziness to search for the exact match - res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") # Nothing is found as the query contains a typo assert res == [] # Query with fuzziness with the same query - res = docstore._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") assert len(res) == 3 assert "functional" in res[0].content assert "functional" in res[1].content assert "functional" in res[2].content - def test_write_duplicate_fail(self, docstore: ElasticsearchDocumentStore): - """ - Verify `DuplicateDocumentError` is raised when trying to write duplicate files. - - `DocumentStoreBaseTests` declares this test but we override it since we return - a different error message that it expects. - """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - with pytest.raises(DuplicateDocumentError): - docstore.write_documents(documents=[doc], policy=DuplicatePolicy.FAIL) - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] - - def test_delete_not_empty(self, docstore: ElasticsearchDocumentStore): - """ - Verifies delete properly deletes specified document. - - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. - """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - - docstore.delete_documents([doc.id]) - - res = docstore.filter_documents(filters={"id": doc.id}) - assert res == [] - - def test_delete_empty(self, docstore: ElasticsearchDocumentStore): - """ - Verifies delete doesn't raises when trying to delete a non-existing document. - - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. - """ - docstore.delete_documents(["test"]) - - def test_delete_not_empty_nonexisting(self, docstore: ElasticsearchDocumentStore): - """ - `DocumentStoreBaseTests` declares this test but we override it since we - want `delete_documents` to be idempotent. 
- """ - doc = Document(content="test doc") - docstore.write_documents([doc]) - - docstore.delete_documents(["non_existing"]) - - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] - - @pytest.mark.skip(reason="Not supported") - def test_in_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_in_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_nin_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_nin_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - pass - - @pytest.mark.skip(reason="Not supported") - def test_eq_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - """ - If the embedding field is a dense vector (as expected), raise the following error: - - elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', - "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") - """ - pass - - @pytest.mark.skip(reason="Not supported") - def test_ne_filter_embedding(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - """ - If the embedding field is a dense vector (as expected), raise the following error: - - elasticsearch.BadRequestError: BadRequestError(400, 'search_phase_execution_exception', - "failed to create query: Field [embedding] of type [dense_vector] doesn't support term queries") - """ - pass - - def test_gt_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$gt": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] > "100"] - ) - - def test_gt_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$gt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert result == [] - - def test_gte_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$gte": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] >= "100"] - ) - - def test_gte_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$gte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert result == [] - - def test_lt_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$lt": "100"}}) - assert result == [] - - def test_lt_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$lt": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert 
self.contains_same_docs(result, [d for d in filterable_docs if d.dataframe is not None]) - - def test_lte_filter_non_numeric(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$lte": "100"}}) - assert self.contains_same_docs( - result, [d for d in filterable_docs if "page" in d.meta and d.meta["page"] <= "100"] - ) - - def test_lte_filter_table(self, docstore: ElasticsearchDocumentStore, filterable_docs: List[Document]): - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"dataframe": {"$lte": pd.DataFrame([[1, 2, 3], [-1, -2, -3]])}}) - assert self.contains_same_docs(result, [d for d in filterable_docs if d.dataframe is not None]) - - def test_embedding_retrieval(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), Document(content="Not very similar document", embedding=[0.0, 0.8, 0.3, 0.9]), ] - docstore.write_documents(docs) - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={}) + document_store.write_documents(docs) + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={}) assert len(results) == 2 assert results[0].content == "Most similar document" assert results[1].content == "2nd best document" - def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_with_filters(self, document_store: ElasticsearchDocumentStore): docs = [ Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), @@ -305,14 +204,14 @@ def test_embedding_retrieval_w_filters(self, docstore: ElasticsearchDocumentStor meta={"meta_field": "custom_value"}, ), ] - docstore.write_documents(docs) + document_store.write_documents(docs) - filters = {"meta_field": {"$eq": "custom_value"}} - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters=filters) + filters = {"field": "meta_field", "operator": "==", "value": "custom_value"} + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters=filters) assert len(results) == 1 assert results[0].content == "Not very similar document with meta field" - def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_pagination(self, document_store: ElasticsearchDocumentStore): """ Test that handling of pagination works as expected, when the matching documents are > 10. 
""" @@ -322,21 +221,23 @@ def test_embedding_retrieval_pagination(self, docstore: ElasticsearchDocumentSto for i in range(20) ] - docstore.write_documents(docs) - results = docstore._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={}) + document_store.write_documents(docs) + results = document_store._embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={}) assert len(results) == 11 - def test_embedding_retrieval_query_documents_different_embedding_sizes(self, docstore: ElasticsearchDocumentStore): + def test_embedding_retrieval_query_documents_different_embedding_sizes( + self, document_store: ElasticsearchDocumentStore + ): """ Test that the retrieval fails if the query embedding and the documents have different embedding sizes. """ docs = [Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4])] - docstore.write_documents(docs) + document_store.write_documents(docs) with pytest.raises(BadRequestError): - docstore._embedding_retrieval(query_embedding=[0.1, 0.1]) + document_store._embedding_retrieval(query_embedding=[0.1, 0.1]) - def test_write_documents_different_embedding_sizes_fail(self, docstore: ElasticsearchDocumentStore): + def test_write_documents_different_embedding_sizes_fail(self, document_store: ElasticsearchDocumentStore): """ Test that write_documents fails if the documents have different embedding sizes. """ @@ -346,4 +247,4 @@ def test_write_documents_different_embedding_sizes_fail(self, docstore: Elastics ] with pytest.raises(DocumentStoreError): - docstore.write_documents(docs) + document_store.write_documents(docs) diff --git a/integrations/elasticsearch/tests/test_embedding_retriever.py b/integrations/elasticsearch/tests/test_embedding_retriever.py index b16e28830..fd60b0940 100644 --- a/integrations/elasticsearch/tests/test_embedding_retriever.py +++ b/integrations/elasticsearch/tests/test_embedding_retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch -from haystack.preview.dataclasses import Document +from haystack.dataclasses import Document from elasticsearch_haystack.document_store import ElasticsearchDocumentStore from elasticsearch_haystack.embedding_retriever import ElasticsearchEmbeddingRetriever @@ -24,7 +24,7 @@ def test_to_dict(_mock_elasticsearch_client): retriever = ElasticsearchEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "ElasticsearchEmbeddingRetriever", + "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +32,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, @@ -44,11 +44,11 @@ def test_to_dict(_mock_elasticsearch_client): @patch("elasticsearch_haystack.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): data = { - "type": "ElasticsearchEmbeddingRetriever", + "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "ElasticsearchDocumentStore", + "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, diff --git 
a/integrations/elasticsearch/tests/test_filters.py b/integrations/elasticsearch/tests/test_filters.py index efaa168b0..6db6a0dd2 100644 --- a/integrations/elasticsearch/tests/test_filters.py +++ b/integrations/elasticsearch/tests/test_filters.py @@ -1,108 +1,99 @@ import pytest -from haystack.preview.errors import FilterError +from haystack.errors import FilterError from elasticsearch_haystack.filters import _normalize_filters, _normalize_ranges filters_data = [ ( { - "$and": { - "type": {"$eq": "article"}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - } + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], }, { "bool": { - "must": { - "bool": { - "must": [ - {"term": {"type": "article"}}, - { - "bool": { - "should": [ - {"terms": {"genre": ["economy", "politics"]}}, - {"term": {"publisher": "nytimes"}}, - ] - } - }, - {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, - {"range": {"rating": {"gte": 3}}}, - ] - } - } - } - }, - ), - ( - { - "$or": [ - {"Type": "News Paper", "Date": {"$lt": "2019-01-01"}}, - {"Type": "Blog Post", "Date": {"$gte": "2019-01-01"}}, - ] - }, - { - "bool": { - "must": { - "bool": { - "should": [ - {"match": {"Type": {"query": "News Paper", "minimum_should_match": "100%"}}}, - {"match": {"Type": {"query": "Blog Post", "minimum_should_match": "100%"}}}, - {"range": {"Date": {"lt": "2019-01-01", "gte": "2019-01-01"}}}, - ] - } - } + "must": [ + {"term": {"type": "article"}}, + { + "bool": { + "should": [ + {"terms": {"genre": ["economy", "politics"]}}, + {"term": {"publisher": "nytimes"}}, + ] + } + }, + {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, + {"range": {"rating": {"gte": 3}}}, + ] } }, ), ( { - "$and": { - "type": {"$eq": "article"}, - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": {"genre": {"$in": ["economy", "politics"]}, "publisher": {"$eq": "nytimes"}}, - } + "operator": "OR", + "conditions": [ + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "News Paper"}, + {"field": "meta.Date", "operator": "<", "value": "2020-01-01"}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "Blog Post"}, + {"field": "meta.Date", "operator": ">=", "value": "2019-01-01"}, + ], + }, + ], }, { "bool": { - "must": { - "bool": { - "must": [ - {"term": {"type": "article"}}, - { - "bool": { - "should": [ - {"terms": {"genre": ["economy", "politics"]}}, - {"term": {"publisher": "nytimes"}}, - ] - } - }, - {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, - {"range": {"rating": {"gte": 3}}}, - ] - } - } + "should": [ + {"bool": {"must": [{"term": {"Type": "News Paper"}}, {"range": {"Date": {"lt": "2020-01-01"}}}]}}, + {"bool": {"must": [{"term": {"Type": "Blog Post"}}, {"range": {"Date": {"gte": "2019-01-01"}}}]}}, + ] } }, ), ( { - "type": "article", - "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, - "rating": {"$gte": 3}, - "$or": 
{"genre": ["economy", "politics"], "publisher": "nytimes"}, + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + ], }, { "bool": { "must": [ - {"match": {"type": {"query": "article", "minimum_should_match": "100%"}}}, + {"term": {"type": "article"}}, { "bool": { "should": [ {"terms": {"genre": ["economy", "politics"]}}, - {"match": {"publisher": {"query": "nytimes", "minimum_should_match": "100%"}}}, + {"term": {"publisher": "nytimes"}}, ] } }, @@ -113,75 +104,72 @@ }, ), ( - {"text": "A Foo Document 1"}, - {"bool": {"must": {"match": {"text": {"query": "A Foo Document 1", "minimum_should_match": "100%"}}}}}, + {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]}, + {"bool": {"must": [{"match": {"text": {"query": "A Foo Document 1", "minimum_should_match": "100%"}}}]}}, ), ( - {"$or": {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}, "number": {"$lt": 1.0}}}, + { + "operator": "OR", + "conditions": [ + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"field": "meta.number", "operator": "<", "value": 1.0}, + ], + }, { "bool": { - "must": { - "bool": { - "should": [ - { - "bool": { - "should": [ - {"term": {"name": "name_0"}}, - {"term": {"name": "name_1"}}, - ] - } - }, - {"range": {"number": {"lt": 1.0}}}, - ] - } - } + "should": [ + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, + {"range": {"number": {"lt": 1.0}}}, + ] } }, ), ( - {"$and": {"number": {"$and": {"$lte": 2, "$gte": 0}}, "name": {"$in": ["name_0", "name_1"]}}}, { - "bool": { - "must": { - "bool": { - "must": [ - {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, - {"terms": {"name": ["name_0", "name_1"]}}, - ] - } - } - } + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + {"field": "meta.name", "operator": "in", "value": ["name_0", "name_1"]}, + ], }, + {"bool": {"must": [{"terms": {"name": ["name_0", "name_1"]}}, {"range": {"number": {"lte": 2, "gte": 0}}}]}}, ), ( - {"number": {"$lte": 2, "$gte": 0}, "name": ["name_0", "name_1"]}, { - "bool": { - "must": [ - {"terms": {"name": ["name_0", "name_1"]}}, - {"range": {"number": {"lte": 2, "gte": 0}}}, - ] - } + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + ], }, + {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, ), ( - {"number": {"$and": [{"$lte": 2}, {"$gte": 0}]}}, - {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, ), ( - {"name": {"$or": [{"$eq": "name_0"}, {"$eq": "name_1"}]}}, { - "bool": { - 
"must": { - "bool": { - "should": [ - {"term": {"name": "name_0"}}, - {"term": {"name": "name_1"}}, - ] - } - } - } + "operator": "NOT", + "conditions": [ + {"field": "meta.number", "operator": "==", "value": 100}, + {"field": "meta.name", "operator": "==", "value": "name_0"}, + ], }, + {"bool": {"must_not": [{"bool": {"must": [{"term": {"number": 100}}, {"term": {"name": "name_0"}}]}}]}}, ), ] @@ -192,15 +180,31 @@ def test_normalize_filters(filters, expected): assert result == expected -def test_normalize_filters_raises_with_malformed_filters(): +def test_normalize_filters_invalid_operator(): + with pytest.raises(FilterError): + _normalize_filters({"operator": "INVALID", "conditions": []}) + + +def test_normalize_filters_malformed(): + # Missing operator + with pytest.raises(FilterError): + _normalize_filters({"conditions": []}) + + # Missing conditions + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND"}) + + # Missing comparison field with pytest.raises(FilterError): - _normalize_filters("not a filter") + _normalize_filters({"operator": "AND", "conditions": [{"operator": "==", "value": "article"}]}) + # Missing comparison operator with pytest.raises(FilterError): - _normalize_filters({"number": {"page": "100"}}) + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "operator": "=="}]}) + # Missing comparison value with pytest.raises(FilterError): - _normalize_filters({"number": {"page": {"chapter": "intro"}}}) + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "value": "article"}]}) def test_normalize_ranges(): From 38ed84a67cd2bc6ebf767dcf3cb412d23df94ed8 Mon Sep 17 00:00:00 2001 From: Silvano Cerza Date: Wed, 29 Nov 2023 17:24:30 +0100 Subject: [PATCH 13/36] Bump elasticsearch_haystack to 0.0.2 --- .../elasticsearch/src/elasticsearch_haystack/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py index f3717f266..2faac960f 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present Silvano Cerza # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.1" +__version__ = "0.0.2" From 66969b3ad555cd5bf861f3545373ac7fdb9a6676 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 18:48:03 +0100 Subject: [PATCH 14/36] move nodes/ at the root --- {integrations/nodes => nodes}/README.md | 0 {integrations/nodes => nodes}/hatch.toml | 0 .../nodes => nodes}/text2speech/LICENSE.txt | 0 {integrations/nodes => nodes}/text2speech/README.md | 0 .../nodes => nodes}/text2speech/pyproject.toml | 0 .../nodes => nodes}/text2speech/tests/__init__.py | 0 .../text2speech/tests/samples/answer.wav | Bin .../samples/the context for this answer is here.wav | Bin .../samples/this is the content of the document.wav | Bin .../nodes => nodes}/text2speech/tests/test_nodes.py | 0 .../text2speech/text2speech/__about__.py | 0 .../text2speech/text2speech/__init__.py | 0 .../text2speech/text2speech/answer_to_speech.py | 0 .../text2speech/text2speech/document_to_speech.py | 0 .../text2speech/text2speech/errors.py | 0 .../text2speech/text2speech/utils/__init__.py | 0 .../text2speech/text2speech/utils/text_to_speech.py | 0 17 files changed, 0 insertions(+), 0 deletions(-) rename {integrations/nodes => nodes}/README.md (100%) 
rename {integrations/nodes => nodes}/hatch.toml (100%) rename {integrations/nodes => nodes}/text2speech/LICENSE.txt (100%) rename {integrations/nodes => nodes}/text2speech/README.md (100%) rename {integrations/nodes => nodes}/text2speech/pyproject.toml (100%) rename {integrations/nodes => nodes}/text2speech/tests/__init__.py (100%) rename {integrations/nodes => nodes}/text2speech/tests/samples/answer.wav (100%) rename {integrations/nodes => nodes}/text2speech/tests/samples/the context for this answer is here.wav (100%) rename {integrations/nodes => nodes}/text2speech/tests/samples/this is the content of the document.wav (100%) rename {integrations/nodes => nodes}/text2speech/tests/test_nodes.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/__about__.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/__init__.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/answer_to_speech.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/document_to_speech.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/errors.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/utils/__init__.py (100%) rename {integrations/nodes => nodes}/text2speech/text2speech/utils/text_to_speech.py (100%) diff --git a/integrations/nodes/README.md b/nodes/README.md similarity index 100% rename from integrations/nodes/README.md rename to nodes/README.md diff --git a/integrations/nodes/hatch.toml b/nodes/hatch.toml similarity index 100% rename from integrations/nodes/hatch.toml rename to nodes/hatch.toml diff --git a/integrations/nodes/text2speech/LICENSE.txt b/nodes/text2speech/LICENSE.txt similarity index 100% rename from integrations/nodes/text2speech/LICENSE.txt rename to nodes/text2speech/LICENSE.txt diff --git a/integrations/nodes/text2speech/README.md b/nodes/text2speech/README.md similarity index 100% rename from integrations/nodes/text2speech/README.md rename to nodes/text2speech/README.md diff --git a/integrations/nodes/text2speech/pyproject.toml b/nodes/text2speech/pyproject.toml similarity index 100% rename from integrations/nodes/text2speech/pyproject.toml rename to nodes/text2speech/pyproject.toml diff --git a/integrations/nodes/text2speech/tests/__init__.py b/nodes/text2speech/tests/__init__.py similarity index 100% rename from integrations/nodes/text2speech/tests/__init__.py rename to nodes/text2speech/tests/__init__.py diff --git a/integrations/nodes/text2speech/tests/samples/answer.wav b/nodes/text2speech/tests/samples/answer.wav similarity index 100% rename from integrations/nodes/text2speech/tests/samples/answer.wav rename to nodes/text2speech/tests/samples/answer.wav diff --git a/integrations/nodes/text2speech/tests/samples/the context for this answer is here.wav b/nodes/text2speech/tests/samples/the context for this answer is here.wav similarity index 100% rename from integrations/nodes/text2speech/tests/samples/the context for this answer is here.wav rename to nodes/text2speech/tests/samples/the context for this answer is here.wav diff --git a/integrations/nodes/text2speech/tests/samples/this is the content of the document.wav b/nodes/text2speech/tests/samples/this is the content of the document.wav similarity index 100% rename from integrations/nodes/text2speech/tests/samples/this is the content of the document.wav rename to nodes/text2speech/tests/samples/this is the content of the document.wav diff --git a/integrations/nodes/text2speech/tests/test_nodes.py 
b/nodes/text2speech/tests/test_nodes.py similarity index 100% rename from integrations/nodes/text2speech/tests/test_nodes.py rename to nodes/text2speech/tests/test_nodes.py diff --git a/integrations/nodes/text2speech/text2speech/__about__.py b/nodes/text2speech/text2speech/__about__.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/__about__.py rename to nodes/text2speech/text2speech/__about__.py diff --git a/integrations/nodes/text2speech/text2speech/__init__.py b/nodes/text2speech/text2speech/__init__.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/__init__.py rename to nodes/text2speech/text2speech/__init__.py diff --git a/integrations/nodes/text2speech/text2speech/answer_to_speech.py b/nodes/text2speech/text2speech/answer_to_speech.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/answer_to_speech.py rename to nodes/text2speech/text2speech/answer_to_speech.py diff --git a/integrations/nodes/text2speech/text2speech/document_to_speech.py b/nodes/text2speech/text2speech/document_to_speech.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/document_to_speech.py rename to nodes/text2speech/text2speech/document_to_speech.py diff --git a/integrations/nodes/text2speech/text2speech/errors.py b/nodes/text2speech/text2speech/errors.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/errors.py rename to nodes/text2speech/text2speech/errors.py diff --git a/integrations/nodes/text2speech/text2speech/utils/__init__.py b/nodes/text2speech/text2speech/utils/__init__.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/utils/__init__.py rename to nodes/text2speech/text2speech/utils/__init__.py diff --git a/integrations/nodes/text2speech/text2speech/utils/text_to_speech.py b/nodes/text2speech/text2speech/utils/text_to_speech.py similarity index 100% rename from integrations/nodes/text2speech/text2speech/utils/text_to_speech.py rename to nodes/text2speech/text2speech/utils/text_to_speech.py From 622c546453d26615e90fee2d25a33e4f80b9c38e Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:10:53 +0100 Subject: [PATCH 15/36] update import paths (#64) * rm preview * fix unstructured dir --- .github/workflows/instructor_embedders.yml | 2 +- integrations/chroma/example/example.py | 6 +++--- integrations/chroma/pyproject.toml | 2 +- integrations/chroma/src/chroma_haystack/document_store.py | 6 +++--- integrations/chroma/src/chroma_haystack/errors.py | 4 ++-- integrations/chroma/src/chroma_haystack/retriever.py | 2 +- integrations/chroma/tests/test_document_store.py | 4 ++-- .../embedding_backend/instructor_backend.py | 2 +- .../instructor_embedders/instructor_document_embedder.py | 4 ++-- .../instructor_embedders/instructor_text_embedder.py | 2 +- .../tests/test_instructor_document_embedder.py | 2 +- integrations/unstructured/fileconverter/README.md | 6 +++--- .../unstructured_fileconverter_haystack/fileconverter.py | 2 +- 13 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.github/workflows/instructor_embedders.yml b/.github/workflows/instructor_embedders.yml index 293c6c142..626933fa0 100644 --- a/.github/workflows/instructor_embedders.yml +++ b/.github/workflows/instructor_embedders.yml @@ -10,7 +10,7 @@ on: defaults: run: - working-directory: instructor-embedders + working-directory: integrations/instructor-embedders jobs: test: diff --git 
a/integrations/chroma/example/example.py b/integrations/chroma/example/example.py index cedfa8cdb..a6053db1c 100644 --- a/integrations/chroma/example/example.py +++ b/integrations/chroma/example/example.py @@ -2,9 +2,9 @@ import os from pathlib import Path -from haystack.preview import Pipeline -from haystack.preview.components.file_converters import TextFileToDocument -from haystack.preview.components.writers import DocumentWriter +from haystack import Pipeline +from haystack.components.file_converters import TextFileToDocument +from haystack.components.writers import DocumentWriter from chroma_haystack import ChromaDocumentStore from chroma_haystack.retriever import ChromaQueryRetriever diff --git a/integrations/chroma/pyproject.toml b/integrations/chroma/pyproject.toml index c1ab121a1..d19461895 100644 --- a/integrations/chroma/pyproject.toml +++ b/integrations/chroma/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "haystack-ai<0.144.0", + "haystack-ai", "chromadb", ] diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index d67a2a36a..16e6f5e9c 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -8,9 +8,9 @@ import chromadb import numpy as np from chromadb.api.types import GetResult, QueryResult, validate_where, validate_where_document -from haystack.preview.dataclasses import Document -from haystack.preview.document_stores.decorator import document_store -from haystack.preview.document_stores.protocols import DuplicatePolicy +from haystack.dataclasses import Document +from haystack.document_stores.decorator import document_store +from haystack.document_stores.protocols import DuplicatePolicy from chroma_haystack.errors import ChromaDocumentStoreFilterError from chroma_haystack.utils import get_embedding_function diff --git a/integrations/chroma/src/chroma_haystack/errors.py b/integrations/chroma/src/chroma_haystack/errors.py index 0afcb7ef2..474938be4 100644 --- a/integrations/chroma/src/chroma_haystack/errors.py +++ b/integrations/chroma/src/chroma_haystack/errors.py @@ -1,8 +1,8 @@ # SPDX-FileCopyrightText: 2023-present John Doe # # SPDX-License-Identifier: Apache-2.0 -from haystack.preview.document_stores.errors import DocumentStoreError -from haystack.preview.errors import FilterError +from haystack.document_stores.errors import DocumentStoreError +from haystack.errors import FilterError class ChromaDocumentStoreError(DocumentStoreError): diff --git a/integrations/chroma/src/chroma_haystack/retriever.py b/integrations/chroma/src/chroma_haystack/retriever.py index a4a8451fa..70dae7a6d 100644 --- a/integrations/chroma/src/chroma_haystack/retriever.py +++ b/integrations/chroma/src/chroma_haystack/retriever.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional -from haystack.preview import Document, component, default_from_dict, default_to_dict +from haystack import Document, component, default_from_dict, default_to_dict from chroma_haystack import ChromaDocumentStore diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index ea204f6f6..ece91b252 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -8,8 +8,8 @@ import numpy as np import pytest from chromadb.api.types 
import Documents, EmbeddingFunction, Embeddings -from haystack.preview import Document -from haystack.preview.testing.document_store import DocumentStoreBaseTests +from haystack import Document +from haystack.testing.document_store import DocumentStoreBaseTests from chroma_haystack.document_store import ChromaDocumentStore diff --git a/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py index c3ff3a79b..b71f9ffdc 100644 --- a/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py +++ b/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py @@ -1,6 +1,6 @@ from typing import ClassVar, Dict, List, Optional, Union -from haystack.preview.lazy_imports import LazyImport +from haystack.lazy_imports import LazyImport with LazyImport(message="Run 'pip install InstructorEmbedding'") as instructor_embeddings_import: from InstructorEmbedding import INSTRUCTOR diff --git a/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 91a8b38e2..31b6a2f6a 100644 --- a/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional, Union -from haystack.preview import Document, component, default_from_dict, default_to_dict +from haystack import Document, component, default_from_dict, default_to_dict from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory @@ -17,7 +17,7 @@ class InstructorDocumentEmbedder: # pip install instructor-embedders-haystack from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder - from haystack.preview.dataclasses import Document + from haystack.dataclasses import Document doc_embedding_instruction = "Represent the Medical Document for retrieval:" diff --git a/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py index 693ef57bb..3a19f860d 100644 --- a/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional, Union -from haystack.preview import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict from instructor_embedders.embedding_backend.instructor_backend import _InstructorEmbeddingBackendFactory diff --git a/integrations/instructor-embedders/tests/test_instructor_document_embedder.py b/integrations/instructor-embedders/tests/test_instructor_document_embedder.py index 1b53c6c1b..6d9434976 100644 --- a/integrations/instructor-embedders/tests/test_instructor_document_embedder.py +++ b/integrations/instructor-embedders/tests/test_instructor_document_embedder.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from haystack.preview import Document +from haystack import Document from instructor_embedders.instructor_document_embedder import InstructorDocumentEmbedder diff --git a/integrations/unstructured/fileconverter/README.md 
b/integrations/unstructured/fileconverter/README.md index 18a826bca..274c01c0f 100644 --- a/integrations/unstructured/fileconverter/README.md +++ b/integrations/unstructured/fileconverter/README.md @@ -49,9 +49,9 @@ documents = converter.run(paths = ["a/file/path.pdf", "a/directory/path"])["docu ### In a Haystack Pipeline ```python import os -from haystack.preview import Pipeline -from haystack.preview.components.writers import DocumentWriter -from haystack.preview.document_stores import MemoryDocumentStore +from haystack import Pipeline +from haystack.components.writers import DocumentWriter +from haystack.document_stores import MemoryDocumentStore from unstructured_fileconverter_haystack import UnstructuredFileConverter os.environ["UNSTRUCTURED_API_KEY"] = "YOUR-API-KEY" diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py index c8201d8da..0f65365f9 100644 --- a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py +++ b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Any, Dict, List, Literal, Optional, Union -from haystack.preview import Document, component, default_to_dict +from haystack import Document, component, default_to_dict from tqdm import tqdm from unstructured.documents.elements import Element # type: ignore[import] from unstructured.partition.api import partition_via_api # type: ignore[import] From 8d74e74c8581cb5c87c86bfb0cab980a25034171 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:11:20 +0100 Subject: [PATCH 16/36] fix readme (#65) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 475da2c7d..114a85ffc 100644 --- a/README.md +++ b/README.md @@ -5,10 +5,10 @@ onwards. 
The code in this repo is maintained by [deepset](https://www.deepset.ai | Package | Type | PyPi Package | Status | | ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [chroma-haystack](document_stores/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / Document Stores / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_chroma.yml) | -| [elasticsearch-haystack](document_stores/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / Document Stores / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml) | -| [instructor-embedders-haystack](components/embedders/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml) | -| [unstructured-fileconverter-haystack](components/converters/unstructured_fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured-fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_unstructured_fileconverter.yml) +| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | +| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | +| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - 
Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | +| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) ## Contributing From 671c688706e891d44d588cfde39ae10839ca8935 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:17:57 +0100 Subject: [PATCH 17/36] fix readme (#66) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 114a85ffc..24a46f86c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai | ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/components_instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | +| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / 
instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) ## Contributing From a677c0d97e231095be6a3b53af9e18e1bb983d90 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:45:19 +0100 Subject: [PATCH 18/36] Patch chroma filters tests (#67) * fix test class * remove deprecated method * make the class not discoverable by pytest * patch it with glue and sticks * lint --- .../src/chroma_haystack/document_store.py | 2 +- .../chroma/src/chroma_haystack/errors.py | 2 +- .../chroma/tests/test_document_store.py | 105 +++++++++++------- integrations/chroma/tests/test_retriever.py | 4 +- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index 16e6f5e9c..b6840b2a7 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -155,7 +155,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D raise ValueError(msg) if doc.content is None: - logger.warn( + logger.warning( "ChromaDocumentStore can only store the text field of Documents: " "'array', 'dataframe' and 'blob' will be dropped." 
) diff --git a/integrations/chroma/src/chroma_haystack/errors.py b/integrations/chroma/src/chroma_haystack/errors.py index 474938be4..aeb0230cd 100644 --- a/integrations/chroma/src/chroma_haystack/errors.py +++ b/integrations/chroma/src/chroma_haystack/errors.py @@ -9,7 +9,7 @@ class ChromaDocumentStoreError(DocumentStoreError): pass -class ChromaDocumentStoreFilterError(FilterError): +class ChromaDocumentStoreFilterError(FilterError, ValueError): pass diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index ece91b252..e99a2bbfd 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,12 +9,16 @@ import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import ( + CountDocumentsTest, + DeleteDocumentsTest, + LegacyFilterDocumentsTest, +) from chroma_haystack.document_store import ChromaDocumentStore -class TestEmbeddingFunction(EmbeddingFunction): +class _TestEmbeddingFunction(EmbeddingFunction): """ Chroma lets you provide custom functions to compute embeddings, we use this feature to provide a fake algorithm returning random @@ -26,49 +30,64 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec return [np.random.default_rng().uniform(-1, 1, 768).tolist()] -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. """ @pytest.fixture - def docstore(self) -> ChromaDocumentStore: + def document_store(self) -> ChromaDocumentStore: """ This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. """ with mock.patch("chroma_haystack.document_store.get_embedding_function") as get_func: - get_func.return_value = TestEmbeddingFunction() + get_func.return_value = _TestEmbeddingFunction() return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1())) + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + Assert that two lists of Documents are equal. + This is used in every test, if a Document Store implementation has a different behaviour + it should override this method. + + This can happen for example when the Document Store sets a score to returned Documents. + Since we can't know what the score will be, we can't compare the Documents reliably. 
+ """ + for doc_received, doc_expected in zip(received, expected): + assert doc_received.content == doc_expected.content + assert doc_received.meta == doc_expected.meta + @pytest.mark.unit - def test_ne_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): """ We customize this test because Chroma consider "not equal" true when a field is missing """ - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$ne": "100"}}) - assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"]) + document_store.write_documents(filterable_docs) + result = document_store.filter_documents(filters={"page": {"$ne": "100"}}) + self.assert_documents_are_equal( + result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"] + ) @pytest.mark.unit - def test_delete_empty(self, docstore: ChromaDocumentStore): + def test_delete_empty(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ - docstore.delete_documents(["test"]) + document_store.delete_documents(["test"]) @pytest.mark.unit - def test_delete_not_empty_nonexisting(self, docstore: ChromaDocumentStore): + def test_delete_not_empty_nonexisting(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ doc = Document(content="test doc") - docstore.write_documents([doc]) - docstore.delete_documents(["non_existing"]) + document_store.write_documents([doc]) + document_store.delete_documents(["non_existing"]) - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] + assert document_store.filter_documents(filters={"id": doc.id}) == [doc] @pytest.mark.integration def test_to_json(self, request): @@ -95,141 +114,143 @@ def test_from_json(self): @pytest.mark.skip(reason="Filter on array contents is not supported.") @pytest.mark.unit - def test_filter_document_array(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_array(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") @pytest.mark.unit - def test_filter_document_dataframe(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_dataframe(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_eq_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_eq_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported. 
Filter on table contents is not supported.") @pytest.mark.unit - def test_in_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_ne_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_ne_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on table contents is not supported.") @pytest.mark.unit - def test_nin_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on embedding value is not supported.") @pytest.mark.unit - def test_nin_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported.") @pytest.mark.unit - def test_nin_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_implicit_and_with_multi_key_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_explicit_and_with_multikey_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_explicit_and_with_list(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_explicit_and_with_list( + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] + ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_explicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_explicit_and(self, 
document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_filter_nested_and_or_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_and_or_implicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_implicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_nested_multiple_identical_operators_same_level( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_fail(self, docstore: ChromaDocumentStore): + def test_write_duplicate_fail(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_skip(self, docstore: ChromaDocumentStore): + def test_write_duplicate_skip(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_overwrite(self, docstore: ChromaDocumentStore): + def test_write_duplicate_overwrite(self, document_store: ChromaDocumentStore): pass diff --git a/integrations/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py index d1bbe5c49..b77dd4ca4 100644 --- a/integrations/chroma/tests/test_retriever.py +++ b/integrations/chroma/tests/test_retriever.py @@ -11,7 +11,7 @@ def test_retriever_to_json(request): ) retriever = ChromaQueryRetriever(ds, filters={"foo": "bar"}, top_k=99) assert retriever.to_dict() == { - "type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"foo": "bar"}, "top_k": 99, @@ -27,7 +27,7 @@ def test_retriever_to_json(request): @pytest.mark.integration def test_retriever_from_json(request): data = { - 
"type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"bar": "baz"}, "top_k": 42, From 6dbec0e91b4ebc9a02119fe65773d5f9cc1186e8 Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Thu, 30 Nov 2023 14:22:57 +0100 Subject: [PATCH 19/36] feat: [OpenSearch] add document store, BM25Retriever and EmbeddingRetriever (#68) * feat: add OpenSearchDocumentStore * reformat * fix unused imports and mypy * remove file header overhead * fix project urls * remove typeignore * expose retrievers via root module * fix lint --- .github/workflows/opensearch.yml | 53 +++ README.md | 1 + integrations/opensearch/.gitignore | 163 +++++++++ integrations/opensearch/LICENSE | 201 +++++++++++ integrations/opensearch/README.md | 32 ++ integrations/opensearch/docker-compose.yml | 15 + integrations/opensearch/pyproject.toml | 178 ++++++++++ .../src/opensearch_haystack/__about__.py | 1 + .../src/opensearch_haystack/__init__.py | 5 + .../src/opensearch_haystack/bm25_retriever.py | 56 +++ .../src/opensearch_haystack/document_store.py | 334 ++++++++++++++++++ .../embedding_retriever.py | 69 ++++ .../src/opensearch_haystack/filters.py | 246 +++++++++++++ integrations/opensearch/tests/__init__.py | 0 .../opensearch/tests/test_bm25_retriever.py | 78 ++++ .../opensearch/tests/test_document_store.py | 275 ++++++++++++++ .../tests/test_embedding_retriever.py | 70 ++++ integrations/opensearch/tests/test_filters.py | 218 ++++++++++++ 18 files changed, 1995 insertions(+) create mode 100644 .github/workflows/opensearch.yml create mode 100644 integrations/opensearch/.gitignore create mode 100644 integrations/opensearch/LICENSE create mode 100644 integrations/opensearch/README.md create mode 100644 integrations/opensearch/docker-compose.yml create mode 100644 integrations/opensearch/pyproject.toml create mode 100644 integrations/opensearch/src/opensearch_haystack/__about__.py create mode 100644 integrations/opensearch/src/opensearch_haystack/__init__.py create mode 100644 integrations/opensearch/src/opensearch_haystack/bm25_retriever.py create mode 100644 integrations/opensearch/src/opensearch_haystack/document_store.py create mode 100644 integrations/opensearch/src/opensearch_haystack/embedding_retriever.py create mode 100644 integrations/opensearch/src/opensearch_haystack/filters.py create mode 100644 integrations/opensearch/tests/__init__.py create mode 100644 integrations/opensearch/tests/test_bm25_retriever.py create mode 100644 integrations/opensearch/tests/test_document_store.py create mode 100644 integrations/opensearch/tests/test_embedding_retriever.py create mode 100644 integrations/opensearch/tests/test_filters.py diff --git a/.github/workflows/opensearch.yml b/.github/workflows/opensearch.yml new file mode 100644 index 000000000..faf359cfc --- /dev/null +++ b/.github/workflows/opensearch.yml @@ -0,0 +1,53 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / opensearch + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/opensearch/**" + - ".github/workflows/opensearch.yml" + +concurrency: + group: opensearch-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') 
&& 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + working-directory: integrations/opensearch + if: matrix.python-version == '3.9' + run: hatch run lint:all + + - name: Run opensearch container + working-directory: integrations/opensearch + run: docker-compose up -d + + - name: Run tests + working-directory: integrations/opensearch + run: hatch run cov diff --git a/README.md b/README.md index 24a46f86c..978be06d8 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai | ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | +| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | | [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / 
fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) diff --git a/integrations/opensearch/.gitignore b/integrations/opensearch/.gitignore new file mode 100644 index 000000000..d1c340c1f --- /dev/null +++ b/integrations/opensearch/.gitignore @@ -0,0 +1,163 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. 
For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# VS Code +.vscode diff --git a/integrations/opensearch/LICENSE b/integrations/opensearch/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/integrations/opensearch/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/opensearch/README.md b/integrations/opensearch/README.md new file mode 100644 index 000000000..9c6dd6e6c --- /dev/null +++ b/integrations/opensearch/README.md @@ -0,0 +1,32 @@ +[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_opensearch.yml) + +[![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) + +# OpenSearch Document Store + +Document Store for Haystack 2.x, supports OpenSearch. + +## Installation + +```console +pip install opensearch-haystack +``` + +## Testing + +To run tests first start a Docker container running OpenSearch. We provide a utility `docker-compose.yml` for that: + +```console +docker-compose up +``` + +Then run tests: + +```console +hatch run test +``` + +## License + +`opensearch-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. 
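
The new README above covers installation and testing but not usage. Below is a minimal usage sketch, not part of the patch: it assumes the classes exported by `opensearch_haystack/__init__.py` in this patch and an OpenSearch instance reachable on localhost:9200 via the docker-compose file that follows; the exact `hosts` value and client kwargs (which are forwarded to opensearch-py) depend on how the container's security plugin is configured.

```python
from haystack import Document

from opensearch_haystack import OpenSearchBM25Retriever, OpenSearchDocumentStore

# Illustrative connection settings: extra kwargs are passed straight to the
# opensearch-py client (e.g. http_auth, use_ssl, verify_certs), so adjust them
# to match your local OpenSearch setup.
document_store = OpenSearchDocumentStore(hosts="http://localhost:9200", index="default")

document_store.write_documents(
    [
        Document(content="OpenSearch is a community-driven search and analytics suite."),
        Document(content="Haystack is a framework for building LLM applications."),
    ]
)

# BM25 keyword retrieval; run() returns a dict with a "documents" key,
# and each returned Document carries the search score assigned by OpenSearch.
retriever = OpenSearchBM25Retriever(document_store=document_store, top_k=5)
result = retriever.run(query="search engine")
for doc in result["documents"]:
    print(doc.score, doc.content)
```
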
diff --git a/integrations/opensearch/docker-compose.yml b/integrations/opensearch/docker-compose.yml new file mode 100644 index 000000000..30b01d5c1 --- /dev/null +++ b/integrations/opensearch/docker-compose.yml @@ -0,0 +1,15 @@ +services: + opensearch: + image: "opensearchproject/opensearch:2.11.0" + ports: + - 9200:9200 + - 9600:9600 + restart: on-failure + environment: + - discovery.type=single-node + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + healthcheck: + test: curl --fail https://localhost:9200/_cat/health -ku admin:admin || exit 1 + interval: 10s + timeout: 1s + retries: 10 \ No newline at end of file diff --git a/integrations/opensearch/pyproject.toml b/integrations/opensearch/pyproject.toml new file mode 100644 index 000000000..fb02b3b63 --- /dev/null +++ b/integrations/opensearch/pyproject.toml @@ -0,0 +1,178 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "opensearch-haystack" +dynamic = ["version"] +description = 'Haystack 2.x Document Store for OpenSearch' +readme = "README.md" +requires-python = ">=3.8" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai", + "opensearch-py>=2,<3", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/opensearch#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/opensearch" + +[tool.hatch.version] +path = "src/opensearch_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", + "pytest-xdist", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/opensearch_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["opensearch_haystack"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["opensearch_haystack", "tests"] +branch = true +parallel = true +omit = [ + "src/opensearch_haystack/__about__.py", +] + +[tool.coverage.paths] +opensearch_haystack = ["src/opensearch_haystack", "*/opensearch-haystack/src/opensearch_haystack"] +tests = ["tests", "*/opensearch-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.pytest.ini_options] +minversion = "6.0" +markers = [ + "unit: unit tests", + "integration: integration tests" +] + +[[tool.mypy.overrides]] +module = [ + "haystack.*", + "pytest.*", + "opensearchpy.*", +] +ignore_missing_imports = true diff --git a/integrations/opensearch/src/opensearch_haystack/__about__.py b/integrations/opensearch/src/opensearch_haystack/__about__.py new file mode 100644 index 000000000..f102a9cad --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/__about__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/integrations/opensearch/src/opensearch_haystack/__init__.py b/integrations/opensearch/src/opensearch_haystack/__init__.py new file mode 100644 index 000000000..7112ecda6 --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/__init__.py @@ -0,0 +1,5 @@ +from opensearch_haystack.bm25_retriever import OpenSearchBM25Retriever +from opensearch_haystack.document_store import OpenSearchDocumentStore +from opensearch_haystack.embedding_retriever import OpenSearchEmbeddingRetriever + +__all__ = ["OpenSearchDocumentStore", "OpenSearchBM25Retriever", "OpenSearchEmbeddingRetriever"] diff --git a/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py new file mode 100644 index 000000000..9755d6253 --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py @@ -0,0 +1,56 @@ +from typing import Any, Dict, List, Optional + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document + +from opensearch_haystack.document_store import OpenSearchDocumentStore + + +@component +class OpenSearchBM25Retriever: + def __init__( + self, + *, + document_store: OpenSearchDocumentStore, + filters: Optional[Dict[str, Any]] = None, + fuzziness: str = "AUTO", + top_k: int = 10, + scale_score: bool = False, + ): + if not isinstance(document_store, OpenSearchDocumentStore): + msg = "document_store must be an instance of OpenSearchDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters or {} + self._fuzziness = fuzziness + self._top_k = top_k + self._scale_score = scale_score + + def to_dict(self) -> Dict[str, Any]: + return default_to_dict( + self, + filters=self._filters, + fuzziness=self._fuzziness, + top_k=self._top_k, + scale_score=self._scale_score, + document_store=self._document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchBM25Retriever": + 
data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query: str): + docs = self._document_store._bm25_retrieval( + query=query, + filters=self._filters, + fuzziness=self._fuzziness, + top_k=self._top_k, + scale_score=self._scale_score, + ) + return {"documents": docs} diff --git a/integrations/opensearch/src/opensearch_haystack/document_store.py b/integrations/opensearch/src/opensearch_haystack/document_store.py new file mode 100644 index 000000000..fe8495fb0 --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/document_store.py @@ -0,0 +1,334 @@ +import logging +from typing import Any, Dict, List, Mapping, Optional, Union + +import numpy as np +from haystack import default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy, document_store +from haystack.utils.filters import convert +from opensearchpy import OpenSearch +from opensearchpy.helpers import bulk + +from opensearch_haystack.filters import _normalize_filters + +logger = logging.getLogger(__name__) + +Hosts = Union[str, List[Union[str, Mapping[str, Union[str, int]]]]] + +# document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to +# True. Scaling uses the expit function (inverse of the logit function) after applying a scaling factor +# (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method). +# Larger scaling factor decreases scaled scores. For example, an input of 10 is scaled to 0.99 with +# BM25_SCALING_FACTOR=2 but to 0.78 with BM25_SCALING_FACTOR=8 (default). The defaults were chosen empirically. +# Increase the default if most unscaled scores are larger than expected (>30) and otherwise would incorrectly +# all be mapped to scores ~1. +BM25_SCALING_FACTOR = 8 + + +@document_store +class OpenSearchDocumentStore: + def __init__( + self, + *, + hosts: Optional[Hosts] = None, + index: str = "default", + **kwargs, + ): + """ + Creates a new OpenSearchDocumentStore instance. + + For more information on connection parameters, see the official OpenSearch documentation: + https://www.elastic.co/guide/en/OpenSearch/client/python-api/current/connecting.html + + For the full list of supported kwargs, see the official OpenSearch reference: + https://OpenSearch-py.readthedocs.io/en/stable/api.html#module-OpenSearch + + :param hosts: List of hosts running the OpenSearch client. Defaults to None + :param index: Name of index in OpenSearch, if it doesn't exist it will be created. Defaults to "default" + :param **kwargs: Optional arguments that ``OpenSearch`` takes. 
+ """ + self._hosts = hosts + self._client = OpenSearch(hosts, **kwargs) + self._index = index + self._kwargs = kwargs + + # Check client connection, this will raise if not connected + self._client.info() + + # configure mapping for the embedding field + embedding_dim = kwargs.get("embedding_dim", 768) + method = kwargs.get("method", None) + + mappings: Dict[str, Any] = { + "properties": { + "embedding": {"type": "knn_vector", "index": True, "dimension": embedding_dim}, + "content": {"type": "text"}, + }, + "dynamic_templates": [ + { + "strings": { + "path_match": "*", + "match_mapping_type": "string", + "mapping": { + "type": "keyword", + }, + } + } + ], + } + if method: + mappings["properties"]["embedding"]["method"] = method + + mappings = kwargs.get("mappings", mappings) + settings = kwargs.get("settings", {"index.knn": True}) + + body = {"mappings": mappings, "settings": settings} + + # Create the index if it doesn't exist + if not self._client.indices.exists(index=index): + self._client.indices.create(index=index, body=body) + + def to_dict(self) -> Dict[str, Any]: + # This is not the best solution to serialise this class but is the fastest to implement. + # Not all kwargs types can be serialised to text so this can fail. We must serialise each + # type explicitly to handle this properly. + return default_to_dict( + self, + hosts=self._hosts, + index=self._index, + **self._kwargs, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchDocumentStore": + return default_from_dict(cls, data) + + def count_documents(self) -> int: + """ + Returns how many documents are present in the document store. + """ + return self._client.count(index=self._index)["count"] + + def _search_documents(self, **kwargs) -> List[Document]: + """ + Calls the OpenSearch client's search method and handles pagination. + """ + res = self._client.search( + index=self._index, + body=kwargs, + ) + documents: List[Document] = [self._deserialize_document(hit) for hit in res["hits"]["hits"]] + return documents + + def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Document]: + if filters and "operator" not in filters and "conditions" not in filters: + filters = convert(filters) + + if filters: + query = {"bool": {"filter": _normalize_filters(filters)}} + documents = self._search_documents(query=query, size=10_000) + else: + documents = self._search_documents(size=10_000) + + return documents + + def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int: + """ + Writes Documents to OpenSearch. + If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the + same ID already exists in the document store. 
+ """ + if len(documents) > 0: + if not isinstance(documents[0], Document): + msg = "param 'documents' must contain a list of objects of type Document" + raise ValueError(msg) + + if policy == DuplicatePolicy.NONE: + policy = DuplicatePolicy.FAIL + + action = "index" if policy == DuplicatePolicy.OVERWRITE else "create" + documents_written, errors = bulk( + client=self._client, + actions=( + { + "_op_type": action, + "_id": doc.id, + "_source": doc.to_dict(), + } + for doc in documents + ), + refresh="wait_for", + index=self._index, + raise_on_error=False, + ) + + if errors: + duplicate_errors_ids = [] + other_errors = [] + for e in errors: + error_type = e["create"]["error"]["type"] + if policy == DuplicatePolicy.FAIL and error_type == "version_conflict_engine_exception": + duplicate_errors_ids.append(e["create"]["_id"]) + elif policy == DuplicatePolicy.SKIP and error_type == "version_conflict_engine_exception": + # when the policy is skip, duplication errors are OK and we should not raise an exception + continue + else: + other_errors.append(e) + + if len(duplicate_errors_ids) > 0: + msg = f"IDs '{', '.join(duplicate_errors_ids)}' already exist in the document store." + raise DuplicateDocumentError(msg) + + if len(other_errors) > 0: + msg = f"Failed to write documents to OpenSearch. Errors:\n{other_errors}" + raise DocumentStoreError(msg) + + return documents_written + + def _deserialize_document(self, hit: Dict[str, Any]) -> Document: + """ + Creates a Document from the search hit provided. + This is mostly useful in self.filter_documents(). + """ + data = hit["_source"] + + if "highlight" in hit: + data["metadata"]["highlighted"] = hit["highlight"] + data["score"] = hit["_score"] + + return Document.from_dict(data) + + def delete_documents(self, document_ids: List[str]) -> None: + """ + Deletes all documents with a matching document_ids from the document store. + + :param object_ids: the object_ids to delete + """ + + bulk( + client=self._client, + actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids), + refresh="wait_for", + index=self._index, + raise_on_error=False, + ) + + def _bm25_retrieval( + self, + query: str, + *, + filters: Optional[Dict[str, Any]] = None, + fuzziness: str = "AUTO", + top_k: int = 10, + scale_score: bool = False, + ) -> List[Document]: + """ + OpenSearch by defaults uses BM25 search algorithm. + Even though this method is called `bm25_retrieval` it searches for `query` + using the search algorithm `_client` was configured with. + + This method is not mean to be part of the public interface of + `OpenSearchDocumentStore` nor called directly. + `OpenSearchBM25Retriever` uses this method directly and is the public interface for it. + + `query` must be a non empty string, otherwise a `ValueError` will be raised. + + :param query: String to search in saved Documents' text. + :param filters: Filters applied to the retrieved Documents, for more info + see `OpenSearchDocumentStore.filter_documents`, defaults to None + :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". 
+ see the official documentation for valid values: + https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness + :param top_k: Maximum number of Documents to return, defaults to 10 + :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False + :raises ValueError: If `query` is an empty string + :return: List of Document that match `query` + """ + + if not query: + msg = "query must be a non empty string" + raise ValueError(msg) + + body: Dict[str, Any] = { + "size": top_k, + "query": { + "bool": { + "must": [ + { + "multi_match": { + "query": query, + "fuzziness": fuzziness, + "type": "most_fields", + "operator": "AND", + } + } + ] + } + }, + } + + if filters: + body["query"]["bool"]["filter"] = _normalize_filters(filters) + + documents = self._search_documents(**body) + + if scale_score: + for doc in documents: + doc.score = float(1 / (1 + np.exp(-np.asarray(doc.score / BM25_SCALING_FACTOR)))) + + return documents + + def _embedding_retrieval( + self, + query_embedding: List[float], + *, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ) -> List[Document]: + """ + Retrieves documents that are most similar to the query embedding using a vector similarity metric. + It uses the OpenSearch's Approximate k-Nearest Neighbors search algorithm. + + This method is not mean to be part of the public interface of + `OpenSearchDocumentStore` nor called directly. + `OpenSearchEmbeddingRetriever` uses this method directly and is the public interface for it. + + :param query_embedding: Embedding of the query. + :param filters: Filters applied to the retrieved Documents. Defaults to None. + Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. + :param top_k: Maximum number of Documents to return, defaults to 10 + :param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10. + Increasing this value will improve search accuracy at the cost of slower search speeds. + You can read more about it in the OpenSearch documentation: + https://www.elastic.co/guide/en/OpenSearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy + :raises ValueError: If `query_embedding` is an empty list + :return: List of Document that are most similar to `query_embedding` + """ + + if not query_embedding: + msg = "query_embedding must be a non-empty list of floats" + raise ValueError(msg) + + body: Dict[str, Any] = { + "query": { + "bool": { + "must": [ + { + "knn": { + "embedding": { + "vector": query_embedding, + "k": top_k, + } + } + } + ], + } + }, + "size": top_k, + } + + if filters: + body["query"]["bool"]["filter"] = _normalize_filters(filters) + + docs = self._search_documents(**body) + return docs diff --git a/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py new file mode 100644 index 000000000..9bbc2a7a3 --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py @@ -0,0 +1,69 @@ +from typing import Any, Dict, List, Optional + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document + +from opensearch_haystack.document_store import OpenSearchDocumentStore + + +@component +class OpenSearchEmbeddingRetriever: + """ + Uses a vector similarity metric to retrieve documents from the OpenSearchDocumentStore. 
+ + Needs to be connected to the OpenSearchDocumentStore to run. + """ + + def __init__( + self, + *, + document_store: OpenSearchDocumentStore, + filters: Optional[Dict[str, Any]] = None, + top_k: int = 10, + ): + """ + Create the OpenSearchEmbeddingRetriever component. + + :param document_store: An instance of OpenSearchDocumentStore. + :param filters: Filters applied to the retrieved Documents. Defaults to None. + Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned. + :param top_k: Maximum number of Documents to return, defaults to 10 + :raises ValueError: If `document_store` is not an instance of OpenSearchDocumentStore. + """ + if not isinstance(document_store, OpenSearchDocumentStore): + msg = "document_store must be an instance of OpenSearchDocumentStore" + raise ValueError(msg) + + self._document_store = document_store + self._filters = filters or {} + self._top_k = top_k + + def to_dict(self) -> Dict[str, Any]: + return default_to_dict( + self, + filters=self._filters, + top_k=self._top_k, + document_store=self._document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchEmbeddingRetriever": + data["init_parameters"]["document_store"] = OpenSearchDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + return default_from_dict(cls, data) + + @component.output_types(documents=List[Document]) + def run(self, query_embedding: List[float]): + """ + Retrieve documents using a vector similarity metric. + + :param query_embedding: Embedding of the query. + :return: List of Document similar to `query_embedding`. + """ + docs = self._document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=self._filters, + top_k=self._top_k, + ) + return {"documents": docs} diff --git a/integrations/opensearch/src/opensearch_haystack/filters.py b/integrations/opensearch/src/opensearch_haystack/filters.py new file mode 100644 index 000000000..8f5418145 --- /dev/null +++ b/integrations/opensearch/src/opensearch_haystack/filters.py @@ -0,0 +1,246 @@ +from datetime import datetime +from typing import Any, Dict, List + +from haystack.errors import FilterError +from pandas import DataFrame + + +def _normalize_filters(filters: Dict[str, Any]) -> Dict[str, Any]: + """ + Converts Haystack filters in OpenSearch compatible filters. 
+ """ + if not isinstance(filters, dict): + msg = "Filters must be a dictionary" + raise FilterError(msg) + + if "field" in filters: + return {"bool": {"must": _parse_comparison_condition(filters)}} + return _parse_logical_condition(filters) + + +def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + + operator = condition["operator"] + conditions = [_parse_comparison_condition(c) for c in condition["conditions"]] + if len(conditions) > 1: + conditions = _normalize_ranges(conditions) + if operator == "AND": + return {"bool": {"must": conditions}} + elif operator == "OR": + return {"bool": {"should": conditions}} + elif operator == "NOT": + return {"bool": {"must_not": [{"bool": {"must": conditions}}]}} + else: + msg = f"Unknown logical operator '{operator}'" + raise FilterError(msg) + + +def _equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"bool": {"must_not": {"exists": {"field": field}}}} + + if isinstance(value, list): + return { + "terms_set": { + field: { + "terms": value, + "minimum_should_match_script": {"source": f"Math.max(params.num_terms, doc['{field}'].size())"}, + } + } + } + if field in ["text", "dataframe"]: + # We want to fully match the text field. + return {"match": {field: {"query": value, "minimum_should_match": "100%"}}} + return {"term": {field: value}} + + +def _not_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + return {"exists": {"field": field}} + + if isinstance(value, list): + return {"bool": {"must_not": {"terms": {field: value}}}} + if field in ["text", "dataframe"]: + # We want to fully match the text field. + return {"bool": {"must_not": {"match": {field: {"query": value, "minimum_should_match": "100%"}}}}} + + return {"bool": {"must_not": {"term": {field: value}}}} + + +def _greater_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gt": value}}} + + +def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '>=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. 
+ return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"gte": value}}} + + +def _less_than(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lt": value}}} + + +def _less_than_equal(field: str, value: Any) -> Dict[str, Any]: + if value is None: + # When the value is None and '<=' is used we create a filter that would return a Document + # if it has a field set and not set at the same time. + # This will cause the filter to match no Document. + # This way we keep the behavior consistent with other Document Stores. + return {"bool": {"must": [{"exists": {"field": field}}, {"bool": {"must_not": {"exists": {"field": field}}}}]}} + if isinstance(value, str): + try: + datetime.fromisoformat(value) + except (ValueError, TypeError) as exc: + msg = ( + "Can't compare strings using operators '>', '>=', '<', '<='. " + "Strings are only comparable if they are ISO formatted dates." + ) + raise FilterError(msg) from exc + if type(value) in [list, DataFrame]: + msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='" + raise FilterError(msg) + return {"range": {field: {"lte": value}}} + + +def _in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"terms": {field: value}} + + +def _not_in(field: str, value: Any) -> Dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators" + raise FilterError(msg) + return {"bool": {"must_not": {"terms": {field: value}}}} + + +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, +} + + +def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]: + if "field" not in condition: + # 'field' key is only found in comparison dictionaries. + # We assume this is a logic dictionary since it's not present. 
+ return _parse_logical_condition(condition) + field: str = condition["field"] + + if field.startswith("meta."): + # Remove the "meta." prefix if present. + # Documents are flattened when using the OpenSearchDocumentStore + # so we don't need to specify the "meta." prefix. + # Instead of raising an error we handle it gracefully. + field = field[5:] + + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + operator: str = condition["operator"] + value: Any = condition["value"] + if isinstance(value, DataFrame): + value = value.to_json() + + return COMPARISON_OPERATORS[operator](field, value) + + +def _normalize_ranges(conditions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Merges range conditions acting on a same field. + + Example usage: + + ```python + conditions = [ + {"range": {"date": {"lt": "2021-01-01"}}}, + {"range": {"date": {"gte": "2015-01-01"}}}, + ] + conditions = _normalize_ranges(conditions) + assert conditions == [ + {"range": {"date": {"lt": "2021-01-01", "gte": "2015-01-01"}}}, + ] + ``` + """ + range_conditions = [next(iter(c["range"].items())) for c in conditions if "range" in c] + if range_conditions: + conditions = [c for c in conditions if "range" not in c] + range_conditions_dict: Dict[str, Any] = {} + for field_name, comparison in range_conditions: + if field_name not in range_conditions_dict: + range_conditions_dict[field_name] = {} + range_conditions_dict[field_name].update(comparison) + + for field_name, comparisons in range_conditions_dict.items(): + conditions.append({"range": {field_name: comparisons}}) + return conditions diff --git a/integrations/opensearch/tests/__init__.py b/integrations/opensearch/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py new file mode 100644 index 000000000..cfea2d767 --- /dev/null +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -0,0 +1,78 @@ +from unittest.mock import Mock, patch + +from haystack.dataclasses import Document + +from opensearch_haystack.bm25_retriever import OpenSearchBM25Retriever +from opensearch_haystack.document_store import OpenSearchDocumentStore + + +def test_init_default(): + mock_store = Mock(spec=OpenSearchDocumentStore) + retriever = OpenSearchBM25Retriever(document_store=mock_store) + assert retriever._document_store == mock_store + assert retriever._filters == {} + assert retriever._top_k == 10 + assert not retriever._scale_score + + +@patch("opensearch_haystack.document_store.OpenSearch") +def test_to_dict(_mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some fake host") + retriever = OpenSearchBM25Retriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "opensearch_haystack.bm25_retriever.OpenSearchBM25Retriever", + "init_parameters": { + "document_store": { + "init_parameters": { + "hosts": "some fake host", + "index": "default", + }, + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + }, + "filters": {}, + "fuzziness": "AUTO", + "top_k": 10, + "scale_score": False, + }, + } + + +@patch("opensearch_haystack.document_store.OpenSearch") +def test_from_dict(_mock_opensearch_client): + data = { + "type": "opensearch_haystack.bm25_retriever.OpenSearchBM25Retriever", + "init_parameters": { + 
"document_store": { + "init_parameters": {"hosts": "some fake host", "index": "default"}, + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + }, + "filters": {}, + "fuzziness": "AUTO", + "top_k": 10, + "scale_score": True, + }, + } + retriever = OpenSearchBM25Retriever.from_dict(data) + assert retriever._document_store + assert retriever._filters == {} + assert retriever._fuzziness == "AUTO" + assert retriever._top_k == 10 + assert retriever._scale_score + + +def test_run(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._bm25_retrieval.return_value = [Document(content="Test doc")] + retriever = OpenSearchBM25Retriever(document_store=mock_store) + res = retriever.run(query="some query") + mock_store._bm25_retrieval.assert_called_once_with( + query="some query", + filters={}, + fuzziness="AUTO", + top_k=10, + scale_score=False, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py new file mode 100644 index 000000000..4678cca70 --- /dev/null +++ b/integrations/opensearch/tests/test_document_store.py @@ -0,0 +1,275 @@ +import random +from typing import List +from unittest.mock import patch + +import pytest +from haystack.dataclasses.document import Document +from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError +from haystack.document_stores.protocols import DuplicatePolicy +from haystack.testing.document_store import DocumentStoreBaseTests +from opensearchpy.exceptions import RequestError + +from opensearch_haystack.document_store import OpenSearchDocumentStore + + +class TestDocumentStore(DocumentStoreBaseTests): + """ + Common test cases will be provided by `DocumentStoreBaseTests` but + you can add more to this class. + """ + + @pytest.fixture + def document_store(self, request): + """ + This is the most basic requirement for the child class: provide + an instance of this document store so the base class can use it. + """ + hosts = ["https://localhost:9200"] + # Use a different index for each test so we can run them in parallel + index = f"{request.node.name}" + + store = OpenSearchDocumentStore( + hosts=hosts, + index=index, + http_auth=("admin", "admin"), + verify_certs=False, + embedding_dim=768, + method={"space_type": "cosinesimil", "engine": "nmslib", "name": "hnsw"}, + ) + yield store + store._client.indices.delete(index=index, params={"ignore": [400, 404]}) + + @pytest.fixture + def document_store_embedding_dim_4(self, request): + """ + This is the most basic requirement for the child class: provide + an instance of this document store so the base class can use it. + """ + hosts = ["https://localhost:9200"] + # Use a different index for each test so we can run them in parallel + index = f"{request.node.name}" + + store = OpenSearchDocumentStore( + hosts=hosts, + index=index, + http_auth=("admin", "admin"), + verify_certs=False, + embedding_dim=4, + method={"space_type": "cosinesimil", "engine": "nmslib", "name": "hnsw"}, + ) + yield store + store._client.indices.delete(index=index, params={"ignore": [400, 404]}) + + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + The OpenSearchDocumentStore.filter_documents() method returns a Documents with their score set. + We don't want to compare the score, so we set it to None before comparing the documents. 
+ """ + received_meta = [] + for doc in received: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + received_meta.append(r) + + expected_meta = [] + for doc in expected: + r = { + "number": doc.meta.get("number"), + "name": doc.meta.get("name"), + } + expected_meta.append(r) + for doc in received: + doc.score = None + + super().assert_documents_are_equal(received, expected) + + @patch("opensearch_haystack.document_store.OpenSearch") + def test_to_dict(self, _mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some hosts") + res = document_store.to_dict() + assert res == { + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + "init_parameters": { + "hosts": "some hosts", + "index": "default", + }, + } + + @patch("opensearch_haystack.document_store.OpenSearch") + def test_from_dict(self, _mock_opensearch_client): + data = { + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + "init_parameters": { + "hosts": "some hosts", + "index": "default", + }, + } + document_store = OpenSearchDocumentStore.from_dict(data) + assert document_store._hosts == "some hosts" + assert document_store._index == "default" + + def test_write_documents(self, document_store: OpenSearchDocumentStore): + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, DuplicatePolicy.FAIL) + + def test_bm25_retrieval(self, document_store: OpenSearchDocumentStore): + document_store.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + ] + ) + + res = document_store._bm25_retrieval("functional", top_k=3) + assert len(res) == 3 + assert "functional" in res[0].content + assert "functional" in res[1].content + assert "functional" in res[2].content + + def test_bm25_retrieval_pagination(self, document_store: OpenSearchDocumentStore): + """ + Test that handling of pagination works as expected, when the matching documents are > 10. 
+ """ + document_store.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + Document(content="Java is an object oriented programming language"), + Document(content="Javascript is a programming language"), + Document(content="Typescript is a programming language"), + Document(content="C is a programming language"), + ] + ) + + res = document_store._bm25_retrieval("programming", top_k=11) + assert len(res) == 11 + assert all("programming" in doc.content for doc in res) + + def test_bm25_retrieval_with_fuzziness(self, document_store: OpenSearchDocumentStore): + document_store.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + ] + ) + + query_with_typo = "functinal" + # Query without fuzziness to search for the exact match + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="0") + # Nothing is found as the query contains a typo + assert res == [] + + # Query with fuzziness with the same query + res = document_store._bm25_retrieval(query_with_typo, top_k=3, fuzziness="1") + assert len(res) == 3 + assert "functional" in res[0].content + assert "functional" in res[1].content + assert "functional" in res[2].content + + def test_embedding_retrieval(self, document_store_embedding_dim_4: OpenSearchDocumentStore): + docs = [ + Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), + Document(content="Not very similar document", embedding=[0.0, 0.8, 0.3, 0.9]), + ] + document_store_embedding_dim_4.write_documents(docs) + results = document_store_embedding_dim_4._embedding_retrieval( + query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2, filters={} + ) + assert len(results) == 2 + assert results[0].content == "Most similar document" + assert results[1].content == "2nd best document" + + def test_embedding_retrieval_with_filters(self, document_store_embedding_dim_4: OpenSearchDocumentStore): + docs = [ + Document(content="Most similar document", embedding=[1.0, 1.0, 1.0, 1.0]), + Document(content="2nd best document", embedding=[0.8, 0.8, 0.8, 1.0]), + Document( + content="Not very similar 
document with meta field", + embedding=[0.0, 0.8, 0.3, 0.9], + meta={"meta_field": "custom_value"}, + ), + ] + document_store_embedding_dim_4.write_documents(docs) + + filters = {"field": "meta_field", "operator": "==", "value": "custom_value"} + # we set top_k=3, to make the test pass as we are not sure whether efficient filtering is supported for nmslib + # TODO: remove top_k=3, when efficient filtering is supported for nmslib + results = document_store_embedding_dim_4._embedding_retrieval( + query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=3, filters=filters + ) + assert len(results) == 1 + assert results[0].content == "Not very similar document with meta field" + + def test_embedding_retrieval_pagination(self, document_store_embedding_dim_4: OpenSearchDocumentStore): + """ + Test that handling of pagination works as expected, when the matching documents are > 10. + """ + + docs = [ + Document(content=f"Document {i}", embedding=[random.random() for _ in range(4)]) # noqa: S311 + for i in range(20) + ] + + document_store_embedding_dim_4.write_documents(docs) + results = document_store_embedding_dim_4._embedding_retrieval( + query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=11, filters={} + ) + assert len(results) == 11 + + def test_embedding_retrieval_query_documents_different_embedding_sizes( + self, document_store_embedding_dim_4: OpenSearchDocumentStore + ): + """ + Test that the retrieval fails if the query embedding and the documents have different embedding sizes. + """ + docs = [Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4])] + document_store_embedding_dim_4.write_documents(docs) + + with pytest.raises(RequestError): + document_store_embedding_dim_4._embedding_retrieval(query_embedding=[0.1, 0.1]) + + def test_write_documents_different_embedding_sizes_fail( + self, document_store_embedding_dim_4: OpenSearchDocumentStore + ): + """ + Test that write_documents fails if the documents have different embedding sizes. 
+ """ + docs = [ + Document(content="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]), + Document(content="Hello world", embedding=[0.1, 0.2]), + ] + + with pytest.raises(DocumentStoreError): + document_store_embedding_dim_4.write_documents(docs) diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py new file mode 100644 index 000000000..2bfe5761a --- /dev/null +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -0,0 +1,70 @@ +from unittest.mock import Mock, patch + +from haystack.dataclasses import Document + +from opensearch_haystack.document_store import OpenSearchDocumentStore +from opensearch_haystack.embedding_retriever import OpenSearchEmbeddingRetriever + + +def test_init_default(): + mock_store = Mock(spec=OpenSearchDocumentStore) + retriever = OpenSearchEmbeddingRetriever(document_store=mock_store) + assert retriever._document_store == mock_store + assert retriever._filters == {} + assert retriever._top_k == 10 + + +@patch("opensearch_haystack.document_store.OpenSearch") +def test_to_dict(_mock_opensearch_client): + document_store = OpenSearchDocumentStore(hosts="some fake host") + retriever = OpenSearchEmbeddingRetriever(document_store=document_store) + res = retriever.to_dict() + assert res == { + "type": "opensearch_haystack.embedding_retriever.OpenSearchEmbeddingRetriever", + "init_parameters": { + "document_store": { + "init_parameters": { + "hosts": "some fake host", + "index": "default", + }, + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + }, + "filters": {}, + "top_k": 10, + }, + } + + +@patch("opensearch_haystack.document_store.OpenSearch") +def test_from_dict(_mock_opensearch_client): + data = { + "type": "opensearch_haystack.embedding_retriever.OpenSearchEmbeddingRetriever", + "init_parameters": { + "document_store": { + "init_parameters": {"hosts": "some fake host", "index": "default"}, + "type": "opensearch_haystack.document_store.OpenSearchDocumentStore", + }, + "filters": {}, + "top_k": 10, + }, + } + retriever = OpenSearchEmbeddingRetriever.from_dict(data) + assert retriever._document_store + assert retriever._filters == {} + assert retriever._top_k == 10 + + +def test_run(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] + retriever = OpenSearchEmbeddingRetriever(document_store=mock_store) + res = retriever.run(query_embedding=[0.5, 0.7]) + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.5, 0.7], + filters={}, + top_k=10, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + assert res["documents"][0].embedding == [0.1, 0.2] diff --git a/integrations/opensearch/tests/test_filters.py b/integrations/opensearch/tests/test_filters.py new file mode 100644 index 000000000..09968121a --- /dev/null +++ b/integrations/opensearch/tests/test_filters.py @@ -0,0 +1,218 @@ +import pytest +from haystack.errors import FilterError + +from opensearch_haystack.filters import _normalize_filters, _normalize_ranges + +filters_data = [ + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + {"field": "meta.date", "operator": 
">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + ], + }, + { + "bool": { + "must": [ + {"term": {"type": "article"}}, + { + "bool": { + "should": [ + {"terms": {"genre": ["economy", "politics"]}}, + {"term": {"publisher": "nytimes"}}, + ] + } + }, + {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, + {"range": {"rating": {"gte": 3}}}, + ] + } + }, + ), + ( + { + "operator": "OR", + "conditions": [ + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "News Paper"}, + {"field": "meta.Date", "operator": "<", "value": "2020-01-01"}, + ], + }, + { + "operator": "AND", + "conditions": [ + {"field": "meta.Type", "operator": "==", "value": "Blog Post"}, + {"field": "meta.Date", "operator": ">=", "value": "2019-01-01"}, + ], + }, + ], + }, + { + "bool": { + "should": [ + {"bool": {"must": [{"term": {"Type": "News Paper"}}, {"range": {"Date": {"lt": "2020-01-01"}}}]}}, + {"bool": {"must": [{"term": {"Type": "Blog Post"}}, {"range": {"Date": {"gte": "2019-01-01"}}}]}}, + ] + } + }, + ), + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.type", "operator": "==", "value": "article"}, + {"field": "meta.date", "operator": ">=", "value": "2015-01-01"}, + {"field": "meta.date", "operator": "<", "value": "2021-01-01"}, + {"field": "meta.rating", "operator": ">=", "value": 3}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]}, + {"field": "meta.publisher", "operator": "==", "value": "nytimes"}, + ], + }, + ], + }, + { + "bool": { + "must": [ + {"term": {"type": "article"}}, + { + "bool": { + "should": [ + {"terms": {"genre": ["economy", "politics"]}}, + {"term": {"publisher": "nytimes"}}, + ] + } + }, + {"range": {"date": {"gte": "2015-01-01", "lt": "2021-01-01"}}}, + {"range": {"rating": {"gte": 3}}}, + ] + } + }, + ), + ( + {"operator": "AND", "conditions": [{"field": "text", "operator": "==", "value": "A Foo Document 1"}]}, + {"bool": {"must": [{"match": {"text": {"query": "A Foo Document 1", "minimum_should_match": "100%"}}}]}}, + ), + ( + { + "operator": "OR", + "conditions": [ + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"field": "meta.number", "operator": "<", "value": 1.0}, + ], + }, + { + "bool": { + "should": [ + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, + {"range": {"number": {"lt": 1.0}}}, + ] + } + }, + ), + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + {"field": "meta.name", "operator": "in", "value": ["name_0", "name_1"]}, + ], + }, + {"bool": {"must": [{"terms": {"name": ["name_0", "name_1"]}}, {"range": {"number": {"lte": 2, "gte": 0}}}]}}, + ), + ( + { + "operator": "AND", + "conditions": [ + {"field": "meta.number", "operator": "<=", "value": 2}, + {"field": "meta.number", "operator": ">=", "value": 0}, + ], + }, + {"bool": {"must": [{"range": {"number": {"lte": 2, "gte": 0}}}]}}, + ), + ( + { + "operator": "OR", + "conditions": [ + {"field": "meta.name", "operator": "==", "value": "name_0"}, + {"field": "meta.name", "operator": "==", "value": "name_1"}, + ], + }, + {"bool": {"should": [{"term": {"name": "name_0"}}, {"term": {"name": "name_1"}}]}}, + ), + ( 
+ { + "operator": "NOT", + "conditions": [ + {"field": "meta.number", "operator": "==", "value": 100}, + {"field": "meta.name", "operator": "==", "value": "name_0"}, + ], + }, + {"bool": {"must_not": [{"bool": {"must": [{"term": {"number": 100}}, {"term": {"name": "name_0"}}]}}]}}, + ), +] + + +@pytest.mark.parametrize("filters, expected", filters_data) +def test_normalize_filters(filters, expected): + result = _normalize_filters(filters) + assert result == expected + + +def test_normalize_filters_invalid_operator(): + with pytest.raises(FilterError): + _normalize_filters({"operator": "INVALID", "conditions": []}) + + +def test_normalize_filters_malformed(): + # Missing operator + with pytest.raises(FilterError): + _normalize_filters({"conditions": []}) + + # Missing conditions + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND"}) + + # Missing comparison field + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND", "conditions": [{"operator": "==", "value": "article"}]}) + + # Missing comparison operator + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "operator": "=="}]}) + + # Missing comparison value + with pytest.raises(FilterError): + _normalize_filters({"operator": "AND", "conditions": [{"field": "meta.type", "value": "article"}]}) + + +def test_normalize_ranges(): + conditions = [ + {"range": {"date": {"lt": "2021-01-01"}}}, + {"range": {"date": {"gte": "2015-01-01"}}}, + ] + conditions = _normalize_ranges(conditions) + assert conditions == [ + {"range": {"date": {"lt": "2021-01-01", "gte": "2015-01-01"}}}, + ] From 2f453ce6961bfde99e22b21ccfb2c5c66fc74a3b Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Thu, 30 Nov 2023 15:52:15 +0100 Subject: [PATCH 20/36] feat: extend OpenSearch params support (#70) * feat: extend OpenSearch params support * add defaults to docstrings --- .../src/opensearch_haystack/bm25_retriever.py | 58 +++++++++++++++++-- .../src/opensearch_haystack/document_store.py | 8 ++- .../embedding_retriever.py | 13 ++++- .../opensearch/tests/test_bm25_retriever.py | 58 +++++++++++++++++++ .../opensearch/tests/test_document_store.py | 46 +++++++++++++++ .../tests/test_embedding_retriever.py | 32 ++++++++++ 6 files changed, 204 insertions(+), 11 deletions(-) diff --git a/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py index 9755d6253..91a133345 100644 --- a/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py +++ b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py @@ -16,7 +16,22 @@ def __init__( fuzziness: str = "AUTO", top_k: int = 10, scale_score: bool = False, + all_terms_must_match: bool = False, ): + """ + Create the OpenSearchBM25Retriever component. + + :param document_store: An instance of OpenSearchDocumentStore. + :param filters: Filters applied to the retrieved Documents. Defaults to None. + :param fuzziness: Fuzziness parameter for full-text queries. Defaults to "AUTO". + :param top_k: Maximum number of Documents to return, defaults to 10 + :param scale_score: Whether to scale the score of retrieved documents between 0 and 1. + This is useful when comparing documents across different indexes. Defaults to False. + :param all_terms_must_match: If True, all terms in the query string must be present in the retrieved documents. 
+ This is useful when searching for short text where even one term can make a difference. Defaults to False. + :raises ValueError: If `document_store` is not an instance of OpenSearchDocumentStore. + + """ if not isinstance(document_store, OpenSearchDocumentStore): msg = "document_store must be an instance of OpenSearchDocumentStore" raise ValueError(msg) @@ -26,6 +41,7 @@ def __init__( self._fuzziness = fuzziness self._top_k = top_k self._scale_score = scale_score + self._all_terms_must_match = all_terms_must_match def to_dict(self) -> Dict[str, Any]: return default_to_dict( @@ -45,12 +61,44 @@ def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchBM25Retriever": return default_from_dict(cls, data) @component.output_types(documents=List[Document]) - def run(self, query: str): + def run( + self, + query: str, + filters: Optional[Dict[str, Any]] = None, + all_terms_must_match: Optional[bool] = None, + top_k: Optional[int] = None, + fuzziness: Optional[str] = None, + scale_score: Optional[bool] = None, + ): + """ + Retrieve documents using BM25 retrieval. + + :param query: The query string + :param filters: Optional filters to narrow down the search space. + :param all_terms_must_match: If True, all terms in the query string must be present in the retrieved documents. + :param top_k: Maximum number of Documents to return. + :param fuzziness: Fuzziness parameter for full-text queries. + :param scale_score: Whether to scale the score of retrieved documents between 0 and 1. + This is useful when comparing documents across different indexes. + :return: A dictionary containing the retrieved documents. + """ + if filters is None: + filters = self._filters + if all_terms_must_match is None: + all_terms_must_match = self._all_terms_must_match + if top_k is None: + top_k = self._top_k + if fuzziness is None: + fuzziness = self._fuzziness + if scale_score is None: + scale_score = self._scale_score + docs = self._document_store._bm25_retrieval( query=query, - filters=self._filters, - fuzziness=self._fuzziness, - top_k=self._top_k, - scale_score=self._scale_score, + filters=filters, + fuzziness=fuzziness, + top_k=top_k, + scale_score=scale_score, + all_terms_must_match=all_terms_must_match, ) return {"documents": docs} diff --git a/integrations/opensearch/src/opensearch_haystack/document_store.py b/integrations/opensearch/src/opensearch_haystack/document_store.py index fe8495fb0..e4167f777 100644 --- a/integrations/opensearch/src/opensearch_haystack/document_store.py +++ b/integrations/opensearch/src/opensearch_haystack/document_store.py @@ -221,6 +221,7 @@ def _bm25_retrieval( fuzziness: str = "AUTO", top_k: int = 10, scale_score: bool = False, + all_terms_must_match: bool = False, ) -> List[Document]: """ OpenSearch by defaults uses BM25 search algorithm. @@ -234,13 +235,13 @@ def _bm25_retrieval( `query` must be a non empty string, otherwise a `ValueError` will be raised. :param query: String to search in saved Documents' text. - :param filters: Filters applied to the retrieved Documents, for more info - see `OpenSearchDocumentStore.filter_documents`, defaults to None + :param filters: Optional filters to narrow down the search space. :param fuzziness: Fuzziness parameter passed to OpenSearch, defaults to "AUTO". 
see the official documentation for valid values: https://www.elastic.co/guide/en/OpenSearch/reference/current/common-options.html#fuzziness :param top_k: Maximum number of Documents to return, defaults to 10 :param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False + :param all_terms_must_match: If `True` all terms in `query` must be present in the Document, defaults to False :raises ValueError: If `query` is an empty string :return: List of Document that match `query` """ @@ -249,6 +250,7 @@ def _bm25_retrieval( msg = "query must be a non empty string" raise ValueError(msg) + operator = "AND" if all_terms_must_match else "OR" body: Dict[str, Any] = { "size": top_k, "query": { @@ -259,7 +261,7 @@ def _bm25_retrieval( "query": query, "fuzziness": fuzziness, "type": "most_fields", - "operator": "AND", + "operator": operator, } } ] diff --git a/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py index 9bbc2a7a3..427920e8a 100644 --- a/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py +++ b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py @@ -54,16 +54,23 @@ def from_dict(cls, data: Dict[str, Any]) -> "OpenSearchEmbeddingRetriever": return default_from_dict(cls, data) @component.output_types(documents=List[Document]) - def run(self, query_embedding: List[float]): + def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] = None, top_k: Optional[int] = None): """ Retrieve documents using a vector similarity metric. :param query_embedding: Embedding of the query. + :param filters: Optional filters to narrow down the search space. + :param top_k: Maximum number of Documents to return. :return: List of Document similar to `query_embedding`. 
""" + if filters is None: + filters = self._filters + if top_k is None: + top_k = self._top_k + docs = self._document_store._embedding_retrieval( query_embedding=query_embedding, - filters=self._filters, - top_k=self._top_k, + filters=filters, + top_k=top_k, ) return {"documents": docs} diff --git a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index cfea2d767..c552113c9 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -72,6 +72,64 @@ def test_run(): fuzziness="AUTO", top_k=10, scale_score=False, + all_terms_must_match=False, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + + +def test_run_init_params(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._bm25_retrieval.return_value = [Document(content="Test doc")] + retriever = OpenSearchBM25Retriever( + document_store=mock_store, + filters={"from": "init"}, + all_terms_must_match=True, + scale_score=True, + top_k=11, + fuzziness="1", + ) + res = retriever.run(query="some query") + mock_store._bm25_retrieval.assert_called_once_with( + query="some query", + filters={"from": "init"}, + fuzziness="1", + top_k=11, + scale_score=True, + all_terms_must_match=True, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + + +def test_run_time_params(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._bm25_retrieval.return_value = [Document(content="Test doc")] + retriever = OpenSearchBM25Retriever( + document_store=mock_store, + filters={"from": "init"}, + all_terms_must_match=True, + scale_score=True, + top_k=11, + fuzziness="1", + ) + res = retriever.run( + query="some query", + filters={"from": "run"}, + all_terms_must_match=False, + scale_score=False, + top_k=9, + fuzziness="2", + ) + mock_store._bm25_retrieval.assert_called_once_with( + query="some query", + filters={"from": "run"}, + fuzziness="2", + top_k=9, + scale_score=False, + all_terms_must_match=False, ) assert len(res) == 1 assert len(res["documents"]) == 1 diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 4678cca70..8f6e0a13c 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -167,6 +167,52 @@ def test_bm25_retrieval_pagination(self, document_store: OpenSearchDocumentStore assert len(res) == 11 assert all("programming" in doc.content for doc in res) + def test_bm25_retrieval_all_terms_must_match(self, document_store: OpenSearchDocumentStore): + document_store.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + ] + ) + + res = 
document_store._bm25_retrieval("functional Haskell", top_k=3, all_terms_must_match=True) + assert len(res) == 1 + assert "Haskell is a functional programming language" in res[0].content + + def test_bm25_retrieval_all_terms_must_match_false(self, document_store: OpenSearchDocumentStore): + document_store.write_documents( + [ + Document(content="Haskell is a functional programming language"), + Document(content="Lisp is a functional programming language"), + Document(content="Exilir is a functional programming language"), + Document(content="F# is a functional programming language"), + Document(content="C# is a functional programming language"), + Document(content="C++ is an object oriented programming language"), + Document(content="Dart is an object oriented programming language"), + Document(content="Go is an object oriented programming language"), + Document(content="Python is a object oriented programming language"), + Document(content="Ruby is a object oriented programming language"), + Document(content="PHP is a object oriented programming language"), + ] + ) + + res = document_store._bm25_retrieval("functional Haskell", top_k=10, all_terms_must_match=False) + assert len(res) == 5 + assert "functional" in res[0].content + assert "functional" in res[1].content + assert "functional" in res[2].content + assert "functional" in res[3].content + assert "functional" in res[4].content + def test_bm25_retrieval_with_fuzziness(self, document_store: OpenSearchDocumentStore): document_store.write_documents( [ diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py index 2bfe5761a..f97dd6e9a 100644 --- a/integrations/opensearch/tests/test_embedding_retriever.py +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -68,3 +68,35 @@ def test_run(): assert len(res["documents"]) == 1 assert res["documents"][0].content == "Test doc" assert res["documents"][0].embedding == [0.1, 0.2] + + +def test_run_init_params(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] + retriever = OpenSearchEmbeddingRetriever(document_store=mock_store, filters={"from": "init"}, top_k=11) + res = retriever.run(query_embedding=[0.5, 0.7]) + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.5, 0.7], + filters={"from": "init"}, + top_k=11, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + assert res["documents"][0].embedding == [0.1, 0.2] + + +def test_run_time_params(): + mock_store = Mock(spec=OpenSearchDocumentStore) + mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])] + retriever = OpenSearchEmbeddingRetriever(document_store=mock_store, filters={"from": "init"}, top_k=11) + res = retriever.run(query_embedding=[0.5, 0.7], filters={"from": "run"}, top_k=9) + mock_store._embedding_retrieval.assert_called_once_with( + query_embedding=[0.5, 0.7], + filters={"from": "run"}, + top_k=9, + ) + assert len(res) == 1 + assert len(res["documents"]) == 1 + assert res["documents"][0].content == "Test doc" + assert res["documents"][0].embedding == [0.1, 0.2] From 449ba201c43e1063305a1279fcb48f94ebd4501d Mon Sep 17 00:00:00 2001 From: tstadel <60758086+tstadel@users.noreply.github.com> Date: Thu, 30 Nov 2023 15:57:18 +0100 Subject: [PATCH 21/36] build: bump OpenSearch integration version to 0.0.2 (#71) --- 
integrations/opensearch/src/opensearch_haystack/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/opensearch/src/opensearch_haystack/__about__.py b/integrations/opensearch/src/opensearch_haystack/__about__.py index f102a9cad..3b93d0be0 100644 --- a/integrations/opensearch/src/opensearch_haystack/__about__.py +++ b/integrations/opensearch/src/opensearch_haystack/__about__.py @@ -1 +1 @@ -__version__ = "0.0.1" +__version__ = "0.0.2" From cbc3bfcc258658931a5b33feeaa6b2df8cd86098 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 1 Dec 2023 21:13:02 +0100 Subject: [PATCH 22/36] bump instructor_embedders version --- .../instructor-embedders/instructor_embedders/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/instructor-embedders/instructor_embedders/__about__.py b/integrations/instructor-embedders/instructor_embedders/__about__.py index d4a92df1b..bccfd8317 100644 --- a/integrations/instructor-embedders/instructor_embedders/__about__.py +++ b/integrations/instructor-embedders/instructor_embedders/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.2" +__version__ = "0.1.0" From d3aa338175e2bea9ba0b18036f43f78d704c7cd9 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sat, 2 Dec 2023 16:20:05 +0100 Subject: [PATCH 23/36] Add Gradient Integration (#72) * Remove preview folder in test/ --------- Co-authored-by: Silvano Cerza * Fix all tests * remove unit marker (#6450) * Gradient embeddings (#2) * apply pr requests * apply the changes (#10) * update tests * more updates * more updates * prepare * remove commits migration artifacts * create package structure * run tests in CI * fix project file * add integration test * fix complaining linter * fix skip call --------- Co-authored-by: Silvano Cerza Co-authored-by: Mateusz Haligowski Co-authored-by: Mateusz Haligowski --- .github/workflows/gradient.yml | 56 ++++++ integrations/gradient/LICENSE.txt | 9 + integrations/gradient/README.md | 21 +++ integrations/gradient/pyproject.toml | 169 ++++++++++++++++++ .../src/gradient_haystack/__about__.py | 4 + .../src/gradient_haystack/__init__.py | 3 + .../gradient_haystack/embedders/__init__.py | 3 + .../embedders/gradient_document_embedder.py | 112 ++++++++++++ .../embedders/gradient_text_embedder.py | 89 +++++++++ .../gradient_haystack/generator/__init__.py | 3 + .../src/gradient_haystack/generator/base.py | 129 +++++++++++++ integrations/gradient/tests/__init__.py | 3 + .../tests/test_gradient_document_embedder.py | 158 ++++++++++++++++ .../tests/test_gradient_rag_pipelines.py | 93 ++++++++++ .../tests/test_gradient_text_embedder.py | 127 +++++++++++++ 15 files changed, 979 insertions(+) create mode 100644 .github/workflows/gradient.yml create mode 100644 integrations/gradient/LICENSE.txt create mode 100644 integrations/gradient/README.md create mode 100644 integrations/gradient/pyproject.toml create mode 100644 integrations/gradient/src/gradient_haystack/__about__.py create mode 100644 integrations/gradient/src/gradient_haystack/__init__.py create mode 100644 integrations/gradient/src/gradient_haystack/embedders/__init__.py create mode 100644 integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py create mode 100644 integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py create mode 100644 integrations/gradient/src/gradient_haystack/generator/__init__.py create mode 
100644 integrations/gradient/src/gradient_haystack/generator/base.py create mode 100644 integrations/gradient/tests/__init__.py create mode 100644 integrations/gradient/tests/test_gradient_document_embedder.py create mode 100644 integrations/gradient/tests/test_gradient_rag_pipelines.py create mode 100644 integrations/gradient/tests/test_gradient_text_embedder.py diff --git a/.github/workflows/gradient.yml b/.github/workflows/gradient.yml new file mode 100644 index 000000000..f717ba2c9 --- /dev/null +++ b/.github/workflows/gradient.yml @@ -0,0 +1,56 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / gradient + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - 'integrations/gradient/**' + - '.github/workflows/gradient.yml' + +defaults: + run: + working-directory: integrations/gradient + +concurrency: + group: gradient-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10'] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . + run: git config --system core.longpaths true + + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run lint:all + + - name: Run tests + run: hatch run cov \ No newline at end of file diff --git a/integrations/gradient/LICENSE.txt b/integrations/gradient/LICENSE.txt new file mode 100644 index 000000000..cf4129e2b --- /dev/null +++ b/integrations/gradient/LICENSE.txt @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2023-present Massimiliano Pippi + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
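
The Gradient integration introduced in this commit adds a `GradientDocumentEmbedder` component (plus a matching text embedder and generator, per the file list above) that computes Document embeddings via the Gradient AI API. Below is a minimal usage sketch; note that the constructor arguments shown (`access_token`, `workspace_id`) are illustrative assumptions and are not defined anywhere in this patch, so check the integration's README and the Gradient SDK for the real parameter names.

```python
# Minimal usage sketch for the GradientDocumentEmbedder added in this commit.
# NOTE: `access_token` and `workspace_id` are assumed, hypothetical argument
# names used only for illustration; they are not taken from this patch.
from haystack.dataclasses import Document

from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder

embedder = GradientDocumentEmbedder(
    access_token="my-gradient-token",  # hypothetical auth parameter
    workspace_id="my-workspace-id",    # hypothetical workspace parameter
)

docs = [Document(content="OpenSearch is a community-driven search engine.")]

# Haystack 2.x document embedders conventionally return the input documents
# with their `embedding` field populated, ready to be written to a store.
result = embedder.run(documents=docs)
embedded_docs = result["documents"]
```

In a full pipeline, the embedded documents would then be written to a document store (for example the OpenSearchDocumentStore from the earlier commits in this series) so that its `_embedding_retrieval` method has vectors to search over.
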
diff --git a/integrations/gradient/README.md b/integrations/gradient/README.md new file mode 100644 index 000000000..853f5b4c3 --- /dev/null +++ b/integrations/gradient/README.md @@ -0,0 +1,21 @@ +# gradient-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) + +----- + +**Table of Contents** + +- [Installation](#installation) +- [License](#license) + +## Installation + +```console +pip install gradient-haystack +``` + +## License + +`gradient-haystack` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license. diff --git a/integrations/gradient/pyproject.toml b/integrations/gradient/pyproject.toml new file mode 100644 index 000000000..afdd8ecb8 --- /dev/null +++ b/integrations/gradient/pyproject.toml @@ -0,0 +1,169 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "gradient-haystack" +dynamic = ["version"] +description = '' +readme = "README.md" +requires-python = ">=3.7" +license = "MIT" +keywords = [] +authors = [ + { name = "Mateusz Haligowski", email = "mhaligowski@gmail.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai", + "gradientai", +] + +[project.urls] +Documentation = "https://github.com/unknown/gradient-haystack#readme" +Issues = "https://github.com/unknown/gradient-haystack/issues" +Source = "https://github.com/unknown/gradient-haystack" + +[tool.hatch.version] +path = "src/gradient_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/gradient_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.black] +target-version = ["py38"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py38" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["gradient_haystack"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["gradient_haystack", "tests"] +branch = true +parallel = true +omit = [ + "src/gradient_haystack/__about__.py", +] + +[tool.coverage.paths] +gradient_haystack = ["src/gradient_haystack", "*/gradient-haystack/src/gradient_haystack"] +tests = ["tests", "*/gradient-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[[tool.mypy.overrides]] +module = [ + "gradientai.*", + "haystack.*", + "pytest.*", + "numpy.*", +] +ignore_missing_imports = true \ No newline at end of file diff --git a/integrations/gradient/src/gradient_haystack/__about__.py b/integrations/gradient/src/gradient_haystack/__about__.py new file mode 100644 index 000000000..132530b41 --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# +# SPDX-License-Identifier: MIT +__version__ = "0.0.1" diff --git a/integrations/gradient/src/gradient_haystack/__init__.py b/integrations/gradient/src/gradient_haystack/__init__.py new file mode 100644 index 000000000..bd78f6a28 --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# +# SPDX-License-Identifier: MIT diff --git a/integrations/gradient/src/gradient_haystack/embedders/__init__.py b/integrations/gradient/src/gradient_haystack/embedders/__init__.py new file mode 100644 index 000000000..bd78f6a28 --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/embedders/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# +# SPDX-License-Identifier: MIT diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py new file mode 100644 index 000000000..81a93ad2b --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py @@ -0,0 +1,112 @@ +import logging +from typing import Any, Dict, List, Optional + +from haystack import Document, component, default_to_dict +from haystack.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install gradientai'") as gradientai_import: + from gradientai import Gradient + +logger = logging.getLogger(__name__) + + +@component +class GradientDocumentEmbedder: + """ + A component for computing Document embeddings using Gradient AI API.. + The embedding of each Document is stored in the `embedding` field of the Document. 
+
+    ```python
+    embedder = GradientDocumentEmbedder(
+        access_token=gradient_access_token,
+        workspace_id=gradient_workspace_id,
+        model_name="bge-large")
+    p = Pipeline()
+    p.add_component(instance=embedder, name="document_embedder")
+    p.add_component(instance=DocumentWriter(document_store=InMemoryDocumentStore()), name="document_writer")
+    p.connect("document_embedder", "document_writer")
+    p.run({"document_embedder": {"documents": documents}})
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model_name: str = "bge-large",
+        batch_size: int = 100,
+        access_token: Optional[str] = None,
+        workspace_id: Optional[str] = None,
+        host: Optional[str] = None,
+    ) -> None:
+        """
+        Create a GradientDocumentEmbedder component.
+
+        :param model_name: The name of the model to use.
+        :param batch_size: The number of Documents to send to the Gradient API in a single request. Defaults to 100.
+        :param access_token: The Gradient access token. If not provided it's read from the environment
+            variable GRADIENT_ACCESS_TOKEN.
+        :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment
+            variable GRADIENT_WORKSPACE_ID.
+        :param host: The Gradient host. By default it uses https://api.gradient.ai/.
+        """
+        gradientai_import.check()
+        self._batch_size = batch_size
+        self._host = host
+        self._model_name = model_name
+
+        self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id)
+
+    def _get_telemetry_data(self) -> Dict[str, Any]:
+        """
+        Data that is sent to Posthog for usage analytics.
+        """
+        return {"model": self._model_name}
+
+    def to_dict(self) -> dict:
+        """
+        Serialize the component to a Python dictionary.
+        """
+        return default_to_dict(self, workspace_id=self._gradient.workspace_id, model_name=self._model_name)
+
+    def warm_up(self) -> None:
+        """
+        Load the embedding model.
+        """
+        if not hasattr(self, "_embedding_model"):
+            self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name)
+
+    def _generate_embeddings(self, documents: List[Document], batch_size: int) -> List[List[float]]:
+        """
+        Batches the documents and generates the embeddings.
+        """
+        batches = [documents[i : i + batch_size] for i in range(0, len(documents), batch_size)]
+
+        embeddings = []
+        for batch in batches:
+            response = self._embedding_model.generate_embeddings(inputs=[{"input": doc.content} for doc in batch])
+            embeddings.extend([e.embedding for e in response.embeddings])
+
+        return embeddings
+
+    @component.output_types(documents=List[Document])
+    def run(self, documents: List[Document]):
+        """
+        Embed a list of Documents.
+        The embedding of each Document is stored in the `embedding` field of the Document.
+
+        :param documents: A list of Documents to embed.
+        """
+        if not isinstance(documents, list) or documents and any(not isinstance(doc, Document) for doc in documents):
+            msg = "GradientDocumentEmbedder expects a list of Documents as input.\
+                In case you want to embed a list of strings, please use the GradientTextEmbedder."
+            raise TypeError(msg)
+
+        if not hasattr(self, "_embedding_model"):
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+
+        embeddings = self._generate_embeddings(documents=documents, batch_size=self._batch_size)
+        for doc, embedding in zip(documents, embeddings):
+            doc.embedding = embedding
+
+        return {"documents": documents}
diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py
new file mode 100644
index 000000000..53996b785
--- /dev/null
+++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py
@@ -0,0 +1,89 @@
+from typing import Any, Dict, List, Optional
+
+from haystack import component, default_to_dict
+from haystack.lazy_imports import LazyImport
+
+with LazyImport(message="Run 'pip install gradientai'") as gradientai_import:
+    from gradientai import Gradient
+
+
+@component
+class GradientTextEmbedder:
+    """
+    A component for embedding strings using models hosted on Gradient AI (https://gradient.ai).
+
+    ```python
+    embedder = GradientTextEmbedder(
+        access_token=gradient_access_token,
+        workspace_id=gradient_workspace_id,
+        model_name="bge-large")
+    p = Pipeline()
+    p.add_component(instance=embedder, name="text_embedder")
+    p.add_component(instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever")
+    p.connect("text_embedder", "retriever")
+    p.run({"text_embedder": {"text": "embed me!!!"}})
+    ```
+    """
+
+    def __init__(
+        self,
+        *,
+        model_name: str = "bge-large",
+        access_token: Optional[str] = None,
+        workspace_id: Optional[str] = None,
+        host: Optional[str] = None,
+    ) -> None:
+        """
+        Create a GradientTextEmbedder component.
+
+        :param model_name: The name of the model to use.
+        :param access_token: The Gradient access token. If not provided it's read from the environment
+            variable GRADIENT_ACCESS_TOKEN.
+        :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment
+            variable GRADIENT_WORKSPACE_ID.
+        :param host: The Gradient host. By default it uses https://api.gradient.ai/.
+        """
+        gradientai_import.check()
+        self._host = host
+        self._model_name = model_name
+
+        self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id)
+
+    def _get_telemetry_data(self) -> Dict[str, Any]:
+        """
+        Data that is sent to Posthog for usage analytics.
+        """
+        return {"model": self._model_name}
+
+    def to_dict(self) -> dict:
+        """
+        Serialize the component to a Python dictionary.
+        """
+        return default_to_dict(self, workspace_id=self._gradient.workspace_id, model_name=self._model_name)
+
+    def warm_up(self) -> None:
+        """
+        Load the embedding model.
+        """
+        if not hasattr(self, "_embedding_model"):
+            self._embedding_model = self._gradient.get_embeddings_model(slug=self._model_name)
+
+    @component.output_types(embedding=List[float])
+    def run(self, text: str):
+        """Generates an embedding for a single text."""
+        if not isinstance(text, str):
+            msg = "GradientTextEmbedder expects a string as an input.\
+                In case you want to embed a list of Documents, please use the GradientDocumentEmbedder."
+            raise TypeError(msg)
+
+        if not hasattr(self, "_embedding_model"):
+            msg = "The embedding model has not been loaded. Please call warm_up() before running."
+            raise RuntimeError(msg)
+
+        result = self._embedding_model.generate_embeddings(inputs=[{"input": text}])
+
+        if (not result) or (result.embeddings is None) or (len(result.embeddings) == 0):
+            msg = "The embedding model did not return any embeddings."
+ raise RuntimeError(msg) + + return {"embedding": result.embeddings[0].embedding} diff --git a/integrations/gradient/src/gradient_haystack/generator/__init__.py b/integrations/gradient/src/gradient_haystack/generator/__init__.py new file mode 100644 index 000000000..bd78f6a28 --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/generator/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# +# SPDX-License-Identifier: MIT diff --git a/integrations/gradient/src/gradient_haystack/generator/base.py b/integrations/gradient/src/gradient_haystack/generator/base.py new file mode 100644 index 000000000..536525377 --- /dev/null +++ b/integrations/gradient/src/gradient_haystack/generator/base.py @@ -0,0 +1,129 @@ +import logging +from typing import Any, Dict, List, Optional + +from haystack import component, default_to_dict +from haystack.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install gradientai'") as gradientai_import: + from gradientai import Gradient + +logger = logging.getLogger(__name__) + + +@component +class GradientGenerator: + """ + LLM Generator interfacing [Gradient AI](https://gradient.ai/). + + Queries the LLM using Gradient AI's SDK ('gradientai' package). + See [Gradient AI API](https://docs.gradient.ai/docs/sdk-quickstart) for more details. + + ```python + llm = GradientGenerator( + access_token=gradient_access_token, + workspace_id=gradient_workspace_id, + base_model_slug="llama2-7b-chat") + llm.warm_up() + print(llm.run(prompt="What is the meaning of life?")) + # Output: {'replies': ['42']} + ``` + """ + + def __init__( + self, + *, + access_token: Optional[str] = None, + base_model_slug: Optional[str] = None, + host: Optional[str] = None, + max_generated_token_count: Optional[int] = None, + model_adapter_id: Optional[str] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + workspace_id: Optional[str] = None, + ) -> None: + """ + Create a GradientGenerator component. + + :param access_token: The Gradient access token. If not provided it's read from the environment + variable GRADIENT_ACCESS_TOKEN. + :param base_model_slug: The base model slug to use. + :param host: The Gradient host. By default it uses https://api.gradient.ai/. + :param max_generated_token_count: The maximum number of tokens to generate. + :param model_adapter_id: The model adapter ID to use. + :param temperature: The temperature to use. + :param top_k: The top k to use. + :param top_p: The top p to use. + :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment + variable GRADIENT_WORKSPACE_ID. + """ + gradientai_import.check() + + self._access_token = access_token + self._base_model_slug = base_model_slug + self._host = host + self._max_generated_token_count = max_generated_token_count + self._model_adapter_id = model_adapter_id + self._temperature = temperature + self._top_k = top_k + self._top_p = top_p + self._workspace_id = workspace_id + + has_base_model_slug = base_model_slug is not None and base_model_slug != "" + has_model_adapter_id = model_adapter_id is not None and model_adapter_id != "" + + if not has_base_model_slug and not has_model_adapter_id: + msg = "Either base_model_slug or model_adapter_id must be provided." + raise ValueError(msg) + if has_base_model_slug and has_model_adapter_id: + msg = "Only one of base_model_slug or model_adapter_id must be provided." 
+ raise ValueError(msg) + + if has_base_model_slug: + self._base_model_slug = base_model_slug + if has_model_adapter_id: + self._model_adapter_id = model_adapter_id + + self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + base_model_slug=self._base_model_slug, + host=self._host, + max_generated_token_count=self._max_generated_token_count, + model_adapter_id=self._model_adapter_id, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + workspace_id=self._workspace_id, + ) + + def warm_up(self): + """ + Initializes the LLM model instance if it doesn't exist. + """ + if not hasattr(self, "_model"): + if isinstance(self._base_model_slug, str): + self._model = self._gradient.get_base_model(base_model_slug=self._base_model_slug) + if isinstance(self._model_adapter_id, str): + self._model = self._gradient.get_model_adapter(model_adapter_id=self._model_adapter_id) + + @component.output_types(replies=List[str]) + def run(self, prompt: str): + """ + Queries the LLM with the prompt to produce replies. + + :param prompt: The prompt to be sent to the generative model. + """ + resp = self._model.complete( + query=prompt, + max_generated_token_count=self._max_generated_token_count, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + ) + return {"replies": [resp.generated_output]} diff --git a/integrations/gradient/tests/__init__.py b/integrations/gradient/tests/__init__.py new file mode 100644 index 000000000..bd78f6a28 --- /dev/null +++ b/integrations/gradient/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# +# SPDX-License-Identifier: MIT diff --git a/integrations/gradient/tests/test_gradient_document_embedder.py b/integrations/gradient/tests/test_gradient_document_embedder.py new file mode 100644 index 000000000..bac02df5e --- /dev/null +++ b/integrations/gradient/tests/test_gradient_document_embedder.py @@ -0,0 +1,158 @@ +from unittest.mock import MagicMock, NonCallableMagicMock + +import numpy as np +import pytest +from gradientai.openapi.client.models.generate_embedding_success import GenerateEmbeddingSuccess +from haystack import Document + +from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder + +access_token = "access_token" +workspace_id = "workspace_id" +model = "bge-large" + + +class TestGradientDocumentEmbedder: + @pytest.mark.unit + def test_init_from_env(self, monkeypatch): + monkeypatch.setenv("GRADIENT_ACCESS_TOKEN", access_token) + monkeypatch.setenv("GRADIENT_WORKSPACE_ID", workspace_id) + + embedder = GradientDocumentEmbedder() + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_init_without_access_token(self, monkeypatch): + monkeypatch.delenv("GRADIENT_ACCESS_TOKEN", raising=False) + + with pytest.raises(ValueError): + GradientDocumentEmbedder(workspace_id=workspace_id) + + @pytest.mark.unit + def test_init_without_workspace(self, monkeypatch): + monkeypatch.delenv("GRADIENT_WORKSPACE_ID", raising=False) + + with pytest.raises(ValueError): + GradientDocumentEmbedder(access_token=access_token) + + @pytest.mark.unit + def test_init_from_params(self): + embedder = GradientDocumentEmbedder(access_token=access_token, 
workspace_id=workspace_id) + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_init_from_params_precedence(self, monkeypatch): + monkeypatch.setenv("GRADIENT_ACCESS_TOKEN", "env_access_token") + monkeypatch.setenv("GRADIENT_WORKSPACE_ID", "env_workspace_id") + + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_to_dict(self): + component = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + data = component.to_dict() + assert data == { + "type": "gradient_haystack.embedders.gradient_document_embedder.GradientDocumentEmbedder", + "init_parameters": {"workspace_id": workspace_id, "model_name": "bge-large"}, + } + + @pytest.mark.unit + def test_warmup(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._gradient.get_embeddings_model = MagicMock() + embedder.warm_up() + embedder._gradient.get_embeddings_model.assert_called_once_with(slug="bge-large") + + @pytest.mark.unit + def test_warmup_doesnt_reload(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._gradient.get_embeddings_model = MagicMock(default_return_value="fake model") + embedder.warm_up() + embedder.warm_up() + embedder._gradient.get_embeddings_model.assert_called_once_with(slug="bge-large") + + @pytest.mark.unit + def test_run_fail_if_not_warmed_up(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + + with pytest.raises(RuntimeError, match="warm_up()"): + embedder.run(documents=[Document(content=f"document number {i}") for i in range(5)]) + + @pytest.mark.unit + def test_run(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(5)] + ) + + documents = [Document(content=f"document number {i}") for i in range(5)] + + result = embedder.run(documents=documents) + + assert embedder._embedding_model.generate_embeddings.call_count == 1 + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + @pytest.mark.unit + def test_run_batch(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(110)] + ) + + documents = [Document(content=f"document number {i}") for i in range(110)] + + result = embedder.run(documents=documents) + + assert embedder._embedding_model.generate_embeddings.call_count == 2 + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + 
assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + @pytest.mark.unit + def test_run_custom_batch(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id, batch_size=20) + embedder._embedding_model = NonCallableMagicMock() + + document_count = 101 + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": i} for i in range(document_count)] + ) + + documents = [Document(content=f"document number {i}") for i in range(document_count)] + + result = embedder.run(documents=documents) + + assert embedder._embedding_model.generate_embeddings.call_count == 6 + assert isinstance(result["documents"], list) + assert len(result["documents"]) == len(documents) + for doc in result["documents"]: + assert isinstance(doc, Document) + assert isinstance(doc.embedding, list) + assert isinstance(doc.embedding[0], float) + + @pytest.mark.unit + def test_run_empty(self): + embedder = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + + result = embedder.run(documents=[]) + + assert result["documents"] == [] diff --git a/integrations/gradient/tests/test_gradient_rag_pipelines.py b/integrations/gradient/tests/test_gradient_rag_pipelines.py new file mode 100644 index 000000000..5835944a8 --- /dev/null +++ b/integrations/gradient/tests/test_gradient_rag_pipelines.py @@ -0,0 +1,93 @@ +import json +import os + +import pytest +from haystack import Document, Pipeline +from haystack.components.builders.answer_builder import AnswerBuilder +from haystack.components.builders.prompt_builder import PromptBuilder +from haystack.components.retrievers import InMemoryEmbeddingRetriever +from haystack.components.writers import DocumentWriter +from haystack.document_stores import InMemoryDocumentStore + +from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder +from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder +from gradient_haystack.generator.base import GradientGenerator + + +@pytest.mark.skipif( + not os.environ.get("GRADIENT_ACCESS_TOKEN", None) or not os.environ.get("GRADIENT_WORKSPACE_ID", None), + reason="Export env variables called GRADIENT_ACCESS_TOKEN and GRADIENT_WORKSPACE_ID \ + containing the Gradient configuration settings to run this test.", +) +def test_gradient_embedding_retrieval_rag_pipeline(tmp_path): + # Create the RAG pipeline + prompt_template = """ + Given these documents, answer the question.\nDocuments: + {% for doc in documents %} + {{ doc.content }} + {% endfor %} + \nQuestion: {{question}} + \nAnswer: + """ + + gradient_access_token = os.environ.get("GRADIENT_ACCESS_TOKEN") + rag_pipeline = Pipeline() + embedder = GradientTextEmbedder(access_token=gradient_access_token) + rag_pipeline.add_component(instance=embedder, name="text_embedder") + rag_pipeline.add_component( + instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever" + ) + rag_pipeline.add_component(instance=PromptBuilder(template=prompt_template), name="prompt_builder") + rag_pipeline.add_component( + instance=GradientGenerator(access_token=gradient_access_token, base_model_slug="llama2-7b-chat"), name="llm" + ) + rag_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder") + rag_pipeline.connect("text_embedder", "retriever") + 
rag_pipeline.connect("retriever", "prompt_builder.documents") + rag_pipeline.connect("prompt_builder", "llm") + rag_pipeline.connect("llm.replies", "answer_builder.replies") + rag_pipeline.connect("retriever", "answer_builder.documents") + + # Draw the pipeline + rag_pipeline.draw(tmp_path / "test_gradient_embedding_rag_pipeline.png") + + # Serialize the pipeline to JSON + with open(tmp_path / "test_bm25_rag_pipeline.json", "w") as f: + json.dump(rag_pipeline.to_dict(), f) + + # Load the pipeline back + with open(tmp_path / "test_bm25_rag_pipeline.json") as f: + rag_pipeline = Pipeline.from_dict(json.load(f)) + + # Populate the document store + documents = [ + Document(content="My name is Jean and I live in Paris."), + Document(content="My name is Mark and I live in Berlin."), + Document(content="My name is Giorgio and I live in Rome."), + ] + document_store = rag_pipeline.get_component("retriever").document_store + indexing_pipeline = Pipeline() + indexing_pipeline.add_component(instance=GradientDocumentEmbedder(), name="document_embedder") + indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="document_writer") + indexing_pipeline.connect("document_embedder", "document_writer") + indexing_pipeline.run({"document_embedder": {"documents": documents}}) + + # Query and assert + questions = ["Who lives in Paris?", "Who lives in Berlin?", "Who lives in Rome?"] + answers_spywords = ["Jean", "Mark", "Giorgio"] + + for question, spyword in zip(questions, answers_spywords): + result = rag_pipeline.run( + { + "text_embedder": {"text": question}, + "prompt_builder": {"question": question}, + "answer_builder": {"query": question}, + } + ) + + assert len(result["answer_builder"]["answers"]) == 1 + generated_answer = result["answer_builder"]["answers"][0] + assert spyword in generated_answer.data + assert generated_answer.query == question + assert hasattr(generated_answer, "documents") + assert hasattr(generated_answer, "metadata") diff --git a/integrations/gradient/tests/test_gradient_text_embedder.py b/integrations/gradient/tests/test_gradient_text_embedder.py new file mode 100644 index 000000000..9623db5d4 --- /dev/null +++ b/integrations/gradient/tests/test_gradient_text_embedder.py @@ -0,0 +1,127 @@ +from unittest.mock import MagicMock, NonCallableMagicMock + +import numpy as np +import pytest +from gradientai.openapi.client.models.generate_embedding_success import GenerateEmbeddingSuccess + +from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder + +access_token = "access_token" +workspace_id = "workspace_id" +model = "bge-large" + + +class TestGradientTextEmbedder: + @pytest.mark.unit + def test_init_from_env(self, monkeypatch): + monkeypatch.setenv("GRADIENT_ACCESS_TOKEN", access_token) + monkeypatch.setenv("GRADIENT_WORKSPACE_ID", workspace_id) + + embedder = GradientTextEmbedder() + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_init_without_access_token(self, monkeypatch): + monkeypatch.delenv("GRADIENT_ACCESS_TOKEN", raising=False) + + with pytest.raises(ValueError): + GradientTextEmbedder(workspace_id=workspace_id) + + @pytest.mark.unit + def test_init_without_workspace(self, monkeypatch): + monkeypatch.delenv("GRADIENT_WORKSPACE_ID", raising=False) + + with pytest.raises(ValueError): + GradientTextEmbedder(access_token=access_token) + + @pytest.mark.unit + def 
test_init_from_params(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_init_from_params_precedence(self, monkeypatch): + monkeypatch.setenv("GRADIENT_ACCESS_TOKEN", "env_access_token") + monkeypatch.setenv("GRADIENT_WORKSPACE_ID", "env_workspace_id") + + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + assert embedder is not None + assert embedder._gradient.workspace_id == workspace_id + assert embedder._gradient._api_client.configuration.access_token == access_token + + @pytest.mark.unit + def test_to_dict(self): + component = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + data = component.to_dict() + assert data == { + "type": "gradient_haystack.embedders.gradient_text_embedder.GradientTextEmbedder", + "init_parameters": {"workspace_id": workspace_id, "model_name": "bge-large"}, + } + + @pytest.mark.unit + def test_warmup(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._gradient.get_embeddings_model = MagicMock() + embedder.warm_up() + embedder._gradient.get_embeddings_model.assert_called_once_with(slug="bge-large") + + @pytest.mark.unit + def test_warmup_doesnt_reload(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._gradient.get_embeddings_model = MagicMock(default_return_value="fake model") + embedder.warm_up() + embedder.warm_up() + embedder._gradient.get_embeddings_model.assert_called_once_with(slug="bge-large") + + @pytest.mark.unit + def test_run_fail_if_not_warmed_up(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + + with pytest.raises(RuntimeError, match="warm_up()"): + embedder.run(text="The food was delicious") + + @pytest.mark.unit + def test_run_fail_when_no_embeddings_returned(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess(embeddings=[]) + + with pytest.raises(RuntimeError): + _result = embedder.run(text="The food was delicious") + embedder._embedding_model.generate_embeddings.assert_called_once_with( + inputs=[{"input": "The food was delicious"}] + ) + + @pytest.mark.unit + def test_run_empty_string(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": 0}] + ) + + result = embedder.run(text="") + embedder._embedding_model.generate_embeddings.assert_called_once_with(inputs=[{"input": ""}]) + + assert len(result["embedding"]) == 1024 # 1024 is the bge-large embedding size + assert all(isinstance(x, float) for x in result["embedding"]) + + @pytest.mark.unit + def test_run(self): + embedder = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) + embedder._embedding_model = NonCallableMagicMock() + embedder._embedding_model.generate_embeddings.return_value = GenerateEmbeddingSuccess( + embeddings=[{"embedding": np.random.rand(1024).tolist(), "index": 
0}] + ) + + result = embedder.run(text="The food was delicious") + embedder._embedding_model.generate_embeddings.assert_called_once_with( + inputs=[{"input": "The food was delicious"}] + ) + + assert len(result["embedding"]) == 1024 # 1024 is the bge-large embedding size + assert all(isinstance(x, float) for x in result["embedding"]) From 8d0342f47dd0f54f9c36d600179e5635d91f690b Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sat, 2 Dec 2023 16:22:42 +0100 Subject: [PATCH 24/36] add gradient to the readme --- README.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 978be06d8..c634f466b 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,14 @@ This repository contains integrations to extend the capabilities of [Haystack](https://github.com/deepset-ai/haystack) version 2.0 and onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), see each integration's `README` file for details around installation, usage and support. -| Package | Type | PyPi Package | Status | -| ----------------------------------------------------------------- | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | -| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | -| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | -| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | -| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / 
unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) +| Package | Type | PyPi Package | Status | +| ------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | +| [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | +| [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | +| [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | +| [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | +| [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) | ## Contributing From 82e0b56200d79b31493756e18bf85787a4f6beec Mon Sep 17 00:00:00 2001 From: 
Massimiliano Pippi Date: Sat, 2 Dec 2023 16:23:48 +0100 Subject: [PATCH 25/36] bump gradient version --- integrations/gradient/src/gradient_haystack/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/gradient/src/gradient_haystack/__about__.py b/integrations/gradient/src/gradient_haystack/__about__.py index 132530b41..b89a8a23d 100644 --- a/integrations/gradient/src/gradient_haystack/__about__.py +++ b/integrations/gradient/src/gradient_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present Massimiliano Pippi # # SPDX-License-Identifier: MIT -__version__ = "0.0.1" +__version__ = "0.1.0" From ec7ee2c5fc6c86a37dc2ff35ccde93618ed3ed2a Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 09:14:26 +0100 Subject: [PATCH 26/36] fix license headers --- integrations/chroma/LICENSE | 2 +- integrations/chroma/tests/test_retriever.py | 3 + integrations/elasticsearch/LICENSE | 2 +- .../src/elasticsearch_haystack/__about__.py | 2 +- .../src/elasticsearch_haystack/__init__.py | 2 +- .../elasticsearch_haystack/bm25_retriever.py | 2 +- .../elasticsearch_haystack/document_store.py | 2 +- .../src/elasticsearch_haystack/filters.py | 3 + integrations/elasticsearch/tests/__init__.py | 2 +- .../tests/test_bm25_retriever.py | 2 +- .../tests/test_document_store.py | 2 +- .../elasticsearch/tests/test_filters.py | 3 + integrations/gradient/LICENSE.txt | 202 +++++++++++++++++- integrations/gradient/README.md | 7 +- integrations/gradient/pyproject.toml | 2 +- .../src/gradient_haystack/__about__.py | 4 +- .../src/gradient_haystack/__init__.py | 4 +- .../gradient_haystack/embedders/__init__.py | 4 +- .../gradient_haystack/generator/__init__.py | 4 +- integrations/gradient/tests/__init__.py | 4 +- integrations/instructor-embedders/LICENSE.txt | 2 +- .../embedding_backend/instructor_backend.py | 3 + .../instructor_document_embedder.py | 3 + .../instructor_text_embedder.py | 3 + integrations/opensearch/LICENSE | 2 +- .../src/opensearch_haystack/__about__.py | 3 + .../src/opensearch_haystack/__init__.py | 3 + .../src/opensearch_haystack/bm25_retriever.py | 3 + .../src/opensearch_haystack/document_store.py | 3 + .../embedding_retriever.py | 3 + .../src/opensearch_haystack/filters.py | 3 + integrations/opensearch/tests/__init__.py | 3 + .../opensearch/tests/test_bm25_retriever.py | 3 + .../opensearch/tests/test_document_store.py | 3 + .../tests/test_embedding_retriever.py | 3 + integrations/opensearch/tests/test_filters.py | 3 + .../unstructured/fileconverter/LICENSE | 2 +- .../fileconverter.py | 3 + .../fileconverter/tests/test_fileconverter.py | 3 + 39 files changed, 281 insertions(+), 31 deletions(-) diff --git a/integrations/chroma/LICENSE b/integrations/chroma/LICENSE index 261eeb9e9..de4c7f39f 100644 --- a/integrations/chroma/LICENSE +++ b/integrations/chroma/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2023 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/integrations/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py index b77dd4ca4..c82b8ee0b 100644 --- a/integrations/chroma/tests/test_retriever.py +++ b/integrations/chroma/tests/test_retriever.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import pytest from chroma_haystack.document_store import ChromaDocumentStore diff --git a/integrations/elasticsearch/LICENSE b/integrations/elasticsearch/LICENSE index 261eeb9e9..de4c7f39f 100644 --- a/integrations/elasticsearch/LICENSE +++ b/integrations/elasticsearch/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2023 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py index 2faac960f..d4a92df1b 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 __version__ = "0.0.2" diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py index af32a762d..0c9feacb2 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py index 804e8db15..df85cf1a1 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py index 6dae14341..17c2fa4af 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 import logging diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py index bb5b15311..b5adc37db 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/filters.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from 
datetime import datetime from typing import Any, Dict, List diff --git a/integrations/elasticsearch/tests/__init__.py b/integrations/elasticsearch/tests/__init__.py index ec55bfc66..e873bc332 100644 --- a/integrations/elasticsearch/tests/__init__.py +++ b/integrations/elasticsearch/tests/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py index 8f19c8897..bc1fc55bb 100644 --- a/integrations/elasticsearch/tests/test_bm25_retriever.py +++ b/integrations/elasticsearch/tests/test_bm25_retriever.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index d6428e762..6a69b6e4b 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Silvano Cerza +# SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/elasticsearch/tests/test_filters.py b/integrations/elasticsearch/tests/test_filters.py index 6db6a0dd2..3cf125fc7 100644 --- a/integrations/elasticsearch/tests/test_filters.py +++ b/integrations/elasticsearch/tests/test_filters.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import pytest from haystack.errors import FilterError diff --git a/integrations/gradient/LICENSE.txt b/integrations/gradient/LICENSE.txt index cf4129e2b..de4c7f39f 100644 --- a/integrations/gradient/LICENSE.txt +++ b/integrations/gradient/LICENSE.txt @@ -1,9 +1,201 @@ -MIT License + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ -Copyright (c) 2023-present Massimiliano Pippi + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + 1. Definitions. -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023 deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/integrations/gradient/README.md b/integrations/gradient/README.md index 853f5b4c3..e1a46114b 100644 --- a/integrations/gradient/README.md +++ b/integrations/gradient/README.md @@ -7,8 +7,9 @@ **Table of Contents** -- [Installation](#installation) -- [License](#license) +- [gradient-haystack](#gradient-haystack) + - [Installation](#installation) + - [License](#license) ## Installation @@ -18,4 +19,4 @@ pip install gradient-haystack ## License -`gradient-haystack` is distributed under the terms of the [MIT](https://spdx.org/licenses/MIT.html) license. +`gradient-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/gradient/pyproject.toml b/integrations/gradient/pyproject.toml index afdd8ecb8..79a39a384 100644 --- a/integrations/gradient/pyproject.toml +++ b/integrations/gradient/pyproject.toml @@ -8,7 +8,7 @@ dynamic = ["version"] description = '' readme = "README.md" requires-python = ">=3.7" -license = "MIT" +license = "Apache-2.0" keywords = [] authors = [ { name = "Mateusz Haligowski", email = "mhaligowski@gmail.com" }, diff --git a/integrations/gradient/src/gradient_haystack/__about__.py b/integrations/gradient/src/gradient_haystack/__about__.py index b89a8a23d..bccfd8317 100644 --- a/integrations/gradient/src/gradient_haystack/__about__.py +++ b/integrations/gradient/src/gradient_haystack/__about__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# SPDX-FileCopyrightText: 2023-present deepset GmbH # -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: Apache-2.0 __version__ = "0.1.0" diff --git a/integrations/gradient/src/gradient_haystack/__init__.py b/integrations/gradient/src/gradient_haystack/__init__.py index bd78f6a28..e873bc332 100644 --- a/integrations/gradient/src/gradient_haystack/__init__.py +++ b/integrations/gradient/src/gradient_haystack/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# SPDX-FileCopyrightText: 2023-present deepset GmbH # -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/gradient/src/gradient_haystack/embedders/__init__.py b/integrations/gradient/src/gradient_haystack/embedders/__init__.py index bd78f6a28..e873bc332 100644 --- a/integrations/gradient/src/gradient_haystack/embedders/__init__.py +++ b/integrations/gradient/src/gradient_haystack/embedders/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# SPDX-FileCopyrightText: 2023-present deepset GmbH # -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/gradient/src/gradient_haystack/generator/__init__.py b/integrations/gradient/src/gradient_haystack/generator/__init__.py index bd78f6a28..e873bc332 100644 --- a/integrations/gradient/src/gradient_haystack/generator/__init__.py +++ b/integrations/gradient/src/gradient_haystack/generator/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# SPDX-FileCopyrightText: 2023-present deepset GmbH # -# SPDX-License-Identifier: MIT +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/gradient/tests/__init__.py b/integrations/gradient/tests/__init__.py index bd78f6a28..e873bc332 100644 --- a/integrations/gradient/tests/__init__.py +++ b/integrations/gradient/tests/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2023-present Massimiliano Pippi +# SPDX-FileCopyrightText: 2023-present deepset GmbH # -# 
SPDX-License-Identifier: MIT +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/instructor-embedders/LICENSE.txt b/integrations/instructor-embedders/LICENSE.txt index 137069b82..ba46f22b9 100644 --- a/integrations/instructor-embedders/LICENSE.txt +++ b/integrations/instructor-embedders/LICENSE.txt @@ -58,7 +58,7 @@ APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. -Copyright [yyyy] [name of copyright owner] +Copyright 2023 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py b/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py index b71f9ffdc..5be300dd3 100644 --- a/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py +++ b/integrations/instructor-embedders/instructor_embedders/embedding_backend/instructor_backend.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from typing import ClassVar, Dict, List, Optional, Union from haystack.lazy_imports import LazyImport diff --git a/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py index 31b6a2f6a..ba3f6c9b3 100644 --- a/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py +++ b/integrations/instructor-embedders/instructor_embedders/instructor_document_embedder.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional, Union from haystack import Document, component, default_from_dict, default_to_dict diff --git a/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py b/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py index 3a19f860d..043d562d5 100644 --- a/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py +++ b/integrations/instructor-embedders/instructor_embedders/instructor_text_embedder.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional, Union from haystack import component, default_from_dict, default_to_dict diff --git a/integrations/opensearch/LICENSE b/integrations/opensearch/LICENSE index 261eeb9e9..de4c7f39f 100644 --- a/integrations/opensearch/LICENSE +++ b/integrations/opensearch/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2023 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/integrations/opensearch/src/opensearch_haystack/__about__.py b/integrations/opensearch/src/opensearch_haystack/__about__.py index 3b93d0be0..d4a92df1b 100644 --- a/integrations/opensearch/src/opensearch_haystack/__about__.py +++ b/integrations/opensearch/src/opensearch_haystack/__about__.py @@ -1 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 __version__ = "0.0.2" diff --git a/integrations/opensearch/src/opensearch_haystack/__init__.py b/integrations/opensearch/src/opensearch_haystack/__init__.py index 7112ecda6..a15411693 100644 --- a/integrations/opensearch/src/opensearch_haystack/__init__.py +++ b/integrations/opensearch/src/opensearch_haystack/__init__.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from opensearch_haystack.bm25_retriever import OpenSearchBM25Retriever from opensearch_haystack.document_store import OpenSearchDocumentStore from opensearch_haystack.embedding_retriever import OpenSearchEmbeddingRetriever diff --git a/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py index 91a133345..34184dc19 100644 --- a/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py +++ b/integrations/opensearch/src/opensearch_haystack/bm25_retriever.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional from haystack import component, default_from_dict, default_to_dict diff --git a/integrations/opensearch/src/opensearch_haystack/document_store.py b/integrations/opensearch/src/opensearch_haystack/document_store.py index e4167f777..2cc06c680 100644 --- a/integrations/opensearch/src/opensearch_haystack/document_store.py +++ b/integrations/opensearch/src/opensearch_haystack/document_store.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import logging from typing import Any, Dict, List, Mapping, Optional, Union diff --git a/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py index 427920e8a..b6293fc52 100644 --- a/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py +++ b/integrations/opensearch/src/opensearch_haystack/embedding_retriever.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from typing import Any, Dict, List, Optional from haystack import component, default_from_dict, default_to_dict diff --git a/integrations/opensearch/src/opensearch_haystack/filters.py b/integrations/opensearch/src/opensearch_haystack/filters.py index 8f5418145..415304ec1 100644 --- a/integrations/opensearch/src/opensearch_haystack/filters.py +++ b/integrations/opensearch/src/opensearch_haystack/filters.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from datetime import datetime from typing import Any, Dict, List diff --git a/integrations/opensearch/tests/__init__.py b/integrations/opensearch/tests/__init__.py index e69de29bb..e873bc332 100644 --- a/integrations/opensearch/tests/__init__.py +++ b/integrations/opensearch/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git 
a/integrations/opensearch/tests/test_bm25_retriever.py b/integrations/opensearch/tests/test_bm25_retriever.py index c552113c9..b043e38d4 100644 --- a/integrations/opensearch/tests/test_bm25_retriever.py +++ b/integrations/opensearch/tests/test_bm25_retriever.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch from haystack.dataclasses import Document diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index 8f6e0a13c..af26dea49 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import random from typing import List from unittest.mock import patch diff --git a/integrations/opensearch/tests/test_embedding_retriever.py b/integrations/opensearch/tests/test_embedding_retriever.py index f97dd6e9a..db360d757 100644 --- a/integrations/opensearch/tests/test_embedding_retriever.py +++ b/integrations/opensearch/tests/test_embedding_retriever.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from unittest.mock import Mock, patch from haystack.dataclasses import Document diff --git a/integrations/opensearch/tests/test_filters.py b/integrations/opensearch/tests/test_filters.py index 09968121a..34a7682d5 100644 --- a/integrations/opensearch/tests/test_filters.py +++ b/integrations/opensearch/tests/test_filters.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import pytest from haystack.errors import FilterError diff --git a/integrations/unstructured/fileconverter/LICENSE b/integrations/unstructured/fileconverter/LICENSE index 261eeb9e9..de4c7f39f 100644 --- a/integrations/unstructured/fileconverter/LICENSE +++ b/integrations/unstructured/fileconverter/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2023 deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py index 0f65365f9..5a565d00b 100644 --- a/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py +++ b/integrations/unstructured/fileconverter/src/unstructured_fileconverter_haystack/fileconverter.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 import logging import os from collections import defaultdict diff --git a/integrations/unstructured/fileconverter/tests/test_fileconverter.py b/integrations/unstructured/fileconverter/tests/test_fileconverter.py index 07c7be1f4..a9c724cba 100644 --- a/integrations/unstructured/fileconverter/tests/test_fileconverter.py +++ b/integrations/unstructured/fileconverter/tests/test_fileconverter.py @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 from pathlib import Path import pytest From e1140fb1f7f9a0dd989f3c0fe1d1c9c3fffbffdd Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 09:16:43 +0100 Subject: [PATCH 27/36] stop running nightlies --- .github/workflows/nodes_text2speech.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/nodes_text2speech.yml b/.github/workflows/nodes_text2speech.yml index 315215d0d..555ad3c0d 100644 --- a/.github/workflows/nodes_text2speech.yml +++ b/.github/workflows/nodes_text2speech.yml @@ -1,8 +1,6 @@ name: Test / text2speech on: - schedule: - - cron: "0 0 * * *" pull_request: paths: - 'integrations/nodes/text2speech/**' From 44043d1e5367e49132b57613314bdfa3b78e9b7c Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 17:13:51 +0100 Subject: [PATCH 28/36] update contribution guidelines --- .pre-commit-config.yaml | 32 ----------------- README.md | 56 +++++++++++++++++++++++++++--- {nodes => integrations}/hatch.toml | 3 -- 3 files changed, 52 insertions(+), 39 deletions(-) delete mode 100644 .pre-commit-config.yaml rename {nodes => integrations}/hatch.toml (72%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 29fb254af..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 -fail_fast: true - -repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 - hooks: - - id: check-ast # checks Python syntax - - id: check-json # checks JSON syntax - - id: check-yaml # checks YAML syntax - - id: check-toml # checks TOML syntax - #- id: end-of-file-fixer # checks there is a newline at the end of the file # FIXME: JSON schema generator conflicts with this - - id: trailing-whitespace # trims trailing whitespace - - id: check-merge-conflict # checks for no merge conflict strings - - id: check-shebang-scripts-are-executable # checks all shell scripts have executable permissions - - id: mixed-line-ending # normalizes line endings - - id: no-commit-to-branch # prevents committing to main - #- id: pretty-format-json # indents and sorts JSON files # FIXME: JSON schema generator conflicts with this - -- repo: https://github.com/psf/black - rev: 22.8.0 # IMPORTANT: keep this aligned with the black version in pyproject.toml - hooks: - - id: black-jupyter - files: nodes/ - -- repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.971' - hooks: 
- - id: mypy - files: nodes/ \ No newline at end of file diff --git a/README.md b/README.md index c634f466b..e0aae3b1f 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,58 @@ This repository contains integrations to extend the capabilities of [Haystack](https://github.com/deepset-ai/haystack) version 2.0 and onwards. The code in this repo is maintained by [deepset](https://www.deepset.ai), see each integration's `README` file for details around installation, usage and support. +## Contributing + +You will need `hatch` to work on or create new integrations. Run `pip install hatch` to install it. + +### Local development + +All the integrations are self contained, so the first step before working on one is to `cd` into the proper folder. +For example, to work on the Chroma Document Store, from the root of the repo: +```sh +$ cd integrations/chroma +``` + +From there, you can run the tests with `hatch`, that will take care of setting up an isolated Python environment: +```sh +hatch run test +``` + +Similarly, to run the linters: +```sh +hatch run lint:all +``` + +### Create a new integration + +> Core integrations follow the naming convention `PREFIX-haystack`, where `PREFIX` can be the name of the technology +> you're integrating Haystack with. For example, a deepset integration would be named as `deepset-haystack`. + +To create a new integration, from the root of the repo change directory into `integrations`: +```sh +cd integrations +``` + +From there, use `hatch` to create the scaffold of the new integration: +```sh +$ hatch --config hatch.toml new -i +Project name: deepset-haystack +Description []: An example integration, this text can be edited later + +deepset-haystack +├── src +│ └── deepset_haystack +│ ├── __about__.py +│ └── __init__.py +├── tests +│ └── __init__.py +├── LICENSE.txt +├── README.md +└── pyproject.toml +``` + +## Inventory + | Package | Type | PyPi Package | Status | | ------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | @@ -11,7 +63,3 @@ onwards. 
The code in this repo is maintained by [deepset](https://www.deepset.ai | [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | | [opensearch-haystack](integrations/opensearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/opensearch-haystack.svg)](https://pypi.org/project/opensearch-haystack) | [![Test / opensearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/opensearch.yml) | | [unstructured-fileconverter-haystack](integrations/unstructured/fileconverter/) | File converter | [![PyPI - Version](https://img.shields.io/pypi/v/unstructured-fileconverter-haystack.svg)](https://pypi.org/project/unstructured-fileconverter-haystack) | [![Test / unstructured / fileconverter](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/unstructured_fileconverter.yml) | - -## Contributing - -You will need `hatch` to create new projects in this folder. Run `pip install -r requirements.txt` to install it. diff --git a/nodes/hatch.toml b/integrations/hatch.toml similarity index 72% rename from nodes/hatch.toml rename to integrations/hatch.toml index be4d67218..2a213c5ab 100644 --- a/nodes/hatch.toml +++ b/integrations/hatch.toml @@ -7,6 +7,3 @@ headers = true default = [ "Apache-2.0", ] - -[template.plugins.default] -src-layout = false From c840d0562f1f303ea6bbedbd029f4e7cef2894b0 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 18:58:18 +0100 Subject: [PATCH 29/36] fix labeller --- .github/labeler.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index f5bb7b448..5234f6a51 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,12 +1,16 @@ # Integrations -integration:instructor-embedders: -- components/embedders/instructor-embedders/**/* -integration:unstructured-fileconverter: -- components/converters/unstructured_fileconverter/**/* integration:chroma: -- document_stores/chroma/**/* +- integrations/chroma/**/* integration:elasticsearch: -- document_stores/elasticsearch/**/* +- integrations/elasticsearch/**/* +integration:gradient: +- integrations/gradient/**/* +integration:instructor-embedders: +- integrations/instructor-embedders/**/* +integration:opensearch: +- integrations/opensearch/**/* +integration:unstructured-fileconverter: +- integrations/unstructured/fileconverter/**/* # Topics topic:CI: - .github/* From 1239f2c49ce7659b8d4e7146e56ca9f3d76d78a1 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 19:18:17 +0100 Subject: [PATCH 30/36] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 19 ++++++++++++ .../new-integration-proposal.md | 31 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/new-integration-proposal.md diff --git 
a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..9c421ea06 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,19 @@ +--- +name: Bug report +about: Report a bug for an integration +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior. Feel free to link a Colab we can run to investigate the issue. + +**Describe your environment (please complete the following information):** + - OS: [e.g. iOS] + - Haystack version: + - Integration version: diff --git a/.github/ISSUE_TEMPLATE/new-integration-proposal.md b/.github/ISSUE_TEMPLATE/new-integration-proposal.md new file mode 100644 index 000000000..d12ed2085 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new-integration-proposal.md @@ -0,0 +1,31 @@ +--- +name: New Integration Proposal +about: Track the creation process for a new integration +title: '' +labels: New integration request +assignees: '' + +--- + +## Summary and motivation + +Briefly explain the feature request: why do we need this feature? What use cases does it support? + +## Alternatives + +A clear and concise description of any alternative solutions or features you've considered. + +## Detailed design + +Explain the design in enough detail for somebody familiar with Haystack to understand, and for somebody familiar with the implementation to implement. Get into specifics and corner-cases, and include examples of how the feature is used. Also, if there's any new terminology involved, define it here. + +## Checklist + +If the feature request is accepted, ensure the following checklist is complete before closing the issue. + +- [ ] The package has been released on PyPI +- [ ] There is a Github workflow running the tests for the integration nightly and at every PR +- [ ] A label named like `integration:` has been added to this repo +- [ ] The [labeler.yml](https://github.com/deepset-ai/haystack-core-integrations/blob/main/.github/labeler.yml) file has been updated +- [ ] An integration tile has been added to https://github.com/deepset-ai/haystack-integrations +- [ ] The integration has been listed in the [Inventory section](https://github.com/deepset-ai/haystack-core-integrations#inventory) of this repo README From dac1eb8e61de6740e5512f17a54a1b5f41bdb019 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 19:20:44 +0100 Subject: [PATCH 31/36] Update issue templates --- ...ature-request-for-existing-integrations.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/feature-request-for-existing-integrations.md diff --git a/.github/ISSUE_TEMPLATE/feature-request-for-existing-integrations.md b/.github/ISSUE_TEMPLATE/feature-request-for-existing-integrations.md new file mode 100644 index 000000000..77c014124 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request-for-existing-integrations.md @@ -0,0 +1,20 @@ +--- +name: Feature request for existing integrations +about: Suggest an idea for an integration +title: '' +labels: feature request +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. 
+ +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 0a53fa7c1012a81146d4bed4f6e101e9df8ad687 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Sun, 3 Dec 2023 19:26:40 +0100 Subject: [PATCH 32/36] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 261eeb9e9..6134ab324 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2023-present deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 0e46ed1232bc8c106a0e0c98ccc4669727cf28bf Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 4 Dec 2023 11:12:26 +0100 Subject: [PATCH 33/36] Add `cohere_haystack` integration package (#75) * cohere integration stub * mypy config * Add the CohereGenerator --------- Co-authored-by: sunilkumardash9 * add workflow * add cohere to the readme * Update LICENSE.txt --------- Co-authored-by: sunilkumardash9 --- .github/workflows/cohere.yml | 56 +++++ README.md | 1 + integrations/cohere/LICENSE.txt | 201 ++++++++++++++++++ integrations/cohere/README.md | 21 ++ integrations/cohere/pyproject.toml | 168 +++++++++++++++ .../cohere/src/cohere_haystack/__about__.py | 4 + .../cohere/src/cohere_haystack/__init__.py | 3 + .../cohere/src/cohere_haystack/generator.py | 177 +++++++++++++++ integrations/cohere/tests/__init__.py | 3 + .../cohere/tests/test_cohere_generators.py | 181 ++++++++++++++++ 10 files changed, 815 insertions(+) create mode 100644 .github/workflows/cohere.yml create mode 100644 integrations/cohere/LICENSE.txt create mode 100644 integrations/cohere/README.md create mode 100644 integrations/cohere/pyproject.toml create mode 100644 integrations/cohere/src/cohere_haystack/__about__.py create mode 100644 integrations/cohere/src/cohere_haystack/__init__.py create mode 100644 integrations/cohere/src/cohere_haystack/generator.py create mode 100644 integrations/cohere/tests/__init__.py create mode 100644 integrations/cohere/tests/test_cohere_generators.py diff --git a/.github/workflows/cohere.yml b/.github/workflows/cohere.yml new file mode 100644 index 000000000..b40cd4953 --- /dev/null +++ b/.github/workflows/cohere.yml @@ -0,0 +1,56 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / cohere + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - 'integrations/cohere/**' + - '.github/workflows/cohere.yml' + +defaults: + run: + working-directory: integrations/cohere + +concurrency: + group: cohere-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10'] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + + - name: Lint + if: matrix.python-version == '3.9' && runner.os == 'Linux' + run: hatch run lint:all + + - name: Run tests + run: hatch run cov \ No newline at end of file diff --git a/README.md b/README.md index e0aae3b1f..bbd816d98 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ deepset-haystack | Package | Type | PyPi Package | Status | | ------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [chroma-haystack](integrations/chroma/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/chroma-haystack) | [![Test / chroma](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/chroma.yml) | +| [cohere-haystack](integrations/cohere/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/chroma-haystack.svg)](https://pypi.org/project/cohere-haystack) | [![Test / cohere](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/cohere.yml) | | [elasticsearch-haystack](integrations/elasticsearch/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) | [![Test / elasticsearch](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) | | [gradient-haystack](integrations/gradient/) | Embedder, Generator | [![PyPI - Version](https://img.shields.io/pypi/v/gradient-haystack.svg)](https://pypi.org/project/gradient-haystack) | [![Test / gradient](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/gradient.yml) | | [instructor-embedders-haystack](integrations/instructor-embedders/) | Embedder | [![PyPI - Version](https://img.shields.io/pypi/v/instructor-embedders-haystack.svg)](https://pypi.org/project/instructor-embedders-haystack) | [![Test / instructor-embedders](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/instructor_embedders.yml) | diff --git a/integrations/cohere/LICENSE.txt b/integrations/cohere/LICENSE.txt new file mode 100644 index 000000000..6134ab324 --- /dev/null +++ b/integrations/cohere/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/integrations/cohere/README.md b/integrations/cohere/README.md new file mode 100644 index 000000000..79cefed21 --- /dev/null +++ b/integrations/cohere/README.md @@ -0,0 +1,21 @@ +# cohere-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/cohere-haystack.svg)](https://pypi.org/project/cohere-haystack) + +----- + +**Table of Contents** + +- [Installation](#installation) +- [License](#license) + +## Installation + +```console +pip install cohere-haystack +``` + +## License + +`cohere-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/cohere/pyproject.toml b/integrations/cohere/pyproject.toml new file mode 100644 index 000000000..e291907fd --- /dev/null +++ b/integrations/cohere/pyproject.toml @@ -0,0 +1,168 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cohere-haystack" +dynamic = ["version"] +description = '' +readme = "README.md" +requires-python = ">=3.7" +license = "Apache-2.0" +keywords = [] +authors = [ + { name = "deepset GmbH", email = "info@deepset.ai" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai", + "cohere", +] + +[project.urls] +Documentation = "https://github.com/unknown/cohere-haystack#readme" +Issues = "https://github.com/unknown/cohere-haystack/issues" +Source = "https://github.com/unknown/cohere-haystack" + +[tool.hatch.version] +path = "src/cohere_haystack/__about__.py" + +[tool.hatch.envs.default] +dependencies = [ + "coverage[toml]>=6.5", + "pytest", +] +[tool.hatch.envs.default.scripts] +test = "pytest {args:tests}" +test-cov = "coverage run -m pytest {args:tests}" +cov-report = [ + "- coverage combine", + "coverage report", +] +cov = [ + "test-cov", + "cov-report", +] + +[[tool.hatch.envs.all.matrix]] +python = ["3.7", "3.8", "3.9", "3.10", "3.11"] + +[tool.hatch.envs.lint] +detached = true +dependencies = [ + "black>=23.1.0", + "mypy>=1.0.0", + "ruff>=0.0.243", +] +[tool.hatch.envs.lint.scripts] +typing = "mypy --install-types --non-interactive {args:src/cohere_haystack tests}" +style = [ + "ruff {args:.}", + "black --check --diff {args:.}", +] +fmt = [ + "black {args:.}", + "ruff --fix {args:.}", + "style", +] +all = [ + "style", + "typing", +] + +[tool.black] +target-version = ["py37"] +line-length = 120 +skip-string-normalization = true + +[tool.ruff] +target-version = "py37" +line-length = 120 +select = [ + "A", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "FBT", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow boolean positional values in function calls, like `dict.get(... 
True)` + "FBT003", + # Ignore checks for possible passwords + "S105", "S106", "S107", + # Ignore complexity + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", +] +unfixable = [ + # Don't touch unused imports + "F401", +] + +[tool.ruff.isort] +known-first-party = ["cohere_haystack"] + +[tool.ruff.flake8-tidy-imports] +ban-relative-imports = "all" + +[tool.ruff.per-file-ignores] +# Tests can use magic values, assertions, and relative imports +"tests/**/*" = ["PLR2004", "S101", "TID252"] + +[tool.coverage.run] +source_pkgs = ["cohere_haystack", "tests"] +branch = true +parallel = true +omit = [ + "src/cohere_haystack/__about__.py", +] + +[tool.coverage.paths] +cohere_haystack = ["src/cohere_haystack", "*/cohere-haystack/src/cohere_haystack"] +tests = ["tests", "*/cohere-haystack/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[[tool.mypy.overrides]] +module = [ + "cohere.*", + "haystack.*", + "pytest.*" +] +ignore_missing_imports = true \ No newline at end of file diff --git a/integrations/cohere/src/cohere_haystack/__about__.py b/integrations/cohere/src/cohere_haystack/__about__.py new file mode 100644 index 000000000..0e4fa27cf --- /dev/null +++ b/integrations/cohere/src/cohere_haystack/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +__version__ = "0.0.1" diff --git a/integrations/cohere/src/cohere_haystack/__init__.py b/integrations/cohere/src/cohere_haystack/__init__.py new file mode 100644 index 000000000..e873bc332 --- /dev/null +++ b/integrations/cohere/src/cohere_haystack/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/cohere/src/cohere_haystack/generator.py b/integrations/cohere/src/cohere_haystack/generator.py new file mode 100644 index 000000000..4b18fb75d --- /dev/null +++ b/integrations/cohere/src/cohere_haystack/generator.py @@ -0,0 +1,177 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +import logging +import os +import sys +from typing import Any, Callable, Dict, List, Optional + +from haystack import DeserializationError, component, default_from_dict, default_to_dict +from haystack.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install cohere'") as cohere_import: + from cohere import COHERE_API_URL, Client + +logger = logging.getLogger(__name__) + + +@component +class CohereGenerator: + """LLM Generator compatible with Cohere's generate endpoint. + + Queries the LLM using Cohere's API. Invocations are made using 'cohere' package. + See [Cohere API](https://docs.cohere.com/reference/generate) for more details. + + Example usage: + + ```python + from haystack.generators import CohereGenerator + generator = CohereGenerator(api_key="test-api-key") + generator.run(prompt="What's the capital of France?") + ``` + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: str = "command", + streaming_callback: Optional[Callable] = None, + api_base_url: Optional[str] = None, + **kwargs, + ): + """ + Instantiates a `CohereGenerator` component. + + :param api_key: The API key for the Cohere API. If not set, it will be read from the COHERE_API_KEY env var. + :param model_name: The name of the model to use. Available models are: [command, command-light, command-nightly, + command-nightly-light]. Defaults to "command". 
+ :param streaming_callback: A callback function to be called with the streaming response. Defaults to None. + :param api_base_url: The base URL of the Cohere API. Defaults to "https://api.cohere.ai". + :param kwargs: Additional model parameters. These will be used during generation. Refer to + https://docs.cohere.com/reference/generate for more details. + Some of the parameters are: + - 'max_tokens': The maximum number of tokens to be generated. Defaults to 1024. + - 'truncate': One of NONE|START|END to specify how the API will handle inputs longer than the maximum token + length. Defaults to END. + - 'temperature': A non-negative float that tunes the degree of randomness in generation. Lower temperatures + mean less random generations. + - 'preset': Identifier of a custom preset. A preset is a combination of parameters, such as prompt, + temperature etc. You can create presets in the playground. + - 'end_sequences': The generated text will be cut at the beginning of the earliest occurrence of an end + sequence. The sequence will be excluded from the text. + - 'stop_sequences': The generated text will be cut at the end of the earliest occurrence of a stop sequence. + The sequence will be included the text. + - 'k': Defaults to 0, min value of 0.01, max value of 0.99. + - 'p': Ensures that only the most likely tokens, with total probability mass of `p`, are considered for + generation at each step. If both `k` and `p` are enabled, `p` acts after `k`. + - 'frequency_penalty': Used to reduce repetitiveness of generated tokens. The higher the value, the stronger + a penalty is applied to previously present tokens, proportional to how many times they have already + appeared in the prompt or prior generation.' + - 'presence_penalty': Defaults to 0.0, min value of 0.0, max value of 1.0. Can be used to reduce + repetitiveness of generated tokens. Similar to `frequency_penalty`, except that this penalty is applied + equally to all tokens that have already appeared, regardless of their exact frequencies. + - 'return_likelihoods': One of GENERATION|ALL|NONE to specify how and if the token likelihoods are returned + with the response. Defaults to NONE. + - 'logit_bias': Used to prevent the model from generating unwanted tokens or to incentivize it to include + desired tokens. The format is {token_id: bias} where bias is a float between -10 and 10. + """ + cohere_import.check() + + if not api_key: + api_key = os.environ.get("COHERE_API_KEY") + if not api_key: + msg = ( + "CohereGenerator needs an API key to run." + "Either provide it as init parameter or set the env var COHERE_API_KEY." + ) + raise ValueError(msg) + + if not api_base_url: + api_base_url = COHERE_API_URL + + self.api_key = api_key + self.model_name = model_name + self.streaming_callback = streaming_callback + self.api_base_url = api_base_url + self.model_parameters = kwargs + self.client = Client(api_key=self.api_key, api_url=self.api_base_url) + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. 
+        """
+        if self.streaming_callback:
+            module = self.streaming_callback.__module__
+            if module == "builtins":
+                callback_name = self.streaming_callback.__name__
+            else:
+                callback_name = f"{module}.{self.streaming_callback.__name__}"
+        else:
+            callback_name = None
+
+        return default_to_dict(
+            self,
+            model_name=self.model_name,
+            streaming_callback=callback_name,
+            api_base_url=self.api_base_url,
+            **self.model_parameters,
+        )
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "CohereGenerator":
+        """
+        Deserialize this component from a dictionary.
+        """
+        init_params = data.get("init_parameters", {})
+        streaming_callback = None
+        if "streaming_callback" in init_params and init_params["streaming_callback"]:
+            parts = init_params["streaming_callback"].split(".")
+            module_name = ".".join(parts[:-1])
+            function_name = parts[-1]
+            module = sys.modules.get(module_name, None)
+            if not module:
+                msg = f"Could not locate the module of the streaming callback: {module_name}"
+                raise DeserializationError(msg)
+            streaming_callback = getattr(module, function_name, None)
+            if not streaming_callback:
+                msg = f"Could not locate the streaming callback: {function_name}"
+                raise DeserializationError(msg)
+            data["init_parameters"]["streaming_callback"] = streaming_callback
+        return default_from_dict(cls, data)
+
+    @component.output_types(replies=List[str], metadata=List[Dict[str, Any]])
+    def run(self, prompt: str):
+        """
+        Queries the LLM with the prompt to produce replies.
+        :param prompt: The prompt to be sent to the generative model.
+        """
+        response = self.client.generate(
+            model=self.model_name, prompt=prompt, stream=self.streaming_callback is not None, **self.model_parameters
+        )
+        if self.streaming_callback:
+            metadata_dict: Dict[str, Any] = {}
+            for chunk in response:
+                self.streaming_callback(chunk)
+                metadata_dict["index"] = chunk.index
+            replies = response.texts
+            metadata_dict["finish_reason"] = response.finish_reason
+            metadata = [metadata_dict]
+            self._check_truncated_answers(metadata)
+            return {"replies": replies, "metadata": metadata}
+
+        metadata = [{"finish_reason": resp.finish_reason} for resp in response]
+        replies = [resp.text for resp in response]
+        self._check_truncated_answers(metadata)
+        return {"replies": replies, "metadata": metadata}
+
+    def _check_truncated_answers(self, metadata: List[Dict[str, Any]]):
+        """
+        Check the `finish_reason` returned with the Cohere response.
+        If the `finish_reason` is `MAX_TOKENS`, log a warning to the user.
+        :param metadata: The metadata returned by the Cohere API.
+        """
+        if metadata[0]["finish_reason"] == "MAX_TOKENS":
+            logger.warning(
+                "Responses have been truncated before reaching a natural stopping point. "
+                "Increase the max_tokens parameter to allow for longer completions."
+            )
diff --git a/integrations/cohere/tests/__init__.py b/integrations/cohere/tests/__init__.py
new file mode 100644
index 000000000..e873bc332
--- /dev/null
+++ b/integrations/cohere/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/cohere/tests/test_cohere_generators.py b/integrations/cohere/tests/test_cohere_generators.py
new file mode 100644
index 000000000..d267847a4
--- /dev/null
+++ b/integrations/cohere/tests/test_cohere_generators.py
@@ -0,0 +1,181 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import pytest
+
+from cohere_haystack.generator import CohereGenerator
+
+
+def default_streaming_callback(chunk):
+    """
+    Default callback function for streaming responses from the Cohere API.
+    Prints the tokens of the first completion to stdout as soon as they are received.
+    """
+    print(chunk.text, flush=True, end="")  # noqa: T201
+
+
+@pytest.mark.integration
+class TestCohereGenerator:
+    def test_init_default(self):
+        import cohere
+
+        component = CohereGenerator(api_key="test-api-key")
+        assert component.api_key == "test-api-key"
+        assert component.model_name == "command"
+        assert component.streaming_callback is None
+        assert component.api_base_url == cohere.COHERE_API_URL
+        assert component.model_parameters == {}
+
+    def test_init_with_parameters(self):
+        callback = lambda x: x  # noqa: E731
+        component = CohereGenerator(
+            api_key="test-api-key",
+            model_name="command-light",
+            max_tokens=10,
+            some_test_param="test-params",
+            streaming_callback=callback,
+            api_base_url="test-base-url",
+        )
+        assert component.api_key == "test-api-key"
+        assert component.model_name == "command-light"
+        assert component.streaming_callback == callback
+        assert component.api_base_url == "test-base-url"
+        assert component.model_parameters == {"max_tokens": 10, "some_test_param": "test-params"}
+
+    def test_to_dict_default(self):
+        import cohere
+
+        component = CohereGenerator(api_key="test-api-key")
+        data = component.to_dict()
+        assert data == {
+            "type": "cohere_haystack.generator.CohereGenerator",
+            "init_parameters": {
+                "model_name": "command",
+                "streaming_callback": None,
+                "api_base_url": cohere.COHERE_API_URL,
+            },
+        }
+
+    def test_to_dict_with_parameters(self):
+        component = CohereGenerator(
+            api_key="test-api-key",
+            model_name="command-light",
+            max_tokens=10,
+            some_test_param="test-params",
+            streaming_callback=default_streaming_callback,
+            api_base_url="test-base-url",
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "cohere_haystack.generator.CohereGenerator",
+            "init_parameters": {
+                "model_name": "command-light",
+                "max_tokens": 10,
+                "some_test_param": "test-params",
+                "api_base_url": "test-base-url",
+                "streaming_callback": "tests.test_cohere_generators.default_streaming_callback",
+            },
+        }
+
+    def test_to_dict_with_lambda_streaming_callback(self):
+        component = CohereGenerator(
+            api_key="test-api-key",
+            model_name="command",
+            max_tokens=10,
+            some_test_param="test-params",
+            streaming_callback=lambda x: x,
+            api_base_url="test-base-url",
+        )
+        data = component.to_dict()
+        assert data == {
+            "type": "cohere_haystack.generator.CohereGenerator",
+            "init_parameters": {
+                "model_name": "command",
+                "streaming_callback": "tests.test_cohere_generators.<lambda>",
+                "api_base_url": "test-base-url",
+                "max_tokens": 10,
+                "some_test_param": "test-params",
+            },
+        }
+
+    def test_from_dict(self, monkeypatch):
+        monkeypatch.setenv("COHERE_API_KEY", "test-key")
+        data = {
+            "type": "cohere_haystack.generator.CohereGenerator",
+            "init_parameters": {
+                "model_name": "command",
+                "max_tokens": 10,
+                "some_test_param": "test-params",
+                "api_base_url": "test-base-url",
+                "streaming_callback": "tests.test_cohere_generators.default_streaming_callback",
+            },
+        }
+        component = CohereGenerator.from_dict(data)
+        assert component.api_key == "test-key"
+        assert component.model_name == "command"
+        assert component.streaming_callback == default_streaming_callback
+        assert component.api_base_url == "test-base-url"
+        assert component.model_parameters == {"max_tokens": 10, "some_test_param": "test-params"}
+
+    def test_check_truncated_answers(self, caplog):
+        component = CohereGenerator(api_key="test-api-key")
+        metadata = [{"finish_reason": "MAX_TOKENS"}]
+        component._check_truncated_answers(metadata)
+        assert caplog.records[0].message == (
+            "Responses have been truncated before reaching a natural stopping point. "
+            "Increase the max_tokens parameter to allow for longer completions."
+        )
+
+    @pytest.mark.skipif(
+        not os.environ.get("COHERE_API_KEY", None),
+        reason="Export an env var called COHERE_API_KEY containing the Cohere API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_cohere_generator_run(self):
+        component = CohereGenerator(api_key=os.environ.get("COHERE_API_KEY"))
+        results = component.run(prompt="What's the capital of France?")
+        assert len(results["replies"]) == 1
+        assert "Paris" in results["replies"][0]
+        assert len(results["metadata"]) == 1
+        assert results["metadata"][0]["finish_reason"] == "COMPLETE"
+
+    @pytest.mark.skipif(
+        not os.environ.get("COHERE_API_KEY", None),
+        reason="Export an env var called COHERE_API_KEY containing the Cohere API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_cohere_generator_run_wrong_model_name(self):
+        import cohere
+
+        component = CohereGenerator(model_name="something-obviously-wrong", api_key=os.environ.get("COHERE_API_KEY"))
+        with pytest.raises(
+            cohere.CohereAPIError,
+            match="model not found, make sure the correct model ID was used and that you have access to the model.",
+        ):
+            component.run(prompt="What's the capital of France?")
+
+    @pytest.mark.skipif(
+        not os.environ.get("COHERE_API_KEY", None),
+        reason="Export an env var called COHERE_API_KEY containing the Cohere API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_cohere_generator_run_streaming(self):
+        class Callback:
+            def __init__(self):
+                self.responses = ""
+
+            def __call__(self, chunk):
+                self.responses += chunk.text
+                return chunk
+
+        callback = Callback()
+        component = CohereGenerator(os.environ.get("COHERE_API_KEY"), streaming_callback=callback)
+        results = component.run(prompt="What's the capital of France?")
+
+        assert len(results["replies"]) == 1
+        assert "Paris" in results["replies"][0]
+        assert len(results["metadata"]) == 1
+        assert results["metadata"][0]["finish_reason"] == "COMPLETE"
+        assert callback.responses == results["replies"][0]

From 3999dcc10d601cb5ca8eddf990efc2613f104fb7 Mon Sep 17 00:00:00 2001
From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
Date: Mon, 4 Dec 2023 15:00:02 +0100
Subject: [PATCH 34/36] remove Document Store decorator (#76)

* remove decorator

* Update integrations/elasticsearch/src/elasticsearch_haystack/__about__.py

Co-authored-by: Massimiliano Pippi

* Update integrations/opensearch/src/opensearch_haystack/__about__.py

Co-authored-by: Massimiliano Pippi

---------

Co-authored-by: Massimiliano Pippi --- integrations/chroma/src/chroma_haystack/__about__.py | 2 +- integrations/chroma/src/chroma_haystack/document_store.py | 2 -- .../elasticsearch/src/elasticsearch_haystack/__about__.py | 2 +- .../elasticsearch/src/elasticsearch_haystack/document_store.py | 3 +-- integrations/opensearch/src/opensearch_haystack/__about__.py | 2 +- .../opensearch/src/opensearch_haystack/document_store.py | 3 +-- 6 files changed, 5 insertions(+), 9 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/__about__.py b/integrations/chroma/src/chroma_haystack/__about__.py index 4612bf993..0e08a8823 100644 --- a/integrations/chroma/src/chroma_haystack/__about__.py +++ b/integrations/chroma/src/chroma_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.7.0" +__version__ = "0.8.0" diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index b6840b2a7..c8c07eee8 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -9,7 +9,6 @@ import numpy as np from chromadb.api.types import GetResult, QueryResult, validate_where, validate_where_document from haystack.dataclasses import Document -from haystack.document_stores.decorator import document_store from haystack.document_stores.protocols import DuplicatePolicy from chroma_haystack.errors import ChromaDocumentStoreFilterError @@ -18,7 +17,6 @@ logger = logging.getLogger(__name__) -@document_store class ChromaDocumentStore: """ We use the `collection.get` API to implement the document store protocol, diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py index d4a92df1b..bccfd8317 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.2" +__version__ = "0.1.0" diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py index 17c2fa4af..b552a7e06 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py @@ -11,7 +11,7 @@ from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found] from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document -from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy, document_store +from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy from haystack.utils.filters import convert from elasticsearch_haystack.filters import _normalize_filters @@ -30,7 +30,6 @@ BM25_SCALING_FACTOR = 8 -@document_store class ElasticsearchDocumentStore: def __init__( self, diff --git a/integrations/opensearch/src/opensearch_haystack/__about__.py b/integrations/opensearch/src/opensearch_haystack/__about__.py index d4a92df1b..bccfd8317 100644 --- a/integrations/opensearch/src/opensearch_haystack/__about__.py +++ b/integrations/opensearch/src/opensearch_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 
2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.0.2" +__version__ = "0.1.0" diff --git a/integrations/opensearch/src/opensearch_haystack/document_store.py b/integrations/opensearch/src/opensearch_haystack/document_store.py index 2cc06c680..7f36df243 100644 --- a/integrations/opensearch/src/opensearch_haystack/document_store.py +++ b/integrations/opensearch/src/opensearch_haystack/document_store.py @@ -7,7 +7,7 @@ import numpy as np from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document -from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy, document_store +from haystack.document_stores import DocumentStoreError, DuplicateDocumentError, DuplicatePolicy from haystack.utils.filters import convert from opensearchpy import OpenSearch from opensearchpy.helpers import bulk @@ -28,7 +28,6 @@ BM25_SCALING_FACTOR = 8 -@document_store class OpenSearchDocumentStore: def __init__( self, From 270eb6bc2037b56ba42ce0321564c28497761155 Mon Sep 17 00:00:00 2001 From: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> Date: Tue, 5 Dec 2023 10:44:13 +0100 Subject: [PATCH 35/36] fix import and increase version (#77) --- integrations/chroma/src/chroma_haystack/__about__.py | 2 +- integrations/chroma/src/chroma_haystack/document_store.py | 2 +- .../elasticsearch/src/elasticsearch_haystack/__about__.py | 2 +- integrations/elasticsearch/tests/test_document_store.py | 2 +- integrations/opensearch/src/opensearch_haystack/__about__.py | 2 +- integrations/opensearch/tests/test_document_store.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/__about__.py b/integrations/chroma/src/chroma_haystack/__about__.py index 0e08a8823..63065fc23 100644 --- a/integrations/chroma/src/chroma_haystack/__about__.py +++ b/integrations/chroma/src/chroma_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.8.0" +__version__ = "0.8.1" diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index c8c07eee8..8d6a8437e 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -9,7 +9,7 @@ import numpy as np from chromadb.api.types import GetResult, QueryResult, validate_where, validate_where_document from haystack.dataclasses import Document -from haystack.document_stores.protocols import DuplicatePolicy +from haystack.document_stores.protocol import DuplicatePolicy from chroma_haystack.errors import ChromaDocumentStoreFilterError from chroma_haystack.utils import get_embedding_function diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py index bccfd8317..8430bf8d4 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py +++ b/integrations/elasticsearch/src/elasticsearch_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.1.0" +__version__ = "0.1.1" diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index 6a69b6e4b..fbc850182 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ 
b/integrations/elasticsearch/tests/test_document_store.py @@ -10,7 +10,7 @@ from elasticsearch.exceptions import BadRequestError # type: ignore[import-not-found] from haystack.dataclasses.document import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.document_stores.protocols import DuplicatePolicy +from haystack.document_stores.protocol import DuplicatePolicy from haystack.testing.document_store import DocumentStoreBaseTests from elasticsearch_haystack.document_store import ElasticsearchDocumentStore diff --git a/integrations/opensearch/src/opensearch_haystack/__about__.py b/integrations/opensearch/src/opensearch_haystack/__about__.py index bccfd8317..8430bf8d4 100644 --- a/integrations/opensearch/src/opensearch_haystack/__about__.py +++ b/integrations/opensearch/src/opensearch_haystack/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 -__version__ = "0.1.0" +__version__ = "0.1.1" diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py index af26dea49..b1e367745 100644 --- a/integrations/opensearch/tests/test_document_store.py +++ b/integrations/opensearch/tests/test_document_store.py @@ -8,7 +8,7 @@ import pytest from haystack.dataclasses.document import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError -from haystack.document_stores.protocols import DuplicatePolicy +from haystack.document_stores.protocol import DuplicatePolicy from haystack.testing.document_store import DocumentStoreBaseTests from opensearchpy.exceptions import RequestError From 31933a65280e20f8965f07fd0140c454acc0949d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bilge=20Y=C3=BCcel?= Date: Tue, 5 Dec 2023 15:58:32 +0100 Subject: [PATCH 36/36] Update elasticsearch test badge (#79) --- integrations/elasticsearch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/elasticsearch/README.md b/integrations/elasticsearch/README.md index 7e70ad6e4..9900f46b1 100644 --- a/integrations/elasticsearch/README.md +++ b/integrations/elasticsearch/README.md @@ -1,4 +1,4 @@ -[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/document_stores_elasticsearch.yml) +[![test](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/elasticsearch.yml) [![PyPI - Version](https://img.shields.io/pypi/v/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/elasticsearch-haystack.svg)](https://pypi.org/project/elasticsearch-haystack)
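For readers following this series, below is a minimal usage sketch of the `CohereGenerator` added in the patches above. It is not part of any patch; it assumes the `cohere-haystack` package from this series is installed, that a valid key is exported as `COHERE_API_KEY`, and that the default `command` model is available to your account.

```python
import os

from cohere_haystack.generator import CohereGenerator


def print_chunk(chunk):
    # Stream each generated chunk to stdout as it arrives.
    print(chunk.text, flush=True, end="")


# The component falls back to the COHERE_API_KEY env var if api_key is not given;
# max_tokens is one of the pass-through generation parameters documented in the class.
generator = CohereGenerator(
    api_key=os.environ.get("COHERE_API_KEY"),
    model_name="command",
    streaming_callback=print_chunk,
    max_tokens=100,
)

result = generator.run(prompt="What's the capital of France?")
print(result["replies"][0])
print(result["metadata"][0]["finish_reason"])
```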