From c3912e7c477f267556345075ac4bdc174b116dbf Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 18 Jan 2024 12:05:58 +0100 Subject: [PATCH 1/5] Remove ChromaSingleQueryRetriever (#240) * remove single-query retriever * handle top_k correctly --- integrations/chroma/example/example.py | 4 +-- .../components/retrievers/chroma/__init__.py | 4 +-- .../components/retrievers/chroma/retriever.py | 33 ++++--------------- 3 files changed, 11 insertions(+), 30 deletions(-) diff --git a/integrations/chroma/example/example.py b/integrations/chroma/example/example.py index 19763742d..1e6a7e402 100644 --- a/integrations/chroma/example/example.py +++ b/integrations/chroma/example/example.py @@ -23,7 +23,7 @@ querying = Pipeline() querying.add_component("retriever", ChromaQueryRetriever(document_store)) -results = querying.run({"retriever": {"queries": ["Variable declarations"], "top_k": 3}}) +results = querying.run({"retriever": {"query": "Variable declarations", "top_k": 3}}) -for d in results["retriever"]["documents"][0]: +for d in results["retriever"]["documents"]: print(d.meta, d.score) diff --git a/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/__init__.py b/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/__init__.py index e449f0067..d02300de2 100644 --- a/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/__init__.py +++ b/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/__init__.py @@ -1,3 +1,3 @@ -from .retriever import ChromaEmbeddingRetriever, ChromaQueryRetriever, ChromaSingleQueryRetriever +from .retriever import ChromaEmbeddingRetriever, ChromaQueryRetriever -__all__ = ["ChromaQueryRetriever", "ChromaEmbeddingRetriever", "ChromaSingleQueryRetriever"] +__all__ = ["ChromaQueryRetriever", "ChromaEmbeddingRetriever"] diff --git a/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/retriever.py 
b/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/retriever.py index f4b3909c6..9388171f4 100644 --- a/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/retriever.py +++ b/integrations/chroma/src/haystack_integrations/components/retrievers/chroma/retriever.py @@ -25,24 +25,24 @@ def __init__(self, document_store: ChromaDocumentStore, filters: Optional[Dict[s self.top_k = top_k self.document_store = document_store - @component.output_types(documents=List[List[Document]]) + @component.output_types(documents=List[Document]) def run( self, - queries: List[str], + query: str, _: Optional[Dict[str, Any]] = None, # filters not yet supported top_k: Optional[int] = None, ): """ Run the retriever on the given input data. - :param queries: The input data for the retriever. In this case, a list of queries. + :param query: The input data for the retriever. In this case, a plain-text query. :return: The retrieved documents. :raises ValueError: If the specified document store is not found or is not a MemoryDocumentStore instance. 
""" - if not top_k: - top_k = 3 - return {"documents": self.document_store.search(queries, top_k)} + top_k = top_k or self.top_k + + return {"documents": self.document_store.search([query], top_k)[0]} def to_dict(self) -> Dict[str, Any]: """ @@ -60,24 +60,6 @@ def from_dict(cls, data: Dict[str, Any]) -> "ChromaQueryRetriever": return default_from_dict(cls, data) -@component -class ChromaSingleQueryRetriever(ChromaQueryRetriever): - """ - A convenient wrapper to the standard query retriever that accepts a single query - and returns a list of documents - """ - - @component.output_types(documents=List[Document]) - def run( - self, - query: str, - filters: Optional[Dict[str, Any]] = None, # filters not yet supported - top_k: Optional[int] = None, - ): - queries = [query] - return super().run(queries, filters, top_k)[0] - - @component class ChromaEmbeddingRetriever(ChromaQueryRetriever): @component.output_types(documents=List[Document]) @@ -95,8 +77,7 @@ def run( :raises ValueError: If the specified document store is not found or is not a MemoryDocumentStore instance. 
""" - if not top_k: - top_k = 3 + top_k = top_k or self.top_k query_embeddings = [query_embedding] return {"documents": self.document_store.search_embeddings(query_embeddings, top_k)[0]} From 065a00f7d00588933815673b8a23e55af4e88ece Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 18 Jan 2024 13:28:15 +0100 Subject: [PATCH 2/5] chore!: Rename model_name to model in the Gradient integration (#228) * rename model_name into model * fix tests * rename model_name into model for text embedder * fix tests --- .../embedders/gradient_document_embedder.py | 10 +++++----- .../embedders/gradient_text_embedder.py | 10 +++++----- .../gradient/tests/test_gradient_document_embedder.py | 2 +- .../gradient/tests/test_gradient_text_embedder.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py index 551aa9dd5..34fc1f87f 100644 --- a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py +++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py @@ -28,7 +28,7 @@ class GradientDocumentEmbedder: embedder = GradientDocumentEmbedder( access_token=gradient_access_token, workspace_id=gradient_workspace_id, - model_name="bge_large")) + model="bge_large")) p = Pipeline() p.add_component(embedder, name="document_embedder") p.add_component(instance=GradientDocumentEmbedder( @@ -41,7 +41,7 @@ class GradientDocumentEmbedder: def __init__( self, *, - model_name: str = "bge-large", + model: str = "bge-large", batch_size: int = 32_768, access_token: Optional[str] = None, workspace_id: Optional[str] = None, @@ -51,7 +51,7 @@ def __init__( """ Create a GradientDocumentEmbedder component. - :param model_name: The name of the model to use. + :param model: The name of the model to use. 
:param batch_size: Update cycle for tqdm progress bar, default is to update every 32_768 docs. :param access_token: The Gradient access token. If not provided it's read from the environment variable GRADIENT_ACCESS_TOKEN. @@ -62,7 +62,7 @@ def __init__( """ self._batch_size = batch_size self._host = host - self._model_name = model_name + self._model_name = model self._progress_bar = progress_bar self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id) @@ -77,7 +77,7 @@ def to_dict(self) -> dict: """ Serialize the component to a Python dictionary. """ - return default_to_dict(self, workspace_id=self._gradient.workspace_id, model_name=self._model_name) + return default_to_dict(self, workspace_id=self._gradient.workspace_id, model=self._model_name) def warm_up(self) -> None: """ diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py index 2ddc229ce..ba753297c 100644 --- a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py +++ b/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py @@ -13,7 +13,7 @@ class GradientTextEmbedder: embedder = GradientTextEmbedder( access_token=gradient_access_token, workspace_id=gradient_workspace_id, - model_name="bge_large") + model="bge_large") p = Pipeline() p.add_component(instance=embedder, name="text_embedder") p.add_component(instance=InMemoryEmbeddingRetriever(document_store=InMemoryDocumentStore()), name="retriever") @@ -25,7 +25,7 @@ class GradientTextEmbedder: def __init__( self, *, - model_name: str = "bge-large", + model: str = "bge-large", access_token: Optional[str] = None, workspace_id: Optional[str] = None, host: Optional[str] = None, @@ -33,7 +33,7 @@ def __init__( """ Create a GradientTextEmbedder component. - :param model_name: The name of the model to use. + :param model: The name of the model to use. 
:param access_token: The Gradient access token. If not provided it's read from the environment variable GRADIENT_ACCESS_TOKEN. :param workspace_id: The Gradient workspace ID. If not provided it's read from the environment @@ -41,7 +41,7 @@ def __init__( :param host: The Gradient host. By default it uses https://api.gradient.ai/. """ self._host = host - self._model_name = model_name + self._model_name = model self._gradient = Gradient(access_token=access_token, host=host, workspace_id=workspace_id) @@ -55,7 +55,7 @@ def to_dict(self) -> dict: """ Serialize the component to a Python dictionary. """ - return default_to_dict(self, workspace_id=self._gradient.workspace_id, model_name=self._model_name) + return default_to_dict(self, workspace_id=self._gradient.workspace_id, model=self._model_name) def warm_up(self) -> None: """ diff --git a/integrations/gradient/tests/test_gradient_document_embedder.py b/integrations/gradient/tests/test_gradient_document_embedder.py index 6e75360fe..dc59a76fb 100644 --- a/integrations/gradient/tests/test_gradient_document_embedder.py +++ b/integrations/gradient/tests/test_gradient_document_embedder.py @@ -54,7 +54,7 @@ def test_to_dict(self): data = component.to_dict() assert data == { "type": "gradient_haystack.embedders.gradient_document_embedder.GradientDocumentEmbedder", - "init_parameters": {"workspace_id": workspace_id, "model_name": "bge-large"}, + "init_parameters": {"workspace_id": workspace_id, "model": "bge-large"}, } def test_warmup(self): diff --git a/integrations/gradient/tests/test_gradient_text_embedder.py b/integrations/gradient/tests/test_gradient_text_embedder.py index 7ae846e93..bd4b396ca 100644 --- a/integrations/gradient/tests/test_gradient_text_embedder.py +++ b/integrations/gradient/tests/test_gradient_text_embedder.py @@ -53,7 +53,7 @@ def test_to_dict(self): data = component.to_dict() assert data == { "type": "gradient_haystack.embedders.gradient_text_embedder.GradientTextEmbedder", - "init_parameters": 
{"workspace_id": workspace_id, "model_name": "bge-large"}, + "init_parameters": {"workspace_id": workspace_id, "model": "bge-large"}, } def test_warmup(self): From 00a55b26b9d1d486bc210e730fcd91a82fea4c3d Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 18 Jan 2024 13:29:10 +0100 Subject: [PATCH 3/5] chore!: Rename model_name to model in the Jina integration (#230) * rename model_name to model in doc embedder * rename model_name to model in text embedder * fix tests * leftover --- .../jina/src/jina_haystack/document_embedder.py | 8 ++++---- .../jina/src/jina_haystack/text_embedder.py | 8 ++++---- integrations/jina/tests/test_document_embedder.py | 14 +++++++------- integrations/jina/tests/test_text_embedder.py | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/integrations/jina/src/jina_haystack/document_embedder.py b/integrations/jina/src/jina_haystack/document_embedder.py index 9f51a9e26..4696db5c8 100644 --- a/integrations/jina/src/jina_haystack/document_embedder.py +++ b/integrations/jina/src/jina_haystack/document_embedder.py @@ -36,7 +36,7 @@ class JinaDocumentEmbedder: def __init__( self, api_key: Optional[str] = None, - model_name: str = "jina-embeddings-v2-base-en", + model: str = "jina-embeddings-v2-base-en", prefix: str = "", suffix: str = "", batch_size: int = 32, @@ -48,7 +48,7 @@ def __init__( Create a JinaDocumentEmbedder component. :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). - :param model_name: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` + :param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. :param batch_size: Number of Documents to encode at once. 
@@ -67,7 +67,7 @@ def __init__( ) raise ValueError(msg) - self.model_name = model_name + self.model_name = model self.prefix = prefix self.suffix = suffix self.batch_size = batch_size @@ -96,7 +96,7 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - model_name=self.model_name, + model=self.model_name, prefix=self.prefix, suffix=self.suffix, batch_size=self.batch_size, diff --git a/integrations/jina/src/jina_haystack/text_embedder.py b/integrations/jina/src/jina_haystack/text_embedder.py index f717f4748..3f18aa037 100644 --- a/integrations/jina/src/jina_haystack/text_embedder.py +++ b/integrations/jina/src/jina_haystack/text_embedder.py @@ -34,7 +34,7 @@ class JinaTextEmbedder: def __init__( self, api_key: Optional[str] = None, - model_name: str = "jina-embeddings-v2-base-en", + model: str = "jina-embeddings-v2-base-en", prefix: str = "", suffix: str = "", ): @@ -43,7 +43,7 @@ def __init__( :param api_key: The Jina API key. It can be explicitly provided or automatically read from the environment variable JINA_API_KEY (recommended). - :param model_name: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` + :param model: The name of the Jina model to use. Check the list of available models on `https://jina.ai/embeddings/` :param prefix: A string to add to the beginning of each text. :param suffix: A string to add to the end of each text. """ @@ -57,7 +57,7 @@ def __init__( ) raise ValueError(msg) - self.model_name = model_name + self.model_name = model self.prefix = prefix self.suffix = suffix self._session = requests.Session() @@ -81,7 +81,7 @@ def to_dict(self) -> Dict[str, Any]: to the constructor. 
""" - return default_to_dict(self, model_name=self.model_name, prefix=self.prefix, suffix=self.suffix) + return default_to_dict(self, model=self.model_name, prefix=self.prefix, suffix=self.suffix) @component.output_types(embedding=List[float], meta=Dict[str, Any]) def run(self, text: str): diff --git a/integrations/jina/tests/test_document_embedder.py b/integrations/jina/tests/test_document_embedder.py index 2ebc5d358..43b6930c5 100644 --- a/integrations/jina/tests/test_document_embedder.py +++ b/integrations/jina/tests/test_document_embedder.py @@ -40,7 +40,7 @@ def test_init_default(self, monkeypatch): def test_init_with_parameters(self): embedder = JinaDocumentEmbedder( api_key="fake-api-key", - model_name="model", + model="model", prefix="prefix", suffix="suffix", batch_size=64, @@ -67,7 +67,7 @@ def test_to_dict(self): assert data == { "type": "jina_haystack.document_embedder.JinaDocumentEmbedder", "init_parameters": { - "model_name": "jina-embeddings-v2-base-en", + "model": "jina-embeddings-v2-base-en", "prefix": "", "suffix": "", "batch_size": 32, @@ -80,7 +80,7 @@ def test_to_dict(self): def test_to_dict_with_custom_init_parameters(self): component = JinaDocumentEmbedder( api_key="fake-api-key", - model_name="model", + model="model", prefix="prefix", suffix="suffix", batch_size=64, @@ -92,7 +92,7 @@ def test_to_dict_with_custom_init_parameters(self): assert data == { "type": "jina_haystack.document_embedder.JinaDocumentEmbedder", "init_parameters": { - "model_name": "model", + "model": "model", "prefix": "prefix", "suffix": "suffix", "batch_size": 64, @@ -141,7 +141,7 @@ def test_embed_batch(self): texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] with patch("requests.sessions.Session.post", side_effect=mock_session_post_response): - embedder = JinaDocumentEmbedder(api_key="fake-api-key", model_name="model") + embedder = JinaDocumentEmbedder(api_key="fake-api-key", model="model") embeddings, metadata = embedder._embed_batch(texts_to_embed=texts, 
batch_size=2) @@ -164,7 +164,7 @@ def test_run(self): with patch("requests.sessions.Session.post", side_effect=mock_session_post_response): embedder = JinaDocumentEmbedder( api_key="fake-api-key", - model_name=model, + model=model, prefix="prefix ", suffix=" suffix", meta_fields_to_embed=["topic"], @@ -194,7 +194,7 @@ def test_run_custom_batch_size(self): with patch("requests.sessions.Session.post", side_effect=mock_session_post_response): embedder = JinaDocumentEmbedder( api_key="fake-api-key", - model_name=model, + model=model, prefix="prefix ", suffix=" suffix", meta_fields_to_embed=["topic"], diff --git a/integrations/jina/tests/test_text_embedder.py b/integrations/jina/tests/test_text_embedder.py index 7dfd64a05..e2b68603d 100644 --- a/integrations/jina/tests/test_text_embedder.py +++ b/integrations/jina/tests/test_text_embedder.py @@ -22,7 +22,7 @@ def test_init_default(self, monkeypatch): def test_init_with_parameters(self): embedder = JinaTextEmbedder( api_key="fake-api-key", - model_name="model", + model="model", prefix="prefix", suffix="suffix", ) @@ -41,7 +41,7 @@ def test_to_dict(self): assert data == { "type": "jina_haystack.text_embedder.JinaTextEmbedder", "init_parameters": { - "model_name": "jina-embeddings-v2-base-en", + "model": "jina-embeddings-v2-base-en", "prefix": "", "suffix": "", }, @@ -50,7 +50,7 @@ def test_to_dict(self): def test_to_dict_with_custom_init_parameters(self): component = JinaTextEmbedder( api_key="fake-api-key", - model_name="model", + model="model", prefix="prefix", suffix="suffix", ) @@ -58,7 +58,7 @@ def test_to_dict_with_custom_init_parameters(self): assert data == { "type": "jina_haystack.text_embedder.JinaTextEmbedder", "init_parameters": { - "model_name": "model", + "model": "model", "prefix": "prefix", "suffix": "suffix", }, @@ -81,7 +81,7 @@ def test_run(self): mock_post.return_value = mock_response - embedder = JinaTextEmbedder(api_key="fake-api-key", model_name=model, prefix="prefix ", suffix=" suffix") + embedder 
= JinaTextEmbedder(api_key="fake-api-key", model=model, prefix="prefix ", suffix=" suffix") result = embedder.run(text="The food was delicious") assert len(result["embedding"]) == 3 From 9e0474fe5300f0dad0f539e5300d5fadfdddf9b6 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 19 Jan 2024 11:30:07 +0100 Subject: [PATCH 4/5] mount import paths under haystack_integrations (#244) * mount import paths under haystack_integrations * linter --- integrations/elasticsearch/pyproject.toml | 19 ++++++++++--------- .../retrievers/elasticsearch/__init__.py | 7 +++++++ .../elasticsearch}/bm25_retriever.py | 7 +++---- .../elasticsearch}/embedding_retriever.py | 3 +-- .../elasticsearch}/__init__.py | 2 +- .../elasticsearch}/document_store.py | 5 +++-- .../document_stores/elasticsearch}/filters.py | 0 .../tests/test_bm25_retriever.py | 17 ++++++++--------- .../tests/test_document_store.py | 12 ++++++------ .../tests/test_embedding_retriever.py | 19 ++++++++++--------- .../elasticsearch/tests/test_filters.py | 3 +-- 11 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py rename integrations/elasticsearch/src/{elasticsearch_haystack => haystack_integrations/components/retrievers/elasticsearch}/bm25_retriever.py (92%) rename integrations/elasticsearch/src/{elasticsearch_haystack => haystack_integrations/components/retrievers/elasticsearch}/embedding_retriever.py (96%) rename integrations/elasticsearch/src/{elasticsearch_haystack => haystack_integrations/document_stores/elasticsearch}/__init__.py (66%) rename integrations/elasticsearch/src/{elasticsearch_haystack => haystack_integrations/document_stores/elasticsearch}/document_store.py (99%) rename integrations/elasticsearch/src/{elasticsearch_haystack => haystack_integrations/document_stores/elasticsearch}/filters.py (100%) diff --git a/integrations/elasticsearch/pyproject.toml 
b/integrations/elasticsearch/pyproject.toml index 17c9158b9..af3d89c0c 100644 --- a/integrations/elasticsearch/pyproject.toml +++ b/integrations/elasticsearch/pyproject.toml @@ -33,6 +33,9 @@ Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/m Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/elasticsearch" +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + [tool.hatch.version] source = "vcs" tag-pattern = 'integrations\/elasticsearch-v(?P<version>.*)' @@ -70,7 +73,7 @@ dependencies = [ "ruff>=0.0.243", ] [tool.hatch.envs.lint.scripts] -typing = "mypy --install-types --non-interactive {args:src/elasticsearch_haystack tests}" +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" style = [ "ruff {args:.}", "black --check --diff {args:.}", @@ -139,26 +142,23 @@ unfixable = [ ] [tool.ruff.isort] -known-first-party = ["elasticsearch_haystack"] +known-first-party = ["src"] [tool.ruff.flake8-tidy-imports] -ban-relative-imports = "all" +ban-relative-imports = "parents" [tool.ruff.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] [tool.coverage.run] -source_pkgs = ["elasticsearch_haystack", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true -omit = [ - "src/elasticsearch_haystack/__about__.py", -] [tool.coverage.paths] -elasticsearch_haystack = ["src/elasticsearch_haystack", "*/elasticsearch-haystack/src/elasticsearch_haystack"] -tests = ["tests", "*/elasticsearch-haystack/tests"] +elasticsearch_haystack = ["src/haystack_integrations", "*/elasticsearch/src/haystack_integrations"] +tests = ["tests", "*/elasticsearch/src/tests"] [tool.coverage.report] exclude_lines = [ @@ -177,6 +177,7 @@ markers = [ [[tool.mypy.overrides]] module = [ "haystack.*", + "haystack_integrations.*", 
"pytest.*" ] ignore_missing_imports = true diff --git a/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py new file mode 100644 index 000000000..bb49c0fd7 --- /dev/null +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .bm25_retriever import ElasticsearchBM25Retriever +from .embedding_retriever import ElasticsearchEmbeddingRetriever + +__all__ = ["ElasticsearchBM25Retriever", "ElasticsearchEmbeddingRetriever"] diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py similarity index 92% rename from integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py rename to integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py index cb381e2eb..bd96a5fd8 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/bm25_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/bm25_retriever.py @@ -5,8 +5,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.dataclasses import Document - -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore @component @@ -19,8 +18,8 @@ class ElasticsearchBM25Retriever: Usage example: ```python from haystack import Document - from elasticsearch_haystack.document_store import ElasticsearchDocumentStore - from elasticsearch_haystack.bm25_retriever import ElasticsearchBM25Retriever + from 
haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore + from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200") retriever = ElasticsearchBM25Retriever(document_store=document_store) diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py similarity index 96% rename from integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py rename to integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py index 40c455a4f..a2c825d66 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/embedding_retriever.py +++ b/integrations/elasticsearch/src/haystack_integrations/components/retrievers/elasticsearch/embedding_retriever.py @@ -5,8 +5,7 @@ from haystack import component, default_from_dict, default_to_dict from haystack.dataclasses import Document - -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from haystack_integrations.document_stores.elasticsearch.document_store import ElasticsearchDocumentStore @component diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/__init__.py similarity index 66% rename from integrations/elasticsearch/src/elasticsearch_haystack/__init__.py rename to integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/__init__.py index 0c9feacb2..4489971dc 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/__init__.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # 
SPDX-License-Identifier: Apache-2.0 -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from .document_store import ElasticsearchDocumentStore __all__ = ["ElasticsearchDocumentStore"] diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py similarity index 99% rename from integrations/elasticsearch/src/elasticsearch_haystack/document_store.py rename to integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py index 20469f79f..013acacd1 100644 --- a/integrations/elasticsearch/src/elasticsearch_haystack/document_store.py +++ b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/document_store.py @@ -8,14 +8,15 @@ # There are no import stubs for elastic_transport and elasticsearch so mypy fails from elastic_transport import NodeConfig # type: ignore[import-not-found] -from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found] from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.utils.filters import convert -from elasticsearch_haystack.filters import _normalize_filters +from elasticsearch import Elasticsearch, helpers # type: ignore[import-not-found] + +from .filters import _normalize_filters logger = logging.getLogger(__name__) diff --git a/integrations/elasticsearch/src/elasticsearch_haystack/filters.py b/integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/filters.py similarity index 100% rename from integrations/elasticsearch/src/elasticsearch_haystack/filters.py rename to integrations/elasticsearch/src/haystack_integrations/document_stores/elasticsearch/filters.py diff --git 
a/integrations/elasticsearch/tests/test_bm25_retriever.py b/integrations/elasticsearch/tests/test_bm25_retriever.py index bc1fc55bb..dd88cd0a8 100644 --- a/integrations/elasticsearch/tests/test_bm25_retriever.py +++ b/integrations/elasticsearch/tests/test_bm25_retriever.py @@ -4,9 +4,8 @@ from unittest.mock import Mock, patch from haystack.dataclasses import Document - -from elasticsearch_haystack.bm25_retriever import ElasticsearchBM25Retriever -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever +from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore def test_init_default(): @@ -18,13 +17,13 @@ def test_init_default(): assert not retriever._scale_score -@patch("elasticsearch_haystack.document_store.Elasticsearch") +@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_to_dict(_mock_elasticsearch_client): document_store = ElasticsearchDocumentStore(hosts="some fake host") retriever = ElasticsearchBM25Retriever(document_store=document_store) res = retriever.to_dict() assert res == { - "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", + "type": "haystack_integrations.components.retrievers.elasticsearch.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +31,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", @@ -42,14 +41,14 @@ def test_to_dict(_mock_elasticsearch_client): } -@patch("elasticsearch_haystack.document_store.Elasticsearch") 
+@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): data = { - "type": "elasticsearch_haystack.bm25_retriever.ElasticsearchBM25Retriever", + "type": "haystack_integrations.components.retrievers.elasticsearch.bm25_retriever.ElasticsearchBM25Retriever", "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, "fuzziness": "AUTO", diff --git a/integrations/elasticsearch/tests/test_document_store.py b/integrations/elasticsearch/tests/test_document_store.py index b892d9ae4..8d2eab1fb 100644 --- a/integrations/elasticsearch/tests/test_document_store.py +++ b/integrations/elasticsearch/tests/test_document_store.py @@ -12,10 +12,10 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy from haystack.testing.document_store import DocumentStoreBaseTests - -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore +from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore +@pytest.mark.integration class TestDocumentStore(DocumentStoreBaseTests): """ Common test cases will be provided by `DocumentStoreBaseTests` but @@ -67,12 +67,12 @@ def assert_documents_are_equal(self, received: List[Document], expected: List[Do super().assert_documents_are_equal(received, expected) - @patch("elasticsearch_haystack.document_store.Elasticsearch") + @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_to_dict(self, _mock_elasticsearch_client): document_store = ElasticsearchDocumentStore(hosts="some hosts") res = document_store.to_dict() assert res == { - "type": 
"elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", @@ -80,10 +80,10 @@ def test_to_dict(self, _mock_elasticsearch_client): }, } - @patch("elasticsearch_haystack.document_store.Elasticsearch") + @patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_from_dict(self, _mock_elasticsearch_client): data = { - "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", "init_parameters": { "hosts": "some hosts", "index": "default", diff --git a/integrations/elasticsearch/tests/test_embedding_retriever.py b/integrations/elasticsearch/tests/test_embedding_retriever.py index fd60b0940..f632c3655 100644 --- a/integrations/elasticsearch/tests/test_embedding_retriever.py +++ b/integrations/elasticsearch/tests/test_embedding_retriever.py @@ -4,9 +4,8 @@ from unittest.mock import Mock, patch from haystack.dataclasses import Document - -from elasticsearch_haystack.document_store import ElasticsearchDocumentStore -from elasticsearch_haystack.embedding_retriever import ElasticsearchEmbeddingRetriever +from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever +from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore def test_init_default(): @@ -18,13 +17,14 @@ def test_init_default(): assert retriever._num_candidates is None -@patch("elasticsearch_haystack.document_store.Elasticsearch") +@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_to_dict(_mock_elasticsearch_client): document_store = ElasticsearchDocumentStore(hosts="some fake host") retriever = 
ElasticsearchEmbeddingRetriever(document_store=document_store) res = retriever.to_dict() + t = "haystack_integrations.components.retrievers.elasticsearch.embedding_retriever.ElasticsearchEmbeddingRetriever" assert res == { - "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", + "type": t, "init_parameters": { "document_store": { "init_parameters": { @@ -32,7 +32,7 @@ def test_to_dict(_mock_elasticsearch_client): "index": "default", "embedding_similarity_function": "cosine", }, - "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, @@ -41,14 +41,15 @@ def test_to_dict(_mock_elasticsearch_client): } -@patch("elasticsearch_haystack.document_store.Elasticsearch") +@patch("haystack_integrations.document_stores.elasticsearch.document_store.Elasticsearch") def test_from_dict(_mock_elasticsearch_client): + t = "haystack_integrations.components.retrievers.elasticsearch.embedding_retriever.ElasticsearchEmbeddingRetriever" data = { - "type": "elasticsearch_haystack.embedding_retriever.ElasticsearchEmbeddingRetriever", + "type": t, "init_parameters": { "document_store": { "init_parameters": {"hosts": "some fake host", "index": "default"}, - "type": "elasticsearch_haystack.document_store.ElasticsearchDocumentStore", + "type": "haystack_integrations.document_stores.elasticsearch.document_store.ElasticsearchDocumentStore", }, "filters": {}, "top_k": 10, diff --git a/integrations/elasticsearch/tests/test_filters.py b/integrations/elasticsearch/tests/test_filters.py index 3cf125fc7..86e5cba74 100644 --- a/integrations/elasticsearch/tests/test_filters.py +++ b/integrations/elasticsearch/tests/test_filters.py @@ -3,8 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest from haystack.errors import FilterError - -from elasticsearch_haystack.filters import _normalize_filters, 
_normalize_ranges +from haystack_integrations.document_stores.elasticsearch.filters import _normalize_filters, _normalize_ranges filters_data = [ ( From cd78080983e3b9e2caf745ed51116f29137d5297 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 19 Jan 2024 14:02:59 +0100 Subject: [PATCH 5/5] mount import path under haystack_integrations (#245) --- integrations/gradient/pyproject.toml | 17 +++++++++-------- .../src/gradient_haystack/embedders/__init__.py | 3 --- .../src/gradient_haystack/generator/__init__.py | 3 --- .../components/embedders/gradient/__init__.py | 7 +++++++ .../gradient}/gradient_document_embedder.py | 0 .../gradient}/gradient_text_embedder.py | 0 .../components/generators/gradient}/__init__.py | 3 +++ .../components/generators/gradient}/base.py | 0 .../tests/test_gradient_document_embedder.py | 5 +++-- .../tests/test_gradient_rag_pipelines.py | 5 ++--- .../tests/test_gradient_text_embedder.py | 4 ++-- 11 files changed, 26 insertions(+), 21 deletions(-) delete mode 100644 integrations/gradient/src/gradient_haystack/embedders/__init__.py delete mode 100644 integrations/gradient/src/gradient_haystack/generator/__init__.py create mode 100644 integrations/gradient/src/haystack_integrations/components/embedders/gradient/__init__.py rename integrations/gradient/src/{gradient_haystack/embedders => haystack_integrations/components/embedders/gradient}/gradient_document_embedder.py (100%) rename integrations/gradient/src/{gradient_haystack/embedders => haystack_integrations/components/embedders/gradient}/gradient_text_embedder.py (100%) rename integrations/gradient/src/{gradient_haystack => haystack_integrations/components/generators/gradient}/__init__.py (61%) rename integrations/gradient/src/{gradient_haystack/generator => haystack_integrations/components/generators/gradient}/base.py (100%) diff --git a/integrations/gradient/pyproject.toml b/integrations/gradient/pyproject.toml index 8b7ca65c4..22140bba5 100644 --- 
a/integrations/gradient/pyproject.toml +++ b/integrations/gradient/pyproject.toml @@ -37,6 +37,9 @@ Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/m Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/gradient" +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + [tool.hatch.version] source = "vcs" tag-pattern = 'integrations\/gradient-v(?P<version>.*)' @@ -73,7 +76,7 @@ dependencies = [ "ruff>=0.0.243", ] [tool.hatch.envs.lint.scripts] -typing = "mypy --install-types --non-interactive {args:src/gradient_haystack tests}" +typing = "mypy --install-types --non-interactive --explicit-package-bases {args:src/ tests}" style = [ "ruff {args:.}", "black --check --diff {args:.}", @@ -139,25 +142,22 @@ unfixable = [ ] [tool.ruff.isort] -known-first-party = ["gradient_haystack"] +known-first-party = ["haystack_integrations"] [tool.ruff.flake8-tidy-imports] -ban-relative-imports = "all" +ban-relative-imports = "parents" [tool.ruff.per-file-ignores] # Tests can use magic values, assertions, and relative imports "tests/**/*" = ["PLR2004", "S101", "TID252"] [tool.coverage.run] -source_pkgs = ["gradient_haystack", "tests"] +source_pkgs = ["src", "tests"] branch = true parallel = true -omit = [ - "src/gradient_haystack/__about__.py", -] [tool.coverage.paths] -gradient_haystack = ["src/gradient_haystack", "*/gradient-haystack/src/gradient_haystack"] +gradient_haystack = ["src"] tests = ["tests", "*/gradient-haystack/tests"] [tool.coverage.report] @@ -171,6 +171,7 @@ exclude_lines = [ module = [ "gradientai.*", "haystack.*", + "haystack_integrations.*", "pytest.*", "numpy.*", ] diff --git a/integrations/gradient/src/gradient_haystack/embedders/__init__.py b/integrations/gradient/src/gradient_haystack/embedders/__init__.py deleted file mode 100644 index e873bc332..000000000 --- 
a/integrations/gradient/src/gradient_haystack/embedders/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/gradient/src/gradient_haystack/generator/__init__.py b/integrations/gradient/src/gradient_haystack/generator/__init__.py deleted file mode 100644 index e873bc332..000000000 --- a/integrations/gradient/src/gradient_haystack/generator/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-FileCopyrightText: 2023-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/gradient/src/haystack_integrations/components/embedders/gradient/__init__.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/__init__.py new file mode 100644 index 000000000..7fbba1bab --- /dev/null +++ b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2023-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 +from .gradient_document_embedder import GradientDocumentEmbedder +from .gradient_text_embedder import GradientTextEmbedder + +__all__ = ["GradientDocumentEmbedder", "GradientTextEmbedder"] diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py similarity index 100% rename from integrations/gradient/src/gradient_haystack/embedders/gradient_document_embedder.py rename to integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_document_embedder.py diff --git a/integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py b/integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py similarity index 100% rename from integrations/gradient/src/gradient_haystack/embedders/gradient_text_embedder.py 
rename to integrations/gradient/src/haystack_integrations/components/embedders/gradient/gradient_text_embedder.py diff --git a/integrations/gradient/src/gradient_haystack/__init__.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/__init__.py similarity index 61% rename from integrations/gradient/src/gradient_haystack/__init__.py rename to integrations/gradient/src/haystack_integrations/components/generators/gradient/__init__.py index e873bc332..a9d7cd421 100644 --- a/integrations/gradient/src/gradient_haystack/__init__.py +++ b/integrations/gradient/src/haystack_integrations/components/generators/gradient/__init__.py @@ -1,3 +1,6 @@ # SPDX-FileCopyrightText: 2023-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 +from .base import GradientGenerator + +__all__ = ["GradientGenerator"] diff --git a/integrations/gradient/src/gradient_haystack/generator/base.py b/integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py similarity index 100% rename from integrations/gradient/src/gradient_haystack/generator/base.py rename to integrations/gradient/src/haystack_integrations/components/generators/gradient/base.py diff --git a/integrations/gradient/tests/test_gradient_document_embedder.py b/integrations/gradient/tests/test_gradient_document_embedder.py index dc59a76fb..738d1e8a4 100644 --- a/integrations/gradient/tests/test_gradient_document_embedder.py +++ b/integrations/gradient/tests/test_gradient_document_embedder.py @@ -5,7 +5,7 @@ from gradientai.openapi.client.models.generate_embedding_success import GenerateEmbeddingSuccess from haystack import Document -from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder +from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder access_token = "access_token" workspace_id = "workspace_id" @@ -52,8 +52,9 @@ def test_init_from_params_precedence(self, monkeypatch): def test_to_dict(self): 
component = GradientDocumentEmbedder(access_token=access_token, workspace_id=workspace_id) data = component.to_dict() + t = "haystack_integrations.components.embedders.gradient.gradient_document_embedder.GradientDocumentEmbedder" assert data == { - "type": "gradient_haystack.embedders.gradient_document_embedder.GradientDocumentEmbedder", + "type": t, "init_parameters": {"workspace_id": workspace_id, "model": "bge-large"}, } diff --git a/integrations/gradient/tests/test_gradient_rag_pipelines.py b/integrations/gradient/tests/test_gradient_rag_pipelines.py index 9d3b11486..af987620c 100644 --- a/integrations/gradient/tests/test_gradient_rag_pipelines.py +++ b/integrations/gradient/tests/test_gradient_rag_pipelines.py @@ -9,9 +9,8 @@ from haystack.components.writers import DocumentWriter from haystack.document_stores.in_memory import InMemoryDocumentStore -from gradient_haystack.embedders.gradient_document_embedder import GradientDocumentEmbedder -from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder -from gradient_haystack.generator.base import GradientGenerator +from haystack_integrations.components.embedders.gradient import GradientDocumentEmbedder, GradientTextEmbedder +from haystack_integrations.components.generators.gradient import GradientGenerator @pytest.mark.skipif( diff --git a/integrations/gradient/tests/test_gradient_text_embedder.py b/integrations/gradient/tests/test_gradient_text_embedder.py index bd4b396ca..3350c02b3 100644 --- a/integrations/gradient/tests/test_gradient_text_embedder.py +++ b/integrations/gradient/tests/test_gradient_text_embedder.py @@ -4,7 +4,7 @@ import pytest from gradientai.openapi.client.models.generate_embedding_success import GenerateEmbeddingSuccess -from gradient_haystack.embedders.gradient_text_embedder import GradientTextEmbedder +from haystack_integrations.components.embedders.gradient import GradientTextEmbedder access_token = "access_token" workspace_id = "workspace_id" @@ -52,7 +52,7 @@ 
def test_to_dict(self): component = GradientTextEmbedder(access_token=access_token, workspace_id=workspace_id) data = component.to_dict() assert data == { - "type": "gradient_haystack.embedders.gradient_text_embedder.GradientTextEmbedder", + "type": "haystack_integrations.components.embedders.gradient.gradient_text_embedder.GradientTextEmbedder", "init_parameters": {"workspace_id": workspace_id, "model": "bge-large"}, }