From ae62f2e422748b84010c368f211d30d1e46ad7de Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Sat, 16 Sep 2023 16:53:45 +0530 Subject: [PATCH 01/14] Add embedder files --- .../embedders/cohere_text_embedder.py | 122 +++++++++++++ .../embedders/test_cohere_text_embedder.py | 160 ++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 haystack/preview/components/embedders/cohere_text_embedder.py create mode 100644 test/preview/components/embedders/test_cohere_text_embedder.py diff --git a/haystack/preview/components/embedders/cohere_text_embedder.py b/haystack/preview/components/embedders/cohere_text_embedder.py new file mode 100644 index 0000000000..c26b0029c4 --- /dev/null +++ b/haystack/preview/components/embedders/cohere_text_embedder.py @@ -0,0 +1,122 @@ +from typing import List, Optional, Dict, Any +import os + +from haystack.preview import component, default_to_dict, default_from_dict +from haystack.preview.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install cohere'") as cohere_import: + from cohere import Client, AsyncClient, CohereError + + +API_BASE_URL = "https://api.cohere.ai/v1/embed" + + +@component +class CohereTextEmbedder: + """ + A component for embedding strings using Cohere models. + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: str = "embed-english-v2.0", + api_base_url: str = API_BASE_URL, + truncate: str = "END", + use_async_client: bool = False, + max_retries: Optional[int] = 3, + timeout: Optional[int] = 120, + ): + """ + Create a CohereTextEmbedder component. + + :param api_key: The Cohere API key. It can be explicitly provided or automatically read from the environment variable COHERE_API_KEY (recommended). + :param model_name: The name of the model to use, defaults to `"embed-english-v2.0"`. 
Supported Models are `"embed-english-v2.0"`/ `"large"`, `"embed-english-light-v2.0"`/ `"small"`, `"embed-multilingual-v2.0"`/ `"multilingual-22-12"`. + :param api_base_url: The Cohere API Base url, defaults to `https://api.cohere.ai/v1/embed`. + :param truncate: Truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to `"END"`. Passing START will discard the start of the input. END will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If NONE is selected, when the input exceeds the maximum input token length an error will be returned. + :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. + :param max_retries: maximal number of retries for requests, defaults to `3`. + :param timeout: request timeout in seconds, defaults to `120`. + """ + + if api_key is None: + try: + api_key = os.environ["COHERE_API_KEY"] + except KeyError as error_msg: + raise ValueError( + "CohereTextEmbedder expects an Cohere API key. " + "Please provide one by setting the environment variable COHERE_API_KEY (recommended) or by passing it explicitly." + ) from error_msg + + self.api_key = api_key + self.model_name = model_name + self.api_base_url = api_base_url + self.truncate = truncate + self.use_async_client = use_async_client + self.max_retries = max_retries + self.timeout = timeout + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. 
+ """ + return default_to_dict( + self, + api_key=self.api_key, + model_name=self.model_name, + api_base_url=self.api_base_url, + truncate=self.truncate, + use_async_client=self.use_async_client, + max_retries=self.max_retries, + timeout=self.timeout, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "CohereTextEmbedder": + """ + Deserialize this component from a dictionary. + """ + return default_from_dict(cls, data) + + async def _get_async_response(self, cohere_async_client: AsyncClient, text: str): + try: + response = await cohere_async_client.embed(texts=[text], model=self.model_name, truncate=self.truncate) + metadata = response.meta + embedding = [list(map(float, emb)) for emb in response.embeddings][0] + + except CohereError as error_response: + print(error_response.message) + + return embedding, metadata + + @component.output_types(embedding=List[float], metadata=Dict[str, Any]) + def run(self, text: str): + """Embed a string.""" + if not isinstance(text, str): + raise TypeError( + "CohereTextEmbedder expects a string as input." + "In case you want to embed a list of Documents, please use the CohereDocumentEmbedder." 
+ ) + + # Establish connection to API + + if self.use_async_client == True: + cohere_client = AsyncClient( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) + embedding, metadata = self._get_async_response(cohere_client, text) + + else: + cohere_client = Client( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) + + try: + response = cohere_client.embed(texts=[text], model=self.model_name, truncate=self.truncate) + metadata = response.meta + embedding = [list(map(float, emb)) for emb in response.embeddings][0] + + except CohereError as error_response: + print(error_response.message) + + return {"embedding": embedding, "metadata": metadata} diff --git a/test/preview/components/embedders/test_cohere_text_embedder.py b/test/preview/components/embedders/test_cohere_text_embedder.py new file mode 100644 index 0000000000..7f1b3f251c --- /dev/null +++ b/test/preview/components/embedders/test_cohere_text_embedder.py @@ -0,0 +1,160 @@ +from unittest.mock import patch +import pytest + +from cohere.responses.embeddings import Embeddings + + +from haystack.preview.components.embedders.cohere_text_embedder import CohereTextEmbedder + + +class TestCohereTextEmbedder: + @pytest.mark.unit + def test_init_default(self): + """ + Test default initialization parameters for CohereTextEmbedder. + """ + embedder = CohereTextEmbedder(api_key="test-api-key") + + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-english-v2.0" + assert embedder.api_base_url == "https://api.cohere.ai/v1/embed" + assert embedder.truncate == "END" + assert embedder.use_async_client == False + assert embedder.max_retries == 3 + assert embedder.timeout == 120 + + @pytest.mark.unit + def test_init_with_parameters(self): + """ + Test custom initialization parameters for CohereTextEmbedder. 
+ """ + embedder = CohereTextEmbedder( + api_key="test-api-key", + model_name="embed-multilingual-v2.0", + api_base_url="https://custom-api-base-url.com", + truncate="START", + use_async_client=True, + max_retries=5, + timeout=60, + ) + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-multilingual-v2.0" + assert embedder.api_base_url == "https://custom-api-base-url.com" + assert embedder.truncate == "START" + assert embedder.use_async_client == True + assert embedder.max_retries == 5 + assert embedder.timeout == 60 + + @pytest.mark.unit + def test_to_dict(self): + """ + Test serialization of this component to a dictionary, using default initialization parameters. + """ + embedder_component = CohereTextEmbedder(api_key="test-api-key") + component_dict = embedder_component.to_dict() + assert component_dict == { + "type": "CohereTextEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-english-v2.0", + "api_base_url": "https://api.cohere.ai/v1/embed", + "truncate": "END", + "use_async_client": False, + "max_retries": 3, + "timeout": 120, + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + """ + Test serialization of this component to a dictionary, using custom initialization parameters. 
+ """ + embedder_component = CohereTextEmbedder( + api_key="test-api-key", + model_name="embed-multilingual-v2.0", + api_base_url="https://custom-api-base-url.com", + truncate="START", + use_async_client=True, + max_retries=5, + timeout=60, + ) + component_dict = embedder_component.to_dict() + assert component_dict == { + "type": "CohereTextEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-multilingual-v2.0", + "api_base_url": "https://custom-api-base-url.com", + "truncate": "START", + "use_async_client": True, + "max_retries": 5, + "timeout": 60, + }, + } + + @pytest.mark.unit + def test_from_dict(self): + """ + Test deserialization of this component from a dictionary, using default initialization parameters. + """ + embedder_component_dict = { + "type": "CohereTextEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-english-v2.0", + "api_base_url": "https://api.cohere.ai/v1/embed", + "truncate": "END", + "use_async_client": False, + "max_retries": 3, + "timeout": 120, + }, + } + embedder = CohereTextEmbedder.from_dict(embedder_component_dict) + + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-english-v2.0" + assert embedder.api_base_url == "https://api.cohere.ai/v1/embed" + assert embedder.truncate == "END" + assert embedder.use_async_client == False + assert embedder.max_retries == 3 + assert embedder.timeout == 120 + + @pytest.mark.unit + def test_from_dict_with_custom_init_parameters(self): + """ + Test deserialization of this component from a dictionary, using custom initialization parameters. 
+ """ + embedder_component_dict = { + "type": "CohereTextEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-multilingual-v2.0", + "api_base_url": "https://custom-api-base-url.com", + "truncate": "START", + "use_async_client": True, + "max_retries": 5, + "timeout": 60, + }, + } + embedder = CohereTextEmbedder.from_dict(embedder_component_dict) + + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-multilingual-v2.0" + assert embedder.api_base_url == "https://custom-api-base-url.com" + assert embedder.truncate == "START" + assert embedder.use_async_client == True + assert embedder.max_retries == 5 + assert embedder.timeout == 60 + + @pytest.mark.unit + def test_run_wrong_input_format(self): + """ + Test for checking incorrect input when creating embedding. + """ + embedder = CohereTextEmbedder(api_key="test-api-key") + + list_integers_input = ["text_snippet_1", "text_snippet_2"] + + with pytest.raises(TypeError, match="CohereTextEmbedder expects a string as input"): + embedder.run(text=list_integers_input) From 0b59e1a2dcbaf20dccce78ee48402eccca290250 Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Sat, 16 Sep 2023 17:22:02 +0530 Subject: [PATCH 02/14] Add release-notes --- .../notes/add-CohereTextEmbedder-a429ecf033f36631.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml diff --git a/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml new file mode 100644 index 0000000000..8e65b7c42b --- /dev/null +++ b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml @@ -0,0 +1,4 @@ +--- +preview: + - | + Add `CohereTextEmbedder`, a component that uses Cohere embedding models to embed strings into vectors. 
From ca47506b46be4614bf64c7b985f5e1e7bc24399d Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Mon, 18 Sep 2023 20:11:17 +0530 Subject: [PATCH 03/14] Fix formatting in test_cohere_text_embedder.py --- test/preview/components/embedders/test_cohere_text_embedder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/preview/components/embedders/test_cohere_text_embedder.py b/test/preview/components/embedders/test_cohere_text_embedder.py index 7f1b3f251c..aab7850be9 100644 --- a/test/preview/components/embedders/test_cohere_text_embedder.py +++ b/test/preview/components/embedders/test_cohere_text_embedder.py @@ -1,9 +1,6 @@ from unittest.mock import patch import pytest - from cohere.responses.embeddings import Embeddings - - from haystack.preview.components.embedders.cohere_text_embedder import CohereTextEmbedder From 93c2499ef430a777be48f134e4de45a666b79ec3 Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:05:38 +0530 Subject: [PATCH 04/14] Add cohere document embedder --- .../embedders/cohere_document_embedder.py | 182 ++++++++++++++++++ .../test_cohere_document_embedder.py | 167 ++++++++++++++++ 2 files changed, 349 insertions(+) create mode 100644 haystack/preview/components/embedders/cohere_document_embedder.py create mode 100644 test/preview/components/embedders/test_cohere_document_embedder.py diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py new file mode 100644 index 0000000000..fe5e17a403 --- /dev/null +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -0,0 +1,182 @@ +from typing import List, Optional, Union, Dict, Any, Tuple +import os +from tqdm import tqdm + +from haystack.preview import component, Document, default_to_dict, default_from_dict +from haystack.preview.lazy_imports import LazyImport + +with LazyImport(message="Run 'pip install 
cohere'") as cohere_import: + from cohere import Client, AsyncClient, CohereError + +API_BASE_URL = "https://api.cohere.ai/v1/embed" + + +@component +class CohereDocumentEmbedder: + """ + A component for computing Document embeddings using Cohere models. + The embedding of each Document is stored in the `embedding` field of the Document. + """ + + def __init__( + self, + api_key: Optional[str] = None, + model_name: str = "embed-english-v2.0", + api_base_url: str = API_BASE_URL, + truncate: str = "END", + use_async_client: bool = False, + max_retries: Optional[int] = 3, + timeout: Optional[int] = 120, + batch_size: int = 32, + progress_bar: bool = True, + metadata_fields_to_embed: Optional[List[str]] = None, + embedding_separator: str = "\n", + ): + """ + Create a CohereDocumentEmbedder component. + + :param api_key: The Cohere API key. It can be explicitly provided or automatically read from the environment variable COHERE_API_KEY (recommended). + :param model_name: The name of the model to use, defaults to `"embed-english-v2.0"`. Supported Models are `"embed-english-v2.0"`/ `"large"`, `"embed-english-light-v2.0"`/ `"small"`, `"embed-multilingual-v2.0"`/ `"multilingual-22-12"`. + :param api_base_url: The Cohere API Base url, defaults to `https://api.cohere.ai/v1/embed`. + :param truncate: Truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to `"END"`. Passing START will discard the start of the input. END will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If NONE is selected, when the input exceeds the maximum input token length an error will be returned. + :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. + :param max_retries: maximal number of retries for requests, defaults to `3`. 
+ :param timeout: request timeout in seconds, defaults to `120`. + :param batch_size: Number of Documents to encode at once. + :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments + to keep the logs clean. + :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text. + :param embedding_separator: Separator used to concatenate the meta fields to the Document text. + """ + + if api_key is None: + try: + api_key = os.environ["COHERE_API_KEY"] + except KeyError as error_msg: + raise ValueError( + "CohereDocumentEmbedder expects an Cohere API key. " + "Please provide one by setting the environment variable COHERE_API_KEY (recommended) or by passing it explicitly." + ) from error_msg + + self.api_key = api_key + self.model_name = model_name + self.api_base_url = api_base_url + self.truncate = truncate + self.use_async_client = use_async_client + self.max_retries = max_retries + self.timeout = timeout + self.batch_size = batch_size + self.progress_bar = progress_bar + self.metadata_fields_to_embed = metadata_fields_to_embed or [] + self.embedding_separator = embedding_separator + + def to_dict(self) -> Dict[str, Any]: + """ + Serialize this component to a dictionary. + """ + return default_to_dict( + self, + api_key=self.api_key, + model_name=self.model_name, + api_base_url=self.api_base_url, + truncate=self.truncate, + use_async_client=self.use_async_client, + max_retries=self.max_retries, + timeout=self.timeout, + batch_size=self.batch_size, + progress_bar=self.progress_bar, + metadata_fields_to_embed=self.metadata_fields_to_embed, + embedding_separator=self.embedding_separator, + ) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "CohereDocumentEmbedder": + """ + Deserialize this component from a dictionary. 
+ """ + return default_from_dict(cls, data) + + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: + """ + Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. + """ + texts_to_embed = [] + for doc in documents: + meta_values_to_embed = [ + str(doc.metadata[key]) + for key in self.metadata_fields_to_embed + if key in doc.metadata and doc.metadata[key] is not None + ] + + text_to_embed = self.embedding_separator.join(meta_values_to_embed + [doc.text or ""]) + texts_to_embed.append(text_to_embed) + return texts_to_embed + + def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[str], Dict[str, Any]]: + """ + Embed a list of texts in batches. + """ + + all_embeddings = [] + metadata = {} + cohere_client = Client( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) + + for i in tqdm( + range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = texts_to_embed[i : i + batch_size] + response = cohere_client.embed(batch) + embeddings = [el["embedding"] for el in response.data] + all_embeddings.extend(embeddings) + + if "model" not in metadata: + metadata["model"] = response.model + if "usage" not in metadata: + metadata["usage"] = dict(response.usage.items()) + else: + metadata["usage"]["prompt_tokens"] += response.usage.prompt_tokens + metadata["usage"]["total_tokens"] += response.usage.total_tokens + + return all_embeddings, metadata + + @component.output_types(documents=List[Document], metadata=Dict[str, Any]) + def run(self, documents: List[Document]): + """ + Embed a list of Documents. + The embedding of each Document is stored in the `embedding` field of the Document. + + :param documents: A list of Documents to embed. 
+ """ + if not isinstance(documents, list) or not isinstance(documents[0], Document): + raise TypeError( + "CohereDocumentEmbedder expects a list of Documents as input." + "In case you want to embed a string, please use the CohereTextEmbedder." + ) + + cohere_client = Client( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) + + texts_to_embed = self._prepare_texts_to_embed(documents=documents) + + all_embeddings = [] + metadata = {} + for i in tqdm( + range(0, len(texts_to_embed), self.batch_size), disable=not self.progress_bar, desc="Calculating embeddings" + ): + batch = texts_to_embed[i : i + self.batch_size] + response = cohere_client.embed(batch) + embeddings = [list(map(float, emb)) for emb in response.embeddings] + all_embeddings.extend(embeddings) + + metadata = response.meta + + documents_with_embeddings = [] + for doc, emb in zip(documents, all_embeddings): + doc_as_dict = doc.to_dict() + doc_as_dict["embedding"] = emb + documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + + return {"documents": documents_with_embeddings, "metadata": metadata} diff --git a/test/preview/components/embedders/test_cohere_document_embedder.py b/test/preview/components/embedders/test_cohere_document_embedder.py new file mode 100644 index 0000000000..ee5dd9ef66 --- /dev/null +++ b/test/preview/components/embedders/test_cohere_document_embedder.py @@ -0,0 +1,167 @@ +from unittest.mock import patch, MagicMock +import pytest +from cohere.responses.embeddings import Embeddings +import numpy as np +from haystack.preview import Document +from haystack.preview.components.embedders.cohere_document_embedder import CohereDocumentEmbedder + + +class TestCohereDocumentEmbedder: + @pytest.mark.unit + def test_init_default(self): + embedder = CohereDocumentEmbedder(api_key="test-api-key") + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-english-v2.0" + assert embedder.api_base_url == 
"https://api.cohere.ai/v1/embed" + assert embedder.truncate == "END" + assert embedder.use_async_client == False + assert embedder.max_retries == 3 + assert embedder.timeout == 120 + assert embedder.batch_size == 32 + assert embedder.progress_bar == True + assert embedder.metadata_fields_to_embed == [] + assert embedder.embedding_separator == "\n" + + @pytest.mark.unit + def test_init_with_parameters(self): + embedder = CohereDocumentEmbedder( + api_key="test-api-key", + model_name="embed-multilingual-v2.0", + api_base_url="https://custom-api-base-url.com", + truncate="START", + use_async_client=True, + max_retries=5, + timeout=60, + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["test_field"], + embedding_separator="-", + ) + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-multilingual-v2.0" + assert embedder.api_base_url == "https://custom-api-base-url.com" + assert embedder.truncate == "START" + assert embedder.use_async_client == True + assert embedder.max_retries == 5 + assert embedder.timeout == 60 + assert embedder.batch_size == 64 + assert embedder.progress_bar == False + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == "-" + + @pytest.mark.unit + def test_to_dict(self): + embedder_component = CohereDocumentEmbedder(api_key="test-api-key") + component_dict = embedder_component.to_dict() + assert component_dict == { + "type": "CohereDocumentEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-english-v2.0", + "api_base_url": "https://api.cohere.ai/v1/embed", + "truncate": "END", + "use_async_client": False, + "max_retries": 3, + "timeout": 120, + "batch_size": 32, + "progress_bar": True, + "metadata_fields_to_embed": [], + "embedding_separator": "\n", + }, + } + + @pytest.mark.unit + def test_to_dict_with_custom_init_parameters(self): + embedder_component = CohereDocumentEmbedder( + api_key="test-api-key", + 
model_name="embed-multilingual-v2.0", + api_base_url="https://custom-api-base-url.com", + truncate="START", + use_async_client=True, + max_retries=5, + timeout=60, + batch_size=64, + progress_bar=False, + metadata_fields_to_embed=["text_field"], + embedding_separator="-", + ) + component_dict = embedder_component.to_dict() + assert component_dict == { + "type": "CohereDocumentEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-multilingual-v2.0", + "api_base_url": "https://custom-api-base-url.com", + "truncate": "START", + "use_async_client": True, + "max_retries": 5, + "timeout": 60, + "batch_size": 64, + "progress_bar": False, + "metadata_fields_to_embed": ["text_field"], + "embedding_separator": "-", + }, + } + + @pytest.mark.unit + def test_from_dict(self): + embedder_component_dict = { + "type": "CohereDocumentEmbedder", + "init_parameters": { + "api_key": "test-api-key", + "model_name": "embed-english-v2.0", + "api_base_url": "https://api.cohere.ai/v1/embed", + "truncate": "START", + "use_async_client": True, + "max_retries": 5, + "timeout": 60, + "batch_size": 32, + "progress_bar": False, + "metadata_fields_to_embed": ["test_field"], + "embedding_separator": "-", + }, + } + embedder = CohereDocumentEmbedder.from_dict(embedder_component_dict) + assert embedder.api_key == "test-api-key" + assert embedder.model_name == "embed-english-v2.0" + assert embedder.api_base_url == "https://api.cohere.ai/v1/embed" + assert embedder.truncate == "START" + assert embedder.use_async_client == True + assert embedder.max_retries == 5 + assert embedder.timeout == 60 + assert embedder.batch_size == 32 + assert embedder.progress_bar == False + assert embedder.metadata_fields_to_embed == ["test_field"] + assert embedder.embedding_separator == "-" + + @pytest.mark.unit + def test_run(self): + embedder = CohereDocumentEmbedder(api_key="test-api-key") + embedder.embedding_backend = MagicMock() + embedder.embedding_backend.embed = lambda x, **kwargs: 
np.random.rand(len(x), 2).tolist() + + docs = [ + Document(text="I love cheese", metadata={"topic": "Cuisine"}), + Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + ] + + result = embedder.run(docs) + embeddings = result["documents"] + + assert isinstance(embeddings, list) + assert len(embeddings) == len(docs) + for embedding in embeddings: + assert isinstance(embedding, list) + assert isinstance(embedding[0], float) + + @pytest.mark.unit + def test_run_wrong_input_format(self): + embedder = CohereDocumentEmbedder(api_key="test-api-key") + + string_input = "text" + list_integers_input = [1, 2, 3] + + with pytest.raises(TypeError, match="CohereDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=string_input) + with pytest.raises(TypeError, match="CohereDocumentEmbedder expects a list of Documents as input"): + embedder.run(documents=list_integers_input) From ffdaf8925bdeff5fc6e0914374659086bfcc97b5 Mon Sep 17 00:00:00 2001 From: Varun Mathur <97465624+vrunm@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:10:33 +0530 Subject: [PATCH 05/14] Update add-CohereTextEmbedder-a429ecf033f36631.yaml --- releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml index 8e65b7c42b..152e0b22ef 100644 --- a/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml +++ b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml @@ -2,3 +2,5 @@ preview: - | Add `CohereTextEmbedder`, a component that uses Cohere embedding models to embed strings into vectors. + - | + Add `CohereDocumentEmbedder`, a component that uses Cohere embedding models to embeds a list of Documents.. 
From af1ba0ef546cae8e8165bfb5c0bc85a1d31f2aa7 Mon Sep 17 00:00:00 2001 From: Varun Mathur <97465624+vrunm@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:10:58 +0530 Subject: [PATCH 06/14] Update add-CohereTextEmbedder-a429ecf033f36631.yaml --- releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml index 152e0b22ef..54e2a96612 100644 --- a/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml +++ b/releasenotes/notes/add-CohereTextEmbedder-a429ecf033f36631.yaml @@ -3,4 +3,4 @@ preview: - | Add `CohereTextEmbedder`, a component that uses Cohere embedding models to embed strings into vectors. - | - Add `CohereDocumentEmbedder`, a component that uses Cohere embedding models to embeds a list of Documents.. + Add `CohereDocumentEmbedder`, a component that uses Cohere embedding models to embed a list of Documents. From e012b8f4b8b9a4e70d9022c4e434c82840f3a3dc Mon Sep 17 00:00:00 2001 From: Varun Mathur <97465624+vrunm@users.noreply.github.com> Date: Wed, 4 Oct 2023 12:25:41 +0530 Subject: [PATCH 07/14] Apply suggestions from code review Co-authored-by: Daria Fokina --- .../preview/components/embedders/cohere_document_embedder.py | 2 +- haystack/preview/components/embedders/cohere_text_embedder.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index fe5e17a403..2b1838c2d5 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -41,7 +41,7 @@ def __init__( :param truncate: Truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to `"END"`. 
Passing START will discard the start of the input. END will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If NONE is selected, when the input exceeds the maximum input token length an error will be returned. :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. :param max_retries: maximal number of retries for requests, defaults to `3`. - :param timeout: request timeout in seconds, defaults to `120`. + :param timeout: Request timeout in seconds, defaults to `120`. :param batch_size: Number of Documents to encode at once. :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. diff --git a/haystack/preview/components/embedders/cohere_text_embedder.py b/haystack/preview/components/embedders/cohere_text_embedder.py index c26b0029c4..b1caac5724 100644 --- a/haystack/preview/components/embedders/cohere_text_embedder.py +++ b/haystack/preview/components/embedders/cohere_text_embedder.py @@ -35,8 +35,8 @@ def __init__( :param api_base_url: The Cohere API Base url, defaults to `https://api.cohere.ai/v1/embed`. :param truncate: Truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to `"END"`. Passing START will discard the start of the input. END will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If NONE is selected, when the input exceeds the maximum input token length an error will be returned. :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. - :param max_retries: maximal number of retries for requests, defaults to `3`. 
- :param timeout: request timeout in seconds, defaults to `120`. + :param max_retries: Maximum number of retries for requests, defaults to `3`. + :param timeout: Request timeout in seconds, defaults to `120`. """ if api_key is None: From ccde635aeb0112c240de87e5b1e22071e97ef755 Mon Sep 17 00:00:00 2001 From: Varun Mathur <97465624+vrunm@users.noreply.github.com> Date: Wed, 4 Oct 2023 12:26:05 +0530 Subject: [PATCH 08/14] Apply suggestions from code review Co-authored-by: Daria Fokina --- .../preview/components/embedders/cohere_document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index 2b1838c2d5..1b68d950eb 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -40,7 +40,7 @@ def __init__( :param api_base_url: The Cohere API Base url, defaults to `https://api.cohere.ai/v1/embed`. :param truncate: Truncate embeddings that are too long from start or end, ("NONE"|"START"|"END"), defaults to `"END"`. Passing START will discard the start of the input. END will discard the end of the input. In both cases, input is discarded until the remaining input is exactly the maximum input token length for the model. If NONE is selected, when the input exceeds the maximum input token length an error will be returned. :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. - :param max_retries: maximal number of retries for requests, defaults to `3`. + :param max_retries: Maximum number of retries for requests, defaults to `3`. :param timeout: Request timeout in seconds, defaults to `120`. :param batch_size: Number of Documents to encode at once. :param progress_bar: Whether to show a progress bar or not. 
Can be helpful to disable in production deployments From b12122d0bde3ed3e98af5b65564f8d823454506e Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:29:10 +0530 Subject: [PATCH 09/14] Fix api key serialization --- .../components/embedders/cohere_document_embedder.py | 3 +-- .../preview/components/embedders/cohere_text_embedder.py | 1 - .../components/embedders/test_cohere_document_embedder.py | 6 ++---- .../components/embedders/test_cohere_text_embedder.py | 2 -- 4 files changed, 3 insertions(+), 9 deletions(-) diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index fe5e17a403..021e384ec5 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -42,7 +42,7 @@ def __init__( :param use_async_client: Flag to select the AsyncClient, defaults to `False`. It is recommended to use AsyncClient for applications with many concurrent calls. :param max_retries: maximal number of retries for requests, defaults to `3`. :param timeout: request timeout in seconds, defaults to `120`. - :param batch_size: Number of Documents to encode at once. + :param batch_size: Number of Documents to encode at once. :param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments to keep the logs clean. :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text. 
@@ -76,7 +76,6 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - api_key=self.api_key, model_name=self.model_name, api_base_url=self.api_base_url, truncate=self.truncate, diff --git a/haystack/preview/components/embedders/cohere_text_embedder.py b/haystack/preview/components/embedders/cohere_text_embedder.py index c26b0029c4..f0577ee7a1 100644 --- a/haystack/preview/components/embedders/cohere_text_embedder.py +++ b/haystack/preview/components/embedders/cohere_text_embedder.py @@ -62,7 +62,6 @@ def to_dict(self) -> Dict[str, Any]: """ return default_to_dict( self, - api_key=self.api_key, model_name=self.model_name, api_base_url=self.api_base_url, truncate=self.truncate, diff --git a/test/preview/components/embedders/test_cohere_document_embedder.py b/test/preview/components/embedders/test_cohere_document_embedder.py index ee5dd9ef66..3457db364a 100644 --- a/test/preview/components/embedders/test_cohere_document_embedder.py +++ b/test/preview/components/embedders/test_cohere_document_embedder.py @@ -56,7 +56,6 @@ def test_to_dict(self): assert component_dict == { "type": "CohereDocumentEmbedder", "init_parameters": { - "api_key": "test-api-key", "model_name": "embed-english-v2.0", "api_base_url": "https://api.cohere.ai/v1/embed", "truncate": "END", @@ -89,7 +88,6 @@ def test_to_dict_with_custom_init_parameters(self): assert component_dict == { "type": "CohereDocumentEmbedder", "init_parameters": { - "api_key": "test-api-key", "model_name": "embed-multilingual-v2.0", "api_base_url": "https://custom-api-base-url.com", "truncate": "START", @@ -137,8 +135,8 @@ def test_from_dict(self): @pytest.mark.unit def test_run(self): embedder = CohereDocumentEmbedder(api_key="test-api-key") - embedder.embedding_backend = MagicMock() - embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 2).tolist() + embedder = MagicMock() + embedder.run = lambda x, **kwargs: np.random.rand(len(x), 2).tolist() docs = [ Document(text="I love 
cheese", metadata={"topic": "Cuisine"}), diff --git a/test/preview/components/embedders/test_cohere_text_embedder.py b/test/preview/components/embedders/test_cohere_text_embedder.py index aab7850be9..12330684a4 100644 --- a/test/preview/components/embedders/test_cohere_text_embedder.py +++ b/test/preview/components/embedders/test_cohere_text_embedder.py @@ -52,7 +52,6 @@ def test_to_dict(self): assert component_dict == { "type": "CohereTextEmbedder", "init_parameters": { - "api_key": "test-api-key", "model_name": "embed-english-v2.0", "api_base_url": "https://api.cohere.ai/v1/embed", "truncate": "END", @@ -80,7 +79,6 @@ def test_to_dict_with_custom_init_parameters(self): assert component_dict == { "type": "CohereTextEmbedder", "init_parameters": { - "api_key": "test-api-key", "model_name": "embed-multilingual-v2.0", "api_base_url": "https://custom-api-base-url.com", "truncate": "START", From 45d62787d02510871cb6d071c0c9831daa095bb3 Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Mon, 23 Oct 2023 13:06:07 +0530 Subject: [PATCH 10/14] add async for document embedder --- .../embedders/cohere_document_embedder.py | 97 +++++++++---------- .../embedders/test_cohere_text_embedder.py | 12 ++- 2 files changed, 56 insertions(+), 53 deletions(-) diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index 021e384ec5..0df493d41f 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -95,6 +95,17 @@ def from_dict(cls, data: Dict[str, Any]) -> "CohereDocumentEmbedder": """ return default_from_dict(cls, data) + async def _get_async_response(self, cohere_async_client: AsyncClient, documents: List[Document]): + try: + response = await cohere_async_client.embed(texts=[documents], model=self.model_name, truncate=self.truncate) + metadata = response.meta + 
embedding = [list(map(float, emb)) for emb in response.embeddings][0] + + except CohereError as error_response: + print(error_response.message) + + return embedding, metadata + def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: """ Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. @@ -102,44 +113,13 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed = [] for doc in documents: meta_values_to_embed = [ - str(doc.metadata[key]) - for key in self.metadata_fields_to_embed - if key in doc.metadata and doc.metadata[key] is not None + str(doc.metadata[key]) for key in self.metadata_fields_to_embed if doc.metadata.get(key) is not None ] text_to_embed = self.embedding_separator.join(meta_values_to_embed + [doc.text or ""]) texts_to_embed.append(text_to_embed) return texts_to_embed - def _embed_batch(self, texts_to_embed: List[str], batch_size: int) -> Tuple[List[str], Dict[str, Any]]: - """ - Embed a list of texts in batches. 
- """ - - all_embeddings = [] - metadata = {} - cohere_client = Client( - self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout - ) - - for i in tqdm( - range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" - ): - batch = texts_to_embed[i : i + batch_size] - response = cohere_client.embed(batch) - embeddings = [el["embedding"] for el in response.data] - all_embeddings.extend(embeddings) - - if "model" not in metadata: - metadata["model"] = response.model - if "usage" not in metadata: - metadata["usage"] = dict(response.usage.items()) - else: - metadata["usage"]["prompt_tokens"] += response.usage.prompt_tokens - metadata["usage"]["total_tokens"] += response.usage.total_tokens - - return all_embeddings, metadata - @component.output_types(documents=List[Document], metadata=Dict[str, Any]) def run(self, documents: List[Document]): """ @@ -148,34 +128,47 @@ def run(self, documents: List[Document]): :param documents: A list of Documents to embed. """ + if not isinstance(documents, list) or not isinstance(documents[0], Document): raise TypeError( "CohereDocumentEmbedder expects a list of Documents as input." "In case you want to embed a string, please use the CohereTextEmbedder." 
) - cohere_client = Client( - self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout - ) - - texts_to_embed = self._prepare_texts_to_embed(documents=documents) + # Establish connection to API - all_embeddings = [] - metadata = {} - for i in tqdm( - range(0, len(texts_to_embed), self.batch_size), disable=not self.progress_bar, desc="Calculating embeddings" - ): - batch = texts_to_embed[i : i + self.batch_size] - response = cohere_client.embed(batch) - embeddings = [list(map(float, emb)) for emb in response.embeddings] - all_embeddings.extend(embeddings) + if self.use_async_client == True: + cohere_client = AsyncClient( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) + texts_to_embed = self._prepare_texts_to_embed(cohere_client, documents) - metadata = response.meta + else: + cohere_client = Client( + self.api_key, api_url=self.api_base_url, max_retries=self.max_retries, timeout=self.timeout + ) - documents_with_embeddings = [] - for doc, emb in zip(documents, all_embeddings): - doc_as_dict = doc.to_dict() - doc_as_dict["embedding"] = emb - documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + try: + all_embeddings = [] + metadata = {} + for i in tqdm( + range(0, len(texts_to_embed), self.batch_size), + disable=not self.progress_bar, + desc="Calculating embeddings", + ): + batch = texts_to_embed[i : i + self.batch_size] + response = cohere_client.embed(batch) + embeddings = [list(map(float, emb)) for emb in response.embeddings] + all_embeddings.extend(embeddings) + + metadata = response.meta + + documents_with_embeddings = [] + for doc, emb in zip(documents, all_embeddings): + doc_as_dict = doc.to_dict() + doc_as_dict["embedding"] = emb + documents_with_embeddings.append(Document.from_dict(doc_as_dict)) + except CohereError as error_response: + print(error_response.message) return {"documents": documents_with_embeddings, "metadata": metadata} diff --git 
a/test/preview/components/embedders/test_cohere_text_embedder.py b/test/preview/components/embedders/test_cohere_text_embedder.py index 12330684a4..bb0ed0694a 100644 --- a/test/preview/components/embedders/test_cohere_text_embedder.py +++ b/test/preview/components/embedders/test_cohere_text_embedder.py @@ -1,4 +1,4 @@ -from unittest.mock import patch +from unittest.mock import patch, MagicMock import pytest from cohere.responses.embeddings import Embeddings from haystack.preview.components.embedders.cohere_text_embedder import CohereTextEmbedder @@ -153,3 +153,13 @@ def test_run_wrong_input_format(self): with pytest.raises(TypeError, match="CohereTextEmbedder expects a string as input"): embedder.run(text=list_integers_input) + + @pytest.mark.integration + def test_run(self): + embedder = CohereTextEmbedder(api_key="test-api-key") + embedder = MagicMock() + text = "The food was delicious" + + result = embedder.run(text) + + assert all(isinstance(x, float) for x in result["embedding"]) From f166343a5ed2ef201b94c2d8afc2ff789688221c Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:11:57 +0530 Subject: [PATCH 11/14] update git workflow --- .../preview/components/embedders/cohere_document_embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index 0df493d41f..fcfac89ed1 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, Dict, Any, Tuple +from typing import List, Optional, Dict, Any import os from tqdm import tqdm From d8323d984ca1981f92af0940119d8ca73c653557 Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Thu, 2 Nov 2023 11:40:08 +0530 Subject: [PATCH 12/14] Update 
workflows --- .github/workflows/linting_preview.yml | 4 ++-- .github/workflows/tests_preview.yml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/linting_preview.yml b/.github/workflows/linting_preview.yml index 3e4cbfdc89..e3739619f4 100644 --- a/.github/workflows/linting_preview.yml +++ b/.github/workflows/linting_preview.yml @@ -37,7 +37,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' cohere - name: Mypy if: steps.files.outputs.any_changed == 'true' @@ -71,7 +71,7 @@ jobs: - name: Install Haystack run: | - pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' cohere pip install ./haystack-linter - name: Pylint diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml index 9f570491b7..5ae9294bef 100644 --- a/.github/workflows/tests_preview.yml +++ b/.github/workflows/tests_preview.yml @@ -116,7 +116,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 
'azure-ai-formrecognizer>=3.2.0b2' cohere - name: Run run: pytest -m "unit" test/preview @@ -175,7 +175,7 @@ jobs: sudo apt install ffmpeg # for local Whisper tests - name: Install Haystack - run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' cohere - name: Run run: pytest --maxfail=5 -m "integration" test/preview @@ -230,7 +230,7 @@ jobs: colima start - name: Install Haystack - run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' cohere - name: Run Tika run: docker run -d -p 9998:9998 apache/tika:2.9.0.0 @@ -282,7 +282,7 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: Install Haystack - run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' + run: pip install .[dev,preview] langdetect transformers[torch,sentencepiece]==4.32.1 'sentence-transformers>=2.2.0' pypdf openai-whisper tika 'azure-ai-formrecognizer>=3.2.0b2' cohere - name: Run run: pytest --maxfail=5 -m "integration" test/preview -k 'not tika' From 0feea205e38597b64c7410336595501e064ead2d Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Thu, 23 Nov 2023 17:37:54 +0530 Subject: [PATCH 13/14] Fix conflicts --- .github/workflows/linting_preview.yml | 1 - .github/workflows/tests_preview.yml | 3 +-- 
.../components/embedders/test_cohere_document_embedder.py | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/linting_preview.yml b/.github/workflows/linting_preview.yml index 379aff6094..2052e25fc1 100644 --- a/.github/workflows/linting_preview.yml +++ b/.github/workflows/linting_preview.yml @@ -73,7 +73,6 @@ jobs: run: | pip install .[dev,preview,audio] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' cohere pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper - pip install ./haystack-linter - name: Pylint diff --git a/.github/workflows/tests_preview.yml b/.github/workflows/tests_preview.yml index 19fced9261..fad6e7d7d6 100644 --- a/.github/workflows/tests_preview.yml +++ b/.github/workflows/tests_preview.yml @@ -180,7 +180,7 @@ jobs: run: | pip install .[dev,preview,audio] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' cohere pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper - + - name: Run run: pytest --maxfail=5 -m "integration" test/preview @@ -292,7 +292,6 @@ jobs: pip install .[dev,preview,audio] langdetect transformers[torch,sentencepiece]==4.34.1 'sentence-transformers>=2.2.0' cohere pypdf markdown-it-py mdit_plain tika 'azure-ai-formrecognizer>=3.2.0b2' pip install --no-deps llvmlite numba 'openai-whisper>=20230918' # prevent outdated version of tiktoken pinned by openai-whisper - - name: Run run: pytest --maxfail=5 -m "integration" test/preview -k 'not tika' diff --git a/test/preview/components/embedders/test_cohere_document_embedder.py b/test/preview/components/embedders/test_cohere_document_embedder.py index 3457db364a..3dda63776e 100644 --- 
a/test/preview/components/embedders/test_cohere_document_embedder.py +++ b/test/preview/components/embedders/test_cohere_document_embedder.py @@ -139,8 +139,8 @@ def test_run(self): embedder.run = lambda x, **kwargs: np.random.rand(len(x), 2).tolist() docs = [ - Document(text="I love cheese", metadata={"topic": "Cuisine"}), - Document(text="A transformer is a deep learning architecture", metadata={"topic": "ML"}), + Document(content="I love cheese", meta={"topic": "Cuisine"}), + Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), ] result = embedder.run(docs) From d9128f17af365243eb5efe348f0976f0719e1cd8 Mon Sep 17 00:00:00 2001 From: vrunm <97465624+vrunm@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:21:26 +0530 Subject: [PATCH 14/14] Fix conflicts --- haystack/preview/components/embedders/__init__.py | 4 ++++ .../preview/components/embedders/cohere_document_embedder.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/haystack/preview/components/embedders/__init__.py b/haystack/preview/components/embedders/__init__.py index a0840d7e0a..bb5e4b9619 100644 --- a/haystack/preview/components/embedders/__init__.py +++ b/haystack/preview/components/embedders/__init__.py @@ -1,3 +1,5 @@ +from haystack.preview.components.embedders.cohere_text_embedder import CohereTextEmbedder +from haystack.preview.components.embedders.cohere_document_embedder import CohereDocumentEmbedder from haystack.preview.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder from haystack.preview.components.embedders.sentence_transformers_document_embedder import ( SentenceTransformersDocumentEmbedder, @@ -6,6 +8,8 @@ from haystack.preview.components.embedders.openai_text_embedder import OpenAITextEmbedder __all__ = [ + "CohereTextEmbedder", + "CohereDocumentEmbedder", "SentenceTransformersTextEmbedder", "SentenceTransformersDocumentEmbedder", "OpenAITextEmbedder", diff --git 
a/haystack/preview/components/embedders/cohere_document_embedder.py b/haystack/preview/components/embedders/cohere_document_embedder.py index fcfac89ed1..34b52ccffd 100644 --- a/haystack/preview/components/embedders/cohere_document_embedder.py +++ b/haystack/preview/components/embedders/cohere_document_embedder.py @@ -113,10 +113,10 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]: texts_to_embed = [] for doc in documents: meta_values_to_embed = [ - str(doc.metadata[key]) for key in self.metadata_fields_to_embed if doc.metadata.get(key) is not None + str(doc.meta[key]) for key in self.metadata_fields_to_embed if doc.meta.get(key) is not None ] - text_to_embed = self.embedding_separator.join(meta_values_to_embed + [doc.text or ""]) + text_to_embed = self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) texts_to_embed.append(text_to_embed) return texts_to_embed