change metadata to meta (#152)
anakin87 authored Dec 28, 2023
1 parent 34ada7a commit 21bc1c8
Showing 13 changed files with 68 additions and 72 deletions.
@@ -45,7 +45,7 @@ def __init__(
timeout: int = 120,
batch_size: int = 32,
progress_bar: bool = True,
- metadata_fields_to_embed: Optional[List[str]] = None,
+ meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
"""
@@ -74,7 +74,7 @@ def __init__(
:param batch_size: Number of Documents to encode at once.
:param progress_bar: Whether to show a progress bar or not. Can be helpful to disable in production deployments
to keep the logs clean.
- :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document text.
+ :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document text.
:param embedding_separator: Separator used to concatenate the meta fields to the Document text.
"""

@@ -98,7 +98,7 @@ def __init__(
self.timeout = timeout
self.batch_size = batch_size
self.progress_bar = progress_bar
- self.metadata_fields_to_embed = metadata_fields_to_embed or []
+ self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator

def to_dict(self) -> Dict[str, Any]:
@@ -116,7 +116,7 @@ def to_dict(self) -> Dict[str, Any]:
timeout=self.timeout,
batch_size=self.batch_size,
progress_bar=self.progress_bar,
- metadata_fields_to_embed=self.metadata_fields_to_embed,
+ meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
)

@@ -127,14 +127,14 @@ def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
texts_to_embed: List[str] = []
for doc in documents:
meta_values_to_embed = [
- str(doc.meta[key]) for key in self.metadata_fields_to_embed if doc.meta.get(key) is not None
+ str(doc.meta[key]) for key in self.meta_fields_to_embed if doc.meta.get(key) is not None
]

text_to_embed = self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) # noqa: RUF005
texts_to_embed.append(text_to_embed)
return texts_to_embed

- @component.output_types(documents=List[Document], metadata=Dict[str, Any])
+ @component.output_types(documents=List[Document], meta=Dict[str, Any])
def run(self, documents: List[Document]):
"""
Embed a list of Documents.
@@ -152,7 +152,7 @@ def run(self, documents: List[Document]):

if not documents:
# return early if we were passed an empty list
- return {"documents": [], "metadata": {}}
+ return {"documents": [], "meta": {}}

texts_to_embed = self._prepare_texts_to_embed(documents)

@@ -180,4 +180,4 @@ def run(self, documents: List[Document]):
for doc, embeddings in zip(documents, all_embeddings):
doc.embedding = embeddings

- return {"documents": documents, "metadata": metadata}
+ return {"documents": documents, "meta": metadata}
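For orientation, here is a standalone sketch of the concatenation logic that the renamed meta_fields_to_embed parameter controls, mirroring the hunk above; the Document import assumes haystack-ai 2.x and is not part of this diff.

```python
from typing import List

from haystack import Document  # assumed haystack-ai 2.x import, not shown in this diff


def prepare_texts_to_embed(
    documents: List[Document],
    meta_fields_to_embed: List[str],
    embedding_separator: str = "\n",
) -> List[str]:
    """Prepend the selected meta values to each Document's content, as in the hunk above."""
    texts_to_embed: List[str] = []
    for doc in documents:
        meta_values_to_embed = [
            str(doc.meta[key]) for key in meta_fields_to_embed if doc.meta.get(key) is not None
        ]
        texts_to_embed.append(embedding_separator.join(meta_values_to_embed + [doc.content or ""]))
    return texts_to_embed


docs = [Document(content="Nikola Tesla was an inventor.", meta={"title": "Tesla"})]
print(prepare_texts_to_embed(docs, meta_fields_to_embed=["title"]))
# ['Tesla\nNikola Tesla was an inventor.']
```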
@@ -27,7 +27,7 @@ class CohereTextEmbedder:
print(text_embedder.run(text_to_embed))
# {'embedding': [-0.453125, 1.2236328, 2.0058594, ...]
- # 'metadata': {'api_version': {'version': '1'}, 'billed_units': {'input_tokens': 4}}}
+ # 'meta': {'api_version': {'version': '1'}, 'billed_units': {'input_tokens': 4}}}
```
"""

@@ -101,7 +101,7 @@ def to_dict(self) -> Dict[str, Any]:
timeout=self.timeout,
)

- @component.output_types(embedding=List[float], metadata=Dict[str, Any])
+ @component.output_types(embedding=List[float], meta=Dict[str, Any])
def run(self, text: str):
"""Embed a string."""
if not isinstance(text, str):
@@ -126,4 +126,4 @@ def run(self, text: str):
)
embedding, metadata = get_response(cohere_client, [text], self.model_name, self.input_type, self.truncate)

- return {"embedding": embedding[0], "metadata": metadata}
+ return {"embedding": embedding[0], "meta": metadata}
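Downstream code now reads the billing information from the meta key instead of metadata. A minimal usage sketch; the module path and constructor argument are assumptions not shown in this diff, and running it requires a valid Cohere API key.

```python
import os

from cohere_haystack.embedders.text_embedder import CohereTextEmbedder  # assumed module path

# Constructor argument assumed to mirror the document embedder shown above.
text_embedder = CohereTextEmbedder(api_key=os.environ["COHERE_API_KEY"])

result = text_embedder.run("Pizza is made with dough and toppings")
print(result["embedding"][:3])
print(result["meta"])  # e.g. {'api_version': {...}, 'billed_units': {'input_tokens': ...}}
```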
6 changes: 3 additions & 3 deletions integrations/cohere/src/cohere_haystack/generator.py
@@ -135,7 +135,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "CohereGenerator":
data["init_parameters"]["streaming_callback"] = streaming_callback
return default_from_dict(cls, data)

- @component.output_types(replies=List[str], metadata=List[Dict[str, Any]])
+ @component.output_types(replies=List[str], meta=List[Dict[str, Any]])
def run(self, prompt: str):
"""
Queries the LLM with the prompts to produce replies.
@@ -153,12 +153,12 @@ def run(self, prompt: str):
metadata_dict["finish_reason"] = response.finish_reason
metadata = [metadata_dict]
self._check_truncated_answers(metadata)
- return {"replies": replies, "metadata": metadata}
+ return {"replies": replies, "meta": metadata}

metadata = [{"finish_reason": resp.finish_reason} for resp in cast(Generations, response)]
replies = [resp.text for resp in response]
self._check_truncated_answers(metadata)
- return {"replies": replies, "metadata": metadata}
+ return {"replies": replies, "meta": metadata}

def _check_truncated_answers(self, metadata: List[Dict[str, Any]]):
"""
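A minimal usage sketch of the renamed output key; the import path follows the file path shown above (integrations/cohere/src/cohere_haystack/generator.py), and running it requires a valid Cohere API key.

```python
import os

from cohere_haystack.generator import CohereGenerator

generator = CohereGenerator(api_key=os.environ["COHERE_API_KEY"])
result = generator.run(prompt="What's the capital of France?")

print(result["replies"][0])                # e.g. "Paris ..."
print(result["meta"][0]["finish_reason"])  # e.g. "COMPLETE"; read from "metadata" before this change
```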
12 changes: 6 additions & 6 deletions integrations/cohere/tests/test_cohere_generators.py
@@ -119,8 +119,8 @@ def test_from_dict(self, monkeypatch):

def test_check_truncated_answers(self, caplog):
component = CohereGenerator(api_key="test-api-key")
- metadata = [{"finish_reason": "MAX_TOKENS"}]
- component._check_truncated_answers(metadata)
+ meta = [{"finish_reason": "MAX_TOKENS"}]
+ component._check_truncated_answers(meta)
assert caplog.records[0].message == (
"Responses have been truncated before reaching a natural stopping point. "
"Increase the max_tokens parameter to allow for longer completions."
@@ -136,8 +136,8 @@ def test_cohere_generator_run(self):
results = component.run(prompt="What's the capital of France?")
assert len(results["replies"]) == 1
assert "Paris" in results["replies"][0]
- assert len(results["metadata"]) == 1
- assert results["metadata"][0]["finish_reason"] == "COMPLETE"
+ assert len(results["meta"]) == 1
+ assert results["meta"][0]["finish_reason"] == "COMPLETE"

@pytest.mark.skipif(
not os.environ.get("COHERE_API_KEY", None),
@@ -174,6 +174,6 @@ def __call__(self, chunk):

assert len(results["replies"]) == 1
assert "Paris" in results["replies"][0]
- assert len(results["metadata"]) == 1
- assert results["metadata"][0]["finish_reason"] == "COMPLETE"
+ assert len(results["meta"]) == 1
+ assert results["meta"][0]["finish_reason"] == "COMPLETE"
assert callback.responses == results["replies"][0]
14 changes: 7 additions & 7 deletions integrations/cohere/tests/test_document_embedder.py
@@ -25,7 +25,7 @@ def test_init_default(self):
assert embedder.timeout == 120
assert embedder.batch_size == 32
assert embedder.progress_bar is True
- assert embedder.metadata_fields_to_embed == []
+ assert embedder.meta_fields_to_embed == []
assert embedder.embedding_separator == "\n"

def test_init_with_parameters(self):
@@ -40,7 +40,7 @@ def test_init_with_parameters(self):
timeout=60,
batch_size=64,
progress_bar=False,
- metadata_fields_to_embed=["test_field"],
+ meta_fields_to_embed=["test_field"],
embedding_separator="-",
)
assert embedder.api_key == "test-api-key"
@@ -53,7 +53,7 @@ def test_init_with_parameters(self):
assert embedder.timeout == 60
assert embedder.batch_size == 64
assert embedder.progress_bar is False
- assert embedder.metadata_fields_to_embed == ["test_field"]
+ assert embedder.meta_fields_to_embed == ["test_field"]
assert embedder.embedding_separator == "-"

def test_to_dict(self):
@@ -71,7 +71,7 @@ def test_to_dict(self):
"timeout": 120,
"batch_size": 32,
"progress_bar": True,
"metadata_fields_to_embed": [],
"meta_fields_to_embed": [],
"embedding_separator": "\n",
},
}
@@ -88,7 +88,7 @@ def test_to_dict_with_custom_init_parameters(self):
timeout=60,
batch_size=64,
progress_bar=False,
- metadata_fields_to_embed=["text_field"],
+ meta_fields_to_embed=["text_field"],
embedding_separator="-",
)
component_dict = embedder_component.to_dict()
@@ -104,7 +104,7 @@ def test_to_dict_with_custom_init_parameters(self):
"timeout": 60,
"batch_size": 64,
"progress_bar": False,
"metadata_fields_to_embed": ["text_field"],
"meta_fields_to_embed": ["text_field"],
"embedding_separator": "-",
},
}
@@ -139,4 +139,4 @@ def test_run_wrong_input_format(self):
with pytest.raises(TypeError, match="CohereDocumentEmbedder expects a list of Documents as input"):
embedder.run(documents=[1, 2, 3])

- assert embedder.run(documents=[]) == {"documents": [], "metadata": {}}
+ assert embedder.run(documents=[]) == {"documents": [], "meta": {}}
@@ -51,5 +51,5 @@ def from_dict(cls, data: Dict[str, Any]) -> "VertexAIImageGenerator":
def run(self, prompt: str, negative_prompt: Optional[str] = None):
negative_prompt = negative_prompt or self._kwargs.get("negative_prompt")
res = self._model.generate_images(prompt=prompt, negative_prompt=negative_prompt, **self._kwargs)
- images = [ByteStream(data=i._image_bytes, metadata=i.generation_parameters) for i in res.images]
+ images = [ByteStream(data=i._image_bytes, meta=i.generation_parameters) for i in res.images]
return {"images": images}
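The same rename applies when constructing a ByteStream. A minimal sketch, assuming the haystack-ai 2.x dataclass; the bytes and generation parameters below are placeholders for what generate_images returns.

```python
from haystack.dataclasses import ByteStream  # assumed haystack-ai 2.x import

# Placeholder payload standing in for one generated image and its generation parameters.
image = ByteStream(data=b"\x89PNG\r\n\x1a\n...", meta={"prompt": "a red bicycle", "seed": 42})

print(image.meta["prompt"])  # generation parameters are now exposed under `meta`, not `metadata`
```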
2 changes: 1 addition & 1 deletion integrations/gradient/tests/test_gradient_rag_pipelines.py
@@ -90,4 +90,4 @@ def test_gradient_embedding_retrieval_rag_pipeline(tmp_path):
assert spyword in generated_answer.data
assert generated_answer.query == question
assert hasattr(generated_answer, "documents")
- assert hasattr(generated_answer, "metadata")
+ assert hasattr(generated_answer, "meta")
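For reference, a minimal sketch of the answer object the assertion above inspects, assuming the haystack-ai 2.x GeneratedAnswer dataclass, where the field is now meta rather than metadata.

```python
from haystack.dataclasses import GeneratedAnswer  # assumed haystack-ai 2.x import

answer = GeneratedAnswer(
    data="Paris",
    query="What is the capital of France?",
    documents=[],
    meta={},  # was `metadata` before this rename
)

assert hasattr(answer, "meta")
print(answer.data)
```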
@@ -67,7 +67,7 @@ def __init__(
batch_size: int = 32,
progress_bar: bool = True,
normalize_embeddings: bool = False,
- metadata_fields_to_embed: Optional[List[str]] = None,
+ meta_fields_to_embed: Optional[List[str]] = None,
embedding_separator: str = "\n",
):
"""
@@ -91,7 +91,7 @@ def __init__(
:param batch_size: Number of strings to encode at once.
:param progress_bar: If true, displays progress bar during embedding.
:param normalize_embeddings: If set to true, returned vectors will have the length of 1.
- :param metadata_fields_to_embed: List of meta fields that should be embedded along with the Document content.
+ :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
"""

@@ -103,7 +103,7 @@ def __init__(
self.batch_size = batch_size
self.progress_bar = progress_bar
self.normalize_embeddings = normalize_embeddings
- self.metadata_fields_to_embed = metadata_fields_to_embed or []
+ self.meta_fields_to_embed = meta_fields_to_embed or []
self.embedding_separator = embedding_separator

def to_dict(self) -> Dict[str, Any]:
@@ -119,7 +119,7 @@ def to_dict(self) -> Dict[str, Any]:
batch_size=self.batch_size,
progress_bar=self.progress_bar,
normalize_embeddings=self.normalize_embeddings,
- metadata_fields_to_embed=self.metadata_fields_to_embed,
+ meta_fields_to_embed=self.meta_fields_to_embed,
embedding_separator=self.embedding_separator,
)

@@ -160,9 +160,7 @@ def run(self, documents: List[Document]):
texts_to_embed = []
for doc in documents:
meta_values_to_embed = [
- str(doc.meta[key])
- for key in self.metadata_fields_to_embed
- if key in doc.meta and doc.meta[key] is not None
+ str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
]
text_to_embed = [
self.instruction,
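A hedged usage sketch of the renamed parameter. The import path is taken from the patch target in the tests below, warm_up() is assumed to load the model as in other Haystack embedders, and actually running this downloads the INSTRUCTOR model.

```python
from haystack import Document  # assumed haystack-ai 2.x import

from instructor_embedders_haystack.instructor_document_embedder import InstructorDocumentEmbedder

embedder = InstructorDocumentEmbedder(
    model_name_or_path="hkunlp/instructor-base",
    instruction="Represent the financial document for retrieval",
    meta_fields_to_embed=["company"],  # renamed from metadata_fields_to_embed
    embedding_separator="\n",
)
embedder.warm_up()  # assumed to load the model before run(), following the usual embedder pattern

docs = [Document(content="Q3 revenue grew 12% year over year.", meta={"company": "ACME"})]
result = embedder.run(documents=docs)
print(len(result["documents"][0].embedding))
```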
@@ -20,7 +20,7 @@ def test_init_default(self):
assert embedder.batch_size == 32
assert embedder.progress_bar is True
assert embedder.normalize_embeddings is False
- assert embedder.metadata_fields_to_embed == []
+ assert embedder.meta_fields_to_embed == []
assert embedder.embedding_separator == "\n"

def test_init_with_parameters(self):
@@ -35,7 +35,7 @@ def test_init_with_parameters(self):
batch_size=64,
progress_bar=False,
normalize_embeddings=True,
- metadata_fields_to_embed=["test_field"],
+ meta_fields_to_embed=["test_field"],
embedding_separator=" | ",
)
assert embedder.model_name_or_path == "hkunlp/instructor-base"
@@ -45,7 +45,7 @@ def test_init_with_parameters(self):
assert embedder.batch_size == 64
assert embedder.progress_bar is False
assert embedder.normalize_embeddings is True
- assert embedder.metadata_fields_to_embed == ["test_field"]
+ assert embedder.meta_fields_to_embed == ["test_field"]
assert embedder.embedding_separator == " | "

def test_to_dict(self):
@@ -65,7 +65,7 @@ def test_to_dict(self):
"progress_bar": True,
"normalize_embeddings": False,
"embedding_separator": "\n",
"metadata_fields_to_embed": [],
"meta_fields_to_embed": [],
},
}

@@ -81,7 +81,7 @@ def test_to_dict_with_custom_init_parameters(self):
batch_size=64,
progress_bar=False,
normalize_embeddings=True,
- metadata_fields_to_embed=["test_field"],
+ meta_fields_to_embed=["test_field"],
embedding_separator=" | ",
)
embedder_dict = embedder.to_dict()
@@ -95,7 +95,7 @@ def test_to_dict_with_custom_init_parameters(self):
"batch_size": 64,
"progress_bar": False,
"normalize_embeddings": True,
"metadata_fields_to_embed": ["test_field"],
"meta_fields_to_embed": ["test_field"],
"embedding_separator": " | ",
},
}
@@ -114,7 +114,7 @@ def test_from_dict(self):
"batch_size": 32,
"progress_bar": True,
"normalize_embeddings": False,
"metadata_fields_to_embed": [],
"meta_fields_to_embed": [],
"embedding_separator": "\n",
},
}
@@ -126,7 +126,7 @@ def test_from_dict(self):
assert embedder.batch_size == 32
assert embedder.progress_bar is True
assert embedder.normalize_embeddings is False
- assert embedder.metadata_fields_to_embed == []
+ assert embedder.meta_fields_to_embed == []
assert embedder.embedding_separator == "\n"

def test_from_dict_with_custom_init_parameters(self):
@@ -143,7 +143,7 @@ def test_from_dict_with_custom_init_parameters(self):
"batch_size": 64,
"progress_bar": False,
"normalize_embeddings": True,
"metadata_fields_to_embed": ["test_field"],
"meta_fields_to_embed": ["test_field"],
"embedding_separator": " | ",
},
}
@@ -155,7 +155,7 @@ def test_from_dict_with_custom_init_parameters(self):
assert embedder.batch_size == 64
assert embedder.progress_bar is False
assert embedder.normalize_embeddings is True
- assert embedder.metadata_fields_to_embed == ["test_field"]
+ assert embedder.meta_fields_to_embed == ["test_field"]
assert embedder.embedding_separator == " | "

@patch("instructor_embedders_haystack.instructor_document_embedder._InstructorEmbeddingBackendFactory")
@@ -223,7 +223,7 @@ def test_embed_metadata(self):
embedder = InstructorDocumentEmbedder(
model_name_or_path="model",
instruction="Represent the financial document for retrieval",
- metadata_fields_to_embed=["meta_field"],
+ meta_fields_to_embed=["meta_field"],
embedding_separator="\n",
)
embedder.embedding_backend = MagicMock()