refactor: Change Document.embedding type to list of floats (#6135)
* Change Document.embedding type

* Add release notes

* Fix document_store testing

* Fix pylint

* Fix tests
silvanocerza authored Oct 23, 2023
1 parent 8f28928 commit c8d162c
Showing 6 changed files with 70 additions and 74 deletions.
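
In practical terms, the preview `Document` dataclass now stores embeddings as plain Python lists of floats instead of `numpy.ndarray` objects. A minimal sketch of the new usage follows; the import path is an assumption based on the file touched in this commit and is not part of the diff itself.

```python
# Hedged sketch of the new embedding type; the import path below is assumed
# from the module changed in this commit and may differ in your install.
from haystack.preview.dataclasses.document import Document

import numpy as np

# An embedding produced as a numpy array can be converted with .tolist().
vector = np.random.rand(4).astype(np.float32)

doc = Document(text="Hello world", embedding=vector.tolist())

assert isinstance(doc.embedding, list)
assert all(isinstance(value, float) for value in doc.embedding)
```
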
8 changes: 3 additions & 5 deletions haystack/preview/dataclasses/document.py
@@ -3,7 +3,7 @@
import logging
from dataclasses import asdict, dataclass, field, fields
from pathlib import Path
- from typing import Any, Dict, Optional, Type
+ from typing import Any, Dict, List, Optional, Type

import numpy
import pandas
@@ -42,8 +42,6 @@ def document_decoder(self, dictionary):
dictionary["array"] = numpy.array(dictionary.get("array"))
if "dataframe" in dictionary and dictionary.get("dataframe"):
dictionary["dataframe"] = pandas.read_json(dictionary.get("dataframe", None))
if "embedding" in dictionary and dictionary.get("embedding"):
dictionary["embedding"] = numpy.array(dictionary.get("embedding"))

return dictionary

@@ -75,7 +73,7 @@ class Document:
mime_type: str = field(default="text/plain")
metadata: Dict[str, Any] = field(default_factory=dict)
score: Optional[float] = field(default=None)
- embedding: Optional[numpy.ndarray] = field(default=None, repr=False)
+ embedding: Optional[List[float]] = field(default=None, repr=False)

def __str__(self):
fields = [f"mimetype: '{self.mime_type}'"]
@@ -120,7 +118,7 @@ def _create_id(self):
blob = self.blob or None
mime_type = self.mime_type or None
metadata = self.metadata or {}
- embedding = self.embedding.tolist() if self.embedding is not None else None
+ embedding = self.embedding if self.embedding is not None else None
data = f"{text}{array}{dataframe}{blob}{mime_type}{metadata}{embedding}"
return hashlib.sha256(data.encode("utf-8")).hexdigest()

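
Since `_create_id()` hashes the embedding as part of the document's content string, identical list embeddings still yield identical ids and a different embedding changes the id. A hedged sketch of that behaviour, assuming the same import path as above and that ids are computed automatically from content:

```python
from haystack.preview.dataclasses.document import Document  # assumed import path

doc_a = Document(text="same text", embedding=[1.0, 2.0, 3.0])
doc_b = Document(text="same text", embedding=[1.0, 2.0, 3.0])
doc_c = Document(text="same text", embedding=[9.0, 9.0, 9.0])

# The id is derived from the hashed content string, which now includes the
# list representation of the embedding directly (no .tolist() conversion).
assert doc_a.id == doc_b.id
assert doc_a.id != doc_c.id
```
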
32 changes: 15 additions & 17 deletions haystack/preview/testing/document_store.py
@@ -1,5 +1,6 @@
# pylint: disable=too-many-public-methods
from typing import List
import random

import pytest
import numpy as np
@@ -11,37 +12,41 @@
from haystack.preview.errors import FilterError


+ def _random_embeddings(n):
+ return [random.random() for _ in range(n)]


class DocumentStoreBaseTests:
@pytest.fixture
def docstore(self) -> DocumentStore:
raise NotImplementedError()

@pytest.fixture
def filterable_docs(self) -> List[Document]:
- embedding_zero = np.zeros(768).astype(np.float32)
- embedding_one = np.ones(768).astype(np.float32)
+ embedding_zero = [0.0] * 768
+ embedding_one = [1.0] * 768

documents = []
for i in range(3):
documents.append(
Document(
text=f"A Foo Document {i}",
metadata={"name": f"name_{i}", "page": "100", "chapter": "intro", "number": 2},
- embedding=np.random.rand(768).astype(np.float32),
+ embedding=_random_embeddings(768),
)
)
documents.append(
Document(
text=f"A Bar Document {i}",
metadata={"name": f"name_{i}", "page": "123", "chapter": "abstract", "number": -2},
- embedding=np.random.rand(768).astype(np.float32),
+ embedding=_random_embeddings(768),
)
)
documents.append(
Document(
text=f"A Foobar Document {i}",
metadata={"name": f"name_{i}", "page": "90", "chapter": "conclusion", "number": -10},
- embedding=np.random.rand(768).astype(np.float32),
+ embedding=_random_embeddings(768),
)
)
documents.append(
@@ -209,11 +214,9 @@ def test_eq_filter_table(self, docstore: DocumentStore, filterable_docs: List[Document]):
@pytest.mark.unit
def test_eq_filter_embedding(self, docstore: DocumentStore, filterable_docs: List[Document]):
docstore.write_documents(filterable_docs)
- embedding = np.zeros(768).astype(np.float32)
+ embedding = [0.0] * 768
result = docstore.filter_documents(filters={"embedding": embedding})
- assert self.contains_same_docs(
- result, [doc for doc in filterable_docs if np.array_equal(embedding, doc.embedding)] # type: ignore
- )
+ assert self.contains_same_docs(result, [doc for doc in filterable_docs if embedding == doc.embedding])

@pytest.mark.unit
def test_in_filter_explicit(self, docstore: DocumentStore, filterable_docs: List[Document]):
@@ -248,17 +251,12 @@ def test_in_filter_table(self, docstore: DocumentStore, filterable_docs: List[Document]):
@pytest.mark.unit
def test_in_filter_embedding(self, docstore: DocumentStore, filterable_docs: List[Document]):
docstore.write_documents(filterable_docs)
- embedding_zero = np.zeros(768, np.float32)
- embedding_one = np.ones(768, np.float32)
+ embedding_zero = [0.0] * 768
+ embedding_one = [1.0] * 768
result = docstore.filter_documents(filters={"embedding": {"$in": [embedding_zero, embedding_one]}})
assert self.contains_same_docs(
result,
- [
- doc
- for doc in filterable_docs
- if isinstance(doc.embedding, np.ndarray)
- and (np.array_equal(embedding_zero, doc.embedding) or np.array_equal(embedding_one, doc.embedding))
- ],
+ [doc for doc in filterable_docs if (embedding_zero == doc.embedding or embedding_one == doc.embedding)],
)

@pytest.mark.unit
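
The test suite replaces `numpy.array_equal` with plain list equality. A small self-contained sketch of that pattern, mirroring the fixture helper above (the Document import path is an assumption):

```python
import random

from haystack.preview.dataclasses.document import Document  # assumed import path


def _random_embeddings(n):
    # Same helper shape as the fixture above: n floats in [0.0, 1.0).
    return [random.random() for _ in range(n)]


embedding_zero = [0.0] * 768
docs = [
    Document(text="zero-vector doc", embedding=embedding_zero),
    Document(text="random-vector doc", embedding=_random_embeddings(768)),
]

# Plain == on lists replaces np.array_equal in the filter assertions.
matches = [doc for doc in docs if doc.embedding == embedding_zero]
assert len(matches) == 1
```
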
@@ -0,0 +1,4 @@
+ ---
+ preview:
+   - |
+     Change `Document`'s `embedding` field type from `numpy.ndarray` to `List[float]`
@@ -118,9 +118,9 @@ def test_valid_run(self):
top_k = 3
ds = InMemoryDocumentStore(embedding_similarity_function="cosine")
docs = [
Document(text="my document", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="another document", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="third document", embedding=np.array([0.5, 0.7, 0.5, 0.7])),
Document(text="my document", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="another document", embedding=[1.0, 1.0, 1.0, 1.0]),
Document(text="third document", embedding=[0.5, 0.7, 0.5, 0.7]),
]
ds.write_documents(docs)

@@ -142,17 +142,17 @@ def test_run_with_pipeline(self):
ds = InMemoryDocumentStore(embedding_similarity_function="cosine")
top_k = 2
docs = [
Document(text="my document", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="another document", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="third document", embedding=np.array([0.5, 0.7, 0.5, 0.7])),
Document(text="my document", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="another document", embedding=[1.0, 1.0, 1.0, 1.0]),
Document(text="third document", embedding=[0.5, 0.7, 0.5, 0.7]),
]
ds.write_documents(docs)
retriever = InMemoryEmbeddingRetriever(ds, top_k=top_k)

pipeline = Pipeline()
pipeline.add_component("retriever", retriever)
result: Dict[str, Any] = pipeline.run(
data={"retriever": {"query_embedding": np.array([0.1, 0.1, 0.1, 0.1]), "return_embedding": True}}
data={"retriever": {"query_embedding": [0.1, 0.1, 0.1, 0.1], "return_embedding": True}}
)

assert result
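
Query embeddings passed to the retriever are now plain lists as well. Below is a hedged sketch of the updated pipeline usage, mirroring the tests above; the import paths are assumptions and may differ from the actual preview package layout.

```python
# Import paths are assumptions; class names and call signatures mirror the diff above.
from haystack.preview import Pipeline
from haystack.preview.components.retrievers import InMemoryEmbeddingRetriever
from haystack.preview.dataclasses.document import Document
from haystack.preview.document_stores import InMemoryDocumentStore

ds = InMemoryDocumentStore(embedding_similarity_function="cosine")
ds.write_documents(
    [
        Document(text="my document", embedding=[0.1, 0.2, 0.3, 0.4]),
        Document(text="another document", embedding=[1.0, 1.0, 1.0, 1.0]),
    ]
)

pipeline = Pipeline()
pipeline.add_component("retriever", InMemoryEmbeddingRetriever(ds, top_k=1))

# The query embedding is a list of floats; no numpy wrapping is required.
result = pipeline.run(
    data={"retriever": {"query_embedding": [0.1, 0.1, 0.1, 0.1], "return_embedding": True}}
)
```
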
14 changes: 7 additions & 7 deletions test/preview/dataclasses/test_document.py
@@ -71,8 +71,8 @@ def __eq__(self, other):
return True

foo = TestObject()
doc1 = Document(text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path("."), "obj": foo})
doc2 = Document(text="test text", metadata={"value": np.array([0, 1, 2]), "path": Path("."), "obj": foo})
doc1 = Document(text="test text", metadata={"value": [0, 1, 2], "path": Path("."), "obj": foo})
doc2 = Document(text="test text", metadata={"value": [0, 1, 2], "path": Path("."), "obj": foo})
assert doc1 == doc2


@@ -107,7 +107,7 @@ def test_full_document_to_dict():
mime_type="application/pdf",
metadata={"some": "values", "test": 10},
score=0.99,
- embedding=np.zeros([10, 10]),
+ embedding=[10, 10],
)
dictionary = doc.to_dict()

@@ -121,7 +121,7 @@ def test_full_document_to_dict():
assert blob == doc.blob

embedding = dictionary.pop("embedding")
- assert (embedding == doc.embedding).all()
+ assert embedding == doc.embedding

assert dictionary == {
"id": doc.id,
@@ -134,7 +134,7 @@ def test_document_with_most_attributes_from_dict():

@pytest.mark.unit
def test_document_with_most_attributes_from_dict():
- embedding = np.zeros([10, 10])
+ embedding = [10, 10]
assert Document.from_dict(
{
"text": "test text",
@@ -194,7 +194,7 @@ def __repr__(self):
mime_type="application/pdf",
metadata={"some object": TestClass(), "a path": tmp_path / "test.txt"},
score=0.5,
- embedding=np.array([1, 2, 3, 4]),
+ embedding=[1, 2, 3, 4],
)
assert doc_1.to_json() == json.dumps(
{
@@ -241,7 +241,7 @@ def __eq__(self, other):
# Note the object serialization
metadata={"some object": "<the object>", "a path": str((tmp_path / "test.txt").absolute())},
score=0.5,
- embedding=np.array([1, 2, 3, 4]),
+ embedding=[1, 2, 3, 4],
)


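
As these tests exercise, a serialized `Document` now keeps its embedding as a plain list on the way back in, so ordinary list equality works where `.all()` was needed before. A hedged round-trip sketch (import path assumed):

```python
from haystack.preview.dataclasses.document import Document  # assumed import path

doc = Document(text="test text", embedding=[0.1, 0.2, 0.3, 0.4])

data = doc.to_dict()
assert data["embedding"] == [0.1, 0.2, 0.3, 0.4]

restored = Document.from_dict(data)
# No numpy conversion happens on load, so plain list equality holds.
assert isinstance(restored.embedding, list)
assert restored.embedding == doc.embedding
```
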
72 changes: 34 additions & 38 deletions test/preview/document_stores/test_in_memory.py
@@ -135,6 +135,10 @@ def test_bm25_retrieval_with_two_queries(self, docstore: DocumentStore):
results = docstore.bm25_retrieval(query="Python", top_k=1)
assert results[0].text == "Python is a popular programming language"

+ @pytest.mark.skip(reason="Filter is not working properly, see https://github.com/deepset-ai/haystack/issues/6153")
+ def test_eq_filter_embedding(self, docstore: DocumentStore, filterable_docs):
+ pass

# Test a query, add a new document and make sure results are appropriately updated
@pytest.mark.unit
def test_bm25_retrieval_with_updated_docs(self, docstore: DocumentStore):
@@ -256,12 +260,12 @@ def test_embedding_retrieval(self):
docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
# Tests if the embedding retrieval method returns the correct document based on the input query embedding.
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Haystack supports multiple languages", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages", embedding=[1.0, 1.0, 1.0, 1.0]),
]
docstore.write_documents(docs)
results = docstore.embedding_retrieval(
- query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=1, filters={}, scale_score=False
+ query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, filters={}, scale_score=False
)
assert len(results) == 1
assert results[0].text == "Haystack supports multiple languages"
@@ -280,7 +284,7 @@ def test_embedding_retrieval_no_embeddings(self, caplog):
docstore = InMemoryDocumentStore()
docs = [Document(text="Hello world"), Document(text="Haystack supports multiple languages")]
docstore.write_documents(docs)
- results = docstore.embedding_retrieval(query_embedding=np.array([0.1, 0.1, 0.1, 0.1]))
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1])
assert len(results) == 0
assert "No Documents found with embeddings. Returning empty list." in caplog.text

@@ -289,29 +293,29 @@ def test_embedding_retrieval_some_documents_wo_embeddings(self, caplog):
caplog.set_level(logging.INFO)
docstore = InMemoryDocumentStore()
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages"),
]
docstore.write_documents(docs)
- docstore.embedding_retrieval(query_embedding=np.array([0.1, 0.1, 0.1, 0.1]))
+ docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1])
assert "Skipping some Documents that don't have an embedding." in caplog.text

@pytest.mark.unit
def test_embedding_retrieval_documents_different_embedding_sizes(self):
docstore = InMemoryDocumentStore()
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages", embedding=np.array([1.0, 1.0])),
]
docstore.write_documents(docs)

with pytest.raises(DocumentStoreError, match="The embedding size of all Documents should be the same."):
- docstore.embedding_retrieval(query_embedding=np.array([0.1, 0.1, 0.1, 0.1]))
+ docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1])

@pytest.mark.unit
def test_embedding_retrieval_query_documents_different_embedding_sizes(self):
docstore = InMemoryDocumentStore()
docs = [Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4]))]
docs = [Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4])]
docstore.write_documents(docs)

with pytest.raises(
@@ -324,81 +328,73 @@ def test_embedding_retrieval_query_documents_different_embedding_sizes(self):
def test_embedding_retrieval_with_different_top_k(self):
docstore = InMemoryDocumentStore()
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Haystack supports multiple languages", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Python is a popular programming language", embedding=np.array([0.5, 0.5, 0.5, 0.5])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages", embedding=[1.0, 1.0, 1.0, 1.0]),
Document(text="Python is a popular programming language", embedding=[0.5, 0.5, 0.5, 0.5]),
]
docstore.write_documents(docs)

- results = docstore.embedding_retrieval(query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=2)
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=2)
assert len(results) == 2

- results = docstore.embedding_retrieval(query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=3)
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=3)
assert len(results) == 3

@pytest.mark.unit
def test_embedding_retrieval_with_scale_score(self):
docstore = InMemoryDocumentStore()
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Haystack supports multiple languages", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Python is a popular programming language", embedding=np.array([0.5, 0.5, 0.5, 0.5])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages", embedding=[1.0, 1.0, 1.0, 1.0]),
Document(text="Python is a popular programming language", embedding=[0.5, 0.5, 0.5, 0.5]),
]
docstore.write_documents(docs)

- results1 = docstore.embedding_retrieval(
- query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=1, scale_score=True
- )
+ results1 = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, scale_score=True)
# Confirm that score is scaled between 0 and 1
assert 0 <= results1[0].score <= 1

# Same query, different scale, scores differ when not scaled
- results = docstore.embedding_retrieval(
- query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=1, scale_score=False
- )
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, scale_score=False)
assert results[0].score != results1[0].score

@pytest.mark.unit
def test_embedding_retrieval_return_embedding(self):
docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
docs = [
Document(text="Hello world", embedding=np.array([0.1, 0.2, 0.3, 0.4])),
Document(text="Haystack supports multiple languages", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
Document(text="Haystack supports multiple languages", embedding=[1.0, 1.0, 1.0, 1.0]),
]
docstore.write_documents(docs)

- results = docstore.embedding_retrieval(
- query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=1, return_embedding=False
- )
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, return_embedding=False)
assert results[0].embedding is None

- results = docstore.embedding_retrieval(
- query_embedding=np.array([0.1, 0.1, 0.1, 0.1]), top_k=1, return_embedding=True
- )
- assert (results[0].embedding == np.array([1.0, 1.0, 1.0, 1.0])).all()
+ results = docstore.embedding_retrieval(query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, return_embedding=True)
+ assert results[0].embedding == [1.0, 1.0, 1.0, 1.0]

@pytest.mark.unit
def test_compute_cosine_similarity_scores(self):
docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
docs = [
Document(text="Document 1", embedding=np.array([1.0, 0.0, 0.0, 0.0])),
Document(text="Document 2", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Document 1", embedding=[1.0, 0.0, 0.0, 0.0]),
Document(text="Document 2", embedding=[1.0, 1.0, 1.0, 1.0]),
]

scores = docstore._compute_query_embedding_similarity_scores(
- embedding=np.array([0.1, 0.1, 0.1, 0.1]), documents=docs, scale_score=False
+ embedding=[0.1, 0.1, 0.1, 0.1], documents=docs, scale_score=False
)
assert scores == [0.5, 1.0]

@pytest.mark.unit
def test_compute_dot_product_similarity_scores(self):
docstore = InMemoryDocumentStore(embedding_similarity_function="dot_product")
docs = [
Document(text="Document 1", embedding=np.array([1.0, 0.0, 0.0, 0.0])),
Document(text="Document 2", embedding=np.array([1.0, 1.0, 1.0, 1.0])),
Document(text="Document 1", embedding=[1.0, 0.0, 0.0, 0.0]),
Document(text="Document 2", embedding=[1.0, 1.0, 1.0, 1.0]),
]

scores = docstore._compute_query_embedding_similarity_scores(
- embedding=np.array([0.1, 0.1, 0.1, 0.1]), documents=docs, scale_score=False
+ embedding=[0.1, 0.1, 0.1, 0.1], documents=docs, scale_score=False
)
assert scores == [0.1, 0.4]
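
For completeness, the same list-based embeddings work when calling the document store's retrieval method directly, as the tests above do. A hedged sketch with assumed import paths:

```python
# Import paths are assumptions; the call mirrors test_embedding_retrieval above.
from haystack.preview.dataclasses.document import Document
from haystack.preview.document_stores import InMemoryDocumentStore

docstore = InMemoryDocumentStore(embedding_similarity_function="cosine")
docstore.write_documents(
    [
        Document(text="Hello world", embedding=[0.1, 0.2, 0.3, 0.4]),
        Document(text="Haystack supports multiple languages", embedding=[1.0, 1.0, 1.0, 1.0]),
    ]
)

# With cosine similarity, the all-ones vector is the closest match to a
# uniform query vector, as asserted in the test above.
results = docstore.embedding_retrieval(
    query_embedding=[0.1, 0.1, 0.1, 0.1], top_k=1, filters={}, scale_score=False
)
assert results[0].text == "Haystack supports multiple languages"
```
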
