Skip to content

Commit

Permalink
feat: check document store and retriever dimensions before calculatin…
Browse files Browse the repository at this point in the history
…g embeddings for all documents (#7357)

* Verify that the embedding dimensions of the document store and the retriever are equal

* improve release note

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
AnushreeBannadabhavi and anakin87 authored Mar 14, 2024
1 parent 936e293 commit 553badc
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 0 deletions.
14 changes: 14 additions & 0 deletions haystack/document_stores/faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,18 @@ def write_documents(
def _create_document_field_map(self) -> Dict:
    """
    Build the field map for this document store.

    :return: Dictionary with a single entry whose key is `self.index` and whose value is `self.embedding_field`.
    """
    return {self.index: self.embedding_field}

def _validate_embedding_dimension(self, retriever: DenseRetriever, index: Optional[str] = None):
    """
    Verify that the embedding dimension set in the document store and the embedding dimension of the retriever
    are the same.

    A single document is embedded with the retriever and the shape of the result is checked against
    `self.embedding_dim` by `self._validate_embeddings_shape`. This check is done before calculating embeddings
    for all documents, so a misconfigured retriever is detected early.

    :param retriever: Retriever to use to get embeddings for text
    :param index: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
    :return: None
    """
    # Pull one document lazily instead of materializing the whole index just to read its first element.
    # The stored embedding is not needed, since the document is only used as input for the retriever.
    first_document = next(
        iter(self.get_all_documents_generator(index=index, return_embedding=False, batch_size=1)), None
    )
    if first_document is None:
        # Empty index: there is nothing to embed, hence no dimension to compare.
        return

    embeddings = retriever.embed_documents([first_document])
    self._validate_embeddings_shape(embeddings=embeddings, num_documents=1, embedding_dim=self.embedding_dim)

def update_embeddings(
self,
retriever: DenseRetriever,
Expand Down Expand Up @@ -373,6 +385,8 @@ def update_embeddings(
logger.warning("Calling DocumentStore.update_embeddings() on an empty index")
return

self._validate_embedding_dimension(retriever, index)

logger.info("Updating embeddings for %s docs...", document_count)
vector_id = self.faiss_indexes[index].ntotal

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Add a check to verify that the embedding dimensions of the FAISS Document Store and the retriever are equal before running embedding calculations.
15 changes: 15 additions & 0 deletions test/document_stores/test_faiss.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,21 @@ def test_index_mutual_exclusive_args(self, tmp_path):
isolation_level="AUTOCOMMIT",
)

@pytest.mark.unit
def test_validate_embedding_dimension_unequal_embedding_dim(self, ds, documents):
    """
    Validation must raise a RuntimeError when the retriever is built with embedding_dim=384,
    which is expected to differ from the embedding dimension of the `ds` fixture.
    """
    mismatched_retriever = MockDenseRetriever(document_store=ds, embedding_dim=384)

    ds.write_documents(documents)
    expected_count = len(documents)
    assert ds.get_document_count() == expected_count

    with pytest.raises(RuntimeError):
        ds._validate_embedding_dimension(mismatched_retriever)

@pytest.mark.unit
def test_validate_embedding_dimension_equal_embedding_dim(self, ds, documents):
    """
    Validation must complete without raising when the retriever is built with embedding_dim=768,
    which is expected to match the embedding dimension of the `ds` fixture.
    """
    matching_retriever = MockDenseRetriever(document_store=ds, embedding_dim=768)

    ds.write_documents(documents)
    expected_count = len(documents)
    assert ds.get_document_count() == expected_count

    # No assertion on a return value: the check passes simply by not raising.
    ds._validate_embedding_dimension(matching_retriever)

@pytest.mark.integration
def test_delete_index(self, ds, documents):
"""Contrary to other Document Stores, FAISSDocumentStore doesn't raise if the index is empty"""
Expand Down

0 comments on commit 553badc

Please sign in to comment.