Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pinecone: change dummy vector #6932

Merged
merged 2 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 7 additions & 9 deletions haystack/document_stores/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@ def __init__(
# Initialize dictionary to store temporary set of document IDs
self.all_ids: dict = {}

# Dummy query to be used during searches
self.dummy_query = [0.0] * self.embedding_dim
# Dummy vector to be used during searches and as a placeholder for documents without embeddings
self.dummy_vector = [-10.0] * self.embedding_dim

if pinecone_index:
if not isinstance(pinecone_index, pinecone.Index):
Expand Down Expand Up @@ -384,9 +384,9 @@ def _get_vector_count(
return namespaces[namespace]["vector_count"] if namespace in namespaces else 0

# Due to missing support for metadata filtering in `describe_index_stats()` method for `gcp-starter`,
# use dummy query for getting vector count
# use dummy query vector for getting vector count
res = self.pinecone_indexes[index].query(
self.dummy_query,
self.dummy_vector,
top_k=self.top_k_limit,
include_values=False,
include_metadata=False,
Expand Down Expand Up @@ -684,9 +684,7 @@ def write_documents(
embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index]
else:
# Use dummy embeddings for all documents
embeddings_to_index = np.zeros((len(document_chunk), self.embedding_dim), dtype="float32")
# Convert embeddings to list objects
embeddings = [embed.tolist() if embed is not None else None for embed in embeddings_to_index]
embeddings = [self.dummy_vector] * len(document_chunk)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anakin87 This is the only diff (aside from -10.0) right? Everything else is the same, no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes!


data_to_write_to_pinecone = list(zip(ids, embeddings, metadata))
# Store chunk by chunk (for regular upsert) or chunk by chunk (for async upsert) in vector store
Expand Down Expand Up @@ -1582,7 +1580,7 @@ def _get_ids(
# Retrieve embeddings from Pinecone
try:
res = self.pinecone_indexes[index].query(
self.dummy_query,
self.dummy_vector,
top_k=batch_size,
include_values=False,
include_metadata=False,
Expand Down Expand Up @@ -1830,7 +1828,7 @@ def delete_labels(
self._index_connection_exists(index)

i = 0
dummy_query = np.asarray(self.dummy_query)
dummy_query = np.asarray(self.dummy_vector)

type_metadata = LABEL

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
fixes:
- |
Change the dummy vector used internally in the Pinecone Document Store.
A recent change to the Pinecone API no longer allows vectors filled with zeros,
which is what the previous dummy vector was.
Loading