Skip to content

Commit

Permalink
Pinecone - review docstrings and API reference (#503)
Browse files Browse the repository at this point in the history
* pinecone - review docstrings and API reference

* rerun CI

* Update integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py

Co-authored-by: Madeesh Kannan <[email protected]>

* Update integrations/pinecone/src/haystack_integrations/components/retrievers/pinecone/embedding_retriever.py

Co-authored-by: Madeesh Kannan <[email protected]>

---------

Co-authored-by: Madeesh Kannan <[email protected]>
  • Loading branch information
anakin87 and shadeMe authored Feb 29, 2024
1 parent 6d1dd7f commit 3ddb10a
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 39 deletions.
4 changes: 1 addition & 3 deletions integrations/pinecone/pydoc/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ loaders:
modules:
[
"haystack_integrations.components.retrievers.pinecone.embedding_retriever",
"haystack_integrations.document_stores.pinecone.document_store",
"haystack_integrations.document_stores.pinecone.errors",
"haystack_integrations.document_stores.pinecone.filters",
"haystack_integrations.document_stores.pinecone.document_store"
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,41 @@
@component
class PineconeEmbeddingRetriever:
"""
Retrieves documents from the PineconeDocumentStore, based on their dense embeddings.
Retrieves documents from the `PineconeDocumentStore`, based on their dense embeddings.
Needs to be connected to the PineconeDocumentStore.
Usage example:
```python
import os
from haystack.document_stores.types import DuplicatePolicy
from haystack import Document
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack_integrations.components.retrievers.pinecone import PineconeEmbeddingRetriever
from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY"
document_store = PineconeDocumentStore(index="my_index", namespace="my_namespace", dimension=768)
documents = [Document(content="There are over 7,000 languages spoken around the world today."),
Document(content="Elephants have been observed to behave in a way that indicates..."),
Document(content="In certain places, you can witness the phenomenon of bioluminescent waves.")]
document_embedder = SentenceTransformersDocumentEmbedder()
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)
document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", PineconeEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query = "How many languages are there?"
res = query_pipeline.run({"text_embedder": {"text": query}})
assert res['retriever']['documents'][0].content == "There are over 7,000 languages spoken around the world today."
```
"""

def __init__(
Expand All @@ -25,13 +57,11 @@ def __init__(
top_k: int = 10,
):
"""
Create the PineconeEmbeddingRetriever component.
:param document_store: An instance of PineconeDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10.
:param document_store: The Pinecone Document Store.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:raises ValueError: If `document_store` is not an instance of PineconeDocumentStore.
:raises ValueError: If `document_store` is not an instance of `PineconeDocumentStore`.
"""
if not isinstance(document_store, PineconeDocumentStore):
msg = "document_store must be an instance of PineconeDocumentStore"
Expand All @@ -42,6 +72,11 @@ def __init__(
self.top_k = top_k

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
filters=self.filters,
Expand All @@ -51,6 +86,13 @@ def to_dict(self) -> Dict[str, Any]:

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
data["init_parameters"]["document_store"] = default_from_dict(
PineconeDocumentStore, data["init_parameters"]["document_store"]
)
Expand All @@ -59,10 +101,10 @@ def from_dict(cls, data: Dict[str, Any]) -> "PineconeEmbeddingRetriever":
@component.output_types(documents=List[Document])
def run(self, query_embedding: List[float]):
"""
Retrieve documents from the PineconeDocumentStore, based on their dense embeddings.
Retrieve documents from the `PineconeDocumentStore`, based on their dense embeddings.
:param query_embedding: Embedding of the query.
:return: List of Document similar to `query_embedding`.
:returns: List of Documents similar to `query_embedding`.
"""
docs = self.document_store._embedding_retrieval(
query_embedding=query_embedding,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@


class PineconeDocumentStore:
"""
A Document Store using [Pinecone vector database](https://www.pinecone.io/).
"""

def __init__(
self,
*,
Expand All @@ -42,20 +46,17 @@ def __init__(
It is meant to be connected to a Pinecone index and namespace.
:param api_key: The Pinecone API key. It can be explicitly provided or automatically read from the
environment variable PINECONE_API_KEY (recommended).
:param environment: The Pinecone environment to connect to. Defaults to "us-west1-gcp".
environment variable `PINECONE_API_KEY` (recommended).
:param environment: The Pinecone environment to connect to.
:param index: The Pinecone index to connect to. If the index does not exist, it will be created.
Defaults to "default".
:param namespace: The Pinecone namespace to connect to. If the namespace does not exist, it will be created
at the first write. Defaults to "default".
:param batch_size: The number of documents to write in a single batch. Defaults to 100, as recommended by
Pinecone.
at the first write.
:param batch_size: The number of documents to write in a single batch. When setting this parameter,
consider [documented Pinecone limits](https://docs.pinecone.io/docs/limits).
:param dimension: The dimension of the embeddings. This parameter is only used when creating a new index.
Defaults to 768.
:param index_creation_kwargs: Additional keyword arguments to pass to the index creation method.
For example, you can specify `metric`, `pods`, `replicas`...
You can find the full list of supported arguments in the
[API reference](https://docs.pinecone.io/reference/create_index-1).
[API reference](https://docs.pinecone.io/reference/create_index).
"""
resolved_api_key = api_key.resolve_value()
Expand Down Expand Up @@ -95,10 +96,22 @@ def __init__(

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "PineconeDocumentStore":
"""
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["api_key"])
return default_from_dict(cls, data)

def to_dict(self) -> Dict[str, Any]:
"""
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
return default_to_dict(
self,
api_key=self.api_key.to_dict(),
Expand Down Expand Up @@ -128,7 +141,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
:param policy: The duplicate policy to use when writing documents.
PineconeDocumentStore only supports `DuplicatePolicy.OVERWRITE`.
:return: The number of documents written to the document store.
:returns: The number of documents written to the document store.
"""
if len(documents) > 0 and not isinstance(documents[0], Document):
msg = "param 'documents' must contain a list of objects of type Document"
Expand Down Expand Up @@ -157,7 +170,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
refer to the [documentation](https://docs.haystack.deepset.ai/v2.0/docs/metadata-filtering)
:param filters: The filters to apply to the document list.
:return: A list of Documents that match the given filters.
:returns: A list of Documents that match the given filters.
"""

# Pinecone only performs vector similarity search
Expand All @@ -178,7 +191,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc

def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with a matching document_ids from the document store.
Deletes documents that match the provided `document_ids` from the document store.
:param document_ids: the document ids to delete
"""
Expand All @@ -197,14 +210,14 @@ def _embedding_retrieval(
This method is not meant to be part of the public interface of
`PineconeDocumentStore` nor called directly.
`PineconeDenseRetriever` uses this method directly and is the public interface for it.
`PineconeEmbeddingRetriever` uses this method directly and is the public interface for it.
:param query_embedding: Embedding of the query.
:param namespace: Pinecone namespace to query. Defaults to the namespace of the document store.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param top_k: Maximum number of Documents to return, defaults to 10
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:return: List of Document that are most similar to `query_embedding`
:returns: List of Documents that are most similar to `query_embedding`
"""

if not query_embedding:
Expand Down

This file was deleted.

0 comments on commit 3ddb10a

Please sign in to comment.