Skip to content

Commit

Permalink
docs: review Elastic (#541)
Browse files Browse the repository at this point in the history
* docs: review Elastic

* docs: correctly describe `DocumentStoreError`

Co-authored-by: Stefano Fiorucci <[email protected]>

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
wochinge and anakin87 authored Mar 6, 2024
1 parent 2c6b218 commit d7ad329
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ def __init__(
:param document_store: An instance of ElasticsearchDocumentStore.
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
See the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for more details.
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
see `ElasticsearchDocumentStore.filter_documents`.
:param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for more details.
:param top_k: Maximum number of Documents to return.
:param scale_score: If `True`, scales the `Document`s' scores between 0 and 1.
:raises ValueError: If `document_store` is not an instance of `ElasticsearchDocumentStore`.
"""

if not isinstance(document_store, ElasticsearchDocumentStore):
Expand Down Expand Up @@ -97,7 +97,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchBM25Retriever":
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
Deserialized component.
"""
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
Expand All @@ -109,11 +109,11 @@ def run(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: Optio
"""
Retrieve documents using the BM25 keyword-based algorithm.
:param query: String to search in Documents' text.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param query: String to search in `Document`s' text.
:param filters: Filters applied to the retrieved `Document`s.
:param top_k: Maximum number of `Document`s to return.
:returns: A dictionary with the following keys:
- `documents`: List of Documents that match the query.
- `documents`: List of `Document`s that match the query.
"""
docs = self._document_store._bm25_retrieval(
query=query,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class ElasticsearchEmbeddingRetriever:
result = retriever.run(query=query_embeddings)
for doc in result["documents"]:
print(doc.content)
print(doc.content)
```
"""

Expand All @@ -54,9 +54,9 @@ def __init__(
Create the ElasticsearchEmbeddingRetriever component.
:param document_store: An instance of ElasticsearchDocumentStore.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
:param top_k: Maximum number of Documents to return, defaults to 10
:param filters: Filters applied to the retrieved Documents.
Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
:param top_k: Maximum number of Documents to return.
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
Increasing this value will improve search accuracy at the cost of slower search speeds.
You can read more about it in the Elasticsearch
Expand Down Expand Up @@ -95,7 +95,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchEmbeddingRetriever":
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
Deserialized component.
"""
data["init_parameters"]["document_store"] = ElasticsearchDocumentStore.from_dict(
data["init_parameters"]["document_store"]
Expand All @@ -108,10 +108,10 @@ def run(self, query_embedding: List[float], filters: Optional[Dict[str, Any]] =
Retrieve documents using a vector similarity metric.
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents.
:param top_k: Maximum number of Documents to return.
:param filters: Filters applied to the retrieved `Document`s.
:param top_k: Maximum number of `Document`s to return.
:returns: A dictionary with the following keys:
- `documents`: List of Documents most similar to the given query_embedding
- `documents`: List of `Document`s most similar to the given `query_embedding`
"""
docs = self._document_store._embedding_retrieval(
query_embedding=query_embedding,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@ class ElasticsearchDocumentStore:
ElasticsearchDocumentStore is a Document Store for Elasticsearch. It can be used with Elastic Cloud or your own
Elasticsearch cluster.
Usage example with Elastic Cloud:
Usage example (Elastic Cloud):
```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(cloud_id="YOUR_CLOUD_ID", api_key="YOUR_API_KEY")
```
Usage example with a self-hosted Elasticsearch instance:
Usage example (self-hosted Elasticsearch instance):
```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")
Expand All @@ -69,8 +69,8 @@ def __init__(
):
"""
Creates a new ElasticsearchDocumentStore instance.
When no index is explicitly specified, it will use the default index "default".
It will also try to create that index if it doesn't exist yet. Otherwise it will use the existing one.
It will also try to create that index if it doesn't exist yet. Otherwise, it will use the existing one.
One can also set the similarity function used to compare Documents embeddings. This is mostly useful
when using the `ElasticsearchDocumentStore` in a Pipeline with an `ElasticsearchEmbeddingRetriever`.
Expand All @@ -81,14 +81,14 @@ def __init__(
For the full list of supported kwargs, see the official Elasticsearch
[reference](https://elasticsearch-py.readthedocs.io/en/stable/api.html#module-elasticsearch)
:param hosts: List of hosts running the Elasticsearch client. Defaults to None
:param index: Name of index in Elasticsearch, if it doesn't exist it will be created. Defaults to "default"
:param hosts: List of hosts running the Elasticsearch client.
:param index: Name of index in Elasticsearch.
:param embedding_similarity_function: The similarity function used to compare Documents embeddings.
Defaults to "cosine". This parameter only takes effect if the index does not yet exist and is created.
This parameter only takes effect if the index does not yet exist and is created.
To choose the most appropriate function, look for information about your embedding model.
To understand how document scores are computed, see the Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html#dense-vector-params)
:param **kwargs: Optional arguments that ``Elasticsearch`` takes.
:param **kwargs: Optional arguments that `Elasticsearch` takes.
"""
self._hosts = hosts
self._client = Elasticsearch(
Expand Down Expand Up @@ -140,7 +140,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ElasticsearchDocumentStore":
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
Deserialized component.
"""
return default_from_dict(cls, data)

Expand Down Expand Up @@ -186,7 +186,7 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc
:param filters: A dictionary of filters to apply. For more information on the structure of the filters,
see the official Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html)
:returns: List of Documents that match the filters.
:returns: List of `Document`s that match the filters.
"""
if filters and "operator" not in filters and "conditions" not in filters:
filters = convert(filters)
Expand All @@ -197,13 +197,14 @@ def filter_documents(self, filters: Optional[Dict[str, Any]] = None) -> List[Doc

def write_documents(self, documents: List[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
"""
Writes Documents to Elasticsearch.
If policy is not specified or set to DuplicatePolicy.NONE, it will raise an exception if a document with the
same ID already exists in the document store.
Writes `Document`s to Elasticsearch.
:param documents: List of Documents to write to the document store.
:param policy: DuplicatePolicy to apply when a document with the same ID already exists in the document store.
:raises ValueError: If `documents` is not a list of `Document`s.
:raises DuplicateDocumentError: If a document with the same ID already exists in the document store and
`policy` is set to `DuplicatePolicy.FAIL` or `DuplicatePolicy.NONE`.
:raises DocumentStoreError: If an error occurs while writing the documents to the document store.
:returns: Number of documents written to the document store.
"""
if len(documents) > 0:
Expand Down Expand Up @@ -253,13 +254,15 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D

return documents_written

def _deserialize_document(self, hit: Dict[str, Any]) -> Document:
@staticmethod
def _deserialize_document(hit: Dict[str, Any]) -> Document:
"""
Creates a Document from the search hit provided.
Creates a `Document` from the search hit provided.
This is mostly useful in self.filter_documents().
:param hit: A search hit from Elasticsearch.
:returns: Document created from the search hit.
:returns: `Document` created from the search hit.
"""
data = hit["_source"]

Expand All @@ -271,12 +274,11 @@ def _deserialize_document(self, hit: Dict[str, Any]) -> Document:

def delete_documents(self, document_ids: List[str]) -> None:
"""
Deletes all documents with a matching document_ids from the document store.
Deletes all `Document`s with matching `document_ids` from the document store.
:param document_ids: the object_ids to delete
:param document_ids: the object IDs to delete
"""

#
helpers.bulk(
client=self._client,
actions=({"_op_type": "delete", "_id": id_} for id_ in document_ids),
Expand All @@ -295,27 +297,25 @@ def _bm25_retrieval(
scale_score: bool = False,
) -> List[Document]:
"""
Elasticsearch by defaults uses BM25 search algorithm.
Retrieves `Document`s from Elasticsearch using the BM25 search algorithm.
Even though this method is called `bm25_retrieval` it searches for `query`
using the search algorithm `_client` was configured with.
This method is not mean to be part of the public interface of
This method is not meant to be part of the public interface of
`ElasticsearchDocumentStore` nor called directly.
`ElasticsearchBM25Retriever` uses this method directly and is the public interface for it.
`query` must be a non-empty string, otherwise a `ValueError` will be raised.
:param query: String to search in saved Documents' text.
:param filters: Filters applied to the retrieved Documents, for more info
see `ElasticsearchDocumentStore.filter_documents`, defaults to None
:param fuzziness: Fuzziness parameter passed to Elasticsearch, defaults to "AUTO".
see the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for valid values.
:param top_k: Maximum number of Documents to return, defaults to 10
:param scale_score: If `True` scales the Document`s scores between 0 and 1, defaults to False
:param query: String to search in saved `Document`s' text.
:param filters: Filters applied to the retrieved `Document`s, for more info
see `ElasticsearchDocumentStore.filter_documents`.
:param fuzziness: Fuzziness parameter passed to Elasticsearch. See the official
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html#fuzziness)
for valid values.
:param top_k: Maximum number of `Document`s to return.
:param scale_score: If `True`, scales the `Document`s' scores between 0 and 1.
:raises ValueError: If `query` is an empty string
:returns: List of Document that match `query`
:returns: List of `Document`s that match `query`.
"""

if not query:
Expand Down Expand Up @@ -361,22 +361,23 @@ def _embedding_retrieval(
) -> List[Document]:
"""
Retrieves documents that are most similar to the query embedding using a vector similarity metric.
It uses the Elasticsearch's Approximate k-Nearest Neighbors search algorithm.
This method is not mean to be part of the public interface of
This method is not meant to be part of the public interface of
`ElasticsearchDocumentStore` nor called directly.
`ElasticsearchEmbeddingRetriever` uses this method directly and is the public interface for it.
:param query_embedding: Embedding of the query.
:param filters: Filters applied to the retrieved Documents. Defaults to None.
:param filters: Filters applied to the retrieved `Document`s.
Filters are applied during the approximate kNN search to ensure that top_k matching documents are returned.
:param top_k: Maximum number of Documents to return, defaults to 10
:param top_k: Maximum number of `Document`s to return.
:param num_candidates: Number of approximate nearest neighbor candidates on each shard. Defaults to top_k * 10.
Increasing this value will improve search accuracy at the cost of slower search speeds.
You can read more about it in the Elasticsearch
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
:raises ValueError: If `query_embedding` is an empty list
:returns: List of Document that are most similar to `query_embedding`
[documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#tune-approximate-knn-for-speed-accuracy)
:raises ValueError: If `query_embedding` is an empty list.
:returns: List of `Document`s that are most similar to `query_embedding`.
"""

if not query_embedding:
Expand Down

0 comments on commit d7ad329

Please sign in to comment.