Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Elastic data reader (#508)
Browse files Browse the repository at this point in the history

Co-authored-by: Sadegh Ranjbar <[email protected]>
  • Loading branch information
sadegh1404 and Sadegh Ranjbar authored Sep 10, 2023
1 parent c9f6646 commit 33155c7
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 15 deletions.
4 changes: 2 additions & 2 deletions llama_hub/elasticsearch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ The user initializes the loader with an Elasticsearch index. They then pass in a

## Usage

Here's an example usage of the ElasticsearchReader.
Here's an example usage of the ElasticsearchReader to load 100 documents.

```python
from llama_index import download_loader
Expand All @@ -20,7 +20,7 @@ reader = ElasticsearchReader(

query_dict = {"query": {"match": {"message": {"query": "this is a test"}}}}
documents = reader.load_data(
"<field_name>", query=query_dict, embedding_field="field_name"
"<field_name>", query=query_dict, embedding_field="field_name", size=100
)
```

Expand Down
29 changes: 17 additions & 12 deletions llama_hub/elasticsearch/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,64 @@
"""


from typing import List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document



class ElasticsearchReader(BaseReader):
"""
Read documents from an Elasticsearch/Opensearch index.
These documents can then be used in a downstream Llama Index data structure.
Args:
endpoint (str): URL (http/https) of cluster
endpoint (str): URL (http/https) of cluster without port
index (str): Name of the index (required)
httpx_client_args (dict): Optional additional args to pass to the `httpx.Client`
basic_auth (set): Optional basic authentication credentials (username and password)
"""

def __init__(
self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None
self, endpoint: str, index: str, basic_auth: Optional[set] = None
):

"""Initialize with parameters."""
import httpx # noqa: F401
from elasticsearch import Elasticsearch

self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {}))
self._es_client = Elasticsearch(endpoint, basic_auth=basic_auth)
self._index = index
self._endpoint = endpoint

def load_data(
self,
field: str,
query: Optional[dict] = None,
embedding_field: Optional[str] = None,
self,
field: str,
query: Optional[dict] = None,
embedding_field: Optional[str] = None,
size: Optional[int] = 10
) -> List[Document]:
"""Read data from the Elasticsearch index.
Args:
field (str): Field in the document to retrieve text from
query (Optional[dict]): Elasticsearch JSON query DSL object.
For example:
{"query": {"match": {"message": {"query": "this is a test"}}}}
{ "query" : {"match": {"message": {"query": "this is a test"}}}}
embedding_field (Optional[str]): If there are embeddings stored in
this index, this field can be used
to set the embedding field on the returned Document list.
size (Optional[int]): The maximum number of documents to retrieve from Elasticsearch
Returns:
List[Document]: A list of documents.
"""
res = self._client.post(f"{self._index}/_search", json=query).json()
query = query['query'] if query is not None else None # To remain backward compatible
res = self._es_client.search(index=self._index, query=query, size=size)
documents = []
for hit in res["hits"]["hits"]:
value = hit["_source"][field]
_ = hit['_source'].pop(field)
embedding = hit["_source"].get(embedding_field or "", None)
documents.append(
Document(text=value, extra_info=hit["_source"], embedding=embedding)
Expand Down
2 changes: 1 addition & 1 deletion llama_hub/elasticsearch/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
httpx
elasticsearch==8.9

0 comments on commit 33155c7

Please sign in to comment.