Elastic data reader (#508)

Co-authored-by: Sadegh Ranjbar <[email protected]>
run-llama · Sep 10, 2023 · 33155c7 · 33155c7
1 parent c9f6646
commit 33155c7
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 15 deletions.
diff --git a/llama_hub/elasticsearch/README.md b/llama_hub/elasticsearch/README.md
@@ -5,7 +5,7 @@ The user initializes the loader with an Elasticsearch index. They then pass in a
 
 ## Usage
 
-Here's an example usage of the ElasticsearchReader.
+Here's an example usage of the ElasticsearchReader to load 100 documents.
 
 ```python
 from llama_index import download_loader
@@ -20,7 +20,7 @@ reader = ElasticsearchReader(
 
 query_dict = {"query": {"match": {"message": {"query": "this is a test"}}}}
 documents = reader.load_data(
-    "<field_name>", query=query_dict, embedding_field="field_name"
+    "<field_name>", query=query_dict, embedding_field="field_name", size=100
 )
 ```
 

diff --git a/llama_hub/elasticsearch/base.py b/llama_hub/elasticsearch/base.py
@@ -4,59 +4,64 @@
 
 """
 
-
 from typing import List, Optional
 
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
 
 
+
 class ElasticsearchReader(BaseReader):
     """
     Read documents from an Elasticsearch/Opensearch index.
 
     These documents can then be used in a downstream Llama Index data structure.
 
     Args:
-        endpoint (str): URL (http/https) of cluster
+        endpoint (str): URL (http/https) of cluster without port
         index (str): Name of the index (required)
-        httpx_client_args (dict): Optional additional args to pass to the `httpx.Client`
+        basic_auth (set): Username and password for basic authentication
     """
 
     def __init__(
-        self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None
+            self, endpoint: str, index: str, basic_auth: Optional[set] = None
     ):
+
         """Initialize with parameters."""
-        import httpx  # noqa: F401
+        from elasticsearch import Elasticsearch
 
-        self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {}))
+        self._es_client = Elasticsearch(endpoint, basic_auth=basic_auth)
         self._index = index
         self._endpoint = endpoint
 
     def load_data(
-        self,
-        field: str,
-        query: Optional[dict] = None,
-        embedding_field: Optional[str] = None,
+            self,
+            field: str,
+            query: Optional[dict] = None,
+            embedding_field: Optional[str] = None,
+            size: Optional[int] = 10
     ) -> List[Document]:
         """Read data from the Elasticsearch index.
 
         Args:
             field (str): Field in the document to retrieve text from
             query (Optional[dict]): Elasticsearch JSON query DSL object.
                 For example:
-                {"query": {"match": {"message": {"query": "this is a test"}}}}
+                { "query" : {"match": {"message": {"query": "this is a test"}}}}
             embedding_field (Optional[str]): If there are embeddings stored in
                 this index, this field can be used
                 to set the embedding field on the returned Document list.
+            size (Optional[int]): The number of documents to retrieve from Elasticsearch
         Returns:
             List[Document]: A list of documents.
 
         """
-        res = self._client.post(f"{self._index}/_search", json=query).json()
+        query = query['query'] if query is not None else None  # To remain backward compatible
+        res = self._es_client.search(index=self._index, query=query, size=size)
         documents = []
         for hit in res["hits"]["hits"]:
             value = hit["_source"][field]
+            _ = hit['_source'].pop(field)
             embedding = hit["_source"].get(embedding_field or "", None)
             documents.append(
                 Document(text=value, extra_info=hit["_source"], embedding=embedding)

diff --git a/llama_hub/elasticsearch/requirements.txt b/llama_hub/elasticsearch/requirements.txt
@@ -1 +1 @@
-httpx
+elasticsearch==8.9