From 33155c7ac8b24da316bdfb8d6f1e74746669e554 Mon Sep 17 00:00:00 2001 From: sadegh1404 Date: Sun, 10 Sep 2023 08:57:29 +0330 Subject: [PATCH] Elastic data reader (#508) Co-authored-by: Sadegh Ranjbar --- llama_hub/elasticsearch/README.md | 4 ++-- llama_hub/elasticsearch/base.py | 29 ++++++++++++++---------- llama_hub/elasticsearch/requirements.txt | 2 +- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/llama_hub/elasticsearch/README.md b/llama_hub/elasticsearch/README.md index 20d3ac971c..ff7b5fa663 100644 --- a/llama_hub/elasticsearch/README.md +++ b/llama_hub/elasticsearch/README.md @@ -5,7 +5,7 @@ The user initializes the loader with an Elasticsearch index. They then pass in a ## Usage -Here's an example usage of the ElasticsearchReader. +Here's an example usage of the ElasticsearchReader to load 100 documents. ```python from llama_index import download_loader @@ -20,7 +20,7 @@ reader = ElasticsearchReader( query_dict = {"query": {"match": {"message": {"query": "this is a test"}}}} documents = reader.load_data( - "", query=query_dict, embedding_field="field_name" + "", query=query_dict, embedding_field="field_name", size=100 ) ``` diff --git a/llama_hub/elasticsearch/base.py b/llama_hub/elasticsearch/base.py index d72a4a0e3f..4b4ee64b38 100644 --- a/llama_hub/elasticsearch/base.py +++ b/llama_hub/elasticsearch/base.py @@ -4,13 +4,13 @@ """ - from typing import List, Optional from llama_index.readers.base import BaseReader from llama_index.readers.schema.base import Document + class ElasticsearchReader(BaseReader): """ Read documents from an Elasticsearch/Opensearch index. @@ -18,26 +18,28 @@ class ElasticsearchReader(BaseReader): These documents can then be used in a downstream Llama Index data structure. 
Args: - endpoint (str): URL (http/https) of cluster + endpoint (str): URL (http/https) of cluster without port index (str): Name of the index (required) - httpx_client_args (dict): Optional additional args to pass to the `httpx.Client` + basic_auth (set): basic authentication credentials (username and password) """ def __init__( - self, endpoint: str, index: str, httpx_client_args: Optional[dict] = None + self, endpoint: str, index: str, basic_auth: Optional[set] = None ): + """Initialize with parameters.""" - import httpx # noqa: F401 + from elasticsearch import Elasticsearch - self._client = httpx.Client(base_url=endpoint, **(httpx_client_args or {})) + self._es_client = Elasticsearch(endpoint, basic_auth=basic_auth) self._index = index self._endpoint = endpoint def load_data( - self, - field: str, - query: Optional[dict] = None, - embedding_field: Optional[str] = None, + self, + field: str, + query: Optional[dict] = None, + embedding_field: Optional[str] = None, + size: Optional[int] = 10 ) -> List[Document]: """Read data from the Elasticsearch index. @@ -45,18 +47,21 @@ def load_data( field (str): Field in the document to retrieve text from query (Optional[dict]): Elasticsearch JSON query DSL object. For example: - {"query": {"match": {"message": {"query": "this is a test"}}}} + { "query" : {"match": {"message": {"query": "this is a test"}}}} embedding_field (Optional[str]): If there are embeddings stored in this index, this field can be used to set the embedding field on the returned Document list. + size (Optional[int]): The number of documents to retrieve from Elasticsearch Returns: List[Document]: A list of documents. 
""" - res = self._client.post(f"{self._index}/_search", json=query).json() + query = query['query'] if query is not None else None # To remain backward compatible + res = self._es_client.search(index=self._index, query=query, size=size) documents = [] for hit in res["hits"]["hits"]: value = hit["_source"][field] + _ = hit['_source'].pop(field) embedding = hit["_source"].get(embedding_field or "", None) documents.append( Document(text=value, extra_info=hit["_source"], embedding=embedding) diff --git a/llama_hub/elasticsearch/requirements.txt b/llama_hub/elasticsearch/requirements.txt index 79228389fc..7728977ee2 100644 --- a/llama_hub/elasticsearch/requirements.txt +++ b/llama_hub/elasticsearch/requirements.txt @@ -1 +1 @@ -httpx \ No newline at end of file +elasticsearch==8.9 \ No newline at end of file