This repository has been archived by the owner on Mar 1, 2024. It is now read-only.
Bagel loader Added #479
Merged
@@ -0,0 +1,35 @@
# Bagel Loader

The Bagel Loader returns a set of texts corresponding to query embeddings or query texts retrieved from a BagelDB collection. The user initializes the loader with a BagelDB collection name, then passes in a query vector or query texts along with optional query parameters such as `where`, `where_document`, and `include`.

## Usage

Here's an example usage of the BagelReader.

```python
from llama_index import download_loader

BagelReader = download_loader("BagelReader")

# The Bagel reader loads data from a BagelDB collection (cluster).
# This requires a collection name.
reader = BagelReader(collection_name="my_bagel_collection")

# Query by embedding vector.
query_embeddings = [x1, x2, x3, ...]  # placeholder for a query embedding

documents = reader.load_data(query_vector=query_embeddings, limit=5)

# Or query by text.
reader = BagelReader(collection_name="my_bagel_collection_2")

query_texts = ["some text"]

documents = reader.load_data(query_texts=query_texts, limit=5)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
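The `where` and `where_document` parameters accept the metadata query grammar defined in the loader's type aliases (operators such as `$gt`, `$eq`, `$and`, `$contains`). The sketch below illustrates the intended shape and semantics of these filter dicts; the `matches` helper is hypothetical and exists only for illustration, since BagelDB evaluates filters server-side.

```python
# Hypothetical examples of the Where / WhereDocument filter grammar.
where = {"year": {"$gte": 2020}}         # metadata filter
where_document = {"$contains": "bagel"}  # document-content filter
where_combined = {
    "$and": [{"year": {"$gte": 2020}}, {"author": {"$eq": "alice"}}]
}


def matches(where: dict, metadata: dict) -> bool:
    """Illustrative client-side evaluation of a Where dict.

    This helper only demonstrates the grammar's semantics; it is not part
    of the loader. Missing metadata keys are not handled.
    """
    for key, cond in where.items():
        if key == "$and":
            # all sub-filters must match
            if not all(matches(sub, metadata) for sub in cond):
                return False
        elif key == "$or":
            # at least one sub-filter must match
            if not any(matches(sub, metadata) for sub in cond):
                return False
        elif isinstance(cond, dict):
            # an operator expression, e.g. {"$gte": 2020}
            value = metadata[key]
            for op, operand in cond.items():
                ok = {
                    "$gt": value > operand,
                    "$gte": value >= operand,
                    "$lt": value < operand,
                    "$lte": value <= operand,
                    "$ne": value != operand,
                    "$eq": value == operand,
                }[op]
                if not ok:
                    return False
        else:
            # a bare literal value means equality
            if metadata[key] != cond:
                return False
    return True
```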
Empty file.
@@ -0,0 +1,191 @@
from typing import Any, Dict, List, Mapping, Optional, Sequence, TypeVar, Union

from typing_extensions import Literal

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document

# define types
ID = str
IDs = List[ID]

Vector = Union[Sequence[float], Sequence[int]]
Embedding = Vector
Embeddings = List[Embedding]

Metadata = Mapping[str, Union[str, int, float]]
Metadatas = List[Metadata]

# Metadata Query Grammar
LiteralValue = Union[str, int, float]
LogicalOperator = Union[Literal["$and"], Literal["$or"]]
WhereOperator = Union[
    Literal["$gt"],
    Literal["$gte"],
    Literal["$lt"],
    Literal["$lte"],
    Literal["$ne"],
    Literal["$eq"],
]
OperatorExpression = Dict[Union[WhereOperator, LogicalOperator], LiteralValue]

Where = Dict[
    Union[str, LogicalOperator], Union[LiteralValue, OperatorExpression, List["Where"]]
]

WhereDocumentOperator = Union[Literal["$contains"], LogicalOperator]
WhereDocument = Dict[WhereDocumentOperator, Union[str, List["WhereDocument"]]]

ClusterMetadata = Dict[Any, Any]

Doc = str
Documents = List[Doc]

Parameter = TypeVar("Parameter", Embedding, Doc, Metadata, ID)
T = TypeVar("T")
OneOrMany = Union[T, List[T]]

# This should just be List[Literal["documents", "embeddings", "metadatas", "distances"]].
# However, that provokes an incompatibility with the overrides library and Python 3.7.
Include = List[
    Union[
        Literal["documents"],
        Literal["embeddings"],
        Literal["metadatas"],
        Literal["distances"],
    ]
]


class BagelReader(BaseReader):
    """Reader for BagelDB collections."""

    def __init__(self, collection_name: str) -> None:
        """Initialize BagelReader.

        Args:
            collection_name: Name of the collection to load from.

        Returns:
            None
        """
        try:
            import bagel
        except ImportError:
            raise ImportError(
                "`bagel` package not found, please run `pip install betabageldb`"
            )
        from bagel.config import Settings

        if not collection_name:
            raise ValueError("collection_name cannot be empty")

        self.collection_name = collection_name

        server_settings = Settings(
            bagel_api_impl="rest", bagel_server_host="api.bageldb.ai"
        )

        self.client = bagel.Client(server_settings)

        self._collection = self.client.get_cluster(collection_name)

    def create_documents(self, results: Any) -> Any:
        """Create documents from the results.

        Args:
            results: Results from the query.

        Returns:
            List of documents.
        """
        documents = []
        # zip the parallel result lists (ids, texts, embeddings, metadatas)
        # for the first query
        all_results = list(
            zip(
                results["ids"][0],
                results["documents"][0],
                results["embeddings"][0],
                results["metadatas"][0],
            )
        )
        # iterate through the results
        for result in all_results:
            # create a LlamaIndex Document
            document = Document(
                doc_id=result[0],
                text=result[1],
                embedding=result[2],
                metadata=result[3],
            )
            documents.append(document)

        return documents

    def load_data(
        self,
        query_vector: Optional[OneOrMany[Embedding]] = None,
        query_texts: Optional[OneOrMany[Doc]] = None,
        limit: int = 10,
        where: Optional[Where] = None,
        where_document: Optional[WhereDocument] = None,
        include: Include = ["metadatas", "documents", "embeddings", "distances"],
    ) -> Any:
        """Get the top `limit` documents for the provided query_vector or query_texts.

        Args:
            query_vector: The embedding(s) to get the closest neighbors of. Optional.
            query_texts: The document text(s) to get the closest neighbors of. Optional.
            limit: The number of neighbors to return for each query. Optional.
            where: A Where-type dict used to filter results by metadata. Optional.
            where_document: A WhereDocument-type dict used to filter by document
                contents. Optional.
            include: A list of what to include in the results. Optional.

        Returns:
            Document(s) with the closest embeddings to the
            query_vector or query_texts.
        """
        # If neither query_vector nor query_texts is provided,
        # or both are provided, raise an error
        if (query_vector is None and query_texts is None) or (
            query_vector is not None and query_texts is not None
        ):
            raise ValueError(
                "You must provide either embeddings or texts to find, but not both"
            )

        if where is None:
            where = {}

        if where_document is None:
            where_document = {}

        # get the results from the collection
        results = self._collection.find(
            query_embeddings=query_vector,
            query_texts=query_texts,
            n_results=limit,
            where=where,
            where_document=where_document,
            include=include,
        )

        # check if there are results
        if not results:
            raise ValueError("No results found")

        # check if there are embeddings or documents
        if not results["embeddings"] and not results["documents"]:
            raise ValueError("No embeddings or documents found")

        # create documents from the results
        documents = self.create_documents(results)

        return documents
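The `create_documents` step above can be sketched independently of llama_index. In the sketch below, `SimpleDocument` is a hypothetical stand-in for LlamaIndex's `Document`, and the `results` dict mimics the parallel-list shape the reader expects from a query (one inner list per query).

```python
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class SimpleDocument:
    # hypothetical stand-in for llama_index's Document, for illustration only
    doc_id: str
    text: str
    embedding: Any
    metadata: Dict[str, Any]


def create_documents(results: Dict[str, Any]) -> List[SimpleDocument]:
    # the query result holds parallel lists, one entry per query;
    # index [0] selects the lists belonging to the first (only) query
    rows = zip(
        results["ids"][0],
        results["documents"][0],
        results["embeddings"][0],
        results["metadatas"][0],
    )
    return [SimpleDocument(*row) for row in rows]


# a mock result in the assumed shape
results = {
    "ids": [["id1", "id2"]],
    "documents": [["text one", "text two"]],
    "embeddings": [[[0.1, 0.2], [0.3, 0.4]]],
    "metadatas": [[{"page": 1}, {"page": 2}]],
}
docs = create_documents(results)
```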
@@ -0,0 +1 @@
betabageldb
nit: canonical is now

from llama_hub.bagel.base import BagelReader

(no need for `download_loader`)
Changed in README. Thanks!