diff --git a/llama_hub/bagel/README.md b/llama_hub/bagel/README.md new file mode 100644 index 0000000000..17af6f58c6 --- /dev/null +++ b/llama_hub/bagel/README.md @@ -0,0 +1,33 @@ +# Bagel Loader + +The Bagel Loader returns a set of documents retrieved from BagelDB that are closest to the given query embeddings or query texts. +The user initializes the loader with the name of a BagelDB collection. They then pass in either a query vector or a query text (not both), along with optional query parameters such as `limit`, `where`, `where_document` and `include`. + +## Usage + +Here's an example usage of the BagelReader. + +```python +from llama_hub.bagel.base import BagelReader + +# The Bagel reader loads data from a collection hosted on BagelDB. +# This requires only a collection name. +reader = BagelReader( + collection_name="my_bagel_collection" +) + +query_embeddings=[x1, x2, x3, ....] + +documents = reader.load_data(query_vector=query_embeddings, limit=5) + + +reader = BagelReader( + collection_name="my_bagel_collection_2" +) + +query_texts = ["some text"] + +documents = reader.load_data(query_texts=query_texts, limit=5) +``` + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples. 
diff --git a/llama_hub/bagel/__init__.py b/llama_hub/bagel/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/llama_hub/bagel/base.py b/llama_hub/bagel/base.py new file mode 100644 index 0000000000..0f33bb685d --- /dev/null +++ b/llama_hub/bagel/base.py @@ -0,0 +1,191 @@ +from typing import Any, Dict, List, Mapping, Optional, Sequence, TypeVar, Union + +from typing_extensions import Literal + +from llama_index.readers.base import BaseReader +from llama_index.readers.schema.base import Document + +# define types +ID = str +IDs = List[ID] + +Vector = Union[Sequence[float], Sequence[int]] +Embedding = Vector +Embeddings = List[Embedding] + +Metadata = Mapping[str, Union[str, int, float]] +Metadatas = List[Metadata] + +# Metadata Query Grammar +LiteralValue = Union[str, int, float] +LogicalOperator = Union[Literal["$and"], Literal["$or"]] +WhereOperator = Union[ + Literal["$gt"], + Literal["$gte"], + Literal["$lt"], + Literal["$lte"], + Literal["$ne"], + Literal["$eq"], +] +OperatorExpression = Dict[Union[WhereOperator, LogicalOperator], LiteralValue] + +Where = Dict[ + Union[str, LogicalOperator], Union[LiteralValue, OperatorExpression, List["Where"]] +] + +WhereDocumentOperator = Union[Literal["$contains"], LogicalOperator] +WhereDocument = Dict[WhereDocumentOperator, Union[str, List["WhereDocument"]]] + +ClusterMetadata = Dict[Any, Any] + +Doc = str +Documents = List[Doc] + +Parameter = TypeVar("Parameter", Embedding, Doc, Metadata, ID) +T = TypeVar("T") +OneOrMany = Union[T, List[T]] + +# This should just be List[Literal["documents", "embeddings", "metadatas", "distances"]] +# However, this provokes an incompatibility with the Overrides library and Python 3.7 +Include = List[ + Union[ + Literal["documents"], + Literal["embeddings"], + Literal["metadatas"], + Literal["distances"], + ] +] + +LiteralValue = LiteralValue +LogicalOperator = LogicalOperator +WhereOperator = WhereOperator +OperatorExpression = OperatorExpression +Where = Where 
class BagelReader(BaseReader):
    """Reader that queries a BagelDB cluster and returns the matches as Documents.

    On construction the reader builds a REST client pointed at
    ``api.bageldb.ai`` and fetches the cluster named ``collection_name``.
    ``load_data`` then runs a nearest-neighbour query against that cluster.
    """

    def __init__(self, collection_name: str) -> None:
        """Initialize BagelReader.

        Args:
            collection_name: Name of the collection (cluster) to load from.

        Raises:
            ImportError: If the ``bagel`` module cannot be imported.
            ValueError: If ``collection_name`` is empty.
        """
        try:
            import bagel
        except ImportError as exc:
            # The module is imported as `bagel`, but this loader's
            # requirements.txt lists the `betabageldb` distribution, so that
            # is the name the user has to install.
            raise ImportError(
                "`bagel` package not found, please run `pip install betabageldb`"
            ) from exc
        from bagel.config import Settings

        if not collection_name:
            raise ValueError("collection_name cannot be empty")

        self.collection_name = collection_name

        server_settings = Settings(
            bagel_api_impl="rest", bagel_server_host="api.bageldb.ai"
        )

        self.client = bagel.Client(server_settings)

        self._collection = self.client.get_cluster(collection_name)

    @staticmethod
    def _field(results: Any, key: str) -> Any:
        """Return ``results[key]``, or None when the key is absent."""
        try:
            return results[key]
        except KeyError:
            return None

    @classmethod
    def _first_query_column(cls, results: Any, key: str) -> Optional[List[Any]]:
        """Return the first query's values for ``key``, or None if unavailable."""
        column = cls._field(results, key)
        return column[0] if column else None

    def create_documents(self, results: Any) -> List[Document]:
        """Create documents from the results.

        Only the entries at index 0 of each result field are converted, i.e.
        the matches belonging to the first query.
        NOTE(review): matches for any additional query vectors/texts are
        ignored here; confirm whether that is intended.

        Args:
            results: Results from the query, indexable by "ids", "documents",
                "embeddings" and "metadatas".

        Returns:
            List of documents, one per id of the first query.
        """
        ids = self._first_query_column(results, "ids") or []

        columns = [ids]
        for key in ("documents", "embeddings", "metadatas"):
            column = self._first_query_column(results, key)
            # A field that was not requested via `include` may be missing or
            # empty. Pad it with None so zip() still yields one row per id
            # instead of failing on `None[0]`.
            columns.append(column if column is not None else [None] * len(ids))

        documents = []
        for doc_id, text, embedding, metadata in zip(*columns):
            fields = {
                "doc_id": doc_id,
                "text": text,
                "embedding": embedding,
                "metadata": metadata,
            }
            # Omit absent values so Document applies its own defaults rather
            # than receiving explicit None arguments.
            present = {name: value for name, value in fields.items() if value is not None}
            documents.append(Document(**present))

        return documents

    def load_data(
        self,
        query_vector: Optional[OneOrMany[Embedding]] = None,
        query_texts: Optional[OneOrMany[Doc]] = None,
        limit: int = 10,
        where: Optional[Where] = None,
        where_document: Optional[WhereDocument] = None,
        include: Optional[Include] = None,
    ) -> List[Document]:
        """Get the top ``limit`` documents for the provided query vector or texts.

        Args:
            query_vector: The embedding(s) to get the closest neighbors of.
                Exactly one of ``query_vector`` and ``query_texts`` is required.
            query_texts: The document text(s) to get the closest neighbors of.
                Exactly one of ``query_vector`` and ``query_texts`` is required.
            limit: The number of neighbors to return for each query. Optional.
            where: A Where type dict used to filter results by. Optional.
            where_document: A WhereDocument type dict used to filter. Optional.
            include: A list of what to include in the results. Defaults to
                ["metadatas", "documents", "embeddings", "distances"].

        Returns:
            Document(s) with the closest embeddings to the
            query_vector or query_texts.

        Raises:
            ValueError: If neither or both of ``query_vector`` and
                ``query_texts`` are given, or if the query returns nothing.
        """
        # Exactly one of the two query inputs must be supplied.
        if (query_vector is None) == (query_texts is None):
            raise ValueError(
                "You must provide either embeddings or texts to find, but not both"
            )

        if where is None:
            where = {}

        if where_document is None:
            where_document = {}

        # Build the default per call instead of sharing one mutable list
        # object across every invocation.
        if include is None:
            include = ["metadatas", "documents", "embeddings", "distances"]

        results = self._collection.find(
            query_embeddings=query_vector,
            query_texts=query_texts,
            n_results=limit,
            where=where,
            where_document=where_document,
            include=include,
        )

        # check if there are results
        if not results:
            raise ValueError("No results found")

        # check if there are embeddings or documents
        if not self._field(results, "embeddings") and not self._field(
            results, "documents"
        ):
            raise ValueError("No embeddings or documents found")

        # create documents from the results
        return self.create_documents(results)
+85,17 @@ "ImageReader": { "id": "file/image", "author": "ravi03071991", - "keywords": ["invoice", "receipt"] + "keywords": [ + "invoice", + "receipt" + ] }, "HubspotReader": { "id": "hubspot", "author": "ykhli", - "keywords": ["hubspot"] + "keywords": [ + "hubspot" + ] }, "EpubReader": { "id": "file/epub", @@ -107,22 +116,31 @@ "SimpleCSVReader": { "id": "file/simple_csv", "author": "vguillet", - "keywords": ["spreadsheet"] + "keywords": [ + "spreadsheet" + ] }, "PagedCSVReader": { "id": "file/paged_csv", "author": "thejessezhang", - "keywords": ["spreadsheet"] + "keywords": [ + "spreadsheet" + ] }, "PandasCSVReader": { "id": "file/pandas_csv", "author": "ephe-meral", - "keywords": ["spreadsheet"] + "keywords": [ + "spreadsheet" + ] }, "SDLReader": { "id": "file/sdl", "author": "ajhofmann", - "keywords": ["graphql", "schema"] + "keywords": [ + "graphql", + "schema" + ] }, "SimpleWebPageReader": { "id": "web/simple_web", @@ -135,27 +153,46 @@ "ReadabilityWebPageReader": { "id": "web/readability_web", "author": "pandazki", - "extra_files": ["Readability.js"] + "extra_files": [ + "Readability.js" + ] }, "BeautifulSoupWebReader": { "id": "web/beautiful_soup_web", "author": "thejessezhang", - "keywords": ["substack", "readthedocs", "documentation"] + "keywords": [ + "substack", + "readthedocs", + "documentation" + ] }, "RssReader": { "id": "web/rss", "author": "bborn", - "keywords": ["feed", "rss", "atom"] + "keywords": [ + "feed", + "rss", + "atom" + ] }, "SitemapReader": { "id": "web/sitemap", "author": "selamanse", - "keywords": ["sitemap", "website", "seo"] + "keywords": [ + "sitemap", + "website", + "seo" + ] }, "DatabaseReader": { "id": "database", "author": "kevinqz", - "keywords": ["sql", "postgres", "snowflake", "aws rds"] + "keywords": [ + "sql", + "postgres", + "snowflake", + "aws rds" + ] }, "GraphQLReader": { "id": "graphql", @@ -246,7 +283,9 @@ "YoutubeTranscriptReader": { "id": "youtube_transcript", "author": "ravi03071991", - "keywords": ["video"] + 
"keywords": [ + "video" + ] }, "MakeWrapper": { "id": "make_com" @@ -266,42 +305,77 @@ "UnstructuredReader": { "id": "file/unstructured", "author": "thejessezhang", - "keywords": ["sec", "html", "eml", "10k", "10q", "unstructured.io", "yaml", "yml"] + "keywords": [ + "sec", + "html", + "eml", + "10k", + "10q", + "unstructured.io", + "yaml", + "yml" + ] }, "KnowledgeBaseWebReader": { "id": "web/knowledge_base", "author": "jasonwcfan", - "keywords": ["documentation"] + "keywords": [ + "documentation" + ] }, "S3Reader": { "id": "s3", "author": "thejessezhang", - "keywords": ["aws s3", "bucket", "amazon web services"] + "keywords": [ + "aws s3", + "bucket", + "amazon web services" + ] }, "RemoteReader": { "id": "remote", "author": "thejessezhang", - "keywords": ["hosted", "url", "gutenberg"] + "keywords": [ + "hosted", + "url", + "gutenberg" + ] }, "RemoteDepthReader": { "id": "remote_depth", "author": "simonMoisselin", - "keywords": ["hosted", "url", "multiple"] + "keywords": [ + "hosted", + "url", + "multiple" + ] }, "DadJokesReader": { "id": "dad_jokes", "author": "sidu", - "keywords": ["jokes", "dad jokes"] + "keywords": [ + "jokes", + "dad jokes" + ] }, "WordLiftLoader": { "id": "wordlift", "author": "msftwarelab", - "keywords": ["wordlift", "knowledge graph", "graphql", "structured data", "seo"] + "keywords": [ + "wordlift", + "knowledge graph", + "graphql", + "structured data", + "seo" + ] }, "WhatsappChatLoader": { "id": "whatsapp", "author": "batmanscode", - "keywords": ["whatsapp", "chat"] + "keywords": [ + "whatsapp", + "chat" + ] }, "BilibiliTranscriptReader": { "id": "bilibili", @@ -310,17 +384,28 @@ "RedditReader": { "id": "reddit", "author": "vanessahlyan", - "keywords": ["reddit", "subreddit", "search", "comments"] + "keywords": [ + "reddit", + "subreddit", + "search", + "comments" + ] }, "MemosReader": { "id": "memos", "author": "bubu", - "keywords": ["memos", "note"] + "keywords": [ + "memos", + "note" + ] }, "SpotifyReader": { "id": "spotify", 
"author": "ong", - "keywords": ["spotify", "music"] + "keywords": [ + "spotify", + "music" + ] }, "GithubRepositoryReader": { "id": "github_repo", @@ -333,47 +418,79 @@ "source code", "placeholder" ], - "extra_files": ["github_client.py", "utils.py", "__init__.py"] + "extra_files": [ + "github_client.py", + "utils.py", + "__init__.py" + ] }, "RDFReader": { "id": "file/rdf", "author": "mommi84", - "keywords": ["rdf", "n-triples", "graph", "knowledge graph"] + "keywords": [ + "rdf", + "n-triples", + "graph", + "knowledge graph" + ] }, "ReadwiseReader": { "id": "readwise", "author": "alexbowe", - "keywords": ["readwise", "highlights", "reading", "pkm"] + "keywords": [ + "readwise", + "highlights", + "reading", + "pkm" + ] }, "PandasExcelReader": { "id": "file/pandas_excel", "author": "maccarini", - "keywords": ["spreadsheet"] + "keywords": [ + "spreadsheet" + ] }, "ZendeskReader": { "id": "zendesk", "author": "bbornsztein", - "keywords": ["zendesk", "knowledge base", "help center"] + "keywords": [ + "zendesk", + "knowledge base", + "help center" + ] }, "IntercomReader": { "id": "intercom", "author": "bbornsztein", - "keywords": ["intercom", "knowledge base", "help center"] + "keywords": [ + "intercom", + "knowledge base", + "help center" + ] }, "WordpressReader": { "id": "wordpress", "author": "bbornsztein", - "keywords": ["wordpress", "blog"] + "keywords": [ + "wordpress", + "blog" + ] }, "GmailReader": { "id": "gmail", "author": "bbornsztein", - "keywords": ["gmail", "email"] + "keywords": [ + "gmail", + "email" + ] }, "SteamshipFileReader": { "id": "steamship", "author": "douglas-reid", - "keywords": ["steamship"] + "keywords": [ + "steamship" + ] }, "GPTRepoReader": { "id": "gpt_repo", @@ -386,27 +503,41 @@ "HatenaBlogReader": { "id": "hatena_blog", "author": "Shoya SHIRAKI", - "keywords": ["hatena", "blog"] + "keywords": [ + "hatena", + "blog" + ] }, "OpendalReader": { "id": "opendal_reader", "author": "OpenDAL Contributors", - "keywords": ["storage"] + 
"keywords": [ + "storage" + ] }, "OpendalS3Reader": { "id": "opendal_reader/s3", "author": "OpenDAL Contributors", - "keywords": ["storage", "s3"] + "keywords": [ + "storage", + "s3" + ] }, "OpendalAzblobReader": { "id": "opendal_reader/azblob", "author": "OpenDAL Contributors", - "keywords": ["storage", "azblob"] + "keywords": [ + "storage", + "azblob" + ] }, "OpendalGcsReader": { "id": "opendal_reader/gcs", "author": "OpenDAL Contributors", - "keywords": ["storage", "gcs"] + "keywords": [ + "storage", + "gcs" + ] }, "ConfluenceReader": { "id": "confluence", @@ -419,12 +550,17 @@ "JiraReader": { "id": "jira", "author": "bearguy", - "keywords": ["jira"] + "keywords": [ + "jira" + ] }, "UnstructuredURLLoader": { "id": "web/unstructured_web", "author": "kravetsmic", - "keywords": ["unstructured.io", "url"] + "keywords": [ + "unstructured.io", + "url" + ] }, "GoogleSheetsReader": { "id": "google_sheets", @@ -433,17 +569,26 @@ "FeedlyRssReader": { "id": "feedly_rss", "author": "kychanbp", - "keywords": ["feedly", "rss"] + "keywords": [ + "feedly", + "rss" + ] }, "FlatPdfReader": { "id": "file/flat_pdf", "author": "emmanuel-oliveira", - "keywords": ["pdf", "flat", "flattened"] + "keywords": [ + "pdf", + "flat", + "flattened" + ] }, - "PDFMinerReader": { + "PDFMinerReader": { "id": "file/pdf_miner", "author": "thunderbug1", - "keywords": ["pdf"] + "keywords": [ + "pdf" + ] }, "MilvusReader": { "id": "milvus", @@ -452,7 +597,11 @@ "StackoverflowReader": { "id": "stackoverflow", "author": "allen-munsch", - "keywords": ["posts", "questions", "answers"] + "keywords": [ + "posts", + "questions", + "answers" + ] }, "ZulipReader": { "id": "zulip", @@ -461,67 +610,113 @@ "OutlookLocalCalendarReader": { "id": "outlook_localcalendar", "author": "tevslin", - "keywords": ["calendar", "outlook"] + "keywords": [ + "calendar", + "outlook" + ] }, "ApifyActor": { "id": "apify/actor", "author": "drobnikj", - "keywords": ["apify", "scraper", "scraping", "crawler"] + "keywords": [ + 
"apify", + "scraper", + "scraping", + "crawler" + ] }, "ApifyDataset": { "id": "apify/dataset", "author": "drobnikj", - "keywords": ["apify", "scraper", "scraping", "crawler"] + "keywords": [ + "apify", + "scraper", + "scraping", + "crawler" + ] }, "TrelloReader": { "id": "trello", "author": "bluzir", - "keywords": ["trello"] + "keywords": [ + "trello" + ] }, "DeepLakeReader": { "id": "deeplake", "author": "adolkhan", - "keywords": ["deeplake"] + "keywords": [ + "deeplake" + ] }, "ImageCaptionReader": { "id": "file/image_blip", "author": "FarisHijazi", - "keywords": ["image"] + "keywords": [ + "image" + ] }, "ImageVisionLLMReader": { "id": "file/image_blip2", "author": "FarisHijazi", - "keywords": ["image"] + "keywords": [ + "image" + ] }, "ImageTabularChartReader": { "id": "file/image_deplot", "author": "jon-chuang", - "keywords": ["image", "chart", "tabular", "figure"] + "keywords": [ + "image", + "chart", + "tabular", + "figure" + ] }, "IPYNBReader": { "id": "file/ipynb", "author": "FarisHijazi", - "keywords": ["jupyter", "notebook", "ipynb"] + "keywords": [ + "jupyter", + "notebook", + "ipynb" + ] }, "HuggingFaceFSReader": { "id": "huggingface/fs", "author": "jerryjliu", - "keywords": ["hugging", "face", "huggingface", "filesystem", "fs"] + "keywords": [ + "hugging", + "face", + "huggingface", + "filesystem", + "fs" + ] }, "DeepDoctectionReader": { "id": "file/deepdoctection", "author": "jerryjliu", - "keywords": ["doctection", "doc"] + "keywords": [ + "doctection", + "doc" + ] }, "PandasAIReader": { "id": "pandas_ai", "author": "jerryjliu", - "keywords": ["pandas", "ai"] + "keywords": [ + "pandas", + "ai" + ] }, "MetalReader": { "id": "metal", "author": "getmetal", - "keywords": ["metal", "retriever", "storage"] + "keywords": [ + "metal", + "retriever", + "storage" + ] }, "BoardDocsReader": { "id": "boarddocs", @@ -534,12 +729,18 @@ "PyMuPDFReader": { "id": "file/pymu_pdf", "author": "iamarunbrahma", - "keywords": ["pymupdf", "pdf"] + "keywords": [ + 
"pymupdf", + "pdf" + ] }, "MondayReader": { "id": "mondaydotcom", "author": "nadavgr", - "keywords": ["monday", "mondaydotcom"] + "keywords": [ + "monday", + "mondaydotcom" + ] }, "MangoppsGuidesReader": { "id": "mangoapps_guides", @@ -562,13 +763,22 @@ "WeatherReader": { "id": "weather", "author": "iamadhee", - "keywords": ["weather","openweather"] + "keywords": [ + "weather", + "openweather" + ] }, - "OpenMap": { + "OpenMap": { "id": "maps", "author": "carrotpy", - "keywords": ["open maps","maps","open street maps","overpass api","geo"] - }, + "keywords": [ + "open maps", + "maps", + "open street maps", + "overpass api", + "geo" + ] + }, "KalturaESearchReader": { "id": "kaltura/esearch", "author": "kaltura", @@ -587,7 +797,10 @@ "FirestoreReader": { "id": "firestore", "author": "rayzhudev", - "keywords": ["firestore", "datastore"] + "keywords": [ + "firestore", + "datastore" + ] }, "KibelaReader": { "id": "kibela", @@ -601,7 +814,10 @@ "repository", "issues" ], - "extra_files": ["github_client.py", "__init__.py"] + "extra_files": [ + "github_client.py", + "__init__.py" + ] }, "FirebaseRealtimeDatabaseReader": { "id": "firebase_realtimedb", @@ -632,10 +848,10 @@ "memsql" ] }, - "SECFilingsLoader":{ - "id":"sec_filings", - "author":"Athe-kunal", - "keywords":[ + "SECFilingsLoader": { + "id": "sec_filings", + "author": "Athe-kunal", + "keywords": [ "finance", "SEC Filings", "10-K", @@ -691,6 +907,21 @@ "ZepReader": { "id": "zep", "author": "zep", - "keywords": ["zep", "retriever", "memory", "storage"] + "keywords": [ + "zep", + "retriever", + "memory", + "storage" + ] + }, + "BagelReader": { + "id": "bagel", + "author": "asif", + "keywords": [ + "vector", + "database", + "bagelDB", + "storage" + ] } -} +} \ No newline at end of file