diff --git a/cookbook/multi_modal_RAG_vdms.ipynb b/cookbook/multi_modal_RAG_vdms.ipynb index 20a19810cf286..fbbfafb7490c1 100644 --- a/cookbook/multi_modal_RAG_vdms.ipynb +++ b/cookbook/multi_modal_RAG_vdms.ipynb @@ -42,12 +42,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "a1b9206b08ef626e15b356bf9e031171f7c7eb8f956a2733f196f0109246fe2b\n" + "87218928619b1301f3079123d7289b6c527481a72b352788867332568fd2f343\n", + "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "! docker run --rm -d -p 55559:55555 --name vdms_rag_nb intellabs/vdms:latest\n", + "%pip install --quiet -U vdms\n", "\n", "# Connect to VDMS Vector Store\n", "from langchain_community.vectorstores.vdms import VDMS_Client\n", @@ -72,10 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "! pip install --quiet -U vdms langchain-experimental\n", - "\n", - "# lock to 0.10.19 due to a persistent bug in more recent versions\n", - "! pip install --quiet pdf2image \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml open_clip_torch" + "! 
pip install -q \"onnxruntime==1.17.0\" \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml pillow matplotlib tiktoken open_clip_torch torch" ] }, { @@ -115,11 +114,12 @@ "import requests\n", "\n", "# Folder to store pdf and extracted images\n", - "datapath = Path(\"./data/multimodal_files\").resolve()\n", + "base_datapath = Path(\"./data/multimodal_files\").resolve()\n", + "datapath = base_datapath / \"images\"\n", "datapath.mkdir(parents=True, exist_ok=True)\n", "\n", "pdf_url = \"https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\"\n", - "pdf_path = str(datapath / pdf_url.split(\"/\")[-1])\n", + "pdf_path = str(base_datapath / pdf_url.split(\"/\")[-1])\n", "with open(pdf_path, \"wb\") as f:\n", " f.write(requests.get(pdf_url).content)" ] @@ -310,12 +310,21 @@ "execution_count": 9, "id": "4c93fab3-74c4-4f1d-958a-0bc4cdd0797e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "from langchain_community.llms.ollama import Ollama\n", - "from langchain_core.messages import HumanMessage\n", + "%pip install -Uq langchain-ollama\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "from langchain_ollama.llms import OllamaLLM\n", "\n", "\n", "def prompt_func(data_dict):\n", @@ -340,8 +349,8 @@ " \"As an expert art critic and historian, your task is to analyze and interpret images, \"\n", " \"considering their historical and cultural significance. Alongside the images, you will be \"\n", " \"provided with related text to offer context. Both will be retrieved from a vectorstore based \"\n", - " \"on user-input keywords. 
Please convert answers to english and use your extensive knowledge \"\n", - " \"and analytical skills to provide a comprehensive summary that includes:\\n\"\n", + " \"on user-input keywords. Please use your extensive knowledge and analytical skills to provide a \"\n", + " \"comprehensive summary that includes:\\n\"\n", " \"- A detailed description of the visual elements in the image.\\n\"\n", " \"- The historical and cultural context of the image.\\n\"\n", " \"- An interpretation of the image's symbolism and meaning.\\n\"\n", @@ -359,7 +368,7 @@ " \"\"\"Multi-modal RAG chain\"\"\"\n", "\n", " # Multi-modal LLM\n", - " llm_model = Ollama(\n", + " llm_model = OllamaLLM(\n", " verbose=True, temperature=0.5, model=\"llava\", base_url=\"http://localhost:11434\"\n", " )\n", "\n", @@ -461,10 +470,15 @@ "name": "stdout", "output_type": "stream", "text": [ - " The image depicts a woman with several children. The woman appears to be of Cherokee heritage, as suggested by the text provided. The image is described as having been initially regretted by the subject, Florence Owens Thompson, due to her feeling that it did not accurately represent her leadership qualities.\n", - "The historical and cultural context of the image is tied to the Great Depression and the Dust Bowl, both of which affected the Cherokee people in Oklahoma. The photograph was taken during this period, and its subject, Florence Owens Thompson, was a leader within her community who worked tirelessly to help those affected by these crises.\n", - "The image's symbolism and meaning can be interpreted as a representation of resilience and strength in the face of adversity. The woman is depicted with multiple children, which could signify her role as a caregiver and protector during difficult times.\n", - "Connections between the image and the related text include Florence Owens Thompson's leadership qualities and her regretted feelings about the photograph. 
Additionally, the mention of Dorothea Lange, the photographer who took this photo, ties the image to its historical context and the broader narrative of the Great Depression and Dust Bowl in Oklahoma. \n" + " The image is a black and white photograph taken by Dorothea Lange in March 1936 as part of the Farm Security Administration's Office of War Information Collection. It depicts a woman, Florence Owens Thompson, a Cherokee from Oklahoma, standing next to her seven children. The photograph has been described as \"Great Photographs\" and is titled \"DESTITUTE PEA PICKERS IN CALIFORNIA. MOTHER OF SEVEN CHILDREN. AGE THIRTY-TWO. NIPOMO, CALIFORNIA.\"\n", + "\n", + "The woman in the photograph appears to be in a state of poverty and hardship. She is dressed in simple clothing, and her facial expression suggests that she is not happy about having her picture taken. The background of the image shows a barren landscape with no visible signs of prosperity or abundance.\n", + "\n", + "The related text provides some context for the photograph. It mentions that Florence Owens Thompson initially regretted having her picture taken by Dorothea Lange, as she felt that it did not accurately represent her character. However, her daughter Katherine later said that she was a strong and influential leader in her community.\n", + "\n", + "The interpretation of the image's symbolism and meaning could be that it serves as a powerful visual representation of the struggles faced by people during the Great Depression. The photograph captures the hardships and poverty experienced by many individuals at that time, and it highlights the resilience and strength of those who were affected by these difficult circumstances.\n", + "\n", + "In terms of connections between the image and the related text, the photograph serves as a visual complement to the written text, which provides additional information about the subject of the photograph and her community. 
The text helps to provide a more complete understanding of the context in which the photograph was taken and the impact it had on the people involved. Overall, the combination of the image and the related text offers a comprehensive summary that highlights the historical and cultural significance of this important photograph. \n" ] } ], @@ -491,6 +505,14 @@ "source": [ "! docker kill vdms_rag_nb" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4a98ee", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -509,7 +531,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/cookbook/visual_RAG_vdms.ipynb b/cookbook/visual_RAG_vdms.ipynb index d0d87185d3a07..aee22ee16f461 100644 --- a/cookbook/visual_RAG_vdms.ipynb +++ b/cookbook/visual_RAG_vdms.ipynb @@ -26,7 +26,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2e44b44201c8778b462342ac97f5ccf05a4e02aa8a04505ecde97bf20dcc4cbb\n" + "183c5eb067431896e0bd138fbf7f124af6317b522152051fbb0dc977baf3802c\n" ] } ], @@ -363,7 +363,7 @@ "\t\tThere are 2 shoppers in this video. Shopper 1 is wearing a plaid shirt and a spectacle. Shopper 2 who is not completely captured in the frame seems to wear a black shirt and is moving away with his back turned towards the camera. There is a shelf towards the right of the camera frame. Shopper 2 is hanging an item back to a hanger and then quickly walks away in a similar fashion as shopper 2. Contents of the nearer side of the shelf with respect to camera seems to be camping lanterns and cleansing agents, arranged at the top. In the middle part of the shelf, various tools including grommets, a pocket saw, candles, and other helpful camping items can be observed. 
Midway through the shelf contains items which appear to be steel containers and items made up of plastic with red, green, orange, and yellow colors, while those at the bottom are packed in cardboard boxes. Contents at the farther part of the shelf are well stocked and organized but are not glaringly visible.\n", "\n", "\tMetadata:\n", - "\t\t{'fps': 24.0, 'id': 'c6e5f894-b905-46f5-ac9e-4487a9235561', 'total_frames': 120.0, 'video': 'clip16.mp4'}\n", + "\t\t{'fps': 24.0, 'total_frames': 120.0, 'video': 'clip16.mp4'}\n", "Retrieved Top matching video!\n", "\n", "\n" @@ -394,7 +394,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3edf8783e114487ca490d8dec5c46884", + "model_id": "62dd5deb78ff4ffdac9f13e7cfda1167", "version_major": 2, "version_minor": 0 }, @@ -404,6 +404,13 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:accelerate.big_modeling:Some parameters are on the meta device because they were offloaded to the cpu.\n" + ] } ], "source": [ @@ -555,7 +562,7 @@ "\t\tA single shopper is seen in this video standing facing the shelf and in the bottom part of the frame. He's wearing a light-colored shirt and a spectacle. The shopper is carrying a red colored basket in his left hand. The entire basket is not clearly visible, but it does seem to contain something in a blue colored package which the shopper has just placed in the basket given his right hand was seen inside the basket. Then the shopper leans towards the shelf and checks out an item in orange package. He picks this single item with his right hand and proceeds to place the item in the basket. The entire shelf looks well stocked except for the top part of the shelf which is empty. The shopper has not picked any item from this part of the shelf. The rest of the shelf looks well stocked and does not need any restocking. 
The contents on the farther part of the shelf consists of items, majority of which are packed in black, yellow, and green packages. No other details are visible of these items.\n", "\n", "\tMetadata:\n", - "\t\t{'fps': 24.0, 'id': '37ddc212-994e-4db0-877f-5ed09965ab90', 'total_frames': 162.0, 'video': 'clip10.mp4'}\n", + "\t\t{'fps': 24.0, 'total_frames': 162.0, 'video': 'clip10.mp4'}\n", "Retrieved Top matching video!\n", "\n", "\n" @@ -585,7 +592,7 @@ "User : Find a man holding a red shopping basket\n", "Assistant : Most relevant retrieved video is **clip9.mp4** \n", "\n", - "I see a person standing in front of a well-stocked shelf, they are wearing a light-colored shirt and glasses, and they have a red shopping basket in their left hand. They are leaning forward and picking up an item from the shelf with their right hand. The item is packaged in a blue-green box. Based on the scene description, I can confirm that the person is indeed holding a red shopping basket.\n" + "I see a person standing in front of a well-stocked shelf, they are wearing a light-colored shirt and glasses, and they have a red shopping basket in their left hand. They are leaning forward and picking up an item from the shelf with their right hand. The item is packaged in a blue-green box. Based on the available information, I cannot confirm whether the basket is empty or contains items. 
However, the rest of the\n" ] } ], @@ -655,7 +662,7 @@ ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": ".langchain-venv", "language": "python", "name": "python3" }, @@ -669,7 +676,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/docs/docs/integrations/providers/vdms.mdx b/docs/docs/integrations/providers/vdms.mdx index 2ed0ea4455dd5..a70a8cb07c212 100644 --- a/docs/docs/integrations/providers/vdms.mdx +++ b/docs/docs/integrations/providers/vdms.mdx @@ -18,7 +18,8 @@ There are two ways to get started with VDMS: #### Install VDMS on your local machine via docker ```bash - docker run -d -p 55555:55555 intellabs/vdms:latest +docker pull intellabs/vdms:latest +docker run -d -p 55555:55555 intellabs/vdms:latest ``` #### Install VDMS directly on your local machine @@ -49,7 +50,7 @@ vectorstore = VDMS.from_documents( docs, client=client, collection_name="langchain-demo", - embedding_function=HuggingFaceEmbeddings(model_name=model_name), + embedding=HuggingFaceEmbeddings(model_name=model_name), engine="FaissFlat" distance_strategy="L2", ) diff --git a/docs/docs/integrations/vectorstores/vdms.ipynb b/docs/docs/integrations/vectorstores/vdms.ipynb index dd3adceab3508..4ce5407dbcba3 100644 --- a/docs/docs/integrations/vectorstores/vdms.ipynb +++ b/docs/docs/integrations/vectorstores/vdms.ipynb @@ -7,28 +7,29 @@ "source": [ "# Intel's Visual Data Management System (VDMS)\n", "\n", - ">Intel's [VDMS](https://github.com/IntelLabs/vdms) is a storage solution for efficient access of big-”visual”-data that aims to achieve cloud scale by searching for relevant visual data via visual metadata stored as a graph and enabling machine friendly enhancements to visual data for faster access. 
VDMS is licensed under MIT.\n", + "This notebook covers how to get started with `VDMS` as a vector store.\n", + "\n", + ">Intel's [Visual Data Management System (VDMS)](https://github.com/IntelLabs/vdms) is a storage solution for efficient access of big-”visual”-data that aims to achieve cloud scale by searching for relevant visual data via visual metadata stored as a graph and enabling machine friendly enhancements to visual data for faster access. VDMS is licensed under MIT. For more information on `VDMS`, visit [this page](https://github.com/IntelLabs/vdms/wiki), and find the LangChain API reference [here](https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.vdms.VDMS.html).\n", "\n", "VDMS supports:\n", "* K nearest neighbor search\n", "* Euclidean distance (L2) and inner product (IP)\n", - "* Libraries for indexing and computing distances: TileDBDense, TileDBSparse, FaissFlat (Default), FaissIVFFlat, Flinng\n", + "* Libraries for indexing and computing distances: FaissFlat (Default), FaissHNSWFlat, FaissIVFFlat, Flinng, TileDBDense, TileDBSparse\n", "* Embeddings for text, images, and video\n", "* Vector and metadata searches\n", "\n", - "VDMS has server and client components. To setup the server, see the [installation instructions](https://github.com/IntelLabs/vdms/blob/master/INSTALL.md) or use the [docker image](https://hub.docker.com/r/intellabs/vdms).\n", + "## Setup\n", "\n", + "VDMS has server and client components. 
To set up the server, see the [installation instructions](https://github.com/IntelLabs/vdms/blob/master/INSTALL.md) or use the [docker image](https://hub.docker.com/r/intellabs/vdms).\n", "This notebook shows how to use VDMS as a vector store using the docker image.\n", "\n", - "You'll need to install `langchain-community` with `pip install -qU langchain-community` to use this integration\n", - "\n", - "To begin, install the Python packages for the VDMS client and Sentence Transformers:" + "To access the `VDMS` vector store, you'll need to install the `langchain-community` package and the [VDMS Client Python Module](https://pypi.org/project/vdms/)." ] }, { "cell_type": "code", "execution_count": 1, - "id": "2167badd", + "id": "83a43688", "metadata": {}, "outputs": [ { @@ -40,30 +41,33 @@ } ], "source": [ - "# Pip install necessary package\n", - "%pip install --upgrade --quiet pip vdms sentence-transformers langchain-huggingface > /dev/null" + "%pip install -qU vdms" ] }, { "cell_type": "markdown", - "id": "af2b4512", + "id": "f47f73f4", "metadata": {}, "source": [ - "## Start VDMS Server\n", - "Here we start the VDMS server with port 55555." 
+ "## Initialization\n", + "\n", + "### Start VDMS Server\n", + "\n", + "In this example, the VDMS Server is deployed via the publicly available Docker image.\n", + "Here we start the VDMS server on port 55555 and connect to it using the VDMS client.\n" ] }, { "cell_type": "code", "execution_count": 2, - "id": "4b1537c7", + "id": "4855307b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "b26917ffac236673ef1d035ab9c91fe999e29c9eb24aa6c7103d7baa6bf2f72d\n" + "9474de7bc05849faf0b9e545125bdbc060a398d3d8d043e91f5e18455824fb8b\n" ] } ], @@ -73,180 +77,684 @@ }, { "cell_type": "markdown", - "id": "2b5ffbf8", + "id": "7ed7ace6", "metadata": {}, "source": [ - "## Basic Example (using the Docker Container)\n", - "\n", - "In this basic example, we demonstrate adding documents into VDMS and using it as a vector database.\n", - "\n", - "You can run the VDMS Server in a Docker container separately to use with LangChain which connects to the server via the VDMS Python Client. \n", + "### Create Documents\n", "\n", - "VDMS has the ability to handle multiple collections of documents, but the LangChain interface expects one, so we need to specify the name of the collection . The default collection name used by LangChain is \"langchain\".\n" + "Create documents we can add to the vector store." 
] }, { "cell_type": "code", "execution_count": 3, - "id": "5201ba0c", + "id": "a810b2fe", "metadata": {}, "outputs": [], "source": [ - "import time\n", - "import warnings\n", + "import logging\n", "\n", - "warnings.filterwarnings(\"ignore\")\n", + "logging.basicConfig()\n", + "logging.getLogger(\"langchain_community.vectorstores.vdms\").setLevel(logging.INFO)\n", "\n", - "from langchain_community.document_loaders.text import TextLoader\n", - "from langchain_community.vectorstores import VDMS\n", - "from langchain_community.vectorstores.vdms import VDMS_Client\n", - "from langchain_huggingface import HuggingFaceEmbeddings\n", - "from langchain_text_splitters.character import CharacterTextSplitter\n", + "from uuid import uuid4\n", + "\n", + "from langchain_core.documents import Document\n", + "\n", + "document_1 = Document(\n", + " page_content=\"I had chocolate chip pancakes and scrambled eggs for breakfast this morning.\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=1,\n", + ")\n", + "\n", + "document_2 = Document(\n", + " page_content=\"The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.\",\n", + " metadata={\"source\": \"news\"},\n", + " id=2,\n", + ")\n", + "\n", + "document_3 = Document(\n", + " page_content=\"Building an exciting new project with LangChain - come check it out!\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=3,\n", + ")\n", + "\n", + "document_4 = Document(\n", + " page_content=\"Robbers broke into the city bank and stole $1 million in cash.\",\n", + " metadata={\"source\": \"news\"},\n", + " id=4,\n", + ")\n", + "\n", + "document_5 = Document(\n", + " page_content=\"Wow! That was an amazing movie. I can't wait to see it again.\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=5,\n", + ")\n", + "\n", + "document_6 = Document(\n", + " page_content=\"Is the new iPhone worth the price? 
Read this review to find out.\",\n", + " metadata={\"source\": \"website\"},\n", + " id=6,\n", + ")\n", + "\n", + "document_7 = Document(\n", + " page_content=\"The top 10 soccer players in the world right now.\",\n", + " metadata={\"source\": \"website\"},\n", + " id=7,\n", + ")\n", + "\n", + "document_8 = Document(\n", + " page_content=\"LangGraph is the best framework for building stateful, agentic applications!\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=8,\n", + ")\n", + "\n", + "document_9 = Document(\n", + " page_content=\"The stock market is down 500 points today due to fears of a recession.\",\n", + " metadata={\"source\": \"news\"},\n", + " id=9,\n", + ")\n", "\n", - "time.sleep(2)\n", - "DELIMITER = \"-\" * 50\n", + "document_10 = Document(\n", + " page_content=\"I have a bad feeling I am going to get deleted :(\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=10,\n", + ")\n", "\n", - "# Connect to VDMS Vector Store\n", - "vdms_client = VDMS_Client(host=\"localhost\", port=55555)" + "documents = [\n", + " document_1,\n", + " document_2,\n", + " document_3,\n", + " document_4,\n", + " document_5,\n", + " document_6,\n", + " document_7,\n", + " document_8,\n", + " document_9,\n", + " document_10,\n", + "]\n", + "uuids = [str(doc.id) for doc in documents]" ] }, { "cell_type": "markdown", - "id": "935069bc", + "id": "8fd45749", "metadata": {}, "source": [ - "Here are some helper functions for printing results." + "### Embedding Model\n", + "\n", + "We use HuggingFaceEmbeddings for this example as the embedding model." 
] }, { "cell_type": "code", "execution_count": 4, - "id": "e78814eb", + "id": "c2f88a6f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n", + "# Embedding Dimensions: 768\n" + ] + } + ], "source": [ - "def print_document_details(doc):\n", - " print(f\"Content:\\n\\t{doc.page_content}\\n\")\n", - " print(\"Metadata:\")\n", - " for key, value in doc.metadata.items():\n", - " if value != \"Missing property\":\n", - " print(f\"\\t{key}:\\t{value}\")\n", - "\n", - "\n", - "def print_results(similarity_results, score=True):\n", - " print(f\"{DELIMITER}\\n\")\n", - " if score:\n", - " for doc, score in similarity_results:\n", - " print(f\"Score:\\t{score}\\n\")\n", - " print_document_details(doc)\n", - " print(f\"{DELIMITER}\\n\")\n", - " else:\n", - " for doc in similarity_results:\n", - " print_document_details(doc)\n", - " print(f\"{DELIMITER}\\n\")\n", - "\n", - "\n", - "def print_response(list_of_entities):\n", - " for ent in list_of_entities:\n", - " for key, value in ent.items():\n", - " if value != \"Missing property\":\n", - " print(f\"\\n{key}:\\n\\t{value}\")\n", - " print(f\"{DELIMITER}\\n\")" + "%pip install -qU langchain-huggingface\n", + "\n", + "from langchain_huggingface import HuggingFaceEmbeddings\n", + "\n", + "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", + "embeddings = HuggingFaceEmbeddings(model_name=model_name)\n", + "print(\n", + " f\"# Embedding Dimensions: {len(embeddings.embed_query('This is a test document.'))}\"\n", + ")" ] }, { "cell_type": "markdown", - "id": "88229867", + "id": "9d037340", "metadata": {}, "source": [ - "### Load Document and Obtain Embedding Function\n", - "Here we load the most recent State of the Union Address and split the document into chunks. 
\n", + "### Add items to vector store\n", + "\n", + "Use the VDMS Client to connect to a VDMS vectorstore using FAISS IndexFlat indexing (default) and Euclidean distance (default) as the distance metric for similarity search.\n", "\n", - "LangChain vector stores use a string/keyword `id` for bookkeeping documents. By default, `id` is a uuid but here we're defining it as an integer cast as a string. Additional metadata is also provided with the documents and the HuggingFaceEmbeddings are used for this example as the embedding function." + "We can add items to our vector store by using the `add_documents` function." ] }, { "cell_type": "code", "execution_count": 5, - "id": "2ebfc16c", + "id": "3fe4457f", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:Descriptor set my_collection_faiss_L2 created\n" + ] + } + ], + "source": [ + "from langchain_community.vectorstores import VDMS\n", + "from langchain_community.vectorstores.vdms import VDMS_Client\n", + "\n", + "collection_name = \"my_collection_faiss_L2\"\n", + "\n", + "vdms_client = VDMS_Client(host=\"localhost\", port=55555)\n", + "\n", + "vector_store = VDMS(\n", + " client=vdms_client,\n", + " embedding=embeddings,\n", + " collection_name=collection_name,\n", + " engine=\"FaissFlat\",\n", + " distance_strategy=\"L2\",\n", + ")\n", + "\n", + "inserted_ids = vector_store.add_documents(documents=documents, ids=uuids)" + ] + }, + { + "cell_type": "markdown", + "id": "213acf08", + "metadata": {}, + "source": [ + "## Query vector store\n", + "\n", + "Once your vector store has been created and the relevant documents have been added you will most likely wish to query it during the running of your chain or agent. 
\n", + "\n", + "### Query directly\n", + "\n", + "#### Similarity search\n", + "\n", + "Performing a simple similarity search can be done as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e2b96fcf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:VDMS similarity search took 0.0082 seconds\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "# Documents: 42\n", - "# Embedding Dimensions: 768\n" + "* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]\n", + "* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search(\n", + " \"LangChain provides abstractions to make working with LLMs easy\",\n", + " k=2,\n", + " filter={\"source\": [\"==\", \"tweet\"]},\n", + ")\n", + "for res in results:\n", + " print(f\"* {res.page_content} [{res.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "cdd117ea", + "metadata": {}, + "source": [ + "#### Similarity search with score\n", + "\n", + "If you want to execute a similarity search and receive the corresponding scores you can run:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2768a331", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* [SIM=0.809] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. 
[{'langchain_id': '2', 'source': 'news'}]\n", + "\n", + "\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search_with_score(\n", + " \"Will it be hot tomorrow?\", k=1, filter={\"source\": [\"==\", \"news\"]}\n", + ")\n", + "for res, score in results:\n", + " print(f\"* [SIM={score:0.3f}] {res.page_content} [{res.metadata}]\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "92b436c8", + "metadata": {}, + "source": [ + "#### Search by vector\n", + "\n", + "You can also search by vector:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8ea434a5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:VDMS similarity search took 0.0043 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]\n" + ] + } + ], + "source": [ + "results = vector_store.similarity_search_by_vector(\n", + " embedding=embeddings.embed_query(\"I love green eggs and ham!\"), k=1\n", + ")\n", + "for doc in results:\n", + " print(f\"* {doc.page_content} [{doc.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c1c1e6f", + "metadata": {}, + "source": [ + "### Query by turning into retriever\n", + "\n", + "You can also transform the vector store into a retriever for easier usage in your chains.\n", + "\n", + "Here is how to transform your vector store into a retriever and invoke with a simple query and filter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e97b4493", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:VDMS similarity search took 0.0044 seconds\n" ] + }, + { + "data": { + "text/plain": [ + "[Document(id='4', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "# load the document and split it into chunks\n", + "retriever = vector_store.as_retriever(\n", + " search_type=\"similarity\",\n", + " search_kwargs={\"k\": 1},\n", + ")\n", + "retriever.invoke(\"Stealing from the bank is a crime\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7b6f7867", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:VDMS similarity search mmr took 0.0045 secs\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(metadata={'langchain_id': '4', 'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever = vector_store.as_retriever(\n", + " search_type=\"mmr\",\n", + " search_kwargs={\"k\": 1, \"fetch_k\": 5},\n", + ")\n", + "retriever.invoke(\"Stealing from the bank is a crime\", filter={\"source\": \"news\"})" + ] + }, + { + "cell_type": "markdown", + "id": "6bd251d6", + "metadata": {}, + "source": [ + "## Manage vector store\n", + "\n", + "In addition to interacting with the vectorstore to add items, we can also update and delete items." 
+ ] + }, + { + "cell_type": "markdown", + "id": "335ae44b", + "metadata": {}, + "source": [ + "### Update items in vector store\n", + "\n", + "Now that we have added documents to our vector store, we can update existing documents by using the `update_documents` function. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "799ea40c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "* [id: 1] I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]\n", + "* [id: 2] The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees. [{'source': 'news'}]\n" + ] + } + ], + "source": [ + "updated_document_1 = Document(\n", + " page_content=\"I had chocolate chip pancakes and fried eggs for breakfast this morning.\",\n", + " metadata={\"source\": \"tweet\"},\n", + " id=1,\n", + ")\n", + "\n", + "updated_document_2 = Document(\n", + " page_content=\"The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees.\",\n", + " metadata={\"source\": \"news\"},\n", + " id=2,\n", + ")\n", + "\n", + "vector_store.update_document(\n", + " collection_name, document_id=uuids[0], document=updated_document_1\n", + ")\n", + "\n", + "# You can also update multiple documents at once\n", + "vector_store.update_documents(\n", + " collection_name,\n", + " ids=uuids[:2],\n", + " documents=[updated_document_1, updated_document_2],\n", + ")\n", + "\n", + "results = vector_store.get_by_ids(uuids[:2])\n", + "for doc in results:\n", + " print(f\"* [id: {doc.id}] {doc.page_content} [{doc.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "b98df677", + "metadata": {}, + "source": [ + "### Delete items from vector store\n", + "\n", + "We can also delete items from our vector store as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "90c528ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "* [id: 1] I had chocolate chip pancakes and fried eggs for breakfast this morning. [{'source': 'tweet'}]\n", + "* [id: 2] The weather forecast for tomorrow is sunny and warm, with a high of 82 degrees. [{'source': 'news'}]\n", + "* [id: 3] Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]\n", + "* [id: 4] Robbers broke into the city bank and stole $1 million in cash. [{'source': 'news'}]\n", + "* [id: 5] Wow! That was an amazing movie. I can't wait to see it again. [{'source': 'tweet'}]\n", + "* [id: 6] Is the new iPhone worth the price? Read this review to find out. [{'source': 'website'}]\n", + "* [id: 7] The top 10 soccer players in the world right now. [{'source': 'website'}]\n", + "* [id: 8] LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]\n", + "* [id: 9] The stock market is down 500 points today due to fears of a recession. [{'source': 'news'}]\n" + ] + } + ], + "source": [ + "vector_store.delete(ids=[uuids[-1]])\n", + "\n", + "results = vector_store.get_by_ids(uuids)\n", + "\n", + "for doc in results:\n", + " print(f\"* [id: {doc.id}] {doc.page_content} [{doc.metadata}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d020d2a", + "metadata": {}, + "source": [ + "### Add Existing IDs\n", + "\n", + "IDs should be unique; therefore, if they already exist, the data is overwritten. \n", + "In the previous step, we deleted id `'10'`. Next, we try to insert ALL data again, but since the other ids exist, they are skipped and only id `'10'` is inserted. This is done by setting the `delete_existing` keyword to `False`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "27e5edac", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:[!] 
Embeddings skipped for following ids because already exists: ['1', '2', '3', '4', '5', '6', '7', '8', '9']\n", + "Can retry with 'delete_existing' set to True\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ids inserted: ['10']\n" + ] + } + ], + "source": [ + "inserted_ids = vector_store.add_documents(\n", + " documents=documents, ids=uuids, delete_existing=False\n", + ")\n", + "print(f\"ids inserted: {inserted_ids}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e802cc2a", + "metadata": {}, + "source": [ + "Here we attempt to insert all data again but nothing is inserted." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "47b2e040", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:[!] Embeddings skipped for following ids because already exists: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']\n", + "Can retry with 'delete_existing' set to True\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ids inserted: []\n" + ] + } + ], + "source": [ + "inserted_ids = vector_store.add_documents(\n", + " documents=documents, ids=uuids, delete_existing=False\n", + ")\n", + "print(f\"ids inserted: {inserted_ids}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4a25da6", + "metadata": {}, + "source": [ + "Now we delete the existing ids and re-insert the data. There is no need to set `delete_existing`, as deleting existing ids is the default behavior."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8a0990fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ids inserted: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']\n" + ] + } + ], + "source": [ + "inserted_ids = vector_store.add_documents(documents=documents, ids=uuids)\n", + "print(f\"ids inserted: {inserted_ids}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8118ae2f", + "metadata": {}, + "source": [ + "## Similarity Search using other engines\n", + "\n", + "VDMS supports various libraries for indexing and computing distances: FaissFlat (Default), FaissHNSWFlat, FaissIVFFlat, Flinng, TileDBDense, and TileDBSparse.\n", + "By default, the vectorstore uses FaissFlat. Below we show a few examples using the other engines.\n", + "\n", + "### Load Sample Document\n", + "\n", + "Here we load the most recent State of the Union Address and split the document into chunks. Additional metadata is also generated for each document chunk.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "30db3589", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Documents: 42\n" + ] + } + ], + "source": [ + "from langchain_community.document_loaders.text import TextLoader\n", + "from langchain_text_splitters.character import CharacterTextSplitter\n", + "\n", + "# Load the document and split it into chunks\n", "document_path = \"../../how_to/state_of_the_union.txt\"\n", "raw_documents = TextLoader(document_path).load()\n", "\n", - "# split it into chunks\n", + "# Split it into chunks\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(raw_documents)\n", "ids = []\n", "for doc_idx, doc in enumerate(docs):\n", - " ids.append(str(doc_idx + 1))\n", - " docs[doc_idx].metadata[\"id\"] = str(doc_idx + 1)\n", + " ids.append(int(doc_idx + 1))\n", + "\n", + " # Synthetic 
metadata\n", + " docs[doc_idx].metadata[\"id\"] = int(doc_idx + 1)\n", " docs[doc_idx].metadata[\"page_number\"] = int(doc_idx + 1)\n", " docs[doc_idx].metadata[\"president_included\"] = (\n", " \"president\" in doc.page_content.lower()\n", " )\n", - "print(f\"# Documents: {len(docs)}\")\n", - "\n", - "\n", - "# create the open-source embedding function\n", - "model_name = \"sentence-transformers/all-mpnet-base-v2\"\n", - "embedding = HuggingFaceEmbeddings(model_name=model_name)\n", - "print(\n", - " f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n", - ")" + "print(f\"# Documents: {len(docs)}\")" ] }, { "cell_type": "markdown", - "id": "a6a596f0", + "id": "bea87fb1", "metadata": {}, "source": [ - "### Similarity Search using Faiss Flat and Euclidean Distance (Default)\n", "\n", - "In this section, we add the documents to VDMS using FAISS IndexFlat indexing (default) and Euclidena distance (default) as the distance metric for simiarity search. We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson`." + "### Similarity Search using Faiss HNSWFlat and Euclidean Distance\n", + "\n", + "Here, we add the documents to VDMS using Faiss IndexHNSWFlat indexing and L2 as the distance metric for similarity search. We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson` and also return the score along with the document." ] }, { "cell_type": "code", - "execution_count": 6, - "id": "1f3f43d4", + "execution_count": 17, + "id": "4bf6614f", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:Descriptor set my_collection_FaissHNSWFlat_L2 created\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------------------------------\n", - "\n", - "Content:\n", - "\tTonight. 
I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "* [SIM=1.203] Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", "\n", "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", "\n", "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. [{'id': 32, 'langchain_id': '32', 'page_number': 32, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n", - "Content:\n", - "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "* [SIM=1.495] As Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. 
\n", "\n", "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", "\n", @@ -260,17 +768,10 @@ "\n", "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", "\n", - "Our troops in Iraq and Afghanistan faced many dangers.\n", + "Our troops in Iraq and Afghanistan faced many dangers. [{'id': 37, 'langchain_id': '37', 'page_number': 37, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Metadata:\n", - "\tid:\t37\n", - "\tpage_number:\t37\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n", - "Content:\n", - "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "* [SIM=1.501] A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", "\n", "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", "\n", @@ -280,214 +781,118 @@ "\n", "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 
\n", "\n", - "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders. [{'id': 33, 'langchain_id': '33', 'page_number': 33, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Metadata:\n", - "\tid:\t33\n", - "\tpage_number:\t33\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n" ] } ], "source": [ - "# add data\n", - "collection_name = \"my_collection_faiss_L2\"\n", - "db_FaissFlat = VDMS.from_documents(\n", + "db_FaissHNSWFlat = VDMS.from_documents(\n", " docs,\n", " client=vdms_client,\n", " ids=ids,\n", - " collection_name=collection_name,\n", - " embedding=embedding,\n", + " collection_name=\"my_collection_FaissHNSWFlat_L2\",\n", + " embedding=embeddings,\n", + " engine=\"FaissHNSWFlat\",\n", + " distance_strategy=\"L2\",\n", ")\n", - "\n", - "# Query (No metadata filtering)\n", - "k = 3\n", - "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "returned_docs = db_FaissFlat.similarity_search(query, k=k, filter=None)\n", - "print_results(returned_docs, score=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "c2e36c18", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--------------------------------------------------\n", - "\n", - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", - "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n", - "Content:\n", - "\tAnd for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n", - "\n", - "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", - "\n", - "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \n", - "\n", - "And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n", - "\n", - "So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. 
\n", - "\n", - "First, beat the opioid epidemic.\n", - "\n", - "Metadata:\n", - "\tid:\t35\n", - "\tpage_number:\t35\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n", - "Content:\n", - "\tLast month, I announced our plan to supercharge \n", - "the Cancer Moonshot that President Obama asked me to lead six years ago. \n", - "\n", - "Our goal is to cut the cancer death rate by at least 50% over the next 25 years, turn more cancers from death sentences into treatable diseases. \n", - "\n", - "More support for patients and families. \n", - "\n", - "To get there, I call on Congress to fund ARPA-H, the Advanced Research Projects Agency for Health. \n", - "\n", - "It’s based on DARPA—the Defense Department project that led to the Internet, GPS, and so much more. \n", - "\n", - "ARPA-H will have a singular purpose—to drive breakthroughs in cancer, Alzheimer’s, diabetes, and more. \n", - "\n", - "A unity agenda for the nation. \n", - "\n", - "We can do this. \n", - "\n", - "My fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. \n", - "\n", - "In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. 
\n", - "\n", - "We have fought for freedom, expanded liberty, defeated totalitarianism and terror.\n", - "\n", - "Metadata:\n", - "\tid:\t40\n", - "\tpage_number:\t40\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n" - ] - } - ], - "source": [ - "# Query (with filtering)\n", + "# Query\n", "k = 3\n", - "constraints = {\"page_number\": [\">\", 30], \"president_included\": [\"==\", True]}\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "returned_docs = db_FaissFlat.similarity_search(query, k=k, filter=constraints)\n", - "print_results(returned_docs, score=False)" + "docs_with_score = db_FaissHNSWFlat.similarity_search_with_score(query, k=k, filter=None)\n", + "\n", + "for res, score in docs_with_score:\n", + " print(f\"* [SIM={score:0.3f}] {res.page_content} [{res.metadata}]\\n\\n\")" ] }, { "cell_type": "markdown", - "id": "92ab3370", + "id": "062b6816", "metadata": {}, "source": [ "### Similarity Search using Faiss IVFFlat and Inner Product (IP) Distance\n", "\n", - "In this section, we add the documents to VDMS using Faiss IndexIVFFlat indexing and IP as the distance metric for similarity search. We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson` and also return the score along with the document.\n" + "We add the documents to VDMS using Faiss IndexIVFFlat indexing and IP as the distance metric for similarity search. 
We search for three documents (`k=3`) related to the query `What did the president say about Ketanji Brown Jackson` and also return the score along with the document.\n" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "78f502cf", + "execution_count": 18, + "id": "cd43bee3", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:Descriptor set my_collection_FaissIVFFlat_IP created\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------------------------------\n", + "* [SIM=0.164] And built the strongest, freest, and most prosperous nation the world has ever known. \n", "\n", - "Score:\t1.2032090425\n", + "Now is the hour. \n", "\n", - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "Our moment of responsibility. \n", "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "Our test of resolve and conscience, of history itself. \n", "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "It is in this moment that our character is formed. Our purpose is found. Our future is forged. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "Well I know this nation. 
\n", "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "We will meet the test. \n", "\n", - "Score:\t1.4952471256\n", + "To protect freedom and liberty, to expand fairness and opportunity. \n", "\n", - "Content:\n", - "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "We will save democracy. \n", "\n", - "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", + "As hard as these times have been, I am more optimistic about America today than I have been my whole life. \n", "\n", - "And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. \n", + "Because I see the future that is within our grasp. \n", "\n", - "Third, support our veterans. \n", + "Because I know there is simply nothing beyond our capacity. \n", "\n", - "Veterans are the best of us. \n", + "We are the only nation on Earth that has always turned every crisis we have faced into an opportunity. \n", "\n", - "I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. \n", + "The only nation that can be defined by a single word: possibilities. \n", "\n", - "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", + "So on this night, in our 245th year as a nation, I have come to report on the State of the Union. 
[{'id': 41, 'langchain_id': '41', 'page_number': 41, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Our troops in Iraq and Afghanistan faced many dangers.\n", "\n", - "Metadata:\n", - "\tid:\t37\n", - "\tpage_number:\t37\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "* [SIM=0.159] He and his Dad both have Type 1 diabetes, which means they need insulin every day. Insulin costs about $10 a vial to make. \n", "\n", - "Score:\t1.5008399487\n", + "But drug companies charge families like Joshua and his Dad up to 30 times more. I spoke with Joshua’s mom. \n", "\n", - "Content:\n", - "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "Imagine what it’s like to look at your child who needs insulin and have no idea how you’re going to pay for it. \n", "\n", - "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "What it does to your dignity, your ability to look your child in the eye, to be the parent you expect to be. \n", "\n", - "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "Joshua is here with us tonight. Yesterday was his birthday. Happy birthday, buddy. \n", "\n", - "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "For Joshua, and for the 200,000 other young people with Type 1 diabetes, let’s cap the cost of insulin at $35 a month so everyone can afford it. 
\n", "\n", - "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n", + "Drug companies will still do very well. And while we’re at it let Medicare negotiate lower prices for prescription drugs, like the VA already does. [{'id': 18, 'langchain_id': '18', 'page_number': 18, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", "\n", - "Metadata:\n", - "\tid:\t33\n", - "\tpage_number:\t33\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "* [SIM=0.138] And tonight I am announcing that we will join our allies in closing off American air space to all Russian flights – further isolating Russia – and adding an additional squeeze –on their economy. The Ruble has lost 30% of its value. \n", + "\n", + "The Russian stock market has lost 40% of its value and trading remains suspended. Russia’s economy is reeling and Putin alone is to blame. \n", + "\n", + "Together with our allies we are providing support to the Ukrainians in their fight for freedom. Military assistance. Economic assistance. Humanitarian assistance. \n", + "\n", + "We are giving more than $1 Billion in direct assistance to Ukraine. \n", + "\n", + "And we will continue to aid the Ukrainian people as they defend their country and to help ease their suffering. \n", + "\n", + "Let me be clear, our forces are not engaged and will not engage in conflict with Russian forces in Ukraine. \n", + "\n", + "Our forces are not going to Europe to fight in Ukraine, but to defend our NATO Allies – in the event that Putin decides to keep moving west. 
[{'id': 5, 'langchain_id': '5', 'page_number': 5, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", + "\n", "\n" ] } @@ -498,20 +903,21 @@ " client=vdms_client,\n", " ids=ids,\n", " collection_name=\"my_collection_FaissIVFFlat_IP\",\n", - " embedding=embedding,\n", + " embedding=embeddings,\n", " engine=\"FaissIVFFlat\",\n", " distance_strategy=\"IP\",\n", ")\n", - "# Query\n", + "\n", "k = 3\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs_with_score = db_FaissIVFFlat.similarity_search_with_score(query, k=k, filter=None)\n", - "print_results(docs_with_score)" + "for res, score in docs_with_score:\n", + " print(f\"* [SIM={score:0.3f}] {res.page_content} [{res.metadata}]\\n\\n\")" ] }, { "cell_type": "markdown", - "id": "e66d9125", + "id": "25f4e3d0", "metadata": {}, "source": [ "### Similarity Search using FLINNG and IP Distance\n", @@ -521,81 +927,77 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "add81beb", + "execution_count": 19, + "id": "a08ae933", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:Descriptor set my_collection_Flinng_IP created\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------------------------------\n", + "* [SIM=0.000] Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n", "\n", - "Score:\t1.2032090425\n", + "Last year COVID-19 kept us apart. This year we are finally together again. \n", "\n", - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. 
\n", "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "With a duty to one another to the American people to the Constitution. \n", "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "And with an unwavering resolve that freedom will always triumph over tyranny. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n", "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n", "\n", - "Score:\t1.4952471256\n", + "He met the Ukrainian people. \n", "\n", - "Content:\n", - "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. [{'id': 1, 'langchain_id': '1', 'page_number': 1, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}]\n", + "\n", + "\n", + "* [SIM=0.000] Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. 
Justices of the Supreme Court. My fellow Americans. \n", + "\n", + "Last year COVID-19 kept us apart. This year we are finally together again. \n", + "\n", + "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n", + "\n", + "With a duty to one another to the American people to the Constitution. \n", "\n", - "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", + "And with an unwavering resolve that freedom will always triumph over tyranny. \n", "\n", - "And let’s get all Americans the mental health services they need. More people they can turn to for help, and full parity between physical and mental health care. \n", + "Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n", "\n", - "Third, support our veterans. \n", + "He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n", "\n", - "Veterans are the best of us. \n", + "He met the Ukrainian people. \n", "\n", - "I’ve always believed that we have a sacred obligation to equip all those we send to war and care for them and their families when they come home. \n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. [{'id': 1, 'langchain_id': '1', 'page_number': 1, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", "\n", - "Our troops in Iraq and Afghanistan faced many dangers.\n", + "* [SIM=0.000] Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. 
My fellow Americans. \n", "\n", - "Metadata:\n", - "\tid:\t37\n", - "\tpage_number:\t37\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "Last year COVID-19 kept us apart. This year we are finally together again. \n", "\n", - "Score:\t1.5008399487\n", + "Tonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n", "\n", - "Content:\n", - "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "With a duty to one another to the American people to the Constitution. \n", "\n", - "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "And with an unwavering resolve that freedom will always triumph over tyranny. \n", "\n", - "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n", "\n", - "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "He thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n", "\n", - "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n", + "He met the Ukrainian people. 
\n", "\n", - "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "From President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. [{'id': 1, 'langchain_id': '1', 'page_number': 1, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Metadata:\n", - "\tid:\t33\n", - "\tpage_number:\t33\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n" ] } @@ -606,7 +1008,7 @@ " client=vdms_client,\n", " ids=ids,\n", " collection_name=\"my_collection_Flinng_IP\",\n", - " embedding=embedding,\n", + " embedding=embeddings,\n", " engine=\"Flinng\",\n", " distance_strategy=\"IP\",\n", ")\n", @@ -614,12 +1016,13 @@ "k = 3\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs_with_score = db_Flinng.similarity_search_with_score(query, k=k, filter=None)\n", - "print_results(docs_with_score)" + "for res, score in docs_with_score:\n", + " print(f\"* [SIM={score:0.3f}] {res.page_content} [{res.metadata}]\\n\\n\")" ] }, { "cell_type": "markdown", - "id": "a5984766", + "id": "1d42e50a", "metadata": {}, "source": [ "### Similarity Search using TileDBDense and Euclidean Distance\n", @@ -630,38 +1033,31 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "3001ba6e", + "execution_count": 20, + "id": "270ad4d8", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:langchain_community.vectorstores.vdms:Descriptor set my_collection_tiledbD_L2 created\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------------------------------\n", - "\n", - "Score:\t1.2032090425\n", - "\n", - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. 
Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "* [SIM=1.203] Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", "\n", "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", "\n", "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", - "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. [{'id': 32, 'langchain_id': '32', 'page_number': 32, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Score:\t1.4952471256\n", "\n", - "Content:\n", - "\tAs Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. \n", + "* [SIM=1.495] As Frances Haugen, who is here with us tonight, has shown, we must hold social media platforms accountable for the national experiment they’re conducting on our children for profit. 
\n", "\n", "It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children. \n", "\n", @@ -675,19 +1071,10 @@ "\n", "My administration is providing assistance with job training and housing, and now helping lower-income veterans get VA care debt-free. \n", "\n", - "Our troops in Iraq and Afghanistan faced many dangers.\n", - "\n", - "Metadata:\n", - "\tid:\t37\n", - "\tpage_number:\t37\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", + "Our troops in Iraq and Afghanistan faced many dangers. [{'id': 37, 'langchain_id': '37', 'page_number': 37, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Score:\t1.5008399487\n", "\n", - "Content:\n", - "\tA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "* [SIM=1.501] A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", "\n", "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", "\n", @@ -697,14 +1084,8 @@ "\n", "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. 
\n", "\n", - "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders. [{'id': 33, 'langchain_id': '33', 'page_number': 33, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}]\n", "\n", - "Metadata:\n", - "\tid:\t33\n", - "\tpage_number:\t33\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n" ] } @@ -715,7 +1096,7 @@ " client=vdms_client,\n", " ids=ids,\n", " collection_name=\"my_collection_tiledbD_L2\",\n", - " embedding=embedding,\n", + " embedding=embeddings,\n", " engine=\"TileDBDense\",\n", " distance_strategy=\"L2\",\n", ")\n", @@ -723,194 +1104,28 @@ "k = 3\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs_with_score = db_tiledbD.similarity_search_with_score(query, k=k, filter=None)\n", - "print_results(docs_with_score)" - ] - }, - { - "cell_type": "markdown", - "id": "9ed3ec50", - "metadata": {}, - "source": [ - "### Update and Delete\n", - "\n", - "While building toward a real application, you want to go beyond adding data, and also update and delete data.\n", - "\n", - "Here is a basic example showing how to do so. First, we will update the metadata for the document most relevant to the query by adding a date. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "81a02810", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Original metadata: \n", - "\t{'id': '32', 'page_number': 32, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}\n", - "new metadata: \n", - "\t{'id': '32', 'page_number': 32, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt', 'last_date_read': {'_date': '2024-05-01T14:30:00'}}\n", - "--------------------------------------------------\n", - "\n", - "UPDATED ENTRY (id=32):\n", - "\n", - "content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", - "\n", - "id:\n", - "\t32\n", - "\n", - "last_date_read:\n", - "\t2024-05-01T14:30:00+00:00\n", - "\n", - "page_number:\n", - "\t32\n", - "\n", - "president_included:\n", - "\tTrue\n", - "\n", - "source:\n", - "\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n" - ] - } - ], - "source": [ - "from datetime import datetime\n", - "\n", - "doc = db_FaissFlat.similarity_search(query)[0]\n", - "print(f\"Original metadata: \\n\\t{doc.metadata}\")\n", - "\n", - "# Update the metadata for a document by adding last datetime document read\n", - "datetime_str = datetime(2024, 5, 1, 14, 30, 0).isoformat()\n", - "doc.metadata[\"last_date_read\"] = {\"_date\": datetime_str}\n", - "print(f\"new metadata: \\n\\t{doc.metadata}\")\n", - "print(f\"{DELIMITER}\\n\")\n", - "\n", - "# Update document in VDMS\n", - "id_to_update = doc.metadata[\"id\"]\n", - "db_FaissFlat.update_document(collection_name, id_to_update, doc)\n", - "response, response_array = db_FaissFlat.get(\n", - " collection_name,\n", - " constraints={\n", - " \"id\": [\"==\", id_to_update],\n", - " \"last_date_read\": [\">=\", {\"_date\": \"2024-05-01T00:00:00\"}],\n", - " },\n", - ")\n", - "\n", - "# Display Results\n", - "print(f\"UPDATED ENTRY (id={id_to_update}):\")\n", - "print_response([response[0][\"FindDescriptor\"][\"entities\"][0]])" - ] - }, - { - "cell_type": "markdown", - "id": "872a7dff", - "metadata": {}, - "source": [ - "Next we will delete the last document by ID (id=42)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "95537fe8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Documents before deletion: 42\n", - "Documents after deletion (id=42): 41\n" - ] - } - ], - "source": [ - "print(\"Documents before deletion: \", db_FaissFlat.count(collection_name))\n", - "\n", - "id_to_remove = ids[-1]\n", - "db_FaissFlat.delete(collection_name=collection_name, ids=[id_to_remove])\n", - "print(\n", - " f\"Documents after deletion (id={id_to_remove}): {db_FaissFlat.count(collection_name)}\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "18152965", - "metadata": {}, - "source": [ - "## Other Information\n", - "VDMS supports various types of visual data and operations. Some of the capabilities are integrated in the LangChain interface but additional workflow improvements will be added as VDMS is under continuous development.\n", - "\n", - "Addtional capabilities integrated into LangChain are below.\n", - "\n", - "### Similarity search by vector\n", - "Instead of searching by string query, you can also search by embedding/vector." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1db4d6ed", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", - "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tlast_date_read:\t2024-05-01T14:30:00+00:00\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n" - ] - } - ], - "source": [ - "embedding_vector = embedding.embed_query(query)\n", - "returned_docs = db_FaissFlat.similarity_search_by_vector(embedding_vector)\n", - "\n", - "# Print Results\n", - "print_document_details(returned_docs[0])" + "for res, score in docs_with_score:\n", + " print(f\"* [SIM={score:0.3f}] {res.page_content} [{res.metadata}]\\n\\n\")" ] }, { "cell_type": "markdown", - "id": "daf718b2", + "id": "50fffb45", "metadata": {}, "source": [ - "### Filtering on metadata\n", + "## Filtering on metadata\n", "\n", "It can be helpful to narrow down the collection before working with it.\n", "\n", - "For example, collections can be filtered on metadata using the get method. A dictionary is used to filter metadata. Here we retrieve the document where `id = 2` and remove it from the vector store." + "For example, collections can be filtered on metadata using the `get_by_constraints` method. A dictionary is used to filter metadata. Here we retrieve the document where `langchain_id = \"2\"` and remove it from the vector store. \n", + "\n", + "***NOTE:*** `id` was generated as additional metadata as an integer while `langchain_id` (the internal ID) is an unique string for each entry. " ] }, { "cell_type": "code", - "execution_count": 14, - "id": "2bc0313b", + "execution_count": 21, + "id": "af8354f9", "metadata": {}, "outputs": [ { @@ -918,12 +1133,8 @@ "output_type": "stream", "text": [ "Deleted entry:\n", - "\n", - "blob:\n", - "\tTrue\n", - "\n", - "content:\n", - "\tGroups of citizens blocking tanks with their bodies. 
Everyone from students to retirees teachers turned soldiers defending their homeland. \n", + "Content:\n", + "Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \n", "\n", "In this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \n", "\n", @@ -943,56 +1154,41 @@ "\n", "It matters. American diplomacy matters. American resolve matters.\n", "\n", - "id:\n", - "\t2\n", - "\n", - "page_number:\n", - "\t2\n", - "\n", - "president_included:\n", - "\tTrue\n", + "Metadata:\n", + "{'id': 2, 'page_number': 2, 'president_included': True, 'source': '../../how_to/state_of_the_union.txt'}\n", "\n", - "source:\n", - "\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", "\n" ] } ], "source": [ - "response, response_array = db_FaissFlat.get(\n", - " collection_name,\n", + "results = db_FaissIVFFlat.get_by_constraints(\n", + " db_FaissIVFFlat.collection_name,\n", " limit=1,\n", " include=[\"metadata\", \"embeddings\"],\n", - " constraints={\"id\": [\"==\", \"2\"]},\n", + " constraints={\"id\": [\"==\", 2]},\n", ")\n", "\n", "# Delete id=2\n", - "db_FaissFlat.delete(collection_name=collection_name, ids=[\"2\"])\n", + "db_FaissIVFFlat.delete(collection_name=db_FaissIVFFlat.collection_name, ids=[2])\n", "\n", "print(\"Deleted entry:\")\n", - "print_response([response[0][\"FindDescriptor\"][\"entities\"][0]])" + "for doc in results:\n", + " print(f\"Content:\\n{doc.page_content}\\n\\nMetadata:\\n{doc.metadata}\\n\\n\")" ] }, { "cell_type": "markdown", - "id": "794a7552", + "id": "6c99c0c2", "metadata": {}, "source": [ - "### Retriever options\n", - "\n", - "This section goes over different options for how to use VDMS as a retriever.\n", - "\n", - "\n", - "#### Simiarity Search\n", - "\n", - "Here we use similarity search in the 
retriever object.\n" + "Here we use `id` to filter for a range of IDs since it is an integer." ] }, { "cell_type": "code", - "execution_count": 15, - "id": "120f55eb", + "execution_count": 22, + "id": "912e227d", "metadata": {}, "outputs": [ { @@ -1000,234 +1196,91 @@ "output_type": "stream", "text": [ "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "Putin’s latest attack on Ukraine was premeditated and unprovoked. \n", "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "He rejected repeated efforts at diplomacy. \n", "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "He thought the West and NATO wouldn’t respond. And he thought he could divide us at home. Putin was wrong. We were ready. Here is what we did. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "We prepared extensively and carefully. 
\n", "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tlast_date_read:\t2024-05-01T14:30:00+00:00\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n" - ] - } - ], - "source": [ - "retriever = db_FaissFlat.as_retriever()\n", - "relevant_docs = retriever.invoke(query)[0]\n", - "\n", - "print_document_details(relevant_docs)" - ] - }, - { - "cell_type": "markdown", - "id": "e8c0fb24", - "metadata": {}, - "source": [ - "#### Maximal Marginal Relevance Search (MMR)\n", - "\n", - "In addition to using similarity search in the retriever object, you can also use `mmr`." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "f00be6d0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "We spent months building a coalition of other freedom-loving nations from Europe and the Americas to Asia and Africa to confront Putin. \n", "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "I spent countless hours unifying our European allies. We shared with the world in advance what we knew Putin was planning and precisely how he would try to falsely justify his aggression. \n", "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "We countered Russia’s lies with truth. \n", + "\n", + "And now that he has acted the free world is holding him accountable. 
\n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "Along with twenty-seven members of the European Union including France, Germany, Italy, as well as countries like the United Kingdom, Canada, Japan, Korea, Australia, New Zealand, and many others, even Switzerland.\n", "\n", "Metadata:\n", - "\tid:\t32\n", - "\tlast_date_read:\t2024-05-01T14:30:00+00:00\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n" + "{'id': 3, 'page_number': 3, 'president_included': False, 'source': '../../how_to/state_of_the_union.txt'}\n", + "\n", + "\n" ] } ], "source": [ - "retriever = db_FaissFlat.as_retriever(search_type=\"mmr\")\n", - "relevant_docs = retriever.invoke(query)[0]\n", + "results = db_FaissIVFFlat.get_by_constraints(\n", + " db_FaissIVFFlat.collection_name,\n", + " limit=1,\n", + " include=[\"metadata\", \"embeddings\"],\n", + " constraints={\"id\": [\">\", 1, \"<=\", 3]},\n", + ")\n", "\n", - "print_document_details(relevant_docs)" - ] - }, - { - "cell_type": "markdown", - "id": "ffadbafc", - "metadata": {}, - "source": [ - "We can also use MMR directly." + "for doc in results:\n", + " print(f\"Content:\\n{doc.page_content}\\n\\nMetadata:\\n{doc.metadata}\\n\\n\")" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "ab911470", + "execution_count": 23, + "id": "c2104bdd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--------------------------------------------------\n", - "\n", - "Score:\t1.2032091618\n", - "\n", - "Content:\n", - "\tTonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", - "\n", - "Metadata:\n", - "\tid:\t32\n", - "\tlast_date_read:\t2024-05-01T14:30:00+00:00\n", - "\tpage_number:\t32\n", - "\tpresident_included:\tTrue\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n", - "Score:\t1.50705266\n", - "\n", - "Content:\n", - "\tBut cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n", - "\n", - "Danielle says Heath was a fighter to the very end. \n", - "\n", - "He didn’t know how to stop fighting, and neither did she. \n", - "\n", - "Through her pain she found purpose to demand we do better. \n", - "\n", - "Tonight, Danielle—we are. \n", - "\n", - "The VA is pioneering new ways of linking toxic exposures to diseases, already helping more veterans get benefits. \n", - "\n", - "And tonight, I’m announcing we’re expanding eligibility to veterans suffering from nine respiratory cancers. \n", - "\n", - "I’m also calling on Congress: pass a law to make sure veterans devastated by toxic exposures in Iraq and Afghanistan finally get the benefits and comprehensive health care they deserve. \n", - "\n", - "And fourth, let’s end cancer as we know it. \n", - "\n", - "This is personal to me and Jill, to Kamala, and to so many of you. 
\n", - "\n", - "Cancer is the #2 cause of death in America–second only to heart disease.\n", - "\n", - "Metadata:\n", - "\tid:\t39\n", - "\tpage_number:\t39\n", - "\tpresident_included:\tFalse\n", - "\tsource:\t../../how_to/state_of_the_union.txt\n", - "--------------------------------------------------\n", - "\n" + "vdms_vs_test_nb\n" ] } ], "source": [ - "mmr_resp = db_FaissFlat.max_marginal_relevance_search_with_score(query, k=2, fetch_k=10)\n", - "print_results(mmr_resp)" + "!docker kill vdms_vs_test_nb" ] }, { "cell_type": "markdown", - "id": "190bc4b5", - "metadata": {}, - "source": [ - "### Delete collection\n", - "Previously, we removed documents based on its `id`. Here, all documents are removed since no ID is provided." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "874e7af9", + "id": "a2b7b73c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Documents before deletion: 40\n", - "Documents after deletion: 0\n" - ] - } - ], "source": [ - "print(\"Documents before deletion: \", db_FaissFlat.count(collection_name))\n", + "## Usage for retrieval-augmented generation\n", "\n", - "db_FaissFlat.delete(collection_name=collection_name)\n", + "For guides on how to use this vector store for retrieval-augmented generation (RAG), see the following sections:\n", "\n", - "print(\"Documents after deletion: \", db_FaissFlat.count(collection_name))" + "- [Multi-modal RAG using VDMS](https://github.com/langchain-ai/langchain/blob/master/cookbook/multi_modal_RAG_vdms.ipynb)\n", + "- [Visual RAG using VDMS](https://github.com/langchain-ai/langchain/blob/master/cookbook/visual_RAG_vdms.ipynb)\n", + "- [Tutorials: working with external knowledge](https://python.langchain.com/docs/tutorials/#working-with-external-knowledge)\n", + "- [How-to: Question and answer with RAG](https://python.langchain.com/docs/how_to/#qa-with-rag)\n", + "- [Retrieval conceptual 
docs](https://python.langchain.com/docs/concepts/retrieval)" ] }, { "cell_type": "markdown", - "id": "68b7a400", - "metadata": {}, - "source": [ - "## Stop VDMS Server" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "08931796", + "id": "fed28359", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "vdms_vs_test_nb\n" - ] - } - ], "source": [ - "!docker kill vdms_vs_test_nb" + "## API reference\n", + "\n", + "For detailed documentation of all `VDMS` vector store features and configurations head to the API reference: https://python.langchain.com/api_reference/community/vectorstores/langchain_community.vectorstores.vdms.VDMS.html" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a60725a6", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": ".langchain-venv", "language": "python", "name": "python3" }, @@ -1241,7 +1294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/libs/community/langchain_community/vectorstores/vdms.py b/libs/community/langchain_community/vectorstores/vdms.py index ed50d01462623..67f234f2b2ad0 100644 --- a/libs/community/langchain_community/vectorstores/vdms.py +++ b/libs/community/langchain_community/vectorstores/vdms.py @@ -3,17 +3,17 @@ import base64 import logging import os +import time import uuid from copy import deepcopy from typing 
import ( TYPE_CHECKING, Any, Callable, - Dict, Iterable, - List, Literal, Optional, + Sequence, Sized, Tuple, Type, @@ -25,94 +25,47 @@ from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore +from typing_extensions import override from langchain_community.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: import vdms +TEXT_PROPERTY = "content" # Property name for the text +DEFAULT_COLLECTION_NAME = "langchain" +LANGCHAIN_ID_PROPERTY = "langchain_id" # Property name for the unique id +DEFAULT_INSERT_BATCH_SIZE = 500 +DEFAULT_K = 3 # Number of Documents to return. +DEFAULT_FETCH_K = ( + DEFAULT_K * 5 +) # Number of Documents to fetch to pass to knn when filters applied. +INVALID_METADATA_VALUE = ["Missing property", None, {}] # type: list +DEFAULT_PROPERTIES = ["_distance", LANGCHAIN_ID_PROPERTY, TEXT_PROPERTY] +INVALID_DOC_METADATA_KEYS = ["_distance", TEXT_PROPERTY, "blob"] DISTANCE_METRICS = Literal[ "L2", # Euclidean Distance "IP", # Inner Product ] -AVAILABLE_DISTANCE_METRICS: List[DISTANCE_METRICS] = list(get_args(DISTANCE_METRICS)) +AVAILABLE_DISTANCE_METRICS: list[DISTANCE_METRICS] = list(get_args(DISTANCE_METRICS)) ENGINES = Literal[ - "TileDBDense", # TileDB Dense - "TileDBSparse", # TileDB Sparse "FaissFlat", # FAISS IndexFlat + "FaissHNSWFlat", # FAISS IndexHNSWFlat "FaissIVFFlat", # FAISS IndexIVFFlat "Flinng", # FLINNG + "TileDBDense", # TileDB Dense + "TileDBSparse", # TileDB Sparse ] -AVAILABLE_ENGINES: List[ENGINES] = list(get_args(ENGINES)) -DEFAULT_COLLECTION_NAME = "langchain" -DEFAULT_INSERT_BATCH_SIZE = 32 -# Number of Documents to return. -DEFAULT_K = 3 -# Number of Documents to fetch to pass to knn when filters applied. 
-DEFAULT_FETCH_K = DEFAULT_K * 5 -DEFAULT_PROPERTIES = ["_distance", "id", "content"] -INVALID_DOC_METADATA_KEYS = ["_distance", "content", "blob"] -INVALID_METADATA_VALUE = ["Missing property", None, {}] # type: List - +AVAILABLE_ENGINES: list[ENGINES] = list(get_args(ENGINES)) logger = logging.getLogger(__name__) -def _len_check_if_sized(x: Any, y: Any, x_name: str, y_name: str) -> None: - """ - Check that sizes of two variables are the same - - Args: - x: Variable to compare - y: Variable to compare - x_name: Name for variable x - y_name: Name for variable y - """ - if isinstance(x, Sized) and isinstance(y, Sized) and len(x) != len(y): - raise ValueError( - f"{x_name} and {y_name} expected to be equal length but " - f"len({x_name})={len(x)} and len({y_name})={len(y)}" - ) - return - - -def _results_to_docs(results: Any) -> List[Document]: - return [doc for doc, _ in _results_to_docs_and_scores(results)] - - -def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: - final_res: List[Any] = [] - try: - responses, blobs = results[0] - if ( - len(responses) > 0 - and "FindDescriptor" in responses[0] - and "entities" in responses[0]["FindDescriptor"] - ): - result_entities = responses[0]["FindDescriptor"]["entities"] - # result_blobs = blobs - for ent in result_entities: - distance = round(ent["_distance"], 10) - txt_contents = ent["content"] - for p in INVALID_DOC_METADATA_KEYS: - if p in ent: - del ent[p] - props = { - mkey: mval - for mkey, mval in ent.items() - if mval not in INVALID_METADATA_VALUE - } - - final_res.append( - (Document(page_content=txt_contents, metadata=props), distance) - ) - except Exception as e: - logger.warning(f"No results returned. Error while parsing results: {e}") - return final_res - - -def VDMS_Client(host: str = "localhost", port: int = 55555) -> vdms.vdms: +def VDMS_Client( + host: str = "localhost", + port: int = 55555, +) -> vdms.vdms: """VDMS client for the VDMS server. 
Args: @@ -133,27 +86,32 @@ def VDMS_Client(host: str = "localhost", port: int = 55555) -> vdms.vdms: class VDMS(VectorStore): - """Intel Lab's VDMS for vector-store workloads. + """Intel Lab's VDMS for vectorstore workloads. To use, you should have both: - the ``vdms`` python package installed - a host (str) and port (int) associated with a deployed VDMS Server - Visit https://github.com/IntelLabs/vdms/wiki more information. + A single VDMS instance can support numerous vector stores and use + different distance metrics and engines. The vectorstores must have a + unique collection_name (DescriptorSet). + + Visit https://github.com/IntelLabs/vdms/wiki for more information. IT IS HIGHLY SUGGESTED TO NORMALIZE YOUR DATA. Args: client: VDMS Client used to connect to VDMS server + embedding: Any embedding function implementing + `langchain_core.embeddings.Embeddings` interface. + embedding_dimensions: Dimensions of embedding function collection_name: Name of data collection [Default: langchain] - distance_strategy: Method used to calculate distances. VDMS supports - "L2" (euclidean distance) or "IP" (inner product) [Default: L2] engine: Underlying implementation for indexing and computing distances. VDMS supports TileDBDense, TileDBSparse, FaissFlat, FaissIVFFlat, - and Flinng [Default: FaissFlat] - embedding: Any embedding function implementing - `langchain_core.embeddings.Embeddings` interface. - relevance_score_fn: Function for obtaining relevance score + FaissHNSWFlat, and Flinng [Default: FaissFlat] + distance_strategy: Method used to calculate distances. VDMS supports + "L2" (euclidean distance) or "IP" (inner product) [Default: L2] + log_level (int, optional): Logging level. Defaults to logging.WARNING. Example: .. 
code-block:: python @@ -171,338 +129,200 @@ class VDMS(VectorStore): ) """ + @override def __init__( self, client: vdms.vdms, *, embedding: Optional[Embeddings] = None, - collection_name: str = DEFAULT_COLLECTION_NAME, # DescriptorSet name - distance_strategy: DISTANCE_METRICS = "L2", + embedding_dimensions: Optional[int] = None, + collection_name: str = DEFAULT_COLLECTION_NAME, engine: ENGINES = "FaissFlat", + distance_strategy: DISTANCE_METRICS = "L2", relevance_score_fn: Optional[Callable[[float], float]] = None, - embedding_dimensions: Optional[int] = None, + **kwargs: Any, ) -> None: + self.collection_name = collection_name + # Check required parameters self._client = client self.similarity_search_engine = engine self.distance_strategy = distance_strategy self.embedding = embedding - self._check_required_inputs(collection_name, embedding_dimensions) - - # Update other parameters + self.utils = VDMS_Utils(client) + self._check_required_inputs(collection_name, embedding_dimensions, **kwargs) + self.updated_properties_flag = False self.override_relevance_score_fn = relevance_score_fn + self._add_set() - # Initialize collection - self._collection_name = self.add_set( - collection_name, - engine=self.similarity_search_engine, - metric=self.distance_strategy, - ) - - @property - def embeddings(self) -> Optional[Embeddings]: - return self.embedding - - def _embed_documents(self, texts: List[str]) -> List[List[float]]: - if isinstance(self.embedding, Embeddings): - return self.embedding.embed_documents(texts) - else: - p_str = "Must provide `embedding` which is expected" - p_str += " to be an Embeddings object" - raise ValueError(p_str) + """ FUNCTIONS TO OVERRIDE VectorStore METHODS """ - def _embed_video(self, paths: List[str], **kwargs: Any) -> List[List[float]]: - if self.embedding is not None and hasattr(self.embedding, "embed_video"): - return self.embedding.embed_video(paths=paths, **kwargs) - else: - raise ValueError( - "Must provide `embedding` which has 
attribute `embed_video`" - ) + @override + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[list[dict]] = None, + *, + ids: Optional[list[str]] = None, + **kwargs: Any, + ) -> list[str]: + """Run texts through the embeddings and add to the vectorstore. - def _embed_image(self, uris: List[str]) -> List[List[float]]: - if self.embedding is not None and hasattr(self.embedding, "embed_image"): - return self.embedding.embed_image(uris=uris) - else: - raise ValueError( - "Must provide `embedding` which has attribute `embed_image`" - ) + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of IDs associated with the texts. + **kwargs: vectorstore specific parameters. + One of the kwargs should be `ids` which is a list of ids + associated with the texts. - def _embed_query(self, text: str) -> List[float]: - if isinstance(self.embedding, Embeddings): - return self.embedding.embed_query(text) - else: - raise ValueError( - "Must provide `embedding` which is expected" - " to be an Embeddings object" - ) + Returns: + List of ids from adding the texts into the vectorstore. - def _select_relevance_score_fn(self) -> Callable[[float], float]: - """ - The 'correct' relevance function - may differ depending on a few things, including: - - the distance / similarity metric used by the VectorStore - - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - - embedding dimensionality - - etc. + Raises: + ValueError: If the number of metadatas does not match the number of texts. + ValueError: If the number of ids does not match the number of texts. 
""" - if self.override_relevance_score_fn is not None: - return self.override_relevance_score_fn - # Default strategy is to rely on distance strategy provided - # in vectorstore constructor - if self.distance_strategy.lower() in ["ip", "l2"]: - return lambda x: x - else: - raise ValueError( - "No supported normalization function" - f" for distance_strategy of {self.distance_strategy}." - "Consider providing relevance_score_fn to VDMS constructor." - ) + texts_ = list(texts) - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - filter: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and their similarity scores on a scale from 0 to 1.""" - if self.override_relevance_score_fn is None: - kwargs["normalize_distance"] = True - docs_and_scores = self.similarity_search_with_score( - query=query, - k=k, - fetch_k=fetch_k, - filter=filter, + embeddings = self._embed_documents(texts_) + + inserted_ids = self.add_data( + texts=texts_, + embeddings=embeddings, + metadatas=metadatas, + ids=ids, **kwargs, ) - docs_and_rel_scores: List[Any] = [] - for doc, score in docs_and_scores: - if self.override_relevance_score_fn is None: - docs_and_rel_scores.append((doc, score)) - else: - docs_and_rel_scores.append( - (doc, self.override_relevance_score_fn(score)) - ) - return docs_and_rel_scores + return inserted_ids + + @property + def embeddings(self) -> Optional[Embeddings]: + return self.embedding - def add( + def batch_delete( self, collection_name: str, - texts: List[str], - embeddings: List[List[float]], - metadatas: Optional[Union[List[None], List[Dict[str, Any]]]] = None, - ids: Optional[List[str]] = None, - ) -> List: - _len_check_if_sized(texts, embeddings, "texts", "embeddings") + constraints: dict, + ids: Optional[list[str]] = None, + batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + ) -> list: + resp_dict: dict = {} + resp_dict.setdefault( + 
"FindDescriptor", {"entities": list(), "returned": 0, "status": 0} + ) + new_response = [ + # {"FindDescriptor": {"returned": 0, "status": 0, "entities": []}} + resp_dict + ] - metadatas = metadatas if metadatas is not None else [None for _ in texts] - _len_check_if_sized(texts, metadatas, "texts", "metadatas") + if ids is None: + return new_response - ids = ids if ids is not None else [str(uuid.uuid4()) for _ in texts] - _len_check_if_sized(texts, ids, "texts", "ids") + for start_idx in range(0, len(ids), batch_size): + end_idx = min(start_idx + batch_size - 1, len(ids)) + batch_ids = ids[start_idx:end_idx] - all_queries: List[Any] = [] - all_blobs: List[Any] = [] - inserted_ids: List[Any] = [] - for meta, emb, doc, id in zip(metadatas, embeddings, texts, ids): - query, blob = self.__get_add_query( - collection_name, metadata=meta, embedding=emb, document=doc, id=id - ) + all_queries = [] + for i in batch_ids: + tmp_ = {LANGCHAIN_ID_PROPERTY: ["==", i]} + if constraints is not None: + tmp_.update(constraints) + + query = self.utils.add_descriptor( + "FindDescriptor", + collection_name, + label=None, + ref=None, + props=None, + link=None, + k_neighbors=None, + constraints=tmp_, + results=None, + ) - if blob is not None: all_queries.append(query) - all_blobs.append(blob) - inserted_ids.append(id) - response, response_array = self.__run_vdms_query(all_queries, all_blobs) + if all_queries == []: + return new_response - return inserted_ids + response, _ = self.utils.run_vdms_query(all_queries) - def add_set( - self, - collection_name: str, - engine: ENGINES = "FaissFlat", - metric: DISTANCE_METRICS = "L2", - ) -> str: - query = _add_descriptorset( - "AddDescriptorSet", - collection_name, - self.embedding_dimension, - engine=getattr(engine, "value", engine), - metric=getattr(metric, "value", metric), - ) + for res in response: + if "FindDescriptor" in res: + new_response[0]["FindDescriptor"]["entities"].extend( + res["FindDescriptor"]["entities"] + ) - response, _ = 
self.__run_vdms_query([query]) + new_response[0]["FindDescriptor"]["returned"] = len( + new_response[0]["FindDescriptor"]["entities"] + ) + return new_response - if "FailedCommand" in response[0]: - raise ValueError(f"Failed to add collection {collection_name}") + @override + def delete(self, ids: Optional[list[str]] = None, **kwargs: Any) -> Optional[bool]: + """Delete by vector ID or other criteria. - return collection_name + Args: + ids: List of ids to delete. If None, delete all. Default is None. + **kwargs: Other keyword arguments that subclasses might use. - def __delete( - self, - collection_name: str, - ids: Union[None, List[str]] = None, - constraints: Union[None, Dict[str, Any]] = None, - ) -> bool: - """ - Deletes entire collection if id is not provided + Returns: + Optional[bool]: True if deletion is successful, False otherwise """ - all_queries: List[Any] = [] - all_blobs: List[Any] = [] - - collection_properties = self.__get_properties(collection_name) - results = {"list": collection_properties} - if constraints is None: - constraints = {"_deletion": ["==", 1]} + if "collection_name" in kwargs: + collection_name = kwargs.pop("collection_name") else: - constraints["_deletion"] = ["==", 1] + collection_name = self.collection_name - if ids is not None: - constraints["id"] = ["==", ids[0]] # if len(ids) > 1 else ids[0]] + if "constraints" in kwargs and isinstance(kwargs["constraints"], dict): + constraints = kwargs.pop("constraints") + constraints["_deletion"] = ["==", 1] + else: + constraints = {"_deletion": ["==", 1]} - query = _add_descriptor( - "FindDescriptor", + response = self.batch_delete( collection_name, - label=None, - ref=None, - props=None, - link=None, - k_neighbors=None, - constraints=constraints, - results=results, + constraints, + ids, + batch_size=kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE), ) - all_queries.append(query) - response, response_array = self.__run_vdms_query(all_queries, all_blobs) - # Update/store indices after 
deletion - query = _add_descriptorset( + query = self.utils.add_descriptor_set( "FindDescriptorSet", collection_name, storeIndex=True ) - responseSet, _ = self.__run_vdms_query([query], all_blobs) - return "FindDescriptor" in response[0] - - def __get_add_query( - self, - collection_name: str, - metadata: Optional[Any] = None, - embedding: Union[List[float], None] = None, - document: Optional[Any] = None, - id: Optional[str] = None, - ) -> Tuple[Dict[str, Dict[str, Any]], Union[bytes, None]]: - if id is None: - props: Dict[str, Any] = {} - else: - props = {"id": id} - id_exists, query = _check_descriptor_exists_by_id( - self._client, collection_name, id - ) - if id_exists: - skipped_value = { - prop_key: prop_val[-1] - for prop_key, prop_val in query["FindDescriptor"][ - "constraints" - ].items() - } - pstr = f"[!] Embedding with id ({id}) exists in DB;" - pstr += "Therefore, skipped and not inserted" - print(pstr) # noqa: T201 - print(f"\tSkipped values are: {skipped_value}") # noqa: T201 - return query, None - - if metadata: - props.update(metadata) - if document not in [None, ""]: - props["content"] = document - - for k in props.keys(): - if k not in self.collection_properties: - self.collection_properties.append(k) - - query = _add_descriptor( - "AddDescriptor", - collection_name, - label=None, - ref=None, - props=props, - link=None, - k_neighbors=None, - constraints=None, - results=None, - ) - - blob = embedding2bytes(embedding) - - return ( - query, - blob, - ) + _, _ = self.utils.run_vdms_query([query]) - def __get_properties( - self, - collection_name: str, - unique_entity: Optional[bool] = False, - deletion: Optional[bool] = False, - ) -> List[str]: - find_query = _find_property_entity( - collection_name, unique_entity=unique_entity, deletion=deletion - ) - response, response_blob = self.__run_vdms_query([find_query]) - if len(response_blob) > 0: - collection_properties = _bytes2str(response_blob[0]).split(",") - else: - collection_properties = 
deepcopy(DEFAULT_PROPERTIES) - return collection_properties + return "FindDescriptor" in response[0] - def __run_vdms_query( - self, - all_queries: List[Dict], - all_blobs: Optional[List] = [], - print_last_response: Optional[bool] = False, - ) -> Tuple[Any, Any]: - response, response_array = self._client.query(all_queries, all_blobs) + @override + def get_by_ids(self, ids: Sequence[str], /) -> list[Document]: + """Get documents by their IDs. - _ = _check_valid_response(all_queries, response) - if print_last_response: - self._client.print_last_response() - return response, response_array + Args: + ids: List of ids to retrieve. - def __update( - self, - collection_name: str, - ids: List[str], - documents: List[str], - embeddings: List[List[float]], - metadatas: Optional[Union[List[None], List[Dict[str, Any]]]] = None, - ) -> None: - """ - Updates (find, delete, add) a collection based on id. - If more than one collection returned with id, error occuers + Returns: + documents: List of Document objects found in the vectorstore. 
""" - _len_check_if_sized(ids, documents, "ids", "documents") - - _len_check_if_sized(ids, embeddings, "ids", "embeddings") - - metadatas = metadatas if metadatas is not None else [None for _ in ids] - _len_check_if_sized(ids, metadatas, "ids", "metadatas") - - orig_props = self.__get_properties(collection_name) - - updated_ids: List[Any] = [] - for meta, emb, doc, id in zip(metadatas, embeddings, documents, ids): - results = {"list": self.collection_properties} - constraints = {"_deletion": ["==", 1]} + collection_name = self.collection_name + all_constraints = [] + for id in ids: + constraints = { + LANGCHAIN_ID_PROPERTY: ["==", str(id)], + } + all_constraints.append(constraints) - if id is not None: - constraints["id"] = ["==", id] + results = {"list": self.utils.get_properties(collection_name)} - query = _add_descriptor( + docs = [] + for constraint in all_constraints: + query = self.utils.add_descriptor( "FindDescriptor", collection_name, label=None, @@ -510,247 +330,463 @@ def __update( props=None, link=None, k_neighbors=None, - constraints=constraints, + constraints=constraint, results=results, ) - response, response_array = self.__run_vdms_query([query]) + response, _ = self.utils.run_vdms_query([query]) - query, blob = self.__get_add_query( - collection_name, - metadata=meta, - embedding=emb, - document=doc, - id=id, - ) - if blob is not None: - response, response_array = self.__run_vdms_query([query], [blob]) - updated_ids.append(id) + if "FindDescriptor" in response[0]: + this_docs = [ + self.descriptor2document(doc) + for doc in response[0]["FindDescriptor"].get("entities", []) + ] + docs.extend(this_docs) + return docs - self.__update_properties( - collection_name, orig_props, self.collection_properties - ) + @override + def add_documents(self, documents: list[Document], **kwargs: Any) -> list[str]: + """Add or update documents in the vectorstore. 
- def __update_properties( - self, - collection_name: str, - current_collection_properties: List, - new_collection_properties: Optional[List], - ) -> None: - if new_collection_properties is not None: - old_collection_properties = deepcopy(current_collection_properties) - for prop in new_collection_properties: - if prop not in current_collection_properties: - current_collection_properties.append(prop) - - if current_collection_properties != old_collection_properties: - all_queries, blob_arr = _build_property_query( - collection_name, - command_type="update", - all_properties=current_collection_properties, + Args: + documents: Documents to add to the vectorstore. + kwargs: Additional keyword arguments. + - if kwargs contains ids and documents contain ids, + the ids in the kwargs will receive precedence. + - "delete_existing" will delete matching ids prior to adding + new document with same id (True); else add with duplicate + id (False) [Default: True] + + Returns: + List of IDs of the added texts. + + Raises: + ValueError: If the number of ids does not match the number of documents. + """ + # GET IDS & FORMAT DOCUMENTS + delete_existing: bool = kwargs.pop("delete_existing", True) + ids = None + if "ids" in kwargs: + # Get IDs + ids = kwargs.pop("ids") + if ids and len(ids) != len(documents): + raise ValueError( + "The number of ids must match the number of documents. " + "Got {len(ids)} ids and {len(documents)} documents." 
) - response, _ = self.__run_vdms_query(all_queries, [blob_arr]) - def add_images( + # Get Documents + documents_ = [] + for id_, document in zip(ids, documents): + doc_with_id = Document( + page_content=document.page_content, + metadata=document.metadata, + id=id_, + ) + documents_.append(doc_with_id) + else: + # Get Documents + documents_ = documents + + if ids is None: + ids = [] + for doc in documents_: + if hasattr(doc, "id") and doc.id is not None: + ids.append(str(doc.id)) + elif "id" in doc.metadata: + ids.append(str(doc.metadata["id"])) + else: + ids.append(str(uuid.uuid4())) + + # Remove IDs if exist-TEST_REMOVAL + remove_ids = [doc.id for doc in self.get_by_ids(ids) if doc.id] + if len(remove_ids) > 0: + if delete_existing: + self.delete(ids=remove_ids, **kwargs) + remove_ids = [] + else: + pstr = "[!] Embeddings skipped for following ids because " + pstr += f"already exists: {remove_ids}\nCan retry with " + pstr += "'delete_existing' set to True" + logger.info(pstr) + valid_ids = [] + texts = [] + metadatas = [] + for id, doc in zip(ids, documents_): + if id not in remove_ids: + valid_ids.append(id) + texts.append(doc.page_content) + metadatas.append(doc.metadata) + + kwargs["ids"] = valid_ids + return self.add_texts(texts, metadatas, **kwargs) + + @override + def similarity_search( self, - uris: List[str], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - batch_size: int = DEFAULT_INSERT_BATCH_SIZE, - add_path: Optional[bool] = True, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[dict[str, list]] = None, **kwargs: Any, - ) -> List[str]: - """Run more images through the embeddings and add to the vectorstore. - - Images are added as embeddings (AddDescriptor) instead of separate - entity (AddImage) within VDMS to leverage similarity search capability + ) -> list[Document]: + """Return docs most similar to query. Args: - uris: List of paths to the images to add to the vectorstore. 
- metadatas: Optional list of metadatas associated with the images. - ids: Optional list of unique IDs. - batch_size (int): Number of concurrent requests to send to the server. - add_path: Bool to add image path as metadata + query: Query string to search for. + k: Number of Documents to return. + fetch_k: Number of candidates to fetch for knn (>= k). + filter: Filter by metadata. Defaults to None. + **kwargs: Arguments to pass to the search method. Returns: - List of ids from adding images into the vectorstore. + List of Documents most similar to the query. """ - # Map from uris to blobs to base64 - b64_texts = [self.encode_image(image_path=uri) for uri in uris] + assert self.embedding is not None, "Embedding function is not set" + query_embedding = self.get_embedding_from_query(query) + return self.similarity_search_by_vector( + query_embedding, k, fetch_k=fetch_k, filter=filter, **kwargs + ) - if add_path and metadatas: - for midx, uri in enumerate(uris): - metadatas[midx]["image_path"] = uri - elif add_path: - metadatas = [] - for uri in uris: - metadatas.append({"image_path": uri}) + @override + def similarity_search_with_score( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[dict[str, list]] = None, + **kwargs: Any, + ) -> list[tuple[Document, float]]: + query_embedding = self.get_embedding_from_query(query) + results = self.query_by_embeddings( + query_embeddings=[query_embedding], + k=k, + fetch_k=fetch_k, + filter=filter, + **kwargs, + ) - # Populate IDs - ids = ids if ids is not None else [str(uuid.uuid4()) for _ in uris] + return self.results2docs_and_scores(results) - # Set embeddings - embeddings = self._embed_image(uris=uris) + @override + def similarity_search_by_vector( + self, + embedding: list[float], + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[dict[str, list]] = None, + **kwargs: Any, + ) -> list[Document]: + """Return docs most similar to embedding vector. 
- if metadatas is None: - metadatas = [{} for _ in uris] - else: - metadatas = [_validate_vdms_properties(m) for m in metadatas] + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. + fetch_k: Number of candidates to fetch for knn (>= k). + filter: Filter by metadata. Defaults to None. + **kwargs: Arguments to pass to the search method. - self.add_from( - texts=b64_texts, - embeddings=embeddings, - ids=ids, - metadatas=metadatas, - batch_size=batch_size, + Returns: + List of Documents most similar to the query vector. + """ + start_time = time.time() + results = self.query_by_embeddings( + query_embeddings=[embedding], + k=k, + fetch_k=fetch_k, + filter=filter, **kwargs, ) - return ids + logger.info( + f"VDMS similarity search took {time.time() - start_time:0.4f} seconds" + ) - def add_videos( + final_docs = [] + for this_result in results: + resp, resp_arr = this_result + try: + descriptor = resp[0]["FindDescriptor"].get("entities", []) + except ValueError: + descriptor = [] + if isinstance(descriptor, dict): + final_docs.append(self.descriptor2document(descriptor)) + elif isinstance(descriptor, list): + for desc in descriptor: + final_docs.append(self.descriptor2document(desc)) + else: + pass + return final_docs + + def similarity_search_with_relevance_scores( self, - paths: List[str], - texts: Optional[List[str]] = None, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - batch_size: int = 1, - add_path: Optional[bool] = True, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + filter: Optional[dict[str, Any]] = None, **kwargs: Any, - ) -> List[str]: - """Run videos through the embeddings and add to the vectorstore. 
+ ) -> list[Tuple[Document, float]]: + """Return docs and their similarity scores on a scale from 0 to 1.""" + if self.override_relevance_score_fn is not None: + kwargs["normalize_distance"] = False + # else: + # kwargs["normalize_distance"] = True - Videos are added as embeddings (AddDescriptor) instead of separate - entity (AddVideo) within VDMS to leverage similarity search capability + docs_and_scores = self.similarity_search_with_score( + query=query, + k=k, + fetch_k=fetch_k, + filter=filter, + **kwargs, + ) + + docs_and_rel_scores: list[Any] = [] + for doc, score in docs_and_scores: + if self.override_relevance_score_fn is None: + docs_and_rel_scores.append((doc, score)) + else: + docs_and_rel_scores.append( + (doc, self.override_relevance_score_fn(score)) + ) + return docs_and_rel_scores + + @override + def max_marginal_relevance_search( + self, + query: str, + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[dict[str, list]] = None, + **kwargs: Any, + ) -> list[Document]: + """Returns similar documents to the query that also have diversity + + This algorithm balances relevance and diversity in the search results. Args: - paths: List of paths to the videos to add to the vectorstore. - metadatas: Optional list of text associated with the videos. - metadatas: Optional list of metadatas associated with the videos. - ids: Optional list of unique IDs. - batch_size (int): Number of concurrent requests to send to the server. - add_path: Bool to add video path as metadata + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + **kwargs: Arguments to pass to the search method. 
Returns: - List of ids from adding videos into the vectorstore. + List of Document objects ordered by decreasing similarity/diversty. """ - if texts is None: - texts = ["" for _ in paths] + query_embedding = self.get_embedding_from_query(query) + return self.max_marginal_relevance_search_by_vector( + query_embedding, k, fetch_k, lambda_mult, filter, **kwargs + ) - if add_path and metadatas: - for midx, path in enumerate(paths): - metadatas[midx]["video_path"] = path - elif add_path: - metadatas = [] - for path in paths: - metadatas.append({"video_path": path}) + @override + def max_marginal_relevance_search_by_vector( + self, + embedding: list[float], + k: int = DEFAULT_K, + fetch_k: int = DEFAULT_FETCH_K, + lambda_mult: float = 0.5, + filter: Optional[dict[str, list]] = None, + **kwargs: Any, + ) -> list[Document]: + """Return docs selected using the maximal marginal relevance. - # Populate IDs - ids = ids if ids is not None else [str(uuid.uuid4()) for _ in paths] + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. - # Set embeddings - embeddings = self._embed_video(paths=paths, **kwargs) + Args: + embedding: Embedding vector to search for. + k: Number of Documents to return. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[dict[str, str]]): Filter by metadata. Defaults to None. - if metadatas is None: - metadatas = [{} for _ in paths] + Returns: + List of Documents selected by maximal marginal relevance. 
+ """ + start_time = time.time() + results = self.query_by_embeddings( + query_embeddings=[embedding], + k=k, + fetch_k=fetch_k, + filter=filter, + include=["metadatas", "documents", "distances", "embeddings"], + ) - self.add_from( - texts=texts, - embeddings=embeddings, - ids=ids, - metadatas=metadatas, - batch_size=batch_size, + if len(results[0][1]) == 0: + # No results returned + return [] + else: + embedding_list = [ + list(self.utils.bytes2embedding(result)) for result in results[0][1] + ] + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + logger.info( + f"VDMS similarity search mmr took {time.time() - start_time:0.4f} secs" + ) + candidates = self.results2docs(results) + return [r for i, r in enumerate(candidates) if i in mmr_selected] + + @classmethod + @override + def from_documents( + cls: Type[VDMS], + documents: list[Document], + embedding: Embeddings, + ids: Optional[list[str]] = None, + collection_name: str = DEFAULT_COLLECTION_NAME, # Add this line + **kwargs: Any, + ) -> VDMS: + """Creates a new vectorstore from a list of documents + + Args: + documents: List of documents + embedding: Embedding function to use. + ids: Optional list of IDs associated with the documents. + collection_name (str): Name of the collection to create. + kwargs: Additional keyword arguments. + + Returns: + VectorStore: VectorStore initialized from documents and embeddings. 
+ """ + client: vdms.vdms = kwargs.pop("client") + + if "batch_size" not in kwargs: + kwargs["batch_size"] = DEFAULT_INSERT_BATCH_SIZE + + vectorstore = cls( + client=client, + embedding=embedding, + collection_name=collection_name, **kwargs, ) - return ids + vectorstore.add_documents(documents, ids=ids, **kwargs) + return vectorstore - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - batch_size: int = DEFAULT_INSERT_BATCH_SIZE, + @classmethod + @override + def from_texts( + cls: Type[VDMS], + texts: list[str], + embedding: Embeddings, + metadatas: Optional[list[dict]] = None, + *, + ids: Optional[list[str]] = None, + collection_name: str = DEFAULT_COLLECTION_NAME, **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. + ) -> VDMS: + """Creates a new vectorstore from a list of texts Args: - texts: List of strings to add to the vectorstore. + texts: List of text strings + embedding: Embedding function to use. metadatas: Optional list of metadatas associated with the texts. - ids: Optional list of unique IDs. - batch_size (int): Number of concurrent requests to send to the server. + Default is None. + ids: Optional list of IDs associated with the texts. + collection_name (str): Name of the collection to create. + kwargs: Additional keyword arguments. + - "delete_existing" will delete matching ids prior to adding + new document with same id (True); else add with duplicate + id (False) [Default: True] Returns: - List of ids from adding the texts into the vectorstore. + VectorStore: VectorStore initialized from texts and embeddings. 
""" + client: vdms.vdms = kwargs.pop("client") + delete_existing: bool = kwargs.pop("delete_existing", True) + + if "batch_size" not in kwargs: + kwargs["batch_size"] = DEFAULT_INSERT_BATCH_SIZE + + vdms_store = cls( + client=client, + embedding=embedding, + collection_name=collection_name, + **kwargs, + ) - texts = list(texts) if ids is None: ids = [str(uuid.uuid4()) for _ in texts] - embeddings = self._embed_documents(texts) - - if metadatas is None: - metadatas = [{} for _ in texts] - else: - metadatas = [_validate_vdms_properties(m) for m in metadatas] + metadatas = metadatas if metadatas is not None else [{} for _ in ids] + vdms_store._len_check_if_sized(ids, texts, "ids", "texts") + vdms_store._len_check_if_sized(ids, metadatas, "ids", "metadatas") - inserted_ids = self.add_from( - texts=texts, - embeddings=embeddings, - ids=ids, - metadatas=metadatas, - batch_size=batch_size, + # Remove IDs if exist-TEST_REMOVAL + remove_ids = [doc.id for doc in vdms_store.get_by_ids(ids) if doc.id] + if len(remove_ids) > 0: + if delete_existing: + vdms_store.delete(ids=remove_ids, **kwargs) + remove_ids = [] + else: + pstr = "[!] Embeddings skipped for following ids because " + pstr += f"already exists: {remove_ids}. 
Retry with " + pstr += "'delete_existing' set to True" + logger.info(pstr) + + valid_ids = [] + valid_texts = [] + valid_metadatas = [] + for id, txt, meta in zip(ids, texts, metadatas): + if id not in remove_ids: + valid_ids.append(id) + valid_texts.append(txt) + valid_metadatas.append(meta) + + vdms_store.add_texts( + texts=valid_texts, + metadatas=valid_metadatas, + ids=valid_ids, **kwargs, ) - return inserted_ids + return vdms_store - def add_from( - self, - texts: List[str], - embeddings: List[List[float]], - ids: List[str], - metadatas: Optional[List[dict]] = None, - batch_size: int = DEFAULT_INSERT_BATCH_SIZE, - **kwargs: Any, - ) -> List[str]: - # Get initial properties - orig_props = self.__get_properties(self._collection_name) - inserted_ids: List[str] = [] - for start_idx in range(0, len(texts), batch_size): - end_idx = min(start_idx + batch_size, len(texts)) + """ OTHER FUNCS """ - batch_texts = texts[start_idx:end_idx] - batch_embedding_vectors = embeddings[start_idx:end_idx] - batch_ids = ids[start_idx:end_idx] - if metadatas: - batch_metadatas = metadatas[start_idx:end_idx] + def _add_set(self) -> None: + collection_name = self.collection_name + embedding_dimension = self.embedding_dimension + engine = self.similarity_search_engine + metric = self.distance_strategy - result = self.add( - self._collection_name, - embeddings=batch_embedding_vectors, - texts=batch_texts, - metadatas=batch_metadatas, - ids=batch_ids, - ) + query = self.utils.add_descriptor_set( + "AddDescriptorSet", + collection_name, + embedding_dimension, + engine=getattr(engine, "value", engine), + metric=getattr(metric, "value", metric), + ) - inserted_ids.extend(result) + response, _ = self.utils.run_vdms_query([query]) - # Update Properties - self.__update_properties( - self._collection_name, orig_props, self.collection_properties - ) - return inserted_ids + if "FailedCommand" in response[0]: + raise ValueError(f"Failed to add collection {collection_name}") + + if 
response[0]["AddDescriptorSet"]["status"] == 0: + status = "created" + else: + status = "exists" + + logger.info(f"Descriptor set {collection_name} {status}") def _check_required_inputs( - self, collection_name: str, embedding_dimensions: Union[int, None] + self, + collection_name: str, + embedding_dimensions: Union[int, None], + **kwargs: Any, ) -> None: - # Check connection to client - if not self._client.is_connected(): - raise ValueError( - "VDMS client must be connected to a VDMS server." - + "Please use VDMS_Client to establish a connection" - ) - # Check Distance Metric if self.distance_strategy not in AVAILABLE_DISTANCE_METRICS: raise ValueError("distance_strategy must be either 'L2' or 'IP'") @@ -759,7 +795,7 @@ def _check_required_inputs( if self.similarity_search_engine not in AVAILABLE_ENGINES: raise ValueError( "engine must be either 'TileDBDense', 'TileDBSparse', " - + "'FaissFlat', 'FaissIVFFlat', or 'Flinng'" + + "'FaissFlat', 'FaissIVFFlat', 'FaissHNSWFlat', or 'Flinng'" ) # Check Embedding Func is provided and store dimension size @@ -770,7 +806,8 @@ def _check_required_inputs( self.embedding_dimension = embedding_dimensions elif self.embedding is not None and hasattr(self.embedding, "embed_query"): self.embedding_dimension = len( - self._embed_query("This is a sample sentence.") + # self._embed_query("This is a sample sentence.") + self.embedding.embed_query("This is a sample sentence.") ) elif self.embedding is not None and ( hasattr(self.embedding, "embed_image") @@ -791,243 +828,452 @@ def _check_required_inputs( ) # Check for properties - current_props = self.__get_properties(collection_name) + current_props = self.utils.get_properties(collection_name) if hasattr(self, "collection_properties"): - self.collection_properties.extend(current_props) + missing_elements = list( + set(current_props) - set(self.collection_properties) + ) # element in current not in props + if len(missing_elements) > 0: + 
self.collection_properties.extend(missing_elements) else: - self.collection_properties: List[str] = current_props + self.collection_properties: list[str] = current_props - def count(self, collection_name: str) -> int: - all_queries: List[Any] = [] - all_blobs: List[Any] = [] + self.collection_properties.sort() - results = {"count": "", "list": ["id"]} # collection_properties} - query = _add_descriptor( - "FindDescriptor", - collection_name, - label=None, - ref=None, - props=None, - link=None, - k_neighbors=None, - constraints=None, - results=results, - ) + def _embed_documents(self, texts: list[str]) -> list[list[float]]: + if isinstance(self.embedding, Embeddings): + return self.embedding.embed_documents(texts) + else: + p_str = "Must provide `embedding` which is expected" + p_str += " to be an Embeddings object" + raise ValueError(p_str) - all_queries.append(query) + def _embed_image(self, uris: list[str]) -> list[list[float]]: + if self.embedding is not None and hasattr(self.embedding, "embed_image"): + return self.embedding.embed_image(uris=uris) + else: + raise ValueError( + "Must provide `embedding` which has attribute `embed_image`" + ) - response, response_array = self.__run_vdms_query(all_queries, all_blobs) - return response[0]["FindDescriptor"]["returned"] + def _embed_query(self, text: str) -> list[float]: + if isinstance(self.embedding, Embeddings): + return self.embedding.embed_query(text) + else: + raise ValueError( + "Must provide `embedding` which is expected" + " to be an Embeddings object" + ) - def decode_image(self, base64_image: str) -> bytes: - return base64.b64decode(base64_image) + def _embed_video(self, paths: list[str], **kwargs: Any) -> list[list[float]]: + if self.embedding is not None and hasattr(self.embedding, "embed_video"): + return self.embedding.embed_video(paths=paths, **kwargs) + else: + raise ValueError( + "Must provide `embedding` which has attribute `embed_video`" + ) - def delete( - self, - ids: Optional[List[str]] = None, 
- collection_name: Optional[str] = None, - constraints: Optional[Dict] = None, - **kwargs: Any, - ) -> bool: - """Delete by ID. These are the IDs in the vectorstore. + def _len_check_if_sized(self, x: Any, y: Any, x_name: str, y_name: str) -> None: + """ + Check that sizes of two variables are the same Args: - ids: List of ids to delete. + x: Variable to compare + y: Variable to compare + x_name: Name for variable x + y_name: Name for variable y + """ + if isinstance(x, Sized) and isinstance(y, Sized) and len(x) != len(y): + raise ValueError( + f"{x_name} and {y_name} expected to be equal length but " + f"len({x_name})={len(x)} and len({y_name})={len(y)}" + ) + return - Returns: - Optional[bool]: True if deletion is successful, - False otherwise, None if not implemented. + def update( + self, + collection_name: str, + ids: list[str], + texts: list[str], + embeddings: list[list[float]], + metadatas: Optional[list[dict]] = None, + **kwargs: Any, + ) -> None: + """ + Updates (find, delete, add) a collection based on id. 
+ If more than one collection returned with id, error occurs """ - name = collection_name if collection_name is not None else self._collection_name - return self.__delete(name, ids=ids, constraints=constraints) - def get_k_candidates( + metadatas = metadatas if metadatas is not None else [{} for _ in ids] + self._len_check_if_sized(ids, texts, "ids", "texts") + self._len_check_if_sized(ids, embeddings, "ids", "embeddings") + self._len_check_if_sized(ids, metadatas, "ids", "metadatas") + + # Find and delete by ID + remove_ids = [doc.id for doc in self.get_by_ids(ids) if doc.id] + if len(remove_ids) > 0: + self.delete(ids=remove_ids) + + # Add as batch + if "batch_size" not in kwargs: + kwargs["batch_size"] = DEFAULT_INSERT_BATCH_SIZE + + _ = self.add_from( + texts=texts, + embeddings=embeddings, + ids=ids, + metadatas=metadatas, + **kwargs, + ) + + def push_update_properties( self, - setname: str, - fetch_k: Optional[int], - results: Optional[Dict[str, Any]] = None, - all_blobs: Optional[List] = None, - normalize: Optional[bool] = False, - ) -> Tuple[List[Dict[str, Any]], List, float]: - max_dist = 1 - command_str = "FindDescriptor" - query = _add_descriptor( - command_str, - setname, - k_neighbors=fetch_k, - results=results, + collection_name: str, + ) -> None: + pushed_props = self.utils.get_properties(collection_name) + missing_elements = list( + set(self.collection_properties) - set(pushed_props) + ) # element in current not in props + if len(missing_elements) > 0: + pushed_props.extend(missing_elements) + pushed_props.sort() + + all_queries, blob_arr = self.utils.build_property_query( + collection_name, + command_type="update", + all_properties=pushed_props, ) - response, response_array = self.__run_vdms_query([query], all_blobs) + response, _ = self.utils.run_vdms_query(all_queries, [blob_arr]) + self.updated_properties_flag = True - if normalize and command_str in response[0]: - max_dist = response[0][command_str]["entities"][-1]["_distance"] + def add_batch( 
+ self, + collection_name: str, + texts: list[str], + embeddings: list[list[float]], + metadatas: Optional[list[dict]] = None, + ids: Optional[list[str]] = None, + ) -> list: + self._len_check_if_sized(texts, embeddings, "texts", "embeddings") - return response, response_array, max_dist + metadatas = metadatas if metadatas is not None else [{} for _ in texts] + self._len_check_if_sized(texts, metadatas, "texts", "metadatas") - def get_descriptor_response( - self, - command_str: str, - setname: str, - k_neighbors: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - constraints: Optional[dict] = None, - results: Optional[Dict[str, Any]] = None, - query_embedding: Optional[List[float]] = None, - normalize_distance: bool = False, - ) -> Tuple[List[Dict[str, Any]], List]: - all_blobs: List[Any] = [] - blob = embedding2bytes(query_embedding) - if blob is not None: - all_blobs.append(blob) + ids = ids if ids is not None else [str(uuid.uuid4()) for _ in texts] + self._len_check_if_sized(texts, ids, "texts", "ids") - if constraints is None: - # K results returned - response, response_array, max_dist = self.get_k_candidates( - setname, k_neighbors, results, all_blobs, normalize=normalize_distance + extended_emb: list[Any] = [] + batch_properties: list[dict] = [] + for meta, emb, doc, id in zip(metadatas, embeddings, texts, ids): + extended_emb.extend(emb) + batch_properties.append(self.get_props_from_metadata(doc, meta, id)) + all_blobs = [self.utils.embedding2bytes(extended_emb)] + all_queries = [ + self.utils.add_descriptor( + "AddDescriptor", + collection_name, + label=None, + ref=None, + props=batch_properties, + link=None, + k_neighbors=None, + constraints=None, + results=None, ) + ] + # if isinstance(all_queries, dict): + # all_queries = [all_queries] + response, _ = self.utils.run_vdms_query(all_queries, all_blobs) + + try: + return ids if response[0]["AddDescriptor"]["status"] == 0 else [] + except Exception: + if "OutOfJournalSpace" in response[0]["info"]: + try: + 
logger.info("OutOfJournalSpace: Splitting batch in half") + old_batch = len(all_queries) + new_batch_size = old_batch // 2 + emb_len = len(emb) + for start_idx in range(0, old_batch, new_batch_size): + end_idx_blob = min( + start_idx * emb_len + new_batch_size - 1, len(all_blobs) + ) + blobs = all_blobs[start_idx * emb_len : end_idx_blob] + + end_idx = min(start_idx + new_batch_size - 1, old_batch) + queries = all_queries[start_idx:end_idx] + response, _ = self.utils.run_vdms_query(queries, blobs) + except Exception: + raise ValueError(f"Lower batch_size to < {old_batch} and rerun") + + return ids if response[0]["AddDescriptor"]["status"] == 0 else [] + else: + return [] + + def add_data( + self, + texts: list[str], + embeddings: list[list[float]], + metadatas: Optional[list[dict]] = None, + ids: Optional[list[str]] = None, + **kwargs: Any, + ) -> list[str]: + if metadatas is None: + metadatas = [{} for _ in texts] else: - if results is None: - results = {"list": ["id"]} - elif "list" not in results: - results["list"] = ["id"] - elif "id" not in results["list"]: - results["list"].append("id") + metadatas = [self.utils.validate_vdms_properties(m) for m in metadatas] - # (1) Find docs satisfy constraints - query = _add_descriptor( - command_str, - setname, - constraints=constraints, - results=results, - ) - response, response_array = self.__run_vdms_query([query]) - if command_str in response[0] and response[0][command_str]["returned"] > 0: - ids_of_interest = [ - ent["id"] for ent in response[0][command_str]["entities"] - ] - else: - return [], [] + # Populate IDs + if ids is None: + ids = [] + for meta in metadatas: + if LANGCHAIN_ID_PROPERTY in meta: + ids.append(meta[LANGCHAIN_ID_PROPERTY]) + else: + ids.append(str(uuid.uuid4())) - # (2) Find top fetch_k results - response, response_array, max_dist = self.get_k_candidates( - setname, fetch_k, results, all_blobs, normalize=normalize_distance - ) - if command_str not in response[0] or ( - command_str in response[0] 
and response[0][command_str]["returned"] == 0 - ): - return [], [] + if "batch_size" not in kwargs: + kwargs["batch_size"] = DEFAULT_INSERT_BATCH_SIZE - # (3) Intersection of (1) & (2) using ids - new_entities: List[Dict] = [] - for ent in response[0][command_str]["entities"]: - if ent["id"] in ids_of_interest: - new_entities.append(ent) - if len(new_entities) == k_neighbors: - break - response[0][command_str]["entities"] = new_entities - response[0][command_str]["returned"] = len(new_entities) - if len(new_entities) < k_neighbors: - p_str = "Returned items < k_neighbors; Try increasing fetch_k" - print(p_str) # noqa: T201 + inserted_ids = self.add_from( + texts=texts, + embeddings=embeddings, + ids=ids, + metadatas=metadatas, + **kwargs, + ) + return inserted_ids - if normalize_distance: - max_dist = 1.0 if max_dist in [0, np.inf] else max_dist - for ent_idx, ent in enumerate(response[0][command_str]["entities"]): - ent["_distance"] = ent["_distance"] / max_dist - response[0][command_str]["entities"][ent_idx]["_distance"] = ent[ - "_distance" - ] + def add_from( + self, + texts: list[str], + embeddings: list[list[float]], + ids: list[str], + metadatas: Optional[list[dict]] = None, + **kwargs: Any, + ) -> list[str]: + # Get initial properties + inserted_ids: list[str] = [] + batch_size = int(kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)) + total_count = len(texts) - return response, response_array + for start_idx in range(0, total_count, batch_size): + end_idx = min(start_idx + batch_size - 1, total_count) - def encode_image(self, image_path: str) -> str: - with open(image_path, "rb") as f: - blob = f.read() - return base64.b64encode(blob).decode("utf-8") + batch_texts = texts[start_idx:end_idx] + batch_embedding_vectors = embeddings[start_idx:end_idx] + batch_ids = ids[start_idx:end_idx] - @classmethod - def from_documents( - cls: Type[VDMS], - documents: List[Document], - embedding: Optional[Embeddings] = None, - ids: Optional[List[str]] = None, - 
batch_size: int = DEFAULT_INSERT_BATCH_SIZE, - collection_name: str = DEFAULT_COLLECTION_NAME, # Add this line + if metadatas: + batch_metadatas = metadatas[start_idx:end_idx] + + try: + result_ids = self.add_batch( + self.collection_name, + embeddings=batch_embedding_vectors, + texts=batch_texts, + metadatas=batch_metadatas, + ids=batch_ids, + ) + + inserted_ids.extend(result_ids) + except Exception as e: + logger.error( + "Failed to insert batch starting at entity: %s-%s", + start_idx, + end_idx - 1, + ) + raise e + + # Update Properties + self.push_update_properties( + self.collection_name, + ) + return inserted_ids + + def add_images( + self, + uris: list[str], + metadatas: Optional[list[dict]] = None, + ids: Optional[list[str]] = None, + add_path: Optional[bool] = True, **kwargs: Any, - ) -> VDMS: - """Create a VDMS vectorstore from a list of documents. + ) -> list[str]: + """Run images through the embeddings and add to the vectorstore. + + Images are added as embeddings (AddDescriptor) instead of separate + entity (AddImage) within VDMS to leverage similarity search capability Args: - collection_name (str): Name of the collection to create. - documents (List[Document]): List of documents to add to vectorstore. - embedding (Embeddings): Embedding function. Defaults to None. - ids (Optional[List[str]]): List of document IDs. Defaults to None. + uris: List of paths to the images to add to the vectorstore. + metadatas: Optional list of metadatas associated with the images. + ids: Optional list of unique IDs. batch_size (int): Number of concurrent requests to send to the server. + add_path: Bool to add image path as metadata Returns: - VDMS: VDMS vectorstore. + List of ids from adding images into the vectorstore. 
""" - client: vdms.vdms = kwargs["client"] + # Map from uris to blobs to base64 + b64_texts = [self.encode_image(image_path=uri) for uri in uris] - return cls.from_texts( - client=client, - texts=[doc.page_content for doc in documents], - metadatas=[doc.metadata for doc in documents], - embedding=embedding, + if add_path and metadatas: + for midx, uri in enumerate(uris): + metadatas[midx]["image_path"] = uri + elif add_path: + metadatas = [] + for uri in uris: + metadatas.append({"image_path": uri}) + + # Set embeddings + embeddings = self._embed_image(uris=uris) + + inserted_ids = self.add_data( + texts=b64_texts, + embeddings=embeddings, + metadatas=metadatas, ids=ids, - batch_size=batch_size, - collection_name=collection_name, - # **kwargs, + **kwargs, ) + return inserted_ids - @classmethod - def from_texts( - cls: Type[VDMS], - texts: List[str], - embedding: Optional[Embeddings] = None, - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - batch_size: int = DEFAULT_INSERT_BATCH_SIZE, - collection_name: str = DEFAULT_COLLECTION_NAME, + def add_videos( + self, + paths: list[str], + texts: Optional[list[str]] = None, + metadatas: Optional[list[dict]] = None, + ids: Optional[list[str]] = None, + add_path: Optional[bool] = True, **kwargs: Any, - ) -> VDMS: - """Create a VDMS vectorstore from a raw documents. + ) -> list[str]: + """Run videos through the embeddings and add to the vectorstore. + + Videos are added as embeddings (AddDescriptor) instead of separate + entity (AddVideo) within VDMS to leverage similarity search capability Args: - texts (List[str]): List of texts to add to the collection. - embedding (Embeddings): Embedding function. Defaults to None. - metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. - ids (Optional[List[str]]): List of document IDs. Defaults to None. - batch_size (int): Number of concurrent requests to send to the server. - collection_name (str): Name of the collection to create. 
+ paths: List of paths to the videos to add to the vectorstore. + text: Optional list of text associated with the videos. + metadatas: Optional list of metadatas associated with the videos. + ids: Optional list of unique IDs. + add_path: Bool to add video path as metadata Returns: - VDMS: VDMS vectorstore. + List of ids from adding videos into the vectorstore. """ - client: vdms.vdms = kwargs["client"] - vdms_collection = cls( - collection_name=collection_name, - embedding=embedding, - client=client, - # **kwargs, - ) - if ids is None: - ids = [str(uuid.uuid4()) for _ in texts] - vdms_collection.add_texts( + if texts is None: + texts = ["" for _ in paths] + + if add_path and metadatas: + for midx, path in enumerate(paths): + metadatas[midx]["video_path"] = path + elif add_path: + metadatas = [] + for path in paths: + metadatas.append({"video_path": path}) + + # Set embeddings + embeddings = self._embed_video(paths=paths, **kwargs) + + inserted_ids = self.add_data( texts=texts, + embeddings=embeddings, metadatas=metadatas, ids=ids, - batch_size=batch_size, # **kwargs + **kwargs, + ) + return inserted_ids + + def check_and_update_properties(self) -> None: + if self.updated_properties_flag: + pushed_props = self.utils.get_properties(self.collection_name) + self.collection_properties.sort() + pushed_props.sort() + if self.collection_properties != pushed_props: + self.collection_properties = pushed_props + self.updated_properties_flag = False + + def count(self, collection_name: str) -> int: + all_queries: list[Any] = [] + all_blobs: list[Any] = [] + + results = { + "count": "", + "list": [LANGCHAIN_ID_PROPERTY], + } + query = self.utils.add_descriptor( + "FindDescriptor", + collection_name, + label=None, + ref=None, + props=None, + link=None, + k_neighbors=None, + constraints=None, + results=results, ) - return vdms_collection - def get( + all_queries.append(query) + + response, response_array = self.utils.run_vdms_query(all_queries, all_blobs) + return 
response[0]["FindDescriptor"]["returned"] + + def decode_image(self, base64_image: str) -> bytes: + return base64.b64decode(base64_image) + + def chunk_query_to_minimize_journal_space( + self, all_queries: list, response: list[dict] + ) -> list[dict]: + if "info" in response[0] and "OutOfJournalSpace" in response[0]["info"]: + logger.info("OutOfJournalSpace: Deleting using batch_size of 50") + new_batch_size = 50 + resp_dict: dict = {} + resp_dict.setdefault( + "FindDescriptor", {"entities": list(), "returned": 0, "status": 0} + ) + new_response = [ + # {"FindDescriptor": {"returned": 0, "status": 0, "entities": []}} + resp_dict + ] + for start_idx in range(0, len(all_queries), new_batch_size): + end_idx = min(start_idx + new_batch_size - 1, len(all_queries)) + queries = all_queries[start_idx:end_idx] + response, _ = self.utils.run_vdms_query(queries) + new_response[0]["FindDescriptor"]["entities"].extend( + response[0]["FindDescriptor"]["entities"] + ) + new_response[0]["FindDescriptor"]["returned"] = len( + new_response[0]["FindDescriptor"]["entities"] + ) + return new_response + return response + + def descriptor2document(self, d: dict) -> Document: + metadata = {} + d_id = None + txt_contents = None + for k, v in d.items(): + if k not in INVALID_DOC_METADATA_KEYS: + metadata[k] = v + if LANGCHAIN_ID_PROPERTY in metadata: + d_id = metadata.pop(LANGCHAIN_ID_PROPERTY) + # if TEXT_PROPERTY in d: + txt_contents = d[TEXT_PROPERTY] + doc = Document(page_content=txt_contents, metadata=metadata, id=d_id) + return doc + + def encode_image(self, image_path: str) -> str: + with open(image_path, "rb") as f: + blob = f.read() + return base64.b64encode(blob).decode("utf-8") + + def get_by_constraints( self, collection_name: str, - constraints: Optional[Dict] = None, + constraints: Optional[dict] = None, limit: Optional[int] = None, - include: List[str] = ["metadata"], - ) -> Tuple[Any, Any]: + include: list[str] = ["metadata"], + ) -> list: """Gets the collection. 
Get embeddings and their associated data from the data store. If no constraints provided returns all embeddings up to limit. @@ -1041,24 +1287,23 @@ def get( Ids are always included. Defaults to `["metadatas", "documents"]`. Optional. """ - all_queries: List[Any] = [] - all_blobs: List[Any] = [] + all_queries: list[Any] = [] + all_blobs: list[Any] = [] - results: Dict[str, Any] = {"count": ""} + results: dict[str, Any] = {"count": ""} if limit is not None: results["limit"] = limit # Include metadata if "metadata" in include: - collection_properties = self.__get_properties(collection_name) - results["list"] = collection_properties + results["list"] = self.utils.get_properties(collection_name) # Include embedding if "embeddings" in include: results["blob"] = True - query = _add_descriptor( + query = self.utils.add_descriptor( "FindDescriptor", collection_name, k_neighbors=None, @@ -1068,199 +1313,105 @@ def get( all_queries.append(query) - response, response_array = self.__run_vdms_query(all_queries, all_blobs) - return response, response_array + response, _ = self.utils.run_vdms_query(all_queries, all_blobs) - def max_marginal_relevance_search( - self, - query: str, - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, List]] = None, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query (str): Query to look up. Text or path for image or video. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. 
- - Returns: - List of Documents selected by maximal marginal relevance. - """ - if self.embedding is None: - raise ValueError( - "For MMR search, you must specify an embedding function on" "creation." - ) + if "FindDescriptor" in response[0]: + this_docs = [ + self.descriptor2document(doc) + for doc in response[0]["FindDescriptor"].get("entities", []) + ] + return this_docs + return [] - # embedding_vector: List[float] = self._embed_query(query) - embedding_vector: List[float] + def get_embedding_from_query(self, query: str) -> list[float]: if not os.path.isfile(query) and hasattr(self.embedding, "embed_query"): - embedding_vector = self._embed_query(query) + query_embedding: list[float] = self._embed_query(query) elif os.path.isfile(query) and hasattr(self.embedding, "embed_image"): - embedding_vector = self._embed_image(uris=[query])[0] + query_embedding = self._embed_image(uris=[query])[0] elif os.path.isfile(query) and hasattr(self.embedding, "embed_video"): - embedding_vector = self._embed_video(paths=[query])[0] + query_embedding = self._embed_video(paths=[query])[0] else: error_msg = f"Could not generate embedding for query '{query}'." error_msg += "If using path for image or video, verify embedding model " error_msg += "has callable functions 'embed_image' or 'embed_video'." raise ValueError(error_msg) + return query_embedding - docs = self.max_marginal_relevance_search_by_vector( - embedding_vector, - k, - fetch_k, - lambda_mult=lambda_mult, - filter=filter, - ) - return docs - - def max_marginal_relevance_search_by_vector( + def get_props_from_metadata( self, - embedding: List[float], - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, List]] = None, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. 
- - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - - Returns: - List of Documents selected by maximal marginal relevance. - """ - results = self.query_collection_embeddings( - query_embeddings=[embedding], - n_results=fetch_k, - filter=filter, - include=["metadatas", "documents", "distances", "embeddings"], - ) - - if len(results[0][1]) == 0: - # No results returned - return [] + # collection_name: str, + document: str, + metadata: Optional[dict] = None, + id: Optional[str] = None, + ) -> dict[str, Any]: + if id is None: + props = {} else: - embedding_list = [ - list(_bytes2embedding(result)) for result in results[0][1] - ] + props = {LANGCHAIN_ID_PROPERTY: str(id)} + # id_exists, query = self.utils.check_descriptor_exists_by_id( + # self._client, collection_name, id + # ) + # if id_exists: + # skipped_value = { + # prop_key: prop_val[-1] + # for prop_key, prop_val in query["FindDescriptor"][ + # "constraints" + # ].items() + # } + # pstr = f"[!] 
Embedding with id ({id}) exists in DB;" + # pstr += "Therefore, skipped and not inserted" + # logger.warning(pstr) + # logger.warning(f"\tSkipped values are: {skipped_value}") + # return {} - mmr_selected = maximal_marginal_relevance( - np.array(embedding, dtype=np.float32), - embedding_list, - k=k, - lambda_mult=lambda_mult, - ) - - candidates = _results_to_docs(results) - - selected_results = [ - r for i, r in enumerate(candidates) if i in mmr_selected - ] - return selected_results - - def max_marginal_relevance_search_with_score( - self, - query: str, - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - lambda_mult: float = 0.5, - filter: Optional[Dict[str, List]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs selected using the maximal marginal relevance. - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query (str): Query to look up. Text or path for image or video. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - - Returns: - List of Documents selected by maximal marginal relevance. - """ - if self.embedding is None: - raise ValueError( - "For MMR search, you must specify an embedding function on" "creation." 
- ) - - if not os.path.isfile(query) and hasattr(self.embedding, "embed_query"): - embedding = self._embed_query(query) - elif os.path.isfile(query) and hasattr(self.embedding, "embed_image"): - embedding = self._embed_image(uris=[query])[0] - elif os.path.isfile(query) and hasattr(self.embedding, "embed_video"): - embedding = self._embed_video(paths=[query])[0] - else: - error_msg = f"Could not generate embedding for query '{query}'." - error_msg += "If using path for image or video, verify embedding model " - error_msg += "has callable functions 'embed_image' or 'embed_video'." - raise ValueError(error_msg) + if metadata: + for k, v in metadata.items(): + if k not in props: + props[k] = v + # props.update(metadata) + if LANGCHAIN_ID_PROPERTY not in props and "id" in metadata: + metadata[LANGCHAIN_ID_PROPERTY] = str(metadata["id"]) + if document not in [None, ""]: + props["content"] = document - docs = self.max_marginal_relevance_search_with_score_by_vector( - embedding, - k, - fetch_k, - lambda_mult=lambda_mult, - filter=filter, - ) - return docs + for k in props.keys(): + if k not in self.collection_properties: + self.collection_properties.append(k) + self.collection_properties.sort() + return props - def max_marginal_relevance_search_with_score_by_vector( + def max_marginal_relevance_search_by_vector_with_score( self, - embedding: List[float], + embedding: list[float], k: int = DEFAULT_K, fetch_k: int = DEFAULT_FETCH_K, lambda_mult: float = 0.5, - filter: Optional[Dict[str, List]] = None, + filter: Optional[dict[str, list]] = None, **kwargs: Any, - ) -> List[Tuple[Document, float]]: + ) -> list[tuple[Document, float]]: """Return docs selected using the maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. + embedding: Embedding vector to search for. 
+ k: Number of Documents to return. fetch_k: Number of Documents to fetch to pass to MMR algorithm. lambda_mult: Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + filter (Optional[dict[str, str]]): Filter by metadata. Defaults to None. Returns: List of Documents selected by maximal marginal relevance. """ - results = self.query_collection_embeddings( + start_time = time.time() + results = self.query_by_embeddings( query_embeddings=[embedding], - n_results=fetch_k, + k=k, + fetch_k=fetch_k, filter=filter, include=["metadatas", "documents", "distances", "embeddings"], ) @@ -1270,7 +1421,7 @@ def max_marginal_relevance_search_with_score_by_vector( return [] else: embedding_list = [ - list(_bytes2embedding(result)) for result in results[0][1] + list(self.utils.bytes2embedding(result)) for result in results[0][1] ] mmr_selected = maximal_marginal_relevance( @@ -1280,186 +1431,139 @@ def max_marginal_relevance_search_with_score_by_vector( lambda_mult=lambda_mult, ) - candidates = _results_to_docs_and_scores(results) - - selected_results = [ - (r, s) for i, (r, s) in enumerate(candidates) if i in mmr_selected - ] - return selected_results - - def query_collection_embeddings( - self, - query_embeddings: Optional[List[List[float]]] = None, - collection_name: Optional[str] = None, - n_results: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - filter: Union[None, Dict[str, Any]] = None, - results: Union[None, Dict[str, Any]] = None, - normalize_distance: bool = False, - **kwargs: Any, - ) -> List[Tuple[Dict[str, Any], List]]: - all_responses: List[Any] = [] - - if collection_name is None: - collection_name = self._collection_name - - if query_embeddings is None: - return all_responses - - include = kwargs.get("include", ["metadatas"]) - if results is None and "metadatas" in 
include: - results = { - "list": self.collection_properties, - "blob": "embeddings" in include, - } - - for qemb in query_embeddings: - response, response_array = self.get_descriptor_response( - "FindDescriptor", - collection_name, - k_neighbors=n_results, - fetch_k=fetch_k, - constraints=filter, - results=results, - normalize_distance=normalize_distance, - query_embedding=qemb, + logger.info( + f"VDMS similarity search mmr took {time.time() - start_time:0.4f} secs" ) - all_responses.append([response, response_array]) - - return all_responses + candidates = self.results2docs_and_scores(results) + return [(r, s) for i, (r, s) in enumerate(candidates) if i in mmr_selected] - def similarity_search( + def max_marginal_relevance_search_with_score( self, query: str, k: int = DEFAULT_K, fetch_k: int = DEFAULT_FETCH_K, - filter: Optional[Dict[str, List]] = None, + lambda_mult: float = 0.5, + filter: Optional[dict[str, list]] = None, **kwargs: Any, - ) -> List[Document]: - """Run similarity search with VDMS. + ) -> list[tuple[Document, float]]: + """Returns similar documents to the query that also have diversity + + This algorithm balances relevance and diversity in the search results. Args: - query (str): Query to look up. Text or path for image or video. - k (int): Number of results to return. Defaults to 3. - fetch_k (int): Number of candidates to fetch for knn (>= k). - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + **kwargs: Arguments to pass to the search method. Returns: - List[Document]: List of documents most similar to the query text. 
+ List of Document objects ordered by decreasing similarity/diversty. """ - docs_and_scores = self.similarity_search_with_score( - query, k=k, fetch_k=fetch_k, filter=filter, **kwargs + query_embedding = self.get_embedding_from_query(query) + return self.max_marginal_relevance_search_by_vector_with_score( + query_embedding, k, fetch_k, lambda_mult, filter, **kwargs ) - return [doc for doc, _ in docs_and_scores] - def similarity_search_by_vector( + def query_by_embeddings( self, - embedding: List[float], + query_embeddings: Optional[list[list[float]]] = None, + collection_name: Optional[str] = None, k: int = DEFAULT_K, fetch_k: int = DEFAULT_FETCH_K, - filter: Optional[Dict[str, List]] = None, + filter: Union[None, dict[str, Any]] = None, + results: Union[None, dict[str, Any]] = None, + normalize_distance: bool = False, **kwargs: Any, - ) -> List[Document]: - """Return docs most similar to embedding vector. - Args: - embedding (List[float]): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 3. - fetch_k (int): Number of candidates to fetch for knn (>= k). - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. - Returns: - List of Documents most similar to the query vector. - """ - results = self.query_collection_embeddings( - query_embeddings=[embedding], - n_results=k, - fetch_k=fetch_k, - filter=filter, - **kwargs, - ) + ) -> list: + self.check_and_update_properties() - return _results_to_docs(results) - - def similarity_search_with_score( - self, - query: str, - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - filter: Optional[Dict[str, List]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Run similarity search with VDMS with distance. + all_responses: list[Any] = [] - Args: - query (str): Query to look up. Text or path for image or video. - k (int): Number of results to return. Defaults to 3. - fetch_k (int): Number of candidates to fetch for knn (>= k). 
- filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + if collection_name is None: + collection_name = self.collection_name - Returns: - List[Tuple[Document, float]]: List of documents most similar to - the query text and cosine distance in float for each. - Lower score represents more similarity. - """ - if self.embedding is None: - raise ValueError("Must provide embedding function") - else: - if not os.path.isfile(query) and hasattr(self.embedding, "embed_query"): - query_embedding: List[float] = self._embed_query(query) - elif os.path.isfile(query) and hasattr(self.embedding, "embed_image"): - query_embedding = self._embed_image(uris=[query])[0] - elif os.path.isfile(query) and hasattr(self.embedding, "embed_video"): - query_embedding = self._embed_video(paths=[query])[0] - else: - error_msg = f"Could not generate embedding for query '{query}'." - error_msg += "If using path for image or video, verify embedding model " - error_msg += "has callable functions 'embed_image' or 'embed_video'." - raise ValueError(error_msg) - - results = self.query_collection_embeddings( - query_embeddings=[query_embedding], - n_results=k, - fetch_k=fetch_k, - filter=filter, - **kwargs, - ) + if query_embeddings is None: + return all_responses - return _results_to_docs_and_scores(results) + include = kwargs.get("include", ["metadatas"]) + if results is None and "metadatas" in include: + results = { + "list": self.collection_properties, + "blob": "embeddings" in include, + } - def similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: int = DEFAULT_K, - fetch_k: int = DEFAULT_FETCH_K, - filter: Optional[Dict[str, List]] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """ - Return docs most similar to embedding vector and similarity score. 
+ for qemb in query_embeddings: + response: list[dict] = [] + response_array: list[bytes] = [] + if fetch_k >= k: + response, response_array = self.utils.get_descriptor_response( + "FindDescriptor", + collection_name, + k_neighbors=k, + fetch_k=fetch_k, + constraints=filter, + results=results, + normalize_distance=normalize_distance, + query_embedding=qemb, + ) - Args: - embedding (List[float]): Embedding to look up documents similar to. - k (int): Number of Documents to return. Defaults to 3. - fetch_k (int): Number of candidates to fetch for knn (>= k). - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + try: + num_returned = ( + len(response[0]["FindDescriptor"].get("entities", [])) + if "FindDescriptor" in response[0] + else 0 + ) + if num_returned != k: + logger.info( + f"Only {num_returned} returned. " + "Provide fetch_k > k ({k}); Currently set at {fetch_k}" + ) + result_entities = response[0]["FindDescriptor"].get("entities", []) + except ValueError: + result_entities = [] + for eidx, ent in enumerate(result_entities): + response[0]["FindDescriptor"]["entities"][eidx] = { + k: v for k, v in ent.items() if v not in INVALID_METADATA_VALUE + } - Returns: - List[Tuple[Document, float]]: List of documents most similar to - the query text. Lower score represents more similarity. 
- """ + all_responses.append([response, response_array]) - # kwargs["normalize_distance"] = True + return all_responses - results = self.query_collection_embeddings( - query_embeddings=[embedding], - n_results=k, - fetch_k=fetch_k, - filter=filter, - **kwargs, - ) - return _results_to_docs_and_scores(results) + def results2docs(self, results: Any) -> list[Document]: + return [doc for doc, _ in self.results2docs_and_scores(results)] + + def results2docs_and_scores(self, results: Any) -> list[Tuple[Document, float]]: + final_res: list[Any] = [] + try: + responses, blobs = results[0] + result_entities = responses[0]["FindDescriptor"].get("entities", []) + if len(result_entities) > 0: + for ent in result_entities: + distance = round(ent["_distance"], 10) + txt_contents = ent[TEXT_PROPERTY] + props = { + mkey: mval + for mkey, mval in ent.items() + if ( + mval not in INVALID_METADATA_VALUE + and mkey not in INVALID_DOC_METADATA_KEYS + ) + } + + final_res.append( + (Document(page_content=txt_contents, metadata=props), distance) + ) + except Exception as e: + logger.warning(f"No results returned. Error while parsing results: {e}") + return final_res def update_document( - self, collection_name: str, document_id: str, document: Document + self, collection_name: str, document_id: str, document: Document, **kwargs: Any ) -> None: """Update a document in the collection. @@ -1467,273 +1571,492 @@ def update_document( document_id (str): ID of the document to update. document (Document): Document to update. """ - return self.update_documents(collection_name, [document_id], [document]) + return self.update_documents( + collection_name, [document_id], [document], **kwargs + ) def update_documents( - self, collection_name: str, ids: List[str], documents: List[Document] + self, + collection_name: str, + ids: list[str], + documents: list[Document], + **kwargs: Any, ) -> None: """Update a document in the collection. Args: - ids (List[str]): List of ids of the document to update. 
- documents (List[Document]): List of documents to update. + ids (list[str]): List of ids of the document to update. + documents (list[Document]): List of documents to update. """ - text = [document.page_content for document in documents] - metadata = [ - _validate_vdms_properties(document.metadata) for document in documents - ] - embeddings = self._embed_documents(text) - - self.__update( - collection_name, - ids, - metadatas=metadata, - embeddings=embeddings, - documents=text, - ) - - -# VDMS UTILITY - - -def _add_descriptor( - command_str: str, - setname: str, - label: Optional[str] = None, - ref: Optional[int] = None, - props: Optional[dict] = None, - link: Optional[dict] = None, - k_neighbors: Optional[int] = None, - constraints: Optional[dict] = None, - results: Optional[dict] = None, -) -> Dict[str, Dict[str, Any]]: - entity: Dict[str, Any] = {"set": setname} + kwargs["delete_existing"] = True + self.add_documents(documents=documents, ids=ids, **kwargs) - if "Add" in command_str and label: - entity["label"] = label + def upsert(self, documents: list[Document], /, **kwargs: Any) -> list[str] | None: + """Update/Insert documents to the vectorstore. - if ref is not None: - entity["_ref"] = ref - - if props not in INVALID_METADATA_VALUE: - entity["properties"] = props + Args: + ids: IDs to update - Let's call get_pks to get ids with expression \n + documents (list[Document]): Documents to add to the vectorstore. - if "Add" in command_str and link is not None: - entity["link"] = link + Returns: + list[str]: IDs of the added texts. + """ + # For now, simply delete and add + # We could do something more efficient to update metadata, + # but we don't support changing the embedding of a descriptor. 
+ ids: list[str] + + if documents is None or len(documents) == 0: + logger.debug("No documents to upsert.") + return None + + if "ids" in kwargs: + ids = kwargs.get("ids", []) + if ids and len(ids) != len(documents): + raise ValueError( + "The number of ids must match the number of documents. " + "Got {len(ids)} ids and {len(documents)} documents." + ) + else: + ids = [ + item.id + for item in documents + if hasattr(item, "id") and item.id is not None + ] - if "Find" in command_str and k_neighbors is not None: - entity["k_neighbors"] = int(k_neighbors) + try: + if ids is not None and len(ids): + kwargs["ids"] = ids + kwargs["delete_existing"] = True + return self.add_documents(documents=documents, **kwargs) + except Exception as e: + logger.error( + "Failed to upsert entities: %s error: %s", self.collection_name, e + ) + raise e - if "Find" in command_str and constraints not in INVALID_METADATA_VALUE: - entity["constraints"] = constraints - if "Find" in command_str and results not in INVALID_METADATA_VALUE: - entity["results"] = results +class VDMS_Utils: + def __init__(self, client: vdms.vdms) -> None: + self.client = client - query = {command_str: entity} - return query - - -def _add_descriptorset( - command_str: str, - name: str, - num_dims: Optional[int] = None, - engine: Optional[str] = None, - metric: Optional[str] = None, - ref: Optional[int] = None, - props: Optional[Dict] = None, - link: Optional[Dict] = None, - storeIndex: bool = False, - constraints: Optional[Dict] = None, - results: Optional[Dict] = None, -) -> Dict[str, Any]: - if command_str == "AddDescriptorSet" and all( - var is not None for var in [name, num_dims] - ): - entity: Dict[str, Any] = { - "name": name, - "dimensions": num_dims, - } + # Check connection to client + if not self.client.is_connected(): + raise ValueError( + "VDMS client must be connected to a VDMS server." 
+ + "Please use VDMS_Client to establish a connection" + ) - if engine is not None: - entity["engine"] = engine + def add_descriptor( + self, + command_str: str, + setname: str, + label: Optional[str] = None, + ref: Optional[int] = None, + props: Optional[Union[dict, list]] = None, + link: Optional[dict] = None, + k_neighbors: Optional[int] = None, + constraints: Optional[dict] = None, + results: Optional[dict] = None, + ) -> dict[str, dict[str, Any]]: + entity: dict[str, Any] = {"set": setname} - if metric is not None: - entity["metric"] = metric + if "Add" in command_str and label: + entity["label"] = label if ref is not None: entity["_ref"] = ref - if props not in [None, {}]: + if isinstance(props, list) and len(props) > 1: + entity["batch_properties"] = props + elif ( + isinstance(props, list) + and len(props) == 1 + and props[0] not in INVALID_METADATA_VALUE + ): + entity["properties"] = props[0] + elif isinstance(props, dict) and props not in INVALID_METADATA_VALUE: entity["properties"] = props - if link is not None: + if "Add" in command_str and link is not None: entity["link"] = link - elif command_str == "FindDescriptorSet": - entity = {"set": name} - - if storeIndex: - entity["storeIndex"] = storeIndex + if "Find" in command_str and k_neighbors is not None: + entity["k_neighbors"] = int(k_neighbors) - if constraints not in [None, {}]: + if "Find" in command_str and constraints not in INVALID_METADATA_VALUE: entity["constraints"] = constraints - if results is not None: + if "Find" in command_str and results not in INVALID_METADATA_VALUE: entity["results"] = results - else: - raise ValueError(f"Unknown command: {command_str}") + query = {command_str: entity} + return query - query = {command_str: entity} - return query + def add_descriptor_set( + self, + command_str: str, + name: str, + num_dims: Optional[int] = None, + engine: Optional[str] = None, + metric: Optional[str] = None, + ref: Optional[int] = None, + props: Optional[dict] = None, + link: 
Optional[dict] = None, + storeIndex: bool = False, + constraints: Optional[dict] = None, + results: Optional[dict] = None, + ) -> dict[str, Any]: + if command_str == "AddDescriptorSet" and all( + var is not None for var in [name, num_dims] + ): + entity: dict[str, Any] = { + "name": name, + "dimensions": num_dims, + } + if engine is not None: + entity["engine"] = engine -def _add_entity_with_blob( - collection_name: str, all_properties: List -) -> Tuple[Dict[str, Any], bytes]: - all_properties_str = ",".join(all_properties) if len(all_properties) > 0 else "" + if metric is not None: + entity["metric"] = metric - querytype = "AddEntity" - entity: Dict[str, Any] = {} - entity["class"] = "properties" - entity["blob"] = True # New + if ref is not None: + entity["_ref"] = ref - props: Dict[str, Any] = {"name": collection_name} - props["type"] = "queryable properties" - props["content"] = all_properties_str - entity["properties"] = props + if props not in [None, {}]: + entity["properties"] = props - byte_data = _str2bytes(all_properties_str) + if link is not None: + entity["link"] = link - query: Dict[str, Any] = {} - query[querytype] = entity - return query, byte_data + elif command_str == "FindDescriptorSet": + entity = {"set": name} + if storeIndex: + entity["storeIndex"] = storeIndex -def _build_property_query( - collection_name: str, - command_type: str = "find", - all_properties: List = [], - ref: Optional[int] = None, -) -> Tuple[Any, Any]: - all_queries: List[Any] = [] - blob_arr: List[Any] = [] + if constraints not in [None, {}]: + entity["constraints"] = constraints - choices = ["find", "add", "update"] - if command_type.lower() not in choices: - raise ValueError("[!] Invalid type. 
Choices are : {}".format(",".join(choices))) + if results is not None: + entity["results"] = results - if command_type.lower() == "find": - query = _find_property_entity(collection_name, unique_entity=True) - all_queries.append(query) + else: + raise ValueError(f"Unknown command: {command_str}") - elif command_type.lower() == "add": - query, byte_data = _add_entity_with_blob(collection_name, all_properties) - all_queries.append(query) - blob_arr.append(byte_data) + query = {command_str: entity} + return query - elif command_type.lower() == "update": - # Find & Delete - query = _find_property_entity(collection_name, deletion=True) - all_queries.append(query) + def add_entity_with_blob( + self, collection_name: str, all_properties: list + ) -> Tuple[dict[str, Any], bytes]: + all_properties_str = ",".join(all_properties) if len(all_properties) > 0 else "" - # Add - query, byte_data = _add_entity_with_blob(collection_name, all_properties) - all_queries.append(query) - blob_arr.append(byte_data) + querytype = "AddEntity" + entity: dict[str, Any] = {} + entity["class"] = "properties" + entity["blob"] = True # New + + props: dict[str, Any] = {"name": collection_name} + props["type"] = "queryable properties" + props[TEXT_PROPERTY] = all_properties_str + entity["properties"] = props + + byte_data = self.str2bytes(all_properties_str) + + query: dict[str, Any] = {} + query[querytype] = entity + return query, byte_data + + def build_property_query( + self, + collection_name: str, + command_type: str = "find", + all_properties: list = [], + ref: Optional[int] = None, + ) -> Tuple[Any, Any]: + all_queries: list[Any] = [] + blob_arr: list[Any] = [] + + choices = ["find", "add", "update"] + if command_type.lower() not in choices: + raise ValueError( + "[!] Invalid type. 
Choices are : {}".format(",".join(choices)) + ) - return all_queries, blob_arr + if command_type.lower() == "find": + query = self.find_property_entity(collection_name, unique_entity=True) + all_queries.append(query) + elif command_type.lower() == "add": + query, byte_data = self.add_entity_with_blob( + collection_name, all_properties + ) + all_queries.append(query) + blob_arr.append(byte_data) -def _bytes2embedding(blob: bytes) -> Any: - emb = np.frombuffer(blob, dtype="float32") - return emb + elif command_type.lower() == "update": + # Find & Delete + query = self.find_property_entity(collection_name, deletion=True) + all_queries.append(query) + # Add + query, byte_data = self.add_entity_with_blob( + collection_name, all_properties + ) + all_queries.append(query) + blob_arr.append(byte_data) + + return all_queries, blob_arr + + def bytes2embedding(self, blob: bytes) -> Any: + emb = np.frombuffer(blob, dtype="float32") + return emb + + def bytes2str(self, in_bytes: bytes) -> str: + return in_bytes.decode() + + def check_valid_response( + self, all_queries: list[dict], response: Any + ) -> tuple[Any, bool]: + cmd_list = self.get_cmds_from_query(all_queries) + valid_res = isinstance(response, list) and any( + cmd in response[0] + and "returned" in response[0][cmd] + and response[0][cmd]["returned"] > 0 + for cmd in cmd_list + ) -def _bytes2str(in_bytes: bytes) -> str: - return in_bytes.decode() + # ID required + for ridx, res in enumerate(response): + if "FindDescriptor" in res: + ent = res["FindDescriptor"].get("entities", []) + response[ridx]["FindDescriptor"]["entities"] = [ + e for e in ent if LANGCHAIN_ID_PROPERTY in e + ] + response[ridx]["FindDescriptor"]["returned"] = len( + response[ridx]["FindDescriptor"]["entities"] + ) + return response, valid_res + + # def check_descriptor_exists_by_id( + # self, + # client: vdms.vdms, + # setname: str, + # id: str, + # ) -> Tuple[bool, Any]: + # constraints = {LANGCHAIN_ID_PROPERTY: ["==", id]} + # findDescriptor = 
self.add_descriptor( + # "FindDescriptor", + # setname, + # constraints=constraints, + # results={"list": [LANGCHAIN_ID_PROPERTY], "count": ""}, + # ) + # all_queries = [findDescriptor] + # res, _ = client.query(all_queries) + + # res, valid_res = self.check_valid_response(all_queries, res) + # return valid_res, findDescriptor + + def embedding2bytes( + self, embedding: Union[list[float], None] + ) -> Union[bytes, None]: + """Convert embedding to bytes.""" + + blob = None + if embedding is not None: + emb = np.array(embedding, dtype="float32") + blob = emb.tobytes() + return blob + + def find_property_entity( + self, + collection_name: str, + unique_entity: Optional[bool] = False, + deletion: Optional[bool] = False, + ) -> dict[str, dict[str, Any]]: + querytype = "FindEntity" + entity: dict[str, Any] = {} + entity["class"] = "properties" + if unique_entity: + entity["unique"] = unique_entity + + results: dict[str, Any] = {} + results["blob"] = True + results["count"] = "" + results["list"] = [TEXT_PROPERTY] + entity["results"] = results -def _get_cmds_from_query(all_queries: list) -> List[str]: - return list(set([k for q in all_queries for k in q.keys()])) + constraints: dict[str, Any] = {} + if deletion: + constraints["_deletion"] = ["==", 1] + constraints["name"] = ["==", collection_name] + entity["constraints"] = constraints + query: dict[str, Any] = {} + query[querytype] = entity + return query -def _check_valid_response(all_queries: List[dict], response: Any) -> bool: - cmd_list = _get_cmds_from_query(all_queries) - valid_res = isinstance(response, list) and any( - cmd in response[0] - and "returned" in response[0][cmd] - and response[0][cmd]["returned"] > 0 - for cmd in cmd_list - ) - return valid_res + def get_cmds_from_query(self, all_queries: list) -> list[str]: + return list(set([k for q in all_queries for k in q.keys()])) + def get_descriptor_response( + self, + command_str: str, + setname: str, + k_neighbors: int = DEFAULT_K, + fetch_k: int = 
DEFAULT_FETCH_K, + constraints: Optional[dict] = None, + results: Optional[dict[str, Any]] = None, + query_embedding: Optional[list[float]] = None, + normalize_distance: bool = False, + ) -> Tuple[list[dict[str, Any]], list]: + all_blobs: list[Any] = [] + if k_neighbors >= fetch_k: + raise ValueError(f"Provide fetch_k > k; Currently set at {fetch_k}") + blob = self.embedding2bytes(query_embedding) + if blob is not None: + all_blobs.append(blob) -def _check_descriptor_exists_by_id( - client: vdms.vdms, - setname: str, - id: str, -) -> Tuple[bool, Any]: - constraints = {"id": ["==", id]} - findDescriptor = _add_descriptor( - "FindDescriptor", - setname, - constraints=constraints, - results={"list": ["id"], "count": ""}, - ) - all_queries = [findDescriptor] - res, _ = client.query(all_queries) + if constraints is None: + # K results returned + response, response_array = self.get_k_candidates( + setname=setname, + k=fetch_k, + results=results, + all_blobs=all_blobs, + ) - valid_res = _check_valid_response(all_queries, res) - return valid_res, findDescriptor + if ( + len(response) > 0 + and command_str in response[0] + and "entities" in response[0][command_str] + ): + new_entities = response[0][command_str]["entities"][:k_neighbors] + response[0][command_str]["entities"] = new_entities + response[0][command_str]["returned"] = len(new_entities) + response_array = response_array[: len(new_entities)] + else: + if results is None: + results = {"list": [LANGCHAIN_ID_PROPERTY]} + elif "list" not in results: + results["list"] = [LANGCHAIN_ID_PROPERTY] + elif LANGCHAIN_ID_PROPERTY not in results["list"]: + results["list"].append(LANGCHAIN_ID_PROPERTY) + # (1) Find docs satisfy constraints + query = self.add_descriptor( + command_str, + setname, + constraints=constraints, + results=results, + ) + response, response_array = self.run_vdms_query([query]) + if command_str in response[0] and response[0][command_str]["returned"] > 0: + ids_of_interest = [ + ent[LANGCHAIN_ID_PROPERTY] 
+ for ent in response[0][command_str]["entities"] + ] + else: + return [], [] -def embedding2bytes(embedding: Union[List[float], None]) -> Union[bytes, None]: - """Convert embedding to bytes.""" + # (2) Find top fetch_k results + response, response_array = self.get_k_candidates( + setname=setname, + k=fetch_k, + results=results, + all_blobs=all_blobs, + ) + if command_str not in response[0] or ( + command_str in response[0] and response[0][command_str]["returned"] == 0 + ): + return [], [] - blob = None - if embedding is not None: - emb = np.array(embedding, dtype="float32") - blob = emb.tobytes() - return blob + # (3) Intersection of (1) & (2) using ids + new_entities = [] + for ent in response[0][command_str]["entities"]: + if ent[LANGCHAIN_ID_PROPERTY] in ids_of_interest: + new_entities.append(ent) + if len(new_entities) == k_neighbors: + break + response[0][command_str]["entities"] = new_entities + response[0][command_str]["returned"] = len(new_entities) + response_array = response_array[: len(new_entities)] + if len(new_entities) < k_neighbors: + p_str = "Returned items < k_neighbors; Try increasing fetch_k" + logger.warning(p_str) + if normalize_distance: + max_dist = max( + [ent["_distance"] for ent in response[0][command_str]["entities"]] + ) + max_dist = 1.0 if max_dist in [0, np.inf] else max_dist + for ent_idx, ent in enumerate(response[0][command_str]["entities"]): + ent["_distance"] = ent["_distance"] / max_dist + response[0][command_str]["entities"][ent_idx]["_distance"] = ent[ + "_distance" + ] + return response, response_array -def _find_property_entity( - collection_name: str, - unique_entity: Optional[bool] = False, - deletion: Optional[bool] = False, -) -> Dict[str, Dict[str, Any]]: - querytype = "FindEntity" - entity: Dict[str, Any] = {} - entity["class"] = "properties" - if unique_entity: - entity["unique"] = unique_entity + def get_k_candidates( + self, + setname: str, + k: Optional[int] = None, + results: Optional[dict[str, Any]] = None, + 
all_blobs: Optional[list] = None, + ) -> Tuple[list[dict[str, Any]], list]: + command_str = "FindDescriptor" + query = self.add_descriptor( + command_str, + setname, + k_neighbors=k, + results=results, + ) + response, response_array = self.run_vdms_query([query], all_blobs) - results: Dict[str, Any] = {} - results["blob"] = True - results["count"] = "" - results["list"] = ["content"] - entity["results"] = results + if "FailedCommand" in response[0]: + return [], [] - constraints: Dict[str, Any] = {} - if deletion: - constraints["_deletion"] = ["==", 1] - constraints["name"] = ["==", collection_name] - entity["constraints"] = constraints + return response, response_array - query: Dict[str, Any] = {} - query[querytype] = entity - return query + def get_properties( + self, + collection_name: str, + unique_entity: Optional[bool] = False, + deletion: Optional[bool] = False, + ) -> list[str]: + find_query = self.find_property_entity( + collection_name, unique_entity=unique_entity, deletion=deletion + ) + response, response_blob = self.run_vdms_query([find_query]) + if len(response_blob) > 0: + collection_properties = self.bytes2str(response_blob[0]).split(",") + else: + collection_properties = deepcopy(DEFAULT_PROPERTIES) + collection_properties.sort() + return collection_properties + def run_vdms_query( + self, + all_queries: list[dict], + all_blobs: Optional[list] = [], + print_last_response: Optional[bool] = False, + ) -> Tuple[Any, Any]: + response, response_array = self.client.query(all_queries, all_blobs) -def _str2bytes(in_str: str) -> bytes: - return str.encode(in_str) + response, _ = self.check_valid_response(all_queries, response) + if print_last_response: + self.client.print_last_response() + return response, response_array + def str2bytes(self, in_str: str) -> bytes: + return str.encode(in_str) -def _validate_vdms_properties(metadata: Dict[str, Any]) -> Dict: - new_metadata: Dict[str, Any] = {} - for key, value in metadata.items(): - if not isinstance(value, 
list): - new_metadata[str(key)] = value - return new_metadata + def validate_vdms_properties(self, metadata: dict[str, Any]) -> dict: + new_metadata: dict[str, Any] = {} + for key, value in metadata.items(): + if not isinstance(value, list): + new_metadata[str(key)] = value + return new_metadata diff --git a/libs/community/tests/integration_tests/vectorstores/test_vdms.py b/libs/community/tests/integration_tests/vectorstores/test_vdms.py index a453a0fc20df3..7d6f71a38427a 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_vdms.py +++ b/libs/community/tests/integration_tests/vectorstores/test_vdms.py @@ -4,372 +4,31 @@ import logging import os -from typing import TYPE_CHECKING +import uuid import pytest -from langchain_core.documents import Document +from langchain_tests.integration_tests.vectorstores import VectorStoreIntegrationTests -from langchain_community.vectorstores import VDMS -from langchain_community.vectorstores.vdms import VDMS_Client, embedding2bytes -from tests.integration_tests.vectorstores.fake_embeddings import ( - ConsistentFakeEmbeddings, - FakeEmbeddings, -) - -if TYPE_CHECKING: - import vdms +from langchain_community.vectorstores.vdms import VDMS, VDMS_Client logging.basicConfig(level=logging.DEBUG) -embedding_function = FakeEmbeddings() -# The connection string matches the default settings in the docker-compose file -# located in the root of the repository: [root]/docker/docker-compose.yml # To spin up a detached VDMS server: -# cd [root]/docker -# docker compose up -d vdms -@pytest.fixture -@pytest.mark.enable_socket -def vdms_client() -> vdms.vdms: - return VDMS_Client( - host=os.getenv("VDMS_DBHOST", "localhost"), - port=int(os.getenv("VDMS_DBPORT", 6025)), - ) - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_init_from_client(vdms_client: vdms.vdms) -> None: - _ = VDMS( # type: ignore[call-arg] - embedding=embedding_function, - client=vdms_client, - ) - - -@pytest.mark.requires("vdms") 
-@pytest.mark.enable_socket -def test_from_texts_with_metadatas(vdms_client: vdms.vdms) -> None: - """Test end to end construction and search.""" - collection_name = "test_from_texts_with_metadatas" - texts = ["foo", "bar", "baz"] - ids = [f"test_from_texts_with_metadatas_{i}" for i in range(len(texts))] - metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [ - Document(page_content="foo", metadata={"page": "1", "id": ids[0]}) - ] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_from_texts_with_metadatas_with_scores(vdms_client: vdms.vdms) -> None: - """Test end to end construction and scored search.""" - collection_name = "test_from_texts_with_metadatas_with_scores" - texts = ["foo", "bar", "baz"] - ids = [f"test_from_texts_with_metadatas_with_scores_{i}" for i in range(len(texts))] - metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch.similarity_search_with_score("foo", k=1, fetch_k=1) - assert output == [ - (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0) - ] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_from_texts_with_metadatas_with_scores_using_vector( - vdms_client: vdms.vdms, -) -> None: - """Test end to end construction and scored search, using embedding vector.""" - collection_name = "test_from_texts_with_metadatas_with_scores_using_vector" - texts = ["foo", "bar", "baz"] - ids = [f"test_from_texts_with_metadatas_{i}" for i in range(len(texts))] - metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] - docsearch = 
VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch._similarity_search_with_relevance_scores("foo", k=1) - assert output == [ - (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0) - ] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_search_filter(vdms_client: vdms.vdms) -> None: - """Test end to end construction and search with metadata filtering.""" - collection_name = "test_search_filter" - texts = ["far", "bar", "baz"] - ids = [f"test_search_filter_{i}" for i in range(len(texts))] - metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch.similarity_search( - "far", k=1, filter={"first_letter": ["==", "f"]} - ) - assert output == [ - Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]}) - ] - output = docsearch.similarity_search( - "far", k=2, filter={"first_letter": ["==", "b"]} - ) - assert output == [ - Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}), - Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}), - ] +# docker pull intellabs/vdms:latest +# docker run -d -p $VDMS_DBPORT:55555 intellabs/vdms:latest -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_search_filter_with_scores(vdms_client: vdms.vdms) -> None: - """Test end to end construction and scored search with metadata filtering.""" - collection_name = "test_search_filter_with_scores" - texts = ["far", "bar", "baz"] - ids = [f"test_search_filter_with_scores_{i}" for i in range(len(texts))] - metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - 
embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch.similarity_search_with_score( - "far", k=1, filter={"first_letter": ["==", "f"]} - ) - assert output == [ - ( - Document(page_content="far", metadata={"first_letter": "f", "id": ids[0]}), - 0.0, +class TestVDMSStandard(VectorStoreIntegrationTests): + @pytest.fixture + def vectorstore(self) -> VDMS: + test_name = uuid.uuid4().hex + client = VDMS_Client( + host=os.getenv("VDMS_DBHOST", "localhost"), + port=int(os.getenv("VDMS_DBPORT", 6025)), + ) + return VDMS( + client=client, + embedding=self.get_embeddings(), + collection_name=test_name, ) - ] - - output = docsearch.similarity_search_with_score( - "far", k=2, filter={"first_letter": ["==", "b"]} - ) - assert output == [ - ( - Document(page_content="bar", metadata={"first_letter": "b", "id": ids[1]}), - 1.0, - ), - ( - Document(page_content="baz", metadata={"first_letter": "b", "id": ids[2]}), - 4.0, - ), - ] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_mmr(vdms_client: vdms.vdms) -> None: - """Test end to end construction and search.""" - collection_name = "test_mmr" - texts = ["foo", "bar", "baz"] - ids = [f"test_mmr_{i}" for i in range(len(texts))] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch.max_marginal_relevance_search("foo", k=1) - assert output == [Document(page_content="foo", metadata={"id": ids[0]})] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_mmr_by_vector(vdms_client: vdms.vdms) -> None: - """Test end to end construction and search.""" - collection_name = "test_mmr_by_vector" - texts = ["foo", "bar", "baz"] - ids = [f"test_mmr_by_vector_{i}" for i in range(len(texts))] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - collection_name=collection_name, 
- client=vdms_client, - ) - embedded_query = embedding_function.embed_query("foo") - output = docsearch.max_marginal_relevance_search_by_vector(embedded_query, k=1) - assert output == [Document(page_content="foo", metadata={"id": ids[0]})] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_with_include_parameter(vdms_client: vdms.vdms) -> None: - """Test end to end construction and include parameter.""" - collection_name = "test_with_include_parameter" - texts = ["foo", "bar", "baz"] - docsearch = VDMS.from_texts( - texts=texts, - embedding=embedding_function, - collection_name=collection_name, - client=vdms_client, - ) - - response, response_array = docsearch.get(collection_name, include=["embeddings"]) - for emb in embedding_function.embed_documents(texts): - assert embedding2bytes(emb) in response_array - - response, response_array = docsearch.get(collection_name) - assert response_array == [] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_update_document(vdms_client: vdms.vdms) -> None: - """Test the update_document function in the VDMS class.""" - collection_name = "test_update_document" - - # Make a consistent embedding - const_embedding_function = ConsistentFakeEmbeddings() - - # Initial document content and id - initial_content = "foo" - document_id = "doc1" - - # Create an instance of Document with initial content and metadata - original_doc = Document(page_content=initial_content, metadata={"page": "1"}) - - # Initialize a VDMS instance with the original document - docsearch = VDMS.from_documents( - client=vdms_client, - collection_name=collection_name, - documents=[original_doc], - embedding=const_embedding_function, - ids=[document_id], - ) - old_response, old_embedding = docsearch.get( - collection_name, - constraints={"id": ["==", document_id]}, - include=["metadata", "embeddings"], - ) - # old_embedding = response_array[0] - - # Define updated content for the document - updated_content = "updated foo" - 
- # Create a new Document instance with the updated content and the same id - updated_doc = Document(page_content=updated_content, metadata={"page": "1"}) - - # Update the document in the VDMS instance - docsearch.update_document( - collection_name, document_id=document_id, document=updated_doc - ) - - # Perform a similarity search with the updated content - output = docsearch.similarity_search(updated_content, k=3)[0] - - # Assert that the updated document is returned by the search - assert output == Document( - page_content=updated_content, metadata={"page": "1", "id": document_id} - ) - - # Assert that the new embedding is correct - new_response, new_embedding = docsearch.get( - collection_name, - constraints={"id": ["==", document_id]}, - include=["metadata", "embeddings"], - ) - # new_embedding = response_array[0] - - assert new_embedding[0] == embedding2bytes( - const_embedding_function.embed_documents([updated_content])[0] - ) - assert new_embedding != old_embedding - - assert ( - new_response[0]["FindDescriptor"]["entities"][0]["content"] - != old_response[0]["FindDescriptor"]["entities"][0]["content"] - ) - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_with_relevance_score(vdms_client: vdms.vdms) -> None: - """Test to make sure the relevance score is scaled to 0-1.""" - collection_name = "test_with_relevance_score" - texts = ["foo", "bar", "baz"] - ids = [f"test_relevance_scores_{i}" for i in range(len(texts))] - metadatas = [{"page": str(i)} for i in range(1, len(texts) + 1)] - docsearch = VDMS.from_texts( - texts=texts, - ids=ids, - embedding=embedding_function, - metadatas=metadatas, - collection_name=collection_name, - client=vdms_client, - ) - output = docsearch._similarity_search_with_relevance_scores("foo", k=3) - assert output == [ - (Document(page_content="foo", metadata={"page": "1", "id": ids[0]}), 0.0), - (Document(page_content="bar", metadata={"page": "2", "id": ids[1]}), 0.25), - (Document(page_content="baz", 
metadata={"page": "3", "id": ids[2]}), 1.0), - ] - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_add_documents_no_metadata(vdms_client: vdms.vdms) -> None: - collection_name = "test_add_documents_no_metadata" - db = VDMS( # type: ignore[call-arg] - collection_name=collection_name, - embedding=embedding_function, - client=vdms_client, - ) - db.add_documents([Document(page_content="foo")]) - - -@pytest.mark.requires("vdms") -@pytest.mark.enable_socket -def test_add_documents_mixed_metadata(vdms_client: vdms.vdms) -> None: - collection_name = "test_add_documents_mixed_metadata" - db = VDMS( # type: ignore[call-arg] - collection_name=collection_name, - embedding=embedding_function, - client=vdms_client, - ) - - docs = [ - Document(page_content="foo"), - Document(page_content="bar", metadata={"baz": 1}), - ] - ids = ["10", "11"] - actual_ids = db.add_documents(docs, ids=ids) - assert actual_ids == ids - - search = db.similarity_search("foo bar", k=2) - docs[0].metadata = {"id": ids[0]} - docs[1].metadata["id"] = ids[1] - assert sorted(search, key=lambda d: d.page_content) == sorted( - docs, key=lambda d: d.page_content - )