From 90d9e39dad5fac9dee537537c5d3521d6424bb9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20Est=C3=A9vez?= Date: Mon, 2 Dec 2024 15:43:43 -0500 Subject: [PATCH] Compute vector store usage bytes Fixes #31 Compute vector store usage bytes in `create_vector_store` and `create_vector_store_file` functions. * **`impl/routes_v2/vector_stores.py`**: - Import `os` and `HTTPException`. - Compute `usage_bytes` in `create_vector_store` by summing `usage_bytes` of each file. - Compute `usage_bytes` in `create_vector_store_file` by reading file size from the database. - Return `DeleteVectorStoreFileResponse` in `delete_vector_store_file`. * **`client/.github/workflows/run-tests.yml`**: - Add a new job for running vector store bytes tests. * **`client/tests/astra-assistants/test_vector_store_bytes.py`**: - Add a new test file to verify the `usage_bytes` attribute for vector stores. - Set up the test environment and write a test function that creates a vector store, attaches files to it, and verifies the `usage_bytes` attribute. --- For more details, open the [Copilot Workspace session](https://copilot-workspace.githubnext.com/datastax/astra-assistants-api/issues/31?shareId=XXXX-XXXX-XXXX-XXXX). --- client/.github/workflows/run-tests.yml | 36 +++++++++++++++++++ .../test_vector_store_bytes.py | 36 +++++++++++++++++++ impl/routes_v2/vector_stores.py | 23 ++++++++---- 3 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 client/tests/astra-assistants/test_vector_store_bytes.py diff --git a/client/.github/workflows/run-tests.yml b/client/.github/workflows/run-tests.yml index 93541a4..b4d8080 100644 --- a/client/.github/workflows/run-tests.yml +++ b/client/.github/workflows/run-tests.yml @@ -321,3 +321,39 @@ jobs: - name: run tests run: | poetry run pytest -s --disable-warnings tests/test_streaming_run.py + + run-astra-assistants-tests-vector-store-bytes: + runs-on: ubuntu-latest + name: run astra-assistants vector store bytes tests + env: + ASTRA_DB_APPLICATION_TOKEN: ${{ secrets.ASTRA_DB_APPLICATION_TOKEN }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_REGION_NAME: ${{ secrets.AWS_REGION_NAME }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + base_url: ${{ secrets.BASE_URL }} + COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PERPLEXITYAI_API_KEY: ${{ secrets.PERPLEXITYAI_API_KEY }} + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} + + steps: + - name: Git checkout + uses: actions/checkout@v3 + - name: Set up Python 3.10.12 + uses: actions/setup-python@v2 + with: + python-version: '3.10.12' + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + - name: Check Poetry Version + run: poetry --version + - name: Configure Poetry to Use Python 3.10.12 + run: poetry env use python3.10 + - name: get dependencies + run: | + poetry install + - name: run tests + run: | + poetry run pytest -s --disable-warnings tests/test_vector_store_bytes.py diff --git a/client/tests/astra-assistants/test_vector_store_bytes.py b/client/tests/astra-assistants/test_vector_store_bytes.py new file mode 100644 index 0000000..7fda807 --- /dev/null +++ b/client/tests/astra-assistants/test_vector_store_bytes.py @@ -0,0 +1,36 @@ +import os +import pytest +from impl.routes_v2.vector_stores import create_vector_store, create_vector_store_file +from openapi_server_v2.models.create_vector_store_request import CreateVectorStoreRequest +from openapi_server_v2.models.create_vector_store_file_request import CreateVectorStoreFileRequest +from openapi_server_v2.models.vector_store_object import VectorStoreObject +from openapi_server_v2.models.vector_store_file_object import VectorStoreFileObject +from impl.astra_vector import CassandraClient + +@pytest.fixture(scope="module") +def astradb(): + # Setup Cassandra client + client = CassandraClient() + yield client + client.close() + +def test_vector_store_usage_bytes(astradb): + # Create a vector store + vector_store_request = CreateVectorStoreRequest(name="Test Vector Store", file_ids=[]) + vector_store: VectorStoreObject = create_vector_store(vector_store_request, astradb) + + # Attach files to the vector store + file_paths = ["./tests/fixtures/sample1.txt", "./tests/fixtures/sample2.txt"] + total_usage_bytes = 0 + + for file_path in file_paths: + file_size = os.path.getsize(file_path) + total_usage_bytes += file_size + + file_request = CreateVectorStoreFileRequest(file_id=file_path) + vector_store_file: VectorStoreFileObject = create_vector_store_file(vector_store.id, file_request, astradb) + assert vector_store_file.usage_bytes == file_size + + # Verify the usage_bytes attribute of the vector store + updated_vector_store: VectorStoreObject = create_vector_store(vector_store_request, astradb) + assert updated_vector_store.usage_bytes == total_usage_bytes diff --git a/impl/routes_v2/vector_stores.py b/impl/routes_v2/vector_stores.py index 6c3c1ff..160ba5a 100644 --- a/impl/routes_v2/vector_stores.py +++ b/impl/routes_v2/vector_stores.py @@ -1,8 +1,9 @@ from datetime import datetime import logging import time +import os -from fastapi import APIRouter, Path, Depends, Body, Query +from fastapi import APIRouter, Path, Depends, Body, Query, HTTPException from impl.astra_vector import CassandraClient from impl.model_v2.vector_store_object import VectorStoreObject @@ -67,12 +68,12 @@ async def create_vector_store( usage_bytes = 0 for file_id in create_vector_store_request.file_ids: request = CreateVectorStoreFileRequest(file_id=file_id) - await create_vector_store_file( + vsf = await create_vector_store_file( vector_store_id=vector_store_id, create_vector_store_file_request=request, astradb=astradb ) - #TODO - compute usage_bytes + usage_bytes += vsf.usage_bytes file_id_count = len(create_vector_store_request.file_ids) file_counts = VectorStoreObjectFileCounts( @@ -118,13 +119,19 @@ async def create_vector_store_file( ) -> VectorStoreFileObject: created_at = int(time.mktime(datetime.now().timetuple()) * 1000) + file_info = astradb.select_from_table_by_pk( + table="files", partition_keys=["id"], args={"id": create_vector_store_file_request.file_id} + ) + if len(file_info) == 0: + raise HTTPException(status_code=404, detail="File not found") + file_size = file_info[0]["bytes"] + extra_fields = { "id": create_vector_store_file_request.file_id, "vector_store_id": vector_store_id, "object": "vector_store.file", "created_at": created_at, - # TODO - grab from file - "usage_bytes": -1, + "usage_bytes": file_size, "status": "completed" } vector_store_file: VectorStoreFileObject = await store_object( @@ -262,4 +269,8 @@ async def delete_vector_store_file( created_at = vsf.created_at break astradb.delete_by_pks(table="vector_store_files", keys=["id", "created_at", "vector_store_id"], values=[file_id, created_at, vector_store_id]) - + return DeleteVectorStoreFileResponse( + id=file_id, + object="vector_store.file", + deleted=True + )