Skip to content

Commit

Permalink
add metadata subset columns to query
Browse files Browse the repository at this point in the history
  • Loading branch information
lspataroG committed Jun 28, 2024
1 parent 4e85436 commit a33b972
Show file tree
Hide file tree
Showing 2 changed files with 1,785 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import pandas as pd
from pydantic import Field
from typing import Any, Dict, List, Optional, Union
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
Expand Down Expand Up @@ -28,33 +27,32 @@ class BruteForceBQVectorStore(BaseBigQueryVectorStore):
embedding_field: Name of the column storing text embeddings (default:
"embedding").
doc_id_field: Name of the column storing document IDs (default: "doc_id").
additional_columns_to_query: Metadata columns to select during the query;
if None, queries all the columns (equivalent of SELECT *).
credentials: Optional Google Cloud credentials object.
"""

# In-memory copy of the BigQuery table; populated by the root validator.
df: Optional[pd.DataFrame] = None
# Embedding matrix (one row per document) and its cached transpose,
# used for dot-product similarity search.
vectors: Any = None
vectors_transpose: Any = None
# Non-embedding columns of ``df`` as a list of record dicts.
df_records: Optional[List[Dict]] = None
# Metadata columns to SELECT when reading the table; None means SELECT *.
additional_columns_to_query: Optional[List[str]] = None

@root_validator(pre=False, skip_on_failure=True)
def initialize_bf_vector_index(cls, values: dict) -> dict:
    """Eagerly load the BigQuery table into memory after field validation.

    Queries the table via ``_query_table_to_df``, caches the embeddings as
    a NumPy matrix plus its transpose (so each similarity query is a single
    matrix product), and keeps the non-embedding columns as record dicts
    for metadata lookup.

    Args:
        values: The validated field values of the model.

    Returns:
        ``values`` with ``df``, ``vectors``, ``vectors_transpose`` and
        ``df_records`` populated.
    """
    values["df"] = cls._query_table_to_df(values)
    values["vectors"] = np.array(values["df"][values["embedding_field"]].tolist())
    # Pre-compute the transpose once instead of per query.
    values["vectors_transpose"] = values["vectors"].T
    # Everything except the embedding column, as a list of record dicts.
    values["df_records"] = (
        values["df"].drop(columns=[values["embedding_field"]]).to_dict("records")
    )
    return values

@classmethod
def sync_data(cls, values):
    """Intentionally a no-op.

    The table is loaded eagerly by the ``initialize_bf_vector_index``
    root validator, so there is nothing left to synchronize here.
    """

def _similarity_search_by_vectors_with_scores_and_embeddings(
self,
Expand Down Expand Up @@ -123,18 +121,29 @@ def _similarity_search_by_vectors_with_scores_and_embeddings(
)
documents.append(query_docs)
return documents

def _query_table_to_df(self) -> pd.DataFrame:

@staticmethod
def _query_table_to_df(values) -> pd.DataFrame:
    """Read the vector store's BigQuery table into a pandas DataFrame.

    Selects every column (``SELECT *``) unless
    ``additional_columns_to_query`` is set, in which case only the doc-id,
    content and embedding columns plus the requested metadata columns are
    fetched.

    Args:
        values: The model's field values; must provide ``_full_table_id``,
            ``additional_columns_to_query``, ``doc_id_field``,
            ``content_field``, ``embedding_field``, ``_logger`` and
            ``_bq_client``.

    Returns:
        The query result as a DataFrame.
    """
    from google.cloud import bigquery

    table = values["_full_table_id"]
    metadata_fields = values["additional_columns_to_query"]
    if metadata_fields is None:
        # No explicit subset requested: fetch every column.
        select_str = "*"
    else:
        columns = [
            values["doc_id_field"],
            values["content_field"],
            values["embedding_field"],
            *metadata_fields,
        ]
        select_str = ", ".join(columns)
    query = f"SELECT {select_str} FROM {table}"
    # Lazy %-args keep the line within the length limit (fixes the Ruff
    # E501 failure flagged on this line) and defer interpolation.
    values["_logger"].info(
        "Reading data from %s. It might take a few minutes...", table
    )
    job_config = bigquery.QueryJobConfig(
        use_query_cache=True,
        priority=bigquery.QueryPriority.INTERACTIVE,
    )
    query_job = values["_bq_client"].query(query, job_config=job_config)
    return query_job.to_dataframe()

Expand Down
1,754 changes: 1,754 additions & 0 deletions libs/rag_qa_with_bq_and_featurestore.ipynb

Large diffs are not rendered by default.

0 comments on commit a33b972

Please sign in to comment.