Skip to content

Commit

Permalink
add metadata subset columns to query
Browse files Browse the repository at this point in the history
  • Loading branch information
lspataroG committed Jun 28, 2024
1 parent 4e85436 commit a33b972
Show file tree
Hide file tree
Showing 2 changed files with 1,785 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import pandas as pd
from pydantic import Field
from typing import Any, Dict, List, Optional, Union
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import root_validator
Expand Down Expand Up @@ -28,33 +27,32 @@ class BruteForceBQVectorStore(BaseBigQueryVectorStore):
embedding_field: Name of the column storing text embeddings (default:
"embedding").
doc_id_field: Name of the column storing document IDs (default: "doc_id").
additional_columns_to_query: Metadata columns to select during the query;
if None, queries all the columns (equivalent of SELECT *).
credentials: Optional Google Cloud credentials object.
"""

# In-memory copy of the BigQuery table; populated by the root validator.
df: Optional[pd.DataFrame] = None
# Embedding matrix (one row per document) and its cached transpose,
# used for dot-product similarity search.
vectors: Any = None
vectors_transpose: Any = None
# Non-embedding columns of ``df`` as a list of record dicts.
df_records: Optional[List[Dict]] = None
# Metadata columns to SELECT when reading the table; None means SELECT *.
additional_columns_to_query: Optional[List[str]] = None

@root_validator(pre=False, skip_on_failure=True)
def initialize_bf_vector_index(cls, values: dict) -> dict:
    """Eagerly load the BigQuery table into memory after field validation.

    Queries the table via ``_query_table_to_df``, caches the embeddings as
    a NumPy matrix plus its transpose (so each similarity query is a single
    matrix product), and keeps the non-embedding columns as record dicts
    for metadata lookup.

    Args:
        values: The validated field values of the model.

    Returns:
        ``values`` with ``df``, ``vectors``, ``vectors_transpose`` and
        ``df_records`` populated.
    """
    values["df"] = cls._query_table_to_df(values)
    values["vectors"] = np.array(values["df"][values["embedding_field"]].tolist())
    # Pre-compute the transpose once instead of per query.
    values["vectors_transpose"] = values["vectors"].T
    # Everything except the embedding column, as a list of record dicts.
    values["df_records"] = (
        values["df"].drop(columns=[values["embedding_field"]]).to_dict("records")
    )
    return values

@classmethod
def sync_data(cls, values):
    """Intentionally a no-op.

    The table is loaded eagerly by the ``initialize_bf_vector_index``
    root validator, so there is nothing left to synchronize here.
    """

def _similarity_search_by_vectors_with_scores_and_embeddings(
self,
Expand Down Expand Up @@ -123,18 +121,29 @@ def _similarity_search_by_vectors_with_scores_and_embeddings(
)
documents.append(query_docs)
return documents

def _query_table_to_df(self) -> pd.DataFrame:

@staticmethod
def _query_table_to_df(values) -> pd.DataFrame:
    """Read the vector store's BigQuery table into a pandas DataFrame.

    Selects every column (``SELECT *``) unless
    ``additional_columns_to_query`` is set, in which case only the doc-id,
    content and embedding columns plus the requested metadata columns are
    fetched.

    Args:
        values: The model's field values; must provide ``_full_table_id``,
            ``additional_columns_to_query``, ``doc_id_field``,
            ``content_field``, ``embedding_field``, ``_logger`` and
            ``_bq_client``.

    Returns:
        The query result as a DataFrame.
    """
    from google.cloud import bigquery

    table = values["_full_table_id"]
    metadata_fields = values["additional_columns_to_query"]
    if metadata_fields is None:
        # No explicit subset requested: fetch every column.
        select_str = "*"
    else:
        columns = [
            values["doc_id_field"],
            values["content_field"],
            values["embedding_field"],
            *metadata_fields,
        ]
        select_str = ", ".join(columns)
    query = f"SELECT {select_str} FROM {table}"
    # Lazy %-args keep the line within the length limit (fixes the Ruff
    # E501 failure flagged on this line) and defer interpolation.
    values["_logger"].info(
        "Reading data from %s. It might take a few minutes...", table
    )
    job_config = bigquery.QueryJobConfig(
        use_query_cache=True,
        priority=bigquery.QueryPriority.INTERACTIVE,
    )
    query_job = values["_bq_client"].query(query, job_config=job_config)
    return query_job.to_dataframe()

Expand Down
1,754 changes: 1,754 additions & 0 deletions libs/rag_qa_with_bq_and_featurestore.ipynb

Large diffs are not rendered by default.

0 comments on commit a33b972

Please sign in to comment.