Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: destination vector db add option for embedding without field names #91

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
6 changes: 6 additions & 0 deletions airbyte_cdk/destinations/vector_db_based/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ class ProcessingConfigModel(BaseModel):
always_show=True,
examples=["text", "user.name", "users.*.name"],
)
omit_field_names_from_embeddings: bool = Field(
default=False,
title="Omit field names from embeddings",
description="Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: [email protected]'). If set to true, only the values are embedded (e.g., 'John Doe \n [email protected]').",
always_show=True,
)
metadata_fields: Optional[List[str]] = Field(
default=[],
title="Fields to store as metadata",
Expand Down
23 changes: 21 additions & 2 deletions airbyte_cdk/destinations/vector_db_based/document_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Mapping, Optional, Tuple
from typing import Any, Dict, List, Mapping, Optional, Tuple, Union

import dpath
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
Expand Down Expand Up @@ -126,6 +126,7 @@ def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCata
self.text_fields = config.text_fields
self.metadata_fields = config.metadata_fields
self.field_name_mappings = config.field_name_mappings
self.omit_field_names_from_embeddings = config.omit_field_names_from_embeddings
self.logger = logging.getLogger("airbyte.document_processor")

def process(self, record: AirbyteRecordMessage) -> Tuple[List[Chunk], Optional[str]]:
Expand Down Expand Up @@ -163,10 +164,28 @@ def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]
relevant_fields = self._extract_relevant_fields(record, self.text_fields)
if len(relevant_fields) == 0:
return None
text = stringify_dict(relevant_fields)
text = self._generate_text_from_fields(relevant_fields)
metadata = self._extract_metadata(record)
return Document(page_content=text, metadata=metadata)

def _generate_text_from_fields(self, fields: Dict[str, Any]) -> str:
if self.omit_field_names_from_embeddings:
return self._extract_values_from_dict(fields)
else:
return stringify_dict(fields)

def _extract_values_from_dict(
self, data: Union[Dict[Any, Any], List[Any], Any], join_char: str = "\n"
) -> str:
if data is None:
return ""
elif isinstance(data, dict):
return join_char.join(self._extract_values_from_dict(value) for value in data.values())
elif isinstance(data, list):
return join_char.join(self._extract_values_from_dict(item) for item in data)
else:
return str(data)

def _extract_relevant_fields(
self, record: AirbyteRecordMessage, fields: Optional[List[str]]
) -> Dict[str, Any]:
Expand Down
7 changes: 7 additions & 0 deletions unit_tests/destinations/vector_db_based/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,13 @@ def test_json_schema_generation():
"type": "array",
"items": {"type": "string"},
},
"omit_field_names_from_embeddings": {
"title": "Omit field names from embeddings",
"description": "Do not include the field names in the text that gets embedded. By default field names are embedded (e.g., 'user.name: John Doe \n user.email: [email protected]'). If set to true, only the values are embedded (e.g., 'John Doe \n [email protected]').",
"default": False,
"always_show": True,
"type": "boolean",
},
"metadata_fields": {
"title": "Fields to store as metadata",
"description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.",
Expand Down
58 changes: 58 additions & 0 deletions unit_tests/destinations/vector_db_based/document_processor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ def test_complex_text_fields():
"non.*.existing",
]
processor.metadata_fields = ["non_text", "non_text_2", "id"]
processor.omit_field_names_from_embeddings = False

chunks, _ = processor.process(record)

Expand All @@ -214,6 +215,63 @@ def test_complex_text_fields():
}


def test_complex_text_fields_omit_field_names():
processor = initialize_processor()

record = AirbyteRecordMessage(
stream="stream1",
namespace="namespace1",
data={
"id": 1,
"nested": {
"texts": [
{"text": "This is the text"},
{"text": "And another"},
]
},
"non_text": "a",
"non_text_2": 1,
"text": "This is the regular text",
"other_nested": {"non_text": {"a": "xyz", "b": "abc"}},
"empty_list": [],
"empty_dict": {},
"large_nested": {"a": {"b": {"c": {"d": {"e": {"f": {"g": "h"}}}}}}},
},
emitted_at=1234,
)

processor.text_fields = [
"nested.texts.*.text",
"text",
"other_nested.non_text",
"non.*.existing",
"large_nested",
"empty_list",
"empty_dict",
]
processor.metadata_fields = ["non_text", "non_text_2", "id"]
processor.omit_field_names_from_embeddings = True

chunks, _ = processor.process(record)

assert len(chunks) == 1
assert (
chunks[0].page_content
== """This is the text
And another
This is the regular text
xyz
abc
h"""
)
assert chunks[0].metadata == {
"id": 1,
"non_text": "a",
"non_text_2": 1,
"_ab_stream": "namespace1_stream1",
}


def test_no_text_fields():
processor = initialize_processor()

Expand Down
Loading