Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

weaviate: migrate from weaviate python client v3 to v4 #463

Merged
merged 63 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
c8847af
upgrade to latest weaviate server
hsm207 Feb 22, 2024
885a777
upgrade to latest weaviate client
hsm207 Feb 22, 2024
104a981
reformat code
hsm207 Feb 22, 2024
fe7b1a3
create client using v4 api
hsm207 Feb 22, 2024
115d965
use v4 api to create collection
hsm207 Feb 22, 2024
e88b1ac
store collection obj for convenience
hsm207 Feb 22, 2024
2e8a498
Merge branch 'main' of https://github.com/deepset-ai/haystack-core-in…
hsm207 Feb 26, 2024
e45c7b5
upgrade filters to use v4 api
hsm207 Feb 27, 2024
3791501
upgrade batch write to use v4 api
hsm207 Feb 27, 2024
9d70bde
use v4 api cursor to retrieve all docs
hsm207 Feb 27, 2024
7a121b8
upgrade query with filters to use v4 api
hsm207 Feb 27, 2024
3e57ba2
upgrade filter documents to use v4 API
hsm207 Feb 27, 2024
c9544bd
update weaviate fixture to align with v4 API
hsm207 Feb 27, 2024
6c8c43a
update v4 to v3 conversion logic
hsm207 Feb 27, 2024
25d72ad
fix typo
hsm207 Feb 27, 2024
887e35f
fix date v4 to v3 conversion logic
hsm207 Feb 27, 2024
0e481d0
hardcode limit in query filter
hsm207 Feb 27, 2024
b4c5b49
fix typo
hsm207 Feb 27, 2024
d79c417
upgrade weaviate server
hsm207 Feb 27, 2024
c777658
update v4 to v3 object date conversion
hsm207 Feb 27, 2024
5e7b431
fix invert logic bug
hsm207 Feb 27, 2024
67ec755
upgrade delete function to v4 API
hsm207 Feb 27, 2024
3404229
update bm25 search to v4 API
hsm207 Feb 27, 2024
d9a5862
update count docs to v4 API
hsm207 Feb 27, 2024
d2a5280
update _write to use v4 API
hsm207 Feb 27, 2024
995287d
support optional filters in bm25
hsm207 Feb 27, 2024
756b9ef
update embedding retrieval to use v4 API
hsm207 Feb 27, 2024
4a31050
Merge branch 'main' of https://github.com/deepset-ai/haystack-core-in…
hsm207 Feb 27, 2024
928c88b
update from_dict for v4 API
hsm207 Feb 28, 2024
16b42b1
fix write invalid input test
hsm207 Feb 28, 2024
04983fe
update other test_from_dict for V4
hsm207 Feb 28, 2024
f3f49f3
update test_to_dict for v4
hsm207 Feb 28, 2024
1655f0f
update test_init for v4 API
hsm207 Feb 28, 2024
66c1fd0
try to pass test_init
hsm207 Feb 28, 2024
e2ea14c
pass test_init
hsm207 Feb 28, 2024
8ee53fd
add exception handling in _query_paginated
hsm207 Feb 28, 2024
1be0ec9
remove commented out code
hsm207 Feb 28, 2024
5b9d2b2
remove dead code
hsm207 Feb 28, 2024
6c4fd71
Merge branch 'main' of https://github.com/deepset-ai/haystack-core-in…
hsm207 Feb 28, 2024
d45c7b9
Merge branch 'main' into weaviate-client-v4
hsm207 Feb 29, 2024
2c3e446
Merge branch 'main' into weaviate-client-v4
hsm207 Feb 29, 2024
8d72423
remove commented out code
hsm207 Feb 29, 2024
3f8fa4c
Merge branch 'weaviate-client-v4' of https://github.com/hsm207/haysta…
hsm207 Feb 29, 2024
f35ef05
return weaviate traceback too when query error occurs
hsm207 Feb 29, 2024
bdbe86a
make _query_paginated return an iterator
hsm207 Mar 1, 2024
3463245
Merge branch 'main' of https://github.com/deepset-ai/haystack-core-in…
hsm207 Mar 2, 2024
6be11af
refactor _to_document
hsm207 Mar 4, 2024
6a6013b
Merge branch 'main' of https://github.com/deepset-ai/haystack-core-in…
hsm207 Mar 4, 2024
0fa483e
remove v4 to v3 object conv fn
hsm207 Mar 4, 2024
1202aa5
update to_dict serialization
hsm207 Mar 4, 2024
9647151
update test case
hsm207 Mar 4, 2024
207f427
update weaviate server
hsm207 Mar 5, 2024
9a30bdd
updates due to latest client changes
hsm207 Mar 5, 2024
f620aaf
update test case due to latest client changes
hsm207 Mar 5, 2024
1a8f367
Merge branch 'main' into weaviate-client-v4
hsm207 Mar 5, 2024
46ade1f
Merge branch 'main' into weaviate-client-v4
hsm207 Mar 6, 2024
7ff8dde
Fix filter converters return types
silvanocerza Mar 12, 2024
9727a1d
Rework query methods
silvanocerza Mar 12, 2024
8cfcf3d
Fix batch writing errors
silvanocerza Mar 12, 2024
a18e914
Handle different vector types in _to_document
silvanocerza Mar 12, 2024
5a7cc21
Add pagination tests
silvanocerza Mar 12, 2024
811f6bc
Merge branch 'main' into weaviate-client-v4
hsm207 Mar 13, 2024
1771e1d
Fix pagination test
silvanocerza Mar 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion integrations/weaviate/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ services:
- '8080'
- --scheme
- http
image: semitechnologies/weaviate:1.23.2
image: semitechnologies/weaviate:1.24.1
ports:
- 8080:8080
- 50051:50051
Expand Down
2 changes: 1 addition & 1 deletion integrations/weaviate/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ classifiers = [
]
dependencies = [
"haystack-ai",
"weaviate-client==3.*",
"weaviate-client",
"haystack-pydoc-tools",
"python-dateutil",
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
from haystack.errors import FilterError
from pandas import DataFrame

import weaviate
from weaviate.collections.classes.filters import Filter, FilterReturn

def convert_filters(filters: Dict[str, Any]) -> Dict[str, Any]:

def convert_filters(filters: Dict[str, Any]) -> FilterReturn:
"""
Convert filters from Haystack format to Weaviate format.
"""
Expand All @@ -14,7 +17,7 @@ def convert_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
raise FilterError(msg)

if "field" in filters:
return {"operator": "And", "operands": [_parse_comparison_condition(filters)]}
return Filter.all_of([_parse_comparison_condition(filters)])
return _parse_logical_condition(filters)


Expand All @@ -29,7 +32,7 @@ def convert_filters(filters: Dict[str, Any]) -> Dict[str, Any]:
"not in": "in",
"AND": "OR",
"OR": "AND",
"NOT": "AND",
"NOT": "OR",
}


Expand All @@ -51,7 +54,13 @@ def _invert_condition(filters: Dict[str, Any]) -> Dict[str, Any]:
return inverted_condition


def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
LOGICAL_OPERATORS = {
"AND": Filter.all_of,
"OR": Filter.any_of,
}


def _parse_logical_condition(condition: Dict[str, Any]) -> FilterReturn:
if "operator" not in condition:
msg = f"'operator' key missing in {condition}"
raise FilterError(msg)
Expand All @@ -67,7 +76,7 @@ def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
operands.append(_parse_logical_condition(c))
else:
operands.append(_parse_comparison_condition(c))
return {"operator": operator.lower().capitalize(), "operands": operands}
return LOGICAL_OPERATORS[operator](operands)
elif operator == "NOT":
inverted_conditions = _invert_condition(condition)
return _parse_logical_condition(inverted_conditions)
Expand All @@ -76,28 +85,6 @@ def _parse_logical_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
raise FilterError(msg)


def _infer_value_type(value: Any) -> str:
if value is None:
return "valueNull"

if isinstance(value, bool):
return "valueBoolean"
if isinstance(value, int):
return "valueInt"
if isinstance(value, float):
return "valueNumber"

if isinstance(value, str):
try:
parser.isoparse(value)
return "valueDate"
except ValueError:
return "valueText"

msg = f"Unknown value type {type(value)}"
raise FilterError(msg)


def _handle_date(value: Any) -> str:
if isinstance(value, str):
try:
Expand All @@ -107,25 +94,22 @@ def _handle_date(value: Any) -> str:
return value


def _equal(field: str, value: Any) -> Dict[str, Any]:
def _equal(field: str, value: Any) -> FilterReturn:
if value is None:
return {"path": field, "operator": "IsNull", "valueBoolean": True}
return {"path": field, "operator": "Equal", _infer_value_type(value): _handle_date(value)}
return weaviate.classes.query.Filter.by_property(field).is_none(True)
return weaviate.classes.query.Filter.by_property(field).equal(_handle_date(value))


def _not_equal(field: str, value: Any) -> Dict[str, Any]:
def _not_equal(field: str, value: Any) -> FilterReturn:
if value is None:
return {"path": field, "operator": "IsNull", "valueBoolean": False}
return {
"operator": "Or",
"operands": [
{"path": field, "operator": "NotEqual", _infer_value_type(value): _handle_date(value)},
{"path": field, "operator": "IsNull", "valueBoolean": True},
],
}
return weaviate.classes.query.Filter.by_property(field).is_none(False)

return weaviate.classes.query.Filter.by_property(field).not_equal(
_handle_date(value)
) | weaviate.classes.query.Filter.by_property(field).is_none(True)

def _greater_than(field: str, value: Any) -> Dict[str, Any]:

def _greater_than(field: str, value: Any) -> FilterReturn:
if value is None:
# When the value is None and '>' is used we create a filter that would return a Document
# if it has a field set and not set at the same time.
Expand All @@ -144,10 +128,10 @@ def _greater_than(field: str, value: Any) -> Dict[str, Any]:
if type(value) in [list, DataFrame]:
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return {"path": field, "operator": "GreaterThan", _infer_value_type(value): _handle_date(value)}
return weaviate.classes.query.Filter.by_property(field).greater_than(_handle_date(value))


def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
def _greater_than_equal(field: str, value: Any) -> FilterReturn:
if value is None:
# When the value is None and '>=' is used we create a filter that would return a Document
# if it has a field set and not set at the same time.
Expand All @@ -166,10 +150,10 @@ def _greater_than_equal(field: str, value: Any) -> Dict[str, Any]:
if type(value) in [list, DataFrame]:
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return {"path": field, "operator": "GreaterThanEqual", _infer_value_type(value): _handle_date(value)}
return weaviate.classes.query.Filter.by_property(field).greater_or_equal(_handle_date(value))


def _less_than(field: str, value: Any) -> Dict[str, Any]:
def _less_than(field: str, value: Any) -> FilterReturn:
if value is None:
# When the value is None and '<' is used we create a filter that would return a Document
# if it has a field set and not set at the same time.
Expand All @@ -188,10 +172,10 @@ def _less_than(field: str, value: Any) -> Dict[str, Any]:
if type(value) in [list, DataFrame]:
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return {"path": field, "operator": "LessThan", _infer_value_type(value): _handle_date(value)}
return weaviate.classes.query.Filter.by_property(field).less_than(_handle_date(value))


def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
def _less_than_equal(field: str, value: Any) -> FilterReturn:
if value is None:
# When the value is None and '<=' is used we create a filter that would return a Document
# if it has a field set and not set at the same time.
Expand All @@ -210,22 +194,23 @@ def _less_than_equal(field: str, value: Any) -> Dict[str, Any]:
if type(value) in [list, DataFrame]:
msg = f"Filter value can't be of type {type(value)} using operators '>', '>=', '<', '<='"
raise FilterError(msg)
return {"path": field, "operator": "LessThanEqual", _infer_value_type(value): _handle_date(value)}
return weaviate.classes.query.Filter.by_property(field).less_or_equal(_handle_date(value))


def _in(field: str, value: Any) -> Dict[str, Any]:
def _in(field: str, value: Any) -> FilterReturn:
if not isinstance(value, list):
msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators"
raise FilterError(msg)

return {"operator": "And", "operands": [_equal(field, v) for v in value]}
return weaviate.classes.query.Filter.by_property(field).contains_any(value)


def _not_in(field: str, value: Any) -> Dict[str, Any]:
def _not_in(field: str, value: Any) -> FilterReturn:
if not isinstance(value, list):
msg = f"{field}'s value must be a list when using 'in' or 'not in' comparators"
raise FilterError(msg)
return {"operator": "And", "operands": [_not_equal(field, v) for v in value]}
operands = [weaviate.classes.query.Filter.by_property(field).not_equal(v) for v in value]
return Filter.all_of(operands)


COMPARISON_OPERATORS = {
Expand All @@ -240,7 +225,7 @@ def _not_in(field: str, value: Any) -> Dict[str, Any]:
}


def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
def _parse_comparison_condition(condition: Dict[str, Any]) -> FilterReturn:
field: str = condition["field"]

if field.startswith("meta."):
Expand All @@ -265,15 +250,11 @@ def _parse_comparison_condition(condition: Dict[str, Any]) -> Dict[str, Any]:
return COMPARISON_OPERATORS[operator](field, value)


def _match_no_document(field: str) -> Dict[str, Any]:
def _match_no_document(field: str) -> FilterReturn:
"""
Returns a filter that will match no Document; this is used to keep the behavior consistent
between different Document Stores.
"""
return {
"operator": "And",
"operands": [
{"path": field, "operator": "IsNull", "valueBoolean": False},
{"path": field, "operator": "IsNull", "valueBoolean": True},
],
}

operands = [weaviate.classes.query.Filter.by_property(field).is_none(val) for val in [False, True]]
return Filter.all_of(operands)
Loading