diff --git a/tests/restful_client_v2/base/testbase.py b/tests/restful_client_v2/base/testbase.py index c4d0d3f2bb07e..7d127a34f74e7 100644 --- a/tests/restful_client_v2/base/testbase.py +++ b/tests/restful_client_v2/base/testbase.py @@ -101,7 +101,8 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim= batch_size = batch_size batch = nb // batch_size remainder = nb % batch_size - data = [] + + full_data = [] insert_ids = [] for i in range(batch): nb = batch_size @@ -116,6 +117,7 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim= assert rsp['code'] == 0 if return_insert_id: insert_ids.extend(rsp['data']['insertIds']) + full_data.extend(data) # insert remainder data if remainder: nb = remainder @@ -128,10 +130,11 @@ def init_collection(self, collection_name, pk_field="id", metric_type="L2", dim= assert rsp['code'] == 0 if return_insert_id: insert_ids.extend(rsp['data']['insertIds']) + full_data.extend(data) if return_insert_id: - return schema_payload, data, insert_ids + return schema_payload, full_data, insert_ids - return schema_payload, data + return schema_payload, full_data def wait_collection_load_completed(self, name): t0 = time.time() diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py index 3d7703f4a6f4a..b3ecfe0808ec0 100644 --- a/tests/restful_client_v2/testcases/test_vector_operations.py +++ b/tests/restful_client_v2/testcases/test_vector_operations.py @@ -4,8 +4,10 @@ import sys import json import time + +import utils.utils from utils import constant -from utils.utils import gen_collection_name +from utils.utils import gen_collection_name, get_sorted_distance from utils.util_log import test_log as logger import pytest from base.testbase import TestBase @@ -921,12 +923,10 @@ def test_upsert_vector_pk_auto_id(self, nb, dim, insert_round, id_type): @pytest.mark.L0 class TestSearchVector(TestBase): - @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @pytest.mark.parametrize("is_partition_key", [True]) @pytest.mark.parametrize("enable_dynamic_schema", [True]) - @pytest.mark.skip(reason="behavior change;todo:@zhuwenxing") @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [16]) def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, auto_id, @@ -1011,14 +1011,7 @@ def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, aut "filter": "word_count > 100", "groupingField": "user_id", "outputFields": ["*"], - "searchParams": { - "metricType": "COSINE", - "params": { - "radius": "0.1", - "range_filter": "0.8" - } - }, - "limit": 100, + "limit": 100 } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 @@ -1032,10 +1025,10 @@ def test_search_vector_with_all_vector_datatype(self, nb, dim, insert_round, aut @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) - @pytest.mark.skip(reason="behavior change;todo:@zhuwenxing") @pytest.mark.parametrize("nq", [1, 2]) + @pytest.mark.parametrize("metric_type", ['COSINE', "L2", "IP"]) def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, auto_id, - is_partition_key, enable_dynamic_schema, nq): + is_partition_key, enable_dynamic_schema, nq, metric_type): """ Insert a vector with a simple payload """ @@ -1056,7 +1049,7 @@ def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, a ] }, "indexParams": [ - {"fieldName": "float_vector", "indexName": "float_vector", "metricType": "COSINE"}, + {"fieldName": "float_vector", "indexName": "float_vector", "metricType": metric_type}, ] } rsp = self.collection_client.collection_create(payload) @@ -1100,13 +1093,6 @@ def test_search_vector_with_float_vector_datatype(self, nb, dim, insert_round, a "filter": "word_count > 100", "groupingField": "user_id", "outputFields": ["*"], - "searchParams": { - "metricType": "COSINE", - "params": { - "radius": "0.1", - "range_filter": "0.8" - } - }, "limit": 100, } rsp = self.vector_client.vector_search(payload) @@ -1227,8 +1213,8 @@ def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_r @pytest.mark.parametrize("enable_dynamic_schema", [True]) @pytest.mark.parametrize("nb", [3000]) @pytest.mark.parametrize("dim", [128]) - @pytest.mark.skip(reason="behavior change;todo:@zhuwenxing") - def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round, auto_id, + @pytest.mark.parametrize("metric_type", ['HAMMING']) + def test_search_vector_with_binary_vector_datatype(self, metric_type, nb, dim, insert_round, auto_id, is_partition_key, enable_dynamic_schema): """ Insert a vector with a simple payload @@ -1250,7 +1236,7 @@ def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round, ] }, "indexParams": [ - {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": "HAMMING", + {"fieldName": "binary_vector", "indexName": "binary_vector", "metricType": metric_type, "params": {"index_type": "BIN_IVF_FLAT", "nlist": "512"}} ] } @@ -1301,13 +1287,6 @@ def test_search_vector_with_binary_vector_datatype(self, nb, dim, insert_round, "data": [gen_vector(datatype="BinaryVector", dim=dim)], "filter": "word_count > 100", "outputFields": ["*"], - "searchParams": { - "metricType": "HAMMING", - "params": { - "radius": "0.1", - "range_filter": "0.8" - } - }, "limit": 100, } rsp = self.vector_client.vector_search(payload) @@ -1549,6 +1528,130 @@ def test_search_vector_with_complex_int64_varchar_and_filter(self, filter_expr): if "like" in varchar_expr: assert name.startswith(prefix) + @pytest.mark.parametrize("consistency_level", ["Strong", "Bounded", "Eventually", "Session"]) + def test_search_vector_with_consistency_level(self, consistency_level): + """ + Search a vector with different consistency level + """ + name = gen_collection_name() + self.name = name + nb = 200 + dim = 128 + limit = 100 + schema_payload, data = self.init_collection(name, dim=dim, nb=nb) + names = [] + for item in data: + names.append(item.get("name")) + names.sort() + logger.info(f"names: {names}") + mid = len(names) // 2 + prefix = names[mid][0:2] + vector_field = schema_payload.get("vectorField") + # search data + vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() + output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) + payload = { + "collectionName": name, + "data": [vector_to_search], + "outputFields": output_fields, + "limit": limit, + "offset": 0, + "consistencyLevel": consistency_level + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 0 + res = rsp['data'] + logger.info(f"res: {len(res)}") + assert len(res) == limit + + @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"]) + def test_search_vector_with_range_search(self, metric_type): + """ + Search a vector with range search with different metric type + """ + name = gen_collection_name() + self.name = name + nb = 3000 + dim = 128 + limit = 100 + schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type) + vector_field = schema_payload.get("vectorField") + # search data + vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() + training_data = [item[vector_field] for item in data] + distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type) + r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct + if metric_type == "L2": + r1, r2 = r2, r1 + output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) + payload = { + "collectionName": name, + "data": [vector_to_search], + "outputFields": output_fields, + "limit": limit, + "offset": 0, + "searchParams": { + "params": { + "radius": r1, + "range_filter": r2, + } + } + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 0 + res = rsp['data'] + logger.info(f"res: {len(res)}") + assert len(res) == limit + for item in res: + distance = item.get("distance") + if metric_type == "L2": + assert r1 > distance > r2 + else: + assert r1 < distance < r2 + + @pytest.mark.parametrize("ignore_growing", [True, False]) + def test_search_vector_with_ignore_growing(self, ignore_growing): + """ + Search a vector with range search with different metric type + """ + name = gen_collection_name() + self.name = name + metric_type = "COSINE" + nb = 1000 + dim = 128 + limit = 100 + schema_payload, data = self.init_collection(name, dim=dim, nb=nb, metric_type=metric_type) + vector_field = schema_payload.get("vectorField") + # search data + vector_to_search = preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() + training_data = [item[vector_field] for item in data] + distance_sorted = get_sorted_distance(training_data, [vector_to_search], metric_type) + r1, r2 = distance_sorted[0][nb//2], distance_sorted[0][nb//2+limit+int((0.2*limit))] # recall is not 100% so add 20% to make sure the range is correct + if metric_type == "L2": + r1, r2 = r2, r1 + output_fields = get_common_fields_by_data(data, exclude_fields=[vector_field]) + + payload = { + "collectionName": name, + "data": [vector_to_search], + "outputFields": output_fields, + "limit": limit, + "offset": 0, + "searchParams": { + "ignore_growing": ignore_growing + + } + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 0 + res = rsp['data'] + logger.info(f"res: {len(res)}") + if ignore_growing is True: + assert len(res) == 0 + else: + assert len(res) == limit + + @pytest.mark.L1 class TestSearchVectorNegative(TestBase): diff --git a/tests/restful_client_v2/utils/utils.py b/tests/restful_client_v2/utils/utils.py index cbd7640edf0eb..0c93e566cd99d 100644 --- a/tests/restful_client_v2/utils/utils.py +++ b/tests/restful_client_v2/utils/utils.py @@ -10,7 +10,7 @@ import requests from loguru import logger import datetime - +from sklearn.metrics import pairwise_distances fake = Faker() rng = np.random.default_rng() @@ -240,4 +240,28 @@ def get_all_fields_by_data(data, exclude_fields=None): return list(fields) +def ip_distance(x, y): + return np.dot(x, y) + + +def cosine_distance(u, v, epsilon=1e-8): + dot_product = np.dot(u, v) + norm_u = np.linalg.norm(u) + norm_v = np.linalg.norm(v) + return dot_product / (max(norm_u * norm_v, epsilon)) + + +def l2_distance(u, v): + return np.sum((u - v) ** 2) + +def get_sorted_distance(train_emb, test_emb, metric_type): + milvus_sklearn_metric_map = { + "L2": l2_distance, + "COSINE": cosine_distance, + "IP": ip_distance + } + distance = pairwise_distances(train_emb, Y=test_emb, metric=milvus_sklearn_metric_map[metric_type], n_jobs=-1) + distance = np.array(distance.T, order='C', dtype=np.float16) + distance_sorted = np.sort(distance, axis=1).tolist() + return distance_sorted