test: add bulk insert case for text match feature
Signed-off-by: zhuwenxing <[email protected]>
zhuwenxing committed Sep 20, 2024
1 parent dce6734 commit 3f85e02
Showing 2 changed files with 39 additions and 0 deletions.
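In outline: the shared data generators gain a text_scalar column whose values all embed the token "milvus", the all-field schemas in the bulk-insert tests gain a match-enabled string field, and each import format (JSON rows, numpy, Parquet) is verified with a TextMatch query that should return every imported entity. A minimal sketch of the schema-side addition, not part of the diff (the two imports are assumptions about how these test modules are usually pulled in; the helper call itself is taken from the hunks below):

from common import common_func as cf  # assumed import path for the suite's helpers
from common.bulk_insert_data import DataField as df  # assumed alias

# a string field with text match enabled, as added to each test schema below
text_field = cf.gen_string_field(name=df.text_field, enable_match=True)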
tests/python_client/common/bulk_insert_data.py (30 additions, 0 deletions)
@@ -34,6 +34,7 @@ class DataField:
    fp16_vec_field = "float16_vec_field"
    int_field = "int_scalar"
    string_field = "string_scalar"
    text_field = "text_scalar"
    bool_field = "bool_scalar"
    float_field = "float_scalar"
    double_field = "double_scalar"
@@ -403,6 +404,21 @@ def gen_string_in_numpy_file(dir, data_field, rows, start=0, force=False):
    return file_name


def gen_text_in_numpy_file(dir, data_field, rows, start=0, force=False):
    file_name = f"{data_field}.npy"
    file = f"{dir}/{file_name}"
    if not os.path.exists(file) or force:
        data = []
        if rows > 0:
            # pad the keyword with spaces so "milvus" stays a standalone token,
            # matching the row-based generators and the TextMatch assertions
            data = [fake.text() + " milvus " for _ in range(start, rows + start)]
        arr = np.array(data)
        log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
        np.save(file, arr)
    return file_name
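A quick round-trip of the generated file, as a sketch only and not part of the diff (the /tmp/bulk_data path is hypothetical; dir is whatever the caller passes, and np.save stores the strings as a fixed-width unicode array):

import numpy as np

arr = np.load("/tmp/bulk_data/text_scalar.npy")  # hypothetical location
print(arr.dtype, arr.shape)  # e.g. <U230 (rows,)
assert all("milvus" in s for s in arr)  # every row embeds the match keyword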


def gen_dynamic_field_in_numpy_file(dir, rows, start=0, force=False):
    file_name = "$meta.npy"
    file = f"{dir}/{file_name}"
@@ -553,6 +569,11 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
            data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
        else:
            data = [None for _ in range(start, rows + start)]
    elif data_field == DataField.text_field:
        if not nullable:
            data = [fake.text() + " milvus " for _ in range(start, rows + start)]
        else:
            data = [None for _ in range(start, rows + start)]
    elif data_field == DataField.bool_field:
        if not nullable:
            data = [random.choice([True, False]) for _ in range(start, rows + start)]
@@ -598,6 +619,8 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
        else:
            data = pd.Series(
                [np.array(None) for _ in range(start, rows + start)])
    else:
        raise Exception(f"unsupported field name: {data_field}")
    return data
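The keyword is padded with spaces so it survives tokenization as a standalone term; a small illustration of that assumption, not part of the diff (a fresh Faker instance stands in for the module's existing fake):

from faker import Faker

fake = Faker()
row = fake.text() + " milvus "
# "milvus" appears as its own whitespace-delimited token, which is what a
# TextMatch(text_scalar, 'milvus') query is expected to hit for every row
assert "milvus" in row.split()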


@@ -714,6 +737,9 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
            elif data_field == DataField.string_field:
                if not nullable:
                    d[data_field] = gen_unique_str(str(r + start))
            elif data_field == DataField.text_field:
                if not nullable:
                    d[data_field] = fake.text() + " milvus "
            elif data_field == DataField.bool_field:
                if not nullable:
                    d[data_field] = random.choice([True, False])
@@ -746,6 +772,8 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                    d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
                else:
                    d[data_field] = None
            else:
                raise Exception(f"unsupported field name: {data_field}")
        if enable_dynamic_field:
            d[str(r + start)] = r + start
            d["name"] = fake.name()
@@ -845,6 +873,8 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
                                             vector_type=vector_type, rows=rows, dim=dim, force=force)
        elif data_field == DataField.string_field:  # string field for numpy not supported yet at 2022-10-17
            file_name = gen_string_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
        elif data_field == DataField.text_field:
            file_name = gen_text_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
        elif data_field == DataField.bool_field:
            file_name = gen_bool_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
        elif data_field == DataField.json_field:
tests/python_client/testcases/test_bulk_insert.py (9 additions, 0 deletions)
@@ -770,6 +770,7 @@ def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities
            cf.gen_int64_field(name=df.int_field, nullable=nullable),
            cf.gen_float_field(name=df.float_field, nullable=nullable),
            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
            cf.gen_string_field(name=df.text_field, enable_match=True),
            cf.gen_json_field(name=df.json_field, nullable=nullable),
            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -895,6 +896,8 @@ def test_bulk_insert_all_field_with_new_json_format(self, auto_id, dim, entities
            query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
            res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
            assert len(res) == len(query_data)
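        # every imported row's text_scalar embeds the token "milvus",
        # so this text-match query is expected to return all entities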
res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
assert len(res) == entities
if enable_partition_key:
assert len(self.collection_wrap.partitions) > 1

@@ -929,6 +932,7 @@ def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_d
            cf.gen_int64_field(name=df.int_field, nullable=nullable),
            cf.gen_float_field(name=df.float_field),
            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
            cf.gen_string_field(name=df.text_field, enable_match=True),
            cf.gen_json_field(name=df.json_field),
            cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
            cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
@@ -1042,6 +1046,8 @@ def test_bulk_insert_all_field_with_numpy(self, auto_id, dim, entities, enable_d
        query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
        res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
        assert len(res) == len(query_data)
        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
        assert len(res) == entities
        if enable_partition_key:
            assert len(self.collection_wrap.partitions) > 1

@@ -1078,6 +1084,7 @@ def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable
            cf.gen_int64_field(name=df.int_field, nullable=nullable),
            cf.gen_float_field(name=df.float_field, nullable=nullable),
            cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
            cf.gen_string_field(name=df.text_field, enable_match=True),
            cf.gen_json_field(name=df.json_field, nullable=nullable),
            cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
            cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -1195,6 +1202,8 @@ def test_bulk_insert_all_field_with_parquet(self, auto_id, dim, entities, enable
        query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
        res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
        assert len(res) == len(query_data)
        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
        assert len(res) == entities
        if enable_partition_key:
            assert len(self.collection_wrap.partitions) > 1

