
test: [cherry-pick]Update tokenizer and checker (#37325)
pr: #37275 
pr: #37199
zhuwenxing authored Oct 31, 2024
1 parent e585f6d commit aa2878b
Showing 4 changed files with 19 additions and 30 deletions.
tests/python_client/chaos/checker.py (2 changes: 1 addition & 1 deletion)
@@ -408,7 +408,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
         self.c_wrap.load(replica_number=self.replica_number)
 
         self.p_wrap.init_partition(self.c_name, self.p_name)
-        if insert_data:
+        if insert_data and self.c_wrap.num_entities == 0:
             log.info(f"collection {c_name} created, start to insert data")
             t0 = time.perf_counter()
             self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
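Note: the new num_entities guard makes checker setup idempotent, so a checker that reconnects to an already-seeded collection no longer inserts the seed data a second time. A minimal sketch of the pattern, using a toy stand-in rather than the real wrapper class:

    # Toy collection: num_entities reflects inserted rows, as in pymilvus.
    # FakeCollection is made up for illustration; it is not the Checker API.
    class FakeCollection:
        def __init__(self):
            self.rows = []

        @property
        def num_entities(self):
            return len(self.rows)

        def insert(self, data):
            self.rows.extend(data)

    insert_data = True
    c = FakeCollection()
    for _ in range(2):                        # simulate two checker restarts
        if insert_data and c.num_entities == 0:
            c.insert(range(3000))             # seeding runs only on the first pass
    print(c.num_entities)                     # 3000, not 6000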
(second changed file: 1 addition & 1 deletion; the file path was not preserved in this capture)
@@ -17,7 +17,7 @@ class TestGetCollections(TestcaseBase):
     def test_get_collections_by_prefix(self,):
         self._connect()
         all_collections = self.utility_wrap.list_collections()[0]
-        all_collections = [c_name for c_name in all_collections if "Checker" in c_name]
+        all_collections = [c_name for c_name in all_collections if c_name.startswith("Checker")]
         selected_collections_map = {}
         for c_name in all_collections:
             if Collection(name=c_name).num_entities < constants.ENTITIES_FOR_SEARCH:
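The filter tightens from substring to prefix matching, so a collection whose name merely contains "Checker" mid-name is no longer selected. With made-up names:

    names = ["CheckerInsert_abc", "MyCheckerBackup", "CheckerSearch_xyz"]
    print([n for n in names if "Checker" in n])           # all three match
    print([n for n in names if n.startswith("Checker")])  # only the two prefixed names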
tests/python_client/common/common_func.py (11 changes: 8 additions & 3 deletions)
@@ -127,7 +127,7 @@ def remove_punctuation(text):
 # Tokenize the corpus
 def jieba_split(text):
     text_without_punctuation = remove_punctuation(text)
-    return jieba.lcut(text_without_punctuation)
+    return jieba.cut_for_search(text_without_punctuation)
 
 def blank_space_split(text):
     text_without_punctuation = remove_punctuation(text)
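jieba.lcut tokenizes in accurate mode and returns a list; jieba.cut_for_search tokenizes in search-engine mode, which additionally emits the shorter sub-words inside long tokens (and returns a generator rather than a list, which callers must now tolerate). A quick illustration, assuming jieba is installed; the exact segmentation depends on jieba's dictionary:

    import jieba

    text = "中国科学院计算所"
    print(jieba.lcut(text))                  # accurate mode, e.g. ['中国科学院', '计算所']
    print(list(jieba.cut_for_search(text)))  # adds sub-words such as '中国', '科学', '学院'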
@@ -156,7 +156,7 @@ def analyze_documents(texts, language="en"):
         if isinstance(text, str):
             new_texts.append(text)
     # Tokenize the corpus
-    tokenized = tokenizer.tokenize(new_texts, return_as="tuple")
+    tokenized = tokenizer.tokenize(new_texts, return_as="tuple", show_progress=False)
     # log.info(f"Tokenized: {tokenized}")
     # Create a frequency counter
     freq = Counter()
@@ -169,8 +169,13 @@

     # Convert token ids back to words
     word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
-    log.debug(f"word freq {word_freq.most_common(10)}")
+
+    # if language in ["zh", "cn", "chinese"], remove the long words
+    # this is a trick to keep text-match verification simple, because long words can still be split
+    if language in ["zh", "cn", "chinese"]:
+        word_freq = Counter({word: count for word, count in word_freq.items() if 1 < len(word) <= 3})
+    log.info(f"word freq {word_freq.most_common(10)}")
     return word_freq


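The added branch keeps only words of length 2 or 3 for Chinese input: longer words can be segmented differently at search time (see the cut_for_search change above), and single characters are excluded as well. The filter itself is plain Counter arithmetic; a toy example:

    from collections import Counter

    word_freq = Counter({"中国": 5, "科学": 3, "中国科学院": 2, "的": 9})
    filtered = Counter({w: c for w, c in word_freq.items() if 1 < len(w) <= 3})
    print(filtered)   # Counter({'中国': 5, '科学': 3}): 1-char and 4+-char words dropped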
tests/python_client/testcases/test_full_text_search.py (34 changes: 9 additions & 25 deletions)
@@ -506,7 +506,6 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -658,7 +657,6 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_
                 if i + batch_size < len(data)
                 else data[i: len(data)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -800,7 +798,6 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang,
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(df[i: i + batch_size])
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -938,7 +935,6 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         num_entities = collection_w.num_entities
         # query with count(*)
         res, _ = collection_w.query(
@@ -1190,7 +1186,6 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1348,7 +1343,6 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1486,7 +1480,6 @@ def test_delete_for_full_text_search(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1651,7 +1644,6 @@ def test_create_index_for_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1775,7 +1767,6 @@ def test_create_full_text_search_with_invalid_index_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1884,7 +1875,6 @@ def test_create_full_text_search_index_with_invalid_metric_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1993,8 +1983,6 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
-
         error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
         collection_w.create_index(
             "emb",
@@ -2091,7 +2079,6 @@ def test_create_full_text_search_with_invalid_bm25_params(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2227,7 +2214,6 @@ def test_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2316,7 +2302,7 @@ def test_full_text_search_default(
                 overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
     @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.parametrize("nq", [10])
+    @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0.5])
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
@@ -2409,7 +2395,10 @@ def test_full_text_search_with_jieba_tokenizer(
         log.info(f"dataframe\n{df}")
         texts = df["text"].to_list()
         word_freq = cf.analyze_documents(texts, language=language)
-        tokens = list(word_freq.keys())
+        tokens = []
+        for item in word_freq.most_common(20):
+            if len(item[0]) == 2:
+                tokens.append(item[0])
         if len(tokens) == 0:
             log.info(f"empty tokens, add a dummy token")
             tokens = ["dummy"]
@@ -2420,7 +2409,6 @@ def test_full_text_search_with_jieba_tokenizer(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2612,7 +2600,6 @@ def test_full_text_search_with_range_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2778,7 +2765,6 @@ def test_full_text_search_with_search_iterator(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2812,16 +2798,17 @@ def test_full_text_search_with_search_iterator(
             output_fields=["id", "text", "text_sparse_emb"],
             limit=limit
         )
+        iter_result = []
         while True:
             result = iterator.next()
             if not result:
                 iterator.close()
                 break
             else:
-                assert len(result) == batch_size
+                iter_result.append(len(result))
+        for r in iter_result[:-1]:
+            assert r == batch_size
 
 
 # @pytest.mark.skip("skip")
 class TestSearchWithFullTextSearchNegative(TestcaseBase):
     """
     ******************************************************************
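The rewritten loop records each page size and asserts fullness only for pages before the last, since an iterator's final batch is smaller whenever the total hit count is not an exact multiple of batch_size; the old in-loop assertion failed on that last partial page. The shape of the check on toy numbers:

    batch_size, total = 100, 250
    pages = [min(batch_size, total - i) for i in range(0, total, batch_size)]
    for p in pages[:-1]:          # every page except the last must be full
        assert p == batch_size
    print(pages)                  # [100, 100, 50]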
@@ -2925,7 +2912,6 @@ def test_search_for_full_text_search_with_empty_string_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3062,7 +3048,6 @@ def test_search_for_full_text_search_with_invalid_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3200,7 +3185,6 @@ def test_hybrid_search_with_full_text_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "dense_emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
