
test: [cherry-pick]Update tokenizer and checker (#37325)
pr: #37275 
pr: #37199
zhuwenxing authored Oct 31, 2024
1 parent e585f6d commit aa2878b
Showing 4 changed files with 19 additions and 30 deletions.
tests/python_client/chaos/checker.py (2 changes: 1 addition & 1 deletion)
@@ -408,7 +408,7 @@ def __init__(self, collection_name=None, partition_name=None, shards_num=2, dim=
         self.c_wrap.load(replica_number=self.replica_number)
 
         self.p_wrap.init_partition(self.c_name, self.p_name)
-        if insert_data:
+        if insert_data and self.c_wrap.num_entities == 0:
             log.info(f"collection {c_name} created, start to insert data")
             t0 = time.perf_counter()
             self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
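Note: the new num_entities guard makes checker setup idempotent, so a checker that reconnects to an already-seeded collection no longer inserts the seed data a second time. A minimal sketch of the pattern, using a toy stand-in rather than the real wrapper class:

    # Toy collection: num_entities reflects inserted rows, as in pymilvus.
    # FakeCollection is made up for illustration; it is not the Checker API.
    class FakeCollection:
        def __init__(self):
            self.rows = []

        @property
        def num_entities(self):
            return len(self.rows)

        def insert(self, data):
            self.rows.extend(data)

    insert_data = True
    c = FakeCollection()
    for _ in range(2):                        # simulate two checker restarts
        if insert_data and c.num_entities == 0:
            c.insert(range(3000))             # seeding runs only on the first pass
    print(c.num_entities)                     # 3000, not 6000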
(second changed file: 1 addition & 1 deletion; the file path was not preserved in this capture)
@@ -17,7 +17,7 @@ class TestGetCollections(TestcaseBase):
     def test_get_collections_by_prefix(self,):
         self._connect()
         all_collections = self.utility_wrap.list_collections()[0]
-        all_collections = [c_name for c_name in all_collections if "Checker" in c_name]
+        all_collections = [c_name for c_name in all_collections if c_name.startswith("Checker")]
         selected_collections_map = {}
         for c_name in all_collections:
             if Collection(name=c_name).num_entities < constants.ENTITIES_FOR_SEARCH:
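The filter tightens from substring to prefix matching, so a collection whose name merely contains "Checker" mid-name is no longer selected. With made-up names:

    names = ["CheckerInsert_abc", "MyCheckerBackup", "CheckerSearch_xyz"]
    print([n for n in names if "Checker" in n])           # all three match
    print([n for n in names if n.startswith("Checker")])  # only the two prefixed names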
tests/python_client/common/common_func.py (11 changes: 8 additions & 3 deletions)
@@ -127,7 +127,7 @@ def remove_punctuation(text):
 # Tokenize the corpus
 def jieba_split(text):
     text_without_punctuation = remove_punctuation(text)
-    return jieba.lcut(text_without_punctuation)
+    return jieba.cut_for_search(text_without_punctuation)
 
 def blank_space_split(text):
     text_without_punctuation = remove_punctuation(text)
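jieba.lcut tokenizes in accurate mode and returns a list; jieba.cut_for_search tokenizes in search-engine mode, which additionally emits the shorter sub-words inside long tokens (and returns a generator rather than a list, which callers must now tolerate). A quick illustration, assuming jieba is installed; the exact segmentation depends on jieba's dictionary:

    import jieba

    text = "中国科学院计算所"
    print(jieba.lcut(text))                  # accurate mode, e.g. ['中国科学院', '计算所']
    print(list(jieba.cut_for_search(text)))  # adds sub-words such as '中国', '科学', '学院'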
@@ -156,7 +156,7 @@ def analyze_documents(texts, language="en"):
         if isinstance(text, str):
             new_texts.append(text)
     # Tokenize the corpus
-    tokenized = tokenizer.tokenize(new_texts, return_as="tuple")
+    tokenized = tokenizer.tokenize(new_texts, return_as="tuple", show_progress=False)
     # log.info(f"Tokenized: {tokenized}")
     # Create a frequency counter
     freq = Counter()
@@ -169,8 +169,13 @@

     # Convert token ids back to words
     word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
-    log.debug(f"word freq {word_freq.most_common(10)}")
+
+    # if language in ["zh", "cn", "chinese"], remove the long words
+    # this is a trick to keep text-match verification simple, because long words can still be split
+    if language in ["zh", "cn", "chinese"]:
+        word_freq = Counter({word: count for word, count in word_freq.items() if 1 < len(word) <= 3})
+    log.info(f"word freq {word_freq.most_common(10)}")
     return word_freq


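The added branch keeps only words of length 2 or 3 for Chinese input: longer words can be segmented differently at search time (see the cut_for_search change above), and single characters are excluded as well. The filter itself is plain Counter arithmetic; a toy example:

    from collections import Counter

    word_freq = Counter({"中国": 5, "科学": 3, "中国科学院": 2, "的": 9})
    filtered = Counter({w: c for w, c in word_freq.items() if 1 < len(w) <= 3})
    print(filtered)   # Counter({'中国': 5, '科学': 3}): 1-char and 4+-char words dropped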
tests/python_client/testcases/test_full_text_search.py (34 changes: 9 additions & 25 deletions)
@@ -506,7 +506,6 @@ def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullabl
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -658,7 +657,6 @@ def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_
                 if i + batch_size < len(data)
                 else data[i: len(data)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -800,7 +798,6 @@ def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang,
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(df[i: i + batch_size])
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -938,7 +935,6 @@ def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         num_entities = collection_w.num_entities
         # query with count(*)
         res, _ = collection_w.query(
@@ -1190,7 +1186,6 @@ def test_upsert_for_full_text_search(self, tokenizer, nullable):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1348,7 +1343,6 @@ def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nulla
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1486,7 +1480,6 @@ def test_delete_for_full_text_search(self, tokenizer):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1651,7 +1644,6 @@ def test_create_index_for_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1775,7 +1767,6 @@ def test_create_full_text_search_with_invalid_index_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1884,7 +1875,6 @@ def test_create_full_text_search_index_with_invalid_metric_type(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -1993,8 +1983,6 @@ def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
-
         error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
         collection_w.create_index(
             "emb",
@@ -2091,7 +2079,6 @@ def test_create_full_text_search_with_invalid_bm25_params(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2227,7 +2214,6 @@ def test_full_text_search_default(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2316,7 +2302,7 @@ def test_full_text_search_default(
                 overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
     @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.parametrize("nq", [10])
+    @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0.5])
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
@@ -2409,7 +2395,10 @@ def test_full_text_search_with_jieba_tokenizer(
         log.info(f"dataframe\n{df}")
         texts = df["text"].to_list()
         word_freq = cf.analyze_documents(texts, language=language)
-        tokens = list(word_freq.keys())
+        tokens = []
+        for item in word_freq.most_common(20):
+            if len(item[0]) == 2:
+                tokens.append(item[0])
         if len(tokens) == 0:
             log.info(f"empty tokens, add a dummy token")
             tokens = ["dummy"]
@@ -2420,7 +2409,6 @@ def test_full_text_search_with_jieba_tokenizer(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2612,7 +2600,6 @@ def test_full_text_search_with_range_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2778,7 +2765,6 @@ def test_full_text_search_with_search_iterator(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2812,16 +2798,17 @@ def test_full_text_search_with_search_iterator(
             output_fields=["id", "text", "text_sparse_emb"],
             limit=limit
         )
+        iter_result = []
         while True:
             result = iterator.next()
             if not result:
                 iterator.close()
                 break
             else:
-                assert len(result) == batch_size
+                iter_result.append(len(result))
+        for r in iter_result[:-1]:
+            assert r == batch_size
 
 
 # @pytest.mark.skip("skip")
 class TestSearchWithFullTextSearchNegative(TestcaseBase):
     """
     ******************************************************************
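The rewritten loop records each page size and asserts fullness only for pages before the last, since an iterator's final batch is smaller whenever the total hit count is not an exact multiple of batch_size; the old in-loop assertion failed on that last partial page. The shape of the check on toy numbers:

    batch_size, total = 100, 250
    pages = [min(batch_size, total - i) for i in range(0, total, batch_size)]
    for p in pages[:-1]:          # every page except the last must be full
        assert p == batch_size
    print(pages)                  # [100, 100, 50]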
@@ -2925,7 +2912,6 @@ def test_search_for_full_text_search_with_empty_string_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3062,7 +3048,6 @@ def test_search_for_full_text_search_with_invalid_search_data(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -3200,7 +3185,6 @@ def test_hybrid_search_with_full_text_search(
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
-        collection_w.flush()
         collection_w.create_index(
             "dense_emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
