Skip to content

Commit

Permalink
test: add custom analyzer testcases (#37781)
Browse files Browse the repository at this point in the history
Signed-off-by: zhuwenxing <[email protected]>
  • Loading branch information
zhuwenxing authored Nov 27, 2024
1 parent 302650a commit 8188e14
Show file tree
Hide file tree
Showing 3 changed files with 993 additions and 270 deletions.
12 changes: 12 additions & 0 deletions tests/python_client/common/common_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,18 @@ def manual_check_text_match(df, word, col):
id_list.append(row["id"])
return id_list


def get_top_english_tokens(counter, n=10):
    """Return the ``n`` most frequent tokens made up solely of ASCII letters.

    Args:
        counter: Mapping of token -> frequency (a ``Counter`` or plain dict).
            Keys are converted with ``str()`` before being tested.
        n: Maximum number of ``(token, frequency)`` pairs to return.
            ``None`` returns every matching token, as with
            ``Counter.most_common``.

    Returns:
        A list of ``(token, frequency)`` tuples ordered from most to least
        frequent, containing only tokens that consist entirely of the
        characters ``a-z`` / ``A-Z``.
    """
    # fullmatch is used instead of match with '^...$' because '$' also matches
    # just before a trailing newline, which would let a token such as
    # "hello\n" be accepted as a purely alphabetic word.
    english_pattern = re.compile(r'[a-zA-Z]+')

    english_tokens = {
        word: freq
        for word, freq in counter.items()
        if english_pattern.fullmatch(str(word))
    }
    return Counter(english_tokens).most_common(n)

def analyze_documents(texts, language="en"):

tokenizer = custom_tokenizer(language)
Expand Down
1 change: 1 addition & 0 deletions tests/python_client/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ ml-dtypes==0.2.0
# for full text search
bm25s==0.2.0
jieba==0.42.1
Unidecode==1.3.8


# for perf test
Expand Down
Loading

0 comments on commit 8188e14

Please sign in to comment.