From 57efa05341d46d034fa2dbecacd8f866df5541f9 Mon Sep 17 00:00:00 2001 From: zhijianma Date: Fri, 20 Oct 2023 09:13:57 +0800 Subject: [PATCH] bugfix: fix character repetition method --- data_juicer/ops/filter/character_repetition_filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_juicer/ops/filter/character_repetition_filter.py b/data_juicer/ops/filter/character_repetition_filter.py index 2e463c471..f6b65e35c 100644 --- a/data_juicer/ops/filter/character_repetition_filter.py +++ b/data_juicer/ops/filter/character_repetition_filter.py @@ -59,10 +59,10 @@ def compute_stats(self, sample): freq_char_ngrams = sorted(list(freq_char_ngrams.values()), reverse=True) - rep_more_than_one = len([el for el in freq_char_ngrams if el > 1]) + num_no_rep_char_ngrams = len([el for el in freq_char_ngrams if el == 1]) num_rep_char_ngrams = min( int(np.sqrt(len(freq_char_ngrams))), - len(freq_char_ngrams) - rep_more_than_one, + len(freq_char_ngrams) - num_no_rep_char_ngrams, ) sample[Fields.stats][StatsKeys.char_rep_ratio] = (sum( freq_char_ngrams[:num_rep_char_ngrams]) / sum(freq_char_ngrams)) \