Skip to content

Commit

Permalink
add todo
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Sep 21, 2024
1 parent a147fd5 commit 9ad0747
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/datatrove/pipeline/filters/gopher_quality_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(
self.max_symbol_word_ratio = max_symbol_word_ratio
self.max_bullet_lines_ratio = max_bullet_lines_ratio
self.max_ellipsis_lines_ratio = max_ellipsis_lines_ratio
- self.max_non_alpha_words_ratio = max_non_alpha_words_ratio
+ self.max_non_alpha_words_ratio = max_non_alpha_words_ratio # TODO rename to min_alpha_words_ratio
self.min_stop_words = min_stop_words
self.stop_words = set(STOP_WORDS if stop_words is None else stop_words)
self.tokenizer = load_word_tokenizer(language)
Expand Down Expand Up @@ -114,6 +114,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
# that 80 % of words in a document contain at least one alphabetic character
if (
self.max_non_alpha_words_ratio
+ # nb of words with at least 1 alpha char < 0.8
and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
):
return False, "gopher_below_alpha_threshold"
Expand Down

0 comments on commit 9ad0747

Please sign in to comment.