diff --git a/Makefile b/Makefile
index 21cb794..347e3b2 100755
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ test:
 	pytest

 coverage:
-	pytest -s --cov --cov-report html --cov-fail-under 97
+	pytest -s --cov --cov-report html --cov-fail-under 100

 yamllint:
 	yamllint -d relaxed .
diff --git a/find_similar/tokenize.py b/find_similar/tokenize.py
index 66428c0..0870a36 100644
--- a/find_similar/tokenize.py
+++ b/find_similar/tokenize.py
@@ -101,11 +101,11 @@ def split_text_and_digits(text):
     :param text: enter text
     :return: list of separated texts
     """
-    regex = r"^\D+[0]\D+$"
+    regex = r"^\D+[0]\D+$"  # e.g. "so0os": letters with an embedded zero
     match = re.search(regex, text, re.MULTILINE)
     if match:
         return [text]
-    # Check for volts and amperes
+    # Check for volts and amperes: 55В -> 55 v
     regex = r"\d+[.]?\d?[в|а|В|А|B|A|a]{1}$"
     match = re.search(regex, text, re.MULTILINE)
     if match:
diff --git a/testing/test_algorithm/test_tokenize.py b/testing/test_algorithm/test_tokenize.py
index 28ef4aa..61b3162 100644
--- a/testing/test_algorithm/test_tokenize.py
+++ b/testing/test_algorithm/test_tokenize.py
@@ -1,6 +1,7 @@
 """
 Tests for tokenize
 """
+from unittest import mock
 import pytest
 from find_similar.calc_models import LanguageNotFoundException
 from find_similar.tokenize import (
@@ -15,6 +16,7 @@
     HashebleSet,
     replace_yio,
     add_nltk_stopwords,
+    get_stopwords_from_nltk,
 )


@@ -47,6 +49,23 @@ def test_split_text_and_digits():
     assert result == ["1", "some", "2", "string", "5", "with", "9"]


+def test_split_text_and_digits_match():
+    """
+    Test split_text_and_digits when the volts/amperes regex matches
+    """
+    input_str = "Voltage 55В"
+    result = split_text_and_digits(input_str)
+    assert result == ["55", "v"]
+
+
+def test_split_text_and_digits_other_match():
+    """
+    Test split_text_and_digits with the first regex, ^\\D+[0]\\D+$
+    """
+    input_str = "so0os"
+    assert split_text_and_digits(input_str) == [input_str]
+
+
 def test_get_normal_form():
     """
     Test get_normal_form
@@ -161,3 +180,29 @@ def test_remove_or_not_stopwords():
     assert tokenize(text, "russian", remove_stopwords=True) == result
     result = {"что", "я", "о", "круг"}
     assert tokenize(text, "russian", remove_stopwords=False) == result
+
+
+def test_get_stopwords_from_nltk_lookup_error():
+    """
+    Test get_stopwords_from_nltk when LookupError is raised
+    """
+
+    class MockStopwords:
+        """
+        Mock class for nltk.corpus.stopwords
+        """
+        def words(self, *args, **kwargs):
+            """
+            Mock words method that always raises LookupError
+            """
+            raise LookupError
+
+    def mock_download(*args, **kwargs):  # pylint:disable=unused-argument
+        """
+        Mock function for nltk.download
+        """
+
+    with mock.patch('find_similar.tokenize.stopwords', MockStopwords()):
+        with mock.patch('nltk.download', mock_download):
+            with pytest.raises(LookupError):
+                get_stopwords_from_nltk('english')
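
For context, a minimal standalone sketch of the two regex branches that the new comments and tests describe. The demo_split helper below is hypothetical; the real logic lives in find_similar.tokenize.split_text_and_digits, which goes on to return ["55", "v"] for "Voltage 55В", as the new test asserts.

import re

def demo_split(text):
    """Hypothetical sketch mirroring the two regex checks in split_text_and_digits."""
    # Branch 1: letters with an embedded zero (e.g. "so0os") are kept as one token.
    if re.search(r"^\D+[0]\D+$", text, re.MULTILINE):
        return [text]
    # Branch 2: a number ending in a volt/ampere letter (Latin or Cyrillic),
    # e.g. "55В". Note: inside a character class "|" is a literal, so
    # [в|а|В|А|B|A|a] also matches a bare "|".
    if re.search(r"\d+[.]?\d?[в|а|В|А|B|A|a]{1}$", text, re.MULTILINE):
        return "unit suffix matched"
    return "no special case"

assert demo_split("so0os") == ["so0os"]                    # first regex fires
assert demo_split("Voltage 55В") == "unit suffix matched"  # second regex fires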