Skip to content

Commit

Permalink
Tokenizer lowercase (#37)
Browse files Browse the repository at this point in the history
* ensure tokenizer uses lowercase
* add additional tests
  • Loading branch information
barrust authored Mar 9, 2019
1 parent fe6a035 commit b6a1549
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
2 changes: 1 addition & 1 deletion spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def load_text(self, text, tokenizer=None):
tokenizer (function): The function to use to tokenize a string
"""
if tokenizer:
words = tokenizer(text)
words = [x.lower() for x in tokenizer(text)]
else:
words = _parse_into_words(text)

Expand Down
26 changes: 26 additions & 0 deletions tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def test_candidates(self):
self.assertEqual(spell.candidates('ths'), cands)
self.assertEqual(spell.candidates('the'), {'the'})
self.assertEqual(spell.candidates('-'), {'-'})
# something that cannot exist... should return just the same element...
self.assertEqual(spell.candidates('manasaeds'), {'manasaeds'})

def test_words(self):
''' rest the parsing of words '''
Expand Down Expand Up @@ -130,6 +132,13 @@ def test_edit_distance_one(self):
spell = SpellChecker(language=None, local_dictionary=filepath, distance=1)
self.assertEqual(spell.candidates('hike'), {'bike'})

def test_edit_distance_two(self):
    """ Exercise the edit_distance_2 path against the small dictionary. """
    # Build the path to the fixture dictionary that ships with the tests.
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   'resources/small_dictionary.json')
    checker = SpellChecker(language=None, local_dictionary=dictionary_path)
    # 'hie' is two edits away from 'bike', the only plausible correction.
    self.assertEqual(checker.edit_distance_2('hie'), ['bike'])

def test_edit_distance_one_property(self):
''' check the property setting of the distance property '''
spell = SpellChecker(distance=1)
Expand Down Expand Up @@ -290,3 +299,20 @@ def test_adding_unicode(self):
self.assertEqual("mañana" in spell2, True)

os.remove(new_filepath)

def test_tokenizer_file(self):
    """ Load a text file using a caller-supplied tokenizer. """
    def tokens(txt):
        # Naive whitespace tokenizer; punctuation stays attached to words.
        yield from txt.split()

    doc_path = os.path.join(os.path.dirname(__file__),
                            'resources/small_doc.txt')
    checker = SpellChecker(language=None)  # dictionary built solely from this doc
    checker.word_frequency.load_text_file(doc_path, tokenizer=tokens)

    # Counts reflect the raw split() tokens, punctuation included.
    self.assertEqual(checker['a'], 3)
    self.assertEqual(checker['storm'], 1)
    self.assertEqual(checker['storm.'], 1)
    self.assertFalse('awesome' in checker)
    self.assertTrue(checker['whale'])
    self.assertTrue('sea.' in checker)

0 comments on commit b6a1549

Please sign in to comment.