Skip to content

Commit

Permalink
Tokenizer lowercase (#37)
Browse files Browse the repository at this point in the history
* ensure tokenizer uses lowercase
* add additional tests
  • Loading branch information
barrust authored Mar 9, 2019
1 parent fe6a035 commit b6a1549
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
2 changes: 1 addition & 1 deletion spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def load_text(self, text, tokenizer=None):
tokenizer (function): The function to use to tokenize a string
"""
if tokenizer:
words = tokenizer(text)
words = [x.lower() for x in tokenizer(text)]
else:
words = _parse_into_words(text)

Expand Down
26 changes: 26 additions & 0 deletions tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def test_candidates(self):
self.assertEqual(spell.candidates('ths'), cands)
self.assertEqual(spell.candidates('the'), {'the'})
self.assertEqual(spell.candidates('-'), {'-'})
# something that cannot exist... should return just the same element...
self.assertEqual(spell.candidates('manasaeds'), {'manasaeds'})

def test_words(self):
''' rest the parsing of words '''
Expand Down Expand Up @@ -130,6 +132,13 @@ def test_edit_distance_one(self):
spell = SpellChecker(language=None, local_dictionary=filepath, distance=1)
self.assertEqual(spell.candidates('hike'), {'bike'})

def test_edit_distance_two(self):
    """ Exercise the edit_distance_2 path against the small dictionary. """
    # Build the path to the fixture dictionary that ships with the tests.
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   'resources/small_dictionary.json')
    checker = SpellChecker(language=None, local_dictionary=dictionary_path)
    # 'hie' is two edits away from 'bike', the only plausible correction.
    self.assertEqual(checker.edit_distance_2('hie'), ['bike'])

def test_edit_distance_one_property(self):
''' check the property setting of the distance property '''
spell = SpellChecker(distance=1)
Expand Down Expand Up @@ -290,3 +299,20 @@ def test_adding_unicode(self):
self.assertEqual("mañana" in spell2, True)

os.remove(new_filepath)

def test_tokenizer_file(self):
    """ Load a text file using a caller-supplied tokenizer. """
    def tokens(txt):
        # Naive whitespace tokenizer; punctuation stays attached to words.
        yield from txt.split()

    doc_path = os.path.join(os.path.dirname(__file__),
                            'resources/small_doc.txt')
    checker = SpellChecker(language=None)  # dictionary built solely from this doc
    checker.word_frequency.load_text_file(doc_path, tokenizer=tokens)

    # Counts reflect the raw split() tokens, punctuation included.
    self.assertEqual(checker['a'], 3)
    self.assertEqual(checker['storm'], 1)
    self.assertEqual(checker['storm.'], 1)
    self.assertFalse('awesome' in checker)
    self.assertTrue(checker['whale'])
    self.assertTrue('sea.' in checker)

0 comments on commit b6a1549

Please sign in to comment.