diff --git a/.travis.yml b/.travis.yml index 762448b..1424686 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,9 @@ env: - BUILD_WORD_TOKENIZER=0 install: - - pip install . + - python setup.py install + - pip install https://object-storage.tyo2.conoha.io/v1/nc_2520839e1f9641b08211a5c85243124a/sudachi/SudachiDict_full-20190718.tar.gz + - sudachipy link -t full python: - 3.6 diff --git a/README.md b/README.md index 74b5ea5..1f5ff5f 100644 --- a/README.md +++ b/README.md @@ -39,26 +39,68 @@ docker run -it himkt/tiny_tokenizer `python3 example/tokenize_document.py` +
+ ``` # python3 example/tokenize_document.py Finish creating word tokenizers Given document: 我輩は猫である。名前はまだない #0: 我輩は猫である。 -Tokenizer (identity): 我輩は猫である。 -Tokenizer (MeCab): 我輩 は 猫 で ある 。 -Tokenizer (KyTea): 我輩 は 猫 で あ る 。 -Tokenizer (Sentencepiece): ▁ 我 輩 は 猫 である 。 -Tokenizer (Character): 我 輩 は 猫 で あ る 。 +Tokenizer: MeCab +[我輩, は, 猫, で, ある, 。] +Tokenizer: MeCab +[我輩 (名詞), は (助詞), 猫 (名詞), で (助動詞), ある (助動詞), 。 (記号)] +Tokenizer: KyTea +[我輩, は, 猫, で, あ, る, 。] +Tokenizer: KyTea +[我輩 (名詞), は (助詞), 猫 (名詞), で (助動詞), あ (動詞), る (語尾), 。 (補助記号)] +Tokenizer: Sentencepiece +[▁, 我, 輩, は, 猫, である, 。] +Tokenizer: Sudachi (A) +[我輩, は, 猫, で, ある, 。] +Tokenizer: Sudachi (A) +[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)] +Tokenizer: Sudachi (B) +[我輩, は, 猫, で, ある, 。] +Tokenizer: Sudachi (B) +[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)] +Tokenizer: Sudachi (C) +[我輩, は, 猫, で, ある, 。] +Tokenizer: Sudachi (C) +[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)] +Tokenizer: Character +[我, 輩, は, 猫, で, あ, る, 。] #1: 名前はまだない -Tokenizer (identity): 名前はまだない -Tokenizer (MeCab): 名前 は まだ ない -Tokenizer (KyTea): 名前 は まだ な い -Tokenizer (Sentencepiece): ▁ 名前 はまだ ない -Tokenizer (Character): 名 前 は ま だ な い +Tokenizer: MeCab +[名前, は, まだ, ない] +Tokenizer: MeCab +[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)] +Tokenizer: KyTea +[名前, は, まだ, な, い] +Tokenizer: KyTea +[名前 (名詞), は (助詞), まだ (副詞), な (形容詞), い (語尾)] +Tokenizer: Sentencepiece +[▁, 名前, はまだ, ない] +Tokenizer: Sudachi (A) +[名前, は, まだ, ない] +Tokenizer: Sudachi (A) +[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)] +Tokenizer: Sudachi (B) +[名前, は, まだ, ない] +Tokenizer: Sudachi (B) +[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)] +Tokenizer: Sudachi (C) +[名前, は, まだ, ない] +Tokenizer: Sudachi (C) +[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)] +Tokenizer: Character +[名, 前, は, ま, だ, な, い] ``` +
+ ### Test diff --git a/example/tokenize_demo.py b/example/tokenize_demo.py index fb4dba5..46901fc 100644 --- a/example/tokenize_demo.py +++ b/example/tokenize_demo.py @@ -5,12 +5,18 @@ if __name__ == "__main__": sentence_tokenizer = SentenceTokenizer() word_tokenizers = [] - word_tokenizers.append(WordTokenizer(tokenizer="MeCab")) - word_tokenizers.append(WordTokenizer(tokenizer="MeCab", with_postag=True)) - word_tokenizers.append(WordTokenizer(tokenizer="KyTea")) - word_tokenizers.append(WordTokenizer(tokenizer="KyTea", with_postag=True)) - word_tokenizers.append(WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")) # NOQA - word_tokenizers.append(WordTokenizer(tokenizer="Character")) + word_tokenizers.append(["MeCab", WordTokenizer(tokenizer="MeCab")]) + word_tokenizers.append(["MeCab", WordTokenizer(tokenizer="MeCab", with_postag=True)]) # NOQA + word_tokenizers.append(["KyTea", WordTokenizer(tokenizer="KyTea")]) + word_tokenizers.append(["KyTea", WordTokenizer(tokenizer="KyTea", with_postag=True)]) # NOQA + word_tokenizers.append(["Sentencepiece", WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")]) # NOQA + word_tokenizers.append(["Sudachi (A)", WordTokenizer(tokenizer="Sudachi", mode="A")]) # NOQA + word_tokenizers.append(["Sudachi (A)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="A")]) # NOQA + word_tokenizers.append(["Sudachi (B)", WordTokenizer(tokenizer="Sudachi", mode="B")]) # NOQA + word_tokenizers.append(["Sudachi (B)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="B")]) # NOQA + word_tokenizers.append(["Sudachi (C)", WordTokenizer(tokenizer="Sudachi", mode="C")]) # NOQA + word_tokenizers.append(["Sudachi (C)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="C")]) # NOQA + word_tokenizers.append(["Character", WordTokenizer(tokenizer="Character")]) # NOQA print("Finish creating word tokenizers") print() @@ -21,8 +27,9 @@ for sentence_id, sentence in enumerate(sentences): print(f"#{sentence_id}: {sentence}") - for tokenizer in word_tokenizers: + for name, tokenizer in word_tokenizers: + print(f"Tokenizer: {name}") result = tokenizer.tokenize(sentence) - print(f"Tokenizer ({tokenizer.name}): {result}") + print(result) print() diff --git a/setup.py b/setup.py index aee513c..9d8a25c 100644 --- a/setup.py +++ b/setup.py @@ -14,14 +14,15 @@ install_requires = [] if BUILD_WORD_TOKENIZER == 1: - install_requires.extend(["natto-py", "kytea", "sentencepiece"]) + install_requires.extend(["natto-py", "kytea", "sentencepiece", "SudachiPy"]) # NOQA + else: print("Install sentence tokenizer only") setup( name="tiny_tokenizer", - version="2.0.0", + version="2.1.0", description="Tiny Word/Sentence Tokenizer", author="himkt", author_email="himkt@klis.tsukuba.ac.jp", diff --git a/tests/test_word_tokenize.py b/tests/test_word_tokenize.py index cbf985c..a03b031 100644 --- a/tests/test_word_tokenize.py +++ b/tests/test_word_tokenize.py @@ -7,6 +7,7 @@ SENTENCE1 = "吾輩は猫である" +SENTENCE2 = "医薬品安全管理責任者" class WordTokenizerTest(unittest.TestCase): @@ -48,58 +49,64 @@ def test_word_tokenize_with_sentencepiece(self): result = tokenizer.tokenize(SENTENCE1) self.assertEqual(expect, result) - def test_word_tokenize_with_character(self): - """Test Character tokenizer.""" - tokenizer = WordTokenizer( - tokenizer="Character" - ) - expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")] - result = tokenizer.tokenize(SENTENCE1) - self.assertEqual(expect, result) - - -class WordTokenizerWithLowerCaseTest(unittest.TestCase): - """Test 
ordinal word tokenizer.""" - - def test_word_tokenize_with_kytea(self): - """Test KyTea tokenizer.""" + def test_word_tokenize_with_sudachi_mode_a(self): + """Test Sudachi tokenizer.""" try: - tokenizer = WordTokenizer(tokenizer="kytea") + tokenizer = WordTokenizer( + tokenizer="Sudachi", + mode="A", + ) except ModuleNotFoundError: - pytest.skip("skip kytea") + pytest.skip("skip sudachi") - expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")] - result = tokenizer.tokenize(SENTENCE1) + expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")] + result = tokenizer.tokenize(SENTENCE2) self.assertEqual(expect, result) - def test_word_tokenize_with_mecab(self): - """Test MeCab tokenizer.""" + def test_word_tokenize_with_sudachi_mode_b(self): + """Test Sudachi tokenizer.""" try: - tokenizer = WordTokenizer(tokenizer="mecab") + tokenizer = WordTokenizer( + tokenizer="Sudachi", + mode="B", + ) except ModuleNotFoundError: - pytest.skip("skip mecab") + pytest.skip("skip sudachi") - expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")] - result = tokenizer.tokenize(SENTENCE1) + expect = [Token(surface=w) for w in "医薬品 安全 管理 責任者".split(" ")] + result = tokenizer.tokenize(SENTENCE2) self.assertEqual(expect, result) - def test_word_tokenize_with_sentencepiece(self): - """Test Sentencepiece tokenizer.""" + def test_word_tokenize_with_sudachi_mode_c(self): + """Test Sudachi tokenizer.""" try: tokenizer = WordTokenizer( - tokenizer="Sentencepiece", - model_path="data/model.spm" + tokenizer="Sudachi", + mode="C", ) except ModuleNotFoundError: - pytest.skip("skip sentencepiece") + pytest.skip("skip sudachi") - expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")] - result = tokenizer.tokenize(SENTENCE1) + expect = [Token(surface=w) for w in "医薬品安全管理責任者".split(" ")] + result = tokenizer.tokenize(SENTENCE2) self.assertEqual(expect, result) def test_word_tokenize_with_character(self): """Test Character tokenizer.""" - tokenizer = WordTokenizer(tokenizer="character") + tokenizer = WordTokenizer( + tokenizer="Character" + ) expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")] result = tokenizer.tokenize(SENTENCE1) self.assertEqual(expect, result) + + def test_word_tokenize_using_lowercase(self): + """Test KyTea tokenizer.""" + try: + tokenizer = WordTokenizer(tokenizer="kytea") + except ModuleNotFoundError: + pytest.skip("skip kytea") + + expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")] + result = tokenizer.tokenize(SENTENCE1) + self.assertEqual(expect, result) diff --git a/tiny_tokenizer/__init__.py b/tiny_tokenizer/__init__.py index 54ce02f..b7d7128 100644 --- a/tiny_tokenizer/__init__.py +++ b/tiny_tokenizer/__init__.py @@ -2,4 +2,4 @@ from .sentence_tokenizer import SentenceTokenizer # NOQA from .word_tokenizer import WordTokenizer # NOQA -__version__ = "2.0.0" +__version__ = "2.1.0" diff --git a/tiny_tokenizer/word_tokenizer.py b/tiny_tokenizer/word_tokenizer.py index f0d2b68..23dc622 100644 --- a/tiny_tokenizer/word_tokenizer.py +++ b/tiny_tokenizer/word_tokenizer.py @@ -1,38 +1,29 @@ """Word Level Tokenizer.""" -from typing import Optional - import warnings +from typing import Optional class Token: """Token class""" - def __init__( - self, - surface: str, - postag: Optional[str] = None - ): + + def __init__(self, surface: str, postag: Optional[str] = None): self.surface = surface self.postag = postag def __repr__(self): representation = self.surface if self.postag is not None: - representation += f' ({self.postag})' + representation += f" 
({self.postag})" return representation def __eq__(self, right): - return self.surface == right.surface and \ - self.postag == right.postag + return self.surface == right.surface and self.postag == right.postag class BaseWordLevelTokenizer: """Base class for word level tokenizer""" - def __init__( - self, - name: str, - with_postag: bool = False, - **kwargs, - ): + + def __init__(self, name: str, with_postag: bool = False, **kwargs): self.__name = name self.with_postag = with_postag @@ -47,24 +38,23 @@ def name(self): class MeCabTokenizer(BaseWordLevelTokenizer): - """Wrapper class of external text analyzers""" + """Wrapper class forexternal text analyzers""" + def __init__( - self, - dictionary_path: Optional[str] = None, - with_postag: bool = False, + self, dictionary_path: Optional[str] = None, with_postag: bool = False ): - super().__init__(name='mecab', with_postag=with_postag) + super().__init__(name="mecab", with_postag=with_postag) try: import natto except ModuleNotFoundError: raise ModuleNotFoundError("natto-py is not installed") - flag = '' + flag = "" if not self.with_postag: - flag += ' -Owakati' + flag += " -Owakati" if dictionary_path is not None: - flag += f' -u {dictionary_path}' + flag += f" -u {dictionary_path}" self.mecab = natto.MeCab(flag) @@ -73,42 +63,37 @@ def tokenize(self, text: str): return_result = [] parse_result = self.mecab.parse(text) if self.with_postag: - for elem in parse_result.split('\n')[:-1]: + for elem in parse_result.split("\n")[:-1]: surface, feature = elem.split() - postag = feature.split(',')[0] + postag = feature.split(",")[0] return_result.append(Token(surface=surface, postag=postag)) else: - for surface in parse_result.split(' '): + for surface in parse_result.split(" "): return_result.append(Token(surface=surface)) return return_result class KyTeaTokenizer(BaseWordLevelTokenizer): - """Wrapper class of KyTea""" - def __init__( - self, - with_postag: bool = False, - **kwargs, - ): + """Wrapper class forKyTea""" + + def __init__(self, with_postag: bool = False, **kwargs): super(KyTeaTokenizer, self).__init__( - name='kytea', - with_postag=with_postag - ) + name="kytea", with_postag=with_postag) try: import Mykytea except ModuleNotFoundError: raise ModuleNotFoundError("kytea is not installed") - flag = '' + flag = "" self.kytea = Mykytea.Mykytea(flag) def tokenize(self, text: str): return_result = [] if self.with_postag: - for elem in self.kytea.getTagsToString(text).split(' ')[:-1]: - surface, postag, _ = elem.split('/') + for elem in self.kytea.getTagsToString(text).split(" ")[:-1]: + surface, postag, _ = elem.split("/") return_result.append(Token(surface=surface, postag=postag)) else: @@ -119,13 +104,10 @@ def tokenize(self, text: str): class SentencepieceTokenizer(BaseWordLevelTokenizer): - """Wrapper class of Sentencepiece""" - def __init__( - self, - model_path: str, - **kwargs, - ): - super(SentencepieceTokenizer, self).__init__('sentencepiece') + """Wrapper class forSentencepiece""" + + def __init__(self, model_path: str, **kwargs): + super(SentencepieceTokenizer, self).__init__("sentencepiece") try: import sentencepiece except ModuleNotFoundError: @@ -135,14 +117,57 @@ def __init__( self.tokenizer.load(model_path) def tokenize(self, text: str): - return [Token(surface=subword) for subword in - self.tokenizer.EncodeAsPieces(text)] + return [ + Token(surface=subword) for subword in self.tokenizer.EncodeAsPieces(text) + ] + + +class SudachiTokenizer(BaseWordLevelTokenizer): + """Wrapper class for SudachiPy.""" + + def __init__(self, mode: 
str, with_postag: bool, **kwargs): + super(SudachiTokenizer, self).__init__("sudachi") + try: + from sudachipy import tokenizer + from sudachipy import dictionary + except ModuleNotFoundError: + raise ModuleNotFoundError("sudachipy is not installed") + try: + self.tokenizer = dictionary.Dictionary().create() + except KeyError: + msg = "please install dictionary" + msg += " ( see https://github.com/WorksApplications/SudachiPy#install-dict-packages )" # NOQA + raise KeyError(msg) + + _mode = mode.capitalize() + if _mode == "A": + self.mode = tokenizer.Tokenizer.SplitMode.A + elif _mode == "B": + self.mode = tokenizer.Tokenizer.SplitMode.B + elif _mode == "C": + self.mode = tokenizer.Tokenizer.SplitMode.C + else: + msg = "Invalid mode is specified. Mode should be 'A', 'B' or 'C'" + raise ValueError(msg) + + self.with_postag = with_postag + + def tokenize(self, text: str): + """Tokenize.""" + result = [] + for token in self.tokenizer.tokenize(text, self.mode): + _token = Token(surface=token.surface()) + if self.with_postag: + _token.postag = token.part_of_speech()[0] + result.append(_token) + return result class CharacterTokenizer(BaseWordLevelTokenizer): """Character tokenizer""" + def __init__(self): - super(CharacterTokenizer, self).__init__('character') + super(CharacterTokenizer, self).__init__("character") def tokenize(self, text: str): return [Token(surface=char) for char in list(text)] @@ -157,6 +182,7 @@ def __init__( with_postag: bool = False, dictionary_path: Optional[str] = None, model_path: Optional[str] = None, + mode: Optional[str] = None, ): """Create tokenizer. @@ -168,24 +194,27 @@ def __init__( self.with_postag = with_postag self.dictionary_path = dictionary_path self.model_path = model_path + if mode is not None: + self.mode = mode.lower() self.__setup_tokenizer() def __setup_tokenizer(self): - if self.__tokenizer_name == 'mecab': + if self.__tokenizer_name == "mecab": self.tokenizer = MeCabTokenizer( dictionary_path=self.dictionary_path, - with_postag=self.with_postag, - ) - if self.__tokenizer_name == 'kytea': - self.tokenizer = KyTeaTokenizer( with_postag=self.with_postag ) - if self.__tokenizer_name == 'sentencepiece': - self.tokenizer = SentencepieceTokenizer( - model_path=self.model_path + if self.__tokenizer_name == "kytea": + self.tokenizer = KyTeaTokenizer(with_postag=self.with_postag) + if self.__tokenizer_name == "sentencepiece": + self.tokenizer = SentencepieceTokenizer(model_path=self.model_path) + if self.__tokenizer_name == "sudachi": + self.tokenizer = SudachiTokenizer( + mode=self.mode, + with_postag=self.with_postag ) - if self.__tokenizer_name == 'character': + if self.__tokenizer_name == "character": self.tokenizer = CharacterTokenizer() def tokenize(self, text: str): @@ -198,11 +227,11 @@ def name(self): if __name__ == "__main__": - tokenizer = WordTokenizer(tokenizer='mecab', with_postag=False) - print(tokenizer.tokenize('我輩は猫である')) + tokenizer = WordTokenizer(tokenizer="mecab", with_postag=False) + print(tokenizer.tokenize("我輩は猫である")) - tokenizer = WordTokenizer(tokenizer='mecab', with_postag=True) - print(tokenizer.tokenize('我輩は猫である')) + tokenizer = WordTokenizer(tokenizer="mecab", with_postag=True) + print(tokenizer.tokenize("我輩は猫である")) tokenizer = WordTokenizer("kytea", with_postag=False) print(tokenizer.tokenize("我輩は猫である"))
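
A minimal usage sketch of the Sudachi support introduced above, assuming SudachiPy and a dictionary are installed and linked as in the `.travis.yml` changes (the `SudachiDict_full` tarball installed via pip, then `sudachipy link -t full`). The calls below only use the `WordTokenizer` and `Token` interfaces defined in this diff.

```python
# Usage sketch for the new Sudachi backend (assumes SudachiPy and a linked
# dictionary such as SudachiDict_full are installed, as in .travis.yml).
from tiny_tokenizer import WordTokenizer

# Sudachi's split modes A, B, and C go from the finest to the coarsest
# segmentation, as the new tests on "医薬品安全管理責任者" illustrate.
for mode in ("A", "B", "C"):
    tokenizer = WordTokenizer(tokenizer="Sudachi", mode=mode, with_postag=True)
    tokens = tokenizer.tokenize("医薬品安全管理責任者")
    # Each Token exposes a surface form and, with with_postag=True, a POS tag.
    print(mode, [(token.surface, token.postag) for token in tokens])
```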