Merge pull request #20 from himkt/support-sudachi

Support Sudachi tokenizer
himkt authored Jul 22, 2019
2 parents f485ce9 + 7360805 commit 9b8a3b1
Showing 7 changed files with 205 additions and 117 deletions.
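For context, a minimal usage sketch of the option this PR adds. The constructor arguments and the expected output are taken from the demo, tests, and README changes below; treat it as a sketch, not part of the diff itself.

```
from tiny_tokenizer import WordTokenizer

# Sudachi is selected by name; mode is one of "A", "B", "C".
tokenizer = WordTokenizer(tokenizer="Sudachi", mode="A")
print(tokenizer.tokenize("我輩は猫である。"))
# expected (per the README example below): [我輩, は, 猫, で, ある, 。]

# Part-of-speech tags can be requested as well.
tokenizer_pos = WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="A")
print(tokenizer_pos.tokenize("我輩は猫である。"))
# expected: [我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)]
```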
4 changes: 3 additions & 1 deletion .travis.yml
@@ -18,7 +18,9 @@ env:
- BUILD_WORD_TOKENIZER=0

install:
- pip install .
- python setup.py install
- pip install https://object-storage.tyo2.conoha.io/v1/nc_2520839e1f9641b08211a5c85243124a/sudachi/SudachiDict_full-20190718.tar.gz
- sudachipy link -t full

python:
- 3.6
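Note: the CI setup installs SudachiDict_full as a separate package and then runs `sudachipy link -t full`, presumably because SudachiPy does not bundle a dictionary and the link step registers the full dictionary as the default so the Sudachi tokenizer can find it at runtime.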
62 changes: 52 additions & 10 deletions README.md
@@ -39,26 +39,68 @@ docker run -it himkt/tiny_tokenizer

`python3 example/tokenize_document.py`

<details>

```
# python3 example/tokenize_document.py
Finish creating word tokenizers
Given document: 我輩は猫である。名前はまだない
#0: 我輩は猫である。
Tokenizer (identity): 我輩は猫である。
Tokenizer (MeCab): 我輩 は 猫 で ある 。
Tokenizer (KyTea): 我輩 は 猫 で あ る 。
Tokenizer (Sentencepiece): ▁ 我 輩 は 猫 である 。
Tokenizer (Character): 我 輩 は 猫 で あ る 。
Tokenizer: MeCab
[我輩, は, 猫, で, ある, 。]
Tokenizer: MeCab
[我輩 (名詞), は (助詞), 猫 (名詞), で (助動詞), ある (助動詞), 。 (記号)]
Tokenizer: KyTea
[我輩, は, 猫, で, あ, る, 。]
Tokenizer: KyTea
[我輩 (名詞), は (助詞), 猫 (名詞), で (助動詞), あ (動詞), る (語尾), 。 (補助記号)]
Tokenizer: Sentencepiece
[▁, 我, 輩, は, 猫, である, 。]
Tokenizer: Sudachi (A)
[我輩, は, 猫, で, ある, 。]
Tokenizer: Sudachi (A)
[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)]
Tokenizer: Sudachi (B)
[我輩, は, 猫, で, ある, 。]
Tokenizer: Sudachi (B)
[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)]
Tokenizer: Sudachi (C)
[我輩, は, 猫, で, ある, 。]
Tokenizer: Sudachi (C)
[我輩 (代名詞), は (助詞), 猫 (名詞), で (助動詞), ある (動詞), 。 (補助記号)]
Tokenizer: Character
[我, 輩, は, 猫, で, あ, る, 。]
#1: 名前はまだない
Tokenizer (identity): 名前はまだない
Tokenizer (MeCab): 名前 は まだ ない
Tokenizer (KyTea): 名前 は まだ な い
Tokenizer (Sentencepiece): ▁ 名前 はまだ ない
Tokenizer (Character): 名 前 は ま だ な い
Tokenizer: MeCab
[名前, は, まだ, ない]
Tokenizer: MeCab
[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)]
Tokenizer: KyTea
[名前, は, まだ, な, い]
Tokenizer: KyTea
[名前 (名詞), は (助詞), まだ (副詞), な (形容詞), い (語尾)]
Tokenizer: Sentencepiece
[▁, 名前, はまだ, ない]
Tokenizer: Sudachi (A)
[名前, は, まだ, ない]
Tokenizer: Sudachi (A)
[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)]
Tokenizer: Sudachi (B)
[名前, は, まだ, ない]
Tokenizer: Sudachi (B)
[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)]
Tokenizer: Sudachi (C)
[名前, は, まだ, ない]
Tokenizer: Sudachi (C)
[名前 (名詞), は (助詞), まだ (副詞), ない (形容詞)]
Tokenizer: Character
[名, 前, は, ま, だ, な, い]
```

</details>


### Test

23 changes: 15 additions & 8 deletions example/tokenize_demo.py
@@ -5,12 +5,18 @@
if __name__ == "__main__":
sentence_tokenizer = SentenceTokenizer()
word_tokenizers = []
word_tokenizers.append(WordTokenizer(tokenizer="MeCab"))
word_tokenizers.append(WordTokenizer(tokenizer="MeCab", with_postag=True))
word_tokenizers.append(WordTokenizer(tokenizer="KyTea"))
word_tokenizers.append(WordTokenizer(tokenizer="KyTea", with_postag=True))
word_tokenizers.append(WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")) # NOQA
word_tokenizers.append(WordTokenizer(tokenizer="Character"))
word_tokenizers.append(["MeCab", WordTokenizer(tokenizer="MeCab")])
word_tokenizers.append(["MeCab", WordTokenizer(tokenizer="MeCab", with_postag=True)]) # NOQA
word_tokenizers.append(["KyTea", WordTokenizer(tokenizer="KyTea")])
word_tokenizers.append(["KyTea", WordTokenizer(tokenizer="KyTea", with_postag=True)]) # NOQA
word_tokenizers.append(["Sentencepiece", WordTokenizer(tokenizer="Sentencepiece", model_path="data/model.spm")]) # NOQA
word_tokenizers.append(["Sudachi (A)", WordTokenizer(tokenizer="Sudachi", mode="A")]) # NOQA
word_tokenizers.append(["Sudachi (A)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="A")]) # NOQA
word_tokenizers.append(["Sudachi (B)", WordTokenizer(tokenizer="Sudachi", mode="B")]) # NOQA
word_tokenizers.append(["Sudachi (B)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="B")]) # NOQA
word_tokenizers.append(["Sudachi (C)", WordTokenizer(tokenizer="Sudachi", mode="C")]) # NOQA
word_tokenizers.append(["Sudachi (C)", WordTokenizer(tokenizer="Sudachi", with_postag=True, mode="C")]) # NOQA
word_tokenizers.append(["Character", WordTokenizer(tokenizer="Character")]) # NOQA
print("Finish creating word tokenizers")
print()

@@ -21,8 +27,9 @@
for sentence_id, sentence in enumerate(sentences):
print(f"#{sentence_id}: {sentence}")

for tokenizer in word_tokenizers:
for name, tokenizer in word_tokenizers:
print(f"Tokenizer: {name}")
result = tokenizer.tokenize(sentence)
print(f"Tokenizer ({tokenizer.name}): {result}")
print(result)

print()
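Design note: the demo now stores explicit (label, tokenizer) pairs instead of bare WordTokenizer objects, so the three Sudachi split modes, which would otherwise be indistinguishable from the tokenizer instance alone, can be told apart in the output; the `Tokenizer: <name>` lines in the README example above reflect this new format.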
5 changes: 3 additions & 2 deletions setup.py
@@ -14,14 +14,15 @@

install_requires = []
if BUILD_WORD_TOKENIZER == 1:
install_requires.extend(["natto-py", "kytea", "sentencepiece"])
install_requires.extend(["natto-py", "kytea", "sentencepiece", "SudachiPy"]) # NOQA

else:
print("Install sentence tokenizer only")


setup(
name="tiny_tokenizer",
version="2.0.0",
version="2.1.0",
description="Tiny Word/Sentence Tokenizer",
author="himkt",
author_email="[email protected]",
73 changes: 40 additions & 33 deletions tests/test_word_tokenize.py
@@ -7,6 +7,7 @@


SENTENCE1 = "吾輩は猫である"
SENTENCE2 = "医薬品安全管理責任者"


class WordTokenizerTest(unittest.TestCase):
@@ -48,58 +49,64 @@ def test_word_tokenize_with_sentencepiece(self):
result = tokenizer.tokenize(SENTENCE1)
self.assertEqual(expect, result)

def test_word_tokenize_with_character(self):
"""Test Character tokenizer."""
tokenizer = WordTokenizer(
tokenizer="Character"
)
expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
self.assertEqual(expect, result)


class WordTokenizerWithLowerCaseTest(unittest.TestCase):
"""Test ordinal word tokenizer."""

def test_word_tokenize_with_kytea(self):
"""Test KyTea tokenizer."""
def test_word_tokenize_with_sudachi_mode_a(self):
"""Test Sudachi tokenizer."""
try:
tokenizer = WordTokenizer(tokenizer="kytea")
tokenizer = WordTokenizer(
tokenizer="Sudachi",
mode="A",
)
except ModuleNotFoundError:
pytest.skip("skip kytea")
pytest.skip("skip sudachi")

expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
expect = [Token(surface=w) for w in "医薬 品 安全 管理 責任 者".split(" ")]
result = tokenizer.tokenize(SENTENCE2)
self.assertEqual(expect, result)

def test_word_tokenize_with_mecab(self):
"""Test MeCab tokenizer."""
def test_word_tokenize_with_sudachi_mode_b(self):
"""Test Sudachi tokenizer."""
try:
tokenizer = WordTokenizer(tokenizer="mecab")
tokenizer = WordTokenizer(
tokenizer="Sudachi",
mode="B",
)
except ModuleNotFoundError:
pytest.skip("skip mecab")
pytest.skip("skip sudachi")

expect = [Token(surface=w) for w in "吾輩 は 猫 で ある".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
expect = [Token(surface=w) for w in "医薬品 安全 管理 責任者".split(" ")]
result = tokenizer.tokenize(SENTENCE2)
self.assertEqual(expect, result)

def test_word_tokenize_with_sentencepiece(self):
"""Test Sentencepiece tokenizer."""
def test_word_tokenize_with_sudachi_mode_c(self):
"""Test Sudachi tokenizer."""
try:
tokenizer = WordTokenizer(
tokenizer="Sentencepiece",
model_path="data/model.spm"
tokenizer="Sudachi",
mode="C",
)
except ModuleNotFoundError:
pytest.skip("skip sentencepiece")
pytest.skip("skip sudachi")

expect = [Token(surface=w) for w in "▁ 吾 輩 は 猫 である".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
expect = [Token(surface=w) for w in "医薬品安全管理責任者".split(" ")]
result = tokenizer.tokenize(SENTENCE2)
self.assertEqual(expect, result)

def test_word_tokenize_with_character(self):
"""Test Character tokenizer."""
tokenizer = WordTokenizer(tokenizer="character")
tokenizer = WordTokenizer(
tokenizer="Character"
)
expect = [Token(surface=w) for w in "吾 輩 は 猫 で あ る".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
self.assertEqual(expect, result)

def test_word_tokenize_using_lowercase(self):
"""Test KyTea tokenizer."""
try:
tokenizer = WordTokenizer(tokenizer="kytea")
except ModuleNotFoundError:
pytest.skip("skip kytea")

expect = [Token(surface=w) for w in "吾輩 は 猫 で あ る".split(" ")]
result = tokenizer.tokenize(SENTENCE1)
self.assertEqual(expect, result)
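The new tests pin down how the three Sudachi split modes segment 医薬品安全管理責任者, from shortest to longest units. A sketch of the same comparison outside the test harness, with the expected outputs copied from the assertions above:

```
from tiny_tokenizer import WordTokenizer

SENTENCE = "医薬品安全管理責任者"

for mode in ("A", "B", "C"):
    tokenizer = WordTokenizer(tokenizer="Sudachi", mode=mode)
    print(mode, tokenizer.tokenize(SENTENCE))
# A: [医薬, 品, 安全, 管理, 責任, 者]   (shortest units)
# B: [医薬品, 安全, 管理, 責任者]       (middle granularity)
# C: [医薬品安全管理責任者]             (longest unit)
```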
2 changes: 1 addition & 1 deletion tiny_tokenizer/__init__.py
@@ -2,4 +2,4 @@
from .sentence_tokenizer import SentenceTokenizer # NOQA
from .word_tokenizer import WordTokenizer # NOQA

__version__ = "2.0.0"
__version__ = "2.1.0"