Merge pull request #15 from himkt/add-option-ignoring-word

Add option ignoring word

himkt authored Jul 9, 2019
2 parents 443aaa5 + 2ce53ba commit 44743da
Showing 6 changed files with 86 additions and 20 deletions.
6 changes: 5 additions & 1 deletion .travis.yml
@@ -13,11 +13,15 @@ before_install:
- cd kytea-0.4.7 && ./configure && sudo make && sudo make install && cd ..
- sudo ldconfig -v

env:
-
- BUILD_WORD_TOKENIZER=0

install:
- pip install .

python:
- 3.6

script:
- nosetests tests
- python -m pytest
8 changes: 7 additions & 1 deletion README.md
@@ -20,6 +20,10 @@ tiny_tokenizer requires following libraries.
You can install tiny_tokenizer via pip.
`pip install tiny_tokenizer`

Alternatively, you can install tiny_tokenizer with only the SentenceTokenizer by running the following command.
`BUILD_WORD_TOKENIZER=0 pip install tiny_tokenizer`
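As an illustration of what a sentence-tokenizer-only install still provides (this sketch is editorial, not part of the commit; `SentenceTokenizer` is exported in `tiny_tokenizer/__init__.py`, but its `tokenize` method and the output shown are assumptions):

```python
# Minimal sketch, assuming `BUILD_WORD_TOKENIZER=0 pip install tiny_tokenizer`.
# The sentence-splitting behavior shown below is an assumption, not taken
# from this commit.
from tiny_tokenizer import SentenceTokenizer

tokenizer = SentenceTokenizer()
print(tokenizer.tokenize('吾輩は猫である。名前はまだない。'))
# e.g. ['吾輩は猫である。', '名前はまだない。']
```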


### Quick start: Docker

You can use tiny_tokenizer in a Docker container.
@@ -30,6 +34,7 @@ docker build -t himkt/tiny_tokenizer .
docker run -it himkt/tiny_tokenizer
```


### Example

`python3 example/tokenize_document.py`
@@ -54,10 +59,11 @@ Tokenizer (Sentencepiece): ▁ 名前 はまだ ない
Tokenizer (Character): 名 前 は ま だ な い
```


### Test

```
nosetests
python -m pytest
```

### Acknowledgement
33 changes: 25 additions & 8 deletions setup.py
@@ -4,12 +4,29 @@
from setuptools import find_packages
from setuptools import setup

from os import getenv

setup(name='tiny_tokenizer',
      version='1.3.0',
      description='Tiny Word/Sentence Tokenizer',
      author='himkt',
      author_email='[email protected]',
      install_requires=['natto-py', 'kytea', 'sentencepiece'],
      url='https://github.com/himkt/tiny_tokenizer',
      packages=find_packages())

try:
    BUILD_WORD_TOKENIZER = int(getenv('BUILD_WORD_TOKENIZER', 1))
except ValueError:
    raise ValueError('BUILD_WORD_TOKENIZER should be an integer')


install_requires = []
if BUILD_WORD_TOKENIZER == 1:
    install_requires.extend(['natto-py', 'kytea', 'sentencepiece'])
else:
    print('Install sentence tokenizer only')


setup(
    name='tiny_tokenizer',
    version='1.3.1',
    description='Tiny Word/Sentence Tokenizer',
    author='himkt',
    author_email='[email protected]',
    install_requires=install_requires,
    url='https://github.com/himkt/tiny_tokenizer',
    packages=find_packages()
)
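Since the BUILD_WORD_TOKENIZER switch only changes which dependencies pip installs, one way to see which build an environment actually got is to probe for the optional backends. The snippet below is an editorial sketch, not part of this commit; the module names come from the imports in tiny_tokenizer/word_tokenizer.py:

```python
# Sketch: report which optional word-tokenizer backends are importable.
import importlib.util

for module in ('natto', 'Mykytea', 'sentencepiece'):
    status = 'installed' if importlib.util.find_spec(module) else 'missing'
    print(f'{module}: {status}')
```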
38 changes: 32 additions & 6 deletions tests/test_word_tokenize.py
@@ -1,5 +1,7 @@
from tiny_tokenizer.word_tokenizer import WordTokenizer

import unittest
import pytest


SENTENCE1 = '吾輩は猫である'
@@ -10,21 +12,33 @@ class WordTokenizerTest(unittest.TestCase):

    def test_word_tokenize_with_kytea(self):
        """Test KyTea tokenizer."""
        tokenizer = WordTokenizer('KyTea')
        try:
            tokenizer = WordTokenizer('KyTea')
        except ModuleNotFoundError:
            pytest.skip('skip kytea')

        expect = '吾輩 は 猫 で あ る'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)

    def test_word_tokenize_with_mecab(self):
        """Test MeCab tokenizer."""
        tokenizer = WordTokenizer('MeCab')
        try:
            tokenizer = WordTokenizer('MeCab')
        except ModuleNotFoundError:
            pytest.skip('skip mecab')

        expect = '吾輩 は 猫 で ある'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)

    def test_word_tokenize_with_sentencepiece(self):
        """Test Sentencepiece tokenizer."""
        tokenizer = WordTokenizer('Sentencepiece', 'data/model.spm')
        try:
            tokenizer = WordTokenizer('Sentencepiece', 'data/model.spm')
        except ModuleNotFoundError:
            pytest.skip('skip sentencepiece')

        expect = '▁ 吾 輩 は 猫 である'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)
@@ -49,21 +63,33 @@ class WordTokenizerWithLowerCaseTest(unittest.TestCase):

    def test_word_tokenize_with_kytea(self):
        """Test KyTea tokenizer."""
        tokenizer = WordTokenizer('kytea')
        try:
            tokenizer = WordTokenizer('kytea')
        except ModuleNotFoundError:
            pytest.skip('skip kytea')

        expect = '吾輩 は 猫 で あ る'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)

    def test_word_tokenize_with_mecab(self):
        """Test MeCab tokenizer."""
        tokenizer = WordTokenizer('mecab')
        try:
            tokenizer = WordTokenizer('mecab')
        except ModuleNotFoundError:
            pytest.skip('skip mecab')

        expect = '吾輩 は 猫 で ある'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)

    def test_word_tokenize_with_sentencepiece(self):
        """Test Sentencepiece tokenizer."""
        tokenizer = WordTokenizer('sentencepiece', 'data/model.spm')
        try:
            tokenizer = WordTokenizer('sentencepiece', 'data/model.spm')
        except ModuleNotFoundError:
            pytest.skip('skip sentencepiece')

        expect = '▁ 吾 輩 は 猫 である'.split(' ')
        result = tokenizer.tokenize(SENTENCE1)
        self.assertEqual(expect, result)
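The construct-or-skip pattern is repeated verbatim in every test above. A hypothetical helper (editorial sketch, not part of this commit) could collapse the repetition:

```python
import pytest

from tiny_tokenizer.word_tokenizer import WordTokenizer


def tokenizer_or_skip(name, *args):
    """Return a WordTokenizer, or skip the calling test when the backend
    library for `name` is not installed. Hypothetical helper mirroring
    the try/except blocks in the tests above."""
    try:
        return WordTokenizer(name, *args)
    except ModuleNotFoundError:
        pytest.skip(f'skip {name}: backend not installed')
```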
2 changes: 1 addition & 1 deletion tiny_tokenizer/__init__.py
@@ -1,4 +1,4 @@
from .sentence_tokenizer import SentenceTokenizer
from .word_tokenizer import WordTokenizer

__version__ = '1.3.0'
__version__ = "1.3.1"
19 changes: 16 additions & 3 deletions tiny_tokenizer/word_tokenizer.py
@@ -27,20 +27,33 @@ def __init__(self, tokenizer=None, flags=''):

        # use external libraries
        if __tokenizer == 'mecab':
            import natto
            try:
                import natto
            except ModuleNotFoundError:
                raise ModuleNotFoundError('natto-py is not installed')

            flags = '-Owakati' if not flags else flags
            self.__tokenizer = natto.MeCab(flags)
            self.__tokenizer_name = 'MeCab'
            self.tokenize = self.__mecab_tokenize

        if __tokenizer == 'kytea':
            import Mykytea
            try:
                import Mykytea
            except ModuleNotFoundError:
                raise ModuleNotFoundError('kytea is not installed')

            self.__tokenizer = Mykytea.Mykytea(flags)
            self.__tokenizer_name = 'KyTea'
            self.tokenize = self.__kytea_tokenize

        elif __tokenizer == 'sentencepiece':
            import sentencepiece
            try:
                import sentencepiece
            except ModuleNotFoundError:
                raise ModuleNotFoundError('sentencepiece is not installed')

            self.__tokenizer = sentencepiece.SentencePieceProcessor()
            self.__tokenizer.load(flags)
            self.__tokenizer_name = 'Sentencepiece'
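With the guarded imports in place, requesting a backend whose library is missing now raises ModuleNotFoundError with a message naming the package. Typical usage, with the expected tokens taken from the MeCab test above:

```python
from tiny_tokenizer.word_tokenizer import WordTokenizer

# Requires the MeCab backend (natto-py), i.e. a default
# BUILD_WORD_TOKENIZER=1 install.
tokenizer = WordTokenizer('MeCab')
print(tokenizer.tokenize('吾輩は猫である'))
# ['吾輩', 'は', '猫', 'で', 'ある']
```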
