diff --git a/.travis.yml b/.travis.yml
index 4b42639..762448b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,10 @@ before_install:
   - cd kytea-0.4.7 && ./configure && sudo make && sudo make install && cd ..
   - sudo ldconfig -v
 
+env:
+  -
+  - BUILD_WORD_TOKENIZER=0
+
 install:
   - pip install .
 
@@ -20,4 +24,4 @@ python:
   - 3.6
 
 script:
-  - nosetests tests
+  - python -m pytest
diff --git a/README.md b/README.md
index f821fff..74b5ea5 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,10 @@ tiny_tokenizer requires following libraries.
 You can install tiny_tokenizer via pip.
 `pip install tiny_tokenizer`
 
+Alternatively, you can install tiny_tokenizer with only SentenceTokenizer by running the following command.
+`BUILD_WORD_TOKENIZER=0 pip install tiny_tokenizer`
+
+
 ### Quick start: Docker
 
 You can use tiny_tokenizer using the Docker container.
@@ -30,6 +34,7 @@ docker build -t himkt/tiny_tokenizer .
 docker run -it himkt/tiny_tokenizer
 ```
 
+
 ### Example
 
 `python3 example/tokenize_document.py`
@@ -54,10 +59,11 @@ Tokenizer (Sentencepiece): ▁ 名前 はまだ ない
 Tokenizer (Character): 名 前 は ま だ な い
 ```
 
+
 ### Test
 
 ```
-nosetests
+python -m pytest
 ```
 
 ### Acknowledgement
diff --git a/setup.py b/setup.py
index 08b5f39..814647d 100644
--- a/setup.py
+++ b/setup.py
@@ -4,12 +4,29 @@
 from setuptools import find_packages
 from setuptools import setup
+from os import getenv
 
 
-setup(name='tiny_tokenizer',
-      version='1.3.0',
-      description='Tiny Word/Sentence Tokenizer',
-      author='himkt',
-      author_email='himkt@klis.tsukuba.ac.jp',
-      install_requires=['natto-py', 'kytea', 'sentencepiece'],
-      url='https://github.com/himkt/tiny_tokenizer',
-      packages=find_packages())
+
+try:
+    BUILD_WORD_TOKENIZER = int(getenv('BUILD_WORD_TOKENIZER', 1))
+except ValueError:
+    raise ValueError('BUILD_WORD_TOKENIZER should be an integer')
+
+
+install_requires = []
+if BUILD_WORD_TOKENIZER == 1:
+    install_requires.extend(['natto-py', 'kytea', 'sentencepiece'])
+else:
+    print('Install sentence tokenizer only')
+
+
+setup(
+    name='tiny_tokenizer',
+    version='1.3.1',
+    description='Tiny Word/Sentence Tokenizer',
+    author='himkt',
+    author_email='himkt@klis.tsukuba.ac.jp',
+    install_requires=install_requires,
+    url='https://github.com/himkt/tiny_tokenizer',
+    packages=find_packages()
+)
diff --git a/tests/test_word_tokenize.py b/tests/test_word_tokenize.py
index fc369ce..bb1e8fb 100644
--- a/tests/test_word_tokenize.py
+++ b/tests/test_word_tokenize.py
@@ -1,5 +1,7 @@
 from tiny_tokenizer.word_tokenizer import WordTokenizer
+
 import unittest
+import pytest
 
 
 SENTENCE1 = '吾輩は猫である'
@@ -10,21 +12,33 @@ class WordTokenizerTest(unittest.TestCase):
 
     def test_word_tokenize_with_kytea(self):
         """Test KyTea tokenizer."""
-        tokenizer = WordTokenizer('KyTea')
+        try:
+            tokenizer = WordTokenizer('KyTea')
+        except ModuleNotFoundError:
+            pytest.skip('skip kytea')
+
         expect = '吾輩 は 猫 で あ る'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
 
     def test_word_tokenize_with_mecab(self):
         """Test MeCab tokenizer."""
-        tokenizer = WordTokenizer('MeCab')
+        try:
+            tokenizer = WordTokenizer('MeCab')
+        except ModuleNotFoundError:
+            pytest.skip('skip mecab')
+
         expect = '吾輩 は 猫 で ある'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
 
     def test_word_tokenize_with_sentencepiece(self):
         """Test Sentencepiece tokenizer."""
-        tokenizer = WordTokenizer('Sentencepiece', 'data/model.spm')
+        try:
+            tokenizer = WordTokenizer('Sentencepiece', 'data/model.spm')
+        except ModuleNotFoundError:
+            pytest.skip('skip sentencepiece')
+
         expect = '▁ 吾 輩 は 猫 である'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
@@ -49,21 +63,33 @@ class WordTokenizerWithLowerCaseTest(unittest.TestCase):
 
     def test_word_tokenize_with_kytea(self):
         """Test KyTea tokenizer."""
-        tokenizer = WordTokenizer('kytea')
+        try:
+            tokenizer = WordTokenizer('kytea')
+        except ModuleNotFoundError:
+            pytest.skip('skip kytea')
+
         expect = '吾輩 は 猫 で あ る'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
 
     def test_word_tokenize_with_mecab(self):
         """Test MeCab tokenizer."""
-        tokenizer = WordTokenizer('mecab')
+        try:
+            tokenizer = WordTokenizer('mecab')
+        except ModuleNotFoundError:
+            pytest.skip('skip mecab')
+
         expect = '吾輩 は 猫 で ある'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
 
     def test_word_tokenize_with_sentencepiece(self):
         """Test Sentencepiece tokenizer."""
-        tokenizer = WordTokenizer('sentencepiece', 'data/model.spm')
+        try:
+            tokenizer = WordTokenizer('sentencepiece', 'data/model.spm')
+        except ModuleNotFoundError:
+            pytest.skip('skip sentencepiece')
+
         expect = '▁ 吾 輩 は 猫 である'.split(' ')
         result = tokenizer.tokenize(SENTENCE1)
         self.assertEqual(expect, result)
diff --git a/tiny_tokenizer/__init__.py b/tiny_tokenizer/__init__.py
index c68b554..a932ffc 100644
--- a/tiny_tokenizer/__init__.py
+++ b/tiny_tokenizer/__init__.py
@@ -1,4 +1,4 @@
 from .sentence_tokenizer import SentenceTokenizer
 from .word_tokenizer import WordTokenizer
 
-__version__ = '1.3.0'
+__version__ = "1.3.1"
diff --git a/tiny_tokenizer/word_tokenizer.py b/tiny_tokenizer/word_tokenizer.py
index 360e174..dedab21 100644
--- a/tiny_tokenizer/word_tokenizer.py
+++ b/tiny_tokenizer/word_tokenizer.py
@@ -27,20 +27,32 @@ def __init__(self, tokenizer=None, flags=''):
 
         # use external libraries
         if __tokenizer == 'mecab':
-            import natto
+            try:
+                import natto
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError('natto-py is not installed')
+
             flags = '-Owakati' if not flags else flags
             self.__tokenizer = natto.MeCab(flags)
             self.__tokenizer_name = 'MeCab'
             self.tokenize = self.__mecab_tokenize
 
         if __tokenizer == 'kytea':
-            import Mykytea
+            try:
+                import Mykytea
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError('kytea is not installed')
+
             self.__tokenizer = Mykytea.Mykytea(flags)
             self.__tokenizer_name = 'KyTea'
             self.tokenize = self.__kytea_tokenize
 
         elif __tokenizer == 'sentencepiece':
-            import sentencepiece
+            try:
+                import sentencepiece
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError('sentencepiece is not installed')
+
             self.__tokenizer = sentencepiece.SentencePieceProcessor()
             self.__tokenizer.load(flags)
             self.__tokenizer_name = 'Sentencepiece'
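
Below is a minimal sketch (not part of the patch itself) of how an install done with `BUILD_WORD_TOKENIZER=0 pip install tiny_tokenizer` is expected to behave from user code. It relies only on the `WordTokenizer` constructor and `tokenize` method exercised in the tests above and on the `ModuleNotFoundError` re-raised in `word_tokenizer.py`; the sample sentence and printed messages are illustrative.

```python
# Sketch only: assumes tiny_tokenizer was installed with
# BUILD_WORD_TOKENIZER=0, so natto-py, kytea and sentencepiece are absent.
from tiny_tokenizer import WordTokenizer

try:
    # The backend import is deferred to WordTokenizer.__init__, so the
    # package itself imports fine and any failure surfaces here.
    tokenizer = WordTokenizer('MeCab')
    print(tokenizer.tokenize('吾輩は猫である'))
except ModuleNotFoundError:
    # Raised by WordTokenizer when natto-py is not installed.
    print('Word tokenizer backend is unavailable; '
          'reinstall without BUILD_WORD_TOKENIZER=0 to enable it.')
```

SentenceTokenizer does not need any of the three optional backends, so it stays usable under either install mode, which is what the `install_requires` switch in `setup.py` relies on.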