Skip to content

Release v2.1.0

Compare
Choose a tag to compare
@himkt released this on 22 Jul at 12:15
9b8a3b1
  • Add support for the Sudachi tokenizer (#20). Example usage:
from tiny_tokenizer import SentenceTokenizer
from tiny_tokenizer import WordTokenizer


if __name__ == "__main__":
    # Demo: tokenize a single sentence into words with the Sudachi backend.

    # Instantiated first, as in the original example; it is not used again
    # in this snippet.
    sentence_tokenizer = SentenceTokenizer()

    # The `mode` keyword chooses Sudachi's splitting mode. See
    # https://github.com/WorksApplications/SudachiPy#as-a-python-package
    word_tokenizer_options = {"tokenizer": "Sudachi", "mode": "A"}
    tokenizer = WordTokenizer(**word_tokenizer_options)

    sentence = "我輩は猫である."
    print("input: ", sentence)

    # Print whatever the word tokenizer returns for the sentence.
    print(tokenizer.tokenize(sentence))