diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 09cc3f76..3af3b7a5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -19,6 +19,7 @@
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Reader;
+import java.io.StringReader;
 import java.io.StringWriter;
 import java.nio.CharBuffer;
 import java.util.ArrayList;
@@ -65,8 +66,9 @@ class JapaneseTokenizer implements Tokenizer {
     }
 
     @Override
-    public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
+    public List<Morpheme> tokenize(Tokenizer.SplitMode mode, String text) {
         if (text.isEmpty()) {
+            // return a MorphemeList instance for cases where internalCost is required.
             return MorphemeList.EMPTY;
         }
         UTF8InputText input = buildInputText(text);
@@ -74,56 +76,45 @@ public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
     }
 
     @Override
-    public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
+    public Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text) {
         if (text.isEmpty()) {
             return Collections.emptyList();
         }
-        SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
-        int length = analysis.tokenizeBuffer(text);
-        ArrayList<MorphemeList> result = analysis.result;
-        int bos = analysis.bos;
-        if (length < 0) {
-            // treat remaining thing as a single sentence
-            int eos = analysis.input.getText().length();
-            if (bos != eos) {
-                UTF8InputText slice = analysis.input;
-                if (bos != 0) {
-                    slice = slice.slice(bos, eos);
-                }
-                result.add(tokenizeSentence(mode, slice));
-            }
-        }
+        StringReader input = new StringReader(text);
+        SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
+        List<List<Morpheme>> result = new ArrayList<>();
+        analysis.forEachRemaining(result::add);
         return result;
     }
 
     @Override
-    public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
-        IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
-        CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
-        SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
-
-        while (wrappedReader.read(buffer) > 0) {
-            buffer.flip();
-            int length = analysis.tokenizeBuffer(buffer);
-            if (length < 0) {
-                buffer.position(analysis.bosPosition());
-                buffer.compact();
-            }
-        }
-        buffer.flip();
-        ArrayList<MorphemeList> sentences = analysis.result;
-
-        if (buffer.hasRemaining()) {
-            sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
-        }
+    public Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input) {
+        return new SentenceSplittingLazyAnalysis(mode, this, input);
+    }
 
-        return sentences;
+    @Override
+    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input) {
+        return tokenizeSentences(mode, input);
     }
 
     @Override
-    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
-        return new SentenceSplittingLazyAnalysis(mode, this, readable);
+    public List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode) {
+        if (morphemes instanceof MorphemeList) {
+            return ((MorphemeList) morphemes).split(mode);
+        }
+
+        List<Morpheme> result = new ArrayList<>();
+        for (Morpheme m : morphemes) {
+            if (m instanceof SingleMorphemeImpl) {
+                ((SingleMorphemeImpl) m).appendSplitsTo(result, mode);
+            } else {
+                for (Morpheme subsplit : m.split(mode)) {
+                    result.add(subsplit);
+                }
+            }
+        }
+        return result;
     }
 
     @Override
@@ -161,7 +152,7 @@ UTF8InputText buildInputText(CharSequence text) {
         return input;
     }
 
-    MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
+    List<Morpheme> tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
         checkIfAlive();
         buildLattice(input);
diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
index 385f214e..907bcfb5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
+++ b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
@@ -215,15 +215,7 @@ public StringsCache getStrings() {
     }
 
     /* internal */ void appendSplitsTo(List<LatticeNodeImpl> result, Tokenizer.SplitMode mode) {
-        if (mode == Tokenizer.SplitMode.A) {
-            appendSplitsTo(result, getWordInfo().getAunitSplit());
-        } else if (mode == Tokenizer.SplitMode.B) {
-            appendSplitsTo(result, getWordInfo().getBunitSplit());
-        } else if (mode == Tokenizer.SplitMode.C) {
-            appendSplitsTo(result, getWordInfo().getCunitSplit());
-        } else {
-            result.add(this);
-        }
+        appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
     }
 
     private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
index 21571e9b..df3fdc4d 100644
--- a/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
+++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,7 +48,7 @@ public class MorphemeList extends AbstractList<Morpheme> {
     }
 
     @Override
-    public Morpheme get(int index) {
+    public MorphemeListItem get(int index) {
         return new MorphemeListItem(this, index);
     }
 
@@ -91,7 +91,7 @@ WordInfo getWordInfo(int index) {
         return path.get(index).getWordInfo();
     }
 
-    List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
+    MorphemeList split(Tokenizer.SplitMode mode, int index) {
         List<LatticeNodeImpl> nodes = new ArrayList<>();
         LatticeNodeImpl node = path.get(index);
         node.appendSplitsTo(nodes, mode);
@@ -106,18 +106,19 @@ List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
      * @param mode
      *            requested split mode
      * @return current list or a new list in the requested split mode.
+     *
+     * @deprecated will be internal only. Use {@link Tokenizer#split} instead.
      */
+    @Deprecated
     public MorphemeList split(Tokenizer.SplitMode mode) {
         if (mode.compareTo(this.mode) >= 0) {
             return this;
         }
 
         List<LatticeNodeImpl> nodes = new ArrayList<>();
-
         for (LatticeNodeImpl node : path) {
             node.appendSplitsTo(nodes, mode);
         }
-
         return new MorphemeList(inputText, grammar, lexicon, nodes, allowEmptyMorpheme, mode);
     }
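With `MorphemeList.split(SplitMode)` deprecated in the hunk above, resplitting moves to the new `Tokenizer#split` entry point this PR adds. A minimal migration sketch, using only APIs visible in this diff (tokenizer construction is elided):

```java
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;

import java.util.List;

class SplitMigration {
    static void resplit(Tokenizer tokenizer) {
        // Before: MorphemeList morphemes = tokenizer.tokenize("東京都");
        //         MorphemeList aUnits = morphemes.split(Tokenizer.SplitMode.A);
        // After: tokenize() is typed as List<Morpheme>, and resplitting is a
        // tokenizer operation that also accepts hand-built morpheme lists.
        List<Morpheme> morphemes = tokenizer.tokenize("東京都");
        List<Morpheme> aUnits = tokenizer.split(morphemes, Tokenizer.SplitMode.A);
        System.out.println(aUnits.size());
    }
}
```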
diff --git a/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java b/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java
index e58d8fc6..125fd853 100644
--- a/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java
+++ b/src/main/java/com/worksap/nlp/sudachi/MorphemeListItem.java
@@ -77,7 +77,7 @@ public String surface() {
     }
 
     @Override
-    public List<Morpheme> split(Tokenizer.SplitMode mode) {
+    public MorphemeList split(Tokenizer.SplitMode mode) {
         return list.split(mode, index);
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java
deleted file mode 100644
index 254e0e51..00000000
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2023 Works Applications Co., Ltd.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *        http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.worksap.nlp.sudachi;
-
-import com.worksap.nlp.sudachi.dictionary.LexiconSet;
-import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-
-/*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker {
-    private final SentenceDetector detector = new SentenceDetector();
-
-    private final Tokenizer.SplitMode mode;
-    private final JapaneseTokenizer tokenizer;
-    final ArrayList<MorphemeList> result = new ArrayList<>();
-
-    SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) {
-        this.mode = mode;
-        this.tokenizer = tokenizer;
-    }
-
-    UTF8InputText input;
-    int bos;
-
-    int tokenizeBuffer(CharSequence buffer) {
-        UTF8InputText input = tokenizer.buildInputText(buffer);
-        String normalized = input.getText();
-        this.input = input;
-
-        int bos = 0;
-        int length;
-
-        this.bos = bos;
-        while ((length = detector.getEos(normalized, this)) > 0) {
-            int eos = bos + length;
-            if (eos < normalized.length()) {
-                eos = input.getNextInOriginal(eos - 1);
-                length = eos - bos;
-            }
-            UTF8InputText sentence = input.slice(bos, eos);
-            result.add(tokenizer.tokenizeSentence(mode, sentence));
-            normalized = normalized.substring(length);
-            bos = eos;
-            this.bos = bos;
-        }
-
-        // buffer is full, need to clean it up
-        if (length < 0 && buffer.length() == -length) {
-            result.add(tokenizer.tokenizeSentence(mode, input));
-            return -length;
-        }
-
-        return length;
-    }
-
-    int bosPosition() {
-        return input.textIndexToOriginalTextIndex(bos);
-    }
-
-    @Override
-    public boolean hasNonBreakWord(int length) {
-        UTF8InputText inp = input;
-        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
-        byte[] bytes = inp.getByteText();
-        LexiconSet lexicon = tokenizer.lexicon;
-        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
-            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
-            while (iterator.hasNext()) {
-                int[] r = iterator.next();
-                int l = r[1];
-                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-}
diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 44ebfdce..72eb9caf 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -100,7 +100,7 @@ public boolean hasNext() {
     }
 
     @Override
-    public MorphemeList next() {
+    public List<Morpheme> next() {
         int length = detector.getEos(normalized, this);
         if (length > 0) { // sentence found
             int eos = bos + length;
diff --git a/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java b/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
index 55cdc87e..a8f2d482 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
@@ -118,16 +118,8 @@ public List<Morpheme> split(Tokenizer.SplitMode mode) {
      *
      * @see LatticeNodeImpl.appendSplitsTo
      */
-    private void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
-        if (mode == Tokenizer.SplitMode.A) {
-            appendSplitsTo(result, getWordInfo().getAunitSplit());
-        } else if (mode == Tokenizer.SplitMode.B) {
-            appendSplitsTo(result, getWordInfo().getBunitSplit());
-        } else if (mode == Tokenizer.SplitMode.C) {
-            appendSplitsTo(result, getWordInfo().getCunitSplit());
-        } else {
-            result.add(this);
-        }
+    /* internal */ void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
+        appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
     }
 
     private void appendSplitsTo(List<Morpheme> result, int[] splitIds) {
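With the eager `SentenceSplittingAnalysis` deleted, every entry point now funnels into `SentenceSplittingLazyAnalysis`; the `String` overload of `tokenizeSentences` simply drains the lazy iterator, as its new body shows. Callers that relied on eager behaviour can do the same. A sketch of the equivalent collect, using only APIs from this diff:

```java
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;

import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

class EagerCollect {
    static List<List<Morpheme>> sentences(Tokenizer tokenizer, String text) {
        // Build the lazy sentence iterator over a StringReader, then drain
        // it into a list, reproducing the old eager behaviour.
        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, new StringReader(text));
        List<List<Morpheme>> result = new ArrayList<>();
        it.forEachRemaining(result::add);
        return result;
    }
}
```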
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 89f6adef..95e2f1aa 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -37,7 +37,7 @@ public interface Tokenizer {
      *            input text
      * @return a result of tokenizing
      */
-    MorphemeList tokenize(SplitMode mode, String text);
+    List<Morpheme> tokenize(SplitMode mode, String text);
 
     /**
      *
@@ -48,7 +48,7 @@ public interface Tokenizer {
      * @return a result of tokenizing
      * @see #tokenize(SplitMode,String)
      */
-    default MorphemeList tokenize(final String text) {
+    default List<Morpheme> tokenize(final String text) {
         return tokenize(SplitMode.C, text);
     }
 
@@ -62,7 +62,7 @@ default MorphemeList tokenize(final String text) {
      *            input text
      * @return a result of tokenizing
      */
-    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text);
+    Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text);
 
     /**
      * Tokenize sentences. Divide an input text into sentences and tokenize them
@@ -73,42 +73,32 @@ default MorphemeList tokenize(final String text) {
      * @return a result of tokenizing
      * @see #tokenizeSentences(SplitMode,String)
      */
-    default Iterable<MorphemeList> tokenizeSentences(String text) {
+    default Iterable<List<Morpheme>> tokenizeSentences(String text) {
         return tokenizeSentences(SplitMode.C, text);
     }
 
     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
-     * them. It reads all text in the input and uses a lot of memory when the text
-     * is long.
+     * them. It reads the input lazily.
      *
      * @param mode
      *            a mode of splitting
      * @param input
-     *            a reader of input text
-     * @return a result of tokenizing
-     * @throws IOException
-     *             if reading the stream fails
-     * @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
+     *            a readable input text
+     * @return an iterator of tokenized sentences
      */
-    @Deprecated
-    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
+    Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input);
 
     /**
-     * Reads an input text from {@code input}, divides it into sentences and
-     * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
-     * uses a lot of memory when the text is long.
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them with {@link SplitMode}.C. It reads the input lazily.
      *
      * @param input
-     *            a reader of input text
-     * @return a result of tokenizing
-     * @throws IOException
-     *             if reading the stream fails
-     * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
+     *            a readable input text
+     * @return an iterator of tokenized sentences
+     * @see #tokenizeSentences(SplitMode,Readable)
      */
-    @Deprecated
-    default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
+    default Iterator<List<Morpheme>> tokenizeSentences(Readable input) {
         return tokenizeSentences(SplitMode.C, input);
     }
 
@@ -121,7 +111,10 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExcepti
      * @param input
      *            a readable input text
      * @return a result of tokenizing
+     * @deprecated renamed to {@link #tokenizeSentences(SplitMode, Readable)}
+     *
      */
+    @Deprecated
     Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);
 
     /**
@@ -132,11 +125,25 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExcepti
      *            a readable input text
      * @return a result of tokenizing
      * @see #lazyTokenizeSentences(SplitMode,Readable)
+     * @deprecated renamed to {@link #tokenizeSentences(Readable)}
      */
+    @Deprecated
     default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
         return lazyTokenizeSentences(SplitMode.C, input);
     }
 
+    /**
+     * Produce a copy of the given list in a finer split mode. May return the given
+     * list itself if the mode is not finer than its current one. The given list is not modified.
+     *
+     * @param morphemes
+     *            list of morphemes to split.
+     * @param mode
+     *            requested split mode
+     * @return the given list, or a new list in the requested split mode.
+     */
+    List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode);
+
     /**
      * Prints lattice structure of the analysis into the passed {@link PrintStream}.
      *
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
index 844a6b98..9f3677a8 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLexicon.java
@@ -208,7 +208,7 @@ public void calculateDynamicCosts(Tokenizer tokenizer) {
         }
         int headwordPtr = wordInfos.headwordPtr(wordId);
         String headword = strings.string(headwordPtr);
-        MorphemeList ms = tokenizer.tokenize(headword);
+        MorphemeList ms = (MorphemeList) tokenizer.tokenize(headword);
         int cost = ms.getInternalCost() + USER_DICT_COST_PAR_MORPH * ms.size();
         if (cost > Short.MAX_VALUE) {
             cost = Short.MAX_VALUE;
diff --git a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
index 8da239d6..9ccd2acb 100644
--- a/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
+++ b/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfo.java
@@ -16,10 +16,11 @@
 
 package com.worksap.nlp.sudachi.dictionary;
 
-import com.worksap.nlp.sudachi.StringUtil;
-
 import java.nio.ByteBuffer;
 
+import com.worksap.nlp.sudachi.StringUtil;
+import com.worksap.nlp.sudachi.Tokenizer;
+
 /**
  * Internal morpheme information. This class does not contain any strings.
  *
@@ -186,6 +187,23 @@ public int[] getCunitSplit() {
         return cUnitSplit;
     }
 
+    /**
+     * Returns the array of word IDs which the morpheme is compounded of in the
+     * given split mode.
+     *
+     * @return the word IDs of the unit morphemes
+     */
+    public int[] getUnitSplit(Tokenizer.SplitMode mode) {
+        if (mode == Tokenizer.SplitMode.A) {
+            return getAunitSplit();
+        }
+        if (mode == Tokenizer.SplitMode.B) {
+            return getBunitSplit();
+        }
+        assert (mode == Tokenizer.SplitMode.C);
+        return getCunitSplit();
+    }
+
     /**
      * Returns the array of the morphemes which the morpheme is compounded of.
      *
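The `Readable` overload of `tokenizeSentences` now returns a lazy iterator, so a large input can be analyzed with bounded memory. A usage sketch (the file name and charset are illustrative, not part of this PR):

```java
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;

class StreamingUsage {
    static void run(Tokenizer tokenizer) throws IOException {
        // Sentences are produced one at a time; the whole file is never
        // buffered in memory.
        try (Reader reader = Files.newBufferedReader(Paths.get("large.txt"), StandardCharsets.UTF_8)) {
            Iterator<List<Morpheme>> sentences = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader);
            while (sentences.hasNext()) {
                for (Morpheme m : sentences.next()) {
                    System.out.println(m.surface() + "\t" + m.normalizedForm());
                }
            }
        }
    }
}
```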
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index 85aebbae..c77467bd 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -26,6 +26,35 @@ import kotlin.test.assertFailsWith
 class JapaneseTokenizerStreamingTest {
   private val tokenizer = TestDictionary.user0().tokenizer()
 
+  @Test
+  fun streamingReadable() {
+    val reader = StringReader("あ".repeat(5000))
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun callingNextWithoutTextFails() {
+    val reader = StringReader("東京")
+    val it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
+
+    val morphemes = it.next()
+    assertEquals("東京", morphemes.get(0).surface())
+
+    assertFailsWith<NoSuchElementException>(
+        block = { it.next() },
+    )
+  }
+
+  @Test
+  fun streamingLongTextShouldNotCauseOOM() {
+    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(10 * 1024 * 1024, totalLength)
+  }
+
   class BadReader(private val data: String, private val window: Int = 512) : Reader() {
     private var position: Int = 0
 
@@ -50,61 +79,14 @@ class JapaneseTokenizerStreamingTest {
     override fun close() {}
   }
 
-  @Test
-  fun streamingTest() {
-    // Testing deprecated method `tokenizeSentences(Reader)`
-    val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun streamingTestWithBadReader() {
-    // Testing deprecated method `tokenizeSentences(Reader)`
-    val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun streamingReadable() {
-    val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(5000, totalLength)
-  }
-
-  @Test
-  fun callingNextWithoutTextFails() {
-    val reader = StringReader("東京")
-    val it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
-
-    val morphemes = it.next()
-    assertEquals("東京", morphemes.get(0).surface())
-
-    assertFailsWith<NoSuchElementException>(
-        block = { it.next() },
-    )
-  }
-
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
-  @Test
-  fun streamingLongTextShouldNotCauseOOM() {
-    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
-    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
-    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
-    assertEquals(10 * 1024 * 1024, totalLength)
-  }
-
   class FailReader(private val data: String) : Reader() {
     private var position: Int = 0
 
@@ -133,13 +115,13 @@ class JapaneseTokenizerStreamingTest {
   fun failsWhenReaderFails() {
     var reader = FailReader("あ".repeat(500))
     // should not fail on the instantiation
-    var it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
+    var it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<RuntimeException>(
         block = { it.hasNext() },
     )
 
     reader = FailReader("あ".repeat(500))
-    it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
+    it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<RuntimeException>(
         block = { it.next() },
     )
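Note that the `Readable` overloads return `Iterator` rather than `Iterable`, so Java callers that previously used for-each over `tokenizeSentences(Reader)` need a small adapter; Kotlin callers get the same effect with `asSequence()` as in the tests above. A sketch of a one-shot adapter:

```java
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;

import java.io.StringReader;
import java.util.Iterator;
import java.util.List;

class IteratorAdapter {
    static int countMorphemes(Tokenizer tokenizer, String text) {
        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, new StringReader(text));
        // Iterable is a functional interface, so a lambda over the iterator
        // gives a one-shot Iterable when for-each syntax is preferred.
        Iterable<List<Morpheme>> sentences = () -> it;
        int count = 0;
        for (List<Morpheme> sentence : sentences) {
            count += sentence.size();
        }
        return count;
    }
}
```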
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 50494e85..f34a3abf 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -27,6 +27,7 @@
 import java.io.StringReader;
 import java.util.Iterator;
 import java.util.List;
+import java.util.ArrayList;
 
 import javax.json.Json;
 import javax.json.JsonArray;
@@ -140,7 +141,7 @@ public void tokenizeKanjiAlphabetWord() {
 
     @Test
     public void tokenizeSentences() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("京都。東京.東京都。").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("京都。東京.東京都。").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -157,7 +158,7 @@ public void tokenizeSentences() {
 
     @Test
     public void tokenizerWithDots() {
-        MorphemeList s = tokenizer.tokenize("京都…");
+        List<Morpheme> s = tokenizer.tokenize("京都…");
         assertThat(s.size(), is(4));
         assertThat(s.get(1).surface(), is("…"));
         assertThat(s.get(1).normalizedForm(), is("."));
@@ -169,7 +170,7 @@ public void tokenizerWithDots() {
 
     @Test
     public void tokenizerWithModifiedChar() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("´´").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("´´").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(4));
         assertThat(it.hasNext(), is(false));
@@ -177,7 +178,7 @@ public void tokenizerWithModifiedChar() {
 
     @Test
     public void tokenizeSentencesWithSurrogatePair() {
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences("。😀").iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences("。😀").iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(true));
@@ -185,15 +186,10 @@ public void tokenizeSentencesWithSurrogatePair() {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithReader() throws IOException {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
@@ -205,11 +201,6 @@ public void tokenizerWithReader() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithLongReader() throws IOException {
         StringBuilder sb = new StringBuilder();
@@ -218,7 +209,7 @@ public void tokenizerWithLongReader() throws IOException {
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
@@ -228,11 +219,6 @@ public void tokenizerWithLongReader() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
-    /**
-     * @deprecated testing deprecated method
-     *             {@link #Tokenizer.tokenizeSentences(Reader)}.
-     */
-    @Deprecated
     @Test
     public void tokenizerWithReaderAndNormalization() throws IOException {
         StringBuilder sb = new StringBuilder();
@@ -241,7 +227,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<MorphemeList> it = tokenizer.tokenizeSentences(reader).iterator();
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
@@ -254,6 +240,28 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizeSentencesWithSurrogatePairAtBufferLimit() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
+            sb.append("。");
+        }
+        sb.append("😀");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader);
+
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentences() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
@@ -275,6 +283,11 @@ public void lazyTokenizeSentences() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentencesWithLongText() {
         StringBuilder sb = new StringBuilder();
@@ -293,6 +306,11 @@ public void lazyTokenizeSentencesWithLongText() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#lazyTokenizeSentences(Readable)}.
+     */
+    @Deprecated
     @Test
     public void lazyTokenizeSentencesWithNormalization() {
         StringBuilder sb = new StringBuilder();
@@ -314,23 +332,6 @@ public void lazyTokenizeSentencesWithNormalization() {
         assertThat(it.hasNext(), is(false));
     }
 
-    @Test
-    public void lazyTokenizeSentencesWithSurrogatePair() {
-        StringBuilder sb = new StringBuilder();
-        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
-            sb.append("。");
-        }
-        sb.append("😀");
-        StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
-
-        assertThat(it.hasNext(), is(true));
-        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
-        assertThat(it.hasNext(), is(true));
-        assertThat(it.next().size(), is(1));
-        assertThat(it.hasNext(), is(false));
-    }
-
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");
@@ -373,7 +374,7 @@ public void disableEmptyMorpheme() throws IOException {
 
     @Test
     public void splitC() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東東京都");
         assertThat(morphemesC.get(0).surface(), is("東"));
         assertThat(morphemesC.get(1).surface(), is("東"));
         assertThat(morphemesC.get(2).surface(), is("京都"));
@@ -381,54 +382,54 @@ public void splitC() {
 
     @Test
     public void splitAfterTokenizeCtoA() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
-        MorphemeList morphemesA = morphemesC.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemesA = tokenizer.split(morphemesC, Tokenizer.SplitMode.A);
         assertThat(morphemesA.size(), is(2));
     }
 
     @Test
     public void splitAfterTokenizeCtoB() {
-        MorphemeList morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemesC = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemesC.size(), is(1));
-        MorphemeList morphemesB = morphemesC.split(Tokenizer.SplitMode.B);
+        List<Morpheme> morphemesB = tokenizer.split(morphemesC, Tokenizer.SplitMode.B);
         assertThat(morphemesB.size(), is(1));
     }
 
     @Test
     public void splitAfterTokenizeCtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "東京都");
         assertThat(morphemes1.size(), is(1));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
     @Test
     public void splitAfterTokenizeAtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.A, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.A, "東京都");
         assertThat(morphemes1.size(), is(2));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
     @Test
     public void splitAfterTokenizeBtoC() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.B, "東京都");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.B, "東京都");
         assertThat(morphemes1.size(), is(1));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.C);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.C);
         assertThat(morphemes2, sameInstance(morphemes1));
     }
 
     @Test
     public void splitWithZeroWidthTokens() {
-        MorphemeList morphemes1 = tokenizer.tokenize("…東京都…");
+        List<Morpheme> morphemes1 = tokenizer.tokenize("…東京都…");
         assertThat(morphemes1.size(), is(7));
         assertThat(morphemes1.get(0), morpheme("…", 0, 1));
         assertThat(morphemes1.get(1), morpheme("", 1, 1));
         assertThat(morphemes1.get(2), morpheme("", 1, 1));
         assertThat(morphemes1.get(3), morpheme("東京都", 1, 4));
         assertThat(morphemes1.get(4), morpheme("…", 4, 5));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.A);
         assertThat(morphemes2.size(), is(8));
         assertThat(morphemes2.get(3), morpheme("東京", 1, 3));
         assertThat(morphemes2.get(4), morpheme("都", 3, 4));
@@ -436,14 +437,42 @@ public void splitWithZeroWidthTokens() {
 
     @Test
     public void splitSingleToken() {
-        MorphemeList morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な");
+        List<Morpheme> morphemes1 = tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な");
         assertThat(morphemes1.size(), is(1));
         assertThat(morphemes1.get(0), morpheme("な。な", 0, 3));
-        MorphemeList morphemes2 = morphemes1.split(Tokenizer.SplitMode.A);
+        List<Morpheme> morphemes2 = tokenizer.split(morphemes1, Tokenizer.SplitMode.A);
         assertThat(morphemes2.get(0), morpheme("な。な", 0, 3));
         assertThat(morphemes2.get(0).normalizedForm(), is("アイウ"));
     }
 
+    @Test
+    public void splitListOfSingleMorphemes() {
+        List<Morpheme> morphemes = new ArrayList<>();
+
+        morphemes.add(dict.lookup("京都").get(0));
+        morphemes.add(dict.lookup("東京都").get(0));
+
+        List<Morpheme> splits = tokenizer.split(morphemes, Tokenizer.SplitMode.A);
+        assertThat(splits.size(), is(3));
+        assertThat(splits.get(0).normalizedForm(), is("京都"));
+        assertThat(splits.get(1).normalizedForm(), is("東京"));
+        assertThat(splits.get(2).normalizedForm(), is("都"));
+    }
+
+    @Test
+    public void splitMixedMorphemeList() {
+        List<Morpheme> morphemes = new ArrayList<>();
+        for (Morpheme m : tokenizer.tokenize(Tokenizer.SplitMode.C, "な。な")) {
+            morphemes.add(m);
+        }
+        morphemes.add(dict.lookup("東京都").get(0));
+
+        List<Morpheme> splits = tokenizer.split(morphemes, Tokenizer.SplitMode.A);
+        assertThat(splits.size(), is(3));
+        assertThat(splits.get(0).normalizedForm(), is("アイウ"));
+        assertThat(splits.get(2).normalizedForm(), is("都"));
+    }
+
     @Test
     public void dumpInternalStructures() {
         String json = tokenizer.dumpInternalStructures("東京都");
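The two new tests above exercise the key property of `Tokenizer#split`: it accepts a plain `List<Morpheme>` whose elements come from different sources. A sketch mirroring them (assuming `dict` is a `Dictionary` exposing the same `lookup` the tests use; construction details are omitted):

```java
import com.worksap.nlp.sudachi.Dictionary;
import com.worksap.nlp.sudachi.Morpheme;
import com.worksap.nlp.sudachi.Tokenizer;

import java.util.ArrayList;
import java.util.List;

class MixedListSplit {
    static List<Morpheme> splitMixed(Dictionary dict, Tokenizer tokenizer) {
        // Combine tokenizer output (MorphemeList items) with a direct
        // dictionary lookup (a single morpheme), then resplit in one call.
        List<Morpheme> morphemes = new ArrayList<>(tokenizer.tokenize("京都"));
        morphemes.add(dict.lookup("東京都").get(0));
        return tokenizer.split(morphemes, Tokenizer.SplitMode.A);
    }
}
```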