Skip to content

Commit

Permalink
Merge pull request #254 from WorksApplications/feature/251-deprecate-tokenize-sentences
Browse files (browse the repository at this point in the history)

Deprecate tokenizeSentences and organize tokenizer method signatures
  • Loading branch information
mh-northlander authored Dec 2, 2024
2 parents 8d0cf70 + 1d24e46 commit 239d02b
Show file tree
Hide file tree
Showing 12 changed files with 209 additions and 290 deletions.
71 changes: 31 additions & 40 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.CharBuffer;
import java.util.ArrayList;
Expand Down Expand Up @@ -65,65 +66,55 @@ class JapaneseTokenizer implements Tokenizer {
}

@Override
public MorphemeList tokenize(Tokenizer.SplitMode mode, String text) {
public List<Morpheme> tokenize(Tokenizer.SplitMode mode, String text) {
if (text.isEmpty()) {
// return MorphemeList instance for the case internalCost is required.
return MorphemeList.EMPTY;
}
UTF8InputText input = buildInputText(text);
return tokenizeSentence(mode, input);
}

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
public Iterable<List<Morpheme>> tokenizeSentences(SplitMode mode, String text) {
if (text.isEmpty()) {
return Collections.emptyList();
}

SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
int length = analysis.tokenizeBuffer(text);
ArrayList<MorphemeList> result = analysis.result;
int bos = analysis.bos;
if (length < 0) {
// treat remaining thing as a single sentence
int eos = analysis.input.getText().length();
if (bos != eos) {
UTF8InputText slice = analysis.input;
if (bos != 0) {
slice = slice.slice(bos, eos);
}
result.add(tokenizeSentence(mode, slice));
}
}
StringReader input = new StringReader(text);
SentenceSplittingLazyAnalysis analysis = new SentenceSplittingLazyAnalysis(mode, this, input);
List<List<Morpheme>> result = new ArrayList<>();
analysis.forEachRemaining(result::add);
return result;
}

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);

while (wrappedReader.read(buffer) > 0) {
buffer.flip();
int length = analysis.tokenizeBuffer(buffer);
if (length < 0) {
buffer.position(analysis.bosPosition());
buffer.compact();
}
}
buffer.flip();
ArrayList<MorphemeList> sentences = analysis.result;

if (buffer.hasRemaining()) {
sentences.add(tokenizeSentence(mode, buildInputText(buffer)));
}
// Sentence-by-sentence tokenization of text pulled from a Readable.
//
// mode  - split mode handed to the analysis object
// input - character source handed to the analysis object
//
// Returns a new SentenceSplittingLazyAnalysis bound to this tokenizer; the
// analysis object itself is the iterator that yields one morpheme list per
// sentence.
// NOTE(review): presumably sentences are read and tokenized on demand as the
// iterator is advanced - confirm against SentenceSplittingLazyAnalysis.
public Iterator<List<Morpheme>> tokenizeSentences(SplitMode mode, Readable input) {
return new SentenceSplittingLazyAnalysis(mode, this, input);
}

return sentences;
/**
 * Tokenizes the sentences read from {@code input}.
 *
 * This method contains no logic of its own: it forwards both arguments to
 * {@link #tokenizeSentences(SplitMode, Readable)} and returns that iterator
 * unchanged.
 *
 * @param mode
 *            split mode forwarded to {@code tokenizeSentences}
 * @param input
 *            character source forwarded to {@code tokenizeSentences}
 * @return the iterator produced by {@code tokenizeSentences(mode, input)}
 */
@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input) {
return tokenizeSentences(mode, input);
}

@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
return new SentenceSplittingLazyAnalysis(mode, this, readable);
// Re-splits an already tokenized list of morphemes into the requested mode.
//
// morphemes - morphemes to split; a MorphemeList is delegated to as a whole,
//             any other List is processed element by element
// mode      - split mode passed through to the underlying split calls
//
// Returns the result of MorphemeList.split(mode) for a MorphemeList argument,
// otherwise a newly allocated list holding the pieces of every input morpheme
// in input order.
public List<Morpheme> split(List<Morpheme> morphemes, SplitMode mode) {
    // Whole-list path: MorphemeList provides its own split(mode).
    if (morphemes instanceof MorphemeList) {
        MorphemeList wholeList = (MorphemeList) morphemes;
        return wholeList.split(mode);
    }

    // Element-wise path: gather the pieces of each morpheme into one flat list.
    List<Morpheme> pieces = new ArrayList<>();
    for (Morpheme morpheme : morphemes) {
        if (!(morpheme instanceof SingleMorphemeImpl)) {
            // Other implementations are only reachable through Morpheme.split.
            pieces.addAll(morpheme.split(mode));
            continue;
        }
        // SingleMorphemeImpl can append straight into the shared output list.
        SingleMorphemeImpl single = (SingleMorphemeImpl) morpheme;
        single.appendSplitsTo(pieces, mode);
    }
    return pieces;
}

@Override
Expand Down Expand Up @@ -161,7 +152,7 @@ UTF8InputText buildInputText(CharSequence text) {
return input;
}

MorphemeList tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
List<Morpheme> tokenizeSentence(Tokenizer.SplitMode mode, UTF8InputText input) {
checkIfAlive();
buildLattice(input);

Expand Down
10 changes: 1 addition & 9 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -215,15 +215,7 @@ public StringsCache getStrings() {
}

/* internal */ void appendSplitsTo(List<LatticeNodeImpl> result, Tokenizer.SplitMode mode) {
if (mode == Tokenizer.SplitMode.A) {
appendSplitsTo(result, getWordInfo().getAunitSplit());
} else if (mode == Tokenizer.SplitMode.B) {
appendSplitsTo(result, getWordInfo().getBunitSplit());
} else if (mode == Tokenizer.SplitMode.C) {
appendSplitsTo(result, getWordInfo().getCunitSplit());
} else {
result.add(this);
}
appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
}

private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/MorphemeList.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -48,7 +48,7 @@ public class MorphemeList extends AbstractList<Morpheme> {
}

@Override
public Morpheme get(int index) {
public MorphemeListItem get(int index) {
return new MorphemeListItem(this, index);
}

Expand Down Expand Up @@ -91,7 +91,7 @@ WordInfo getWordInfo(int index) {
return path.get(index).getWordInfo();
}

List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
MorphemeList split(Tokenizer.SplitMode mode, int index) {
List<LatticeNodeImpl> nodes = new ArrayList<>();
LatticeNodeImpl node = path.get(index);
node.appendSplitsTo(nodes, mode);
Expand All @@ -106,18 +106,19 @@ List<Morpheme> split(Tokenizer.SplitMode mode, int index) {
* @param mode
* requested split mode
* @return current list or a new list in the requested split mode.
*
* @deprecated will be internal only. use {@link Tokenizer#split} instead.
*/
@Deprecated
public MorphemeList split(Tokenizer.SplitMode mode) {
    // A re-split is performed only when the requested mode orders strictly
    // before the mode this list was built with; otherwise the list is reused.
    final boolean needsResplit = mode.compareTo(this.mode) < 0;
    if (!needsResplit) {
        return this;
    }

    // Let every node on the current path contribute its sub-units for the
    // requested mode, preserving path order.
    List<LatticeNodeImpl> splitNodes = new ArrayList<>();
    path.forEach(node -> node.appendSplitsTo(splitNodes, mode));

    // The new list shares input text, grammar and lexicon with this one.
    return new MorphemeList(inputText, grammar, lexicon, splitNodes, allowEmptyMorpheme, mode);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public String surface() {
}

@Override
public List<Morpheme> split(Tokenizer.SplitMode mode) {
public MorphemeList split(Tokenizer.SplitMode mode) {
return list.split(mode, index);
}

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public boolean hasNext() {
}

@Override
public MorphemeList next() {
public List<Morpheme> next() {
int length = detector.getEos(normalized, this);
if (length > 0) { // sentence found
int eos = bos + length;
Expand Down
12 changes: 2 additions & 10 deletions src/main/java/com/worksap/nlp/sudachi/SingleMorphemeImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,16 +118,8 @@ public List<Morpheme> split(Tokenizer.SplitMode mode) {
*
* @see LatticeNodeImpl#appendSplitsTo
*/
private void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
if (mode == Tokenizer.SplitMode.A) {
appendSplitsTo(result, getWordInfo().getAunitSplit());
} else if (mode == Tokenizer.SplitMode.B) {
appendSplitsTo(result, getWordInfo().getBunitSplit());
} else if (mode == Tokenizer.SplitMode.C) {
appendSplitsTo(result, getWordInfo().getCunitSplit());
} else {
result.add(this);
}
/* internal */ void appendSplitsTo(List<Morpheme> result, Tokenizer.SplitMode mode) {
appendSplitsTo(result, getWordInfo().getUnitSplit(mode));
}

private void appendSplitsTo(List<Morpheme> result, int[] splitIds) {
Expand Down
Loading

0 comments on commit 239d02b

Please sign in to comment.