-
-
Notifications
You must be signed in to change notification settings - Fork 72
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #231 from WorksApplications/feature/lazy-tokenize-sentences
Lazy sentence split and tokenization
- Loading branch information
Showing
6 changed files
with
432 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
158 changes: 158 additions & 0 deletions
158
src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
/* | ||
* Copyright (c) 2024 Works Applications Co., Ltd. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.worksap.nlp.sudachi; | ||
|
||
import java.io.IOException; | ||
import java.io.UncheckedIOException; | ||
import java.nio.CharBuffer; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.NoSuchElementException; | ||
|
||
import com.worksap.nlp.sudachi.dictionary.LexiconSet; | ||
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector; | ||
|
||
/** | ||
* Provides lazy sentence split and analysis. | ||
*/ | ||
/* internal */ class SentenceSplittingLazyAnalysis | ||
implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> { | ||
private final SentenceDetector detector = new SentenceDetector(); | ||
|
||
private final Tokenizer.SplitMode mode; | ||
private final JapaneseTokenizer tokenizer; | ||
private final Readable readable; | ||
|
||
SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) { | ||
this.mode = mode; | ||
this.tokenizer = tokenizer; | ||
this.readable = new IOTools.SurrogateAwareReadable(readable); | ||
|
||
this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT); | ||
this.buffer.flip(); | ||
this.input = tokenizer.buildInputText(""); | ||
} | ||
|
||
// input buffer | ||
private final CharBuffer buffer; | ||
// preprocessed InputText of the buffer. | ||
// used to normalize text for the sentence detection. | ||
private UTF8InputText input; | ||
// begining-of-sentence index of next sentence in the input | ||
private int bos = 0; | ||
// normalized text left. corresponds to `input.getSubstring(bos, | ||
// input.getText().length())` | ||
private String normalized = ""; | ||
|
||
/** Return bos position in the buffer. */ | ||
private int bosPosition() { | ||
return input.textIndexToOriginalTextIndex(bos); | ||
} | ||
|
||
/** | ||
* Reset the buffer discarding processed text, then read from the input. | ||
* | ||
* @return the number of chars added to the buffer. -1 if input reabable is at | ||
* its end. | ||
*/ | ||
private int reloadBuffer() throws IOException { | ||
buffer.position(bosPosition()); | ||
buffer.compact(); | ||
int nread = readable.read(buffer); | ||
buffer.flip(); | ||
|
||
// align with new buffer state | ||
input = tokenizer.buildInputText(buffer); | ||
bos = 0; | ||
normalized = input.getText(); | ||
|
||
return nread; | ||
} | ||
|
||
@Override | ||
public boolean hasNext() { | ||
if (!normalized.isEmpty()) { | ||
return true; | ||
} | ||
|
||
int nread; | ||
try { | ||
nread = reloadBuffer(); | ||
} catch (IOException e) { | ||
throw new UncheckedIOException(e.getMessage(), e); | ||
} | ||
|
||
return !(nread < 0 && !buffer.hasRemaining()); | ||
} | ||
|
||
@Override | ||
public MorphemeList next() { | ||
int length = detector.getEos(normalized, this); | ||
if (length > 0) { // sentence found | ||
int eos = bos + length; | ||
if (eos < normalized.length()) { | ||
eos = input.getNextInOriginal(eos - 1); | ||
length = eos - bos; | ||
} | ||
UTF8InputText sentence = input.slice(bos, eos); | ||
bos = eos; | ||
normalized = normalized.substring(length); | ||
return tokenizer.tokenizeSentence(mode, sentence); | ||
} | ||
|
||
// buffer is just after reload but no (safe) eos found. need to clean it up. | ||
// tokenize all text in the buffer. | ||
if (bos == 0 && length < 0) { | ||
bos = normalized.length(); | ||
normalized = ""; | ||
return tokenizer.tokenizeSentence(mode, input); | ||
} | ||
|
||
int nread; | ||
try { | ||
nread = reloadBuffer(); | ||
} catch (IOException e) { | ||
throw new UncheckedIOException(e.getMessage(), e); | ||
} | ||
|
||
if (nread < 0 && !buffer.hasRemaining()) { | ||
throw new NoSuchElementException("no texts left to analyze"); | ||
} | ||
|
||
// recursive call with reloaded buffer. | ||
return next(); | ||
} | ||
|
||
@Override | ||
public boolean hasNonBreakWord(int length) { | ||
UTF8InputText inp = input; | ||
int byteEOS = inp.getCodePointsOffsetLength(0, bos + length); | ||
byte[] bytes = inp.getByteText(); | ||
LexiconSet lexicon = tokenizer.lexicon; | ||
for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) { | ||
Iterator<int[]> iterator = lexicon.lookup(bytes, i); | ||
while (iterator.hasNext()) { | ||
int[] r = iterator.next(); | ||
int l = r[1]; | ||
if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) { | ||
return true; | ||
} | ||
} | ||
} | ||
return false; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.