Skip to content

Commit

Permalink
Merge pull request #231 from WorksApplications/feature/lazy-tokenize-…
Browse files Browse the repository at this point in the history
…sentences

Lazy sentence split and tokenization
  • Loading branch information
mh-northlander authored Jun 26, 2024
2 parents bbf8ce7 + d8e4d16 commit 26e731b
Show file tree
Hide file tree
Showing 6 changed files with 432 additions and 19 deletions.
61 changes: 52 additions & 9 deletions src/main/java/com/worksap/nlp/sudachi/IOTools.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,7 +17,6 @@
package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;

public class IOTools {
Expand All @@ -26,22 +25,21 @@ private IOTools() {
}

/**
* Read as much as possible from reader to the result buffer. Some readers
* perform filtering on input by reducing the number of read characters in each
* batch.
* Read as much as possible from the readable to the result buffer. Use this to
* make sure that the buffer is fulfilled or no text left unread.
*
* @param reader
* input reader
* @param readable
* input readable
* @param result
* buffer to read into
* @return number of read characters
* @throws IOException
* when read operation fails
*/
public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException {
int totalRead = 0;
while (result.hasRemaining()) {
int read = reader.read(result);
int read = readable.read(result);
if (read < 0) {
if (totalRead == 0) {
return -1;
Expand All @@ -53,4 +51,49 @@ public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOExc
}
return totalRead;
}

/**
 * Wrapper for a {@link Readable} that reads via {@link #readAsMuchAsCan} and
 * guarantees that the last character placed in the caller's buffer is never a
 * high surrogate, unless it is the very last character of the underlying
 * input. A trailing high surrogate is held back and prepended to the next
 * {@code read} call, so surrogate pairs are never split across buffer
 * boundaries.
 */
public static class SurrogateAwareReadable implements Readable {
    private final Readable readable;
    // high surrogate carried over from the previous read; 0 means none pending
    char lastTrailingHighSurrogate;

    SurrogateAwareReadable(Readable input) {
        this.readable = input;
    }

    /**
     * Reads characters into {@code cb}, keeping surrogate pairs intact.
     *
     * @param cb
     *            buffer to read into
     * @return number of characters added to {@code cb}, or -1 if the input is
     *         at its end
     * @throws IOException
     *             when the underlying read operation fails
     */
    @Override
    public int read(CharBuffer cb) throws IOException {
        boolean trailingKept = false;
        if (lastTrailingHighSurrogate != 0) {
            // prepend the surrogate held back by the previous call
            cb.append(lastTrailingHighSurrogate);
            lastTrailingHighSurrogate = 0;
            trailingKept = true;
        }

        int nread = IOTools.readAsMuchAsCan(readable, cb);
        if (nread < 0) {
            if (!trailingKept) {
                return -1;
            }
            // the last char in the readable is a high surrogate and there is nothing we can
            // do; return it as-is.
            return 1;
        }
        if (trailingKept) {
            nread += 1;
        }

        if (cb.position() == 0) {
            // nothing was written (e.g. a zero-capacity buffer); guard against
            // reading at index -1 below.
            return nread;
        }
        char lastChar = cb.get(cb.position() - 1);
        if (Character.isHighSurrogate(lastChar)) {
            // hold back a trailing high surrogate so the pair is not split
            lastTrailingHighSurrogate = lastChar;
            cb.position(cb.position() - 1);
            nread -= 1;
        }
        return nread;
    }
}
}
11 changes: 9 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,6 +23,7 @@
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import javax.json.Json;
Expand Down Expand Up @@ -98,10 +99,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);

while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
while (wrappedReader.read(buffer) > 0) {
buffer.flip();
int length = analysis.tokenizeBuffer(buffer);
if (length < 0) {
Expand All @@ -119,6 +121,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
return sentences;
}

@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
    // Sentences are detected and tokenized on demand as the iterator is
    // consumed, so the whole input never has to be held in memory at once.
    return new SentenceSplittingLazyAnalysis(mode, this, readable);
}

@Override
public void setDumpOutput(PrintStream output) {
dumpOutput = output;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.CharBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import com.worksap.nlp.sudachi.dictionary.LexiconSet;
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;

/**
 * Provides lazy sentence split and analysis.
 *
 * <p>
 * Implements {@link Iterator}; each call to {@link #next()} reads only as much
 * text from the given {@link Readable} as is needed to detect and tokenize the
 * next sentence. Also implements {@code SentenceDetector.NonBreakCheker} so
 * the sentence detector can ask whether a candidate boundary falls inside a
 * dictionary word.
 *
 * <p>
 * Not thread-safe; all state (buffer, indices) is mutated by both
 * {@link #hasNext()} and {@link #next()}.
 */
/* internal */ class SentenceSplittingLazyAnalysis
        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
    private final SentenceDetector detector = new SentenceDetector();

    private final Tokenizer.SplitMode mode;
    private final JapaneseTokenizer tokenizer;
    // wrapped so that surrogate pairs are never split at buffer boundaries
    private final Readable readable;

    SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
        this.mode = mode;
        this.tokenizer = tokenizer;
        this.readable = new IOTools.SurrogateAwareReadable(readable);

        this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
        // flip the freshly-allocated buffer so it starts out empty for reads
        this.buffer.flip();
        this.input = tokenizer.buildInputText("");
    }

    // input buffer
    private final CharBuffer buffer;
    // preprocessed InputText of the buffer.
    // used to normalize text for the sentence detection.
    private UTF8InputText input;
    // beginning-of-sentence index of the next sentence, in normalized-text
    // coordinates of `input`
    private int bos = 0;
    // normalized text left. corresponds to `input.getSubstring(bos,
    // input.getText().length())`
    private String normalized = "";

    /** Return bos position in the buffer (original-text coordinates). */
    private int bosPosition() {
        return input.textIndexToOriginalTextIndex(bos);
    }

    /**
     * Reset the buffer discarding processed text, then read from the input.
     * Rebuilds {@link #input} and resets {@link #bos}/{@link #normalized} to
     * match the new buffer contents.
     *
     * @return the number of chars added to the buffer. -1 if the input readable
     *         is at its end.
     */
    private int reloadBuffer() throws IOException {
        buffer.position(bosPosition());
        buffer.compact();
        int nread = readable.read(buffer);
        buffer.flip();

        // align with new buffer state
        input = tokenizer.buildInputText(buffer);
        bos = 0;
        normalized = input.getText();

        return nread;
    }

    // NOTE: may perform I/O (buffer reload) when the current buffer is
    // exhausted; an I/O failure surfaces as UncheckedIOException.
    @Override
    public boolean hasNext() {
        if (!normalized.isEmpty()) {
            return true;
        }

        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        // there is more to analyze unless the readable is exhausted AND the
        // buffer holds no leftover text
        return !(nread < 0 && !buffer.hasRemaining());
    }

    @Override
    public MorphemeList next() {
        // getEos returns the sentence length; non-positive means no safe
        // end-of-sentence was found in the text seen so far
        int length = detector.getEos(normalized, this);
        if (length > 0) { // sentence found
            int eos = bos + length;
            if (eos < normalized.length()) {
                // extend eos to a code-point boundary in the original text
                // (avoids slicing inside a normalized/original mapping unit)
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            UTF8InputText sentence = input.slice(bos, eos);
            bos = eos;
            normalized = normalized.substring(length);
            return tokenizer.tokenizeSentence(mode, sentence);
        }

        // buffer is just after reload but no (safe) eos found. need to clean it up.
        // tokenize all text in the buffer.
        if (bos == 0 && length < 0) {
            bos = normalized.length();
            normalized = "";
            return tokenizer.tokenizeSentence(mode, input);
        }

        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        if (nread < 0 && !buffer.hasRemaining()) {
            throw new NoSuchElementException("no texts left to analyze");
        }

        // recursive call with reloaded buffer.
        return next();
    }

    // Called back by SentenceDetector: returns true when a dictionary word
    // crosses the candidate sentence boundary at `length`, so the boundary
    // should not be treated as an end of sentence.
    @Override
    public boolean hasNonBreakWord(int length) {
        UTF8InputText inp = input;
        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
        byte[] bytes = inp.getByteText();
        LexiconSet lexicon = tokenizer.lexicon;
        // look back up to 64 bytes before the candidate boundary
        // (presumably an upper bound on word length in bytes — TODO confirm)
        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int l = r[1]; // end byte offset of the matched lexicon entry
                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
                    return true;
                }
            }
        }
        return false;
    }
}
43 changes: 38 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,8 @@
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;

/**
* A tokenizer of morphological analysis.
Expand Down Expand Up @@ -52,7 +54,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. This method divide an input text into sentences and
* tokenizes them.
* tokenizes them. When the text is long, it uses a lot of memory.
*
* @param mode
* a mode of splitting
Expand All @@ -64,7 +66,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. Divide an input text into sentences and tokenize them
* with {@link SplitMode}.C.
* with {@link SplitMode}.C. When the text is long, it uses a lot of memory.
*
* @param text
* input text
Expand All @@ -77,7 +79,8 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them.
* them. It reads all text in the input and uses a lot of memory when the text
* is long.
*
* @param mode
* a mode of splitting
Expand All @@ -86,24 +89,54 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
* @return a result of tokenizing
* @throws IOException
* if reading a stream is failed
* @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
*/
@Deprecated
Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;

/**
 * Reads an input text from {@code input}, divides it into sentences and
 * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
 * uses a lot of memory when the text is long.
 *
 * @param input
 *            a reader of input text
 * @return a result of tokenizing
 * @throws IOException
 *             if reading the stream fails
 * @see #tokenizeSentences(SplitMode,Reader)
 * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
 */
@Deprecated
default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
    return tokenizeSentences(SplitMode.C, input);
}

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them. It reads the input lazily.
*
* @param mode
* a mode of splitting
* @param input
* a readable input text
* @return a result of tokenizing
*/
Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);

/**
 * Reads an input text from {@code input}, divides it into sentences and
 * tokenizes them with {@link SplitMode}.C. The input is read lazily, sentence
 * by sentence, so long inputs do not need to fit in memory at once.
 *
 * @param input
 *            a readable input text
 * @return an iterator over the morpheme list of each sentence
 * @see #lazyTokenizeSentences(SplitMode,Readable)
 */
default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
    return lazyTokenizeSentences(SplitMode.C, input);
}

/**
* Prints lattice structure of the analysis into the passed {@link PrintStream}.
*
Expand Down
Loading

0 comments on commit 26e731b

Please sign in to comment.