Skip to content

Commit

Permalink
Merge pull request #231 from WorksApplications/feature/lazy-tokenize-…
Browse files Browse the repository at this point in the history
…sentences

Lazy sentence split and tokenization
  • Loading branch information
mh-northlander authored Jun 26, 2024
2 parents bbf8ce7 + d8e4d16 commit 26e731b
Show file tree
Hide file tree
Showing 6 changed files with 432 additions and 19 deletions.
61 changes: 52 additions & 9 deletions src/main/java/com/worksap/nlp/sudachi/IOTools.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,7 +17,6 @@
package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;

public class IOTools {
Expand All @@ -26,22 +25,21 @@ private IOTools() {
}

/**
* Read as much as possible from reader to the result buffer. Some readers
* perform filtering on input by reducing the number of read characters in each
* batch.
* Read as much as possible from the readable to the result buffer. Use this to
* make sure that the buffer is fulfilled or no text left unread.
*
* @param reader
* input reader
* @param readable
* input readable
* @param result
* buffer to read into
* @return number of read characters
* @throws IOException
* when read operation fails
*/
public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException {
int totalRead = 0;
while (result.hasRemaining()) {
int read = reader.read(result);
int read = readable.read(result);
if (read < 0) {
if (totalRead == 0) {
return -1;
Expand All @@ -53,4 +51,49 @@ public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOExc
}
return totalRead;
}

/**
 * Wrapper for a {@link Readable} that reads via {@link #readAsMuchAsCan} and
 * guarantees that the last character placed in the caller's buffer is never a
 * high surrogate, unless it is the very last character of the underlying
 * input. A trailing high surrogate is held back and prepended to the next
 * {@code read} call, so surrogate pairs are never split across buffer
 * boundaries.
 */
public static class SurrogateAwareReadable implements Readable {
    private final Readable readable;
    // high surrogate carried over from the previous read; 0 means none pending
    char lastTrailingHighSurrogate;

    SurrogateAwareReadable(Readable input) {
        this.readable = input;
    }

    /**
     * Reads characters into {@code cb}, keeping surrogate pairs intact.
     *
     * @param cb
     *            buffer to read into
     * @return number of characters added to {@code cb}, or -1 if the input is
     *         at its end
     * @throws IOException
     *             when the underlying read operation fails
     */
    @Override
    public int read(CharBuffer cb) throws IOException {
        boolean trailingKept = false;
        if (lastTrailingHighSurrogate != 0) {
            // prepend the surrogate held back by the previous call
            cb.append(lastTrailingHighSurrogate);
            lastTrailingHighSurrogate = 0;
            trailingKept = true;
        }

        int nread = IOTools.readAsMuchAsCan(readable, cb);
        if (nread < 0) {
            if (!trailingKept) {
                return -1;
            }
            // the last char in the readable is a high surrogate and there is nothing we can
            // do; return it as-is.
            return 1;
        }
        if (trailingKept) {
            nread += 1;
        }

        if (cb.position() == 0) {
            // nothing was written (e.g. a zero-capacity buffer); guard against
            // reading at index -1 below.
            return nread;
        }
        char lastChar = cb.get(cb.position() - 1);
        if (Character.isHighSurrogate(lastChar)) {
            // hold back a trailing high surrogate so the pair is not split
            lastTrailingHighSurrogate = lastChar;
            cb.position(cb.position() - 1);
            nread -= 1;
        }
        return nread;
    }
}
}
11 changes: 9 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,6 +23,7 @@
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import javax.json.Json;
Expand Down Expand Up @@ -98,10 +99,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);

while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
while (wrappedReader.read(buffer) > 0) {
buffer.flip();
int length = analysis.tokenizeBuffer(buffer);
if (length < 0) {
Expand All @@ -119,6 +121,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
return sentences;
}

@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
    // Sentences are detected and tokenized on demand as the iterator is
    // consumed, so the whole input never has to be held in memory at once.
    return new SentenceSplittingLazyAnalysis(mode, this, readable);
}

@Override
public void setDumpOutput(PrintStream output) {
dumpOutput = output;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.CharBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import com.worksap.nlp.sudachi.dictionary.LexiconSet;
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;

/**
 * Provides lazy sentence split and analysis.
 *
 * <p>
 * Implements {@link Iterator}; each call to {@link #next()} reads only as much
 * text from the given {@link Readable} as is needed to detect and tokenize the
 * next sentence. Also implements {@code SentenceDetector.NonBreakCheker} so
 * the sentence detector can ask whether a candidate boundary falls inside a
 * dictionary word.
 *
 * <p>
 * Not thread-safe; all state (buffer, indices) is mutated by both
 * {@link #hasNext()} and {@link #next()}.
 */
/* internal */ class SentenceSplittingLazyAnalysis
        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
    private final SentenceDetector detector = new SentenceDetector();

    private final Tokenizer.SplitMode mode;
    private final JapaneseTokenizer tokenizer;
    // wrapped so that surrogate pairs are never split at buffer boundaries
    private final Readable readable;

    SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
        this.mode = mode;
        this.tokenizer = tokenizer;
        this.readable = new IOTools.SurrogateAwareReadable(readable);

        this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
        // flip the freshly-allocated buffer so it starts out empty for reads
        this.buffer.flip();
        this.input = tokenizer.buildInputText("");
    }

    // input buffer
    private final CharBuffer buffer;
    // preprocessed InputText of the buffer.
    // used to normalize text for the sentence detection.
    private UTF8InputText input;
    // beginning-of-sentence index of the next sentence, in normalized-text
    // coordinates of `input`
    private int bos = 0;
    // normalized text left. corresponds to `input.getSubstring(bos,
    // input.getText().length())`
    private String normalized = "";

    /** Return bos position in the buffer (original-text coordinates). */
    private int bosPosition() {
        return input.textIndexToOriginalTextIndex(bos);
    }

    /**
     * Reset the buffer discarding processed text, then read from the input.
     * Rebuilds {@link #input} and resets {@link #bos}/{@link #normalized} to
     * match the new buffer contents.
     *
     * @return the number of chars added to the buffer. -1 if the input readable
     *         is at its end.
     */
    private int reloadBuffer() throws IOException {
        buffer.position(bosPosition());
        buffer.compact();
        int nread = readable.read(buffer);
        buffer.flip();

        // align with new buffer state
        input = tokenizer.buildInputText(buffer);
        bos = 0;
        normalized = input.getText();

        return nread;
    }

    // NOTE: may perform I/O (buffer reload) when the current buffer is
    // exhausted; an I/O failure surfaces as UncheckedIOException.
    @Override
    public boolean hasNext() {
        if (!normalized.isEmpty()) {
            return true;
        }

        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        // there is more to analyze unless the readable is exhausted AND the
        // buffer holds no leftover text
        return !(nread < 0 && !buffer.hasRemaining());
    }

    @Override
    public MorphemeList next() {
        // getEos returns the sentence length; non-positive means no safe
        // end-of-sentence was found in the text seen so far
        int length = detector.getEos(normalized, this);
        if (length > 0) { // sentence found
            int eos = bos + length;
            if (eos < normalized.length()) {
                // extend eos to a code-point boundary in the original text
                // (avoids slicing inside a normalized/original mapping unit)
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            UTF8InputText sentence = input.slice(bos, eos);
            bos = eos;
            normalized = normalized.substring(length);
            return tokenizer.tokenizeSentence(mode, sentence);
        }

        // buffer is just after reload but no (safe) eos found. need to clean it up.
        // tokenize all text in the buffer.
        if (bos == 0 && length < 0) {
            bos = normalized.length();
            normalized = "";
            return tokenizer.tokenizeSentence(mode, input);
        }

        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        if (nread < 0 && !buffer.hasRemaining()) {
            throw new NoSuchElementException("no texts left to analyze");
        }

        // recursive call with reloaded buffer.
        return next();
    }

    // Called back by SentenceDetector: returns true when a dictionary word
    // crosses the candidate sentence boundary at `length`, so the boundary
    // should not be treated as an end of sentence.
    @Override
    public boolean hasNonBreakWord(int length) {
        UTF8InputText inp = input;
        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
        byte[] bytes = inp.getByteText();
        LexiconSet lexicon = tokenizer.lexicon;
        // look back up to 64 bytes before the candidate boundary
        // (presumably an upper bound on word length in bytes — TODO confirm)
        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int l = r[1]; // end byte offset of the matched lexicon entry
                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
                    return true;
                }
            }
        }
        return false;
    }
}
43 changes: 38 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,8 @@
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;

/**
* A tokenizer of morphological analysis.
Expand Down Expand Up @@ -52,7 +54,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. This method divide an input text into sentences and
* tokenizes them.
* tokenizes them. When the text is long, it uses a lot of memory.
*
* @param mode
* a mode of splitting
Expand All @@ -64,7 +66,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. Divide an input text into sentences and tokenize them
* with {@link SplitMode}.C.
* with {@link SplitMode}.C. When the text is long, it uses a lot of memory.
*
* @param text
* input text
Expand All @@ -77,7 +79,8 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them.
* them. It reads all text in the input and uses a lot of memory when the text
* is long.
*
* @param mode
* a mode of splitting
Expand All @@ -86,24 +89,54 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
* @return a result of tokenizing
* @throws IOException
* if reading a stream is failed
* @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
*/
@Deprecated
Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;

/**
 * Reads an input text from {@code input}, divides it into sentences and
 * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
 * uses a lot of memory when the text is long.
 *
 * @param input
 *            a reader of input text
 * @return a result of tokenizing
 * @throws IOException
 *             if reading the stream fails
 * @see #tokenizeSentences(SplitMode,Reader)
 * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
 */
@Deprecated
default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
    return tokenizeSentences(SplitMode.C, input);
}

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them. It reads the input lazily.
*
* @param mode
* a mode of splitting
* @param input
* a readable input text
* @return a result of tokenizing
*/
Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);

/**
 * Reads an input text from {@code input}, divides it into sentences and
 * tokenizes them with {@link SplitMode}.C. The input is read lazily, sentence
 * by sentence, so long inputs do not need to fit in memory at once.
 *
 * @param input
 *            a readable input text
 * @return an iterator over the morpheme list of each sentence
 * @see #lazyTokenizeSentences(SplitMode,Readable)
 */
default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
    return lazyTokenizeSentences(SplitMode.C, input);
}

/**
* Prints lattice structure of the analysis into the passed {@link PrintStream}.
*
Expand Down
Loading

0 comments on commit 26e731b

Please sign in to comment.