Add offset correction for split filter #149

Merged · 13 commits · Nov 11, 2024
9 changes: 6 additions & 3 deletions .github/workflows/build.yml
@@ -46,6 +46,8 @@ jobs:
           - 'os:2.6.0'
     env:
       mainJob: ${{ matrix.es-version == 'es:8.15.2' }}
+      sudachiVersion: 20241021
+      sudachiKind: core
     continue-on-error: true
 
     steps:
@@ -93,15 +95,16 @@
       - name: Cache dictionary download
         uses: actions/cache@v4
         with:
-          path: build/integration/sudachi-dictionary-20230110-small.zip
-          key: sudachi-dictionary-20230110
+          path: build/integration/sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}.zip
+          key: sudachi-dictionary-${{ env.sudachiVersion }}-${{ env.sudachiKind }}
       - name: Integration test
         env:
           ES_KIND: ${{ env.ENGINE_KIND }}
           ES_VERSION: ${{ env.ENGINE_VERSION }}
           PLUGIN_VERSION: ${{ env.PROJ_VERSION }}
           RUN_ES_DAEMON: 1
-          DIC_VERSION: 20230110
+          DIC_VERSION: ${{ env.sudachiVersion }}
+          DIC_KIND: ${{ env.sudachiKind }}
         run: |
           bash test-scripts/00-install-elasticsearch.sh
           sleep 30
src/main/java/com/worksap/nlp/lucene/sudachi/ja/attributes/MorphemeAttribute.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 package com.worksap.nlp.lucene.sudachi.ja.attributes;
 
+import java.util.List;
+
 import com.worksap.nlp.sudachi.Morpheme;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Attribute;
@@ -36,4 +38,17 @@ public interface MorphemeAttribute extends Attribute {
      * new object
      */
     void setMorpheme(Morpheme morpheme);
+
+    /**
+     * @return The offset mapping for the current morpheme
+     */
+    List<Integer> getOffsets();
+
+    /**
+     * Set the offset mapping for the morpheme
+     *
+     * @param offsets
+     *            actual offset for each offset in the morpheme
+     */
+    void setOffsets(List<Integer> offsets);
 }
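
Note on the new attribute contract: the offset map carries, for each character
position inside a morpheme, the corresponding offset in the original input,
plus one trailing entry for the exclusive end offset (the tokenizer builds it
from the inclusive range m.begin()..m.end()). A minimal sketch of reading such
a map, with assumed values that are illustrative rather than taken from the PR:

    import java.util.Arrays;
    import java.util.List;

    class OffsetMapContract {
        public static void main(String[] args) {
            // Surface of length 4; suppose normalization dropped one input
            // character, so original offsets skip 12: map has length + 1 entries.
            List<Integer> offsets = Arrays.asList(10, 11, 13, 14, 15);
            int start = offsets.get(0);                // corrected start: 10
            int end = offsets.get(offsets.size() - 1); // corrected end (exclusive): 15
            System.out.println("[" + start + ", " + end + ")");
        }
    }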
213 changes: 138 additions & 75 deletions src/main/java/com/worksap/nlp/lucene/sudachi/ja/SudachiSplitFilter.java
@@ -23,7 +23,6 @@
 import com.worksap.nlp.lucene.sudachi.ja.attributes.*;
 import com.worksap.nlp.lucene.sudachi.ja.util.Strings;
 import com.worksap.nlp.sudachi.Morpheme;
-
 import com.worksap.nlp.sudachi.Tokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -40,56 +39,18 @@ public enum Mode {
 
     public static final Mode DEFAULT_MODE = Mode.SEARCH;
 
-    static class OovChars {
-        private int length;
-        private char[] buffer = new char[0];
-        private int reserved;
-        private int index;
-        private int baseOffset;
-
-        public void setOov(int offset, char[] src, int length) {
-            baseOffset = offset;
-            this.length = length;
-            if (reserved < length) {
-                buffer = new char[length];
-                reserved = length;
-            }
-            System.arraycopy(src, 0, buffer, 0, length);
-            index = 0;
-        }
-
-        public boolean hasNext() {
-            return index < length;
-        }
-
-        public char next() {
-            if (index < length) {
-                return buffer[index++];
-            } else {
-                throw new IllegalStateException();
-            }
-        }
-
-        public int index() {
-            return index;
-        }
-
-        public int offset() {
-            return baseOffset + index;
-        }
-    }
-
     private final Mode mode;
     private final Tokenizer.SplitMode splitMode;
 
     private final CharTermAttribute termAtt;
     private final OffsetAttribute offsetAtt;
     private final PositionIncrementAttribute posIncAtt;
     private final PositionLengthAttribute posLengthAtt;
     private final MorphemeAttribute morphemeAtt;
-    private ListIterator<Morpheme> aUnitIterator;
-    private final OovChars oovChars = new OovChars();
 
-    private int aUnitOffset = 0;
+    private final MorphemeSubunits subunits = new MorphemeSubunits();
+    private final OovChars oovChars = new OovChars();
+    private List<Integer> offsetMap;
 
     public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
         super(input);
@@ -105,72 +66,174 @@ public SudachiSplitFilter(TokenStream input, Mode mode, Tokenizer.SplitMode splitMode) {
 
     @Override
     public final boolean incrementToken() throws IOException {
+        // continue to write current split
         if (oovChars.hasNext()) {
             clearAttributes();
             setOOVAttribute();
             return true;
         }
-        if (aUnitIterator != null && aUnitIterator.hasNext()) {
+        if (subunits.hasNext()) {
             clearAttributes();
-            setAUnitAttribute(aUnitIterator.next());
+            setAUnitAttribute();
             return true;
         }
 
+        // move to next morpheme
+        if (!input.incrementToken()) {
+            return false;
+        }
+
+        Morpheme m = morphemeAtt.getMorpheme();
+        this.offsetMap = morphemeAtt.getOffsets();
+        if (m == null) {
+            return true;
+        }
+
-        if (input.incrementToken()) {
+        // oov does not have splits
+        // split into characters in extended mode
+        if (m.isOOV()) {
             int length = 0;
-            Morpheme m = morphemeAtt.getMorpheme();
-            if (m == null) {
-                return true;
-            }
             termAtt.setEmpty().append(m.surface());
-            if (mode == Mode.EXTENDED && m.isOOV() && (length = Strings.codepointCount(termAtt)) > 1) {
-                oovChars.setOov(offsetAtt.startOffset(), termAtt.buffer(), termAtt.length());
+            if (mode == Mode.EXTENDED && (length = Strings.codepointCount(termAtt)) > 1) {
+                // OovChars requires character length
+                oovChars.setOov(termAtt.buffer(), termAtt.length());
+                // Position length should be codepoint length
                 posLengthAtt.setPositionLength(length);
-            } else if (splitMode != Tokenizer.SplitMode.C) {
-                List<Morpheme> subUnits = m.split(splitMode);
-                if (subUnits.size() > 1) {
-                    aUnitIterator = subUnits.listIterator();
-                    aUnitOffset = offsetAtt.startOffset();
-                    posLengthAtt.setPositionLength(subUnits.size());
-                } else {
-                    posLengthAtt.setPositionLength(1);
-                }
             }
             return true;
-        } else {
-            return false;
         }
+
+        // C split is the longest split
+        if (splitMode == Tokenizer.SplitMode.C) {
+            return true;
+        }
+
+        // split into A/B units
+        List<Morpheme> subsplits = m.split(splitMode);
+        if (subsplits.size() > 1) {
+            subunits.setUnits(subsplits);
+            posLengthAtt.setPositionLength(subunits.size());
+        }
+
+        return true;
     }
 
+    private int correctOffset(int currectOff) {
+        // assert (0 <= currectOff && currectOff <= this.offsetMap.size());
+        return this.offsetMap.get(currectOff);
+    }

-    private void setAUnitAttribute(Morpheme morpheme) {
+    private void setAUnitAttribute() {
         posLengthAtt.setPositionLength(1);
-        if (aUnitIterator.previousIndex() == 0) {
+        if (subunits.index() == 0) {
             posIncAtt.setPositionIncrement(0);
         } else {
             posIncAtt.setPositionIncrement(1);
         }
-        int length = morpheme.end() - morpheme.begin();
-        offsetAtt.setOffset(aUnitOffset, aUnitOffset + length);
-        aUnitOffset += length;
-        morphemeAtt.setMorpheme(morpheme);
-        termAtt.setEmpty().append(morpheme.surface());
+
+        MorphemeSubunits.Subunit su = subunits.next();
+        termAtt.setEmpty().append(su.morpheme.surface());
+        morphemeAtt.setMorpheme(su.morpheme);
+        morphemeAtt.setOffsets(offsetMap.subList(su.begin, su.end + 1));
+        offsetAtt.setOffset(correctOffset(su.begin), correctOffset(su.end));
     }

     private void setOOVAttribute() {
-        int offset = oovChars.offset();
         posLengthAtt.setPositionLength(1);
         if (oovChars.index() == 0) {
             posIncAtt.setPositionIncrement(0);
         } else {
             posIncAtt.setPositionIncrement(1);
         }
+
+        int startOffset = oovChars.offset();
         char c = oovChars.next();
         termAtt.setEmpty().append(c);
         if (Character.isSurrogate(c) && oovChars.hasNext()) {
             termAtt.append(oovChars.next());
-            offsetAtt.setOffset(offset, offset + 2);
-        } else {
-            offsetAtt.setOffset(offset, offset + 1);
         }
+        int endOffset = oovChars.offset();
+        offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset));
     }

+    static class OovChars {
+        private int reserved;
+        private char[] buffer = new char[0];
+        private int length;
+        private int index;
+
+        public void setOov(char[] src, int length) {
+            this.length = length;
+            if (reserved < length) {
+                buffer = new char[length];
+                reserved = length;
+            }
+            System.arraycopy(src, 0, buffer, 0, length);
+            index = 0;
+        }
+
+        public boolean hasNext() {
+            return index < length;
+        }
+
+        public char next() {
+            if (index < length) {
+                return buffer[index++];
+            }
+            throw new IllegalStateException();
+        }
+
+        public int index() {
+            return index;
+        }
+
+        public int offset() {
+            return index;
+        }
+    }
+
+    static class MorphemeSubunits {
+        static class Subunit {
+            final Morpheme morpheme;
+            final int begin;
+            final int end;
+
+            public Subunit(Morpheme morpheme, int begin, int end) {
+                this.morpheme = morpheme;
+                this.begin = begin;
+                this.end = end;
+            }
+        }
+
+        private List<Morpheme> morphemes;
+        private int size;
+        private int index;
+        private int baseOffset;
+
+        public void setUnits(List<Morpheme> morphemes) {
+            this.morphemes = morphemes;
+            size = morphemes.size();
+            index = 0;
+            baseOffset = morphemes.get(0).begin();
+        }
+
+        public boolean hasNext() {
+            return index < size;
+        }
+
+        public Subunit next() {
+            if (!hasNext()) {
+                throw new IllegalStateException();
+            }
+            Morpheme m = morphemes.get(index++);
+            return new Subunit(m, m.begin() - baseOffset, m.end() - baseOffset);
+        }
+
+        public int size() {
+            return size;
+        }
+
+        public int index() {
+            return index;
+        }
+    }
 }
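
The subunit boundaries above are made relative to the parent morpheme
(m.begin() - baseOffset), so they index directly into the parent's offsetMap,
and correctOffset() translates them back into positions in the original input.
A short worked example of that arithmetic, with assumed values (illustrative,
not taken from the PR):

    import java.util.Arrays;
    import java.util.List;

    class SubunitOffsetSketch {
        public static void main(String[] args) {
            // Parent C-unit: 4 characters starting at original offset 20;
            // offsetMap holds length + 1 entries (assumed contiguous here).
            List<Integer> offsetMap = Arrays.asList(20, 21, 22, 23, 24);
            // Suppose it splits into two A-units covering relative ranges
            // [0, 2) and [2, 4), as MorphemeSubunits.next() would compute.
            int[][] subunits = { { 0, 2 }, { 2, 4 } };
            for (int[] su : subunits) {
                // Mirrors setAUnitAttribute(): relative index -> original offset.
                System.out.println("[" + offsetMap.get(su[0]) + ", " + offsetMap.get(su[1]) + ")");
            }
            // Prints [20, 22) and [22, 24): each subtoken gets corrected offsets.
        }
    }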
SudachiTokenizer.kt
@@ -55,11 +55,12 @@ class SudachiTokenizer(
     override fun incrementToken(): Boolean {
         clearAttributes()
         var m = iterator.next() ?: return false
-        val baseOffset = iterator.baseOffset
 
         morphemeAtt.setMorpheme(m)
-        posLenAtt.positionLength = 1
-        posIncAtt.positionIncrement = 1
+        val baseOffset = iterator.baseOffset
+        morphemeAtt.setOffsets((m.begin()..m.end()).map { i -> correctOffset(baseOffset + i) })
+        posLenAtt.setPositionLength(1)
+        posIncAtt.setPositionIncrement(1)
         offsetAtt.setOffset(correctOffset(baseOffset + m.begin()), correctOffset(baseOffset + m.end()))
         termAtt.setEmpty().append(m.surface())
         return true
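
Downstream consumers see the corrected offsets through Lucene's standard
OffsetAttribute; nothing else changes in the TokenStream contract. A hedged
sketch of such a consumer (the analyzer wiring that produces the stream is
assumed, not shown in this PR):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    class PrintCorrectedOffsets {
        static void print(TokenStream stream) throws IOException {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // With this change, start/end point into the original input
                // even for A/B-unit subtokens and OOV character splits.
                System.out.println(term + " [" + offset.startOffset() + ", " + offset.endOffset() + ")");
            }
            stream.end();
            stream.close();
        }
    }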