Skip to content

Commit

Permalink
Merge pull request #121 from WorksApplications/feature/explain
Browse files Browse the repository at this point in the history
explain morpheme detail
  • Loading branch information
mh-northlander authored May 21, 2024
2 parents 50a38d6 + fede704 commit e4e3012
Show file tree
Hide file tree
Showing 9 changed files with 263 additions and 6 deletions.
2 changes: 2 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import org.jetbrains.kotlin.gradle.dsl.JvmTarget
plugins {
id 'java-library'
id 'org.jetbrains.kotlin.jvm' version '1.8.0'
id "org.jetbrains.kotlin.plugin.serialization" version "1.8.0"
id 'com.diffplug.spotless' version '6.16.0'
id 'org.sonarqube' version '4.0.0.2929'
id("org.jetbrains.kotlinx.kover") version "0.7.0"
Expand Down Expand Up @@ -44,6 +45,7 @@ dependencies {
testImplementation('org.jetbrains.kotlin:kotlin-test-junit') {
exclude(group: 'org.hamcrest')
}
testImplementation('org.jetbrains.kotlinx:kotlinx-serialization-json:1.6.3')
kover(project(':integration'))
kover(project(':testlib'))
}
Expand Down
7 changes: 5 additions & 2 deletions buildSrc/src/main/groovy/com/worksap/nlp/tools/engines.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ enum EsSupport implements EngineSupport {

enum OsSupport implements EngineSupport {
Os20("os-2.00"),
Os210("os-2.10")
Os27("os-2.07"),
Os210("os-2.10"),

String tag

Expand All @@ -59,8 +60,10 @@ enum OsSupport implements EngineSupport {


static OsSupport supportVersion(Version version) {
if (version.ge(2, 0) && version.lt(2, 10)) {
if (version.ge(2, 0) && version.lt(2, 7)) {
return Os20
} else if (version.ge(2, 7) && version.lt(2, 10)) {
return Os27
} else if (version.ge(2, 10)) {
return Os210
}
Expand Down
25 changes: 25 additions & 0 deletions src/main/ext/es-7.15-ge/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Binds the engine-neutral names in `com.worksap.nlp.lucene.aliases` to the
// `org.elasticsearch.xcontent` package, so callers import the alias rather than
// the engine package directly.
// NOTE(review): the directory name (es-7.15-ge) suggests this variant targets
// Elasticsearch 7.15 and later — confirm against the build scripts.
typealias ToXContent = org.elasticsearch.xcontent.ToXContent

// Alias for the nested `ToXContent.Params` type from the same package.
typealias ToXContentParams = org.elasticsearch.xcontent.ToXContent.Params

// Alias for the `XContentBuilder` type from the same package.
typealias XContentBuilder = org.elasticsearch.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/es-7.15-lt/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Binds the engine-neutral names in `com.worksap.nlp.lucene.aliases` to the
// `org.elasticsearch.common.xcontent` package, so callers import the alias
// rather than the engine package directly.
// NOTE(review): the directory name (es-7.15-lt) suggests this variant targets
// Elasticsearch versions before 7.15 — confirm against the build scripts.
typealias ToXContent = org.elasticsearch.common.xcontent.ToXContent

// Alias for the nested `ToXContent.Params` type from the same package.
typealias ToXContentParams = org.elasticsearch.common.xcontent.ToXContent.Params

// Alias for the `XContentBuilder` type from the same package.
typealias XContentBuilder = org.elasticsearch.common.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/os-2.07-ge/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Binds the engine-neutral names in `com.worksap.nlp.lucene.aliases` to the
// `org.opensearch.core.xcontent` package, so callers import the alias rather
// than the engine package directly.
// NOTE(review): the directory name (os-2.07-ge) suggests this variant targets
// OpenSearch 2.7 and later — confirm against the build scripts.
typealias ToXContent = org.opensearch.core.xcontent.ToXContent

// Alias for the nested `ToXContent.Params` type from the same package.
typealias ToXContentParams = org.opensearch.core.xcontent.ToXContent.Params

// Alias for the `XContentBuilder` type from the same package.
typealias XContentBuilder = org.opensearch.core.xcontent.XContentBuilder
25 changes: 25 additions & 0 deletions src/main/ext/os-2.07-lt/xcontent-aliases.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@file:Suppress("PackageDirectoryMismatch")

package com.worksap.nlp.lucene.aliases

// Binds the engine-neutral names in `com.worksap.nlp.lucene.aliases` to the
// `org.opensearch.common.xcontent` package, so callers import the alias rather
// than the engine package directly.
// NOTE(review): the directory name (os-2.07-lt) suggests this variant targets
// OpenSearch versions before 2.7 — confirm against the build scripts.
typealias ToXContent = org.opensearch.common.xcontent.ToXContent

// Alias for the nested `ToXContent.Params` type from the same package.
typealias ToXContentParams = org.opensearch.common.xcontent.ToXContent.Params

// Alias for the `XContentBuilder` type from the same package.
typealias XContentBuilder = org.opensearch.common.xcontent.XContentBuilder
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023 Works Applications Co., Ltd.
* Copyright (c) 2022-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,13 +16,36 @@

package com.worksap.nlp.lucene.sudachi.ja.attributes

import com.worksap.nlp.lucene.aliases.ToXContent
import com.worksap.nlp.lucene.aliases.ToXContentParams
import com.worksap.nlp.lucene.aliases.XContentBuilder
import com.worksap.nlp.lucene.sudachi.ja.reflect
import com.worksap.nlp.sudachi.Morpheme
import org.apache.lucene.util.AttributeImpl
import org.apache.lucene.util.AttributeReflector

class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
private var morpheme: Morpheme? = null
private var morpheme: MorphemeWrapper? = null

/**
 * Holds a [Morpheme] and exposes it as [ToXContent].
 *
 * When serialized, the wrapper writes a single map value with the keys `surface`,
 * `dictionaryForm`, `normalizedForm`, `readingForm` and `partOfSpeech`, each read from the
 * wrapped morpheme.
 */
private class MorphemeWrapper(private val morpheme: Morpheme) : ToXContent {

  /**
   * Writes the morpheme fields into [builder] as one map value.
   *
   * @param builder target builder; the same instance is returned
   * @param params not consulted by this implementation
   * @return [builder], after the map value has been written
   */
  override fun toXContent(builder: XContentBuilder, params: ToXContentParams): XContentBuilder =
      builder.also { out ->
        out.value(
            mapOf(
                "surface" to morpheme.surface(),
                "dictionaryForm" to morpheme.dictionaryForm(),
                "normalizedForm" to morpheme.normalizedForm(),
                "readingForm" to morpheme.readingForm(),
                "partOfSpeech" to morpheme.partOfSpeech(),
            ))
      }

  /** Returns the wrapped [Morpheme] instance unchanged. */
  fun unwrap(): Morpheme = morpheme
}

override fun clear() {
morpheme = null
Expand All @@ -37,10 +60,10 @@ class MorphemeAttributeImpl : AttributeImpl(), MorphemeAttribute {
}

override fun getMorpheme(): Morpheme? {
return morpheme
return morpheme?.let { m -> m.unwrap() }
}

override fun setMorpheme(morpheme: Morpheme?) {
this.morpheme = morpheme
this.morpheme = morpheme?.let { m -> MorphemeWrapper(m) }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.lucene.sudachi.ja.attributes

import com.worksap.nlp.lucene.aliases.ToXContent
import com.worksap.nlp.lucene.aliases.XContentBuilder
import com.worksap.nlp.search.aliases.XContentType
import com.worksap.nlp.sudachi.Config
import com.worksap.nlp.sudachi.DictionaryFactory
import com.worksap.nlp.sudachi.Morpheme
import com.worksap.nlp.test.TestDictionary
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertNotNull
import kotlin.test.assertNull
import kotlin.test.assertTrue
import kotlinx.serialization.Serializable
import kotlinx.serialization.json.Json
import org.junit.Before
import org.junit.Rule

/**
 * Tests for [MorphemeAttributeImpl]: storing/clearing a morpheme and reflecting it as a
 * [ToXContent] value that serializes to the expected JSON fields.
 */
class MorphemeAttributeImplTest {
  // JUnit rule; never reassigned, so declared as `val`.
  @JvmField @Rule val testDic = TestDictionary("system")

  // Assigned in [setup] before each test.
  private lateinit var config: Config

  /**
   * Tokenizes [text] with a dictionary created from [config].
   *
   * @return the first morpheme of the tokenization, or `null` when no morpheme is produced
   */
  fun getFirstMorpheme(text: String): Morpheme? {
    val dict = DictionaryFactory().create(config)
    val tok = dict.create()
    return tok.tokenize(text).firstOrNull()
  }

  /** Loads `config/sudachi/sudachi.json` from the test dictionary root into [config]. */
  @Before
  fun setup() {
    val configDir = testDic.root.toPath().resolve("config/sudachi")
    config = Config.fromFile(configDir.resolve("sudachi.json"))
  }

  /** A fresh attribute holds no morpheme; set/get round-trips, and setting `null` clears it. */
  @Test
  fun setMorpheme() {
    val morphemeAtt = MorphemeAttributeImpl()
    assertNull(morphemeAtt.getMorpheme())

    val morpheme = getFirstMorpheme("東京都")!!
    morphemeAtt.setMorpheme(morpheme)
    assertEquals(morpheme, morphemeAtt.getMorpheme())

    morphemeAtt.setMorpheme(null)
    assertNull(morphemeAtt.getMorpheme())
  }

  /**
   * Reflects the attribute into a JSON builder and checks that the decoded output carries the
   * same surface, dictionary form, normalized form, reading form and part of speech as the
   * source morpheme.
   */
  @Test
  fun toXContent() {
    val morphemeAtt = MorphemeAttributeImpl()
    val morpheme = getFirstMorpheme("東京都")!!
    morphemeAtt.setMorpheme(morpheme)

    val builder = XContentBuilder.builder(XContentType.JSON.xContent())
    builder.startObject()
    morphemeAtt.reflectWith(
        fun(attClass, key, value) {
          assertEquals(MorphemeAttribute::class.java, attClass)
          assertEquals("morpheme", key)
          assertTrue(value is ToXContent)

          builder.field(key, value)
        })
    builder.endObject()
    // Flush so the serialized content is available from the output stream below.
    builder.flush()

    val serialized = builder.getOutputStream().toString()
    val deserialized = Json.decodeFromString<MorphemeHolder>(serialized)

    assertNotNull(deserialized.morpheme)
    assertEquals(morpheme.surface(), deserialized.morpheme.surface)
    assertEquals(morpheme.dictionaryForm(), deserialized.morpheme.dictionaryForm)
    assertEquals(morpheme.normalizedForm(), deserialized.morpheme.normalizedForm)
    assertEquals(morpheme.readingForm(), deserialized.morpheme.readingForm)
    assertEquals(morpheme.partOfSpeech(), deserialized.morpheme.partOfSpeech)
  }
}

/** Decoding target for the test's serialized JSON: an object with a single `morpheme` field. */
@Serializable data class MorphemeHolder(val morpheme: MorphemeAttributeHolder)

/**
 * Decoding target for the serialized morpheme object; one property per JSON key that the test
 * compares against the source morpheme.
 */
@Serializable
data class MorphemeAttributeHolder(
val surface: String,
val dictionaryForm: String,
val normalizedForm: String,
val readingForm: String,
val partOfSpeech: List<String>,
)
21 changes: 21 additions & 0 deletions test-scripts/01-integration-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,27 @@ def test_tokenize_using_sudachi_tokenizer(self):
self.assertEqual(6, tokens[3]["end_offset"])
return

def test_explain_tokenizer_details(self):
    """Analyze with ``explain`` enabled and check the first token's morpheme.

    The morpheme object is read from
    ``detail.tokenizer.tokens[0].morpheme`` of the analyze response; each
    expected key must be present and carry the expected value.
    """
    request = {
        "tokenizer": "sudachi_tokenizer",
        "text": "すだち",
        "explain": True,
    }
    resp = es_instance.analyze(request)
    self.assertEqual(200, resp.status)

    first_token = json.loads(resp.data)["detail"]["tokenizer"]["tokens"][0]
    morpheme = first_token["morpheme"]

    # (key, expected value) pairs, checked in this order.
    expectations = (
        ("surface", "すだち"),
        ("dictionaryForm", "すだち"),
        ("normalizedForm", "酢橘"),
        ("readingForm", "スダチ"),
        ("partOfSpeech", ["名詞", "普通名詞", "一般", "*", "*", "*"]),
    )
    for key, expected in expectations:
        self.assertIn(key, morpheme)
        self.assertEqual(expected, morpheme[key])
    return


class TestICUFiltered(unittest.TestCase):
# requires analysis-icu plugin installed
Expand Down

0 comments on commit e4e3012

Please sign in to comment.