Skip to content

Commit

Permalink
优化subword拆分
Browse files Browse the repository at this point in the history
Former-commit-id: 89bb400
  • Loading branch information
jimichan committed Jun 15, 2020
1 parent e580ea5 commit dc37a11
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 12 deletions.
14 changes: 8 additions & 6 deletions mynlp/src/main/java/com/mayabot/nlp/segment/IterableMode.kt
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ class AtomIterable(val wrap: Iterable<WordTerm>) : Iterable<WordTerm> {


class AtomIterator(val from: Iterator<WordTerm>) : AbstractIterator<WordTerm>() {

var buffer: LinkedList<WordTerm>? = null

override fun computeNext() {
val b = buffer
if (b != null) {
Expand All @@ -87,13 +89,13 @@ class AtomIterator(val from: Iterator<WordTerm>) : AbstractIterator<WordTerm>()
}

if (from.hasNext()) {
val next = from.next()
if (next.hasSubword()) {
val x = LinkedList(next.subword)
buffer = x
setNext(x.poll()!!)
val term = from.next()
if (term.hasSubword()) {
val subwords = LinkedList(term.subword)
buffer = subwords
setNext(subwords.poll()!!)
} else {
setNext(next)
setNext(term)
}
} else {
done()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@ import java.util.function.Consumer
*/
class SentenceCollector : WordTermCollector {

/**
* 从wordnet中计算出子词的方法。
*/
override var pickUpSubword: WordTermCollector.PickUpSubword? = null

/**
* 给一个初始化wordnet的机会,填充更多的可能性
*/
override var fillSubword: WordTermCollector.FillSubword? = null

override fun collect(txtChars:CharArray?,wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer<WordTerm>) {
override fun collect(txtChars: CharArray?, wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer<WordTerm>) {

val vertexIterator = wordPath.iteratorVertex()

Expand All @@ -41,9 +47,8 @@ class SentenceCollector : WordTermCollector {

val pick = pickUpSubword

if (pick != null && term.length() >= 3) {
pick.pickup(term, wordnet, wordPath)
}
//给当前的term计算子词
pick?.pickup(term, wordnet, wordPath)

consumer.accept(term)
}
Expand Down
11 changes: 9 additions & 2 deletions mynlp/src/test/java/com/mayabot/nlp/Test.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
package com.mayabot.nlp;

import com.mayabot.nlp.segment.Lexer;
import com.mayabot.nlp.segment.Lexers;

/**
 * Manual smoke test: runs lexers over hard-coded Chinese sample text and prints
 * the results to standard output. It contains no assertions.
 *
 * NOTE(review): this listing appears to come from a rendered diff, so the first
 * println statement may be the pre-change line shown next to its replacement
 * rather than code that coexists with the rest of main — confirm against the
 * actual file.
 */
public class Test {
/**
 * Entry point; {@code args} is not read.
 */
public static void main(String[] args) {
// Scan a long news-style paragraph with the lexer returned by Lexers.core() and print the result.
System.out.println(Lexers.core().scan(
"瑞金二路街道举办“法官进社区”系列讲座活动信息来源:瑞金二路街道发布时间:2014-03-31字体: 【】日前,瑞金二路街道举办“法官进社区”系列讲座活动,邀请区法院民一庭审判长来讲《婚姻法》、《继承法》等法律。各居委调委会主任、委员共计70余人参加。法官对《婚姻法》、《继承法》等法律进行了多角度、多层次、深入浅出的精彩阐述,并结合自己的工作经历,用实际案例对相关法律条款进行了耐心、详细的讲解。通过开展“法官进社区”活动,统一建立社区法官制度和社区司法联络员制度,积极参与社会矛盾化解and社会管理创新,促进社区居民自治、民主管理。"));
// Short sample input for the second lexer.
String text = "上海市人民政府";

// Build a lexer from the core builder with indexPickup() selected on its collector.
// presumably this enables subword pickup for index-style segmentation — TODO confirm
Lexer lexer = Lexers.coreBuilder()
.collector().indexPickup().done().build();

// Print every element produced by scanning the short text, one per line.
lexer.scan(text).forEach(x -> {
System.out.println(x);
});
}
}

0 comments on commit dc37a11

Please sign in to comment.