forked from FooSoft/yomichan-import
-
Notifications
You must be signed in to change notification settings - Fork 0
/
jmdict_references.go
170 lines (160 loc) Β· 5.65 KB
/
jmdict_references.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
package rikaitan
import (
"fmt"
"strconv"
"strings"
)
/*
* In the future, JMdict will be updated to include sequence numbers
* with each cross reference. At that time, most of the functions and
* types defined in this file will become unnecessary. see:
* https://www.edrdg.org/jmdict_edict_list/2022/msg00008.html
*/
type (
	// searchValue is one entry of the hash -> candidates index built by
	// MakeHashToSearchValuesMap. It records where a headword hash was
	// found.
	searchValue struct {
		// sequence is the JMdict entry in which the hash occurs.
		sequence sequence
		// index is the position of the hash within that entry's list of
		// search hashes; FindBestSequence prefers lower values.
		index int
		// isPriority is copied from the searchHash this value was
		// derived from and is used as a tie-breaker between equal indices.
		isPriority bool
	}

	// searchHash pairs a headword hash with a priority flag. A list of
	// these is stored per sequence in jmdictMetadata.seqToSearchHashes.
	searchHash struct {
		hash       hash
		isPriority bool
	}
)
func parseReference(reference string) (headword, int, bool) {
// Reference strings in JMDict currently consist of 3 parts at
// most, separated by γ» characters. The latter two parts are
// optional. When the sense number is not specified, it is
// implied to be the first sense.
var h headword
var senseNumber int
ok := true
refParts := strings.Split(reference, "γ»")
if len(refParts) == 1 {
// (Kanji) or (Reading)
h = headword{Expression: refParts[0], Reading: refParts[0]}
senseNumber = 1
} else if len(refParts) == 2 {
// [Kanji + (Reading or Sense)] or (Reading + Sense)
val, err := strconv.Atoi(refParts[1])
if err == nil {
h = headword{Expression: refParts[0], Reading: refParts[0]}
senseNumber = val
} else {
h = headword{Expression: refParts[0], Reading: refParts[1]}
senseNumber = 1
}
} else if len(refParts) == 3 {
// Expression + Reading + Sense
h = headword{Expression: refParts[0], Reading: refParts[1]}
val, err := strconv.Atoi(strings.TrimSpace(refParts[2]))
if err == nil {
senseNumber = val
} else {
errortext := "Unexpected format (3rd part not integer) for x-ref \"" + reference + "\""
fmt.Println(errortext)
ok = false
}
} else {
errortext := "Unexpected format for x-ref \"" + reference + "\""
fmt.Println(errortext)
ok = false
}
return h, senseNumber, ok
}
// MakeReferenceToSeqMap rebuilds meta.referenceToSeq, mapping every
// reference string in meta.references to the sequence number chosen by
// FindBestSequence.
//
// The hash lookup table is rebuilt first (MakeHashToSearchValuesMap),
// since FindBestSequence reads it. A result of 0 from FindBestSequence
// means "no match": such references are reported once on standard
// output and left out of the map.
func (meta *jmdictMetadata) MakeReferenceToSeqMap() {
	meta.referenceToSeq = make(map[string]sequence)
	meta.MakeHashToSearchValuesMap()

	// Track every reference already attempted, resolved or not, so a
	// repeated unresolvable reference is neither searched for nor
	// reported more than once.
	attempted := make(map[string]struct{})
	for _, reference := range meta.references {
		if _, done := attempted[reference]; done {
			continue
		}
		attempted[reference] = struct{}{}

		seq := meta.FindBestSequence(reference)
		if seq == 0 {
			fmt.Println("Unable to convert reference to sequence number: `" + reference + "`")
			continue
		}
		meta.referenceToSeq[reference] = seq
	}
}
// MakeHashToSearchValuesMap rebuilds meta.hashToSearchValues by
// inverting meta.seqToSearchHashes: for every hash listed under a
// sequence it records that sequence, the hash's position in the
// sequence's list, and the hash's priority flag.
func (meta *jmdictMetadata) MakeHashToSearchValuesMap() {
	meta.hashToSearchValues = make(map[hash][]searchValue)
	for seq, hashes := range meta.seqToSearchHashes {
		for position, entry := range hashes {
			key := entry.hash
			candidates := meta.hashToSearchValues[key]
			candidates = append(candidates, searchValue{
				sequence:   seq,
				index:      position,
				isPriority: entry.isPriority,
			})
			meta.hashToSearchValues[key] = candidates
		}
	}
}
/*
* This function attempts to convert a JMdict reference string into a
* single definite sequence number. These reference strings are often
* ambiguous, so we have to resort to using heuristics.
*
* Generally, correspondence is determined by the order in which term
* pairs are extracted from each JMdict entry. Take for example the
* JMdict entry for ご本, which contains a reference to 本 (without a
* reading specified). To correlate this reference with a sequence
* number, our program searches each entry for the hash of 「本・本」.
* There are two entries in which it is found in JMdict (English):
*
* sequence 1260670: 「元・もと」「元・元」「もと・もと」「本・もと」「本・本」「素・もと」「素・素」「基・もと」「基・基」
* sequence 1522150: 「本・ほん」「本・本」「ほん・ほん」
*
* Because 「本・本」 is closer to the beginning of the array in the
* latter (i.e., has the lowest index), sequence number 1522150 is
* returned.
*
* In situations in which multiple sequences are found with the same
* index, the entry with a priority tag ("news1", "ichi1", "spec1",
* "spec2", "gai1") is given preference. This mostly affects
* katakana-only loanwords like ラグ.
*
* To improve accuracy, this method also checks to see if the
* reference's specified sense number really exists in the
* corresponding entry. For example, sequence 1582850 「如何で・いかんで」
* has a reference to sense #2 of いかん (no kanji specified), which
* could belong to 13 different sequences. However, sequences 1582850
* and 2829697 are the only 2 of those 13 which contain more than one
* sense. Incidentally, sequence 1582850 is the correct match.
*
* All else being equal, the entry with the smallest sequence number
* is chosen. References in the JMdict file are currently ambiguous,
* and getting this perfect won't be possible until reference sequence
* numbers are included in the file. See:
* https://github.com/JMdictProject/JMdictIssues/issues/61
*/
// FindBestSequence returns the sequence number judged to be the best
// match for the given reference string, or 0 when the reference cannot
// be parsed or no candidate entry qualifies.
//
// Candidates are the entries recorded in meta.hashToSearchValues under
// the hash of the referenced headword. They are ranked by lowest index,
// then by priority flag, then by lowest sequence number.
func (meta *jmdictMetadata) FindBestSequence(reference string) sequence {
	target, wantedSense, parsed := parseReference(reference)
	if !parsed {
		return 0
	}

	var winner sequence
	winnerIndex := 100000 // sentinel: larger than any index expected in practice
	winnerHasPriority := false

	for _, candidate := range meta.hashToSearchValues[target.Hash()] {
		sameIndex := candidate.index == winnerIndex
		switch {
		case meta.seqToSenseCount[candidate.sequence] < wantedSense:
			// The entry does not contain the requested sense.
			continue
		case candidate.index > winnerIndex:
			// Lower indices are better.
			continue
		case sameIndex && winnerHasPriority && !candidate.isPriority:
			// Equal index: a priority entry beats a non-priority one.
			continue
		case sameIndex && winnerHasPriority == candidate.isPriority && candidate.sequence > winner:
			// Equal index and priority: the lower sequence number wins.
			continue
		}
		winner = candidate.sequence
		winnerIndex = candidate.index
		winnerHasPriority = candidate.isPriority
	}
	return winner
}