-
Notifications
You must be signed in to change notification settings - Fork 4
/
scanner.go
298 lines (244 loc) · 6.52 KB
/
scanner.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
// (c) 2013 Rick Arnold. Licensed under the BSD license (see LICENSE).
package props
import (
"bytes"
"fmt"
"strings"
"unicode"
"unicode/utf16"
)
// hexChars lists every character accepted as a hexadecimal digit while the
// digits of a \uXXXX escape are being read.
const hexChars = "0123456789ABCDEFabcdef"
// scanner holds the state needed to parse the property file format as
// defined by Java. The state-machine design is based on the "Lexical Scanning
// in Go" presentation by Rob Pike at
// http://cuddle.googlecode.com/hg/talk/lex.html
type scanner struct {
	// p is the Properties instance whose values map receives each entry.
	p *Properties

	// key and value accumulate the text of the entry currently being read.
	key, value bytes.Buffer

	// current points at the buffer (key or value) that output is being
	// appended to.
	current *bytes.Buffer

	// utfUnits buffers the hex digits of consecutive \uXXXX escapes, one
	// buffer per escape. Several escapes in a row are decoded together so
	// that characters needing more than one UTF-16 code unit can be
	// represented.
	utfUnits []bytes.Buffer
}
// finishEscape discards any buffered UTF-16 escape units and returns the
// state in which scanning resumes: stateKey while the key buffer is the
// current output buffer, stateValue otherwise.
func (s *scanner) finishEscape() stateFunc {
	s.utfUnits = nil
	if s.current == &s.key {
		return stateKey
	}
	return stateValue
}
// startUtfEscape appends a fresh, empty digit buffer for a new \uXXXX escape
// (allocating the list with room for four units on first use) and returns
// stateUtfEscape so the hex digits that follow are collected into it.
func (s *scanner) startUtfEscape() stateFunc {
	pending := s.utfUnits
	if pending == nil {
		pending = make([]bytes.Buffer, 0, 4)
	}
	s.utfUnits = append(pending, bytes.Buffer{})
	return stateUtfEscape
}
// finishUtfEscape decodes the buffered \uXXXX escapes as a sequence of UTF-16
// code units and appends the resulting characters to the current output
// buffer. It does nothing when no escapes are buffered. If any buffered unit
// cannot be parsed as hexadecimal, a single Unicode replacement character is
// written instead of the whole sequence. In both of the latter cases the
// buffered units are cleared via finishEscape.
func (s *scanner) finishUtfEscape() {
	if len(s.utfUnits) == 0 {
		return
	}

	codeUnits := make([]uint16, 0, len(s.utfUnits))
	for i := range s.utfUnits {
		digits := strings.ToLower(s.utfUnits[i].String())
		var parsed uint16
		if _, err := fmt.Sscanf(digits, "%x", &parsed); err != nil {
			s.current.WriteRune(unicode.ReplacementChar)
			s.finishEscape()
			return
		}
		codeUnits = append(codeUnits, parsed)
	}

	// Decoding all units together lets surrogate pairs combine into one rune.
	s.current.WriteString(string(utf16.Decode(codeUnits)))
	s.finishEscape()
}
// checkEscape is the common first step of the scanning states. For any
// character other than a backslash it flushes buffered UTF-16 escapes and
// returns nil, telling the caller to handle ch itself. For a backslash it
// returns stateEscape, first pointing the output at the key buffer if no
// output buffer has been chosen yet; buffered escapes are left alone so that
// an immediately following \uXXXX can be decoded together with them.
func (s *scanner) checkEscape(ch rune) stateFunc {
	if ch != '\\' {
		s.finishUtfEscape()
		return nil
	}
	if s.current == nil {
		s.current = &s.key
	}
	return stateEscape
}
// done stores the entry still being accumulated, if it has a non-empty key.
// It is meant to be called once no more input will be fed to the scanner
// (the caller is not part of this file).
//
// Completed \uXXXX escapes are only decoded when the character after them
// arrives, so input that ends right after an escape still has its code units
// buffered. Those are flushed here; otherwise the final characters of the
// last entry would be silently lost. An escape cut short by the end of input
// yields the Unicode replacement character, matching what stateUtfEscape does
// when a newline cuts an escape short.
func (s *scanner) done() {
	if n := len(s.utfUnits); n > 0 && s.current != nil {
		// The last unit is incomplete if input ended in the middle of its
		// four hex digits; keep it out of the decode.
		truncated := s.utfUnits[n-1].Len() < 4
		if truncated {
			s.utfUnits = s.utfUnits[:n-1]
		}
		s.finishUtfEscape()
		if truncated {
			s.current.WriteRune(unicode.ReplacementChar)
		}
		s.utfUnits = nil
	}

	if s.key.Len() > 0 {
		s.p.values[s.key.String()] = s.value.String()
	}
}
// stateFunc represents a single state in the scanner's state machine. It is
// given the scanner and one input character and returns the state to continue
// with.
type stateFunc func(*scanner, rune) stateFunc
// stateNone is the default state at the beginning of each line. Leading
// whitespace is skipped, '#' or '!' starts a comment, a backslash starts an
// escape, and any other character becomes the first character of a key.
func stateNone(s *scanner, ch rune) stateFunc {
	if escaped := s.checkEscape(ch); escaped != nil {
		return escaped
	}

	switch {
	case ch == '#' || ch == '!':
		return stateComment
	case isWhitespace(ch):
		return stateNone
	}

	s.current = &s.key
	s.key.WriteRune(ch)
	return stateKey
}
// stateComment indicates that the current line is a comment. Every character
// is ignored; a carriage return or newline ends the comment and returns the
// scanner to stateNone.
func stateComment(_ *scanner, ch rune) stateFunc {
	switch ch {
	case '\r', '\n':
		return stateNone
	default:
		return stateComment
	}
}
// stateKey indicates that the key is being read; all characters up to the
// first (unescaped) whitespace, '=', or ':' are considered part of the key.
// A line ending finishes the entry with whatever value has been read so far;
// any other terminator redirects output to the value buffer.
func stateKey(s *scanner, ch rune) stateFunc {
	if escaped := s.checkEscape(ch); escaped != nil {
		return escaped
	}

	switch {
	case ch == '=' || ch == ':':
		s.current = &s.value
		return stateSeparatorChar
	case ch == '\r' || ch == '\n':
		// Must be tested before isWhitespace, which also matches line endings.
		return finishEntry(s)
	case isWhitespace(ch):
		s.current = &s.value
		return stateSeparator
	}

	s.current.WriteRune(ch)
	return stateKey
}
// stateSeparator indicates that whitespace between the key and value is being
// read. A '=' or ':' moves on to stateSeparatorChar, a line ending finishes
// the entry, further whitespace is skipped, and any other character is
// written to the current buffer as the first character of the value.
func stateSeparator(s *scanner, ch rune) stateFunc {
	if escaped := s.checkEscape(ch); escaped != nil {
		return escaped
	}

	switch {
	case ch == '=' || ch == ':':
		return stateSeparatorChar
	case ch == '\r' || ch == '\n':
		// Must be tested before isWhitespace, which also matches line endings.
		return finishEntry(s)
	case isWhitespace(ch):
		return stateSeparator
	}

	s.current.WriteRune(ch)
	return stateValue
}
// stateSeparatorChar indicates that the '=' or ':' separator has been seen
// and any whitespace before the value is being skipped. A line ending
// finishes the entry; the first other character is written to the current
// buffer and value scanning begins.
func stateSeparatorChar(s *scanner, ch rune) stateFunc {
	if escaped := s.checkEscape(ch); escaped != nil {
		return escaped
	}

	switch {
	case ch == '\r' || ch == '\n':
		// Must be tested before isWhitespace, which also matches line endings.
		return finishEntry(s)
	case isWhitespace(ch):
		return stateSeparatorChar
	}

	s.current.WriteRune(ch)
	return stateValue
}
// stateValue indicates that the value text is being read. Every character up
// to the next unescaped line ending is appended to the current buffer; the
// line ending finishes the entry.
func stateValue(s *scanner, ch rune) stateFunc {
	if escaped := s.checkEscape(ch); escaped != nil {
		return escaped
	}

	switch ch {
	case '\r', '\n':
		return finishEntry(s)
	}

	s.current.WriteRune(ch)
	return stateValue
}
// stateContinued indicates that an escaped newline or the leading whitespace
// of the following line is being read. Whitespace, which per isWhitespace
// includes further line endings, is skipped. The first non-whitespace
// character continues the key or value that was being read before the line
// break; a backslash there starts a new escape.
func stateContinued(s *scanner, ch rune) stateFunc {
	if !isWhitespace(ch) {
		if escaped := s.checkEscape(ch); escaped != nil {
			return escaped
		}
		s.current.WriteRune(ch)
		return s.finishEscape()
	}
	return stateContinued
}
// stateEscape indicates that the character after a backslash is being read.
// 'u' begins a UTF-16 escape. Anything else first flushes buffered UTF-16
// escapes; then a line ending begins a line continuation, the escapes t, n,
// r and f are replaced by the corresponding control character, and any other
// character is written unchanged. After a character has been written, normal
// scanning of the key or value resumes.
func stateEscape(s *scanner, ch rune) stateFunc {
	if ch == 'u' {
		return s.startUtfEscape()
	}
	s.finishUtfEscape()

	switch ch {
	case '\n', '\r':
		return stateContinued
	case 't':
		s.current.WriteRune('\t')
	case 'n':
		s.current.WriteRune('\n')
	case 'r':
		s.current.WriteRune('\r')
	case 'f':
		s.current.WriteRune('\f')
	default:
		s.current.WriteRune(ch)
	}
	return s.finishEscape()
}
// stateUtfEscape indicates that the hex digits of a \uXXXX escape are being
// read. Additional 'u' characters are skipped. Once 4 hex digits have been
// collected the code unit stays buffered and scanning of the key or value
// resumes; the unit is decoded later, together with any escapes that follow
// it directly, so that characters needing more than one UTF-16 code unit can
// be written as several escapes in a row.
//
// Any other character makes the escape malformed. Code units completed by
// earlier escapes in the same run are still decoded and written, followed by
// the Unicode replacement character for the malformed escape. The offending
// character is consumed; if it is a line ending it also finishes the entry.
func stateUtfEscape(s *scanner, ch rune) stateFunc {
	if ch == 'u' {
		return stateUtfEscape
	}

	if strings.ContainsRune(hexChars, ch) {
		unit := &s.utfUnits[len(s.utfUnits)-1]
		unit.WriteRune(ch)
		if unit.Len() < 4 {
			return stateUtfEscape
		}
		// The unit is complete; leave it buffered and resume normal scanning.
		if s.current == &s.key {
			return stateKey
		}
		return stateValue
	}

	// Malformed escape. Drop only the partial unit: calling finishEscape
	// directly would also throw away completed units that have not been
	// decoded yet.
	if n := len(s.utfUnits); n > 0 {
		s.utfUnits = s.utfUnits[:n-1]
	}
	s.finishUtfEscape()
	s.current.WriteRune(unicode.ReplacementChar)
	resume := s.finishEscape()
	if ch == '\n' || ch == '\r' {
		return finishEntry(s)
	}
	return resume
}
// finishEntry handles the end of a property file entry: it stores the
// accumulated key/value pair, empties both buffers, points the output back at
// the key buffer, and returns stateNone for the start of the next line.
func finishEntry(s *scanner) stateFunc {
	name, text := s.key.String(), s.value.String()
	s.p.values[name] = text

	s.value.Reset()
	s.key.Reset()
	s.current = &s.key
	return stateNone
}
// isWhitespace reports whether ch is one of the characters the property file
// format treats as whitespace: space, tab, carriage return, newline, or form
// feed.
func isWhitespace(ch rune) bool {
	switch ch {
	case ' ', '\t', '\r', '\n', '\f':
		return true
	default:
		return false
	}
}