scanner.go

package rewritehtml

import (
	"errors"
	"io"

	"golang.org/x/net/html"
)

// Scanner wraps an html.Tokenizer and turns it into a Reader.
type Scanner struct {
	maxBuf      int             // limit passed to html.Tokenizer.SetMaxBuf
	buffer      []byte          // input not yet consumed by the tokenizer
	tokenizer   *html.Tokenizer // current tokenizer; nil when it must be rebuilt
	previousTag string          // context tag for fragment tokenization
}

// NewScanner returns a scanner that is ready to Read an HTML document.
func NewScanner() *Scanner {
	return &Scanner{}
}

// SetMaxBuf sets the buffer limit passed to html.Tokenizer.SetMaxBuf;
// a value of 0 means unlimited.
func (s *Scanner) SetMaxBuf(maxBuf int) {
	s.maxBuf = maxBuf
}

// Concat resets the tokenizer, returns any unconsumed data to the front of
// the buffer, and appends p.
func (s *Scanner) Concat(p []byte) {
	if s.tokenizer != nil {
		s.buffer = append(s.tokenizer.Buffered(), s.buffer...)
		s.tokenizer = nil
	}
	s.buffer = append(s.buffer, p...)
}

// Buffered returns the number of bytes remaining in the internal buffer.
func (s *Scanner) Buffered() int {
	return len(s.buffer)
}

// Read copies as much of the internal buffer into p as will fit.
// It returns io.ErrNoProgress when the buffer is empty.
func (s *Scanner) Read(p []byte) (int, error) {
	size := len(p)
	have := len(s.buffer)
	read := have
	if size < have {
		read = size
	}
	if have == 0 {
		return 0, io.ErrNoProgress
	}
	copy(p, s.buffer[:read])
	s.buffer = s.buffer[read:]
	return read, nil
}

// Drain returns a reader that will consume the remaining buffer.
func (s *Scanner) Drain() io.Reader {
	s.Concat(nil)
	return NewFragmentReader(s, true)
}

// Next advances the html.Tokenizer and returns the current parse state.
// It returns io.ErrNoProgress when it needs more input via Concat,
// unless atEOF is true.
func (s *Scanner) Next(atEOF bool) (raw []byte, token *html.Token, err error) {
	for {
		if s.tokenizer == nil {
			s.tokenizer = html.NewTokenizerFragment(NewFragmentReader(s, atEOF), s.previousTag)
			s.tokenizer.SetMaxBuf(s.maxBuf)
		}
		tt := s.tokenizer.Next()
		if tt == html.ErrorToken {
			nextErr := s.tokenizer.Err()
			if errors.Is(nextErr, io.ErrNoProgress) {
				s.Concat(nil)
				if atEOF {
					// Rebuild the tokenizer over the remaining buffered data.
					continue
				}
			}
			return nil, nil, nextErr
		}
		raw := s.tokenizer.Raw()
		token := s.tokenizer.Token()
		if tt == html.StartTagToken {
			s.previousTag = token.Data
		}
		if tt == html.EndTagToken {
			s.previousTag = ""
		}
		return raw, &token, nil
	}
}
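
// The function below is an illustrative sketch of how a caller might feed
// chunked input through Scanner. It assumes NewFragmentReader (defined
// elsewhere in this package) lets io.ErrNoProgress reach the tokenizer while
// atEOF is false and reports io.EOF once atEOF is true; rewriteChunks, dst,
// and src are hypothetical names, not part of this package's API.
func rewriteChunks(dst io.Writer, src io.Reader) error {
	s := NewScanner()
	buf := make([]byte, 32*1024)
	for {
		// Pull the next chunk from the source and hand it to the scanner.
		n, readErr := src.Read(buf)
		if n > 0 {
			s.Concat(buf[:n])
		}
		atEOF := errors.Is(readErr, io.EOF)

		// Tokenize as far as the buffered input allows.
		for {
			raw, _, err := s.Next(atEOF)
			if errors.Is(err, io.ErrNoProgress) {
				break // more input is needed before tokenizing further
			}
			if errors.Is(err, io.EOF) {
				// Flush whatever is still buffered and finish.
				_, drainErr := io.Copy(dst, s.Drain())
				return drainErr
			}
			if err != nil {
				return err
			}
			// A real rewriter would inspect the token returned by Next;
			// this sketch ignores it and copies the raw bytes through.
			if _, err := dst.Write(raw); err != nil {
				return err
			}
		}

		if errors.Is(readErr, io.EOF) {
			// Normally s.Next reports io.EOF first (handled above); drain
			// defensively in case it did not.
			_, drainErr := io.Copy(dst, s.Drain())
			return drainErr
		}
		if readErr != nil {
			return readErr
		}
	}
}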