forked from sajari/docconv
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xml.go
98 lines (89 loc) · 1.94 KB
/
xml.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package docconv
import (
"bytes"
"encoding/xml"
"fmt"
"io"
)
// ConvertXML converts an XML file to text.
//
// The input is first passed through Tidy and the cleaned document is then
// flattened to plain text by XMLToText in strict mode, with no break or skip
// elements configured.
//
// It returns the extracted text and an empty, non-nil metadata map (no
// metadata is extracted from XML here). On failure the text is empty, the map
// is nil, and the returned error wraps the underlying cause so callers can
// inspect it with errors.Is / errors.As.
func ConvertXML(r io.Reader) (string, map[string]string, error) {
	// NOTE(review): the second argument presumably tells Tidy the input is
	// XML rather than HTML — confirm against Tidy's definition.
	cleanXML, err := Tidy(r, true)
	if err != nil {
		return "", nil, fmt.Errorf("tidy error: %w", err)
	}

	result, err := XMLToText(bytes.NewReader(cleanXML), nil, nil, true)
	if err != nil {
		return "", nil, fmt.Errorf("error from XMLToText: %w", err)
	}

	// Allocate the metadata map only on the success path; error returns use nil.
	return result, make(map[string]string), nil
}
// XMLToText converts XML to plain text given how to treat elements.
//
// At most maxBytes are read from r. The character data of the document is
// concatenated in document order. Element names are compared by local name
// only (any namespace is ignored):
//
//   - for every entry of breaks equal to a start element's name, a "\n" is
//     written before that element's content;
//   - if a start element's name appears in skip, the element and everything
//     nested inside it are discarded.
//
// strict is forwarded to the decoder's Strict field. Any decoding error other
// than a clean end of input is returned with an empty string.
func XMLToText(r io.Reader, breaks []string, skip []string, strict bool) (string, error) {
	// Accumulate into a buffer: appending to a string would copy the whole
	// result again for every token.
	var buf bytes.Buffer

	dec := xml.NewDecoder(io.LimitReader(r, maxBytes))
	dec.Strict = strict
	for {
		t, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				break
			}
			return "", err
		}

		switch v := t.(type) {
		case xml.CharData:
			buf.Write(v)
		case xml.StartElement:
			for _, breakElement := range breaks {
				if v.Name.Local == breakElement {
					buf.WriteByte('\n')
				}
			}
			for _, skipElement := range skip {
				if v.Name.Local != skipElement {
					continue
				}
				// Consume everything up to the matching end element.
				// An io.EOF here is actually an error.
				if err := dec.Skip(); err != nil {
					return "", err
				}
				// Stop after the first match: a duplicate name in skip
				// must not swallow the tokens that follow this element.
				break
			}
		}
	}
	return buf.String(), nil
}
// XMLToMap converts XML to a flat string map keyed by element local name.
//
// At most maxBytes are read from r. Each piece of character data is stored
// under the local name of the most recently opened element that has not been
// followed by an end tag yet; a later value for the same name overwrites an
// earlier one. Character data that appears while no such element is open
// (before the root, or after a closing tag — typically indentation in
// pretty-printed documents) is ignored, so it cannot clobber a value that
// was already recorded.
//
// A decoding error other than a clean end of input is returned with a nil map.
func XMLToMap(r io.Reader) (map[string]string, error) {
	m := make(map[string]string)
	dec := xml.NewDecoder(io.LimitReader(r, maxBytes))

	// tagName is the element whose text is currently being collected; it is
	// empty whenever the last structural token was an end tag.
	var tagName string
	for {
		t, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, err
		}

		switch v := t.(type) {
		case xml.StartElement:
			tagName = v.Name.Local
		case xml.EndElement:
			// Text after a closing tag does not belong to that element.
			tagName = ""
		case xml.CharData:
			if tagName != "" {
				m[tagName] = string(v)
			}
		}
	}
	return m, nil
}