-
Notifications
You must be signed in to change notification settings - Fork 0
/
formatting.py
89 lines (73 loc) · 2.6 KB
/
formatting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
def minify(text):
    """
    Minifies a syntactically correct XML file, i.e. it removes
    comments, extra spaces and new lines.

    Whitespace inside text content is preserved; only whitespace
    directly next to a tag (leading/trailing whitespace of the content)
    is stripped.

    NOTE: the clean-up is regex based and every pattern is applied to the
    whole string, so a literal '>' in text content, or '=' / quote
    characters followed by spaces inside an attribute value, are treated
    like tag syntax and may be normalised as well.

    Parameters:
    ----------
    - text (str):
        XML file string.

    Returns:
    --------
    str:
        Minified text.
    """
    # remove all comments ("*?" so that an empty comment "<!---->" is
    # removed as well)
    text = re.sub(r'<\s*!\s*-\s*-\s*[\S\s]*?-\s*-\s*>', r'', text)

    # remove all extra newlines or tabs after > or before <
    # avoid changing newlines in tag content while doing so
    # also remove all spaces before start of file and after end
    text = re.sub(r'\A\s+|(?<=>)\s+(?=<)|\s+\Z', r'', text)

    # remove all extra spaces after opening tag and before closing tag
    text = re.sub(r'\s+>', r'>', text)
    text = re.sub(r'\s+/>', r'/>', text)
    text = re.sub(r'<\s+', r'<', text)
    text = re.sub(r'</\s+', r'</', text)

    # remove all extra spaces after tag name and before first attribute if any
    # the name may not contain '<' or '>' so the match cannot run past the
    # end of a tag and collapse whitespace inside the text content
    text = re.sub(r'(<[^\s<>]+)\s+', r'\g<1> ', text)

    # remove all spaces surrounding = when defining an attribute
    # and avoid messing with tag contents if similar structures exist
    # (the lookahead requires a '>' before the next '<', i.e. we are
    # still inside a tag)
    text = re.sub(r'(\s*=\s*)(?=[^<]+>)', r'=', text)

    # remove all extra spaces between different attributes
    text = re.sub(r'\"\s+(?=[^<]+>)', r'" ', text)
    text = re.sub(r"\'\s+(?=[^<]+>)", r"' ", text)

    # remove any non-significant spaces and newlines after
    # closing tag and before text content, or before opening
    # tag and after text content.
    text = re.sub(r'\s+<', r'<', text)
    text = re.sub(r'>\s+', r'>', text)

    return text
def prettify(text, tab_length=4):
    """
    Prettifies/beautifies a syntactically correct XML file, i.e. it adds
    newlines and indentation where appropriate.

    The input is minified first, so comments are dropped and whitespace
    next to tags is normalised before the text is laid out again. Every
    tag and every piece of text content is placed on its own line.

    Parameters:
    ----------
    - text (str):
        XML file string.
    - tab_length (int, optional):
        Desired tab length (in spaces) for indentation. (Default is 4)

    Returns:
    --------
    str:
        Prettified text.
    """
    text = minify(text)

    # Get any text between tags, or any text content, and put in list
    tokens = re.findall(r'<[^>]+>|(?<=>)[^<]+(?=<)', text)

    indent_unit = ' ' * tab_length
    depth = 0
    lines = []

    for token in tokens:
        if token.startswith('</'):
            # closing tag: step back out first (never below the root level)
            depth = max(depth - 1, 0)
            lines.append(indent_unit * depth + token)
        elif token.startswith('<'):
            lines.append(indent_unit * depth + token)
            # Only a real opening tag starts a new nesting level.
            # Self-closing tags, declarations / processing instructions
            # ("<?...?>") and markup declarations ("<!...>") have no
            # matching closing tag, so they must not indent what follows.
            has_no_closing_tag = (
                token.endswith('/>') or token.startswith(('<?', '<!'))
            )
            if not has_no_closing_tag:
                depth += 1
        else:
            # text content sits at the level of its enclosing element's
            # children
            lines.append(indent_unit * depth + token)

    return '\n'.join(lines)