diff --git a/README.md b/README.md
index 7689983..ec5e0bf 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ In this case no table will have a `parent` attribute, elements will not have the
followed by `closeElement("bar")`
- ` 5 < 6 ` is seen as valid text contents
- No support for custom entity expansion other than the standard XML
- entities (`< > " ' &`) and numeric ASCII entities
+ entities (`< > " ' &`) and numeric entities
(e.g. `
` or `<`)
- XML Declarations (``) are incorrectly reported
as Processing Instructions
@@ -157,6 +157,13 @@ In this case no table will have a `parent` attribute, elements will not have the
## History
+### v0.7 2014-Sep-26
++ Decodes entities above 127 as UTF8 (decimal and hexadecimal).
+ - The encoding specified by the document is (still) ignored.
+ If you parse an XML file encoded in some other format, that
+ intermixes 'raw' high-byte characters with high-byte entities,
+ the result will be a broken encoding.
+
### v0.6.1 2014-Sep-25
+ Fixes Issue #6, adding support for ASCII hexadecimal entities (e.g. `<`). (Thanks Leorex/Ben Bishop)
diff --git a/slaxml.lua b/slaxml.lua
index 7fa26d2..5bf8189 100644
--- a/slaxml.lua
+++ b/slaxml.lua
@@ -1,9 +1,9 @@
--[=====================================================================[
-v0.6.1 Copyright © 2013-2014 Gavin Kistner ; MIT Licensed
+v0.7 Copyright © 2013-2014 Gavin Kistner ; MIT Licensed
See http://github.com/Phrogz/SLAXML for details.
--]=====================================================================]
local SLAXML = {
- VERSION = "0.6.1",
+ VERSION = "0.7",
_call = {
pi = function(target,content)
print(string.format("%s %s?>",target,content))
@@ -12,16 +12,16 @@ local SLAXML = {
print(string.format("",content))
end,
startElement = function(name,nsURI,nsPrefix)
- io.write("<")
+ io.write("<")
if nsPrefix then io.write(nsPrefix,":") end
- io.write(name)
+ io.write(name)
if nsURI then io.write(" (ns='",nsURI,"')") end
- print(">")
+ print(">")
end,
attribute = function(name,value,nsURI,nsPrefix)
io.write(' ')
if nsPrefix then io.write(nsPrefix,":") end
- io.write(name,'=',string.format('%q',value))
+ io.write(name,'=',string.format('%q',value))
if nsURI then io.write(" (ns='",nsURI,"')") end
io.write("\n")
end,
@@ -42,7 +42,7 @@ function SLAXML:parse(xml,options)
if not options then options = { stripWhitespace=false } end
-- Cache references for maximum speed
- local find, sub, gsub, char, push, pop = string.find, string.sub, string.gsub, string.char, table.insert, table.remove
+ local find, sub, gsub, char, push, pop, concat = string.find, string.sub, string.gsub, string.char, table.insert, table.remove, table.concat
local first, last, match1, match2, match3, pos2, nsURI
local unpack = unpack or table.unpack
local pos = 1
@@ -52,11 +52,32 @@ function SLAXML:parse(xml,options)
local currentAttributes={}
local currentAttributeCt -- manually track length since the table is re-used
local nsStack = {}
+ local anyElement = false
local entityMap = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
- local entitySwap = function(orig,n,s) return entityMap[s] or n=="#" and char('0'..s) or orig end
+ local utf8bits = { {0x7FF,{192,32},{128,64}}, {0xFFFF,{224,16},{128,64},{128,64}}, {0x1FFFFF,{240,8},{128,64},{128,64},{128,64}} }
+ function utf8(decimal) -- decode a code point to a utf8-encoded string
+ if decimal<=127 then
+ return char(decimal)
+ else
+ local charbytes = {}
+ for b,lim in ipairs(utf8bits) do
+ if decimal<=lim[1] then
+ for i=b+1,2,-1 do
+ local prefix,max = lim[i+1][1],lim[i+1][2]
+ local mod = decimal % max
+ charbytes[i] = char( prefix + mod )
+ decimal = ( decimal - mod ) / max
+ end
+ charbytes[1] = char( decimal + lim[2][1] )
+ break
+ end
+ end
+ return concat(charbytes)
+ end
+ end
+ local entitySwap = function(orig,n,s) return entityMap[s] or n=="#" and utf8(tonumber('0'..s)) or orig end
local function unescape(str) return gsub( str, '(&(#?)([%d%a]+);)', entitySwap ) end
- local anyElement = false
local function finishText()
if first>textStart and self._call.text then
diff --git a/test/files/utf8.xml b/test/files/utf8.xml
new file mode 100644
index 0000000..6c1402e
--- /dev/null
+++ b/test/files/utf8.xml
@@ -0,0 +1,6 @@
+
+
+ crêpes: €3
+ crêpes: €3
+ crêpes: €3
+
\ No newline at end of file
diff --git a/test/test.lua b/test/test.lua
index a74f26d..0d07b89 100644
--- a/test/test.lua
+++ b/test/test.lua
@@ -260,4 +260,12 @@ function test_whitespace()
assertEqual(a.kids[2].value,"\nIt's a [raw][[raw]] >\nstring that not care\n about honey badgers.\n\n ")
end
+function test_utf8()
+ local root = SLAXML:dom(XML['utf8'],{stripWhitespace=true}).root
+ for _,s in ipairs(root.kids) do
+ assertEqual(s.attr.a,"crêpes: €3")
+ assertEqual(s.kids[1].value,"crêpes: €3")
+ end
+end
+
runTests{ useANSI=false }
\ No newline at end of file