Skip to content

Commit

Permalink
Decode high-value entities into UTF8
Browse files Browse the repository at this point in the history
  • Loading branch information
Phrogz committed Sep 26, 2014
1 parent cb852af commit 9bf74d6
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 10 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ In this case no table will have a `parent` attribute, elements will not have the
followed by `closeElement("bar")`
- `<foo> 5 < 6 </foo>` is seen as valid text contents
- No support for custom entity expansion other than the standard XML
entities (`&lt; &gt; &quot; &apos; &amp;`) and numeric ASCII entities
entities (`&lt; &gt; &quot; &apos; &amp;`) and numeric entities
(e.g. `&#10;` or `&#x3c;`)
- XML Declarations (`<?xml version="1.x"?>`) are incorrectly reported
as Processing Instructions
Expand All @@ -157,6 +157,13 @@ In this case no table will have a `parent` attribute, elements will not have the

## History

### v0.7 2014-Sep-26
+ Decodes numeric entities above 127 (decimal and hexadecimal) as UTF-8.
- The encoding specified by the document is (still) ignored.
If you parse an XML file in some other encoding that intermixes
'raw' high-byte characters with high-byte entities, the result
will contain a mix of encodings and be broken.

### v0.6.1 2014-Sep-25
+ Fixes Issue #6, adding support for ASCII hexadecimal entities (e.g. `&#x3c;`). (Thanks Leorex/Ben Bishop)

Expand Down
39 changes: 30 additions & 9 deletions slaxml.lua
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
--[=====================================================================[
v0.6.1 Copyright © 2013-2014 Gavin Kistner <[email protected]>; MIT Licensed
v0.7 Copyright © 2013-2014 Gavin Kistner <[email protected]>; MIT Licensed
See http://github.com/Phrogz/SLAXML for details.
--]=====================================================================]
local SLAXML = {
VERSION = "0.6.1",
VERSION = "0.7",
_call = {
pi = function(target,content)
print(string.format("<?%s %s?>",target,content))
Expand All @@ -12,16 +12,16 @@ local SLAXML = {
print(string.format("<!-- %s -->",content))
end,
startElement = function(name,nsURI,nsPrefix)
io.write("<")
io.write("<")
if nsPrefix then io.write(nsPrefix,":") end
io.write(name)
io.write(name)
if nsURI then io.write(" (ns='",nsURI,"')") end
print(">")
print(">")
end,
attribute = function(name,value,nsURI,nsPrefix)
io.write(' ')
if nsPrefix then io.write(nsPrefix,":") end
io.write(name,'=',string.format('%q',value))
io.write(name,'=',string.format('%q',value))
if nsURI then io.write(" (ns='",nsURI,"')") end
io.write("\n")
end,
Expand All @@ -42,7 +42,7 @@ function SLAXML:parse(xml,options)
if not options then options = { stripWhitespace=false } end

-- Cache references for maximum speed
local find, sub, gsub, char, push, pop = string.find, string.sub, string.gsub, string.char, table.insert, table.remove
local find, sub, gsub, char, push, pop, concat = string.find, string.sub, string.gsub, string.char, table.insert, table.remove, table.concat
local first, last, match1, match2, match3, pos2, nsURI
local unpack = unpack or table.unpack
local pos = 1
Expand All @@ -52,11 +52,32 @@ function SLAXML:parse(xml,options)
local currentAttributes={}
local currentAttributeCt -- manually track length since the table is re-used
local nsStack = {}
local anyElement = false

local entityMap = { ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
local entitySwap = function(orig,n,s) return entityMap[s] or n=="#" and char('0'..s) or orig end
-- Lead-byte table for multi-byte UTF-8 sequences. Entry i describes a
-- sequence of i+1 bytes: { largest code point that fits, lead-byte marker }.
-- Every continuation byte has the form 10xxxxxx (128 + 6 payload bits), so
-- only the lead byte differs per length.
local utf8bits = { {0x7FF,192}, {0xFFFF,224}, {0x1FFFFF,240} }

-- Encode a numeric code point as a UTF-8 byte string.
-- Declared `local` so that it neither leaks into the global environment nor
-- replaces the standard `utf8` library table on Lua 5.3+.
-- @param decimal  code point as a non-negative number
-- @return the encoded string of 1-4 bytes, or nil when `decimal` is negative
--         or larger than 0x1FFFFF (cannot be represented in four bytes)
local function utf8(decimal)
	if decimal<0 then return nil end
	if decimal<=127 then return char(decimal) end
	local charbytes = {}
	for extra,marker in ipairs(utf8bits) do
		if decimal<=marker[1] then
			-- Peel off 6 bits at a time, filling continuation bytes last-to-first.
			for i=extra+1,2,-1 do
				local mod = decimal % 64
				charbytes[i] = char( 128 + mod )
				decimal = ( decimal - mod ) / 64
			end
			-- Whatever bits remain belong in the lead byte.
			charbytes[1] = char( marker[2] + decimal )
			return concat(charbytes)
		end
	end
	-- Out of range: report failure instead of returning an empty string,
	-- so callers can fall back to the original text.
	return nil
end
-- gsub callback that resolves a single entity reference.
-- @param orig  the complete matched text, e.g. "&#x20ac;"
-- @param n     "#" for a numeric reference, "" otherwise
-- @param s     the name or digits between "&"/"&#" and ";"
-- @return the replacement text, or `orig` unchanged when the entity is
--         unknown or malformed
local entitySwap = function(orig,n,s)
	local named = entityMap[s]
	if named then return named end
	if n=="#" then
		-- Prefixing "0" turns "x3c" into the hex literal "0x3c" while leaving
		-- decimal digits readable as a decimal number.
		local codepoint = tonumber('0'..s)
		-- The pattern also admits letters, so something like "&#zz;" reaches
		-- here with no numeric value; leave such text alone rather than
		-- passing nil on to utf8() and aborting the whole parse.
		if codepoint then return utf8(codepoint) or orig end
	end
	return orig
end

-- Replace every recognised entity reference in `str`.
-- Returns gsub's results unchanged (the new string followed by the match count).
local function unescape(str) return gsub( str, '(&(#?)([%d%a]+);)', entitySwap ) end
local anyElement = false

local function finishText()
if first>textStart and self._call.text then
Expand Down
6 changes: 6 additions & 0 deletions test/files/utf8.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<s a="crêpes: €3">crêpes: €3</s>
<s a="cr&#234;pes: &#8364;3">cr&#234;pes: &#8364;3</s>
<s a="cr&#xea;pes: &#x20ac;3">cr&#xea;pes: &#x20ac;3</s>
</root>
8 changes: 8 additions & 0 deletions test/test.lua
Original file line number Diff line number Diff line change
Expand Up @@ -260,4 +260,12 @@ function test_whitespace()
assertEqual(a.kids[2].value,"\nIt's a [raw][[raw]] >\nstring that <do/> not care\n about honey badgers.\n\n ")
end

-- The utf8.xml fixture spells the same text three ways (raw bytes, decimal
-- entities, hexadecimal entities); each <s> element must yield the same
-- string for both its `a` attribute and its text content.
function test_utf8()
	local expected = "crêpes: €3"
	local doc = SLAXML:dom(XML['utf8'],{stripWhitespace=true})
	for _,element in ipairs(doc.root.kids) do
		assertEqual(element.attr.a,expected)
		assertEqual(element.kids[1].value,expected)
	end
end

runTests{ useANSI=false }

0 comments on commit 9bf74d6

Please sign in to comment.