Skip to content

Commit

Permalink
Simplify handling of complex mixed elements when schema says it is a string; should always just repeat the content as string (#625)
Browse files (browse the repository at this point in the history)
  • Loading branch information
srowen authored Jan 3, 2023
1 parent 31b72da commit 72957d5
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ private[xml] object StaxXmlParser extends Serializable {
case _ => convertObject(parser, st, options)
}
}
case (_: Characters, _: StringType) =>
StaxXmlParserUtils.currentStructureAsString(parser)
case (c: Characters, _: DataType) if c.isWhiteSpace =>
// When `Characters` is found, we need to look further to decide
// if this is really data or space between other elements.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,52 +123,33 @@ private[xml] object StaxXmlParserUtils {
* Convert the current structure of XML document to a XML string.
*/
def currentStructureAsString(parser: XMLEventReader): String = {
// (Hyukjin) I could not find a proper method to produce the current document
// as a string. For Jackson, there is a method `copyCurrentStructure()`.
// So, it ended up with manually converting event by event to string.
def convertChildren(): String = {
var childrenXmlString = ""
parser.peek match {
case _: StartElement =>
childrenXmlString += currentStructureAsString(parser)
case c: Characters if c.isWhiteSpace =>
// There can be a `Characters` event between `StartElement`s.
// So, we need to check further to decide if this is a data or just
// a whitespace between them.
childrenXmlString += c.getData
parser.next
parser.peek match {
case _: StartElement =>
childrenXmlString += currentStructureAsString(parser)
case _: XMLEvent =>
// do nothing
}
case c: Characters =>
childrenXmlString += c.getData
case _: XMLEvent =>
// do nothing
}
childrenXmlString
}

var xmlString = ""
var shouldStop = false
while (!shouldStop) {
val xmlString = new StringBuilder()
var indent = 0
do {
parser.nextEvent match {
case e: StartElement =>
val attributes = e.getAttributes.asScala.map { a =>
xmlString.append('<').append(e.getName)
e.getAttributes.asScala.foreach { a =>
val att = a.asInstanceOf[Attribute]
" " + att.getName + "=\"" + att.getValue + "\""
}.mkString("")
xmlString += "<" + e.getName + attributes + ">"
xmlString += convertChildren()
xmlString.append(' ').append(att.getName).append("=\"").
append(att.getValue).append('"')
}
xmlString.append('>')
indent += 1
case e: EndElement =>
xmlString += "</" + e.getName + ">"
shouldStop = checkEndElement(parser)
xmlString.append("</").append(e.getName).append('>')
indent -= 1
case c: Characters =>
xmlString.append(c.getData)
case _: XMLEvent => // do nothing
}
}
xmlString
} while (parser.peek() match {
case _: EndElement =>
// until the unclosed end element for the whole parent is found
indent > 0
case _ => true
})
xmlString.toString()
}

/**
Expand Down
9 changes: 9 additions & 0 deletions src/test/resources/mixed_children_as_string.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<books>
<book>
<text>
Lorem ipsum dolor sit amet. Ut <i>voluptas</i> distinctio et impedit deserunt aut quam fugit et quaerat odit et nesciunt earum non dolores culpa et sunt nobis. Aut accusamus iste sed odio debitis et quasi amet rem quam sequi et voluptatem placeat aut voluptates iste? Vel nisi rerum sit eligendi excepturi et galisum animi et ipsa nihil vel consequatur velit eos velit nesciunt.
Quo voluptatibus sint ab officiis aperiam non obcaecati rerum eos veniam iste eum ipsam modi. <i>Non</i> voluptatem illum qui molestiae magni qui maxime commodi et accusantium similique qui necessitatibus <i>minus</i>?
At quod rerum et porro nisi ut tempore error et enim optio cum Quis voluptatibus qui dolores sapiente cum cupiditate quia. Ut incidunt neque aut provident quaerat qui quia <i>illum</i>. Ab esse commodi ad earum molestias non internos atque non <i>consequatur</i> inventore 33 galisum nobis hic distinctio impedit! Est dicta iusto est <i>numquam</i> incidunt cum autem temporibus.
</text>
</book>
</books>
25 changes: 23 additions & 2 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1239,8 +1239,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
.schema(schema)
.xml(resDir + "cars-attribute.xml")
.collect()
assert(result.head.get(0) ===
"<year>2015</year><make>Chevy</make><model>Volt</model><comment foo=\"bar\">No</comment>")
assert(result.head.getString(0).contains("<comment foo=\"bar\">No</comment>"))
}

test("rootTag with simple attributes") {
Expand Down Expand Up @@ -1458,6 +1457,28 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
}
}

test("Issue 614: mixed content element parsed as string in schema") {
  // Snippets of the fixture that must show up verbatim, inline <i> markup included,
  // in the string value produced for the requested field.
  val expectedSnippets = Seq(
    "Lorem ipsum dolor sit amet. Ut <i>voluptas</i> distinctio et impedit deserunt",
    "<i>numquam</i> incidunt cum autem temporibus.")

  // Reads the fixture with `fieldName` as the only schema field and `rowTag` as the
  // row element, then returns that field's value from the first row as a string.
  def firstValue(fieldName: String, rowTag: String): String =
    spark.read
      .schema(buildSchema(field(fieldName)))
      .option("rowTag", rowTag)
      .xml(resDir + "mixed_children_as_string.xml")
      .select(fieldName)
      .head()
      .getString(0)

  // First the mixed-content element itself (<text> inside a <book> row), then its
  // parent (<book> inside the <books> row), in the same order as before.
  Seq("text" -> "book", "book" -> "books").foreach { case (fieldName, rowTag) =>
    val content = firstValue(fieldName, rowTag)
    expectedSnippets.foreach { snippet =>
      assert(content.contains(snippet))
    }
  }
}

private def getLines(path: Path): Seq[String] = {
val source = Source.fromFile(path.toFile)
try {
Expand Down

0 comments on commit 72957d5

Please sign in to comment.