From 72957d5939b699ba1b76e0a2d9e28a0965f1798b Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 3 Jan 2023 07:43:35 -0600 Subject: [PATCH] Simplify handling of complex mixed elements when schema says it is a string; should always just repeat the content as string (#625) --- .../spark/xml/parsers/StaxXmlParser.scala | 2 + .../xml/parsers/StaxXmlParserUtils.scala | 61 +++++++------------ .../resources/mixed_children_as_string.xml | 9 +++ .../com/databricks/spark/xml/XmlSuite.scala | 25 +++++++- 4 files changed, 55 insertions(+), 42 deletions(-) create mode 100644 src/test/resources/mixed_children_as_string.xml diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala index d1f0d126..749347ff 100644 --- a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala +++ b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParser.scala @@ -173,6 +173,8 @@ private[xml] object StaxXmlParser extends Serializable { case _ => convertObject(parser, st, options) } } + case (_: Characters, _: StringType) => + StaxXmlParserUtils.currentStructureAsString(parser) case (c: Characters, _: DataType) if c.isWhiteSpace => // When `Characters` is found, we need to look further to decide // if this is really data or space between other elements. diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala index 951df87e..dea94800 100644 --- a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala +++ b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlParserUtils.scala @@ -123,52 +123,33 @@ private[xml] object StaxXmlParserUtils { * Convert the current structure of XML document to a XML string. */ def currentStructureAsString(parser: XMLEventReader): String = { - // (Hyukjin) I could not find a proper method to produce the current document - // as a string. For Jackson, there is a method `copyCurrentStructure()`. - // So, it ended up with manually converting event by event to string. - def convertChildren(): String = { - var childrenXmlString = "" - parser.peek match { - case _: StartElement => - childrenXmlString += currentStructureAsString(parser) - case c: Characters if c.isWhiteSpace => - // There can be a `Characters` event between `StartElement`s. - // So, we need to check further to decide if this is a data or just - // a whitespace between them. - childrenXmlString += c.getData - parser.next - parser.peek match { - case _: StartElement => - childrenXmlString += currentStructureAsString(parser) - case _: XMLEvent => - // do nothing - } - case c: Characters => - childrenXmlString += c.getData - case _: XMLEvent => - // do nothing - } - childrenXmlString - } - - var xmlString = "" - var shouldStop = false - while (!shouldStop) { + val xmlString = new StringBuilder() + var indent = 0 + do { parser.nextEvent match { case e: StartElement => - val attributes = e.getAttributes.asScala.map { a => + xmlString.append('<').append(e.getName) + e.getAttributes.asScala.foreach { a => val att = a.asInstanceOf[Attribute] - " " + att.getName + "=\"" + att.getValue + "\"" - }.mkString("") - xmlString += "<" + e.getName + attributes + ">" - xmlString += convertChildren() + xmlString.append(' ').append(att.getName).append("=\""). + append(att.getValue).append('"') + } + xmlString.append('>') + indent += 1 case e: EndElement => - xmlString += "" - shouldStop = checkEndElement(parser) + xmlString.append("') + indent -= 1 + case c: Characters => + xmlString.append(c.getData) case _: XMLEvent => // do nothing } - } - xmlString + } while (parser.peek() match { + case _: EndElement => + // until the unclosed end element for the whole parent is found + indent > 0 + case _ => true + }) + xmlString.toString() } /** diff --git a/src/test/resources/mixed_children_as_string.xml b/src/test/resources/mixed_children_as_string.xml new file mode 100644 index 00000000..ee2e4a7b --- /dev/null +++ b/src/test/resources/mixed_children_as_string.xml @@ -0,0 +1,9 @@ + + + + Lorem ipsum dolor sit amet. Ut voluptas distinctio et impedit deserunt aut quam fugit et quaerat odit et nesciunt earum non dolores culpa et sunt nobis. Aut accusamus iste sed odio debitis et quasi amet rem quam sequi et voluptatem placeat aut voluptates iste? Vel nisi rerum sit eligendi excepturi et galisum animi et ipsa nihil vel consequatur velit eos velit nesciunt. + Quo voluptatibus sint ab officiis aperiam non obcaecati rerum eos veniam iste eum ipsam modi. Non voluptatem illum qui molestiae magni qui maxime commodi et accusantium similique qui necessitatibus minus? + At quod rerum et porro nisi ut tempore error et enim optio cum Quis voluptatibus qui dolores sapiente cum cupiditate quia. Ut incidunt neque aut provident quaerat qui quia illum. Ab esse commodi ad earum molestias non internos atque non consequatur inventore 33 galisum nobis hic distinctio impedit! Est dicta iusto est numquam incidunt cum autem temporibus. + + + \ No newline at end of file diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index 28bda0f6..830e6f54 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -1239,8 +1239,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { .schema(schema) .xml(resDir + "cars-attribute.xml") .collect() - assert(result.head.get(0) === - "2015ChevyVoltNo") + assert(result.head.getString(0).contains("No")) } test("rootTag with simple attributes") { @@ -1458,6 +1457,28 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { } } + test("Issue 614: mixed content element parsed as string in schema") { + val textResults = spark.read + .schema(buildSchema(field("text"))) + .option("rowTag", "book") + .xml(resDir + "mixed_children_as_string.xml") + val textHead = textResults.select("text").head().getString(0) + assert(textHead.contains( + "Lorem ipsum dolor sit amet. Ut voluptas distinctio et impedit deserunt")) + assert(textHead.contains( + "numquam incidunt cum autem temporibus.")) + + val bookResults = spark.read + .schema(buildSchema(field("book"))) + .option("rowTag", "books") + .xml(resDir + "mixed_children_as_string.xml") + val bookHead = bookResults.select("book").head().getString(0) + assert(bookHead.contains( + "Lorem ipsum dolor sit amet. Ut voluptas distinctio et impedit deserunt")) + assert(bookHead.contains( + "numquam incidunt cum autem temporibus.")) + } + private def getLines(path: Path): Seq[String] = { val source = Source.fromFile(path.toFile) try {