diff --git a/README.md b/README.md index 54cd51a2..49e5e6d5 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Defaults to `false`. New in 0.11.0. * `timestampFormat`: Specifies an additional timestamp format that will be tried when parsing values as `TimestampType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to try several formats, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT), -including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. +including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used. * `dateFormat`: Specifies an additional timestamp format that will be tried when parsing values as `DateType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0. @@ -83,7 +83,7 @@ When writing files the API accepts several options: * `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shorten names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified. * `timestampFormat`: Controls the format used to write `TimestampType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). -Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0. +Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used. * `dateFormat`: Controls the format used to write `DateType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0. diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index 198bf208..934878ec 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -67,10 +67,16 @@ class DefaultSource (options.charset, options.rowTag) } + val paramsWithTZ = + sqlContext.sparkContext.getConf.getOption("spark.sql.session.timeZone") match { + case Some(tz) => parameters.updated("timezone", tz) + case None => parameters + } + XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), - parameters, + paramsWithTZ, schema)(sqlContext) } diff --git a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala index 21994fcb..299a999e 100644 --- a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala +++ b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala @@ -64,6 +64,7 @@ private[xml] class XmlOptions( parameters.getOrElse("wildcardColName", XmlOptions.DEFAULT_WILDCARD_COL_NAME) val ignoreNamespace = parameters.get("ignoreNamespace").map(_.toBoolean).getOrElse(false) val timestampFormat = parameters.get("timestampFormat") + val timezone = parameters.get("timezone") val dateFormat = parameters.get("dateFormat") } diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 7222442e..2e81affc 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -17,17 +17,17 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} -import java.text.NumberFormat -import java.time.{LocalDate, ZoneId, ZonedDateTime} +import java.text.{NumberFormat, ParsePosition} +import java.time.{Instant, LocalDate, ZoneId} import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.util.Locale - import scala.util.Try import scala.util.control.Exception._ - import org.apache.spark.sql.types._ import com.databricks.spark.xml.XmlOptions +import java.time.temporal.TemporalQueries + /** * Utility functions for type casting */ @@ -116,11 +116,29 @@ private[xml] object TypeCast { ) private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { - val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern). - map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) - formatters.foreach { format => + supportedXmlTimestampFormatters.foreach { format => + try { + return Timestamp.from(Instant.from(format.parse(value))) + } catch { + case _: Exception => // continue + } + } + options.timestampFormat.foreach { formatString => + // Check if there is offset or timezone and apply Spark timeZone if not + // Useful to support Java 8 and Java 11+ as they prioritize zone and offset differently + val hasTemporalInformation = formatString.indexOf("V") + + formatString.indexOf("z") + + formatString.indexOf("O") + + formatString.indexOf("X") + + formatString.indexOf("x") + + formatString.indexOf("Z") != (-6) + val format = if (hasTemporalInformation) { + DateTimeFormatter.ofPattern(formatString) + } else { + DateTimeFormatter.ofPattern(formatString).withZone(options.timezone.map(ZoneId.of).orNull) + } try { - return Timestamp.from(ZonedDateTime.parse(value, format).toInstant) + return Timestamp.from(Instant.from(format.parse(value))) } catch { case _: Exception => // continue } diff --git a/src/test/resources/time.xml b/src/test/resources/time.xml index ee0609c7..0374d3e4 100644 --- a/src/test/resources/time.xml +++ b/src/test/resources/time.xml @@ -2,4 +2,6 @@ John Smith 12-03-2011 10:15:30 PST + 2011/12/03 06:15:30 + 2011/12/03 16:15:30 +1000 \ No newline at end of file diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index 830e6f54..e7700ae7 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -1370,7 +1370,13 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { .option("rowTag", "book") .xml(resDir + "time.xml") val expectedSchema = - buildSchema(field("author"), field("time", TimestampType), field("time2", StringType)) + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", StringType), + field("time4", StringType) + ) assert(df.schema === expectedSchema) assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) } @@ -1392,11 +1398,54 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { .option("timestampFormat", "MM-dd-yyyy HH:mm:ss z") .xml(resDir + "time.xml") val expectedSchema = - buildSchema(field("author"), field("time", TimestampType), field("time2", TimestampType)) + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", TimestampType), + field("time3", StringType), + field("time4", StringType), + ) assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L) } + test("Test custom timestampFormat without timezone") { + val df = spark.read + .option("rowTag", "book") + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss") + .xml(resDir + "time.xml") + val expectedSchema = + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", TimestampType), + field("time4", StringType) + ) + assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) + assert(df.collect().head.getAs[Timestamp](3).getTime === 1322892930000L) + } + + test("Test custom timestampFormat with offset") { + val df = spark.read + .option("rowTag", "book") + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss xx") + .xml(resDir + "time.xml") + val expectedSchema = + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", StringType), + field("time4", TimestampType) + ) + assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) + assert(df.collect().head.getAs[Timestamp](4).getTime === 1322892930000L) + } + test("Test null number type is null not 0.0") { val schema = buildSchema( struct("Header", diff --git a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala index fbc7885c..2fce0794 100644 --- a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala @@ -162,4 +162,73 @@ final class TypeCastSuite extends AnyFunSuite { Locale.setDefault(defaultLocale) } } + + test("Parsing built-in timestamp formatters") { + val options = XmlOptions(Map()) + val expectedResult = Timestamp.from( + ZonedDateTime.of(2002, 5, 30, 21, 46, 54, 0, ZoneId.of("UTC")) + .toInstant + ) + assert( + TypeCast.castTo("2002-05-30 21:46:54", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54+00:00", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54.0000Z", TimestampType, options) === expectedResult + ) + } + + test("Custom timestamp format is used to parse correctly") { + var options = XmlOptions(Map("timestampFormat" -> "MM-dd-yyyy HH:mm:ss", "timezone" -> "UTC")) + assert( + TypeCast.castTo("12-03-2011 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) + + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", "timezone" -> "UTC")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) + + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", + "timezone" -> "Asia/Shanghai")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) !== + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) + + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", + "timezone" -> "Asia/Shanghai")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("Asia/Shanghai")) + .toInstant + ) + ) + + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss")) + intercept[IllegalArgumentException]( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) + } }