diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index a657f4df0e3d..17280289ed1b 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -28,7 +28,6 @@ use arrow::datatypes::{ DataType, Field, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, }; - use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result}; /// The type signature of an instantiation of binary operator expression such as @@ -155,7 +154,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs: rhs.clone(), ret, }) - } else if let Some(coerced) = temporal_coercion(lhs, rhs) { + } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) { // Temporal arithmetic by first coercing to a common time representation // e.g. Date32 - Timestamp let ret = get_result(&coerced, &coerced).map_err(|e| { @@ -492,7 +491,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option bool { ) } +/// Non-strict Timezone Coercion is useful in scenarios where we can guarantee +/// a stable relationship between two timestamps of different timezones. +/// +/// An example of this is binary comparisons (<, >, ==, etc). Arrow stores timestamps +/// as relative to UTC epoch, and then adds the timezone as an offset. As a result, we can always +/// do a binary comparison between the two times. 
+///
+/// Timezone coercion is handled by the following rules:
+/// - If only one has a timezone, coerce the other to match
+/// - If both have a timezone, use the left timezone
+/// - Timezone names are never compared here, so no special handling of
+///   "UTC" vs "+00:00" is required
+///
+/// The resulting time unit is chosen by `timeunit_coercion`. Any pair that is
+/// not two `Timestamp`s is delegated to `temporal_coercion`.
+fn temporal_coercion_nonstrict_timezone(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    match (lhs_type, rhs_type) {
+        (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => {
+            let tz = match (lhs_tz, rhs_tz) {
+                // If both have a timezone, use the left timezone.
+                (Some(lhs_tz), Some(_rhs_tz)) => Some(Arc::clone(lhs_tz)),
+                // If only one side has a timezone, adopt it.
+                (Some(lhs_tz), None) => Some(Arc::clone(lhs_tz)),
+                (None, Some(rhs_tz)) => Some(Arc::clone(rhs_tz)),
+                // Neither side has a timezone, so the result has none either.
+                (None, None) => None,
+            };
+
+            let unit = timeunit_coercion(lhs_unit, rhs_unit);
+
+            Some(Timestamp(unit, tz))
+        }
+        // Not a Timestamp/Timestamp pair: use the timezone-agnostic rules.
+        _ => temporal_coercion(lhs_type, rhs_type),
+    }
+}
+
+/// Strict Timezone coercion is useful in scenarios where we cannot guarantee a stable relationship
+/// between two timestamps with different timezones or do not want implicit coercion between them.
+///
+/// An example of this is when attempting to coerce function arguments. Functions already have a mechanism
+/// for defining which timestamp types they want to support, so we do not want to do any further coercion.
+/// /// Coercion rules for Temporal columns: the type that both lhs and rhs can be /// casted to for the purpose of a date computation /// For interval arithmetic, it doesn't handle datetime type +/- interval -fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Timezone coercion is handled by the following rules: +/// - If only one has a timezone, coerce the other to match +/// - If both have a timezone, throw an error +/// - "UTC" and "+00:00" are considered equivalent +fn temporal_coercion_strict_timezone( + lhs_type: &DataType, + rhs_type: &DataType, +) -> Option { use arrow::datatypes::DataType::*; - use arrow::datatypes::IntervalUnit::*; - use arrow::datatypes::TimeUnit::*; match (lhs_type, rhs_type) { - (Interval(_), Interval(_)) => Some(Interval(MonthDayNano)), - (Date64, Date32) | (Date32, Date64) => Some(Date64), - (Timestamp(_, None), Date64) | (Date64, Timestamp(_, None)) => { - Some(Timestamp(Nanosecond, None)) - } - (Timestamp(_, _tz), Date64) | (Date64, Timestamp(_, _tz)) => { - Some(Timestamp(Nanosecond, None)) - } - (Timestamp(_, None), Date32) | (Date32, Timestamp(_, None)) => { - Some(Timestamp(Nanosecond, None)) - } - (Timestamp(_, _tz), Date32) | (Date32, Timestamp(_, _tz)) => { - Some(Timestamp(Nanosecond, None)) - } (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => { let tz = match (lhs_tz, rhs_tz) { (Some(lhs_tz), Some(rhs_tz)) => { @@ -1078,31 +1109,60 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option None, }; - let unit = match (lhs_unit, rhs_unit) { - (Second, Millisecond) => Second, - (Second, Microsecond) => Second, - (Second, Nanosecond) => Second, - (Millisecond, Second) => Second, - (Millisecond, Microsecond) => Millisecond, - (Millisecond, Nanosecond) => Millisecond, - (Microsecond, Second) => Second, - (Microsecond, Millisecond) => Millisecond, - (Microsecond, Nanosecond) => Microsecond, - (Nanosecond, Second) => Second, - (Nanosecond, Millisecond) => Millisecond, - 
(Nanosecond, Microsecond) => Microsecond,
-                (l, r) => {
-                    assert_eq!(l, r);
-                    *l
-                }
-            };
+            let unit = timeunit_coercion(lhs_unit, rhs_unit);
 
             Some(Timestamp(unit, tz))
         }
+        _ => temporal_coercion(lhs_type, rhs_type),
+    }
+}
+
+/// Timezone-agnostic temporal coercion shared by the strict and non-strict
+/// timestamp helpers.
+///
+/// - `Interval` with `Interval` coerces to `Interval(MonthDayNano)`
+/// - `Date32` with `Date64` (either order) coerces to `Date64`
+/// - `Timestamp` with `Date32`/`Date64` (either order) coerces to
+///   `Timestamp(Nanosecond, None)`; a timezone on the timestamp side does not
+///   affect the result
+/// - every other pair, including `Timestamp` with `Timestamp`, yields `None`
+fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    use arrow::datatypes::IntervalUnit::*;
+    use arrow::datatypes::TimeUnit::*;
+
+    match (lhs_type, rhs_type) {
+        (Interval(_), Interval(_)) => Some(Interval(MonthDayNano)),
+        (Date64, Date32) | (Date32, Date64) => Some(Date64),
+        // The timestamp's unit and timezone are both ignored, so one arm
+        // covers the timezone-less and timezone-carrying cases alike.
+        (Timestamp(_, _), Date64)
+        | (Date64, Timestamp(_, _))
+        | (Timestamp(_, _), Date32)
+        | (Date32, Timestamp(_, _)) => Some(Timestamp(Nanosecond, None)),
         _ => None,
     }
 }
 
+/// Picks the common unit for two timestamp units: the coarser of the two.
+///
+/// When both units are equal, that unit is returned unchanged.
+fn timeunit_coercion(lhs_unit: &TimeUnit, rhs_unit: &TimeUnit) -> TimeUnit {
+    use arrow::datatypes::TimeUnit::*;
+    match (lhs_unit, rhs_unit) {
+        (Second, Millisecond) => Second,
+        (Second, Microsecond) => Second,
+        (Second, Nanosecond) => Second,
+        (Millisecond, Second) => Second,
+        (Millisecond, Microsecond) => Millisecond,
+        (Millisecond, Nanosecond) => Millisecond,
+        (Microsecond, Second) => Second,
+        (Microsecond, Millisecond) => Millisecond,
+        (Microsecond, Nanosecond) => Microsecond,
+        (Nanosecond, Second) => Second,
+        (Nanosecond, Millisecond) => Millisecond,
+        (Nanosecond, Microsecond) => Microsecond,
+        // Every unequal pair of the four units named above is listed, so
+        // only equal units are expected here; the assert guards that.
+        (l, r) => {
+            assert_eq!(l, r);
+            *l
+        }
+    }
+}
+
 /// coercion rules from NULL type. Since NULL can be casted to any other type in arrow,
 /// either lhs or rhs is NULL, if NULL can be casted to type of the other side, the coercion is valid.
fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { @@ -1727,6 +1787,33 @@ mod tests { DataType::LargeBinary ); + // Timestamps + let utc: Option> = Some("UTC".into()); + test_coercion_binary_rule!( + DataType::Timestamp(TimeUnit::Second, utc.clone()), + DataType::Timestamp(TimeUnit::Second, utc.clone()), + Operator::Eq, + DataType::Timestamp(TimeUnit::Second, utc.clone()) + ); + test_coercion_binary_rule!( + DataType::Timestamp(TimeUnit::Second, utc.clone()), + DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())), + Operator::Eq, + DataType::Timestamp(TimeUnit::Second, utc.clone()) + ); + test_coercion_binary_rule!( + DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into())), + DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())), + Operator::Eq, + DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into())) + ); + test_coercion_binary_rule!( + DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())), + DataType::Timestamp(TimeUnit::Second, utc.clone()), + Operator::Eq, + DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())) + ); + // TODO add other data type Ok(()) } diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index 2ca2d49997a6..b63aad49d152 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -3021,3 +3021,99 @@ drop view t_utc; statement ok drop view t_timezone; + +# test comparisons across timestamps +statement ok +create table t AS +VALUES + ('2024-01-01T00:00:01Z'), + ('2024-02-01T00:00:01Z'), + ('2024-03-01T00:00:01Z') +; + +statement ok +create view t_utc as +select column1::timestamp AT TIME ZONE 'UTC' as "column1" +from t; + +statement ok +create view t_europe as +select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1" +from t; + +query P +SELECT column1 FROM t_utc WHERE column1 < 
'2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'; +---- +2024-01-01T00:00:01Z +2024-02-01T00:00:01Z + +query P +SELECT column1 FROM t_europe WHERE column1 = '2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles'; +---- +2024-02-01T00:00:01+01:00 + +query P +SELECT column1 FROM t_europe WHERE column1 BETWEEN '2020-01-01T00:00:00' AT TIME ZONE 'Australia/Brisbane' AND '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'; +---- +2024-01-01T00:00:01+01:00 +2024-02-01T00:00:01+01:00 + +query P +SELECT column1 FROM t_utc WHERE column1 IN ('2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles'); +---- +2024-02-01T00:00:01Z + +query P +SELECT column1 as u from t_utc UNION SELECT column1 from t_europe ORDER BY u; +---- +2023-12-31T23:00:01Z +2024-01-01T00:00:01Z +2024-01-31T23:00:01Z +2024-02-01T00:00:01Z +2024-02-29T23:00:01Z +2024-03-01T00:00:01Z + +query P +SELECT column1 as e from t_europe UNION SELECT column1 from t_utc ORDER BY e; +---- +2024-01-01T00:00:01+01:00 +2024-01-01T01:00:01+01:00 +2024-02-01T00:00:01+01:00 +2024-02-01T01:00:01+01:00 +2024-03-01T00:00:01+01:00 +2024-03-01T01:00:01+01:00 + +query P +SELECT nvl2(null, '2020-01-01T00:00:00-04:00'::timestamp, '2021-02-03T04:05:06Z'::timestamp) +---- +2021-02-03T04:05:06 + +query ? +SELECT make_array('2020-01-01T00:00:00-04:00'::timestamp, '2021-01-01T01:02:03Z'::timestamp); +---- +[2020-01-01T04:00:00, 2021-01-01T01:02:03] + +query P +SELECT * FROM VALUES + ('2023-12-31T23:00:00Z' AT TIME ZONE 'UTC'), + ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'); +---- +2023-12-31T15:00:00-08:00 +2024-02-01T00:00:00-08:00 + +query P +SELECT * FROM VALUES + ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'), + ('2023-12-31T23:00:00' AT TIME ZONE 'UTC'); +---- +2024-02-01T08:00:00Z +2023-12-31T23:00:00Z + +statement ok +drop table t; + +statement ok +drop view t_utc; + +statement ok +drop view t_europe;