Skip to content

Commit

Permalink
Support cross-timezone timestamp comparison via coercion (#11711)
Browse files Browse the repository at this point in the history
* feat: enable comparisons across timezones

* test: add tests for timezone changes

* test: fix test

* chore: cargofmt

* chore: improve documentation and code cleanup

* feat: use nonstrict timezone coercion for values
  • Loading branch information
jeffreyssmith2nd authored Jul 31, 2024
1 parent fa50636 commit ae2ca6a
Show file tree
Hide file tree
Showing 2 changed files with 222 additions and 39 deletions.
165 changes: 126 additions & 39 deletions datafusion/expr/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ use arrow::datatypes::{
DataType, Field, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
};

use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result};

/// The type signature of an instantiation of binary operator expression such as
Expand Down Expand Up @@ -155,7 +154,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result<Signature>
rhs: rhs.clone(),
ret,
})
} else if let Some(coerced) = temporal_coercion(lhs, rhs) {
} else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) {
// Temporal arithmetic by first coercing to a common time representation
// e.g. Date32 - Timestamp
let ret = get_result(&coerced, &coerced).map_err(|e| {
Expand Down Expand Up @@ -492,7 +491,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<D
}
binary_numeric_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_coercion(lhs_type, rhs_type, true))
.or_else(|| temporal_coercion(lhs_type, rhs_type))
.or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
.or_else(|| list_coercion(lhs_type, rhs_type))
.or_else(|| null_coercion(lhs_type, rhs_type))
Expand All @@ -508,7 +507,7 @@ pub fn values_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
return Some(lhs_type.clone());
}
binary_numeric_coercion(lhs_type, rhs_type)
.or_else(|| temporal_coercion(lhs_type, rhs_type))
.or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
.or_else(|| binary_coercion(lhs_type, rhs_type))
}
Expand Down Expand Up @@ -1036,29 +1035,61 @@ fn is_time_with_valid_unit(datatype: DataType) -> bool {
)
}

/// Non-strict timezone coercion: finds a common type for two temporal types
/// while tolerating timestamps that carry different timezones.
///
/// This is intended for operations such as binary comparisons (<, >, ==, etc).
/// Arrow stores timestamps relative to the UTC epoch and treats the timezone
/// as an offset on top of that, so two timestamps can always be ordered
/// against each other regardless of their timezones.
///
/// For a `Timestamp` / `Timestamp` pair the result is built as follows:
/// - timezone: the left timezone if it is set, otherwise the right timezone
///   (so when both are set the left one wins, and when neither is set the
///   result has no timezone)
/// - time unit: whatever `timeunit_coercion` picks for the two units
///
/// Because the left timezone is taken verbatim whenever it is present, no
/// timezone equivalence check is performed here.
///
/// Every other combination of types is delegated to `temporal_coercion`.
fn temporal_coercion_nonstrict_timezone(
    lhs_type: &DataType,
    rhs_type: &DataType,
) -> Option<DataType> {
    use arrow::datatypes::DataType::Timestamp;

    if let (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) =
        (lhs_type, rhs_type)
    {
        // Prefer the left timezone; fall back to the right one if the left is unset.
        let tz = lhs_tz.as_ref().or(rhs_tz.as_ref()).map(Arc::clone);
        let unit = timeunit_coercion(lhs_unit, rhs_unit);
        return Some(Timestamp(unit, tz));
    }

    temporal_coercion(lhs_type, rhs_type)
}

/// Strict Timezone coercion is useful in scenarios where we cannot guarantee a stable relationship
/// between two timestamps with different timezones or do not want implicit coercion between them.
///
/// An example of this is when attempting to coerce function arguments. Functions already have a mechanism
/// for defining which timestamp types they want to support, so we do not want to do any further coercion.
///
/// Coercion rules for Temporal columns: the type that both lhs and rhs can be
/// casted to for the purpose of a date computation
/// For interval arithmetic, it doesn't handle datetime type +/- interval
fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
/// Timezone coercion is handled by the following rules:
/// - If only one has a timezone, coerce the other to match
/// - If both have a timezone, throw an error
/// - "UTC" and "+00:00" are considered equivalent
fn temporal_coercion_strict_timezone(
lhs_type: &DataType,
rhs_type: &DataType,
) -> Option<DataType> {
use arrow::datatypes::DataType::*;
use arrow::datatypes::IntervalUnit::*;
use arrow::datatypes::TimeUnit::*;

match (lhs_type, rhs_type) {
(Interval(_), Interval(_)) => Some(Interval(MonthDayNano)),
(Date64, Date32) | (Date32, Date64) => Some(Date64),
(Timestamp(_, None), Date64) | (Date64, Timestamp(_, None)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, _tz), Date64) | (Date64, Timestamp(_, _tz)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, None), Date32) | (Date32, Timestamp(_, None)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, _tz), Date32) | (Date32, Timestamp(_, _tz)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => {
let tz = match (lhs_tz, rhs_tz) {
(Some(lhs_tz), Some(rhs_tz)) => {
Expand All @@ -1078,31 +1109,60 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
(None, None) => None,
};

let unit = match (lhs_unit, rhs_unit) {
(Second, Millisecond) => Second,
(Second, Microsecond) => Second,
(Second, Nanosecond) => Second,
(Millisecond, Second) => Second,
(Millisecond, Microsecond) => Millisecond,
(Millisecond, Nanosecond) => Millisecond,
(Microsecond, Second) => Second,
(Microsecond, Millisecond) => Millisecond,
(Microsecond, Nanosecond) => Microsecond,
(Nanosecond, Second) => Second,
(Nanosecond, Millisecond) => Millisecond,
(Nanosecond, Microsecond) => Microsecond,
(l, r) => {
assert_eq!(l, r);
*l
}
};
let unit = timeunit_coercion(lhs_unit, rhs_unit);

Some(Timestamp(unit, tz))
}
_ => temporal_coercion(lhs_type, rhs_type),
}
}

fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
use arrow::datatypes::IntervalUnit::*;
use arrow::datatypes::TimeUnit::*;

match (lhs_type, rhs_type) {
(Interval(_), Interval(_)) => Some(Interval(MonthDayNano)),
(Date64, Date32) | (Date32, Date64) => Some(Date64),
(Timestamp(_, None), Date64) | (Date64, Timestamp(_, None)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, _tz), Date64) | (Date64, Timestamp(_, _tz)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, None), Date32) | (Date32, Timestamp(_, None)) => {
Some(Timestamp(Nanosecond, None))
}
(Timestamp(_, _tz), Date32) | (Date32, Timestamp(_, _tz)) => {
Some(Timestamp(Nanosecond, None))
}
_ => None,
}
}

fn timeunit_coercion(lhs_unit: &TimeUnit, rhs_unit: &TimeUnit) -> TimeUnit {
use arrow::datatypes::TimeUnit::*;
match (lhs_unit, rhs_unit) {
(Second, Millisecond) => Second,
(Second, Microsecond) => Second,
(Second, Nanosecond) => Second,
(Millisecond, Second) => Second,
(Millisecond, Microsecond) => Millisecond,
(Millisecond, Nanosecond) => Millisecond,
(Microsecond, Second) => Second,
(Microsecond, Millisecond) => Millisecond,
(Microsecond, Nanosecond) => Microsecond,
(Nanosecond, Second) => Second,
(Nanosecond, Millisecond) => Millisecond,
(Nanosecond, Microsecond) => Microsecond,
(l, r) => {
assert_eq!(l, r);
*l
}
}
}

/// coercion rules from NULL type. Since NULL can be casted to any other type in arrow,
/// either lhs or rhs is NULL, if NULL can be casted to type of the other side, the coercion is valid.
fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
Expand Down Expand Up @@ -1727,6 +1787,33 @@ mod tests {
DataType::LargeBinary
);

// Timestamps
let utc: Option<Arc<str>> = Some("UTC".into());
test_coercion_binary_rule!(
DataType::Timestamp(TimeUnit::Second, utc.clone()),
DataType::Timestamp(TimeUnit::Second, utc.clone()),
Operator::Eq,
DataType::Timestamp(TimeUnit::Second, utc.clone())
);
test_coercion_binary_rule!(
DataType::Timestamp(TimeUnit::Second, utc.clone()),
DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
Operator::Eq,
DataType::Timestamp(TimeUnit::Second, utc.clone())
);
test_coercion_binary_rule!(
DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into())),
DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
Operator::Eq,
DataType::Timestamp(TimeUnit::Second, Some("America/New_York".into()))
);
test_coercion_binary_rule!(
DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into())),
DataType::Timestamp(TimeUnit::Second, utc.clone()),
Operator::Eq,
DataType::Timestamp(TimeUnit::Second, Some("Europe/Brussels".into()))
);

// TODO add other data type
Ok(())
}
Expand Down
96 changes: 96 additions & 0 deletions datafusion/sqllogictest/test_files/timestamps.slt
Original file line number Diff line number Diff line change
Expand Up @@ -3021,3 +3021,99 @@ drop view t_utc;

statement ok
drop view t_timezone;

# test comparisons across timestamps
statement ok
create table t AS
VALUES
('2024-01-01T00:00:01Z'),
('2024-02-01T00:00:01Z'),
('2024-03-01T00:00:01Z')
;

statement ok
create view t_utc as
select column1::timestamp AT TIME ZONE 'UTC' as "column1"
from t;

statement ok
create view t_europe as
select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
from t;

query P
SELECT column1 FROM t_utc WHERE column1 < '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
----
2024-01-01T00:00:01Z
2024-02-01T00:00:01Z

query P
SELECT column1 FROM t_europe WHERE column1 = '2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles';
----
2024-02-01T00:00:01+01:00

query P
SELECT column1 FROM t_europe WHERE column1 BETWEEN '2020-01-01T00:00:00' AT TIME ZONE 'Australia/Brisbane' AND '2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles';
----
2024-01-01T00:00:01+01:00
2024-02-01T00:00:01+01:00

query P
SELECT column1 FROM t_utc WHERE column1 IN ('2024-01-31T16:00:01' AT TIME ZONE 'America/Los_Angeles');
----
2024-02-01T00:00:01Z

query P
SELECT column1 as u from t_utc UNION SELECT column1 from t_europe ORDER BY u;
----
2023-12-31T23:00:01Z
2024-01-01T00:00:01Z
2024-01-31T23:00:01Z
2024-02-01T00:00:01Z
2024-02-29T23:00:01Z
2024-03-01T00:00:01Z

query P
SELECT column1 as e from t_europe UNION SELECT column1 from t_utc ORDER BY e;
----
2024-01-01T00:00:01+01:00
2024-01-01T01:00:01+01:00
2024-02-01T00:00:01+01:00
2024-02-01T01:00:01+01:00
2024-03-01T00:00:01+01:00
2024-03-01T01:00:01+01:00

query P
SELECT nvl2(null, '2020-01-01T00:00:00-04:00'::timestamp, '2021-02-03T04:05:06Z'::timestamp)
----
2021-02-03T04:05:06

query ?
SELECT make_array('2020-01-01T00:00:00-04:00'::timestamp, '2021-01-01T01:02:03Z'::timestamp);
----
[2020-01-01T04:00:00, 2021-01-01T01:02:03]

query P
SELECT * FROM VALUES
('2023-12-31T23:00:00Z' AT TIME ZONE 'UTC'),
('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles');
----
2023-12-31T15:00:00-08:00
2024-02-01T00:00:00-08:00

query P
SELECT * FROM VALUES
('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles'),
('2023-12-31T23:00:00' AT TIME ZONE 'UTC');
----
2024-02-01T08:00:00Z
2023-12-31T23:00:00Z

statement ok
drop table t;

statement ok
drop view t_utc;

statement ok
drop view t_europe;

0 comments on commit ae2ca6a

Please sign in to comment.