Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Ignore names of technical inner fields (of List and Map types) when comparing datatypes for logical equivalence #13522

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
326 changes: 320 additions & 6 deletions datafusion/common/src/dfschema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -656,9 +656,26 @@ impl DFSchema {
(othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype,
(DataType::List(f1), DataType::List(f2))
| (DataType::LargeList(f1), DataType::LargeList(f2))
| (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
| (DataType::Map(f1, _), DataType::Map(f2, _)) => {
Self::field_is_logically_equal(f1, f2)
| (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => {
// Don't compare the names of the technical inner field
// Usually "item" but that's not mandated
Comment on lines +660 to +661
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that depends what "logically equal" actually means.
The definition for this function is currently by-an-example, we should try to formalize what we mean.

Self::datatype_is_logically_equal(f1.data_type(), f2.data_type())
}
(DataType::Map(f1, _), DataType::Map(f2, _)) => {
// Don't compare the names of the technical inner fields
// Usually "entries", "key", "value" but that's not mandated
match (f1.data_type(), f2.data_type()) {
(DataType::Struct(f1_inner), DataType::Struct(f2_inner)) => {
f1_inner.len() == f2_inner.len()
&& f1_inner.iter().zip(f2_inner.iter()).all(|(f1, f2)| {
Self::datatype_is_logically_equal(
f1.data_type(),
f2.data_type(),
)
})
}
_ => panic!("Map type should have an inner struct field"),
}
}
(DataType::Struct(fields1), DataType::Struct(fields2)) => {
let iter1 = fields1.iter();
Expand Down Expand Up @@ -695,9 +712,26 @@ impl DFSchema {
}
(DataType::List(f1), DataType::List(f2))
| (DataType::LargeList(f1), DataType::LargeList(f2))
| (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
| (DataType::Map(f1, _), DataType::Map(f2, _)) => {
Self::field_is_semantically_equal(f1, f2)
| (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _)) => {
// Don't compare the names of the technical inner field
// Usually "item" but that's not mandated
Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
}
(DataType::Map(f1, _), DataType::Map(f2, _)) => {
// Don't compare the names of the technical inner fields
// Usually "entries", "key", "value" but that's not mandated
match (f1.data_type(), f2.data_type()) {
(DataType::Struct(f1_inner), DataType::Struct(f2_inner)) => {
f1_inner.len() == f2_inner.len()
&& f1_inner.iter().zip(f2_inner.iter()).all(|(f1, f2)| {
Self::datatype_is_semantically_equal(
f1.data_type(),
f2.data_type(),
)
})
}
_ => panic!("Map type should have an inner struct field"),
}
}
(DataType::Struct(fields1), DataType::Struct(fields2)) => {
let iter1 = fields1.iter();
Expand Down Expand Up @@ -1332,6 +1366,286 @@ mod tests {
Ok(())
}

#[test]
fn test_datatype_is_logically_equal() {
assert!(DFSchema::datatype_is_logically_equal(
&DataType::Int8,
&DataType::Int8
));

assert!(!DFSchema::datatype_is_logically_equal(
&DataType::Int8,
&DataType::Int16
));

// Test lists

// Succeeds if both have the same element type, disregards names and nullability
assert!(DFSchema::datatype_is_logically_equal(
&DataType::List(Field::new("item", DataType::Int8, true).into()),
&DataType::List(Field::new("element", DataType::Int8, false).into())
));

// Fails if element type is different
assert!(!DFSchema::datatype_is_logically_equal(
&DataType::List(Field::new("item", DataType::Int8, true).into()),
&DataType::List(Field::new("item", DataType::Int16, true).into())
));

// Test maps
let map_field = DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int8, false),
Field::new("value", DataType::Int8, true),
])),
true,
)
.into(),
true,
);

// Succeeds if both maps have the same key and value types, disregards names and nullability
assert!(DFSchema::datatype_is_logically_equal(
&map_field,
&DataType::Map(
Field::new(
"pairs",
DataType::Struct(Fields::from(vec![
Field::new("one", DataType::Int8, false),
Field::new("two", DataType::Int8, false)
])),
true
)
.into(),
true
)
));
// Fails if value type is different
assert!(!DFSchema::datatype_is_logically_equal(
&map_field,
&DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int8, false),
Field::new("value", DataType::Int16, true)
])),
true
)
.into(),
true
)
));

// Fails if key type is different
assert!(!DFSchema::datatype_is_logically_equal(
&map_field,
&DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int16, false),
Field::new("value", DataType::Int8, true)
])),
true
)
.into(),
true
)
));

// Test structs

let struct_field = DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int8, true),
Field::new("b", DataType::Int8, true),
]));

// Succeeds if both have same names and datatypes, ignores nullability
assert!(DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int8, false),
Field::new("b", DataType::Int8, true),
]))
));

// Fails if field names are different
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("x", DataType::Int8, true),
Field::new("y", DataType::Int8, true),
]))
));

// Fails if types are different
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int16, true),
Field::new("b", DataType::Int8, true),
]))
));

// Fails if more or less fields
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int8, true),]))
));
}

#[test]
fn test_datatype_is_logically_equivalent_to_dictionary() {
// Dictionary is logically equal to its value type
assert!(DFSchema::datatype_is_logically_equal(
&DataType::Utf8,
&DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
));
}

#[test]
fn test_datatype_is_semantically_equal() {
assert!(DFSchema::datatype_is_semantically_equal(
&DataType::Int8,
&DataType::Int8
));

assert!(!DFSchema::datatype_is_semantically_equal(
&DataType::Int8,
&DataType::Int16
));

// Test lists

// Succeeds if both have the same element type, disregards names and nullability
assert!(DFSchema::datatype_is_semantically_equal(
&DataType::List(Field::new("item", DataType::Int8, true).into()),
&DataType::List(Field::new("element", DataType::Int8, false).into())
));

// Fails if element type is different
assert!(!DFSchema::datatype_is_semantically_equal(
&DataType::List(Field::new("item", DataType::Int8, true).into()),
&DataType::List(Field::new("item", DataType::Int16, true).into())
));

// Test maps
let map_field = DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int8, false),
Field::new("value", DataType::Int8, true),
])),
true,
)
.into(),
true,
);

// Succeeds if both maps have the same key and value types, disregards names and nullability
assert!(DFSchema::datatype_is_semantically_equal(
&map_field,
&DataType::Map(
Field::new(
"pairs",
DataType::Struct(Fields::from(vec![
Field::new("one", DataType::Int8, false),
Field::new("two", DataType::Int8, false)
])),
true
)
.into(),
true
)
));
// Fails if value type is different
assert!(!DFSchema::datatype_is_semantically_equal(
&map_field,
&DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int8, false),
Field::new("value", DataType::Int16, true)
])),
true
)
.into(),
true
)
));

// Fails if key type is different
assert!(!DFSchema::datatype_is_semantically_equal(
&map_field,
&DataType::Map(
Field::new(
"entries",
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Int16, false),
Field::new("value", DataType::Int8, true)
])),
true
)
.into(),
true
)
));

// Test structs

let struct_field = DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int8, true),
Field::new("b", DataType::Int8, true),
]));

// Succeeds if both have same names and datatypes, ignores nullability
assert!(DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int8, false),
Field::new("b", DataType::Int8, true),
]))
));

// Fails if field names are different
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("x", DataType::Int8, true),
Field::new("y", DataType::Int8, true),
]))
));

// Fails if types are different
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Int16, true),
Field::new("b", DataType::Int8, true),
]))
));

// Fails if more or less fields
assert!(!DFSchema::datatype_is_logically_equal(
&struct_field,
&DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int8, true),]))
));
}

#[test]
fn test_datatype_is_not_semantically_equivalent_to_dictionary() {
// Dictionary is not semantically equal to its value type
assert!(!DFSchema::datatype_is_semantically_equal(
&DataType::Utf8,
&DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8))
));
}

fn test_schema_2() -> Schema {
Schema::new(vec![
Field::new("c100", DataType::Boolean, true),
Expand Down