
feat: Implement Spark-compatible CAST between integer types #340

Merged (16 commits, May 3, 2024)
9 changes: 9 additions & 0 deletions core/src/errors.rs
@@ -72,6 +72,15 @@ pub enum CometError {
to_type: String,
},

#[error("[CAST_OVERFLOW] The value {value} of the type \"{from_type}\" cannot be cast to \"{to_type}\" \
due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead. If necessary \
set \"spark.sql.ansi.enabled\" to \"false\" to bypass this error.")]
CastOverFlow {
value: String,
from_type: String,
to_type: String,
},

#[error(transparent)]
Arrow {
#[from]
103 changes: 102 additions & 1 deletion core/src/execution/datafusion/expressions/cast.rs
@@ -28,7 +28,10 @@ use arrow::{
record_batch::RecordBatch,
util::display::FormatOptions,
};
use arrow_array::{Array, ArrayRef, BooleanArray, GenericStringArray, OffsetSizeTrait};
use arrow_array::{
types::{Int16Type, Int32Type, Int64Type, Int8Type},
Array, ArrayRef, BooleanArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
};
use arrow_schema::{DataType, Schema};
use datafusion::logical_expr::ColumnarValue;
use datafusion_common::{internal_err, Result as DataFusionResult, ScalarValue};
@@ -64,6 +67,62 @@ pub struct Cast {
pub timezone: String,
}

macro_rules! cast_int_to_int_macro {
(
$array: expr,
$eval_mode:expr,
$from_arrow_primitive_type: ty,
$to_arrow_primitive_type: ty,
$from_data_type: expr,
$to_native_type: ty,
$spark_from_data_type_name: expr,
$spark_to_data_type_name: expr
) => {{
let cast_array = $array
.as_any()
.downcast_ref::<PrimitiveArray<$from_arrow_primitive_type>>()
.unwrap();
let spark_int_literal_suffix = match $from_data_type {
&DataType::Int64 => "L",
&DataType::Int16 => "S",
&DataType::Int8 => "T",
_ => "",
};

let output_array = match $eval_mode {
EvalMode::Legacy => cast_array
.iter()
.map(|value| match value {
Some(value) => {
Ok::<Option<$to_native_type>, CometError>(Some(value as $to_native_type))
}
_ => Ok(None),
})
.collect::<Result<PrimitiveArray<$to_arrow_primitive_type>, _>>(),
_ => cast_array
.iter()
.map(|value| match value {
Some(value) => {
let res = <$to_native_type>::try_from(value);
if res.is_err() {
Err(CometError::CastOverFlow {
value: value.to_string() + spark_int_literal_suffix,
from_type: $spark_from_data_type_name.to_string(),
to_type: $spark_to_data_type_name.to_string(),
})
} else {
Ok::<Option<$to_native_type>, CometError>(Some(res.unwrap()))
}
}
_ => Ok(None),
})
.collect::<Result<PrimitiveArray<$to_arrow_primitive_type>, _>>(),
}?;
let result: CometResult<ArrayRef> = Ok(Arc::new(output_array) as ArrayRef);
result
}};
}
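The macro above takes one of two paths depending on eval_mode: in Legacy mode it narrows with Rust's `as` cast (matching Spark's non-ANSI truncating behavior), while in the other modes it uses `try_from` so an out-of-range value surfaces as an error. A minimal standalone sketch of that difference (plain Rust, not the PR's code):

```rust
fn main() {
    let v: i64 = i64::MAX; // 9223372036854775807

    // Legacy-mode path: `as` performs a bitwise truncation, so i64::MAX
    // wraps to -1 when narrowed to i32 (its low 32 bits are all ones).
    let legacy = v as i32;
    assert_eq!(legacy, -1);

    // ANSI-mode path: `try_from` reports the overflow instead of wrapping,
    // which the macro turns into CometError::CastOverFlow.
    assert!(i32::try_from(v).is_err());

    // In-range values convert identically on both paths.
    assert_eq!(42i64 as i32, 42);
    assert_eq!(i32::try_from(42i64), Ok(42));

    println!("legacy truncation of i64::MAX to i32: {legacy}");
}
```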

impl Cast {
pub fn new(
child: Arc<dyn PhysicalExpr>,
@@ -103,12 +162,54 @@ impl Cast {
(DataType::LargeUtf8, DataType::Boolean) => {
Self::spark_cast_utf8_to_boolean::<i64>(&array, self.eval_mode)?
}
(DataType::Int64, DataType::Int32)
| (DataType::Int64, DataType::Int16)
| (DataType::Int64, DataType::Int8)
| (DataType::Int32, DataType::Int16)
| (DataType::Int32, DataType::Int8)
| (DataType::Int16, DataType::Int8)
if self.eval_mode != EvalMode::Try =>
{
Self::spark_cast_int_to_int(&array, self.eval_mode, from_type, to_type)?
}
_ => cast_with_options(&array, to_type, &CAST_OPTIONS)?,
};
let result = spark_cast(cast_result, from_type, to_type);
Ok(result)
}

fn spark_cast_int_to_int(
array: &dyn Array,
eval_mode: EvalMode,
from_type: &DataType,
to_type: &DataType,
) -> CometResult<ArrayRef> {
match (from_type, to_type) {
(DataType::Int64, DataType::Int32) => cast_int_to_int_macro!(
array, eval_mode, Int64Type, Int32Type, from_type, i32, "BIGINT", "INT"
),
(DataType::Int64, DataType::Int16) => cast_int_to_int_macro!(
array, eval_mode, Int64Type, Int16Type, from_type, i16, "BIGINT", "SMALLINT"
),
(DataType::Int64, DataType::Int8) => cast_int_to_int_macro!(
array, eval_mode, Int64Type, Int8Type, from_type, i8, "BIGINT", "TINYINT"
),
(DataType::Int32, DataType::Int16) => cast_int_to_int_macro!(
array, eval_mode, Int32Type, Int16Type, from_type, i16, "INT", "SMALLINT"
),
(DataType::Int32, DataType::Int8) => cast_int_to_int_macro!(
array, eval_mode, Int32Type, Int8Type, from_type, i8, "INT", "TINYINT"
),
(DataType::Int16, DataType::Int8) => cast_int_to_int_macro!(
array, eval_mode, Int16Type, Int8Type, from_type, i8, "SMALLINT", "TINYINT"
),
_ => unreachable!(
"{}",
format!("invalid integer type {to_type} in cast from {from_type}")
),
}
}

fn spark_cast_utf8_to_boolean<OffsetSize>(
from: &dyn Array,
eval_mode: EvalMode,
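A note on the error text: the match on $from_data_type in the macro picks the Spark literal suffix ("L" for BIGINT, "S" for SMALLINT, "T" for TINYINT) so the Rust-side message renders the value the way Spark would. A hedged sketch of how such a message could be assembled (overflow_message is a hypothetical helper for illustration, not part of the PR):

```rust
// Hypothetical helper: formats an overflow message in the style of Spark's
// CAST_OVERFLOW error, appending the Spark literal suffix to the value.
fn overflow_message(value: i64, suffix: &str, from: &str, to: &str) -> String {
    format!(
        "[CAST_OVERFLOW] The value {value}{suffix} of the type \"{from}\" \
         cannot be cast to \"{to}\" due to an overflow."
    )
}

fn main() {
    let msg = overflow_message(i64::MAX, "L", "BIGINT", "INT");
    // The suffix makes the value render as a Spark BIGINT literal.
    assert!(msg.contains("9223372036854775807L"));
    println!("{msg}");
}
```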
27 changes: 26 additions & 1 deletion spark/src/test/scala/org/apache/comet/CometCastSuite.scala
@@ -574,6 +574,30 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
// TODO: implement
}

test("cast short to byte") {
castTest(generateShorts, DataTypes.ByteType)
}

[Review comment, Member]: These methods already exist in main but have different naming, so I think you need to upmerge/rebase against main. Example: ignore("cast ShortType to ByteType")

test("cast int to byte") {
castTest(generateInts, DataTypes.ByteType)
}

test("cast int to short") {
castTest(generateInts, DataTypes.ShortType)
}

test("cast long to byte") {
castTest(generateLongs, DataTypes.ByteType)
}

test("cast long to short") {
castTest(generateLongs, DataTypes.ShortType)
}

test("cast long to int") {
castTest(generateLongs, DataTypes.IntegerType)
}

private def generateFloats(): DataFrame = {
val r = new Random(0)
val values = Seq(
@@ -722,7 +746,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
// Comet message is in format `The value 'VALUE' of the type FROM_TYPE cannot be cast to TO_TYPE`
// We just check that the comet message contains the same invalid value as the Spark message
val sparkInvalidValue = sparkMessage.substring(sparkMessage.indexOf(':') + 2)
[Review comment, Member]: Now that we are handling multiple types of error here, we should probably check that sparkMessage.indexOf(':') returns a non-negative value before trying to use it.
assert(cometMessage.contains(sparkInvalidValue))
assert(
cometMessage.contains(sparkInvalidValue) || cometMessage.contains("overflow"))
[Review comment, Member]: If sparkInvalidValue is an empty string, won't cometMessage.contains(sparkInvalidValue) always be true?

[Reply, ganeshkumar269 (Contributor, Author), May 2, 2024]: You are right, my bad. So in case sparkMessage doesn't have ':', should I assert on just cometMessage.contains("overflow")? Something like this:

if (sparkMessage.indexOf(':') == -1) assert(cometMessage.contains("overflow"))
else assert(cometMessage.contains(sparkInvalidValue))

[Reply, Member]: Yes, something like that. I haven't reviewed the overflow messages to see if they contain ':' though (in any of the Spark versions 3.2, 3.3, and 3.4).

[Reply, ganeshkumar269 (Contributor, Author), May 3, 2024]: It doesn't look like the overflow error message has ':' in it; I ran spark.sql("select cast(9223372036854775807 as int)").show() locally on various Spark versions:

3.4 - [CAST_OVERFLOW] The value 9223372036854775807L of the type "BIGINT" cannot be cast to "INT" due to an overflow. Use try_cast to tolerate overflow and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error
3.3 - The value 9223372036854775807L of the type "BIGINT" cannot be cast to "INT" due to an overflow. Use try_cast to tolerate overflow and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error
3.2 - Casting 9223372036854775807 to int causes overflow
}
}
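The rule the reviewers converge on above is: when the Spark message contains no ':' it is the overflow shape, so only check the Comet message for "overflow"; otherwise compare the invalid value that follows the ':'. A small Rust sketch of that logic (messages_agree is a hypothetical name; the real suite implements this in Scala):

```rust
// Hypothetical sketch of the message-matching rule discussed above:
// if the Spark message has a ':', compare the invalid value that follows it;
// otherwise (the overflow messages have no ':') just look for "overflow".
fn messages_agree(spark_message: &str, comet_message: &str) -> bool {
    match spark_message.find(':') {
        Some(idx) => {
            // Value-style Spark messages end in "...: VALUE", so skip ": ".
            let invalid_value = spark_message.get(idx + 2..).unwrap_or("");
            comet_message.contains(invalid_value)
        }
        None => comet_message.contains("overflow"),
    }
}

fn main() {
    // Spark 3.2-style overflow message: no ':', fall back to "overflow".
    assert!(messages_agree(
        "Casting 9223372036854775807 to int causes overflow",
        "[CAST_OVERFLOW] The value 9223372036854775807L ... due to an overflow",
    ));
    // Value-style message: the value after ": " must appear in both.
    assert!(messages_agree("invalid input: 123abc", "The value '123abc' ..."));
    println!("message checks agree");
}
```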
