From 4ac7de17ca4a427872a1b0988a53444e30380222 Mon Sep 17 00:00:00 2001
From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com>
Date: Thu, 25 Jan 2024 13:16:01 +0300
Subject: [PATCH 01/27] Fix optimize projections bug (#8960)

* Fix optimize projections bug

* Add new dataframe test
---
 datafusion/core/tests/dataframe/mod.rs        | 63 ++++++++++++-
 .../optimizer/src/optimize_projections.rs     | 27 +++++-
 datafusion/sqllogictest/test_files/window.slt | 88 +++++++++++--------
 3 files changed, 137 insertions(+), 41 deletions(-)

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 588b4647e5c1..89ab04dfee89 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -34,18 +34,19 @@ use std::sync::Arc;
 use datafusion::dataframe::DataFrame;
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
-use datafusion::execution::context::SessionContext;
+use datafusion::execution::context::{SessionContext, SessionState};
 use datafusion::prelude::JoinType;
 use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
 use datafusion::test_util::parquet_test_data;
 use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
 use datafusion_common::{assert_contains, DataFusionError, ScalarValue, UnnestOptions};
 use datafusion_execution::config::SessionConfig;
+use datafusion_execution::runtime_env::RuntimeEnv;
 use datafusion_expr::expr::{GroupingSet, Sort};
 use datafusion_expr::{
-    array_agg, avg, col, count, exists, expr, in_subquery, lit, max, out_ref_col,
-    scalar_subquery, sum, wildcard, AggregateFunction, Expr, ExprSchemable, WindowFrame,
-    WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
+    array_agg, avg, cast, col, count, exists, expr, in_subquery, lit, max, out_ref_col,
+    scalar_subquery, sum, when, wildcard, AggregateFunction, Expr, ExprSchemable,
+    WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
 };
 use datafusion_physical_expr::var_provider::{VarProvider, VarType};
 
@@ -1430,6 +1431,60 @@ async fn unnest_analyze_metrics() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn consecutive_projection_same_schema() -> Result<()> {
+    let config = SessionConfig::new();
+    let runtime = Arc::new(RuntimeEnv::default());
+    let state = SessionState::new_with_config_rt(config, runtime);
+    let ctx = SessionContext::new_with_state(state);
+
+    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
+
+    let batch =
+        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![0, 1]))])
+            .unwrap();
+
+    let df = ctx.read_batch(batch).unwrap();
+    df.clone().show().await.unwrap();
+
+    // Add `t` column full of nulls
+    let df = df
+        .with_column("t", cast(Expr::Literal(ScalarValue::Null), DataType::Int32))
+        .unwrap();
+    df.clone().show().await.unwrap();
+
+    let df = df
+        // (case when id = 1 then 10 else t) as t
+        .with_column(
+            "t",
+            when(col("id").eq(lit(1)), lit(10))
+                .otherwise(col("t"))
+                .unwrap(),
+        )
+        .unwrap()
+        // (case when id = 1 then 10 else t) as t2
+        .with_column(
+            "t2",
+            when(col("id").eq(lit(1)), lit(10))
+                .otherwise(col("t"))
+                .unwrap(),
+        )
+        .unwrap();
+
+    let results = df.collect().await?;
+    let expected = [
+        "+----+----+----+",
+        "| id | t  | t2 |",
+        "+----+----+----+",
+        "| 0  |    |    |",
+        "| 1  | 10 | 10 |",
+        "+----+----+----+",
+    ];
+    assert_batches_sorted_eq!(expected, &results);
+
+    Ok(())
+}
+
 async fn create_test_table(name: &str) -> Result<DataFrame> {
     let schema = Arc::new(Schema::new(vec![
         Field::new("a", DataType::Utf8, false),
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index ab0cb0a26551..f87f5fdea99f 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -867,7 +867,9 @@ fn rewrite_projection_given_requirements(
     return if let Some(input) =
         optimize_projections(&proj.input, config, &required_indices)?
     {
-        if &projection_schema(&input, &exprs_used)? == input.schema() {
+        if &projection_schema(&input, &exprs_used)? == input.schema()
+            && exprs_used.iter().all(is_expr_trivial)
+        {
             Ok(Some(input))
         } else {
             Projection::try_new(exprs_used, Arc::new(input))
@@ -899,7 +901,7 @@ mod tests {
     use datafusion_common::{Result, TableReference};
     use datafusion_expr::{
         binary_expr, col, count, lit, logical_plan::builder::LogicalPlanBuilder, not,
-        table_scan, try_cast, Expr, Like, LogicalPlan, Operator,
+        table_scan, try_cast, when, Expr, Like, LogicalPlan, Operator,
     };
 
     fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> {
@@ -1163,4 +1165,25 @@ mod tests {
         \n  TableScan: test projection=[a]";
         assert_optimized_plan_equal(&plan, expected)
     }
+
+    // Test outer projection isn't discarded despite the same schema as inner
+    // https://github.com/apache/arrow-datafusion/issues/8942
+    #[test]
+    fn test_derived_column() -> Result<()> {
+        let table_scan = test_table_scan()?;
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .project(vec![col("a"), lit(0).alias("d")])?
+            .project(vec![
+                col("a"),
+                when(col("a").eq(lit(1)), lit(10))
+                    .otherwise(col("d"))?
+                    .alias("d"),
+            ])?
+            .build()?;
+
+        let expected = "Projection: test.a, CASE WHEN test.a = Int32(1) THEN Int32(10) ELSE d END AS d\
+        \n  Projection: test.a, Int32(0) AS d\
+        \n    TableScan: test projection=[a]";
+        assert_optimized_plan_equal(&plan, expected)
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index f6d8a1ce8fff..9b46dfb3398a 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -2947,25 +2947,34 @@ logical_plan
 Projection: annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum1, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING AS sum2, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum3, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING AS sum4, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum5, SUM(annotated_data_infinite2.c) PARTITION BY
[annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING AS sum6, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum7, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING AS sum8, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum9, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW AS sum10, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum11, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING AS sum12 --Limit: skip=0, fetch=5 ----WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING]] -------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING]] ---------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] -----------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) 
PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING]] -------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW]] ---------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] -----------------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d -------------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] +------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, 
annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING +--------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING]] +----------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING +------------WindowAggr: 
windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] +--------------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING +----------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING]] +------------------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, 
annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW +--------------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW]] +----------------------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING +------------------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c AS annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] +--------------------------Projection: CAST(annotated_data_infinite2.c AS Int64) AS CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.c, annotated_data_infinite2.d +----------------------------TableScan: annotated_data_infinite2 projection=[a, b, c, d] physical_plan ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as sum2, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, 
annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] --GlobalLimitExec: skip=0, fetch=5 ----BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.a ASC NULLS LAST, annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Preceding(UInt64(1)) }], mode=[Linear] -------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, 
annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)) }], mode=[PartiallySorted([1, 0])] ---------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] -----------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION 
BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)) }], mode=[PartiallySorted([0])] -------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow }], mode=[PartiallySorted([0, 1])] ---------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] -----------------ProjectionExec: expr=[CAST(c@2 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@0 as a, b@1 as b, c@2 as c, d@3 as d] -------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] - +------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@1 as a, b@2 
as b, c@3 as c, d@4 as d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY 
[annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING] +--------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)) }], mode=[PartiallySorted([1, 0])] +----------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, 
annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING] +------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.b, annotated_data_infinite2.a] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: 
Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] +--------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING] +----------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: 
WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.b ASC NULLS LAST, annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)) }], mode=[PartiallySorted([0])] +------------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW] +--------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY 
[annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b, annotated_data_infinite2.d] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow }], mode=[PartiallySorted([0, 1])] +----------------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING] +------------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_infinite2.c) PARTITION BY [annotated_data_infinite2.a, annotated_data_infinite2.b] ORDER BY [annotated_data_infinite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] +--------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as CAST(annotated_data_infinite2.c AS Int64)annotated_data_infinite2.c, a@0 as a, b@1 as b, c@2 as c, d@3 as d] +----------------------------StreamingTableExec: partition_sizes=1, projection=[a, b, c, d], infinite_source=true, output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST] query IIIIIIIIIIIIIII SELECT a, b, c, @@ -3017,31 +3026,40 @@ Limit: skip=0, fetch=5 --Sort: annotated_data_finite2.c ASC NULLS LAST, fetch=5 ----Projection: annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, 
annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum1, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING AS sum2, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum3, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING AS sum4, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum5, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING AS sum6, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum7, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING AS sum8, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum9, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW AS sum10, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING AS sum11, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING AS sum12 ------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING]] ---------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS 
Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING]] -----------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] -------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING]] ---------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW]] -----------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] -------------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d ---------------------TableScan: annotated_data_finite2 projection=[a, b, c, d] +--------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) 
PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING +----------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING]] +------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, 
SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING +--------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] +----------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING +------------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING]] +--------------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d, 
SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW +----------------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW]] +------------------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING +--------------------------WindowAggr: windowExpr=[[SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c AS annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING]] +----------------------------Projection: CAST(annotated_data_finite2.c AS Int64) AS CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.c, annotated_data_finite2.d +------------------------------TableScan: annotated_data_finite2 projection=[a, b, c, d] physical_plan GlobalLimitExec: skip=0, fetch=5 --SortExec: TopK(fetch=5), expr=[c@2 ASC NULLS LAST] ----ProjectionExec: expr=[a@1 as a, b@2 as b, c@3 as c, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as sum1, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 
FOLLOWING AND 5 FOLLOWING@10 as sum2, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@15 as sum3, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING@16 as sum4, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as sum5, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as sum6, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as sum7, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as sum8, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as sum9, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as sum10, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as sum11, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as sum12] ------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.d] ORDER BY [annotated_data_finite2.a ASC NULLS LAST, annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING", data_type: Int64, nullable: true, dict_id: 0, 
dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Preceding(UInt64(1)) }], mode=[Sorted] --------SortExec: expr=[d@4 ASC NULLS LAST,a@1 ASC NULLS LAST,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] -----------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)) }], mode=[Sorted] -------------SortExec: expr=[b@2 ASC NULLS LAST,a@1 ASC NULLS LAST,d@4 ASC NULLS LAST,c@3 ASC NULLS LAST] ---------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] -----------------SortExec: expr=[b@2 ASC NULLS LAST,a@1 ASC NULLS LAST,c@3 ASC NULLS LAST] -------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: 
Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)) }], mode=[Sorted] ---------------------SortExec: expr=[a@1 ASC NULLS LAST,d@4 ASC NULLS LAST,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] -----------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow }], mode=[Sorted] -------------------------SortExec: expr=[a@1 ASC NULLS LAST,b@2 ASC NULLS LAST,d@4 ASC NULLS LAST,c@3 ASC NULLS LAST] ---------------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} 
}), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] -----------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@0 as a, b@1 as b, c@2 as c, d@3 as d] -------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true - +----------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, 
annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@13 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING@14 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING] +------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: CurrentRow, end_bound: Following(UInt64(1)) }], mode=[Sorted] +--------------SortExec: expr=[b@2 ASC NULLS LAST,a@1 ASC NULLS LAST,d@4 ASC NULLS LAST,c@3 ASC NULLS LAST] +----------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as 
SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@11 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@12 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING] +------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 
PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.b, annotated_data_finite2.a] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] +--------------------SortExec: expr=[b@2 ASC NULLS LAST,a@1 ASC NULLS LAST,c@3 ASC NULLS LAST] +----------------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@9 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING@10 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING] +------------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: 
Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.b ASC NULLS LAST, annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 1 FOLLOWING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Following(UInt64(1)), end_bound: Following(UInt64(5)) }], mode=[Sorted] +--------------------------SortExec: expr=[a@1 ASC NULLS LAST,d@4 ASC NULLS LAST,b@2 ASC NULLS LAST,c@3 ASC NULLS LAST] +----------------------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@7 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW@8 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW] +------------------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 
FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: CurrentRow }], mode=[Sorted] +--------------------------------SortExec: expr=[a@1 ASC NULLS LAST,b@2 ASC NULLS LAST,d@4 ASC NULLS LAST,c@3 ASC NULLS LAST] +----------------------------------ProjectionExec: expr=[CAST(c@3 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@1 as a, b@2 as b, c@3 as c, d@4 as d, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING@5 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING@6 as SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING] +------------------------------------BoundedWindowAggExec: wdw=[SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(2)), end_bound: Following(UInt64(1)) }, SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Ok(Field { name: "SUM(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(5)), end_bound: Following(UInt64(5)) }], mode=[Sorted] +--------------------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as CAST(annotated_data_finite2.c AS Int64)annotated_data_finite2.c, a@0 as a, b@1 as b, c@2 as c, d@3 as d] +----------------------------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS 
LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], has_header=true query IIIIIIIIIIIIIII SELECT a, b, c, @@ -3971,4 +3989,4 @@ Int64 query T select arrow_typeof(nth_value(a, 1) over ()) from (select 1 a) ---- -Int64 \ No newline at end of file +Int64 From 4a3986aca9826a2e15ec0ea2c22e2a4311bc7c56 Mon Sep 17 00:00:00 2001 From: junxiangMu <63799833+guojidan@users.noreply.github.com> Date: Thu, 25 Jan 2024 20:00:25 +0800 Subject: [PATCH 02/27] NOT operator not return internal error when args are not boolean value (#8982) * optimize NOT Expr logic * fix Null type * fix test case * fmt --- datafusion/optimizer/src/analyzer/type_coercion.rs | 6 +++++- datafusion/physical-expr/src/expressions/not.rs | 11 +---------- datafusion/sqllogictest/test_files/scalar.slt | 4 ++-- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 8c4e907e6734..c0dad2ef4006 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -44,7 +44,7 @@ use datafusion_expr::type_coercion::other::{ use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ - is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, + is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not, type_coercion, AggregateFunction, BuiltinScalarFunction, Expr, ExprSchemable, LogicalPlan, Operator, Projection, ScalarFunctionDefinition, Signature, WindowFrame, WindowFrameBound, WindowFrameUnits, @@ -176,6 +176,10 @@ impl TreeNodeRewriter for TypeCoercionRewriter { negated, ))) } + Expr::Not(expr) => { + let expr = not(get_casted_expr_for_bool_op(&expr, &self.schema)?); + Ok(expr) + } Expr::IsTrue(expr) => { let expr = is_true(get_casted_expr_for_bool_op(&expr, &self.schema)?); Ok(expr) diff --git a/datafusion/physical-expr/src/expressions/not.rs b/datafusion/physical-expr/src/expressions/not.rs index 4ceccc6932fe..f17df73e3070 100644 --- a/datafusion/physical-expr/src/expressions/not.rs +++ b/datafusion/physical-expr/src/expressions/not.rs @@ -26,9 +26,7 @@ use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; -use datafusion_common::{ - cast::as_boolean_array, internal_err, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{cast::as_boolean_array, Result, ScalarValue}; use datafusion_expr::ColumnarValue; /// Not expression @@ -83,13 +81,6 @@ impl PhysicalExpr for NotExpr { if scalar.is_null() { return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); } - let value_type = scalar.data_type(); - if value_type != DataType::Boolean { - return internal_err!( - "NOT '{:?}' can't be evaluated because the expression's type is {:?}, not boolean or NULL", - self.arg, value_type - ); - } let bool_value: bool = scalar.try_into()?; Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some( !bool_value, diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 3e8ebe54c09c..5b3ecab5fd76 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1527,7 +1527,7 @@ SELECT not(true), not(false) ---- false true -query error +query error type_coercion\ncaused by\nError during planning: Cannot infer common argument type for comparison operation Int64 IS DISTINCT FROM 
Boolean SELECT not(1), not(0) query ?B @@ -1535,7 +1535,7 @@ SELECT null, not(null) ---- NULL NULL -query error +query error type_coercion\ncaused by\nError during planning: Cannot infer common argument type for comparison operation Utf8 IS DISTINCT FROM Boolean SELECT NOT('hi') # test_negative_expressions() From 928162fd8536bad6657ac6241b22b1a2bac8621d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jan 2024 07:15:29 -0500 Subject: [PATCH 03/27] Minor: Add new Extended ClickBench benchmark queries (#8950) --- benchmarks/queries/clickbench/README.md | 177 +++++++++++++++++++-- benchmarks/queries/clickbench/extended.sql | 4 +- 2 files changed, 170 insertions(+), 11 deletions(-) diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index d5105afd4832..e03b7d519d91 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -11,23 +11,180 @@ ClickBench is focused on aggregation and filtering performance (though it has no [ClickBench repository]: https://github.com/ClickHouse/ClickBench/blob/main/datafusion/queries.sql ## "Extended" Queries -The "extended" queries are not part of the official ClickBench benchmark. -Instead they are used to test other DataFusion features that are not -covered by the standard benchmark -Each description below is for the corresponding line in `extended.sql` (line 1 -is `Q0`, line 2 is `Q1`, etc.) +The "extended" queries are not part of the official ClickBench benchmark. +Instead they are used to test other DataFusion features that are not covered by +the standard benchmark Each description below is for the corresponding line in +`extended.sql` (line 1 is `Q0`, line 2 is `Q1`, etc.) + +### Q0: Data Exploration + +**Question**: "How many distinct searches, mobile phones, and mobile phone models are there in the dataset?" + +**Important Query Properties**: multiple `COUNT DISTINCT`s, with low and high cardinality +distinct string columns. + +```sql +SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") +FROM hits; +``` + + +### Q1: Data Exploration + +**Question**: "How many distinct "hit color", "browser country" and "language" are there in the dataset?" + +**Important Query Properties**: multiple `COUNT DISTINCT`s. All three are small strings (length either 1 or 2). -### Q0 -Models initial Data exploration, to understand some statistics of data. -Import Query Properties: multiple `COUNT DISTINCT` on strings ```sql -SELECT - COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") +SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits; ``` +### Q2: Top 10 anaylsis +**Question**: "Find the top 10 "browser country" by number of distinct "social network"s, +including the distinct counts of "hit color", "browser language", +and "social action"." +**Important Query Properties**: GROUP BY short, string, multiple `COUNT DISTINCT`s. There are several small strings (length either 1 or 2). 
+```sql +SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") +FROM hits +GROUP BY 1 +ORDER BY 2 DESC +LIMIT 10; +``` + + +## Data Notes + +Here are some interesting statistics about the data used in the queries +Max length of `"SearchPhrase"` is 1113 characters +```sql +❯ select min(length("SearchPhrase")) as "SearchPhrase_len_min", max(length("SearchPhrase")) "SearchPhrase_len_max" from 'hits.parquet' limit 10; ++----------------------+----------------------+ +| SearchPhrase_len_min | SearchPhrase_len_max | ++----------------------+----------------------+ +| 0 | 1113 | ++----------------------+----------------------+ +``` + + +Here is the schema of the data +```sql +❯ describe 'hits.parquet'; ++-----------------------+-----------+-------------+ +| column_name | data_type | is_nullable | ++-----------------------+-----------+-------------+ +| WatchID | Int64 | NO | +| JavaEnable | Int16 | NO | +| Title | Utf8 | NO | +| GoodEvent | Int16 | NO | +| EventTime | Int64 | NO | +| EventDate | UInt16 | NO | +| CounterID | Int32 | NO | +| ClientIP | Int32 | NO | +| RegionID | Int32 | NO | +| UserID | Int64 | NO | +| CounterClass | Int16 | NO | +| OS | Int16 | NO | +| UserAgent | Int16 | NO | +| URL | Utf8 | NO | +| Referer | Utf8 | NO | +| IsRefresh | Int16 | NO | +| RefererCategoryID | Int16 | NO | +| RefererRegionID | Int32 | NO | +| URLCategoryID | Int16 | NO | +| URLRegionID | Int32 | NO | +| ResolutionWidth | Int16 | NO | +| ResolutionHeight | Int16 | NO | +| ResolutionDepth | Int16 | NO | +| FlashMajor | Int16 | NO | +| FlashMinor | Int16 | NO | +| FlashMinor2 | Utf8 | NO | +| NetMajor | Int16 | NO | +| NetMinor | Int16 | NO | +| UserAgentMajor | Int16 | NO | +| UserAgentMinor | Utf8 | NO | +| CookieEnable | Int16 | NO | +| JavascriptEnable | Int16 | NO | +| IsMobile | Int16 | NO | +| MobilePhone | Int16 | NO | +| MobilePhoneModel | Utf8 | NO | +| Params | Utf8 | NO | +| IPNetworkID | Int32 | NO | +| TraficSourceID | Int16 | NO | +| SearchEngineID | Int16 | NO | +| SearchPhrase | Utf8 | NO | +| AdvEngineID | Int16 | NO | +| IsArtifical | Int16 | NO | +| WindowClientWidth | Int16 | NO | +| WindowClientHeight | Int16 | NO | +| ClientTimeZone | Int16 | NO | +| ClientEventTime | Int64 | NO | +| SilverlightVersion1 | Int16 | NO | +| SilverlightVersion2 | Int16 | NO | +| SilverlightVersion3 | Int32 | NO | +| SilverlightVersion4 | Int16 | NO | +| PageCharset | Utf8 | NO | +| CodeVersion | Int32 | NO | +| IsLink | Int16 | NO | +| IsDownload | Int16 | NO | +| IsNotBounce | Int16 | NO | +| FUniqID | Int64 | NO | +| OriginalURL | Utf8 | NO | +| HID | Int32 | NO | +| IsOldCounter | Int16 | NO | +| IsEvent | Int16 | NO | +| IsParameter | Int16 | NO | +| DontCountHits | Int16 | NO | +| WithHash | Int16 | NO | +| HitColor | Utf8 | NO | +| LocalEventTime | Int64 | NO | +| Age | Int16 | NO | +| Sex | Int16 | NO | +| Income | Int16 | NO | +| Interests | Int16 | NO | +| Robotness | Int16 | NO | +| RemoteIP | Int32 | NO | +| WindowName | Int32 | NO | +| OpenerName | Int32 | NO | +| HistoryLength | Int16 | NO | +| BrowserLanguage | Utf8 | NO | +| BrowserCountry | Utf8 | NO | +| SocialNetwork | Utf8 | NO | +| SocialAction | Utf8 | NO | +| HTTPError | Int16 | NO | +| SendTiming | Int32 | NO | +| DNSTiming | Int32 | NO | +| ConnectTiming | Int32 | NO | +| ResponseStartTiming | Int32 | NO | +| ResponseEndTiming | Int32 | NO | +| FetchTiming | Int32 | NO | +| SocialSourceNetworkID | Int16 | NO | +| SocialSourcePage | 
Utf8 | NO | +| ParamPrice | Int64 | NO | +| ParamOrderID | Utf8 | NO | +| ParamCurrency | Utf8 | NO | +| ParamCurrencyID | Int16 | NO | +| OpenstatServiceName | Utf8 | NO | +| OpenstatCampaignID | Utf8 | NO | +| OpenstatAdID | Utf8 | NO | +| OpenstatSourceID | Utf8 | NO | +| UTMSource | Utf8 | NO | +| UTMMedium | Utf8 | NO | +| UTMCampaign | Utf8 | NO | +| UTMContent | Utf8 | NO | +| UTMTerm | Utf8 | NO | +| FromTag | Utf8 | NO | +| HasGCLID | Int16 | NO | +| RefererHash | Int64 | NO | +| URLHash | Int64 | NO | +| CLID | Int32 | NO | ++-----------------------+-----------+-------------+ +105 rows in set. Query took 0.034 seconds. + +``` diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql index 82c0266af61a..0a2999fceb49 100644 --- a/benchmarks/queries/clickbench/extended.sql +++ b/benchmarks/queries/clickbench/extended.sql @@ -1 +1,3 @@ -SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits; \ No newline at end of file +SELECT COUNT(DISTINCT "SearchPhrase"), COUNT(DISTINCT "MobilePhone"), COUNT(DISTINCT "MobilePhoneModel") FROM hits; +SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTINCT "BrowserLanguage") FROM hits; +SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; \ No newline at end of file From 80a42bf7f61629e097ca274396917337e91ca17a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jan 2024 07:22:06 -0500 Subject: [PATCH 04/27] Minor: Add comments to MSRV CI check to help if it fails (#8995) --- .github/workflows/rust.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 375c9f2c2c5a..d384e4bc7ebf 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -488,7 +488,7 @@ jobs: # Verify MSRV for the crates which are directly used by other projects. msrv: - name: Verify MSRV + name: Verify MSRV (Min Supported Rust Version) runs-on: ubuntu-latest container: image: amd64/rust @@ -500,7 +500,13 @@ jobs: run: cargo install cargo-msrv - name: Check datafusion working-directory: datafusion/core - run: cargo msrv verify + run: | + # If you encounter an error with any of the commands below + # it means some crate in your dependency tree has a higher + # MSRV (Min Supported Rust Version) than the one specified + # in the `rust-version` key of `Cargo.toml`. Check your + # dependencies or update the version in `Cargo.toml` + cargo msrv verify - name: Check datafusion-substrait working-directory: datafusion/substrait run: cargo msrv verify From 7a0af5be2323443faa75cc5876651a72c3253af8 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jan 2024 07:22:34 -0500 Subject: [PATCH 05/27] Minor: Document memory management design on MemoryPool (#8966) --- datafusion/execution/src/memory_pool/mod.rs | 68 +++++++++++++++------ 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs index 55555014f2ef..58ed1ebff04c 100644 --- a/datafusion/execution/src/memory_pool/mod.rs +++ b/datafusion/execution/src/memory_pool/mod.rs @@ -25,30 +25,60 @@ pub mod proxy; pub use pool::*; -/// The pool of memory on which [`MemoryReservation`]s record their -/// memory reservations. 
+/// Tracks and potentially limits memory use across operators during execution. /// -/// DataFusion is a streaming query engine, processing most queries -/// without buffering the entire input. However, certain operations -/// such as sorting and grouping/joining with a large number of -/// distinct groups/keys, can require buffering intermediate results -/// and for large datasets this can require large amounts of memory. +/// # Memory Management Overview /// -/// In order to avoid allocating memory until the OS or the container -/// system kills the process, DataFusion operators only allocate -/// memory they are able to reserve from the configured -/// [`MemoryPool`]. Once the memory tracked by the pool is exhausted, -/// operators must either free memory by spilling to local disk or -/// error. +/// DataFusion is a streaming query engine, processing most queries without +/// buffering the entire input. Most operators require a fixed amount of memory +/// based on the schema and target batch size. However, certain operations such +/// as sorting and grouping/joining, require buffering intermediate results, +/// which can require memory proportional to the number of input rows. /// -/// A `MemoryPool` can be shared by concurrently executing plans in -/// the same process to control memory usage in a multi-tenant system. +/// Rather than tracking all allocations, DataFusion takes a pragmatic approach: +/// Intermediate memory used as data streams through the system is not accounted +/// (it is assumed to be "small") but the large consumers of memory must register +/// and constrain their use. This design trades off the additional code +/// complexity of memory tracking with limiting resource usage. /// -/// The following memory pool implementations are available: +/// When limiting memory with a `MemoryPool` you should typically reserve some +/// overhead (e.g. 10%) for the "small" memory allocations that are not tracked. /// -/// * [`UnboundedMemoryPool`] -/// * [`GreedyMemoryPool`] -/// * [`FairSpillPool`] +/// +/// # Memory Management Design +/// +/// As explained above, DataFusion's design ONLY limits operators that require +/// "large" amounts of memory (proportional to number of input rows), such as +/// `GroupByHashExec`. It does NOT track and limit memory used internally by +/// other operators such as `ParquetExec` or the `RecordBatch`es that flow +/// between operators. +/// +/// In order to avoid allocating memory until the OS or the container system +/// kills the process, DataFusion `ExecutionPlan`s (operators) that consume +/// large amounts of memory must first request their desired allocation from a +/// [`MemoryPool`] before allocating more. The request is typically managed via +/// a [`MemoryReservation`]. +/// +/// If the allocation is successful, the operator should proceed and allocate +/// the desired memory. If the allocation fails, the operator must either first +/// free memory (e.g. by spilling to local disk) and try again, or error. +/// +/// Note that a `MemoryPool` can be shared by concurrently executing plans, +/// which can be used to control memory usage in a multi-tenant system. +/// +/// # Implementing `MemoryPool` +/// +/// You can implement a custom allocation policy by implementing the +/// [`MemoryPool`] trait and configuring a `SessionContext` appropriately.
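+///
+/// For example, here is a simplified, illustrative sketch (using the
+/// [`GreedyMemoryPool`], [`MemoryConsumer`] and [`MemoryReservation`] types
+/// exported from this module; the consumer name "ExampleSort" and the sizes
+/// are made up) of how a large consumer reserves memory before allocating it:
+///
+/// ```ignore
+/// use std::sync::Arc;
+/// use datafusion_execution::memory_pool::{GreedyMemoryPool, MemoryConsumer, MemoryPool};
+///
+/// // A pool that hands out at most 64 MB in total, first come first served
+/// let pool: Arc<dyn MemoryPool> = Arc::new(GreedyMemoryPool::new(64 * 1024 * 1024));
+///
+/// // A memory intensive operator registers itself and grows its reservation
+/// // *before* allocating; if the request fails it must spill to disk or error
+/// let mut reservation = MemoryConsumer::new("ExampleSort").register(&pool);
+/// reservation.try_grow(1024).expect("memory pool exhausted");
+/// reservation.shrink(512); // hand back memory that is no longer needed
+/// ```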
+/// However, DataFusion comes with the following simple memory pool implementations that +/// handle many common cases: +/// +/// * [`UnboundedMemoryPool`]: no memory limits (the default) +/// +/// * [`GreedyMemoryPool`]: Limits memory usage to a fixed size using a "first +/// come first served" policy +/// +/// * [`FairSpillPool`]: Limits memory usage to a fixed size, allocating memory +/// to all spilling operators fairly pub trait MemoryPool: Send + Sync + std::fmt::Debug { /// Registers a new [`MemoryConsumer`] /// From 5e9c9a1f7cecabe6e6c40c8296adb517fac0da13 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 25 Jan 2024 09:03:32 -0800 Subject: [PATCH 06/27] Fix LEAD/LAG window functions when default value null (#8989) --- .../physical-expr/src/window/lead_lag.rs | 21 +++++++++++-------- datafusion/sqllogictest/test_files/window.slt | 14 +++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-expr/src/window/lead_lag.rs b/datafusion/physical-expr/src/window/lead_lag.rs index 054a4c13e6b6..d8072be83950 100644 --- a/datafusion/physical-expr/src/window/lead_lag.rs +++ b/datafusion/physical-expr/src/window/lead_lag.rs @@ -23,8 +23,9 @@ use crate::PhysicalExpr; use arrow::array::ArrayRef; use arrow::compute::cast; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{arrow_datafusion_err, ScalarValue}; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{ + arrow_datafusion_err, exec_err, DataFusionError, Result, ScalarValue, +}; use datafusion_expr::PartitionEvaluator; use std::any::Any; use std::cmp::min; @@ -236,14 +237,16 @@ fn get_default_value( default_value: Option<&ScalarValue>, dtype: &DataType, ) -> Result { - if let Some(value) = default_value { - if let ScalarValue::Int64(Some(val)) = value { - ScalarValue::try_from_string(val.to_string(), dtype) - } else { - internal_err!("Expects default value to have Int64 type") + match default_value { + Some(v) if v.data_type() == DataType::Int64 => { + ScalarValue::try_from_string(v.to_string(), dtype) + } + Some(v) if !v.data_type().is_null() => exec_err!( + "Unexpected datatype for default value: {}. 
Expected: Int64", + v.data_type() + ), + // If None or Null datatype + _ => Ok(ScalarValue::try_from(dtype)?), } } diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 9b46dfb3398a..303e8e035e7c 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3990,3 +3990,17 @@ query T select arrow_typeof(nth_value(a, 1) over ()) from (select 1 a) ---- Int64 + +# test LEAD window function works NULL as default value +query I +select lead(a, 1, null) over (order by a) from (select 1 a union all select 2 a) +---- +2 +NULL + +# test LAG window function works NULL as default value +query I +select lag(a, 1, null) over (order by a) from (select 1 a union all select 2 a) +---- +NULL +1 From eb6d63fb939b0ea01b6404ca5e44e50ee83e2dbc Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 25 Jan 2024 10:34:44 -0800 Subject: [PATCH 07/27] Optimize MIN/MAX when relation is empty (#8940) * Optimize MIN/MAX when relation is empty * Fix clippy --- .../aggregate_statistics.rs | 98 ++++++++++++------- .../sqllogictest/test_files/aggregate.slt | 20 ++++ 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs index 0a53c775aa89..4fe11c14a758 100644 --- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs @@ -198,30 +198,45 @@ fn take_optimizable_min( stats: &Statistics, ) -> Option<(ScalarValue, String)> { if let Precision::Exact(num_rows) = &stats.num_rows { - if *num_rows > 0 { - let col_stats = &stats.column_statistics; - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { - if casted_expr.expressions().len() == 1 { - // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + match *num_rows { + 0 => { + // MIN/MAX with 0 rows is always null + if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() + { + if let Ok(min_data_type) = + ScalarValue::try_from(casted_expr.field().unwrap().data_type()) { - if let Precision::Exact(val) = - &col_stats[col_expr.index()].min_value + return Some((min_data_type, casted_expr.name().to_string())); + } + } + } + value if value > 0 => { + let col_stats = &stats.column_statistics; + if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() + { + if casted_expr.expressions().len() == 1 { + // TODO optimize with exprs other than Column + if let Some(col_expr) = casted_expr.expressions()[0] + .as_any() + .downcast_ref::() { - if !val.is_null() { - return Some(( - val.clone(), - casted_expr.name().to_string(), - )); + if let Precision::Exact(val) = + &col_stats[col_expr.index()].min_value + { + if !val.is_null() { + return Some(( + val.clone(), + casted_expr.name().to_string(), + )); + } } } } } } + _ => {} } } None @@ -233,30 +248,45 @@ fn take_optimizable_max( stats: &Statistics, ) -> Option<(ScalarValue, String)> { if let Precision::Exact(num_rows) = &stats.num_rows { - if *num_rows > 0 { - let col_stats = &stats.column_statistics; - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { - if casted_expr.expressions().len() == 1 { - // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + match *num_rows { + 0 => { + // MIN/MAX with 0 rows is always null + 
if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() + { + if let Ok(max_data_type) = + ScalarValue::try_from(casted_expr.field().unwrap().data_type()) { - if let Precision::Exact(val) = - &col_stats[col_expr.index()].max_value + return Some((max_data_type, casted_expr.name().to_string())); + } + } + } + value if value > 0 => { + let col_stats = &stats.column_statistics; + if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() + { + if casted_expr.expressions().len() == 1 { + // TODO optimize with exprs other than Column + if let Some(col_expr) = casted_expr.expressions()[0] + .as_any() + .downcast_ref::() { - if !val.is_null() { - return Some(( - val.clone(), - casted_expr.name().to_string(), - )); + if let Precision::Exact(val) = + &col_stats[col_expr.index()].max_value + { + if !val.is_null() { + return Some(( + val.clone(), + casted_expr.name().to_string(), + )); + } } } } } } + _ => {} } } None diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index e9c92f53e0fa..5cd728c4344b 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3084,6 +3084,26 @@ SELECT MAX(col0) FROM empty WHERE col0=1; ---- NULL +query TT +EXPLAIN SELECT MIN(col0) FROM empty; +---- +logical_plan +Aggregate: groupBy=[[]], aggr=[[MIN(empty.col0)]] +--TableScan: empty projection=[col0] +physical_plan +ProjectionExec: expr=[NULL as MIN(empty.col0)] +--PlaceholderRowExec + +query TT +EXPLAIN SELECT MAX(col0) FROM empty; +---- +logical_plan +Aggregate: groupBy=[[]], aggr=[[MAX(empty.col0)]] +--TableScan: empty projection=[col0] +physical_plan +ProjectionExec: expr=[NULL as MAX(empty.col0)] +--PlaceholderRowExec + statement ok DROP TABLE empty; From b97daf7fc834920f4a312670da9d5560e2facbe7 Mon Sep 17 00:00:00 2001 From: "Reilly.tang" Date: Fri, 26 Jan 2024 03:48:31 +0800 Subject: [PATCH 08/27] [task #8203] Port tests in joins.rs to sqllogictest (#8996) Signed-off-by: tangruilin --- datafusion/core/tests/sql/joins.rs | 59 ---------------- datafusion/core/tests/sql/mod.rs | 73 -------------------- datafusion/sqllogictest/test_files/joins.slt | 70 +++++++++++++++++++ 3 files changed, 70 insertions(+), 132 deletions(-) diff --git a/datafusion/core/tests/sql/joins.rs b/datafusion/core/tests/sql/joins.rs index 0cc102002ec3..f7d5205db0d3 100644 --- a/datafusion/core/tests/sql/joins.rs +++ b/datafusion/core/tests/sql/joins.rs @@ -20,65 +20,6 @@ use datafusion::test_util::register_unbounded_file_with_ordering; use super::*; -#[tokio::test] -#[ignore] -/// TODO: need to repair. 
Wrong Test: ambiguous column name: a -async fn nestedjoin_with_alias() -> Result<()> { - // repro case for https://github.com/apache/arrow-datafusion/issues/2867 - let sql = "select * from ((select 1 as a, 2 as b) c INNER JOIN (select 1 as a, 3 as d) e on c.a = e.a) f;"; - let expected = [ - "+---+---+---+---+", - "| a | b | a | d |", - "+---+---+---+---+", - "| 1 | 2 | 1 | 3 |", - "+---+---+---+---+", - ]; - let ctx = SessionContext::new(); - let actual = execute_to_batches(&ctx, sql).await; - assert_batches_eq!(expected, &actual); - - Ok(()) -} - -#[tokio::test] -async fn join_partitioned() -> Result<()> { - // self join on partition id (workaround for duplicate column name) - let results = execute_with_partition( - "SELECT 1 FROM test JOIN (SELECT c1 AS id1 FROM test) AS a ON c1=id1", - 4, - ) - .await?; - - assert_eq!( - results.iter().map(|b| b.num_rows()).sum::(), - 4 * 10 * 10 - ); - - Ok(()) -} - -#[tokio::test] -#[ignore = "Test ignored, will be enabled after fixing the NAAJ bug"] -// https://github.com/apache/arrow-datafusion/issues/4211 -async fn null_aware_left_anti_join() -> Result<()> { - let test_repartition_joins = vec![true, false]; - for repartition_joins in test_repartition_joins { - let ctx = create_left_semi_anti_join_context_with_null_ids( - "t1_id", - "t2_id", - repartition_joins, - ) - .unwrap(); - - let sql = "SELECT t1_id, t1_name FROM t1 WHERE t1_id NOT IN (SELECT t2_id FROM t2) ORDER BY t1_id"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = ["++", "++"]; - assert_batches_eq!(expected, &actual); - } - - Ok(()) -} - #[tokio::test] async fn join_change_in_planner() -> Result<()> { let config = SessionConfig::new().with_target_partitions(8); diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 40ae75cd7f80..8852854a8b5d 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -76,79 +76,6 @@ pub mod repartition; pub mod select; mod sql_api; -fn create_left_semi_anti_join_context_with_null_ids( - column_left: &str, - column_right: &str, - repartition_joins: bool, -) -> Result { - let ctx = SessionContext::new_with_config( - SessionConfig::new() - .with_repartition_joins(repartition_joins) - .with_target_partitions(2) - .with_batch_size(4096), - ); - - let t1_schema = Arc::new(Schema::new(vec![ - Field::new(column_left, DataType::UInt32, true), - Field::new("t1_name", DataType::Utf8, true), - Field::new("t1_int", DataType::UInt32, true), - ])); - let t1_data = RecordBatch::try_new( - t1_schema, - vec![ - Arc::new(UInt32Array::from(vec![ - Some(11), - Some(11), - Some(22), - Some(33), - Some(44), - None, - ])), - Arc::new(StringArray::from(vec![ - Some("a"), - Some("a"), - Some("b"), - Some("c"), - Some("d"), - Some("e"), - ])), - Arc::new(UInt32Array::from(vec![1, 1, 2, 3, 4, 0])), - ], - )?; - ctx.register_batch("t1", t1_data)?; - - let t2_schema = Arc::new(Schema::new(vec![ - Field::new(column_right, DataType::UInt32, true), - Field::new("t2_name", DataType::Utf8, true), - Field::new("t2_int", DataType::UInt32, true), - ])); - let t2_data = RecordBatch::try_new( - t2_schema, - vec![ - Arc::new(UInt32Array::from(vec![ - Some(11), - Some(11), - Some(22), - Some(44), - Some(55), - None, - ])), - Arc::new(StringArray::from(vec![ - Some("z"), - Some("z"), - Some("y"), - Some("x"), - Some("w"), - Some("v"), - ])), - Arc::new(UInt32Array::from(vec![3, 3, 1, 3, 3, 0])), - ], - )?; - ctx.register_batch("t2", t2_data)?; - - Ok(ctx) -} - async fn register_aggregate_csv_by_sql(ctx: 
&SessionContext) { let testdata = datafusion::test_util::arrow_test_data(); diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index e605813b207f..9619696679d2 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3523,3 +3523,73 @@ set datafusion.optimizer.prefer_existing_sort = false; statement ok drop table annotated_data; + +#### +# nestedjoin_with_alias_test +#### + +query IIII +select * from ((select 1 as a, 2 as b) c INNER JOIN (select 1 as c, 3 as d) e on c.a = e.c) f; +---- +1 2 1 3 + +#### +# create_left_semi_anti_join_context_with_null_ids_table_test +#### + +statement ok +CREATE TABLE join_test_left(t1_id INT UNSIGNED, t1_name VARCHAR, t1_int INT UNSIGNED) +AS VALUES +(11, 'a', 1), +(11, 'a', 1), +(22, 'b', 2), +(33, 'c', 3), +(44, 'd', 4), +(NULL, 'e', 0); + +statement ok +CREATE TABLE join_test_right(t2_id INT UNSIGNED, t2_name VARCHAR, t2_int INT UNSIGNED) +AS VALUES +(11, 'z', 3), +(11, 'z', 3), +(22, 'y', 1), +(33, 'x', 3), +(44, 'w', 3), +(NULL, 'v', 0); + +query IT +SELECT t1_id, t1_name FROM join_test_left WHERE t1_id NOT IN (SELECT t2_id FROM join_test_right) ORDER BY t1_id; +---- +NULL e + +#### +# join_partitioned_test +#### + +statement ok +CREATE TABLE join_partitioned_table(c1 INT UNSIGNED, c2 INT UNSIGNED, c3 BOOLEAN) +AS VALUES +(4, 1, true), +(4, 2, false), +(4, 3, true), +(4, 4, false); + +query I +SELECT 1 FROM join_partitioned_table JOIN (SELECT c1 AS id1 FROM join_partitioned_table) AS a ON c1=id1; +---- +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 From fa65c68b50cf2eb3ee3ca38f5fd6a63924c944c4 Mon Sep 17 00:00:00 2001 From: "Reilly.tang" Date: Fri, 26 Jan 2024 05:21:28 +0800 Subject: [PATCH 09/27] [task #8213]Port tests in select.rs to sqllogictest (#8967) * [task #8213]Part of Port tests in select.rs to sqllogictest Signed-off-by: tangruilin * test large strings as well * Restore parameter tests * make test deterministic --------- Signed-off-by: tangruilin Co-authored-by: Andrew Lamb --- datafusion/core/tests/sql/mod.rs | 18 +- datafusion/core/tests/sql/select.rs | 681 ++---------------- datafusion/sqllogictest/test_files/select.slt | 343 ++++++++- 3 files changed, 410 insertions(+), 632 deletions(-) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 8852854a8b5d..2389c86306ab 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -22,16 +22,15 @@ use arrow::{ util::display::array_value_to_string, }; -use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::logical_expr::{Aggregate, LogicalPlan, TableScan}; +use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::MetricValue; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::ExecutionPlanVisitor; use datafusion::prelude::*; use datafusion::test_util; use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; -use datafusion::{datasource::MemTable, physical_plan::collect}; use datafusion::{execution::context::SessionContext, physical_plan::displayable}; use datafusion_common::{assert_contains, assert_not_contains}; use object_store::path::Path; @@ -249,21 +248,6 @@ async fn register_alltypes_parquet(ctx: &SessionContext) { .unwrap(); } -/// Return a new table provider that has a single Int32 column with -/// values between `seq_start` and `seq_end` -pub fn table_with_sequence( - seq_start: i32, - seq_end: i32, -) -> Result> { - let schema 
= Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let arr = Arc::new(Int32Array::from((seq_start..=seq_end).collect::>())); - let partitions = vec![vec![RecordBatch::try_new( - schema.clone(), - vec![arr as ArrayRef], - )?]]; - Ok(Arc::new(MemTable::try_new(schema, partitions)?)) -} - pub struct ExplainNormalizer { replacements: Vec<(String, String)>, } diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index 71369c73008c..667d3eeab31e 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -20,459 +20,81 @@ use datafusion_common::ScalarValue; use tempfile::TempDir; #[tokio::test] -async fn query_get_indexed_field() -> Result<()> { - let ctx = SessionContext::new(); - let schema = Arc::new(Schema::new(vec![Field::new_list( - "some_list", - Field::new("item", DataType::Int64, true), - false, - )])); - let builder = PrimitiveBuilder::::with_capacity(3); - let mut lb = ListBuilder::new(builder); - for int_vec in [[0, 1, 2], [4, 5, 6], [7, 8, 9]] { - let builder = lb.values(); - for int in int_vec { - builder.append_value(int); - } - lb.append(true); - } - - let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(lb.finish())])?; - - ctx.register_batch("ints", data)?; - - // Original column is micros, convert to millis and check timestamp - let sql = "SELECT some_list[1] as i0 FROM ints LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - #[rustfmt::skip] - let expected = ["+----+", - "| i0 |", - "+----+", - "| 0 |", - "| 4 |", - "| 7 |", - "+----+"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn query_nested_get_indexed_field() -> Result<()> { - let ctx = SessionContext::new(); - let nested_dt = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); - // Nested schema of { "some_list": [[i64]] } - let schema = Arc::new(Schema::new(vec![Field::new( - "some_list", - DataType::List(Arc::new(Field::new("item", nested_dt.clone(), true))), - false, - )])); - - let builder = PrimitiveBuilder::::with_capacity(3); - let nested_lb = ListBuilder::new(builder); - let mut lb = ListBuilder::new(nested_lb); - for int_vec_vec in [ - [[0, 1], [2, 3], [3, 4]], - [[5, 6], [7, 8], [9, 10]], - [[11, 12], [13, 14], [15, 16]], - ] { - let nested_builder = lb.values(); - for int_vec in int_vec_vec { - let builder = nested_builder.values(); - for int in int_vec { - builder.append_value(int); - } - nested_builder.append(true); - } - lb.append(true); - } - - let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(lb.finish())])?; - - ctx.register_batch("ints", data)?; - - // Original column is micros, convert to millis and check timestamp - let sql = "SELECT some_list[1] as i0 FROM ints LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+----------+", - "| i0 |", - "+----------+", - "| [0, 1] |", - "| [5, 6] |", - "| [11, 12] |", - "+----------+", - ]; - assert_batches_eq!(expected, &actual); - let sql = "SELECT some_list[1][1] as i0 FROM ints LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - #[rustfmt::skip] - let expected = ["+----+", - "| i0 |", - "+----+", - "| 0 |", - "| 5 |", - "| 11 |", - "+----+"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn query_nested_get_indexed_field_on_struct() -> Result<()> { - let ctx = SessionContext::new(); - let nested_dt = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); - // Nested schema of { "some_struct": { 
"bar": [i64] } } - let struct_fields = vec![Field::new("bar", nested_dt.clone(), true)]; - let schema = Arc::new(Schema::new(vec![Field::new( - "some_struct", - DataType::Struct(struct_fields.clone().into()), - false, - )])); - - let builder = PrimitiveBuilder::::with_capacity(3); - let nested_lb = ListBuilder::new(builder); - let mut sb = StructBuilder::new(struct_fields, vec![Box::new(nested_lb)]); - for int_vec in [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]] { - let lb = sb.field_builder::>(0).unwrap(); - for int in int_vec { - lb.values().append_value(int); - } - lb.append(true); - sb.append(true); - } - let s = sb.finish(); - let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(s)])?; - - ctx.register_batch("structs", data)?; - - // Original column is micros, convert to millis and check timestamp - let sql = "SELECT some_struct['bar'] as l0 FROM structs LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+----------------+", - "| l0 |", - "+----------------+", - "| [0, 1, 2, 3] |", - "| [4, 5, 6, 7] |", - "| [8, 9, 10, 11] |", - "+----------------+", - ]; - assert_batches_eq!(expected, &actual); - - // Access to field of struct by CompoundIdentifier - let sql = "SELECT some_struct.bar as l0 FROM structs LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+----------------+", - "| l0 |", - "+----------------+", - "| [0, 1, 2, 3] |", - "| [4, 5, 6, 7] |", - "| [8, 9, 10, 11] |", - "+----------------+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = "SELECT some_struct['bar'][1] as i0 FROM structs LIMIT 3"; - let actual = execute_to_batches(&ctx, sql).await; - #[rustfmt::skip] - let expected = ["+----+", - "| i0 |", - "+----+", - "| 0 |", - "| 4 |", - "| 8 |", - "+----+"]; - assert_batches_eq!(expected, &actual); - Ok(()) -} - -#[tokio::test] -async fn query_on_string_dictionary() -> Result<()> { - // Test to ensure DataFusion can operate on dictionary types - // Use StringDictionary (32 bit indexes = keys) - let d1: DictionaryArray = - vec![Some("one"), None, Some("three")].into_iter().collect(); - - let d2: DictionaryArray = vec![Some("blarg"), None, Some("three")] - .into_iter() - .collect(); - - let d3: StringArray = vec![Some("XYZ"), None, Some("three")].into_iter().collect(); - - let batch = RecordBatch::try_from_iter(vec![ - ("d1", Arc::new(d1) as ArrayRef), - ("d2", Arc::new(d2) as ArrayRef), - ("d3", Arc::new(d3) as ArrayRef), - ]) - .unwrap(); - - let ctx = SessionContext::new(); - ctx.register_batch("test", batch)?; - - // Basic SELECT - let sql = "SELECT d1 FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| one |", - "| |", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // basic filtering - let sql = "SELECT d1 FROM test WHERE d1 IS NOT NULL"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| one |", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // comparison with constant - let sql = "SELECT d1 FROM test WHERE d1 = 'three'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // comparison with another dictionary column - let sql = "SELECT d1 FROM test WHERE d1 = d2"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - 
"+-------+", - "| d1 |", - "+-------+", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // order comparison with another dictionary column - let sql = "SELECT d1 FROM test WHERE d1 <= d2"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // comparison with a non dictionary column - let sql = "SELECT d1 FROM test WHERE d1 = d3"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // filtering with constant - let sql = "SELECT d1 FROM test WHERE d1 = 'three'"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+", - "| d1 |", - "+-------+", - "| three |", - "+-------+", - ]; - assert_batches_eq!(expected, &actual); - - // Expression evaluation - let sql = "SELECT concat(d1, '-foo') FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+------------------------------+", - "| concat(test.d1,Utf8(\"-foo\")) |", - "+------------------------------+", - "| one-foo |", - "| -foo |", - "| three-foo |", - "+------------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - // Expression evaluation with two dictionaries - let sql = "SELECT concat(d1, d2) FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------------------------+", - "| concat(test.d1,test.d2) |", - "+-------------------------+", - "| oneblarg |", - "| |", - "| threethree |", - "+-------------------------+", - ]; - assert_batches_eq!(expected, &actual); - - // aggregation - let sql = "SELECT COUNT(d1) FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+----------------+", - "| COUNT(test.d1) |", - "+----------------+", - "| 2 |", - "+----------------+", - ]; - assert_batches_eq!(expected, &actual); - - // aggregation min - let sql = "SELECT MIN(d1) FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+--------------+", - "| MIN(test.d1) |", - "+--------------+", - "| one |", - "+--------------+", - ]; - assert_batches_eq!(expected, &actual); - - // aggregation max - let sql = "SELECT MAX(d1) FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+--------------+", - "| MAX(test.d1) |", - "+--------------+", - "| three |", - "+--------------+", - ]; - assert_batches_eq!(expected, &actual); - - // grouping - let sql = "SELECT d1, COUNT(*) FROM test group by d1"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+----------+", - "| d1 | COUNT(*) |", - "+-------+----------+", - "| | 1 |", - "| one | 1 |", - "| three | 1 |", - "+-------+----------+", - ]; - assert_batches_sorted_eq!(expected, &actual); +async fn test_list_query_parameters() -> Result<()> { + let tmp_dir = TempDir::new()?; + let partition_count = 4; + let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - // window functions - let sql = "SELECT d1, row_number() OVER (partition by d1) as rn1 FROM test"; - let actual = execute_to_batches(&ctx, sql).await; - let expected = [ - "+-------+-----+", - "| d1 | rn1 |", - "+-------+-----+", - "| | 1 |", - "| one | 1 |", - "| three | 1 |", - "+-------+-----+", + let results = ctx + .sql("SELECT * FROM test WHERE c1 = $1") + .await? 
+ .with_param_values(vec![ScalarValue::from(3i32)])? + .collect() + .await?; + let expected = vec![ + "+----+----+-------+", + "| c1 | c2 | c3 |", + "+----+----+-------+", + "| 3 | 1 | false |", + "| 3 | 10 | true |", + "| 3 | 2 | true |", + "| 3 | 3 | false |", + "| 3 | 4 | true |", + "| 3 | 5 | false |", + "| 3 | 6 | true |", + "| 3 | 7 | false |", + "| 3 | 8 | true |", + "| 3 | 9 | false |", + "+----+----+-------+", ]; - assert_batches_sorted_eq!(expected, &actual); - + assert_batches_sorted_eq!(expected, &results); Ok(()) } #[tokio::test] -async fn sort_on_window_null_string() -> Result<()> { - let d1: DictionaryArray = - vec![Some("one"), None, Some("three")].into_iter().collect(); - let d2: StringArray = vec![Some("ONE"), None, Some("THREE")].into_iter().collect(); - let d3: LargeStringArray = - vec![Some("One"), None, Some("Three")].into_iter().collect(); - - let batch = RecordBatch::try_from_iter(vec![ - ("d1", Arc::new(d1) as ArrayRef), - ("d2", Arc::new(d2) as ArrayRef), - ("d3", Arc::new(d3) as ArrayRef), - ]) - .unwrap(); - - let ctx = - SessionContext::new_with_config(SessionConfig::new().with_target_partitions(1)); - ctx.register_batch("test", batch)?; - - let sql = - "SELECT d1, row_number() OVER (partition by d1) as rn1 FROM test order by d1 asc"; - - let actual = execute_to_batches(&ctx, sql).await; - // NULLS LAST - let expected = [ - "+-------+-----+", - "| d1 | rn1 |", - "+-------+-----+", - "| one | 1 |", - "| three | 1 |", - "| | 1 |", - "+-------+-----+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = - "SELECT d2, row_number() OVER (partition by d2) as rn1 FROM test ORDER BY d2 asc"; - let actual = execute_to_batches(&ctx, sql).await; - // NULLS LAST - let expected = [ - "+-------+-----+", - "| d2 | rn1 |", - "+-------+-----+", - "| ONE | 1 |", - "| THREE | 1 |", - "| | 1 |", - "+-------+-----+", - ]; - assert_batches_eq!(expected, &actual); - - let sql = - "SELECT d2, row_number() OVER (partition by d2 order by d2 desc) as rn1 FROM test ORDER BY d2 desc"; +async fn test_named_query_parameters() -> Result<()> { + let tmp_dir = TempDir::new()?; + let partition_count = 4; + let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - let actual = execute_to_batches(&ctx, sql).await; - // NULLS FIRST - let expected = [ - "+-------+-----+", - "| d2 | rn1 |", - "+-------+-----+", - "| | 1 |", - "| THREE | 1 |", - "| ONE | 1 |", - "+-------+-----+", + // sql to statement then to logical plan with parameters + // c1 defined as UINT32, c2 defined as UInt64 + let results = ctx + .sql("SELECT c1, c2 FROM test WHERE c1 > $coo AND c1 < $foo") + .await? + .with_param_values(vec![ + ("foo", ScalarValue::UInt32(Some(3))), + ("coo", ScalarValue::UInt32(Some(0))), + ])? + .collect() + .await?; + let expected = vec![ + "+----+----+", + "| c1 | c2 |", + "+----+----+", + "| 1 | 1 |", + "| 1 | 2 |", + "| 1 | 3 |", + "| 1 | 4 |", + "| 1 | 5 |", + "| 1 | 6 |", + "| 1 | 7 |", + "| 1 | 8 |", + "| 1 | 9 |", + "| 1 | 10 |", + "| 2 | 1 |", + "| 2 | 2 |", + "| 2 | 3 |", + "| 2 | 4 |", + "| 2 | 5 |", + "| 2 | 6 |", + "| 2 | 7 |", + "| 2 | 8 |", + "| 2 | 9 |", + "| 2 | 10 |", + "+----+----+", ]; - assert_batches_eq!(expected, &actual); - - // FIXME sort on LargeUtf8 String has bug. 
- // let sql = - // "SELECT d3, row_number() OVER (partition by d3) as rn1 FROM test"; - // let actual = execute_to_batches(&ctx, sql).await; - // let expected = vec![ - // "+-------+-----+", - // "| d3 | rn1 |", - // "+-------+-----+", - // "| | 1 |", - // "| One | 1 |", - // "| Three | 1 |", - // "+-------+-----+", - // ]; - // assert_batches_eq!(expected, &actual); - + assert_batches_sorted_eq!(expected, &results); Ok(()) } @@ -576,85 +198,6 @@ async fn prepared_statement_invalid_types() -> Result<()> { Ok(()) } -#[tokio::test] -async fn test_list_query_parameters() -> Result<()> { - let tmp_dir = TempDir::new()?; - let partition_count = 4; - let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - - let results = ctx - .sql("SELECT * FROM test WHERE c1 = $1") - .await? - .with_param_values(vec![ScalarValue::from(3i32)])? - .collect() - .await?; - let expected = vec![ - "+----+----+-------+", - "| c1 | c2 | c3 |", - "+----+----+-------+", - "| 3 | 1 | false |", - "| 3 | 10 | true |", - "| 3 | 2 | true |", - "| 3 | 3 | false |", - "| 3 | 4 | true |", - "| 3 | 5 | false |", - "| 3 | 6 | true |", - "| 3 | 7 | false |", - "| 3 | 8 | true |", - "| 3 | 9 | false |", - "+----+----+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); - Ok(()) -} - -#[tokio::test] -async fn test_named_query_parameters() -> Result<()> { - let tmp_dir = TempDir::new()?; - let partition_count = 4; - let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - - // sql to statement then to logical plan with parameters - // c1 defined as UINT32, c2 defined as UInt64 - let results = ctx - .sql("SELECT c1, c2 FROM test WHERE c1 > $coo AND c1 < $foo") - .await? - .with_param_values(vec![ - ("foo", ScalarValue::UInt32(Some(3))), - ("coo", ScalarValue::UInt32(Some(0))), - ])? 
- .collect() - .await?; - let expected = vec![ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| 1 | 1 |", - "| 1 | 2 |", - "| 1 | 3 |", - "| 1 | 4 |", - "| 1 | 5 |", - "| 1 | 6 |", - "| 1 | 7 |", - "| 1 | 8 |", - "| 1 | 9 |", - "| 1 | 10 |", - "| 2 | 1 |", - "| 2 | 2 |", - "| 2 | 3 |", - "| 2 | 4 |", - "| 2 | 5 |", - "| 2 | 6 |", - "| 2 | 7 |", - "| 2 | 8 |", - "| 2 | 9 |", - "| 2 | 10 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); - Ok(()) -} - #[tokio::test] async fn test_parameter_type_coercion() -> Result<()> { let ctx = SessionContext::new(); @@ -708,93 +251,3 @@ async fn test_parameter_invalid_types() -> Result<()> { ); Ok(()) } - -#[tokio::test] -async fn parallel_query_with_filter() -> Result<()> { - let tmp_dir = TempDir::new()?; - let partition_count = 4; - let ctx = create_ctx_with_partition(&tmp_dir, partition_count).await?; - - let dataframe = ctx - .sql("SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3") - .await?; - let results = dataframe.collect().await.unwrap(); - let expected = vec![ - "+----+----+", - "| c1 | c2 |", - "+----+----+", - "| 1 | 1 |", - "| 1 | 10 |", - "| 1 | 2 |", - "| 1 | 3 |", - "| 1 | 4 |", - "| 1 | 5 |", - "| 1 | 6 |", - "| 1 | 7 |", - "| 1 | 8 |", - "| 1 | 9 |", - "| 2 | 1 |", - "| 2 | 10 |", - "| 2 | 2 |", - "| 2 | 3 |", - "| 2 | 4 |", - "| 2 | 5 |", - "| 2 | 6 |", - "| 2 | 7 |", - "| 2 | 8 |", - "| 2 | 9 |", - "+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); - - Ok(()) -} - -#[tokio::test] -async fn boolean_literal() -> Result<()> { - let results = - execute_with_partition("SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true", 4) - .await?; - - let expected = [ - "+----+------+", - "| c1 | c3 |", - "+----+------+", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "| 3 | true |", - "+----+------+", - ]; - assert_batches_sorted_eq!(expected, &results); - - Ok(()) -} - -#[tokio::test] -async fn unprojected_filter() { - let config = SessionConfig::new(); - let ctx = SessionContext::new_with_config(config); - let df = ctx.read_table(table_with_sequence(1, 3).unwrap()).unwrap(); - - let df = df - .filter(col("i").gt(lit(2))) - .unwrap() - .select(vec![col("i") + col("i")]) - .unwrap(); - - let plan = df.clone().into_optimized_plan().unwrap(); - println!("{}", plan.display_indent()); - - let results = df.collect().await.unwrap(); - - let expected = [ - "+-----------------------+", - "| ?table?.i + ?table?.i |", - "+-----------------------+", - "| 6 |", - "+-----------------------+", - ]; - assert_batches_sorted_eq!(expected, &results); -} diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index faa5370c70ef..5216b14cb2d2 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -46,11 +46,294 @@ STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' - ########## ## SELECT Tests ########## +########## +## window_null_string_table_test +########## + +statement ok +CREATE TABLE window_null_string_value_prepare(x string, y string, z string) +AS VALUES +('one', 'ONE', 'One'), +(NULL, NULL, NULL), +('three', 'THREE', 'Three'); + +statement ok +CREATE TABLE window_null_string_table +AS SELECT arrow_cast(x, 'Dictionary(Int32, Utf8)') as d1, +y as d2, +arrow_cast(z, 'LargeUtf8') as d3 FROM window_null_string_value_prepare; + +query ?I +SELECT d1, row_number() OVER (partition by d1) as rn1 FROM window_null_string_table order by d1 asc; +---- +one 1 
+three 1 +NULL 1 + +query TI +SELECT d2, row_number() OVER (partition by d2) as rn1 FROM window_null_string_table ORDER BY d2 asc; +---- +ONE 1 +THREE 1 +NULL 1 + +query TI +SELECT d2, row_number() OVER (partition by d2 order by d2 desc) as rn1 FROM window_null_string_table ORDER BY d2 desc +---- +NULL 1 +THREE 1 +ONE 1 + +# Test large string as well +query TI rowsort +SELECT d3, row_number() OVER (partition by d3) as rn1 FROM window_null_string_table; +---- +NULL 1 +One 1 +Three 1 + + +statement ok +CREATE TABLE test ( + c1 BIGINT NOT NULL, + c2 BIGINT NOT NULL, + c3 BOOLEAN NOT NULL, +) AS VALUES (0, 1, false), +(0, 10, true), +(0, 2, true), +(0, 3, false), +(0, 4, true), +(0, 5, false), +(0, 6, true), +(0, 7, false), +(0, 8, true), +(0, 9, false), +(1, 1, false), +(1, 10, true), +(1, 2, true), +(1, 3, false), +(1, 4, true), +(1, 5, false), +(1, 6, true), +(1, 7, false), +(1, 8, true), +(1, 9, false), +(2, 1, false), +(2, 10, true), +(2, 2, true), +(2, 3, false), +(2, 4, true), +(2, 5, false), +(2, 6, true), +(2, 7, false), +(2, 8, true), +(2, 9, false), +(3, 1, false), +(3, 10, true), +(3, 2, true), +(3, 3, false), +(3, 4, true), +(3, 5, false), +(3, 6, true), +(3, 7, false), +(3, 8, true), +(3, 9, false); + + +# parallel_query_with_filter +query II +SELECT c1, c2 FROM test WHERE c1 > 0 AND c1 < 3; +---- +1 1 +1 10 +1 2 +1 3 +1 4 +1 5 +1 6 +1 7 +1 8 +1 9 +2 1 +2 10 +2 2 +2 3 +2 4 +2 5 +2 6 +2 7 +2 8 +2 9 + +###### +# Boolean literal +###### +query IB +SELECT c1, c3 FROM test WHERE c1 > 2 AND c3 = true; +---- +3 true +3 true +3 true +3 true +3 true + +statement ok +drop table test; + +###### +# struct test +###### +# Prepare the table with struct values for testing +statement ok +CREATE TABLE struct_value +AS VALUES +(make_array(0, 1, 2, 3)), +(make_array(4, 5, 6, 7)), +(make_array(8, 9, 10, 11)); + +statement ok +CREATE TABLE nested_get_indexed_field_on_struct_table +AS SELECT struct(column1) as some_struct from struct_value; + +# Original column is micros, convert to millis and check timestamp +query ? +SELECT some_struct['c0'] FROM nested_get_indexed_field_on_struct_table LIMIT 3; +---- +[0, 1, 2, 3] +[4, 5, 6, 7] +[8, 9, 10, 11] + +# Access to field of struct by CompoundIdentifier +query ? +SELECT some_struct.c0 as l0 FROM nested_get_indexed_field_on_struct_table LIMIT 3; +---- +[0, 1, 2, 3] +[4, 5, 6, 7] +[8, 9, 10, 11] + +query I +SELECT some_struct['c0'][1] as i0 FROM nested_get_indexed_field_on_struct_table LIMIT 3; +---- +0 +4 +8 + +# Basic SELECT +#### +# dictionary_test +#### + +# Prepare the table with dictionary values for testing +statement ok +CREATE TABLE value(x string, y string, z string) +AS VALUES +('one', 'blarg', 'XYZ'), +(NULL, NULL, NULL), +('three', 'three', 'three'); + +statement ok +CREATE TABLE string_dictionary_table +AS SELECT arrow_cast(x, 'Dictionary(Int32, Utf8)') as d1, +arrow_cast(y, 'Dictionary(Int32, Utf8)') as d2, +z as d3 FROM value; + +query ? +SELECT d1 FROM string_dictionary_table; +---- +one +NULL +three + +# basic filtering +query ? +SELECT d1 FROM string_dictionary_table WHERE d1 IS NOT NULL; +---- +one +three + +# comparison with constant +query ? +SELECT d1 FROM string_dictionary_table WHERE d1 = 'three'; +---- +three + +# comparison with another dictionary column +query ? +SELECT d1 FROM string_dictionary_table WHERE d1 = d2; +---- +three + +# order comparison with another dictionary column +query ? +SELECT d1 FROM string_dictionary_table WHERE d1 <= d2; +---- +three + +# comparison with a non dictionary column +query ? 
+SELECT d1 FROM string_dictionary_table WHERE d1 = d3; +---- +three + +# filtering with constant +query ? +SELECT d1 FROM string_dictionary_table WHERE d1 = 'three'; +---- +three + +# Expression evaluation +query T +SELECT concat(d1, '-foo') FROM string_dictionary_table; +---- +one-foo +-foo +three-foo + +# Expression evaluation with two dictionaries +query T +SELECT concat(d1, d2) FROM string_dictionary_table; +---- +oneblarg +(empty) +threethree + +# aggregation +query I +SELECT COUNT(d1) FROM string_dictionary_table; +---- +2 + +# aggregation min +query T +SELECT MIN(d1) FROM string_dictionary_table; +---- +one + +# aggregation max +query T +SELECT MAX(d1) FROM string_dictionary_table; +---- +three + +# grouping +query ?I +SELECT d1, COUNT(*) FROM string_dictionary_table group by d1 order by d1; +---- +one 1 +three 1 +NULL 1 + +# window functions +query ?I +SELECT d1, row_number() OVER (partition by d1) as rn1 FROM string_dictionary_table order by d1; +---- +one 1 +three 1 +NULL 1 # select_values_list statement error DataFusion error: SQL error: ParserError\("Expected \(, found: EOF"\) @@ -1176,11 +1459,69 @@ SELECT y = 0 or 1 / y < 1, x = 0 or y = 0 or 1 / y < 1 / x from t; statement ok DROP TABLE t; +########## +## indexed_field_test +########## +statement ok +CREATE TABLE indexed_field +AS VALUES (make_array(0, 1, 2)), +(make_array(4, 5, 6)), +(make_array(7, 8, 9)) + +# query_get_indexed_field +query I +SELECT column1[1] AS i0 +FROM indexed_field LIMIT 3; +---- +0 +4 +7 + +########## +## nested_indexed_field_test +########## +statement ok +CREATE TABLE nested_indexed_field +AS VALUES (make_array([0, 1], [2, 3], [3, 4])), +(make_array([5, 6], [7, 8], [9, 10])), +(make_array([11, 12], [13, 14], [15, 16])) + +# query nested_indexed_field +query ? 
+SELECT column1[1] AS i0 +FROM nested_indexed_field LIMIT 3; +---- +[0, 1] +[5, 6] +[11, 12] + +query I +SELECT column1[1][1] AS i0 +FROM nested_indexed_field LIMIT 3; +---- +0 +5 +11 + query I SELECT CASE 1 WHEN 2 THEN 4 / 0 END; ---- NULL + +###### +# Unprojected filter +###### + +statement ok +CREATE TABLE test(i INT) AS +VALUES (1), (2), (3); + +query I +SELECT i + i FROM test WHERE i > 2; +---- +6 + query error DataFusion error: Arrow error: Parser error: Error parsing timestamp from 'I AM NOT A TIMESTAMP': error parsing date SELECT to_timestamp('I AM NOT A TIMESTAMP'); From 6e4abf517750d7dde2a5d527e76d361fdcd16cd0 Mon Sep 17 00:00:00 2001 From: Dejan Simic <10134699+simicd@users.noreply.github.com> Date: Thu, 25 Jan 2024 22:22:04 +0100 Subject: [PATCH 10/27] test: Port (last) `repartition.rs` query to sqllogictest (#8936) * Migrate last repartition query * Add reference to issue Co-authored-by: Andrew Lamb * Fix missing statement --------- Co-authored-by: Andrew Lamb --- datafusion/core/tests/sql/mod.rs | 1 - datafusion/core/tests/sql/repartition.rs | 59 ------------------- .../sqllogictest/test_files/repartition.slt | 56 ++++++++++++++++++ 3 files changed, 56 insertions(+), 60 deletions(-) delete mode 100644 datafusion/core/tests/sql/repartition.rs diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 2389c86306ab..246191e48ad2 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -71,7 +71,6 @@ pub mod create_drop; pub mod explain_analyze; pub mod expr; pub mod joins; -pub mod repartition; pub mod select; mod sql_api; diff --git a/datafusion/core/tests/sql/repartition.rs b/datafusion/core/tests/sql/repartition.rs deleted file mode 100644 index 332f18e941aa..000000000000 --- a/datafusion/core/tests/sql/repartition.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use arrow::array::UInt32Array; -use arrow::datatypes::{DataType, Field, Schema}; -use arrow::record_batch::RecordBatch; -use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion::physical_plan::repartition::RepartitionExec; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; -use datafusion::prelude::{SessionConfig, SessionContext}; -use datafusion::test_util::UnboundedExec; -use datafusion_common::Result; -use datafusion_physical_expr::expressions::Column; -use datafusion_physical_expr::PhysicalExpr; -use futures::StreamExt; -use std::sync::Arc; - -/// See -#[tokio::test] -async fn unbounded_repartition() -> Result<()> { - let config = SessionConfig::new(); - let ctx = SessionContext::new_with_config(config); - let task = ctx.task_ctx(); - let schema = Arc::new(Schema::new(vec![Field::new("a2", DataType::UInt32, false)])); - let batch = RecordBatch::try_new( - Arc::clone(&schema), - vec![Arc::new(UInt32Array::from(vec![1]))], - )?; - let input = Arc::new(UnboundedExec::new(None, batch.clone(), 1)); - let on: Vec> = vec![Arc::new(Column::new("a2", 0))]; - let plan = Arc::new(RepartitionExec::try_new(input, Partitioning::Hash(on, 3))?); - let plan = Arc::new(CoalescePartitionsExec::new(plan.clone())); - let mut stream = plan.execute(0, task)?; - - // Note: `tokio::time::timeout` does NOT help here because in the mentioned issue, the whole runtime is blocked by a - // CPU-spinning thread. Using a multithread runtime with multiple threads is NOT a solution since this would not - // trigger the bug (the bug is not specific to a single-thread RT though, it's just the only way to trigger it reliably). - let batch_actual = stream - .next() - .await - .expect("not terminated") - .expect("no error in stream"); - assert_eq!(batch_actual, batch); - Ok(()) -} diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt index 9829299f43e5..7c141adf82b1 100644 --- a/datafusion/sqllogictest/test_files/repartition.slt +++ b/datafusion/sqllogictest/test_files/repartition.slt @@ -71,3 +71,59 @@ AggregateExec: mode=FinalPartitioned, gby=[column1@0 as column1], aggr=[SUM(parq # Cleanup statement ok DROP TABLE parquet_table; + + + +# Unbounded repartition +# See https://github.com/apache/arrow-datafusion/issues/5278 +# Set up unbounded table and run a query - the query plan should display a `RepartitionExec` +# and a `CoalescePartitionsExec` +statement ok +CREATE UNBOUNDED EXTERNAL TABLE sink_table ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT NOT NULL, + c5 INTEGER NOT NULL, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 INT UNSIGNED NOT NULL, + c10 BIGINT UNSIGNED NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL + ) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv'; + +query TII +SELECT c1, c2, c3 FROM sink_table WHERE c3 > 0 LIMIT 5; +---- +c 2 1 +b 1 29 +e 3 104 +a 3 13 +d 1 38 + +statement ok +set datafusion.execution.target_partitions = 3; + +statement ok +set datafusion.optimizer.enable_round_robin_repartition = true; + +query TT +EXPLAIN SELECT c1, c2, c3 FROM sink_table WHERE c3 > 0 LIMIT 5; +---- +logical_plan +Limit: skip=0, fetch=5 +--Filter: sink_table.c3 > Int16(0) +----TableScan: sink_table projection=[c1, c2, c3] +physical_plan +GlobalLimitExec: skip=0, fetch=5 +--CoalescePartitionsExec +----CoalesceBatchesExec: target_batch_size=8192 +------FilterExec: c3@2 > 0 
+--------RepartitionExec: partitioning=RoundRobinBatch(3), input_partitions=1 +----------StreamingTableExec: partition_sizes=1, projection=[c1, c2, c3], infinite_source=true From 4d02cc0114908d4f805b2323f20751b1f6d9c2f4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 25 Jan 2024 17:10:12 -0500 Subject: [PATCH 11/27] Update to sqlparser `0.42.0` (#9000) * Update to sqlparser `0.42.0` * Update datafusion Cargo.lock --- Cargo.toml | 2 +- datafusion-cli/Cargo.lock | 30 ++++++++--------- datafusion/sql/src/parser.rs | 12 +++---- datafusion/sql/src/planner.rs | 2 ++ datafusion/sql/src/statement.rs | 59 ++++++++++++++++++++++++++------- 5 files changed, 71 insertions(+), 34 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd88e18fe17c..3b1362d22426 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,7 +70,7 @@ parquet = { version = "50.0.0", default-features = false, features = ["arrow", " rand = "0.8" rstest = "0.18.0" serde_json = "1" -sqlparser = { version = "0.41.0", features = ["visitor"] } +sqlparser = { version = "0.43.0", features = ["visitor"] } tempfile = "3" thiserror = "1.0.44" url = "2.2" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index c90b59b924f6..a718f7591a45 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -867,15 +867,15 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", "serde", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -2475,18 +2475,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +checksum = "0302c4a0442c456bd56f841aee5c3bfd17967563f6fadc9ceb9f9c23cf3807e0" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", @@ -2579,9 +2579,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -2673,9 +2673,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -2685,9 +2685,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" 
+checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -3148,9 +3148,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.41.0" +version = "0.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" +checksum = "a748c164141797ef0a712aaf16aa71df6f23e80ffea446daa2dd30e3325f89f3" dependencies = [ "log", "sqlparser_derive", diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index dbd72ec5eb7a..effc1d096cfd 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -366,7 +366,7 @@ impl<'a> DFParser<'a> { CopyToSource::Query(query) } else { // parse as table reference - let table_name = self.parser.parse_object_name()?; + let table_name = self.parser.parse_object_name(true)?; CopyToSource::Relation(table_name) }; @@ -465,7 +465,7 @@ impl<'a> DFParser<'a> { loop { if let Token::Word(_) = self.parser.peek_token().token { - let identifier = self.parser.parse_identifier()?; + let identifier = self.parser.parse_identifier(false)?; partitions.push(identifier.to_string()); } else { return self.expected("partition name", self.parser.peek_token()); @@ -567,17 +567,17 @@ impl<'a> DFParser<'a> { } fn parse_column_def(&mut self) -> Result { - let name = self.parser.parse_identifier()?; + let name = self.parser.parse_identifier(false)?; let data_type = self.parser.parse_data_type()?; let collation = if self.parser.parse_keyword(Keyword::COLLATE) { - Some(self.parser.parse_object_name()?) + Some(self.parser.parse_object_name(false)?) } else { None }; let mut options = vec![]; loop { if self.parser.parse_keyword(Keyword::CONSTRAINT) { - let name = Some(self.parser.parse_identifier()?); + let name = Some(self.parser.parse_identifier(false)?); if let Some(option) = self.parser.parse_optional_column_option()? 
{ options.push(ColumnOptionDef { name, option }); } else { @@ -608,7 +608,7 @@ impl<'a> DFParser<'a> { let if_not_exists = self.parser .parse_keywords(&[Keyword::IF, Keyword::NOT, Keyword::EXISTS]); - let table_name = self.parser.parse_object_name()?; + let table_name = self.parser.parse_object_name(true)?; let (columns, constraints) = self.parse_columns()?; #[derive(Default)] diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index d4dd42edcd39..f1d4f3ff5619 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -465,6 +465,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { | SQLDataType::Int64 | SQLDataType::Float64 | SQLDataType::Struct(_) + | SQLDataType::JSONB + | SQLDataType::Unspecified => not_impl_err!( "Unsupported SQL type {sql_type:?}" ), diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b9fb4c65dc2c..84a394f324cf 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -52,9 +52,10 @@ use datafusion_expr::{ }; use sqlparser::ast; use sqlparser::ast::{ - Assignment, ColumnDef, Expr as SQLExpr, Expr, Ident, ObjectName, ObjectType, Query, - SchemaName, SetExpr, ShowCreateObject, ShowStatementFilter, Statement, - TableConstraint, TableFactor, TableWithJoins, TransactionMode, UnaryOperator, Value, + Assignment, ColumnDef, CreateTableOptions, Expr as SQLExpr, Expr, Ident, ObjectName, + ObjectType, Query, SchemaName, SetExpr, ShowCreateObject, ShowStatementFilter, + Statement, TableConstraint, TableFactor, TableWithJoins, TransactionMode, + UnaryOperator, Value, }; use sqlparser::parser::ParserError::ParserError; @@ -90,18 +91,21 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec { - constraints.push(ast::TableConstraint::Unique { - name: name.clone(), - columns: vec![column.name.clone()], - is_primary: *is_primary, - }) - } + ast::ColumnOption::Unique { + is_primary, + characteristics, + } => constraints.push(ast::TableConstraint::Unique { + name: name.clone(), + columns: vec![column.name.clone()], + is_primary: *is_primary, + characteristics: *characteristics, + }), ast::ColumnOption::ForeignKey { foreign_table, referred_columns, on_delete, on_update, + characteristics, } => constraints.push(ast::TableConstraint::ForeignKey { name: name.clone(), columns: vec![], @@ -109,6 +113,7 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec { constraints.push(ast::TableConstraint::Check { @@ -124,6 +129,7 @@ fn calc_inline_constraints_from_columns(columns: &[ColumnDef]) -> Vec {} } } @@ -292,9 +298,22 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, columns, query, - with_options, + options: CreateTableOptions::None, .. - } if with_options.is_empty() => { + } => { + let columns = columns + .into_iter() + .map(|view_column_def| { + if let Some(options) = view_column_def.options { + plan_err!( + "Options not supported for view columns: {options:?}" + ) + } else { + Ok(view_column_def.name) + } + }) + .collect::>>()?; + let mut plan = self.query_to_plan(*query, &mut PlannerContext::new())?; plan = self.apply_expr_alias(plan, columns)?; @@ -440,6 +459,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { on, returning, ignore, + table_alias, + replace_into, + priority, } => { if or.is_some() { plan_err!("Inserts with or clauses not supported")?; @@ -465,6 +487,19 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let Some(source) = source else { plan_err!("Inserts without a source not supported")? 
}; + if let Some(table_alias) = table_alias { + plan_err!( + "Inserts with a table alias not supported: {table_alias:?}" + )? + }; + if replace_into { + plan_err!("Inserts with a `REPLACE INTO` clause not supported")? + }; + if let Some(priority) = priority { + plan_err!( + "Inserts with a `PRIORITY` clause not supported: {priority:?}" + )? + }; let _ = into; // optional keyword doesn't change behavior self.insert_to_plan(table_name, columns, source, overwrite) } From 8a4bad46540598c6acdf432bde08c2a4c76c5039 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Fri, 26 Jan 2024 09:21:38 +0300 Subject: [PATCH 12/27] Add new test (#8992) --- .../optimizer/src/optimize_projections.rs | 31 ++++++++++--------- datafusion/sqllogictest/test_files/select.slt | 20 ++++++++++++ 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index f87f5fdea99f..103599564252 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -218,6 +218,22 @@ fn optimize_projections( // Only use the absolutely necessary aggregate expressions required // by the parent: let mut new_aggr_expr = get_at_indices(&aggregate.aggr_expr, &aggregate_reqs); + + // Aggregations always need at least one aggregate expression. + // With a nested count, we don't require any column as input, but + // still need to create a correct aggregate, which may be optimized + // out later. As an example, consider the following query: + // + // SELECT COUNT(*) FROM (SELECT COUNT(*) FROM [...]) + // + // which always returns 1. + if new_aggr_expr.is_empty() + && new_group_bys.is_empty() + && !aggregate.aggr_expr.is_empty() + { + new_aggr_expr = vec![aggregate.aggr_expr[0].clone()]; + } + let all_exprs_iter = new_group_bys.iter().chain(new_aggr_expr.iter()); let schema = aggregate.input.schema(); let necessary_indices = indices_referred_by_exprs(schema, all_exprs_iter)?; @@ -238,21 +254,6 @@ fn optimize_projections( let (aggregate_input, _) = add_projection_on_top_if_helpful(aggregate_input, necessary_exprs)?; - // Aggregations always need at least one aggregate expression. - // With a nested count, we don't require any column as input, but - // still need to create a correct aggregate, which may be optimized - // out later. As an example, consider the following query: - // - // SELECT COUNT(*) FROM (SELECT COUNT(*) FROM [...]) - // - // which always returns 1. 
- if new_aggr_expr.is_empty() - && new_group_bys.is_empty() - && !aggregate.aggr_expr.is_empty() - { - new_aggr_expr = vec![aggregate.aggr_expr[0].clone()]; - } - // Create a new aggregate plan with the updated input and only the // absolutely necessary fields: return Aggregate::try_new( diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 5216b14cb2d2..50c62eff7772 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1527,3 +1527,23 @@ SELECT to_timestamp('I AM NOT A TIMESTAMP'); query error DataFusion error: Arrow error: Cast error: Cannot cast string '' to value of Int32 type SELECT CAST('' AS int); + +# See issue: https://github.com/apache/arrow-datafusion/issues/8978 +statement ok +create table users (id int, name varchar); + +statement ok +insert into users values (1, 'Tom'); + +statement ok +create view v as select count(id) from users; + +query I +select * from v; +---- +1 + +query I +select count(1) from v; +---- +1 From bee7136a04c60a2c06caa630cf1b72f32f7dc574 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Fri, 26 Jan 2024 13:30:56 +0300 Subject: [PATCH 13/27] Make Topk aggregate tests deterministic (#8998) * Make tests deterministic * Add duplicate timestamps --- .../test_files/aggregates_topk.slt | 38 ++++++++++--------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/datafusion/sqllogictest/test_files/aggregates_topk.slt b/datafusion/sqllogictest/test_files/aggregates_topk.slt index 6b6204e09f40..bd8f00e04158 100644 --- a/datafusion/sqllogictest/test_files/aggregates_topk.slt +++ b/datafusion/sqllogictest/test_files/aggregates_topk.slt @@ -26,9 +26,11 @@ CREATE TABLE traces(trace_id varchar, timestamp bigint, other bigint) AS VALUES ('a', NULL, NULL), ('a', 1, 1), ('a', -1, -1), -('b', 0, 0), -('c', 1, 1), +('b', -2, 0), +('c', 4, 1), +('c', 4, 2), ('c', 2, 2), +('c', 2, 4), ('b', 3, 3); statement ok @@ -57,34 +59,34 @@ GlobalLimitExec: skip=0, fetch=4 query TI select trace_id, MAX(timestamp) from traces group by trace_id order by MAX(timestamp) desc limit 4; ---- +c 4 b 3 -c 2 a 1 NULL 0 query TI select trace_id, MIN(timestamp) from traces group by trace_id order by MIN(timestamp) asc limit 4; ---- +b -2 a -1 NULL 0 -b 0 -c 1 +c 2 query TII select trace_id, other, MIN(timestamp) from traces group by trace_id, other order by MIN(timestamp) asc limit 4; ---- +b 0 -2 a -1 -1 -b 0 0 NULL 0 0 -c 1 1 +a 1 1 query TII select trace_id, MIN(other), MIN(timestamp) from traces group by trace_id order by MIN(timestamp), MIN(other) limit 4; ---- +b 0 -2 a -1 -1 NULL 0 0 -b 0 0 -c 1 1 +c 1 2 statement ok set datafusion.optimizer.enable_topk_aggregation = true; @@ -168,45 +170,45 @@ GlobalLimitExec: skip=0, fetch=4 query TI select trace_id, MAX(timestamp) from traces group by trace_id order by MAX(timestamp) desc limit 4; ---- +c 4 b 3 -c 2 a 1 NULL 0 query TI select trace_id, MIN(timestamp) from traces group by trace_id order by MIN(timestamp) asc limit 4; ---- +b -2 a -1 NULL 0 -b 0 -c 1 +c 2 query TI select trace_id, MAX(timestamp) from traces group by trace_id order by MAX(timestamp) desc limit 3; ---- +c 4 b 3 -c 2 a 1 query TI select trace_id, MIN(timestamp) from traces group by trace_id order by MIN(timestamp) asc limit 3; ---- +b -2 a -1 NULL 0 -b 0 query TII select trace_id, other, MIN(timestamp) from traces group by trace_id, other order by MIN(timestamp) asc limit 4; ---- +b 0 -2 a -1 -1 -b 0 
0 NULL 0 0 -c 1 1 +a 1 1 query TII select trace_id, MIN(other), MIN(timestamp) from traces group by trace_id order by MIN(timestamp), MIN(other) limit 4; ---- +b 0 -2 a -1 -1 NULL 0 0 -b 0 0 -c 1 1 +c 1 2 From bd38142de4da21fb7fefa1db997fa5c7eb95ffcc Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Fri, 26 Jan 2024 12:02:15 +0100 Subject: [PATCH 14/27] Add support for Postgres LIKE operators (#8894) * Add support for PG LIKE operators * Bump sqlparser dep from branch to merge commit --- datafusion/expr/src/operator.rs | 24 ++++ datafusion/expr/src/type_coercion/binary.rs | 7 ++ .../physical-expr/src/expressions/binary.rs | 105 +++++++++++++++++- datafusion/sql/src/expr/binary_op.rs | 4 + .../sqllogictest/test_files/predicates.slt | 24 ++++ .../substrait/src/logical_plan/producer.rs | 4 + docs/source/user-guide/sql/operators.md | 52 +++++++++ 7 files changed, 218 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/operator.rs b/datafusion/expr/src/operator.rs index 57888a11d426..a10312e23446 100644 --- a/datafusion/expr/src/operator.rs +++ b/datafusion/expr/src/operator.rs @@ -69,6 +69,14 @@ pub enum Operator { RegexNotMatch, /// Case insensitive regex not match RegexNotIMatch, + /// Case sensitive pattern match + LikeMatch, + /// Case insensitive pattern match + ILikeMatch, + /// Case sensitive pattern not match + NotLikeMatch, + /// Case insensitive pattern not match + NotILikeMatch, /// Bitwise and, like `&` BitwiseAnd, /// Bitwise or, like `|` @@ -100,6 +108,10 @@ impl Operator { Operator::GtEq => Some(Operator::Lt), Operator::IsDistinctFrom => Some(Operator::IsNotDistinctFrom), Operator::IsNotDistinctFrom => Some(Operator::IsDistinctFrom), + Operator::LikeMatch => Some(Operator::NotLikeMatch), + Operator::ILikeMatch => Some(Operator::NotILikeMatch), + Operator::NotLikeMatch => Some(Operator::LikeMatch), + Operator::NotILikeMatch => Some(Operator::ILikeMatch), Operator::Plus | Operator::Minus | Operator::Multiply @@ -192,6 +204,10 @@ impl Operator { | Operator::RegexIMatch | Operator::RegexNotMatch | Operator::RegexNotIMatch + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch | Operator::BitwiseAnd | Operator::BitwiseOr | Operator::BitwiseXor @@ -221,6 +237,10 @@ impl Operator { | Operator::RegexNotMatch | Operator::RegexIMatch | Operator::RegexNotIMatch + | Operator::LikeMatch + | Operator::ILikeMatch + | Operator::NotLikeMatch + | Operator::NotILikeMatch | Operator::BitwiseAnd | Operator::BitwiseOr | Operator::BitwiseShiftLeft @@ -253,6 +273,10 @@ impl fmt::Display for Operator { Operator::RegexIMatch => "~*", Operator::RegexNotMatch => "!~", Operator::RegexNotIMatch => "!~*", + Operator::LikeMatch => "~~", + Operator::ILikeMatch => "~~*", + Operator::NotLikeMatch => "!~~", + Operator::NotILikeMatch => "!~~*", Operator::IsDistinctFrom => "IS DISTINCT FROM", Operator::IsNotDistinctFrom => "IS NOT DISTINCT FROM", Operator::BitwiseAnd => "&", diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 6bacc1870079..70015c699296 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -101,6 +101,13 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result ) }) } + LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => { + regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + plan_datafusion_err!( + "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + ) + }) + } 
BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8c4078dbce8c..3f13030092c1 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -28,12 +28,12 @@ use crate::sort_properties::SortProperties; use crate::PhysicalExpr; use arrow::array::*; -use arrow::compute::cast; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; use arrow::compute::kernels::cmp::*; use arrow::compute::kernels::comparison::regexp_is_match_utf8; use arrow::compute::kernels::comparison::regexp_is_match_utf8_scalar; use arrow::compute::kernels::concat_elements::concat_elements_utf8; +use arrow::compute::{cast, ilike, like, nilike, nlike}; use arrow::datatypes::*; use arrow::record_batch::RecordBatch; @@ -281,6 +281,10 @@ impl PhysicalExpr for BinaryExpr { Operator::GtEq => return apply_cmp(&lhs, &rhs, gt_eq), Operator::IsDistinctFrom => return apply_cmp(&lhs, &rhs, distinct), Operator::IsNotDistinctFrom => return apply_cmp(&lhs, &rhs, not_distinct), + Operator::LikeMatch => return apply_cmp(&lhs, &rhs, like), + Operator::ILikeMatch => return apply_cmp(&lhs, &rhs, ilike), + Operator::NotLikeMatch => return apply_cmp(&lhs, &rhs, nlike), + Operator::NotILikeMatch => return apply_cmp(&lhs, &rhs, nilike), _ => {} } @@ -554,7 +558,8 @@ impl BinaryExpr { use Operator::*; match &self.op { IsDistinctFrom | IsNotDistinctFrom | Lt | LtEq | Gt | GtEq | Eq | NotEq - | Plus | Minus | Multiply | Divide | Modulo => unreachable!(), + | Plus | Minus | Multiply | Divide | Modulo | LikeMatch | ILikeMatch + | NotLikeMatch | NotILikeMatch => unreachable!(), And => { if left_data_type == &DataType::Boolean { boolean_op!(&left, &right, and_kleene) @@ -970,6 +975,102 @@ mod tests { DataType::Boolean, [false, false, false, false, true], ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::LikeMatch, + BooleanArray, + DataType::Boolean, + [true, false, false, true, false], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::ILikeMatch, + BooleanArray, + DataType::Boolean, + [true, true, false, true, true], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotLikeMatch, + BooleanArray, + DataType::Boolean, + [false, true, true, false, true], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotILikeMatch, + BooleanArray, + DataType::Boolean, + [false, false, true, false, false], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::LikeMatch, + BooleanArray, + DataType::Boolean, + [true, false, false, true, false], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::ILikeMatch, + BooleanArray, + DataType::Boolean, + [true, true, 
false, true, true], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotLikeMatch, + BooleanArray, + DataType::Boolean, + [false, true, true, false, true], + ); + test_coercion!( + LargeStringArray, + DataType::LargeUtf8, + vec!["abc"; 5], + LargeStringArray, + DataType::LargeUtf8, + vec!["a__", "A%BC", "A_BC", "abc", "a%C"], + Operator::NotILikeMatch, + BooleanArray, + DataType::Boolean, + [false, false, true, false, false], + ); test_coercion!( Int16Array, DataType::Int16, diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs index d9c85663e50e..78efaca09938 100644 --- a/datafusion/sql/src/expr/binary_op.rs +++ b/datafusion/sql/src/expr/binary_op.rs @@ -40,6 +40,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { BinaryOperator::PGRegexIMatch => Ok(Operator::RegexIMatch), BinaryOperator::PGRegexNotMatch => Ok(Operator::RegexNotMatch), BinaryOperator::PGRegexNotIMatch => Ok(Operator::RegexNotIMatch), + BinaryOperator::PGLikeMatch => Ok(Operator::LikeMatch), + BinaryOperator::PGILikeMatch => Ok(Operator::ILikeMatch), + BinaryOperator::PGNotLikeMatch => Ok(Operator::NotLikeMatch), + BinaryOperator::PGNotILikeMatch => Ok(Operator::NotILikeMatch), BinaryOperator::BitwiseAnd => Ok(Operator::BitwiseAnd), BinaryOperator::BitwiseOr => Ok(Operator::BitwiseOr), BinaryOperator::BitwiseXor => Ok(Operator::BitwiseXor), diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index b5347f997a5a..ba407f6d2852 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -220,6 +220,30 @@ SELECT * FROM test WHERE column1 !~* 'z' foo Barrr +query T +SELECT * FROM test WHERE column1 ~~ '__z%' +---- +Bazzz + +query T +SELECT * FROM test WHERE column1 ~~* '__z%' +---- +Bazzz +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~~ '__z%' +---- +foo +Barrr +ZZZZZ + +query T +SELECT * FROM test WHERE column1 !~~* '__z%' +---- +foo +Barrr + statement ok DROP TABLE test; diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index ab0e8c860858..fc9517c90a45 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -577,6 +577,10 @@ pub fn operator_to_name(op: Operator) -> &'static str { Operator::RegexIMatch => "regex_imatch", Operator::RegexNotMatch => "regex_not_match", Operator::RegexNotIMatch => "regex_not_imatch", + Operator::LikeMatch => "like_match", + Operator::ILikeMatch => "like_imatch", + Operator::NotLikeMatch => "like_not_match", + Operator::NotILikeMatch => "like_not_imatch", Operator::BitwiseAnd => "bitwise_and", Operator::BitwiseOr => "bitwise_or", Operator::StringConcat => "str_concat", diff --git a/docs/source/user-guide/sql/operators.md b/docs/source/user-guide/sql/operators.md index 265e56bb2c34..872ef55dd39d 100644 --- a/docs/source/user-guide/sql/operators.md +++ b/docs/source/user-guide/sql/operators.md @@ -263,6 +263,58 @@ Not Regex Case-Insensitive Match +---------------------------------------------------+ ``` +### `~~` + +Like Match + +```sql +❯ SELECT 'datafusion' ~~ 'dat_f%n'; ++---------------------------------------+ +| Utf8("datafusion") ~~ Utf8("dat_f%n") | ++---------------------------------------+ +| true | ++---------------------------------------+ +``` + +### `~~*` + 
+Case-Insensitive Like Match + +```sql +❯ SELECT 'datafusion' ~~* 'Dat_F%n'; ++----------------------------------------+ +| Utf8("datafusion") ~~* Utf8("Dat_F%n") | ++----------------------------------------+ +| true | ++----------------------------------------+ +``` + +### `!~~` + +Not Like Match + +```sql +❯ SELECT 'datafusion' !~~ 'Dat_F%n'; ++----------------------------------------+ +| Utf8("datafusion") !~~ Utf8("Dat_F%n") | ++----------------------------------------+ +| true | ++----------------------------------------+ +``` + +### `!~~*` + +Not Case-Insensitive Like Match + +```sql +❯ SELECT 'datafusion' !~~* 'Dat%F_n'; ++-----------------------------------------+ +| Utf8("datafusion") !~~* Utf8("Dat%F_n") | ++-----------------------------------------+ +| true | ++-----------------------------------------+ +``` + ## Logical Operators - [AND](#and) From 35c7b2c85bc436ef6544d0eaa2792675fe27a836 Mon Sep 17 00:00:00 2001 From: Whis Liao Date: Fri, 26 Jan 2024 19:38:22 +0800 Subject: [PATCH 15/27] bug: Datafusion doesn't respect case sensitive table references (#8964) * fix: table alias will process uppercase alias into lowercase * test: uppercase table alias * test: add table alias sqllogictest * test: move tests to sqllogictests * test: update cte.slt * chore: add test descriptions --- datafusion/sql/src/planner.rs | 2 +- datafusion/sqllogictest/test_files/cte.slt | 19 ++++++++++++++++ datafusion/sqllogictest/test_files/select.slt | 22 +++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index f1d4f3ff5619..012b1c51a5c1 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -298,7 +298,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let plan = self.apply_expr_alias(plan, alias.columns)?; LogicalPlanBuilder::from(plan) - .alias(self.normalizer.normalize(alias.name))? + .alias(TableReference::bare(self.normalizer.normalize(alias.name)))? 
.build() } diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index c62b56584682..d341833ba1b6 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -19,3 +19,22 @@ query II select * from (WITH source AS (select 1 as e) SELECT * FROM source) t1, (WITH source AS (select 1 as e) SELECT * FROM source) t2 ---- 1 1 + +# Ensure table aliases can be case sensitive +query I +WITH "T" AS (SELECT 1 a) SELECT "T".* FROM "T" +---- +1 + +# Ensure table aliases can be case sensitive +query TT +EXPLAIN WITH "NUMBERS" AS (SELECT 1 as a, 2 as b, 3 as c) SELECT "NUMBERS".* FROM "NUMBERS" +---- +logical_plan +Projection: NUMBERS.a, NUMBERS.b, NUMBERS.c +--SubqueryAlias: NUMBERS +----Projection: Int64(1) AS a, Int64(2) AS b, Int64(3) AS c +------EmptyRelation +physical_plan +ProjectionExec: expr=[1 as a, 2 as b, 3 as c] +--PlaceholderRowExec diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 50c62eff7772..2b47fec7ac7b 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -515,6 +515,28 @@ select ---- false true false true true false false true false true true false true true false false true +# select uppercase alias table +query I +SELECT "T".* from (SELECT 1 a) AS "T" +---- +1 + +# explain select uppercase alias table +query TT +EXPLAIN SELECT * FROM ((SELECT column1 FROM foo) "T1" CROSS JOIN (SELECT column2 FROM foo) "T2") AS "F" +---- +logical_plan +SubqueryAlias: F +--CrossJoin: +----SubqueryAlias: T1 +------TableScan: foo projection=[column1] +----SubqueryAlias: T2 +------TableScan: foo projection=[column2] +physical_plan +CrossJoinExec +--MemoryExec: partitions=1, partition_sizes=[1] +--MemoryExec: partitions=1, partition_sizes=[1] + # select NaNs query BBBB select (isnan('NaN'::double) AND 'NaN'::double > 0) a, (isnan('-NaN'::double) AND '-NaN'::double < 0) b, (isnan('NaN'::float) AND 'NaN'::float > 0) c, (isnan('-NaN'::float) AND '-NaN'::float < 0) d From 7005e2ede63ec5e4ced4289824788e58c5f8f4b4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 26 Jan 2024 06:38:43 -0500 Subject: [PATCH 16/27] Document parallelism and thread scheduling in the architecture guide (#8986) * Document parallelism and thread scheduling in the architecture guid * Apply suggestions from code review Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --------- Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- datafusion/core/src/lib.rs | 40 ++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 8fc724a22443..365f359f495d 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -342,16 +342,20 @@ //! //! [`ExecutionPlan`]s process data using the [Apache Arrow] memory //! format, making heavy use of functions from the [arrow] -//! crate. Calling [`execute`] produces 1 or more partitions of data, -//! consisting an operator that implements -//! [`SendableRecordBatchStream`]. -//! -//! Values are represented with [`ColumnarValue`], which are either +//! crate. Values are represented with [`ColumnarValue`], which are either //! [`ScalarValue`] (single constant values) or [`ArrayRef`] (Arrow //! Arrays). //! -//! Balanced parallelism is achieved using [`RepartitionExec`], which -//! 
implements a [Volcano style] "Exchange". +//! Calling [`execute`] produces 1 or more partitions of data, +//! as a [`SendableRecordBatchStream`], which implements a pull based execution +//! API. Calling `.next().await` will incrementally compute and return the next +//! [`RecordBatch`]. Balanced parallelism is achieved using [Volcano style] +//! "Exchange" operations implemented by [`RepartitionExec`]. +//! +//! While some recent research such as [Morsel-Driven Parallelism] describes challenges +//! with the pull style Volcano execution model on NUMA architectures, in practice DataFusion achieves +//! similar scalability as systems that use morsel driven approach such as DuckDB. +//! See the [DataFusion paper submitted to SIGMOD] for more details. //! //! [`execute`]: physical_plan::ExecutionPlan::execute //! [`SendableRecordBatchStream`]: crate::physical_plan::SendableRecordBatchStream @@ -364,8 +368,26 @@ //! //! [`RepartitionExec`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/repartition/struct.RepartitionExec.html //! [Volcano style]: https://w6113.github.io/files/papers/volcanoparallelism-89.pdf +//! [Morsel-Driven Parallelism]: https://db.in.tum.de/~leis/papers/morsels.pdf +//! [DataFusion paper submitted SIGMOD]: https://github.com/apache/arrow-datafusion/files/13874720/DataFusion_Query_Engine___SIGMOD_2024.pdf //! [implementors of `ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#implementors //! +//! ## Thread Scheduling +//! +//! DataFusion incrementally computes output from a [`SendableRecordBatchStream`] +//! with `target_partitions` threads. Parallelism is implementing using multiple +//! [Tokio] [`task`]s, which are executed by threads managed by a tokio Runtime. +//! While tokio is most commonly used +//! for asynchronous network I/O, its combination of an efficient, work-stealing +//! scheduler, first class compiler support for automatic continuation generation, +//! and exceptional performance makes it a compelling choice for CPU intensive +//! applications as well. This is explained in more detail in [Using Rustlang’s Async Tokio +//! Runtime for CPU-Bound Tasks]. +//! +//! [Tokio]: https://tokio.rs +//! [`task`]: tokio::task +//! [Using Rustlang’s Async Tokio Runtime for CPU-Bound Tasks]: https://thenewstack.io/using-rustlangs-async-tokio-runtime-for-cpu-bound-tasks/ +//! //! ## State Management and Configuration //! //! [`ConfigOptions`] contain options to control DataFusion's @@ -393,10 +415,12 @@ //! //! The amount of memory and temporary local disk space used by //! DataFusion when running a plan can be controlled using the -//! [`MemoryPool`] and [`DiskManager`]. +//! [`MemoryPool`] and [`DiskManager`]. Other runtime options can be +//! found on [`RuntimeEnv`]. //! //! [`DiskManager`]: crate::execution::DiskManager //! [`MemoryPool`]: crate::execution::memory_pool::MemoryPool +//! [`RuntimeEnv`]: crate::execution::runtime_env::RuntimeEnv //! [`ObjectStoreRegistry`]: crate::datasource::object_store::ObjectStoreRegistry //! //! 
## Crate Organization From ec6abece2dcfa68007b87c69eefa6b0d7333f628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:40:20 +0300 Subject: [PATCH 17/27] Fix None Projections in Projection Pushdown (#9005) * Fix none projections * Update select.slt --- .../physical_optimizer/projection_pushdown.rs | 26 +++++++++++++------ datafusion/sqllogictest/test_files/select.slt | 14 ++++++++++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 34d1af85565a..1d1bee61805e 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -163,8 +163,12 @@ fn try_swapping_with_csv( // This process can be moved into CsvExec, but it would be an overlap of their responsibility. all_alias_free_columns(projection.expr()).then(|| { let mut file_scan = csv.base_config().clone(); - let new_projections = - new_projections_for_columns(projection, &file_scan.projection); + let new_projections = new_projections_for_columns( + projection, + &file_scan + .projection + .unwrap_or((0..csv.schema().fields().len()).collect()), + ); file_scan.projection = Some(new_projections); Arc::new(CsvExec::new( @@ -188,8 +192,11 @@ fn try_swapping_with_memory( // This process can be moved into MemoryExec, but it would be an overlap of their responsibility. all_alias_free_columns(projection.expr()) .then(|| { - let new_projections = - new_projections_for_columns(projection, memory.projection()); + let all_projections = (0..memory.schema().fields().len()).collect(); + let new_projections = new_projections_for_columns( + projection, + memory.projection().as_ref().unwrap_or(&all_projections), + ); MemoryExec::try_new( memory.partitions(), @@ -216,8 +223,11 @@ fn try_swapping_with_streaming_table( .projection() .as_ref() .map(|i| i.as_ref().to_vec()); - let new_projections = - new_projections_for_columns(projection, &streaming_table_projections); + let new_projections = new_projections_for_columns( + projection, + &streaming_table_projections + .unwrap_or((0..streaming_table.schema().fields().len()).collect()), + ); let mut lex_orderings = vec![]; for lex_ordering in streaming_table.projected_output_ordering().into_iter() { @@ -833,7 +843,7 @@ fn all_alias_free_columns(exprs: &[(Arc, String)]) -> bool { /// ensure that all expressions are `Column` expressions without aliases. 
fn new_projections_for_columns( projection: &ProjectionExec, - source: &Option>, + source: &[usize], ) -> Vec { projection .expr() @@ -841,7 +851,7 @@ fn new_projections_for_columns( .filter_map(|(expr, _)| { expr.as_any() .downcast_ref::() - .and_then(|expr| source.as_ref().map(|proj| proj[expr.index()])) + .map(|expr| source[expr.index()]) }) .collect() } diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 2b47fec7ac7b..b7bbc0706576 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1569,3 +1569,17 @@ query I select count(1) from v; ---- 1 + +# run below query without logical optimizations +statement ok +set datafusion.optimizer.max_passes=0; + +statement ok +CREATE TABLE t(a int, b int); + +query I +select a from t; +---- + +statement ok +set datafusion.optimizer.max_passes=3; \ No newline at end of file From b3fe6aa68adb644d275d8914b3802c153b4a3a27 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 26 Jan 2024 08:42:23 -0800 Subject: [PATCH 18/27] Lead and Lag window functions should support default value with data type other than Int64 (#9001) --- datafusion/common/src/scalar.rs | 10 ++++++++++ datafusion/physical-expr/src/window/lead_lag.rs | 14 +++----------- datafusion/sqllogictest/test_files/window.slt | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 99b8cff20de7..2f9e374bd7f4 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -2364,6 +2364,16 @@ impl ScalarValue { ScalarValue::try_from_array(&cast_arr, 0) } + /// Try to cast this value to a ScalarValue of type `data_type` + pub fn cast_to(&self, data_type: &DataType) -> Result { + let cast_options = CastOptions { + safe: false, + format_options: Default::default(), + }; + let cast_arr = cast_with_options(&self.to_array()?, data_type, &cast_options)?; + ScalarValue::try_from_array(&cast_arr, 0) + } + fn eq_array_decimal( array: &ArrayRef, index: usize, diff --git a/datafusion/physical-expr/src/window/lead_lag.rs b/datafusion/physical-expr/src/window/lead_lag.rs index d8072be83950..c218b5555afc 100644 --- a/datafusion/physical-expr/src/window/lead_lag.rs +++ b/datafusion/physical-expr/src/window/lead_lag.rs @@ -23,9 +23,7 @@ use crate::PhysicalExpr; use arrow::array::ArrayRef; use arrow::compute::cast; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{ - arrow_datafusion_err, exec_err, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue}; use datafusion_expr::PartitionEvaluator; use std::any::Any; use std::cmp::min; @@ -238,15 +236,9 @@ fn get_default_value( dtype: &DataType, ) -> Result { match default_value { - Some(v) if v.data_type() == DataType::Int64 => { - ScalarValue::try_from_string(v.to_string(), dtype) - } - Some(v) if !v.data_type().is_null() => exec_err!( - "Unexpected datatype for default value: {}. 
Expected: Int64", - v.data_type() - ), + Some(v) if !v.data_type().is_null() => v.cast_to(dtype), // If None or Null datatype - _ => Ok(ScalarValue::try_from(dtype)?), + _ => ScalarValue::try_from(dtype), } } diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 303e8e035e7c..aec2fed73847 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -4004,3 +4004,17 @@ select lag(a, 1, null) over (order by a) from (select 1 a union all select 2 a) ---- NULL 1 + +# test LEAD window function with string default value +query T +select lead(a, 1, 'default') over (order by a) from (select '1' a union all select '2' a) +---- +2 +default + +# test LAG window function with string default value +query T +select lag(a, 1, 'default') over (order by a) from (select '1' a union all select '2' a) +---- +default +1 From c42bf4842b296ff34fac641a76ac4eb5dd7c8374 Mon Sep 17 00:00:00 2001 From: Suyan Date: Sat, 27 Jan 2024 00:49:43 +0800 Subject: [PATCH 19/27] chore: fix license badge in README (#9008) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 16d813bea5f5..cb89aff4aec7 100644 --- a/README.md +++ b/README.md @@ -20,18 +20,18 @@ # DataFusion [![Crates.io][crates-badge]][crates-url] -[![MIT licensed][mit-badge]][mit-url] +[![Apache licensed][license-badge]][license-url] [![Build Status][actions-badge]][actions-url] [![Discord chat][discord-badge]][discord-url] [crates-badge]: https://img.shields.io/crates/v/datafusion.svg [crates-url]: https://crates.io/crates/datafusion -[mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg -[mit-url]: https://github.com/apache/arrow-datafusion/blob/main/LICENSE.txt +[license-badge]: https://img.shields.io/badge/license-Apache%20v2-blue.svg +[license-url]: https://github.com/apache/arrow-datafusion/blob/main/LICENSE.txt [actions-badge]: https://github.com/apache/arrow-datafusion/actions/workflows/rust.yml/badge.svg [actions-url]: https://github.com/apache/arrow-datafusion/actions?query=branch%3Amain [discord-badge]: https://img.shields.io/discord/885562378132000778.svg?logo=discord&style=flat-square -[discord-url]: https://discord.com/channels/885562378132000778/885562378132000781 +[discord-url]: https://discord.com/invite/Qw5gKqHxUM [Website](https://github.com/apache/arrow-datafusion) | [Guides](https://github.com/apache/arrow-datafusion/tree/main/docs) | From fc752557204f4b52ab4cb38b5caff99b1b73b902 Mon Sep 17 00:00:00 2001 From: Curtis Lee Fulton Date: Fri, 26 Jan 2024 10:52:00 -0800 Subject: [PATCH 20/27] Minor: fix: #9010 - Optimizer schema change assert error is incorrect (#9012) * fix: #9010 - Optimizer schema change assert error has incorrect error * test: #9010 flip expected order of schema change errors --- datafusion/optimizer/src/optimizer.rs | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index f53e70ab6489..a192348f6936 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -294,7 +294,7 @@ impl Optimizer { self.optimize_recursively(rule, &new_plan, config) .and_then(|plan| { if let Some(plan) = &plan { - assert_schema_is_the_same(rule.name(), &new_plan, plan)?; + assert_schema_is_the_same(rule.name(), plan, &new_plan)?; } Ok(plan) }); @@ -501,15 +501,14 @@ mod tests { let err = opt.optimize(&plan, &config, 
&observe).unwrap_err(); assert_eq!( "Optimizer rule 'get table_scan rule' failed\ncaused by\nget table_scan rule\ncaused by\n\ - Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { fields: [], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - new schema: DFSchema { fields: [\ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ - metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }.\ - \nThis was likely caused by a bug in DataFusion's code \ - and we would welcome that you file an bug report in our issue tracker", + Internal error: Failed due to a difference in schemas, \ + original schema: DFSchema { fields: [\ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ + metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ + new schema: DFSchema { fields: [], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }.\ + \nThis was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker", err.strip_backtrace() ); } From 095e228090d2a8269d7475eb08b4d850f01173c8 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Fri, 26 Jan 2024 12:46:41 -0800 Subject: [PATCH 21/27] docs: fix array_position docs (#9003) --- docs/source/user-guide/sql/scalar_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 6c526e3ada75..4ad58a5067ed 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2176,7 +2176,7 @@ array_pop_back(array) ### `array_position` -Returns a string with an input string repeated a specified number. +Returns the position of the first occurrence of the specified element in the array. 
``` array_position(array, element) From ed2453901949101556c0c89b9acf442873c06ce8 Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 26 Jan 2024 13:57:07 -0800 Subject: [PATCH 22/27] Rename `CatalogList` to `CatalogProviderList` (#9002) * Rename `CatalogList` to `CatalogProviderList` --- datafusion-cli/src/catalog.rs | 11 ++++++---- .../examples/external_dependency/catalog.rs | 14 ++++++------- .../core/src/catalog/information_schema.rs | 6 +++--- datafusion/core/src/catalog/mod.rs | 20 +++++++++++-------- datafusion/core/src/execution/context/mod.rs | 19 +++++++++--------- docs/source/library-user-guide/catalogs.md | 16 +++++++-------- 6 files changed, 47 insertions(+), 39 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index d790e3118a11..cca2b44ad983 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -17,7 +17,7 @@ use async_trait::async_trait; use datafusion::catalog::schema::SchemaProvider; -use datafusion::catalog::{CatalogList, CatalogProvider}; +use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::datasource::listing::{ ListingTable, ListingTableConfig, ListingTableUrl, }; @@ -31,17 +31,20 @@ use std::sync::{Arc, Weak}; /// Wraps another catalog, automatically creating table providers /// for local files if needed pub struct DynamicFileCatalog { - inner: Arc, + inner: Arc, state: Weak>, } impl DynamicFileCatalog { - pub fn new(inner: Arc, state: Weak>) -> Self { + pub fn new( + inner: Arc, + state: Weak>, + ) -> Self { Self { inner, state } } } -impl CatalogList for DynamicFileCatalog { +impl CatalogProviderList for DynamicFileCatalog { fn as_any(&self) -> &dyn Any { self } diff --git a/datafusion-examples/examples/external_dependency/catalog.rs b/datafusion-examples/examples/external_dependency/catalog.rs index aa9fd103a50c..29e505fb1dcb 100644 --- a/datafusion-examples/examples/external_dependency/catalog.rs +++ b/datafusion-examples/examples/external_dependency/catalog.rs @@ -24,7 +24,7 @@ use datafusion::{ arrow::util::pretty, catalog::{ schema::SchemaProvider, - {CatalogList, CatalogProvider}, + {CatalogProviderList, CatalogProvider}, }, datasource::{ file_format::{csv::CsvFormat, parquet::ParquetFormat, FileFormat}, @@ -53,9 +53,9 @@ async fn main() -> Result<()> { .unwrap(); let mut ctx = SessionContext::new(); let state = ctx.state(); - let catlist = Arc::new(CustomCatalogList::new()); + let catlist = Arc::new(CustomCatalogProvderList::new()); // use our custom catalog list for context. each context has a single catalog list. - // context will by default have MemoryCatalogList + // context will by default have [`MemoryCatalogProviderList`] ctx.register_catalog_list(catlist.clone()); // initialize our catalog and schemas @@ -250,18 +250,18 @@ impl CatalogProvider for DirCatalog { } } } -/// Catalog lists holds multiple catalogs. Each context has a single catalog list. -struct CustomCatalogList { +/// Catalog lists holds multiple catalog providers. Each context has a single catalog list. 
+struct CustomCatalogProviderList { catalogs: RwLock>>, } -impl CustomCatalogList { +impl CustomCatalogProviderList { fn new() -> Self { Self { catalogs: RwLock::new(HashMap::new()), } } } -impl CatalogList for CustomCatalogList { +impl CatalogProviderList for CustomCatalogProviderList { fn as_any(&self) -> &dyn Any { self } diff --git a/datafusion/core/src/catalog/information_schema.rs b/datafusion/core/src/catalog/information_schema.rs index 3a8fef2d25ab..0e8dbb123ed8 100644 --- a/datafusion/core/src/catalog/information_schema.rs +++ b/datafusion/core/src/catalog/information_schema.rs @@ -39,7 +39,7 @@ use crate::{ physical_plan::streaming::PartitionStream, }; -use super::{schema::SchemaProvider, CatalogList}; +use super::{schema::SchemaProvider, CatalogProviderList}; pub(crate) const INFORMATION_SCHEMA: &str = "information_schema"; pub(crate) const TABLES: &str = "tables"; @@ -62,7 +62,7 @@ pub struct InformationSchemaProvider { impl InformationSchemaProvider { /// Creates a new [`InformationSchemaProvider`] for the provided `catalog_list` - pub fn new(catalog_list: Arc) -> Self { + pub fn new(catalog_list: Arc) -> Self { Self { config: InformationSchemaConfig { catalog_list }, } @@ -71,7 +71,7 @@ impl InformationSchemaProvider { #[derive(Clone)] struct InformationSchemaConfig { - catalog_list: Arc, + catalog_list: Arc, } impl InformationSchemaConfig { diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs index da7e1f5e2193..6eba43f7df79 100644 --- a/datafusion/core/src/catalog/mod.rs +++ b/datafusion/core/src/catalog/mod.rs @@ -33,7 +33,7 @@ use std::sync::Arc; /// /// Please see the documentation on `CatalogProvider` for details of /// implementing a custom catalog. -pub trait CatalogList: Sync + Send { +pub trait CatalogProviderList: Sync + Send { /// Returns the catalog list as [`Any`] /// so that it can be downcast to a specific implementation. fn as_any(&self) -> &dyn Any; @@ -53,14 +53,18 @@ pub trait CatalogList: Sync + Send { fn catalog(&self, name: &str) -> Option>; } +/// See [`CatalogProviderList`] +#[deprecated(since = "35.0.0", note = "use [`CatalogProviderList`] instead")] +pub trait CatalogList: CatalogProviderList {} + /// Simple in-memory list of catalogs -pub struct MemoryCatalogList { +pub struct MemoryCatalogProviderList { /// Collection of catalogs containing schemas and ultimately TableProviders pub catalogs: DashMap>, } -impl MemoryCatalogList { - /// Instantiates a new `MemoryCatalogList` with an empty collection of catalogs +impl MemoryCatalogProviderList { + /// Instantiates a new `MemoryCatalogProviderList` with an empty collection of catalogs pub fn new() -> Self { Self { catalogs: DashMap::new(), @@ -68,13 +72,13 @@ impl MemoryCatalogList { } } -impl Default for MemoryCatalogList { +impl Default for MemoryCatalogProviderList { fn default() -> Self { Self::new() } } -impl CatalogList for MemoryCatalogList { +impl CatalogProviderList for MemoryCatalogProviderList { fn as_any(&self) -> &dyn Any { self } @@ -105,14 +109,14 @@ impl CatalogList for MemoryCatalogList { /// types, and how to access the data. 
/// /// The Catalog API consists: -/// * [`CatalogList`]: a collection of `CatalogProvider`s +/// * [`CatalogProviderList`]: a collection of `CatalogProvider`s /// * [`CatalogProvider`]: a collection of `SchemaProvider`s (sometimes called a "database" in other systems) /// * [`SchemaProvider`]: a collection of `TableProvider`s (often called a "schema" in other systems) /// * [`TableProvider]`: individual tables /// /// # Implementing Catalogs /// -/// To implement a catalog, you implement at least one of the [`CatalogList`], +/// To implement a catalog, you implement at least one of the [`CatalogProviderList`], /// [`CatalogProvider`] and [`SchemaProvider`] traits and register them /// appropriately the [`SessionContext`]. /// diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 9b623d7a51ec..b5ad6174821b 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -24,7 +24,7 @@ mod json; mod parquet; use crate::{ - catalog::{CatalogList, MemoryCatalogList}, + catalog::{CatalogProviderList, MemoryCatalogProviderList}, datasource::{ cte_worktable::CteWorkTable, function::{TableFunction, TableFunctionImpl}, @@ -1173,8 +1173,8 @@ impl SessionContext { Arc::downgrade(&self.state) } - /// Register [`CatalogList`] in [`SessionState`] - pub fn register_catalog_list(&mut self, catalog_list: Arc) { + /// Register [`CatalogProviderList`] in [`SessionState`] + pub fn register_catalog_list(&mut self, catalog_list: Arc) { self.state.write().catalog_list = catalog_list; } } @@ -1245,7 +1245,7 @@ pub struct SessionState { /// Responsible for planning `LogicalPlan`s, and `ExecutionPlan` query_planner: Arc, /// Collection of catalogs containing schemas and ultimately TableProviders - catalog_list: Arc, + catalog_list: Arc, /// Table Functions table_functions: HashMap>, /// Scalar functions that are registered with the context @@ -1285,7 +1285,8 @@ impl SessionState { /// Returns new [`SessionState`] using the provided /// [`SessionConfig`] and [`RuntimeEnv`]. 
pub fn new_with_config_rt(config: SessionConfig, runtime: Arc) -> Self { - let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc; + let catalog_list = + Arc::new(MemoryCatalogProviderList::new()) as Arc; Self::new_with_config_rt_and_catalog_list(config, runtime, catalog_list) } @@ -1297,11 +1298,11 @@ impl SessionState { } /// Returns new [`SessionState`] using the provided - /// [`SessionConfig`], [`RuntimeEnv`], and [`CatalogList`] + /// [`SessionConfig`], [`RuntimeEnv`], and [`CatalogProviderList`] pub fn new_with_config_rt_and_catalog_list( config: SessionConfig, runtime: Arc, - catalog_list: Arc, + catalog_list: Arc, ) -> Self { let session_id = Uuid::new_v4().to_string(); @@ -1366,7 +1367,7 @@ impl SessionState { pub fn with_config_rt_and_catalog_list( config: SessionConfig, runtime: Arc, - catalog_list: Arc, + catalog_list: Arc, ) -> Self { Self::new_with_config_rt_and_catalog_list(config, runtime, catalog_list) } @@ -1840,7 +1841,7 @@ impl SessionState { } /// Return catalog list - pub fn catalog_list(&self) -> Arc { + pub fn catalog_list(&self) -> Arc { self.catalog_list.clone() } diff --git a/docs/source/library-user-guide/catalogs.md b/docs/source/library-user-guide/catalogs.md index e53d16366350..d30e26f1964a 100644 --- a/docs/source/library-user-guide/catalogs.md +++ b/docs/source/library-user-guide/catalogs.md @@ -23,7 +23,7 @@ This section describes how to create and manage catalogs, schemas, and tables in ## General Concepts -CatalogList, Catalogs, schemas, and tables are organized in a hierarchy. A CatalogList contains catalogs, a catalog contains schemas and a schema contains tables. +CatalogProviderList, Catalogs, schemas, and tables are organized in a hierarchy. A CatalogProviderList contains catalog providers, a catalog provider contains schemas and a schema contains tables. DataFusion comes with a basic in memory catalog functionality in the [`catalog` module]. You can use these in memory implementations as is, or extend DataFusion with your own catalog implementations, for example based on local files or files on remote object storage. @@ -31,9 +31,9 @@ DataFusion comes with a basic in memory catalog functionality in the [`catalog` Similarly to other concepts in DataFusion, you'll implement various traits to create your own catalogs, schemas, and tables. The following sections describe the traits you'll need to implement. -The `CatalogList` trait has methods to register new catalogs, get a catalog by name and list all catalogs .The `CatalogProvider` trait has methods to set a schema to a name, get a schema by name, and list all schemas. The `SchemaProvider`, which can be registered with a `CatalogProvider`, has methods to set a table to a name, get a table by name, list all tables, deregister a table, and check for a table's existence. The `TableProvider` trait has methods to scan underlying data and use it in DataFusion. The `TableProvider` trait is covered in more detail [here](./custom-table-providers.md). +The `CatalogProviderList` trait has methods to register new catalogs, get a catalog by name and list all catalogs .The `CatalogProvider` trait has methods to set a schema to a name, get a schema by name, and list all schemas. The `SchemaProvider`, which can be registered with a `CatalogProvider`, has methods to set a table to a name, get a table by name, list all tables, deregister a table, and check for a table's existence. The `TableProvider` trait has methods to scan underlying data and use it in DataFusion. 
The `TableProvider` trait is covered in more detail [here](./custom-table-providers.md). -In the following example, we'll implement an in memory catalog, starting with the `SchemaProvider` trait as we need one to register with the `CatalogProvider`. Finally we will implement `CatalogList` to register the `CatalogProvider`. +In the following example, we'll implement an in memory catalog, starting with the `SchemaProvider` trait as we need one to register with the `CatalogProvider`. Finally we will implement `CatalogProviderList` to register the `CatalogProvider`. ## Implementing `MemorySchemaProvider` @@ -169,19 +169,19 @@ impl CatalogProvider for MemoryCatalogProvider { Again, this is fairly straightforward, as there's an underlying data structure to store the state, via key-value pairs. -## Implementing `MemoryCatalogList` +## Implementing `MemoryCatalogProviderList` ```rust -pub struct MemoryCatalogList { +pub struct MemoryCatalogProviderList { /// Collection of catalogs containing schemas and ultimately TableProviders pub catalogs: DashMap>, } ``` -With that the `CatalogList` trait can be implemented. +With that the `CatalogProviderList` trait can be implemented. ```rust -impl CatalogList for MemoryCatalogList { +impl CatalogProviderList for MemoryCatalogProviderList { fn as_any(&self) -> &dyn Any { self } @@ -213,4 +213,4 @@ To recap, you need to: 1. Implement the `TableProvider` trait to create a table provider, or use an existing one. 2. Implement the `SchemaProvider` trait to create a schema provider, or use an existing one. 3. Implement the `CatalogProvider` trait to create a catalog provider, or use an existing one. -4. Implement the `CatalogList` trait to create a CatalogList, or use an existing one. +4. Implement the `CatalogProviderList` trait to create a CatalogProviderList, or use an existing one. From a7a74fa522aaef07e6605f414308f3c99bd1ea06 Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Sat, 27 Jan 2024 17:17:11 +0100 Subject: [PATCH 23/27] Safeguard against potential inexact row count being smaller than exact null count (#9007) * Safeguard against potential inexact row count being smaller than exact null count * Add test hitting the former overflow panic --- datafusion/common/src/stats.rs | 7 +- datafusion/physical-plan/src/joins/utils.rs | 204 +++++++++++--------- 2 files changed, 120 insertions(+), 91 deletions(-) diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index 7ad8992ca9ae..a10e05a55c64 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -48,14 +48,15 @@ impl Precision { /// Transform the value in this [`Precision`] object, if one exists, using /// the given function. Preserves the exactness state. 
- pub fn map(self, f: F) -> Precision + pub fn map(self, f: F) -> Precision where - F: Fn(T) -> T, + F: Fn(T) -> U, + U: Debug + Clone + PartialEq + Eq + PartialOrd, { match self { Precision::Exact(val) => Precision::Exact(f(val)), Precision::Inexact(val) => Precision::Inexact(f(val)), - _ => self, + _ => Precision::::Absent, } } diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 6ab08d3db022..cd987ab40d45 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -955,7 +955,12 @@ fn max_distinct_count( let result = match num_rows { Precision::Absent => Precision::Absent, Precision::Inexact(count) => { - Precision::Inexact(count - stats.null_count.get_value().unwrap_or(&0)) + // To safeguard against inexact number of rows (e.g. 0) being smaller than + // an exact null count we need to do a checked subtraction. + match count.checked_sub(*stats.null_count.get_value().unwrap_or(&0)) { + None => Precision::Inexact(0), + Some(non_null_count) => Precision::Inexact(non_null_count), + } } Precision::Exact(count) => { let count = count - stats.null_count.get_value().unwrap_or(&0); @@ -1468,6 +1473,7 @@ mod tests { use arrow::error::{ArrowError, Result as ArrowResult}; use arrow_schema::SortOptions; + use datafusion_common::stats::Precision::{Absent, Exact, Inexact}; use datafusion_common::{arrow_datafusion_err, arrow_err, ScalarValue}; fn check(left: &[Column], right: &[Column], on: &[(Column, Column)]) -> Result<()> { @@ -1635,25 +1641,26 @@ mod tests { } fn create_column_stats( - min: Option, - max: Option, - distinct_count: Option, + min: Precision, + max: Precision, + distinct_count: Precision, + null_count: Precision, ) -> ColumnStatistics { ColumnStatistics { - distinct_count: distinct_count - .map(Precision::Inexact) - .unwrap_or(Precision::Absent), - min_value: min - .map(|size| Precision::Inexact(ScalarValue::from(size))) - .unwrap_or(Precision::Absent), - max_value: max - .map(|size| Precision::Inexact(ScalarValue::from(size))) - .unwrap_or(Precision::Absent), - ..Default::default() + distinct_count, + min_value: min.map(ScalarValue::from), + max_value: max.map(ScalarValue::from), + null_count, } } - type PartialStats = (usize, Option, Option, Option); + type PartialStats = ( + usize, + Precision, + Precision, + Precision, + Precision, + ); // This is mainly for validating the all edge cases of the estimation, but // more advanced (and real world test cases) are below where we need some control @@ -1670,133 +1677,156 @@ mod tests { // // distinct(left) == NaN, distinct(right) == NaN ( - (10, Some(1), Some(10), None), - (10, Some(1), Some(10), None), - Some(Precision::Inexact(10)), + (10, Inexact(1), Inexact(10), Absent, Absent), + (10, Inexact(1), Inexact(10), Absent, Absent), + Some(Inexact(10)), ), // range(left) > range(right) ( - (10, Some(6), Some(10), None), - (10, Some(8), Some(10), None), - Some(Precision::Inexact(20)), + (10, Inexact(6), Inexact(10), Absent, Absent), + (10, Inexact(8), Inexact(10), Absent, Absent), + Some(Inexact(20)), ), // range(right) > range(left) ( - (10, Some(8), Some(10), None), - (10, Some(6), Some(10), None), - Some(Precision::Inexact(20)), + (10, Inexact(8), Inexact(10), Absent, Absent), + (10, Inexact(6), Inexact(10), Absent, Absent), + Some(Inexact(20)), ), // range(left) > len(left), range(right) > len(right) ( - (10, Some(1), Some(15), None), - (20, Some(1), Some(40), None), - Some(Precision::Inexact(10)), + (10, Inexact(1), Inexact(15), 
Absent, Absent), + (20, Inexact(1), Inexact(40), Absent, Absent), + Some(Inexact(10)), ), // When we have distinct count. ( - (10, Some(1), Some(10), Some(10)), - (10, Some(1), Some(10), Some(10)), - Some(Precision::Inexact(10)), + (10, Inexact(1), Inexact(10), Inexact(10), Absent), + (10, Inexact(1), Inexact(10), Inexact(10), Absent), + Some(Inexact(10)), ), // distinct(left) > distinct(right) ( - (10, Some(1), Some(10), Some(5)), - (10, Some(1), Some(10), Some(2)), - Some(Precision::Inexact(20)), + (10, Inexact(1), Inexact(10), Inexact(5), Absent), + (10, Inexact(1), Inexact(10), Inexact(2), Absent), + Some(Inexact(20)), ), // distinct(right) > distinct(left) ( - (10, Some(1), Some(10), Some(2)), - (10, Some(1), Some(10), Some(5)), - Some(Precision::Inexact(20)), + (10, Inexact(1), Inexact(10), Inexact(2), Absent), + (10, Inexact(1), Inexact(10), Inexact(5), Absent), + Some(Inexact(20)), ), // min(left) < 0 (range(left) > range(right)) ( - (10, Some(-5), Some(5), None), - (10, Some(1), Some(5), None), - Some(Precision::Inexact(10)), + (10, Inexact(-5), Inexact(5), Absent, Absent), + (10, Inexact(1), Inexact(5), Absent, Absent), + Some(Inexact(10)), ), // min(right) < 0, max(right) < 0 (range(right) > range(left)) ( - (10, Some(-25), Some(-20), None), - (10, Some(-25), Some(-15), None), - Some(Precision::Inexact(10)), + (10, Inexact(-25), Inexact(-20), Absent, Absent), + (10, Inexact(-25), Inexact(-15), Absent, Absent), + Some(Inexact(10)), ), // range(left) < 0, range(right) >= 0 // (there isn't a case where both left and right ranges are negative // so one of them is always going to work, this just proves negative // ranges with bigger absolute values are not are not accidentally used). ( - (10, Some(-10), Some(0), None), - (10, Some(0), Some(10), Some(5)), - Some(Precision::Inexact(10)), + (10, Inexact(-10), Inexact(0), Absent, Absent), + (10, Inexact(0), Inexact(10), Inexact(5), Absent), + Some(Inexact(10)), ), // range(left) = 1, range(right) = 1 ( - (10, Some(1), Some(1), None), - (10, Some(1), Some(1), None), - Some(Precision::Inexact(100)), + (10, Inexact(1), Inexact(1), Absent, Absent), + (10, Inexact(1), Inexact(1), Absent, Absent), + Some(Inexact(100)), ), // // Edge cases // ========== // // No column level stats. - ((10, None, None, None), (10, None, None, None), None), + ( + (10, Absent, Absent, Absent, Absent), + (10, Absent, Absent, Absent, Absent), + None, + ), // No min or max (or both). - ((10, None, None, Some(3)), (10, None, None, Some(3)), None), ( - (10, Some(2), None, Some(3)), - (10, None, Some(5), Some(3)), + (10, Absent, Absent, Inexact(3), Absent), + (10, Absent, Absent, Inexact(3), Absent), + None, + ), + ( + (10, Inexact(2), Absent, Inexact(3), Absent), + (10, Absent, Inexact(5), Inexact(3), Absent), None, ), ( - (10, None, Some(3), Some(3)), - (10, Some(1), None, Some(3)), + (10, Absent, Inexact(3), Inexact(3), Absent), + (10, Inexact(1), Absent, Inexact(3), Absent), + None, + ), + ( + (10, Absent, Inexact(3), Absent, Absent), + (10, Inexact(1), Absent, Absent, Absent), None, ), - ((10, None, Some(3), None), (10, Some(1), None, None), None), // Non overlapping min/max (when exact=False). 
( - (10, Some(0), Some(10), None), - (10, Some(11), Some(20), None), - Some(Precision::Inexact(0)), + (10, Inexact(0), Inexact(10), Absent, Absent), + (10, Inexact(11), Inexact(20), Absent, Absent), + Some(Inexact(0)), ), ( - (10, Some(11), Some(20), None), - (10, Some(0), Some(10), None), - Some(Precision::Inexact(0)), + (10, Inexact(11), Inexact(20), Absent, Absent), + (10, Inexact(0), Inexact(10), Absent, Absent), + Some(Inexact(0)), ), // distinct(left) = 0, distinct(right) = 0 ( - (10, Some(1), Some(10), Some(0)), - (10, Some(1), Some(10), Some(0)), + (10, Inexact(1), Inexact(10), Inexact(0), Absent), + (10, Inexact(1), Inexact(10), Inexact(0), Absent), None, ), + // Inexact row count < exact null count with absent distinct count + ( + (0, Inexact(1), Inexact(10), Absent, Exact(5)), + (10, Inexact(1), Inexact(10), Absent, Absent), + Some(Inexact(0)), + ), ]; for (left_info, right_info, expected_cardinality) in cases { let left_num_rows = left_info.0; - let left_col_stats = - vec![create_column_stats(left_info.1, left_info.2, left_info.3)]; + let left_col_stats = vec![create_column_stats( + left_info.1, + left_info.2, + left_info.3, + left_info.4, + )]; let right_num_rows = right_info.0; let right_col_stats = vec![create_column_stats( right_info.1, right_info.2, right_info.3, + right_info.4, )]; assert_eq!( estimate_inner_join_cardinality( Statistics { - num_rows: Precision::Inexact(left_num_rows), - total_byte_size: Precision::Absent, + num_rows: Inexact(left_num_rows), + total_byte_size: Absent, column_statistics: left_col_stats.clone(), }, Statistics { - num_rows: Precision::Inexact(right_num_rows), - total_byte_size: Precision::Absent, + num_rows: Inexact(right_num_rows), + total_byte_size: Absent, column_statistics: right_col_stats.clone(), }, ), @@ -1814,9 +1844,7 @@ mod tests { ); assert_eq!( - partial_join_stats - .clone() - .map(|s| Precision::Inexact(s.num_rows)), + partial_join_stats.clone().map(|s| Inexact(s.num_rows)), expected_cardinality.clone() ); assert_eq!( @@ -1832,13 +1860,13 @@ mod tests { #[test] fn test_inner_join_cardinality_multiple_column() -> Result<()> { let left_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(100)), - create_column_stats(Some(100), Some(500), Some(150)), + create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent), + create_column_stats(Inexact(100), Inexact(500), Inexact(150), Absent), ]; let right_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(50)), - create_column_stats(Some(100), Some(500), Some(200)), + create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent), + create_column_stats(Inexact(100), Inexact(500), Inexact(200), Absent), ]; // We have statistics about 4 columns, where the highest distinct @@ -1916,15 +1944,15 @@ mod tests { ]; let left_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(100)), - create_column_stats(Some(0), Some(500), Some(500)), - create_column_stats(Some(1000), Some(10000), None), + create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent), + create_column_stats(Inexact(0), Inexact(500), Inexact(500), Absent), + create_column_stats(Inexact(1000), Inexact(10000), Absent, Absent), ]; let right_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(50)), - create_column_stats(Some(0), Some(2000), Some(2500)), - create_column_stats(Some(0), Some(100), None), + create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent), + create_column_stats(Inexact(0), Inexact(2000), Inexact(2500), Absent), + 
create_column_stats(Inexact(0), Inexact(100), Absent, Absent), ]; for (join_type, expected_num_rows) in cases { @@ -1965,15 +1993,15 @@ mod tests { // Join on a=c, x=y (ignores b/d) where x and y does not intersect let left_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(100)), - create_column_stats(Some(0), Some(500), Some(500)), - create_column_stats(Some(1000), Some(10000), None), + create_column_stats(Inexact(0), Inexact(100), Inexact(100), Absent), + create_column_stats(Inexact(0), Inexact(500), Inexact(500), Absent), + create_column_stats(Inexact(1000), Inexact(10000), Absent, Absent), ]; let right_col_stats = vec![ - create_column_stats(Some(0), Some(100), Some(50)), - create_column_stats(Some(0), Some(2000), Some(2500)), - create_column_stats(Some(0), Some(100), None), + create_column_stats(Inexact(0), Inexact(100), Inexact(50), Absent), + create_column_stats(Inexact(0), Inexact(2000), Inexact(2500), Absent), + create_column_stats(Inexact(0), Inexact(100), Absent, Absent), ]; let join_on = vec![ From a6cdd0d77f7ca3f8fb09b2358e3758b8829f8a81 Mon Sep 17 00:00:00 2001 From: Matt Gapp <61894094+matthewgapp@users.noreply.github.com> Date: Sat, 27 Jan 2024 08:23:04 -0800 Subject: [PATCH 24/27] Recursive CTEs: Stage 3 - add execution support (#8840) * rebase all execution and preceding recursive cte work add config flag for recursive ctes update docs from script update slt test for doc change restore testing pin add sql -> logical plan support * impl cte as work table * move SharedState to continuance * impl WorkTableState wip: readying pr to implement only logical plan fix sql integration test wip: add sql test for logical plan wip: format test assertion wip: remove uncessary with qualifier method some docs more docs Add comments to `RecursiveQuery` Update datfusion-cli Cargo.lock Fix clippy better errors and comments add sql -> logical plan support * impl cte as work table * move SharedState to continuance * impl WorkTableState wip: readying pr to implement only logical plan fix sql integration test wip: add sql test for logical plan wip: format test assertion wip: remove uncessary with qualifier method some docs more docs impl execution support add sql -> logical plan support * impl cte as work table * move SharedState to continuance * impl WorkTableState wip: readying pr to implement only logical plan partway through porting over isidentical's work Continuing implementation with fixes and improvements Lint fixes ensure that repartitions are not added immediately after RecursiveExec in the physical-plan add trivial sqllogictest more recursive tests remove test that asserts recursive cte should fail additional cte test wip: remove tokio from physical plan dev deps format cargo tomls fix issue where CTE could not be referenced more than 1 time wip: fixes after rebase but tpcds_physical_q54 keeps overflowing its stack Impl NamedRelation as CteWorkTable * impl cte as work table * move SharedState to continuance * impl WorkTableState * upd * assign work table state * upd * upd fix min repro but still broken on larger test case set config in sql logic tests clean up cte slt tests fixes fix option add group by test case and more test case files wip add window function recursive cte example simplify stream impl for recrusive query stream add explain to trivial test case move setting of recursive ctes to slt file and add test to ensure multiple record batches are produced each iteration remove tokio dep and remove mut lint, comments and remove tokio stream update submodule pin 
to feat branch that contains csvs update submodule pin to feat branch that contains csvs * error if recursive ctes are nested * error if recursive cte is referenced multiple times within the recursive term * wip * fix rebase * move testing files into main repo * update testing pin to main pin * tweaks --- .../core/src/datasource/cte_worktable.rs | 10 +- .../physical_optimizer/projection_pushdown.rs | 2 +- datafusion/core/src/physical_planner.rs | 9 +- .../core/tests/data/recursive_cte/balance.csv | 5 + .../core/tests/data/recursive_cte/growth.csv | 4 + .../core/tests/data/recursive_cte/prices.csv | 101 +++ .../core/tests/data/recursive_cte/sales.csv | 10 + .../tests/data/recursive_cte/salespersons.csv | 8 + .../core/tests/data/recursive_cte/time.csv | 5 + datafusion/physical-plan/src/lib.rs | 2 + .../physical-plan/src/recursive_query.rs | 377 +++++++++++ datafusion/physical-plan/src/work_table.rs | 192 ++++++ datafusion/sql/src/query.rs | 1 - datafusion/sqllogictest/test_files/cte.slt | 613 ++++++++++++++++++ 14 files changed, 1330 insertions(+), 9 deletions(-) create mode 100644 datafusion/core/tests/data/recursive_cte/balance.csv create mode 100644 datafusion/core/tests/data/recursive_cte/growth.csv create mode 100644 datafusion/core/tests/data/recursive_cte/prices.csv create mode 100644 datafusion/core/tests/data/recursive_cte/sales.csv create mode 100644 datafusion/core/tests/data/recursive_cte/salespersons.csv create mode 100644 datafusion/core/tests/data/recursive_cte/time.csv create mode 100644 datafusion/physical-plan/src/recursive_query.rs create mode 100644 datafusion/physical-plan/src/work_table.rs diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/core/src/datasource/cte_worktable.rs index de13e73e003b..71075839b9a0 100644 --- a/datafusion/core/src/datasource/cte_worktable.rs +++ b/datafusion/core/src/datasource/cte_worktable.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_common::not_impl_err; +use datafusion_physical_plan::work_table::WorkTableExec; use crate::{ error::Result, @@ -30,8 +30,6 @@ use crate::{ physical_plan::ExecutionPlan, }; -use datafusion_common::DataFusionError; - use crate::datasource::{TableProvider, TableType}; use crate::execution::context::SessionState; @@ -84,7 +82,11 @@ impl TableProvider for CteWorkTable { _filters: &[Expr], _limit: Option, ) -> Result> { - not_impl_err!("scan not implemented for CteWorkTable yet") + // TODO: pushdown filters and limits + Ok(Arc::new(WorkTableExec::new( + self.name.clone(), + self.table_schema.clone(), + ))) } fn supports_filter_pushdown( diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 1d1bee61805e..2d20c487e473 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -248,7 +248,7 @@ fn try_swapping_with_streaming_table( StreamingTableExec::try_new( streaming_table.partition_schema().clone(), streaming_table.partitions().clone(), - Some(&new_projections), + Some(new_projections.as_ref()), lex_orderings, streaming_table.is_infinite(), ) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index ac7827fafc2c..ac3b7ebaeac1 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -58,6 +58,7 @@ use crate::physical_plan::joins::{ use 
crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::memory::MemoryExec; use crate::physical_plan::projection::ProjectionExec; +use crate::physical_plan::recursive_query::RecursiveQueryExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::union::UnionExec; @@ -894,7 +895,7 @@ impl DefaultPhysicalPlanner { let filter = FilterExec::try_new(runtime_expr, physical_input)?; Ok(Arc::new(filter.with_default_selectivity(selectivity)?)) } - LogicalPlan::Union(Union { inputs, .. }) => { + LogicalPlan::Union(Union { inputs, schema: _ }) => { let physical_plans = self.create_initial_plan_multi(inputs.iter().map(|lp| lp.as_ref()), session_state).await?; Ok(Arc::new(UnionExec::new(physical_plans))) @@ -1288,8 +1289,10 @@ impl DefaultPhysicalPlanner { Ok(plan) } } - LogicalPlan::RecursiveQuery(RecursiveQuery { name: _, static_term: _, recursive_term: _, is_distinct: _,.. }) => { - not_impl_err!("Physical counterpart of RecursiveQuery is not implemented yet") + LogicalPlan::RecursiveQuery(RecursiveQuery { name, static_term, recursive_term, is_distinct,.. }) => { + let static_term = self.create_initial_plan(static_term, session_state).await?; + let recursive_term = self.create_initial_plan(recursive_term, session_state).await?; + Ok(Arc::new(RecursiveQueryExec::try_new(name.clone(), static_term, recursive_term, *is_distinct)?)) } }; exec_plan diff --git a/datafusion/core/tests/data/recursive_cte/balance.csv b/datafusion/core/tests/data/recursive_cte/balance.csv new file mode 100644 index 000000000000..a77c742dd2e5 --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/balance.csv @@ -0,0 +1,5 @@ +time,name,account_balance +1,John,100 +1,Tim,200 +2,John,300 +2,Tim,400 \ No newline at end of file diff --git a/datafusion/core/tests/data/recursive_cte/growth.csv b/datafusion/core/tests/data/recursive_cte/growth.csv new file mode 100644 index 000000000000..912208bad2eb --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/growth.csv @@ -0,0 +1,4 @@ +name,account_growth +John,3 +Tim,20 +Eliza,150 \ No newline at end of file diff --git a/datafusion/core/tests/data/recursive_cte/prices.csv b/datafusion/core/tests/data/recursive_cte/prices.csv new file mode 100644 index 000000000000..b294ecfad774 --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/prices.csv @@ -0,0 +1,101 @@ +Index,product,price,prices_row_num +1,Holden,334.8,1 +2,Mercedes-Benz,623.22,2 +3,Aston Martin,363.48,3 +4,GMC,615.67,4 +5,Lincoln,521.13,5 +6,Mitsubishi,143.05,6 +7,Infiniti,861.82,7 +8,Ford,330.57,8 +9,GMC,136.87,9 +10,Toyota,106.29,10 +11,Pontiac,686.95,11 +12,Ford,197.48,12 +13,Honda,774.42,13 +14,Dodge,854.26,14 +15,Bentley,628.82,15 +16,Chevrolet,756.82,16 +17,Volkswagen,438.51,17 +18,Mazda,156.15,18 +19,Hyundai,322.43,19 +20,Oldsmobile,979.95,20 +21,Geo,359.59,21 +22,Ford,960.75,22 +23,Subaru,106.75,23 +24,Pontiac,13.4,24 +25,Mercedes-Benz,858.46,25 +26,Subaru,55.72,26 +27,BMW,316.69,27 +28,Chevrolet,290.32,28 +29,Mercury,296.8,29 +30,Dodge,410.78,30 +31,Oldsmobile,18.07,31 +32,Subaru,442.22,32 +33,Dodge,93.29,33 +34,Honda,282.9,34 +35,Chevrolet,750.87,35 +36,Lexus,249.82,36 +37,Ford,732.33,37 +38,Toyota,680.78,38 +39,Nissan,657.01,39 +40,Mazda,200.76,40 +41,Nissan,251.44,41 +42,Buick,714.44,42 +43,Ford,436.2,43 +44,Volvo,865.53,44 +45,Saab,471.52,45 +46,Mercedes-Benz,51.13,46 +47,Chrysler,943.52,47 +48,Lamborghini,181.6,48 +49,Hyundai,634.89,49 +50,Ford,757.58,50 +51,Porsche,294.64,51 
+52,Ford,261.34,52 +53,Chrysler,822.01,53 +54,Audi,430.68,54 +55,Mitsubishi,69.12,55 +56,Mazda,723.16,56 +57,Mazda,711.46,57 +58,Land Rover,435.15,58 +59,Buick,189.58,59 +60,GMC,651.92,60 +61,Mazda,491.37,61 +62,BMW,346.18,62 +63,Ford,456.25,63 +64,Ford,10.65,64 +65,Mazda,985.39,65 +66,Mercedes-Benz,955.79,66 +67,Honda,550.95,67 +68,Mitsubishi,127.6,68 +69,Mercedes-Benz,840.65,69 +70,Infiniti,647.45,70 +71,Bentley,827.26,71 +72,Lincoln,822.22,72 +73,Plymouth,970.55,73 +74,Ford,595.05,74 +75,Maybach,808.46,75 +76,Chevrolet,341.48,76 +77,Jaguar,759.03,77 +78,Land Rover,625.01,78 +79,Lincoln,289.13,79 +80,Suzuki,285.24,80 +81,GMC,253.4,81 +82,Oldsmobile,174.76,82 +83,Lincoln,434.17,83 +84,Dodge,887.38,84 +85,Mercedes-Benz,308.65,85 +86,GMC,182.71,86 +87,Ford,619.62,87 +88,Lexus,228.63,88 +89,Hyundai,901.06,89 +90,Chevrolet,615.65,90 +91,GMC,460.19,91 +92,Mercedes-Benz,729.28,92 +93,Dodge,414.69,93 +94,Maserati,300.83,94 +95,Suzuki,503.64,95 +96,Audi,275.05,96 +97,Ford,303.25,97 +98,Lotus,101.01,98 +99,Lincoln,721.05,99 +100,Kia,833.31,100 \ No newline at end of file diff --git a/datafusion/core/tests/data/recursive_cte/sales.csv b/datafusion/core/tests/data/recursive_cte/sales.csv new file mode 100644 index 000000000000..12299c39d635 --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/sales.csv @@ -0,0 +1,10 @@ +region_id,salesperson_id,sale_amount +101,1,1000 +102,2,500 +101,2,700 +103,3,800 +102,4,300 +101,4,400 +102,5,600 +103,6,500 +101,7,900 \ No newline at end of file diff --git a/datafusion/core/tests/data/recursive_cte/salespersons.csv b/datafusion/core/tests/data/recursive_cte/salespersons.csv new file mode 100644 index 000000000000..dc941c450246 --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/salespersons.csv @@ -0,0 +1,8 @@ +salesperson_id,manager_id +1, +2,1 +3,1 +4,2 +5,2 +6,3 +7,3 \ No newline at end of file diff --git a/datafusion/core/tests/data/recursive_cte/time.csv b/datafusion/core/tests/data/recursive_cte/time.csv new file mode 100644 index 000000000000..21026bd41a4a --- /dev/null +++ b/datafusion/core/tests/data/recursive_cte/time.csv @@ -0,0 +1,5 @@ +time,other +1,foo +2,bar +4,baz +5,qux diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 01d4f8941802..0a9eab5c8633 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -61,6 +61,7 @@ pub mod metrics; mod ordering; pub mod placeholder_row; pub mod projection; +pub mod recursive_query; pub mod repartition; pub mod sorts; pub mod stream; @@ -71,6 +72,7 @@ pub mod union; pub mod unnest; pub mod values; pub mod windows; +pub mod work_table; pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; pub use crate::metrics::Metric; diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs new file mode 100644 index 000000000000..614ab990ac49 --- /dev/null +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -0,0 +1,377 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the recursive query plan + +use std::any::Any; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::expressions::PhysicalSortExpr; +use super::metrics::BaselineMetrics; +use super::RecordBatchStream; +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + work_table::{WorkTable, WorkTableExec}, + SendableRecordBatchStream, Statistics, +}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; +use futures::{ready, Stream, StreamExt}; + +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +/// Recursive query execution plan. +/// +/// This plan has two components: a base part (the static term) and +/// a dynamic part (the recursive term). The execution will start from +/// the base, and as long as the previous iteration produced at least +/// a single new row (taking care of the distinction) the recursive +/// part will be continuously executed. +/// +/// Before each execution of the dynamic part, the rows from the previous +/// iteration will be available in a "working table" (not a real table, +/// can be only accessed using a continuance operation). +/// +/// Note that there won't be any limit or checks applied to detect +/// an infinite recursion, so it is up to the planner to ensure that +/// it won't happen. +#[derive(Debug)] +pub struct RecursiveQueryExec { + /// Name of the query handler + name: String, + /// The working table of cte + work_table: Arc, + /// The base part (static term) + static_term: Arc, + /// The dynamic part (recursive term) + recursive_term: Arc, + /// Distinction + is_distinct: bool, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl RecursiveQueryExec { + /// Create a new RecursiveQueryExec + pub fn try_new( + name: String, + static_term: Arc, + recursive_term: Arc, + is_distinct: bool, + ) -> Result { + // Each recursive query needs its own work table + let work_table = Arc::new(WorkTable::new()); + // Use the same work table for both the WorkTableExec and the recursive term + let recursive_term = assign_work_table(recursive_term, work_table.clone())?; + Ok(RecursiveQueryExec { + name, + static_term, + recursive_term, + is_distinct, + work_table, + metrics: ExecutionPlanMetricsSet::new(), + }) + } +} + +impl ExecutionPlan for RecursiveQueryExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.static_term.schema() + } + + fn children(&self) -> Vec> { + vec![self.static_term.clone(), self.recursive_term.clone()] + } + + // Distribution on a recursive query is really tricky to handle. + // For now, we are going to use a single partition but in the + // future we might find a better way to handle this. + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + // TODO: control these hints and see whether we can + // infer some from the child plans (static/recurisve terms). 
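For orientation, the execution model described in the comments above boils down to a work-table loop: run the static term once, then repeatedly run the recursive term against the rows produced by the previous iteration until an iteration yields no rows. A simplified, synchronous sketch under that reading (plain i64 rows stand in for Arrow record batches; the function names are hypothetical, not DataFusion APIs):

// Static (base) term of a toy query: WITH RECURSIVE nodes AS (SELECT 1 ...)
fn base() -> Vec<i64> {
    vec![1]
}

// Recursive term: SELECT id + 1 FROM nodes WHERE id < 10, reading the work table.
fn step(prev: &[i64]) -> Vec<i64> {
    prev.iter().filter(|id| **id < 10).map(|id| id + 1).collect()
}

// Work-table loop: keep iterating while the previous iteration produced rows.
fn recursive_query(
    run_static: fn() -> Vec<i64>,
    run_recursive: fn(&[i64]) -> Vec<i64>,
) -> Vec<i64> {
    let mut results = run_static();
    let mut work_table = results.clone();
    while !work_table.is_empty() {
        let produced = run_recursive(&work_table);
        results.extend_from_slice(&produced);
        work_table = produced; // the next iteration only sees the new rows
    }
    results
}

fn main() {
    assert_eq!(recursive_query(base, step), (1..=10).collect::<Vec<i64>>());
}

As in the real operator, there is no cycle detection in this sketch; a recursive term that always produces rows would loop forever.
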
+ fn maintains_input_order(&self) -> Vec { + vec![false, false] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false, false] + } + + fn required_input_distribution(&self) -> Vec { + vec![ + datafusion_physical_expr::Distribution::SinglePartition, + datafusion_physical_expr::Distribution::SinglePartition, + ] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(RecursiveQueryExec { + name: self.name.clone(), + static_term: children[0].clone(), + recursive_term: children[1].clone(), + is_distinct: self.is_distinct, + work_table: self.work_table.clone(), + metrics: self.metrics.clone(), + })) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + // TODO: we might be able to handle multiple partitions in the future. + if partition != 0 { + return Err(DataFusionError::Internal(format!( + "RecursiveQueryExec got an invalid partition {} (expected 0)", + partition + ))); + } + + let static_stream = self.static_term.execute(partition, context.clone())?; + let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); + Ok(Box::pin(RecursiveQueryStream::new( + context, + self.work_table.clone(), + self.recursive_term.clone(), + static_stream, + baseline_metrics, + ))) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +impl DisplayAs for RecursiveQueryExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!( + f, + "RecursiveQueryExec: name={}, is_distinct={}", + self.name, self.is_distinct + ) + } + } + } +} + +/// The actual logic of the recursive queries happens during the streaming +/// process. A simplified version of the algorithm is the following: +/// +/// buffer = [] +/// +/// while batch := static_stream.next(): +/// buffer.push(batch) +/// yield buffer +/// +/// while buffer.len() > 0: +/// sender, receiver = Channel() +/// register_continuation(handle_name, receiver) +/// sender.send(buffer.drain()) +/// recursive_stream = recursive_term.execute() +/// while batch := recursive_stream.next(): +/// buffer.append(batch) +/// yield buffer +/// +struct RecursiveQueryStream { + /// The context to be used for managing handlers & executing new tasks + task_context: Arc, + /// The working table state, representing the self referencing cte table + work_table: Arc, + /// The dynamic part (recursive term) as is (without being executed) + recursive_term: Arc, + /// The static part (static term) as a stream. If the processing of this + /// part is completed, then it will be None. + static_stream: Option, + /// The dynamic part (recursive term) as a stream. If the processing of this + /// part has not started yet, or has been completed, then it will be None. + recursive_stream: Option, + /// The schema of the output. + schema: SchemaRef, + /// In-memory buffer for storing a copy of the current results. Will be + /// cleared after each iteration. + buffer: Vec, + // /// Metrics. 
+ _baseline_metrics: BaselineMetrics, +} + +impl RecursiveQueryStream { + /// Create a new recursive query stream + fn new( + task_context: Arc, + work_table: Arc, + recursive_term: Arc, + static_stream: SendableRecordBatchStream, + baseline_metrics: BaselineMetrics, + ) -> Self { + let schema = static_stream.schema(); + Self { + task_context, + work_table, + recursive_term, + static_stream: Some(static_stream), + recursive_stream: None, + schema, + buffer: vec![], + _baseline_metrics: baseline_metrics, + } + } + + /// Push a clone of the given batch to the in memory buffer, and then return + /// a poll with it. + fn push_batch( + mut self: std::pin::Pin<&mut Self>, + batch: RecordBatch, + ) -> Poll>> { + self.buffer.push(batch.clone()); + Poll::Ready(Some(Ok(batch))) + } + + /// Start polling for the next iteration, will be called either after the static term + /// is completed or another term is completed. It will follow the algorithm above on + /// to check whether the recursion has ended. + fn poll_next_iteration( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + let total_length = self + .buffer + .iter() + .fold(0, |acc, batch| acc + batch.num_rows()); + + if total_length == 0 { + return Poll::Ready(None); + } + + // Update the work table with the current buffer + let batches = self.buffer.drain(..).collect(); + self.work_table.write(batches); + + // We always execute (and re-execute iteratively) the first partition. + // Downstream plans should not expect any partitioning. + let partition = 0; + + self.recursive_stream = Some( + self.recursive_term + .execute(partition, self.task_context.clone())?, + ); + self.poll_next(cx) + } +} + +fn assign_work_table( + plan: Arc, + work_table: Arc, +) -> Result> { + let mut work_table_refs = 0; + plan.transform_down_mut(&mut |plan| { + if let Some(exec) = plan.as_any().downcast_ref::() { + if work_table_refs > 0 { + not_impl_err!( + "Multiple recursive references to the same CTE are not supported" + ) + } else { + work_table_refs += 1; + Ok(Transformed::Yes(Arc::new( + exec.with_work_table(work_table.clone()), + ))) + } + } else if plan.as_any().is::() { + not_impl_err!("Recursive queries cannot be nested") + } else { + Ok(Transformed::No(plan)) + } + }) +} + +impl Stream for RecursiveQueryStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + // TODO: we should use this poll to record some metrics! + if let Some(static_stream) = &mut self.static_stream { + // While the static term's stream is available, we'll be forwarding the batches from it (also + // saving them for the initial iteration of the recursive term). + let batch_result = ready!(static_stream.poll_next_unpin(cx)); + match &batch_result { + None => { + // Once this is done, we can start running the setup for the recursive term. 
+ self.static_stream = None; + self.poll_next_iteration(cx) + } + Some(Ok(batch)) => self.push_batch(batch.clone()), + _ => Poll::Ready(batch_result), + } + } else if let Some(recursive_stream) = &mut self.recursive_stream { + let batch_result = ready!(recursive_stream.poll_next_unpin(cx)); + match batch_result { + None => { + self.recursive_stream = None; + self.poll_next_iteration(cx) + } + Some(Ok(batch)) => self.push_batch(batch.clone()), + _ => Poll::Ready(batch_result), + } + } else { + Poll::Ready(None) + } + } +} + +impl RecordBatchStream for RecursiveQueryStream { + /// Get the schema + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests {} diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs new file mode 100644 index 000000000000..c74a596f3dae --- /dev/null +++ b/datafusion/physical-plan/src/work_table.rs @@ -0,0 +1,192 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines the work table query plan + +use std::any::Any; +use std::sync::{Arc, Mutex}; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::Partitioning; + +use crate::memory::MemoryStream; +use crate::{DisplayAs, DisplayFormatType, ExecutionPlan}; + +use super::expressions::PhysicalSortExpr; + +use super::{ + metrics::{ExecutionPlanMetricsSet, MetricsSet}, + SendableRecordBatchStream, Statistics, +}; +use datafusion_common::{internal_err, DataFusionError, Result}; + +/// The name is from PostgreSQL's terminology. +/// See +/// This table serves as a mirror or buffer between each iteration of a recursive query. +#[derive(Debug)] +pub(super) struct WorkTable { + batches: Mutex>>, +} + +impl WorkTable { + /// Create a new work table. + pub(super) fn new() -> Self { + Self { + batches: Mutex::new(None), + } + } + + /// Take the previously written batches from the work table. + /// This will be called by the [`WorkTableExec`] when it is executed. + fn take(&self) -> Vec { + let batches = self.batches.lock().unwrap().take().unwrap_or_default(); + batches + } + + /// Write the results of a recursive query iteration to the work table. + pub(super) fn write(&self, input: Vec) { + self.batches.lock().unwrap().replace(input); + } +} + +/// A temporary "working table" operation where the input data will be +/// taken from the named handle during the execution and will be re-published +/// as is (kind of like a mirror). +/// +/// Most notably used in the implementation of recursive queries where the +/// underlying relation does not exist yet but the data will come as the previous +/// term is evaluated. 
This table will be used such that the recursive plan +/// will register a receiver in the task context and this plan will use that +/// receiver to get the data and stream it back up so that the batches are available +/// in the next iteration. +#[derive(Clone, Debug)] +pub struct WorkTableExec { + /// Name of the relation handler + name: String, + /// The schema of the stream + schema: SchemaRef, + /// The work table + work_table: Arc, + /// Execution metrics + metrics: ExecutionPlanMetricsSet, +} + +impl WorkTableExec { + /// Create a new execution plan for a worktable exec. + pub fn new(name: String, schema: SchemaRef) -> Self { + Self { + name, + schema, + metrics: ExecutionPlanMetricsSet::new(), + work_table: Arc::new(WorkTable::new()), + } + } + + pub(super) fn with_work_table(&self, work_table: Arc) -> Self { + Self { + name: self.name.clone(), + schema: self.schema.clone(), + metrics: ExecutionPlanMetricsSet::new(), + work_table, + } + } +} + +impl DisplayAs for WorkTableExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "WorkTableExec: name={}", self.name) + } + } + } +} + +impl ExecutionPlan for WorkTableExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + + fn children(&self) -> Vec> { + vec![] + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(1) + } + + fn maintains_input_order(&self) -> Vec { + vec![false] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false] + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + Ok(self.clone()) + } + + /// Stream the batches that were written to the work table. + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> Result { + // WorkTable streams must be the plan base. 
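The per-iteration handoff rests on the `write`/`take` pair defined above: the recursive term writes the batches it produced, and the next iteration's work-table scan takes (and thereby clears) them. A self-contained sketch of that exchange, with plain values in place of Arrow record batches (an illustrative stand-in, not the actual type):

use std::sync::Mutex;

// Toy mirror of the WorkTable handoff: `write` replaces the stored rows,
// `take` consumes them so a second take in the same iteration sees nothing.
struct ToyWorkTable<T> {
    batches: Mutex<Option<Vec<T>>>,
}

impl<T> ToyWorkTable<T> {
    fn new() -> Self {
        Self { batches: Mutex::new(None) }
    }
    fn write(&self, input: Vec<T>) {
        self.batches.lock().unwrap().replace(input);
    }
    fn take(&self) -> Vec<T> {
        self.batches.lock().unwrap().take().unwrap_or_default()
    }
}

fn main() {
    let wt = ToyWorkTable::new();
    wt.write(vec![1, 2, 3]);
    assert_eq!(wt.take(), vec![1, 2, 3]); // consumed by the scan ...
    assert!(wt.take().is_empty());        // ... so it is gone until the next write
}
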
+ if partition != 0 { + return internal_err!( + "WorkTableExec got an invalid partition {partition} (expected 0)" + ); + } + + let batches = self.work_table.take(); + Ok(Box::pin(MemoryStream::try_new( + batches, + self.schema.clone(), + None, + )?)) + } + + fn metrics(&self) -> Option { + Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) + } +} + +#[cfg(test)] +mod tests {} diff --git a/datafusion/sql/src/query.rs b/datafusion/sql/src/query.rs index af0b91ae6c7e..ea8edd0771c8 100644 --- a/datafusion/sql/src/query.rs +++ b/datafusion/sql/src/query.rs @@ -54,7 +54,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let set_expr = query.body; if let Some(with) = query.with { // Process CTEs from top to bottom - let is_recursive = with.recursive; for cte in with.cte_tables { diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index d341833ba1b6..6b9db5589391 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -38,3 +38,616 @@ Projection: NUMBERS.a, NUMBERS.b, NUMBERS.c physical_plan ProjectionExec: expr=[1 as a, 2 as b, 3 as c] --PlaceholderRowExec + + + +# enable recursive CTEs +statement ok +set datafusion.execution.enable_recursive_ctes = true; + +# trivial recursive CTE works +query I rowsort +WITH RECURSIVE nodes AS ( + SELECT 1 as id + UNION ALL + SELECT id + 1 as id + FROM nodes + WHERE id < 10 +) +SELECT * FROM nodes +---- +1 +10 +2 +3 +4 +5 +6 +7 +8 +9 + +# explain trivial recursive CTE +query TT +EXPLAIN WITH RECURSIVE nodes AS ( + SELECT 1 as id + UNION ALL + SELECT id + 1 as id + FROM nodes + WHERE id < 10 +) +SELECT * FROM nodes +---- +logical_plan +Projection: nodes.id +--SubqueryAlias: nodes +----RecursiveQuery: is_distinct=false +------Projection: Int64(1) AS id +--------EmptyRelation +------Projection: nodes.id + Int64(1) AS id +--------Filter: nodes.id < Int64(10) +----------TableScan: nodes +physical_plan +RecursiveQueryExec: name=nodes, is_distinct=false +--ProjectionExec: expr=[1 as id] +----PlaceholderRowExec +--CoalescePartitionsExec +----ProjectionExec: expr=[id@0 + 1 as id] +------CoalesceBatchesExec: target_batch_size=8192 +--------FilterExec: id@0 < 10 +----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +------------WorkTableExec: name=nodes + +# setup +statement ok +CREATE EXTERNAL TABLE balance STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/balance.csv' + +# setup +statement ok +CREATE EXTERNAL TABLE growth STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/growth.csv' + +# setup +statement ok +set datafusion.execution.batch_size = 2; + +# recursive CTE with static term derived from table works. +# use explain to ensure that batch size is set to 2. 
This should produce multiple batches per iteration since the input +# table 'balances' has 4 rows +query TT +EXPLAIN WITH RECURSIVE balances AS ( + SELECT * from balance + UNION ALL + SELECT time + 1 as time, name, account_balance + 10 as account_balance + FROM balances + WHERE time < 10 +) +SELECT * FROM balances +ORDER BY time, name, account_balance +---- +logical_plan +Sort: balances.time ASC NULLS LAST, balances.name ASC NULLS LAST, balances.account_balance ASC NULLS LAST +--Projection: balances.time, balances.name, balances.account_balance +----SubqueryAlias: balances +------RecursiveQuery: is_distinct=false +--------Projection: balance.time, balance.name, balance.account_balance +----------TableScan: balance +--------Projection: balances.time + Int64(1) AS time, balances.name, balances.account_balance + Int64(10) AS account_balance +----------Filter: balances.time < Int64(10) +------------TableScan: balances +physical_plan +SortExec: expr=[time@0 ASC NULLS LAST,name@1 ASC NULLS LAST,account_balance@2 ASC NULLS LAST] +--RecursiveQueryExec: name=balances, is_distinct=false +----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/recursive_cte/balance.csv]]}, projection=[time, name, account_balance], has_header=true +----CoalescePartitionsExec +------ProjectionExec: expr=[time@0 + 1 as time, name@1 as name, account_balance@2 + 10 as account_balance] +--------CoalesceBatchesExec: target_batch_size=2 +----------FilterExec: time@0 < 10 +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------WorkTableExec: name=balances + +# recursive CTE with static term derived from table works +# note that this is run with batch size set to 2. This should produce multiple batches per iteration since the input +# table 'balances' has 4 rows +query ITI +WITH RECURSIVE balances AS ( + SELECT * from balance + UNION ALL + SELECT time + 1 as time, name, account_balance + 10 as account_balance + FROM balances + WHERE time < 10 +) +SELECT * FROM balances +ORDER BY time, name, account_balance +---- +1 John 100 +1 Tim 200 +2 John 110 +2 John 300 +2 Tim 210 +2 Tim 400 +3 John 120 +3 John 310 +3 Tim 220 +3 Tim 410 +4 John 130 +4 John 320 +4 Tim 230 +4 Tim 420 +5 John 140 +5 John 330 +5 Tim 240 +5 Tim 430 +6 John 150 +6 John 340 +6 Tim 250 +6 Tim 440 +7 John 160 +7 John 350 +7 Tim 260 +7 Tim 450 +8 John 170 +8 John 360 +8 Tim 270 +8 Tim 460 +9 John 180 +9 John 370 +9 Tim 280 +9 Tim 470 +10 John 190 +10 John 380 +10 Tim 290 +10 Tim 480 + +# reset batch size to default +statement ok +set datafusion.execution.batch_size = 8182; + +# recursive CTE with recursive join works +query ITI +WITH RECURSIVE balances AS ( + SELECT time as time, name as name, account_balance as account_balance + FROM balance + UNION ALL + SELECT time + 1 as time, balances.name, account_balance + growth.account_growth as account_balance + FROM balances + JOIN growth + ON balances.name = growth.name + WHERE time < 10 +) +SELECT * FROM balances +ORDER BY time, name, account_balance +---- +1 John 100 +1 Tim 200 +2 John 103 +2 John 300 +2 Tim 220 +2 Tim 400 +3 John 106 +3 John 303 +3 Tim 240 +3 Tim 420 +4 John 109 +4 John 306 +4 Tim 260 +4 Tim 440 +5 John 112 +5 John 309 +5 Tim 280 +5 Tim 460 +6 John 115 +6 John 312 +6 Tim 300 +6 Tim 480 +7 John 118 +7 John 315 +7 Tim 320 +7 Tim 500 +8 John 121 +8 John 318 +8 Tim 340 +8 Tim 520 +9 John 124 +9 John 321 +9 Tim 360 +9 Tim 540 +10 John 127 +10 John 324 +10 Tim 380 +10 Tim 560 + +# recursive CTE with aggregations works +query I rowsort +WITH 
RECURSIVE nodes AS ( + SELECT 1 as id + UNION ALL + SELECT id + 1 as id + FROM nodes + WHERE id < 10 +) +SELECT sum(id) FROM nodes +---- +55 + +# setup +statement ok +CREATE TABLE t(a BIGINT) AS VALUES(1),(2),(3); + +# referencing CTE multiple times does not error +query II rowsort +WITH RECURSIVE my_cte AS ( + SELECT a from t + UNION ALL + SELECT a+2 as a + FROM my_cte + WHERE a<5 +) +SELECT * FROM my_cte t1, my_cte +---- +1 1 +1 2 +1 3 +1 3 +1 4 +1 5 +1 5 +1 6 +2 1 +2 2 +2 3 +2 3 +2 4 +2 5 +2 5 +2 6 +3 1 +3 1 +3 2 +3 2 +3 3 +3 3 +3 3 +3 3 +3 4 +3 4 +3 5 +3 5 +3 5 +3 5 +3 6 +3 6 +4 1 +4 2 +4 3 +4 3 +4 4 +4 5 +4 5 +4 6 +5 1 +5 1 +5 2 +5 2 +5 3 +5 3 +5 3 +5 3 +5 4 +5 4 +5 5 +5 5 +5 5 +5 5 +5 6 +5 6 +6 1 +6 2 +6 3 +6 3 +6 4 +6 5 +6 5 +6 6 + +# CTE within recursive CTE works and does not result in 'index out of bounds: the len is 0 but the index is 0' +query I +WITH RECURSIVE "recursive_cte" AS ( + SELECT 1 as "val" + UNION ALL ( + WITH "sub_cte" AS ( + SELECT + time, + 1 as "val" + FROM + (SELECT DISTINCT "time" FROM "balance") + ) + SELECT + 2 as "val" + FROM + "recursive_cte" + FULL JOIN "sub_cte" ON 1 = 1 + WHERE + "recursive_cte"."val" < 2 + ) +) +SELECT + * +FROM + "recursive_cte"; +---- +1 +2 +2 + +# setup +statement ok +CREATE EXTERNAL TABLE prices STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/prices.csv' + +# CTE within window function inside nested CTE works. This test demonstrates using a nested window function to recursively iterate over a column. +query RRII +WITH RECURSIVE "recursive_cte" AS ( + ( + WITH "min_prices_row_num_cte" AS ( + SELECT + MIN("prices"."prices_row_num") AS "prices_row_num" + FROM + "prices" + ), + "min_prices_row_num_cte_second" AS ( + SELECT + MIN("prices"."prices_row_num") AS "prices_row_num_advancement" + FROM + "prices" + WHERE + "prices"."prices_row_num" > ( + SELECT + "prices_row_num" + FROM + "min_prices_row_num_cte" + ) + ) + SELECT + 0.0 AS "beg", + (0.0 + 50) AS "end", + ( + SELECT + "prices_row_num" + FROM + "min_prices_row_num_cte" + ) AS "prices_row_num", + ( + SELECT + "prices_row_num_advancement" + FROM + "min_prices_row_num_cte_second" + ) AS "prices_row_num_advancement" + FROM + "prices" + WHERE + "prices"."prices_row_num" = ( + SELECT + DISTINCT "prices_row_num" + FROM + "min_prices_row_num_cte" + ) + ) + UNION ALL ( + WITH "min_prices_row_num_cte" AS ( + SELECT + "prices"."prices_row_num" AS "prices_row_num", + LEAD("prices"."prices_row_num", 1) OVER ( + ORDER BY "prices_row_num" + ) AS "prices_row_num_advancement" + FROM + ( + SELECT + DISTINCT "prices_row_num" + FROM + "prices" + ) AS "prices" + ) + SELECT + "recursive_cte"."end" AS "beg", + ("recursive_cte"."end" + 50) AS "end", + "min_prices_row_num_cte"."prices_row_num" AS "prices_row_num", + "min_prices_row_num_cte"."prices_row_num_advancement" AS "prices_row_num_advancement" + FROM + "recursive_cte" + FULL JOIN "prices" ON "prices"."prices_row_num" = "recursive_cte"."prices_row_num_advancement" + FULL JOIN "min_prices_row_num_cte" ON "min_prices_row_num_cte"."prices_row_num" = COALESCE( + "prices"."prices_row_num", + "recursive_cte"."prices_row_num_advancement" + ) + WHERE + "recursive_cte"."prices_row_num_advancement" IS NOT NULL + ) +) +SELECT + DISTINCT * +FROM + "recursive_cte" +ORDER BY + "prices_row_num" ASC; +---- +0 50 1 2 +50 100 2 3 +100 150 3 4 +150 200 4 5 +200 250 5 6 +250 300 6 7 +300 350 7 8 +350 400 8 9 +400 450 9 10 +450 500 10 11 +500 550 11 12 +550 600 12 13 +600 650 13 14 +650 700 14 15 +700 750 15 16 +750 800 16 17 +800 850 17 18 +850 
900 18 19 +900 950 19 20 +950 1000 20 21 +1000 1050 21 22 +1050 1100 22 23 +1100 1150 23 24 +1150 1200 24 25 +1200 1250 25 26 +1250 1300 26 27 +1300 1350 27 28 +1350 1400 28 29 +1400 1450 29 30 +1450 1500 30 31 +1500 1550 31 32 +1550 1600 32 33 +1600 1650 33 34 +1650 1700 34 35 +1700 1750 35 36 +1750 1800 36 37 +1800 1850 37 38 +1850 1900 38 39 +1900 1950 39 40 +1950 2000 40 41 +2000 2050 41 42 +2050 2100 42 43 +2100 2150 43 44 +2150 2200 44 45 +2200 2250 45 46 +2250 2300 46 47 +2300 2350 47 48 +2350 2400 48 49 +2400 2450 49 50 +2450 2500 50 51 +2500 2550 51 52 +2550 2600 52 53 +2600 2650 53 54 +2650 2700 54 55 +2700 2750 55 56 +2750 2800 56 57 +2800 2850 57 58 +2850 2900 58 59 +2900 2950 59 60 +2950 3000 60 61 +3000 3050 61 62 +3050 3100 62 63 +3100 3150 63 64 +3150 3200 64 65 +3200 3250 65 66 +3250 3300 66 67 +3300 3350 67 68 +3350 3400 68 69 +3400 3450 69 70 +3450 3500 70 71 +3500 3550 71 72 +3550 3600 72 73 +3600 3650 73 74 +3650 3700 74 75 +3700 3750 75 76 +3750 3800 76 77 +3800 3850 77 78 +3850 3900 78 79 +3900 3950 79 80 +3950 4000 80 81 +4000 4050 81 82 +4050 4100 82 83 +4100 4150 83 84 +4150 4200 84 85 +4200 4250 85 86 +4250 4300 86 87 +4300 4350 87 88 +4350 4400 88 89 +4400 4450 89 90 +4450 4500 90 91 +4500 4550 91 92 +4550 4600 92 93 +4600 4650 93 94 +4650 4700 94 95 +4700 4750 95 96 +4750 4800 96 97 +4800 4850 97 98 +4850 4900 98 99 +4900 4950 99 100 +4950 5000 100 NULL + +# setup +statement ok +CREATE EXTERNAL TABLE sales STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/sales.csv' + +# setup +statement ok +CREATE EXTERNAL TABLE salespersons STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/salespersons.csv' + + +# group by works within recursive cte. This test case demonstrates rolling up a hierarchy of salespeople to their managers. 
+query III +WITH RECURSIVE region_sales AS ( + -- Anchor member + SELECT + s.salesperson_id AS salesperson_id, + SUM(s.sale_amount) AS amount, + 0 as level + FROM + sales s + GROUP BY + s.salesperson_id + UNION ALL + -- Recursive member + SELECT + sp.manager_id AS salesperson_id, + SUM(rs.amount) AS amount, + MIN(rs.level) + 1 as level + FROM + region_sales rs + INNER JOIN salespersons sp ON rs.salesperson_id = sp.salesperson_id + WHERE sp.manager_id IS NOT NULL + GROUP BY + sp.manager_id +) +SELECT + salesperson_id, + MAX(amount) as amount, + MAX(level) as hierarchy_level +FROM + region_sales +GROUP BY + salesperson_id +ORDER BY + hierarchy_level ASC, salesperson_id ASC; +---- +4 700 0 +5 600 0 +6 500 0 +7 900 0 +2 1300 1 +3 1400 1 +1 2700 2 + +#expect error from recursive CTE with nested recursive terms +query error DataFusion error: This feature is not implemented: Recursive queries cannot be nested +WITH RECURSIVE outer_cte AS ( + SELECT 1 as a + UNION ALL ( + WITH RECURSIVE nested_cte AS ( + SELECT 1 as a + UNION ALL + SELECT a+2 as a + FROM nested_cte where a < 3 + ) + SELECT outer_cte.a +2 + FROM outer_cte JOIN nested_cte USING(a) + WHERE nested_cte.a < 4 + ) +) +SELECT a FROM outer_cte; + +# expect error when recursive CTE is referenced multiple times in the recursive term +query error DataFusion error: This feature is not implemented: Multiple recursive references to the same CTE are not supported +WITH RECURSIVE my_cte AS ( + SELECT 1 as a + UNION ALL + SELECT my_cte.a+2 as a + FROM my_cte join my_cte c2 using(a) + WHERE my_cte.a<5 +) +SELECT a FROM my_cte; From 7a5f2054305fd92852b589473afbc9bb034379d7 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Sun, 28 Jan 2024 00:36:30 +0800 Subject: [PATCH 25/27] test: move the creation of the nan_table to slt (#9022) --- datafusion/sqllogictest/src/test_context.rs | 25 ------------------- datafusion/sqllogictest/test_files/scalar.slt | 3 ++- 2 files changed, 2 insertions(+), 26 deletions(-) diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 889ccdcd66d4..dd27727e3ad5 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -73,10 +73,6 @@ impl TestContext { let file_name = relative_path.file_name().unwrap().to_str().unwrap(); match file_name { - "scalar.slt" => { - info!("Registering scalar tables"); - register_scalar_tables(test_ctx.session_ctx()).await; - } "information_schema_table_types.slt" => { info!("Registering local temporary table"); register_temp_table(test_ctx.session_ctx()).await; @@ -170,27 +166,6 @@ pub async fn register_avro_tables(ctx: &mut crate::TestContext) { .unwrap(); } -pub async fn register_scalar_tables(ctx: &SessionContext) { - register_nan_table(ctx) -} - -/// Register a table with a NaN value (different than NULL, and can -/// not be created via SQL) -fn register_nan_table(ctx: &SessionContext) { - let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Float64, true)])); - - let data = RecordBatch::try_new( - schema, - vec![Arc::new(Float64Array::from(vec![ - Some(1.0), - None, - Some(f64::NAN), - ]))], - ) - .unwrap(); - ctx.register_batch("test_float", data).unwrap(); -} - /// Generate a partitioned CSV file and register it with an execution context pub async fn register_partition_table(test_ctx: &mut TestContext) { test_ctx.enable_testdir(); diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 5b3ecab5fd76..0f76c722e946 100644 --- 
a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1383,7 +1383,8 @@ create table test_boolean(c1 boolean) as values (false), (null), (true); statement ok create table test_int32(c1 int) as values (0), (1), (null), (3); -## Note that test_float has a NaN (which is not possible to create in SQL) so it is registered via rust. +statement ok +create table test_float(c1 double) as values (1.0), (null), ('NaN'::double); # query_not() From 9c4affe785a187832927fee41c69721e346a2816 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Sun, 28 Jan 2024 01:05:56 +0300 Subject: [PATCH 26/27] TreeNode refactor code deduplication: Part 3 (#8817) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Reduce code duplication * fix doc tests * addressing ozan's todos * resolving merge conflicts * remove test duplication * Update enforce_sorting.rs * tmp * remove coalesce fix * enforce dist and sort refactors * Diff decreasing * Review Part 1 * Review Part 2 * optimize tests * Review Part 3 * remove new_default * Review Part 4 * Review Part 5 * Review Part 6 * Review Part 7 * Resolve logical conflicts * Review Part 8 * Remove clone from PlanContext * renaming * Remove deriving Clone on ExprContext * Review Part 9 * Review Part 10 * Fix failing tests * Review Part 11 * Stabilize previously unstable tests * Review Part 12 * Adapt tests to upstream changes * Review Part 13 * Review Part 14 * Fix import * Move cross check inside assert optimized * Move cross check to asset_optimized macro * Retract aggregate topk tests (these will be solved with another PR) * better code documentation --------- Co-authored-by: berkaysynnada Co-authored-by: Mustafa Akur Co-authored-by: Berkay Şahin <124376117+berkaysynnada@users.noreply.github.com> --- datafusion/common/src/tree_node.rs | 137 +-- .../enforce_distribution.rs | 767 ++++++--------- .../src/physical_optimizer/enforce_sorting.rs | 878 ++++++++---------- .../src/physical_optimizer/join_selection.rs | 283 +++--- .../physical_optimizer/pipeline_checker.rs | 67 +- .../replace_with_order_preserving_variants.rs | 322 +++---- .../src/physical_optimizer/sort_pushdown.rs | 201 ++-- .../core/src/physical_optimizer/test_utils.rs | 27 +- .../core/src/physical_optimizer/utils.rs | 29 +- datafusion/expr/src/tree_node/expr.rs | 193 ++-- datafusion/expr/src/tree_node/plan.rs | 58 +- .../src/equivalence/properties.rs | 38 +- .../physical-expr/src/intervals/cp_solver.rs | 2 +- .../physical-expr/src/sort_properties.rs | 68 +- datafusion/physical-expr/src/tree_node.rs | 71 +- datafusion/physical-expr/src/utils/mod.rs | 53 +- datafusion/physical-plan/src/tree_node.rs | 70 +- 17 files changed, 1479 insertions(+), 1785 deletions(-) diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 5f11c8cc1d11..c5c4ee824d61 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -18,11 +18,29 @@ //! This module provides common traits for visiting or rewriting tree //! data structures easily. -use std::borrow::Cow; use std::sync::Arc; use crate::Result; +/// If the function returns [`VisitRecursion::Continue`], the normal execution of the +/// function continues. If it returns [`VisitRecursion::Skip`], the function returns +/// with [`VisitRecursion::Continue`] to jump next recursion step, bypassing further +/// exploration of the current step. 
In case of [`VisitRecursion::Stop`], the function +/// return with [`VisitRecursion::Stop`] and recursion halts. +#[macro_export] +macro_rules! handle_tree_recursion { + ($EXPR:expr) => { + match $EXPR { + VisitRecursion::Continue => {} + // If the recursion should skip, do not apply to its children, let + // the recursion continue: + VisitRecursion::Skip => return Ok(VisitRecursion::Continue), + // If the recursion should stop, do not apply to its children: + VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + } + }; +} + /// Defines a visitable and rewriteable a tree node. This trait is /// implemented for plans ([`ExecutionPlan`] and [`LogicalPlan`]) as /// well as expression trees ([`PhysicalExpr`], [`Expr`]) in @@ -33,27 +51,18 @@ use crate::Result; /// [`PhysicalExpr`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.PhysicalExpr.html /// [`LogicalPlan`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/logical_plan/enum.LogicalPlan.html /// [`Expr`]: https://docs.rs/datafusion-expr/latest/datafusion_expr/expr/enum.Expr.html -pub trait TreeNode: Sized + Clone { - /// Returns all children of the TreeNode - fn children_nodes(&self) -> Vec>; - - /// Use preorder to iterate the node on the tree so that we can - /// stop fast for some cases. +pub trait TreeNode: Sized { + /// Applies `op` to the node and its children. `op` is applied in a preoder way, + /// and it is controlled by [`VisitRecursion`], which means result of the `op` + /// on the self node can cause an early return. /// /// The `op` closure can be used to collect some info from the /// tree node or do some checking for the tree node. - fn apply(&self, op: &mut F) -> Result - where - F: FnMut(&Self) -> Result, - { - match op(self)? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - }; - + fn apply Result>( + &self, + op: &mut F, + ) -> Result { + handle_tree_recursion!(op(self)?); self.apply_children(&mut |node| node.apply(op)) } @@ -89,22 +98,8 @@ pub trait TreeNode: Sized + Clone { &self, visitor: &mut V, ) -> Result { - match visitor.pre_visit(self)? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - }; - - match self.apply_children(&mut |node| node.visit(visitor))? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. 
And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - } - + handle_tree_recursion!(visitor.pre_visit(self)?); + handle_tree_recursion!(self.apply_children(&mut |node| node.visit(visitor))?); visitor.post_visit(self) } @@ -148,7 +143,6 @@ pub trait TreeNode: Sized + Clone { F: Fn(Self) -> Result>, { let after_op_children = self.map_children(|node| node.transform_up(op))?; - let new_node = op(after_op_children)?.into(); Ok(new_node) } @@ -161,7 +155,6 @@ pub trait TreeNode: Sized + Clone { F: FnMut(Self) -> Result>, { let after_op_children = self.map_children(|node| node.transform_up_mut(op))?; - let new_node = op(after_op_children)?.into(); Ok(new_node) } @@ -215,17 +208,7 @@ pub trait TreeNode: Sized + Clone { /// Apply the closure `F` to the node's children fn apply_children(&self, op: &mut F) -> Result where - F: FnMut(&Self) -> Result, - { - for child in self.children_nodes() { - match op(&child)? { - VisitRecursion::Continue => {} - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - } - } - Ok(VisitRecursion::Continue) - } + F: FnMut(&Self) -> Result; /// Apply transform `F` to the node's children, the transform `F` might have a direction(Preorder or Postorder) fn map_children(self, transform: F) -> Result @@ -356,8 +339,15 @@ pub trait DynTreeNode { /// Blanket implementation for Arc for any tye that implements /// [`DynTreeNode`] (such as [`Arc`]) impl TreeNode for Arc { - fn children_nodes(&self) -> Vec> { - self.arc_children().into_iter().map(Cow::Owned).collect() + /// Apply the closure `F` to the node's children + fn apply_children(&self, op: &mut F) -> Result + where + F: FnMut(&Self) -> Result, + { + for child in self.arc_children() { + handle_tree_recursion!(op(&child)?) + } + Ok(VisitRecursion::Continue) } fn map_children(self, transform: F) -> Result @@ -366,12 +356,53 @@ impl TreeNode for Arc { { let children = self.arc_children(); if !children.is_empty() { - let new_children: Result> = - children.into_iter().map(transform).collect(); + let new_children = + children.into_iter().map(transform).collect::>()?; let arc_self = Arc::clone(&self); - self.with_new_arc_children(arc_self, new_children?) + self.with_new_arc_children(arc_self, new_children) } else { Ok(self) } } } + +/// Instead of implementing [`TreeNode`], it's recommended to implement a [`ConcreteTreeNode`] for +/// trees that contain nodes with payloads. This approach ensures safe execution of algorithms +/// involving payloads, by enforcing rules for detaching and reattaching child nodes. +pub trait ConcreteTreeNode: Sized { + /// Provides read-only access to child nodes. + fn children(&self) -> Vec<&Self>; + + /// Detaches the node from its children, returning the node itself and its detached children. + fn take_children(self) -> (Self, Vec); + + /// Reattaches updated child nodes to the node, returning the updated node. + fn with_new_children(self, children: Vec) -> Result; +} + +impl TreeNode for T { + /// Apply the closure `F` to the node's children + fn apply_children(&self, op: &mut F) -> Result + where + F: FnMut(&Self) -> Result, + { + for child in self.children() { + handle_tree_recursion!(op(child)?) 
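As a brief aside on the new `ConcreteTreeNode` contract above: a node temporarily gives up ownership of its children via `take_children`, the children are rewritten, and `with_new_children` reattaches them, which is what lets payload-carrying contexts be rewritten without cloning whole subtrees. A toy, self-contained sketch of that pattern (not DataFusion code; the payload here is just a number):

// A node that owns a payload plus its children, mimicking the detach/reattach
// contract of ConcreteTreeNode.
struct Node {
    payload: usize,
    children: Vec<Node>,
}

impl Node {
    fn take_children(mut self) -> (Self, Vec<Node>) {
        let children = std::mem::take(&mut self.children);
        (self, children)
    }
    fn with_new_children(mut self, children: Vec<Node>) -> Self {
        self.children = children;
        self
    }
    // Post-order rewrite in the spirit of TreeNode::transform_up.
    fn transform_up(self, op: &impl Fn(Node) -> Node) -> Node {
        let (node, children) = self.take_children();
        let children = children.into_iter().map(|c| c.transform_up(op)).collect();
        op(node.with_new_children(children))
    }
}

fn main() {
    let tree = Node {
        payload: 1,
        children: vec![Node { payload: 2, children: vec![] }],
    };
    let double = |mut n: Node| {
        n.payload *= 2;
        n
    };
    let doubled = tree.transform_up(&double);
    assert_eq!(doubled.payload, 2);
    assert_eq!(doubled.children[0].payload, 4);
}
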
+ } + Ok(VisitRecursion::Continue) + } + + fn map_children(self, transform: F) -> Result + where + F: FnMut(Self) -> Result, + { + let (new_self, children) = self.take_children(); + if !children.is_empty() { + let new_children = + children.into_iter().map(transform).collect::>()?; + new_self.with_new_children(new_children) + } else { + Ok(new_self) + } + } +} diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index a2f530c0e689..0c5c2d78b690 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -21,9 +21,7 @@ //! according to the configuration), this rule increases partition counts in //! the physical plan. -use std::borrow::Cow; -use std::fmt; -use std::fmt::Formatter; +use std::fmt::Debug; use std::sync::Arc; use super::output_requirements::OutputRequirementExec; @@ -41,11 +39,10 @@ use crate::physical_plan::joins::{ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use crate::physical_plan::tree_node::PlanContext; use crate::physical_plan::union::{can_interleave, InterleaveExec, UnionExec}; use crate::physical_plan::windows::WindowAggExec; -use crate::physical_plan::{ - with_new_children_if_necessary, Distribution, ExecutionPlan, Partitioning, -}; +use crate::physical_plan::{Distribution, ExecutionPlan, Partitioning}; use arrow::compute::SortOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; @@ -57,8 +54,8 @@ use datafusion_physical_expr::{ PhysicalSortRequirement, }; use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::unbounded_output; use datafusion_physical_plan::windows::{get_best_fitting_window, BoundedWindowAggExec}; -use datafusion_physical_plan::{get_plan_string, unbounded_output}; use itertools::izip; @@ -199,7 +196,7 @@ impl PhysicalOptimizerRule for EnforceDistribution { let adjusted = if top_down_join_key_reordering { // Run a top-down process to adjust input key ordering recursively - let plan_requirements = PlanWithKeyRequirements::new(plan); + let plan_requirements = PlanWithKeyRequirements::new_default(plan); let adjusted = plan_requirements.transform_down(&adjust_input_keys_ordering)?; adjusted.plan @@ -210,7 +207,7 @@ impl PhysicalOptimizerRule for EnforceDistribution { })? }; - let distribution_context = DistributionContext::new(adjusted); + let distribution_context = DistributionContext::new_default(adjusted); // Distribution enforcement needs to be applied bottom-up. let distribution_context = distribution_context.transform_up(&|distribution_context| { @@ -273,8 +270,7 @@ impl PhysicalOptimizerRule for EnforceDistribution { fn adjust_input_keys_ordering( mut requirements: PlanWithKeyRequirements, ) -> Result> { - let parent_required = requirements.required_key_ordering.clone(); - let plan_any = requirements.plan.as_any(); + let plan = requirements.plan.clone(); if let Some(HashJoinExec { left, @@ -285,7 +281,7 @@ fn adjust_input_keys_ordering( mode, null_equals_null, .. 
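
// Hedged sketch of the plan/payload wrappers used from here on
// (`PlanWithKeyRequirements`, `DistributionContext` and friends all share the
// `PlanContext`-style shape seen in this diff): a node pairs a plan with a
// rule-specific payload (`data`) and owns one wrapper per child, so a rule
// such as `adjust_input_keys_ordering` can push requirements down by mutating
// the children's `data` while it walks the tree. `Plan`, `NodeContext` and
// `push_down` below are illustrative stand-ins, not the DataFusion types.
use std::rc::Rc;

struct Plan {
    name: &'static str,
    inputs: Vec<Rc<Plan>>,
}

struct NodeContext<T> {
    plan: Rc<Plan>,
    data: T,
    children: Vec<NodeContext<T>>,
}

impl<T: Default> NodeContext<T> {
    // Mirrors the `new_default` constructor: wrap a plan tree with default payloads.
    fn new_default(plan: Rc<Plan>) -> Self {
        let children = plan.inputs.iter().cloned().map(Self::new_default).collect();
        Self { plan, data: T::default(), children }
    }
}

// The "default" branch of a top-down pass: copy the parent's requirement
// payload into every child, then recurse.
fn push_down<T: Clone>(mut node: NodeContext<T>) -> NodeContext<T> {
    for child in node.children.iter_mut() {
        child.data = node.data.clone();
    }
    let children = std::mem::take(&mut node.children);
    node.children = children.into_iter().map(push_down).collect();
    node
}

fn main() {
    let scan = Rc::new(Plan { name: "ParquetExec", inputs: vec![] });
    let join = Rc::new(Plan { name: "HashJoinExec", inputs: vec![scan.clone(), scan] });
    let mut root: NodeContext<Vec<String>> = NodeContext::new_default(join);
    root.data = vec!["a".to_string()]; // requirement arriving from the parent
    let root = push_down(root);
    assert_eq!(root.children[1].data, vec!["a".to_string()]);
    assert_eq!(root.children[0].plan.name, "ParquetExec");
}
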
- }) = plan_any.downcast_ref::() + }) = plan.as_any().downcast_ref::() { match mode { PartitionMode::Partitioned => { @@ -302,50 +298,44 @@ fn adjust_input_keys_ordering( ) .map(|e| Arc::new(e) as _) }; - reorder_partitioned_join_keys( - requirements.plan.clone(), - &parent_required, + return reorder_partitioned_join_keys( + requirements, on, vec![], &join_constructor, ) - .map(Transformed::Yes) + .map(Transformed::Yes); } PartitionMode::CollectLeft => { - let new_right_request = match join_type { + // Push down requirements to the right side + requirements.children[1].data = match join_type { JoinType::Inner | JoinType::Right => shift_right_required( - &parent_required, + &requirements.data, left.schema().fields().len(), - ), + ) + .unwrap_or_default(), JoinType::RightSemi | JoinType::RightAnti => { - Some(parent_required.clone()) + requirements.data.clone() } JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti - | JoinType::Full => None, + | JoinType::Full => vec![], }; - - // Push down requirements to the right side - requirements.children[1].required_key_ordering = - new_right_request.unwrap_or(vec![]); - Ok(Transformed::Yes(requirements)) } PartitionMode::Auto => { // Can not satisfy, clear the current requirements and generate new empty requirements - Ok(Transformed::Yes(PlanWithKeyRequirements::new( - requirements.plan, - ))) + requirements.data.clear(); } } } else if let Some(CrossJoinExec { left, .. }) = - plan_any.downcast_ref::() + plan.as_any().downcast_ref::() { let left_columns_len = left.schema().fields().len(); // Push down requirements to the right side - requirements.children[1].required_key_ordering = - shift_right_required(&parent_required, left_columns_len).unwrap_or_default(); - Ok(Transformed::Yes(requirements)) + requirements.children[1].data = + shift_right_required(&requirements.data, left_columns_len) + .unwrap_or_default(); } else if let Some(SortMergeJoinExec { left, right, @@ -354,7 +344,7 @@ fn adjust_input_keys_ordering( sort_options, null_equals_null, .. 
- }) = plan_any.downcast_ref::() + }) = plan.as_any().downcast_ref::() { let join_constructor = |new_conditions: (Vec<(Column, Column)>, Vec)| { @@ -368,65 +358,56 @@ fn adjust_input_keys_ordering( ) .map(|e| Arc::new(e) as _) }; - reorder_partitioned_join_keys( - requirements.plan.clone(), - &parent_required, + return reorder_partitioned_join_keys( + requirements, on, sort_options.clone(), &join_constructor, ) - .map(Transformed::Yes) - } else if let Some(aggregate_exec) = plan_any.downcast_ref::() { - if !parent_required.is_empty() { - match aggregate_exec.mode() { - AggregateMode::FinalPartitioned => reorder_aggregate_keys( - requirements.plan.clone(), - &parent_required, - aggregate_exec, - ) - .map(Transformed::Yes), - _ => Ok(Transformed::Yes(PlanWithKeyRequirements::new( - requirements.plan, - ))), + .map(Transformed::Yes); + } else if let Some(aggregate_exec) = plan.as_any().downcast_ref::() { + if !requirements.data.is_empty() { + if aggregate_exec.mode() == &AggregateMode::FinalPartitioned { + return reorder_aggregate_keys(requirements, aggregate_exec) + .map(Transformed::Yes); + } else { + requirements.data.clear(); } } else { // Keep everything unchanged - Ok(Transformed::No(requirements)) + return Ok(Transformed::No(requirements)); } - } else if let Some(proj) = plan_any.downcast_ref::() { + } else if let Some(proj) = plan.as_any().downcast_ref::() { let expr = proj.expr(); // For Projection, we need to transform the requirements to the columns before the Projection // And then to push down the requirements // Construct a mapping from new name to the the orginal Column - let new_required = map_columns_before_projection(&parent_required, expr); - if new_required.len() == parent_required.len() { - requirements.children[0].required_key_ordering = new_required; - Ok(Transformed::Yes(requirements)) + let new_required = map_columns_before_projection(&requirements.data, expr); + if new_required.len() == requirements.data.len() { + requirements.children[0].data = new_required; } else { // Can not satisfy, clear the current requirements and generate new empty requirements - Ok(Transformed::Yes(PlanWithKeyRequirements::new( - requirements.plan, - ))) + requirements.data.clear(); } - } else if plan_any.downcast_ref::().is_some() - || plan_any.downcast_ref::().is_some() - || plan_any.downcast_ref::().is_some() + } else if plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + || plan.as_any().downcast_ref::().is_some() { - Ok(Transformed::Yes(PlanWithKeyRequirements::new( - requirements.plan, - ))) + requirements.data.clear(); } else { // By default, push down the parent requirements to children - requirements.children.iter_mut().for_each(|child| { - child.required_key_ordering = parent_required.clone(); - }); - Ok(Transformed::Yes(requirements)) + for child in requirements.children.iter_mut() { + child.data = requirements.data.clone(); + } } + Ok(Transformed::Yes(requirements)) } fn reorder_partitioned_join_keys( - join_plan: Arc, - parent_required: &[Arc], + mut join_plan: PlanWithKeyRequirements, on: &[(Column, Column)], sort_options: Vec, join_constructor: &F, @@ -434,56 +415,48 @@ fn reorder_partitioned_join_keys( where F: Fn((Vec<(Column, Column)>, Vec)) -> Result>, { + let parent_required = &join_plan.data; let join_key_pairs = extract_join_keys(on); + let eq_properties = join_plan.plan.equivalence_properties(); + if let Some(( JoinKeyPairs { left_keys, right_keys, }, new_positions, - )) = try_reorder( - join_key_pairs.clone(), - 
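
// Sketch of the idea behind `map_columns_before_projection` as used above:
// parent key requirements are phrased against the projection's output
// aliases, so they must be rewritten in terms of the projection's input
// columns before they can be pushed further down; if any required column
// cannot be mapped, the requirement is dropped (the
// `new_required.len() == requirements.data.len()` check). Strings stand in
// for physical column expressions here; this is not the DataFusion
// implementation.
fn map_before_projection(required: &[&str], projection: &[(&str, &str)]) -> Vec<String> {
    required
        .iter()
        .filter_map(|name| {
            projection
                .iter()
                // `projection` holds (input column, output alias) pairs.
                .find(|(_, alias)| alias == name)
                .map(|(input, _)| input.to_string())
        })
        .collect()
}

fn main() {
    let projection = [("a", "a2"), ("c", "c2")];
    // A requirement on the projection's output maps back to its input columns:
    assert_eq!(map_before_projection(&["a2", "c2"], &projection), vec!["a", "c"]);
    // A requirement on a column that the projection does not produce is lost:
    assert_eq!(map_before_projection(&["b"], &projection).len(), 0);
}
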
parent_required, - &join_plan.equivalence_properties(), - ) { + )) = try_reorder(join_key_pairs.clone(), parent_required, &eq_properties) + { if !new_positions.is_empty() { let new_join_on = new_join_conditions(&left_keys, &right_keys); - let mut new_sort_options: Vec = vec![]; - for idx in 0..sort_options.len() { - new_sort_options.push(sort_options[new_positions[idx]]) - } - let mut requirement_tree = PlanWithKeyRequirements::new(join_constructor(( - new_join_on, - new_sort_options, - ))?); - requirement_tree.children[0].required_key_ordering = left_keys; - requirement_tree.children[1].required_key_ordering = right_keys; - Ok(requirement_tree) - } else { - let mut requirement_tree = PlanWithKeyRequirements::new(join_plan); - requirement_tree.children[0].required_key_ordering = left_keys; - requirement_tree.children[1].required_key_ordering = right_keys; - Ok(requirement_tree) + let new_sort_options = (0..sort_options.len()) + .map(|idx| sort_options[new_positions[idx]]) + .collect(); + join_plan.plan = join_constructor((new_join_on, new_sort_options))?; } + let mut requirements = join_plan; + requirements.children[0].data = left_keys; + requirements.children[1].data = right_keys; + Ok(requirements) } else { - let mut requirement_tree = PlanWithKeyRequirements::new(join_plan); - requirement_tree.children[0].required_key_ordering = join_key_pairs.left_keys; - requirement_tree.children[1].required_key_ordering = join_key_pairs.right_keys; - Ok(requirement_tree) + let mut requirements = join_plan; + requirements.children[0].data = join_key_pairs.left_keys; + requirements.children[1].data = join_key_pairs.right_keys; + Ok(requirements) } } fn reorder_aggregate_keys( - agg_plan: Arc, - parent_required: &[Arc], + mut agg_node: PlanWithKeyRequirements, agg_exec: &AggregateExec, ) -> Result { + let parent_required = &agg_node.data; let output_columns = agg_exec .group_by() .expr() .iter() .enumerate() - .map(|(index, (_col, name))| Column::new(name, index)) + .map(|(index, (_, name))| Column::new(name, index)) .collect::>(); let output_exprs = output_columns @@ -491,95 +464,82 @@ fn reorder_aggregate_keys( .map(|c| Arc::new(c.clone()) as _) .collect::>(); - if parent_required.len() != output_exprs.len() - || !agg_exec.group_by().null_expr().is_empty() - || physical_exprs_equal(&output_exprs, parent_required) + if parent_required.len() == output_exprs.len() + && agg_exec.group_by().null_expr().is_empty() + && !physical_exprs_equal(&output_exprs, parent_required) { - Ok(PlanWithKeyRequirements::new(agg_plan)) - } else { - let new_positions = expected_expr_positions(&output_exprs, parent_required); - match new_positions { - None => Ok(PlanWithKeyRequirements::new(agg_plan)), - Some(positions) => { - let new_partial_agg = if let Some(agg_exec) = - agg_exec.input().as_any().downcast_ref::() - { - if matches!(agg_exec.mode(), &AggregateMode::Partial) { - let group_exprs = agg_exec.group_by().expr(); - let new_group_exprs = positions - .into_iter() - .map(|idx| group_exprs[idx].clone()) - .collect(); - let new_partial_group_by = - PhysicalGroupBy::new_single(new_group_exprs); - Some(Arc::new(AggregateExec::try_new( - AggregateMode::Partial, - new_partial_group_by, - agg_exec.aggr_expr().to_vec(), - agg_exec.filter_expr().to_vec(), - agg_exec.input().clone(), - agg_exec.input_schema.clone(), - )?)) - } else { - None - } - } else { - None - }; - if let Some(partial_agg) = new_partial_agg { - // Build new group expressions that correspond to the output of partial_agg + if let Some(positions) = 
expected_expr_positions(&output_exprs, parent_required) { + if let Some(agg_exec) = + agg_exec.input().as_any().downcast_ref::() + { + if matches!(agg_exec.mode(), &AggregateMode::Partial) { + let group_exprs = agg_exec.group_by().expr(); + let new_group_exprs = positions + .into_iter() + .map(|idx| group_exprs[idx].clone()) + .collect(); + let partial_agg = Arc::new(AggregateExec::try_new( + AggregateMode::Partial, + PhysicalGroupBy::new_single(new_group_exprs), + agg_exec.aggr_expr().to_vec(), + agg_exec.filter_expr().to_vec(), + agg_exec.input().clone(), + agg_exec.input_schema.clone(), + )?); + // Build new group expressions that correspond to the output + // of the "reordered" aggregator: let group_exprs = partial_agg.group_expr().expr(); - let new_final_group = partial_agg.output_group_expr(); let new_group_by = PhysicalGroupBy::new_single( - new_final_group - .iter() + partial_agg + .output_group_expr() + .into_iter() .enumerate() - .map(|(idx, expr)| (expr.clone(), group_exprs[idx].1.clone())) + .map(|(idx, expr)| (expr, group_exprs[idx].1.clone())) .collect(), ); - let new_final_agg = Arc::new(AggregateExec::try_new( AggregateMode::FinalPartitioned, new_group_by, agg_exec.aggr_expr().to_vec(), agg_exec.filter_expr().to_vec(), - partial_agg, + partial_agg.clone(), agg_exec.input_schema(), )?); + agg_node.plan = new_final_agg.clone(); + agg_node.data.clear(); + agg_node.children = vec![PlanWithKeyRequirements::new( + partial_agg as _, + vec![], + agg_node.children.swap_remove(0).children, + )]; + // Need to create a new projection to change the expr ordering back let agg_schema = new_final_agg.schema(); let mut proj_exprs = output_columns .iter() .map(|col| { let name = col.name(); - ( - Arc::new(Column::new( - name, - agg_schema.index_of(name).unwrap(), - )) as _, - name.to_owned(), - ) + let index = agg_schema.index_of(name)?; + Ok((Arc::new(Column::new(name, index)) as _, name.to_owned())) }) - .collect::>(); + .collect::>>()?; let agg_fields = agg_schema.fields(); for (idx, field) in agg_fields.iter().enumerate().skip(output_columns.len()) { let name = field.name(); - proj_exprs - .push((Arc::new(Column::new(name, idx)) as _, name.clone())) + let plan = Arc::new(Column::new(name, idx)) as _; + proj_exprs.push((plan, name.clone())) } - // TODO merge adjacent Projections if there are - Ok(PlanWithKeyRequirements::new(Arc::new( - ProjectionExec::try_new(proj_exprs, new_final_agg)?, - ))) - } else { - Ok(PlanWithKeyRequirements::new(agg_plan)) + return ProjectionExec::try_new(proj_exprs, new_final_agg).map(|p| { + PlanWithKeyRequirements::new(Arc::new(p), vec![], vec![agg_node]) + }); } } } } + Ok(agg_node) } fn shift_right_required( @@ -589,17 +549,11 @@ fn shift_right_required( let new_right_required = parent_required .iter() .filter_map(|r| { - if let Some(col) = r.as_any().downcast_ref::() { - let idx = col.index(); - if idx >= left_columns_len { - let result = Column::new(col.name(), idx - left_columns_len); - Some(Arc::new(result) as _) - } else { - None - } - } else { - None - } + r.as_any().downcast_ref::().and_then(|col| { + col.index() + .checked_sub(left_columns_len) + .map(|index| Arc::new(Column::new(col.name(), index)) as _) + }) }) .collect::>(); @@ -700,14 +654,15 @@ pub(crate) fn reorder_join_keys_to_inputs( let new_sort_options = (0..sort_options.len()) .map(|idx| sort_options[new_positions[idx]]) .collect(); - return Ok(Arc::new(SortMergeJoinExec::try_new( + return SortMergeJoinExec::try_new( left.clone(), right.clone(), new_join_on, *join_type, new_sort_options, 
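
// Small sketch of the index arithmetic in `shift_right_required` above: a
// column requirement expressed against the join output only applies to the
// right child if its index lands past the left schema, and `checked_sub`
// expresses the "shift or drop" decision in one step. Plain indices stand in
// for the physical `Column` expressions of the real code.
fn shift_right(parent_required: &[usize], left_columns_len: usize) -> Vec<usize> {
    parent_required
        .iter()
        // Left-side columns (index < left_columns_len) yield `None` and are
        // filtered out; right-side columns are re-based to the right child's schema.
        .filter_map(|idx| idx.checked_sub(left_columns_len))
        .collect()
}

fn main() {
    // The left side has 3 columns, so output columns 3 and 4 map to right-side 0 and 1.
    assert_eq!(shift_right(&[1, 3, 4], 3), vec![0, 1]);
}
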
*null_equals_null, - )?)); + ) + .map(|smj| Arc::new(smj) as _); } } } @@ -762,21 +717,21 @@ fn try_reorder( normalized_expected = expected .iter() .map(|e| eq_groups.normalize_expr(e.clone())) - .collect::>(); + .collect(); assert_eq!(normalized_expected.len(), expected.len()); normalized_left_keys = join_keys .left_keys .iter() .map(|e| eq_groups.normalize_expr(e.clone())) - .collect::>(); + .collect(); assert_eq!(join_keys.left_keys.len(), normalized_left_keys.len()); normalized_right_keys = join_keys .right_keys .iter() .map(|e| eq_groups.normalize_expr(e.clone())) - .collect::>(); + .collect(); assert_eq!(join_keys.right_keys.len(), normalized_right_keys.len()); if physical_exprs_equal(&normalized_expected, &normalized_left_keys) @@ -793,23 +748,19 @@ fn try_reorder( expected_expr_positions(&normalized_right_keys, &normalized_expected) }); - if let Some(positions) = new_positions { + new_positions.map(|positions| { let mut new_left_keys = vec![]; let mut new_right_keys = vec![]; for pos in positions.iter() { new_left_keys.push(join_keys.left_keys[*pos].clone()); new_right_keys.push(join_keys.right_keys[*pos].clone()); } - Some(( - JoinKeyPairs { - left_keys: new_left_keys, - right_keys: new_right_keys, - }, - positions, - )) - } else { - None - } + let pairs = JoinKeyPairs { + left_keys: new_left_keys, + right_keys: new_right_keys, + }; + (pairs, positions) + }) } /// Return the expected expressions positions. @@ -894,11 +845,7 @@ fn add_roundrobin_on_top( let new_plan = Arc::new(repartition) as _; - Ok(DistributionContext { - plan: new_plan, - distribution_connection: true, - children_nodes: vec![input], - }) + Ok(DistributionContext::new(new_plan, true, vec![input])) } else { // Partition is not helpful, we already have desired number of partitions. Ok(input) @@ -922,7 +869,7 @@ fn add_roundrobin_on_top( /// A [`Result`] object that contains new execution plan where the desired /// distribution is satisfied by adding a Hash repartition. fn add_hash_on_top( - mut input: DistributionContext, + input: DistributionContext, hash_exprs: Vec>, n_target: usize, ) -> Result { @@ -952,10 +899,9 @@ fn add_hash_on_top( let partitioning = Partitioning::Hash(hash_exprs, n_target); let repartition = RepartitionExec::try_new(input.plan.clone(), partitioning)? .with_preserve_order(); + let plan = Arc::new(repartition) as _; - input.children_nodes = vec![input.clone()]; - input.distribution_connection = true; - input.plan = Arc::new(repartition) as _; + return Ok(DistributionContext::new(plan, true, vec![input])); } Ok(input) @@ -992,11 +938,7 @@ fn add_spm_on_top(input: DistributionContext) -> DistributionContext { Arc::new(CoalescePartitionsExec::new(input.plan.clone())) as _ }; - DistributionContext { - plan: new_plan, - distribution_connection: true, - children_nodes: vec![input], - } + DistributionContext::new(new_plan, true, vec![input]) } else { input } @@ -1027,10 +969,9 @@ fn remove_dist_changing_operators( || is_sort_preserving_merge(&distribution_context.plan) { // All of above operators have a single child. First child is only child. - let child = distribution_context.children_nodes.swap_remove(0); // Remove any distribution changing operators at the beginning: + distribution_context = distribution_context.children.swap_remove(0); // Note that they will be re-inserted later on if necessary or helpful. 
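
// Sketch of the reordering step used by `try_reorder` above: once we know at
// which positions the expected keys appear among the current join keys, the
// left/right key vectors (and the sort options) are permuted with that same
// position list. Strings stand in for physical expressions; this is not the
// real `expected_expr_positions`.
fn expected_positions(current: &[&str], expected: &[&str]) -> Option<Vec<usize>> {
    expected
        .iter()
        .map(|e| current.iter().position(|c| c == e))
        .collect()
}

fn main() {
    let left_keys = ["a", "b", "c"];
    let expected = ["b", "c", "a"];
    let positions = expected_positions(&left_keys, &expected).unwrap();
    assert_eq!(positions, vec![1, 2, 0]);
    // Permute the keys with the discovered positions, as `try_reorder` does:
    let reordered: Vec<_> = positions.iter().map(|&p| left_keys[p]).collect();
    assert_eq!(reordered, expected);
}
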
- distribution_context = child; } Ok(distribution_context) @@ -1057,42 +998,35 @@ fn remove_dist_changing_operators( fn replace_order_preserving_variants( mut context: DistributionContext, ) -> Result { - let mut updated_children = context - .children_nodes - .iter() + context.children = context + .children + .into_iter() .map(|child| { - if child.distribution_connection { - replace_order_preserving_variants(child.clone()) + if child.data { + replace_order_preserving_variants(child) } else { - Ok(child.clone()) + Ok(child) } }) .collect::>>()?; if is_sort_preserving_merge(&context.plan) { - let child = updated_children.swap_remove(0); - context.plan = Arc::new(CoalescePartitionsExec::new(child.plan.clone())); - context.children_nodes = vec![child]; + let child_plan = context.children[0].plan.clone(); + context.plan = Arc::new(CoalescePartitionsExec::new(child_plan)); return Ok(context); } else if let Some(repartition) = context.plan.as_any().downcast_ref::() { if repartition.preserve_order() { - let child = updated_children.swap_remove(0); context.plan = Arc::new(RepartitionExec::try_new( - child.plan.clone(), + context.children[0].plan.clone(), repartition.partitioning().clone(), )?); - context.children_nodes = vec![child]; return Ok(context); } } - context.plan = context - .plan - .clone() - .with_new_children(updated_children.into_iter().map(|c| c.plan).collect())?; - Ok(context) + context.update_plan_from_children() } /// This utility function adds a [`SortExec`] above an operator according to the @@ -1109,17 +1043,11 @@ fn add_sort_preserving_partitions( .ordering_satisfy_requirement(sort_requirement) { let sort_expr = PhysicalSortRequirement::to_sort_exprs(sort_requirement.to_vec()); - let new_sort = SortExec::new(sort_expr, node.plan.clone()).with_fetch(fetch); - - DistributionContext { - plan: Arc::new(if node.plan.output_partitioning().partition_count() > 1 { - new_sort.with_preserve_partitioning(true) - } else { - new_sort - }), - distribution_connection: false, - children_nodes: vec![node], + let mut new_sort = SortExec::new(sort_expr, node.plan.clone()).with_fetch(fetch); + if node.plan.output_partitioning().partition_count() > 1 { + new_sort = new_sort.with_preserve_partitioning(true); } + DistributionContext::new(Arc::new(new_sort), false, vec![node]) } else { node } @@ -1133,7 +1061,7 @@ fn ensure_distribution( dist_context: DistributionContext, config: &ConfigOptions, ) -> Result> { - let dist_context = dist_context.update_children()?; + let dist_context = update_children(dist_context)?; if dist_context.plan.children().is_empty() { return Ok(Transformed::No(dist_context)); @@ -1154,8 +1082,8 @@ fn ensure_distribution( // Remove unnecessary repartition from the physical plan if any let DistributionContext { mut plan, - distribution_connection, - children_nodes, + data, + children, } = remove_dist_changing_operators(dist_context)?; if let Some(exec) = plan.as_any().downcast_ref::() { @@ -1181,8 +1109,8 @@ fn ensure_distribution( // - Satisfy the distribution requirements of every child, if it is not // already satisfied. // We store the updated children in `new_children`. 
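
// Illustrative sketch of the loop in `remove_dist_changing_operators` above:
// as long as the root of the subtree is a distribution-changing operator, it
// is peeled off by promoting its only child, because such operators will be
// re-inserted later if they are still needed. Toy node type, not DataFusion's.
struct Node {
    name: &'static str,
    children: Vec<Node>,
}

fn is_dist_changing(node: &Node) -> bool {
    matches!(
        node.name,
        "RepartitionExec" | "CoalescePartitionsExec" | "SortPreservingMergeExec"
    )
}

fn remove_dist_changing_operators(mut node: Node) -> Node {
    while is_dist_changing(&node) {
        // All of these operators have exactly one child; promote it.
        node = node.children.swap_remove(0);
    }
    node
}

fn main() {
    let plan = Node {
        name: "CoalescePartitionsExec",
        children: vec![Node {
            name: "RepartitionExec",
            children: vec![Node { name: "ParquetExec", children: vec![] }],
        }],
    };
    assert_eq!(remove_dist_changing_operators(plan).name, "ParquetExec");
}
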
- let children_nodes = izip!( - children_nodes.into_iter(), + let children = izip!( + children.into_iter(), plan.required_input_distribution().iter(), plan.required_input_ordering().iter(), plan.benefits_from_input_partitioning(), @@ -1249,7 +1177,7 @@ fn ensure_distribution( .equivalence_properties() .ordering_satisfy_requirement(required_input_ordering); if (!ordering_satisfied || !order_preserving_variants_desirable) - && child.distribution_connection + && child.data { child = replace_order_preserving_variants(child)?; // If ordering requirements were satisfied before repartitioning, @@ -1264,7 +1192,7 @@ fn ensure_distribution( } } // Stop tracking distribution changing operators - child.distribution_connection = false; + child.data = false; } else { // no ordering requirement match requirement { @@ -1286,151 +1214,78 @@ fn ensure_distribution( ) .collect::>>()?; - let new_distribution_context = DistributionContext { - plan: if plan.as_any().is::() - && can_interleave(children_nodes.iter().map(|c| c.plan.clone())) - { - // Add a special case for [`UnionExec`] since we want to "bubble up" - // hash-partitioned data. So instead of - // - // Agg: - // Repartition (hash): - // Union: - // - Agg: - // Repartition (hash): - // Data - // - Agg: - // Repartition (hash): - // Data - // - // we can use: - // - // Agg: - // Interleave: - // - Agg: - // Repartition (hash): - // Data - // - Agg: - // Repartition (hash): - // Data - Arc::new(InterleaveExec::try_new( - children_nodes.iter().map(|c| c.plan.clone()).collect(), - )?) - } else { - plan.with_new_children( - children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? - }, - distribution_connection, - children_nodes, + let children_plans = children.iter().map(|c| c.plan.clone()).collect::>(); + plan = if plan.as_any().is::() && can_interleave(children_plans.iter()) { + // Add a special case for [`UnionExec`] since we want to "bubble up" + // hash-partitioned data. So instead of + // + // Agg: + // Repartition (hash): + // Union: + // - Agg: + // Repartition (hash): + // Data + // - Agg: + // Repartition (hash): + // Data + // + // we can use: + // + // Agg: + // Interleave: + // - Agg: + // Repartition (hash): + // Data + // - Agg: + // Repartition (hash): + // Data + Arc::new(InterleaveExec::try_new(children_plans)?) + } else { + plan.with_new_children(children_plans)? }; - Ok(Transformed::Yes(new_distribution_context)) -} - -/// A struct to keep track of distribution changing operators -/// (`RepartitionExec`, `SortPreservingMergeExec`, `CoalescePartitionsExec`), -/// and their associated parents inside `plan`. Using this information, -/// we can optimize distribution of the plan if/when necessary. -#[derive(Debug, Clone)] -struct DistributionContext { - plan: Arc, - /// Indicates whether this plan is connected to a distribution-changing - /// operator. - distribution_connection: bool, - children_nodes: Vec, -} - -impl DistributionContext { - /// Creates a tree according to the plan with empty states. 
- fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - distribution_connection: false, - children_nodes: children.into_iter().map(Self::new).collect(), - } - } - - fn update_children(mut self) -> Result { - for child_context in self.children_nodes.iter_mut() { - child_context.distribution_connection = match &child_context.plan { - plan if is_repartition(plan) - || is_coalesce_partitions(plan) - || is_sort_preserving_merge(plan) => - { - true - } - _ => { - child_context.plan.children().is_empty() - || child_context.children_nodes[0].distribution_connection - || child_context - .plan - .required_input_distribution() - .iter() - .zip(child_context.children_nodes.iter()) - .any(|(required_dist, child_context)| { - child_context.distribution_connection - && matches!( - required_dist, - Distribution::UnspecifiedDistribution - ) - }) - } - }; - } - - let children_plans = self - .children_nodes - .iter() - .map(|context| context.plan.clone()) - .collect::>(); - - Ok(Self { - plan: with_new_children_if_necessary(self.plan, children_plans)?.into(), - distribution_connection: false, - children_nodes: self.children_nodes, - }) - } + Ok(Transformed::Yes(DistributionContext::new( + plan, data, children, + ))) } -impl TreeNode for DistributionContext { - fn children_nodes(&self) -> Vec> { - self.children_nodes.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children_nodes.is_empty() { - self.children_nodes = self - .children_nodes - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); - } - Ok(self) +/// Keeps track of distribution changing operators (like `RepartitionExec`, +/// `SortPreservingMergeExec`, `CoalescePartitionsExec`) and their ancestors. +/// Using this information, we can optimize distribution of the plan if/when +/// necessary. +type DistributionContext = PlanContext; + +fn update_children(mut dist_context: DistributionContext) -> Result { + for child_context in dist_context.children.iter_mut() { + let child_plan_any = child_context.plan.as_any(); + child_context.data = + if let Some(repartition) = child_plan_any.downcast_ref::() { + !matches!( + repartition.partitioning(), + Partitioning::UnknownPartitioning(_) + ) + } else { + child_plan_any.is::() + || child_plan_any.is::() + || child_context.plan.children().is_empty() + || child_context.children[0].data + || child_context + .plan + .required_input_distribution() + .iter() + .zip(child_context.children.iter()) + .any(|(required_dist, child_context)| { + child_context.data + && matches!( + required_dist, + Distribution::UnspecifiedDistribution + ) + }) + } } -} -/// implement Display method for `DistributionContext` struct. 
-impl fmt::Display for DistributionContext { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - let plan_string = get_plan_string(&self.plan); - write!(f, "plan: {:?}", plan_string)?; - write!( - f, - "distribution_connection:{}", - self.distribution_connection, - )?; - write!(f, "") - } + dist_context.data = false; + Ok(dist_context) } #[derive(Debug, Clone)] @@ -1439,49 +1294,8 @@ struct JoinKeyPairs { right_keys: Vec>, } -#[derive(Debug, Clone)] -struct PlanWithKeyRequirements { - plan: Arc, - /// Parent required key ordering - required_key_ordering: Vec>, - children: Vec, -} - -impl PlanWithKeyRequirements { - fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - required_key_ordering: vec![], - children: children.into_iter().map(Self::new).collect(), - } - } -} - -impl TreeNode for PlanWithKeyRequirements { - fn children_nodes(&self) -> Vec> { - self.children.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children.is_empty() { - self.children = self - .children - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); - } - Ok(self) - } -} +/// Keeps track of parent required key orderings. +type PlanWithKeyRequirements = PlanContext>>; /// Since almost all of these tests explicitly use `ParquetExec` they only run with the parquet feature flag on #[cfg(feature = "parquet")] @@ -1497,6 +1311,9 @@ pub(crate) mod tests { use crate::datasource::physical_plan::{CsvExec, FileScanConfig}; use crate::physical_optimizer::enforce_sorting::EnforceSorting; use crate::physical_optimizer::output_requirements::OutputRequirements; + use crate::physical_optimizer::test_utils::{ + check_integrity, coalesce_partitions_exec, repartition_exec, + }; use crate::physical_plan::aggregates::{ AggregateExec, AggregateMode, PhysicalGroupBy, }; @@ -1506,16 +1323,12 @@ pub(crate) mod tests { use crate::physical_plan::joins::{ utils::JoinOn, HashJoinExec, PartitionMode, SortMergeJoinExec, }; + use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use crate::physical_plan::projection::ProjectionExec; + use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::{displayable, DisplayAs, DisplayFormatType, Statistics}; - use crate::physical_optimizer::test_utils::{ - coalesce_partitions_exec, repartition_exec, - }; - use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; - use crate::physical_plan::sorts::sort::SortExec; - use arrow::compute::SortOptions; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::ScalarValue; @@ -1897,7 +1710,7 @@ pub(crate) mod tests { target_partitions: usize, prefer_existing_sort: bool, ) -> Result> { - let distribution_context = DistributionContext::new(plan); + let distribution_context = DistributionContext::new_default(plan); let mut config = ConfigOptions::new(); config.execution.target_partitions = target_partitions; config.optimizer.enable_round_robin_repartition = true; @@ -1963,6 +1776,35 @@ pub(crate) mod tests { let optimizer = OutputRequirements::new_add_mode(); let optimized = optimizer.optimize($PLAN.clone(), &config)?; + // This file has 2 rules that use tree node, apply these rules to original plan consecutively + // After these operations tree nodes should be in a 
consistent state. + // This code block makes sure that these rules doesn't violate tree node integrity. + { + let adjusted = if config.optimizer.top_down_join_key_reordering { + // Run adjust_input_keys_ordering rule + let plan_requirements = + PlanWithKeyRequirements::new_default($PLAN.clone()); + let adjusted = plan_requirements + .transform_down(&adjust_input_keys_ordering) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + adjusted.plan + } else { + // Run reorder_join_keys_to_inputs rule + $PLAN.clone().transform_up(&|plan| { + Ok(Transformed::Yes(reorder_join_keys_to_inputs(plan)?)) + })? + }; + + // Then run ensure_distribution rule + DistributionContext::new_default(adjusted) + .transform_up(&|distribution_context| { + ensure_distribution(distribution_context, &config) + }) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + } + let optimized = if $FIRST_ENFORCE_DIST { // Run enforce distribution rule first: let optimizer = EnforceDistribution::new(); @@ -2255,9 +2097,9 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, top_join.clone(), true); assert_optimized!(expected, top_join, false); + Ok(()) } @@ -2313,6 +2155,7 @@ pub(crate) mod tests { assert_optimized!(expected, top_join.clone(), true); assert_optimized!(expected, top_join, false); + Ok(()) } @@ -2352,6 +2195,7 @@ pub(crate) mod tests { ]; assert_optimized!(expected, join.clone(), true); assert_optimized!(expected, join, false); + Ok(()) } @@ -2404,6 +2248,7 @@ pub(crate) mod tests { ]; assert_optimized!(expected, join.clone(), true); assert_optimized!(expected, join, false); + Ok(()) } @@ -2520,6 +2365,7 @@ pub(crate) mod tests { ]; assert_optimized!(expected, filter_top_join.clone(), true); assert_optimized!(expected, filter_top_join, false); + Ok(()) } @@ -3143,6 +2989,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; assert_optimized!(expected_first_sort_enforcement, join, false, true); + Ok(()) } @@ -3162,7 +3009,8 @@ pub(crate) mod tests { let exec = Arc::new(CoalesceBatchesExec::new(exec, 4096)); // Merge from multiple parquet files and keep the data sorted - let exec = Arc::new(SortPreservingMergeExec::new(sort_key, exec)); + let exec: Arc = + Arc::new(SortPreservingMergeExec::new(sort_key, exec)); // The optimizer should not add an additional SortExec as the // data is already sorted @@ -3172,6 +3020,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC]", ]; assert_optimized!(expected, exec, true); + // In this case preserving ordering through order preserving operators is not desirable // (according to flag: PREFER_EXISTING_SORT) // hence in this case ordering lost during CoalescePartitionsExec and re-introduced with @@ -3183,6 +3032,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC]", ]; assert_optimized!(expected, exec, false); + Ok(()) } @@ -3224,6 +3074,7 @@ pub(crate) mod tests { ]; assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3239,9 +3090,9 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - 
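
// Sketch of the property that an integrity check like `check_integrity` needs
// to assert for these context trees: after every transformation, the plan
// stored in a wrapper must list exactly the plans held by the wrapper's
// children, otherwise some rule forgot to rebuild the plan from its rewritten
// children. Toy types below; the real helper walks `PlanContext` nodes.
#[derive(Clone, PartialEq)]
struct Plan {
    name: &'static str,
    inputs: Vec<Plan>,
}

struct Ctx {
    plan: Plan,
    children: Vec<Ctx>,
}

fn check_integrity(ctx: &Ctx) -> Result<(), String> {
    let child_plans: Vec<&Plan> = ctx.children.iter().map(|c| &c.plan).collect();
    let stored: Vec<&Plan> = ctx.plan.inputs.iter().collect();
    if child_plans != stored {
        return Err(format!(
            "plan `{}` is out of sync with its context children",
            ctx.plan.name
        ));
    }
    ctx.children.iter().try_for_each(check_integrity)
}

fn main() {
    let scan = Plan { name: "ParquetExec", inputs: vec![] };
    let sort = Plan { name: "SortExec", inputs: vec![scan.clone()] };
    let ctx = Ctx {
        plan: sort,
        children: vec![Ctx { plan: scan, children: vec![] }],
    };
    assert!(check_integrity(&ctx).is_ok());
}
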
assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3258,9 +3109,9 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3281,6 +3132,7 @@ pub(crate) mod tests { assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3300,9 +3152,9 @@ pub(crate) mod tests { "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3328,6 +3180,7 @@ pub(crate) mod tests { assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3355,9 +3208,9 @@ pub(crate) mod tests { // Expect no repartition to happen for local limit "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3377,6 +3230,7 @@ pub(crate) mod tests { assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3395,7 +3249,6 @@ pub(crate) mod tests { "SortExec: expr=[c@2 ASC]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); @@ -3430,6 +3283,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; assert_optimized!(expected, plan, false); + Ok(()) } @@ -3462,6 +3316,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; assert_optimized!(expected, plan, false); + Ok(()) } @@ -3487,6 +3342,7 @@ pub(crate) mod tests { assert_optimized!(expected, plan.clone(), true, true); assert_optimized!(expected, plan, false, true); + Ok(()) } @@ -3522,9 +3378,9 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3567,6 +3423,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; assert_optimized!(expected_first_sort_enforcement, plan, false); + Ok(()) } @@ -3594,9 +3451,9 @@ pub(crate) mod tests { "ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c]", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3623,9 +3480,9 @@ pub(crate) mod tests { "ProjectionExec: expr=[a@0 as a]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, plan.clone(), true); assert_optimized!(expected, plan, false); + Ok(()) } @@ -3658,6 +3515,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; assert_optimized!(expected_first_sort_enforcement, plan, false); + Ok(()) } @@ -3704,6 +3562,7 @@ pub(crate) mod tests { "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, 
d, e]", ]; assert_optimized!(expected_first_sort_enforcement, plan, false); + Ok(()) } @@ -3725,9 +3584,9 @@ pub(crate) mod tests { "AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", "CsvExec: file_groups={2 groups: [[x:0..50], [x:50..100]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true, false, 2, true, 10); assert_optimized!(expected_csv, plan_csv, true, false, 2, true, 10); + Ok(()) } @@ -3838,7 +3697,6 @@ pub(crate) mod tests { )), vec![("a".to_string(), "a".to_string())], ); - assert_optimized!(expected, plan, true, false, 2, true, 10); } Ok(()) @@ -3865,7 +3723,6 @@ pub(crate) mod tests { // Plan already has two partitions "CsvExec: file_groups={2 groups: [[x:0..100], [y:0..100]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true, false, 2, true, 10); assert_optimized!(expected_csv, plan_csv, true, false, 2, true, 10); Ok(()) @@ -3892,9 +3749,9 @@ pub(crate) mod tests { // Multiple source files splitted across partitions "CsvExec: file_groups={4 groups: [[x:0..50], [x:50..100], [y:0..50], [y:50..100]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true, false, 4, true, 10); assert_optimized!(expected_csv, plan_csv, true, false, 4, true, 10); + Ok(()) } @@ -3924,9 +3781,9 @@ pub(crate) mod tests { // Doesn't parallelize for SortExec without preserve_partitioning "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -3969,9 +3826,9 @@ pub(crate) mod tests { // SortExec doesn't benefit from input partitioning "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4019,9 +3876,9 @@ pub(crate) mod tests { "LocalLimitExec: fetch=100", "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4048,9 +3905,9 @@ pub(crate) mod tests { "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4078,9 +3935,9 @@ pub(crate) mod tests { let expected_csv = &[ "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4112,9 +3969,9 @@ pub(crate) mod tests { "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], has_header=false", "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4140,9 +3997,9 @@ pub(crate) mod tests { "SortRequiredExec: [c@2 ASC]", "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], has_header=false", ]; - assert_optimized!(expected_parquet, plan_parquet, 
true); assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4182,8 +4039,8 @@ pub(crate) mod tests { "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - assert_optimized!(expected_parquet, plan_parquet, true); + Ok(()) } @@ -4221,8 +4078,8 @@ pub(crate) mod tests { "ProjectionExec: expr=[a@0 as a2, c@2 as c2]", "CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC], has_header=false", ]; - assert_optimized!(expected_csv, plan_csv, true); + Ok(()) } @@ -4245,7 +4102,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, physical_plan.clone(), true); assert_optimized!(expected, physical_plan, false); @@ -4268,7 +4124,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2, preserve_order=true, sort_exprs=c@2 ASC", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - // last flag sets config.optimizer.PREFER_EXISTING_SORT assert_optimized!(expected, physical_plan.clone(), true, true); assert_optimized!(expected, physical_plan, false, true); @@ -4326,7 +4181,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - assert_optimized!(expected, physical_plan.clone(), true); assert_optimized!(expected, physical_plan, false); @@ -4386,7 +4240,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=2", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - assert_optimized!(expected, physical_plan.clone(), true); assert_optimized!(expected, physical_plan, false); @@ -4424,9 +4277,8 @@ pub(crate) mod tests { config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; - let distribution_plan = - EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, distribution_plan); + let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; + assert_plan_txt!(expected, dist_plan); Ok(()) } @@ -4462,9 +4314,8 @@ pub(crate) mod tests { config.execution.target_partitions = 10; config.optimizer.enable_round_robin_repartition = true; config.optimizer.prefer_existing_sort = false; - let distribution_plan = - EnforceDistribution::new().optimize(physical_plan, &config)?; - assert_plan_txt!(expected, distribution_plan); + let dist_plan = EnforceDistribution::new().optimize(physical_plan, &config)?; + assert_plan_txt!(expected, dist_plan); Ok(()) } @@ -4485,7 +4336,6 @@ pub(crate) mod tests { "AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[]", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - // Make sure target partition number is 1. 
In this case hash repartition is unnecessary assert_optimized!(expected, physical_plan.clone(), true, false, 1, false, 1024); assert_optimized!(expected, physical_plan, false, false, 1, false, 1024); @@ -4516,7 +4366,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2", "ParquetExec: file_groups={2 groups: [[x], [y]]}, projection=[a, b, c, d, e], output_ordering=[c@2 ASC]", ]; - // Make sure target partition number is larger than 2 (e.g partition number at the source). assert_optimized!(expected, physical_plan.clone(), true, false, 4, false, 1024); assert_optimized!(expected, physical_plan, false, false, 4, false, 1024); @@ -4536,7 +4385,6 @@ pub(crate) mod tests { let expected = &["ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]"]; - assert_optimized!(expected, physical_plan.clone(), true); assert_optimized!(expected, physical_plan, false); @@ -4564,7 +4412,6 @@ pub(crate) mod tests { "RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", "ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e]", ]; - assert_optimized!(expected, physical_plan.clone(), true); assert_optimized!(expected, physical_plan, false); diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index f609ddea66cf..3aa9cdad1845 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -34,29 +34,30 @@ //! in the physical plan. The first sort is unnecessary since its result is overwritten //! by another [`SortExec`]. Therefore, this rule removes it from the physical plan. -use std::borrow::Cow; use std::sync::Arc; +use super::utils::add_sort_above; use crate::config::ConfigOptions; use crate::error::Result; use crate::physical_optimizer::replace_with_order_preserving_variants::{ replace_with_order_preserving_variants, OrderPreservationContext, }; -use crate::physical_optimizer::sort_pushdown::{pushdown_sorts, SortPushDown}; +use crate::physical_optimizer::sort_pushdown::{ + assign_initial_requirements, pushdown_sorts, SortPushDown, +}; use crate::physical_optimizer::utils::{ - add_sort_above, is_coalesce_partitions, is_limit, is_repartition, is_sort, - is_sort_preserving_merge, is_union, is_window, + is_coalesce_partitions, is_limit, is_repartition, is_sort, is_sort_preserving_merge, + is_union, is_window, }; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use crate::physical_plan::tree_node::PlanContext; use crate::physical_plan::windows::{ get_best_fitting_window, BoundedWindowAggExec, WindowAggExec, }; -use crate::physical_plan::{ - with_new_children_if_necessary, Distribution, ExecutionPlan, InputOrderMode, -}; +use crate::physical_plan::{Distribution, ExecutionPlan, InputOrderMode}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{plan_err, DataFusionError}; @@ -78,177 +79,70 @@ impl EnforceSorting { } /// This object is used within the [`EnforceSorting`] rule to track the closest -/// [`SortExec`] descendant(s) for every child of a plan. -#[derive(Debug, Clone)] -struct PlanWithCorrespondingSort { - plan: Arc, - // For every child, track `ExecutionPlan`s starting from the child until - // the `SortExec`(s). 
If the child has no connection to any sort, it simply - // stores false. - sort_connection: bool, - children_nodes: Vec, -} - -impl PlanWithCorrespondingSort { - fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - sort_connection: false, - children_nodes: children.into_iter().map(Self::new).collect(), +/// [`SortExec`] descendant(s) for every child of a plan. The data attribute +/// stores whether the plan is a `SortExec` or is connected to a `SortExec` +/// via its children. +type PlanWithCorrespondingSort = PlanContext; + +fn update_sort_ctx_children( + mut node: PlanWithCorrespondingSort, + data: bool, +) -> Result { + for child_node in node.children.iter_mut() { + let plan = &child_node.plan; + child_node.data = if is_sort(plan) { + // Initiate connection: + true + } else if is_limit(plan) { + // There is no sort linkage for this path, it starts at a limit. + false + } else { + let is_spm = is_sort_preserving_merge(plan); + let required_orderings = plan.required_input_ordering(); + let flags = plan.maintains_input_order(); + // Add parent node to the tree if there is at least one child with + // a sort connection: + izip!(flags, required_orderings).any(|(maintains, required_ordering)| { + let propagates_ordering = + (maintains && required_ordering.is_none()) || is_spm; + let connected_to_sort = + child_node.children.iter().any(|child| child.data); + propagates_ordering && connected_to_sort + }) } } - fn update_children( - parent_plan: Arc, - mut children_nodes: Vec, - ) -> Result { - for node in children_nodes.iter_mut() { - let plan = &node.plan; - // Leaves of `sort_onwards` are `SortExec` operators, which impose - // an ordering. This tree collects all the intermediate executors - // that maintain this ordering. If we just saw a order imposing - // operator, we reset the tree and start accumulating. - node.sort_connection = if is_sort(plan) { - // Initiate connection - true - } else if is_limit(plan) { - // There is no sort linkage for this path, it starts at a limit. - false - } else { - let is_spm = is_sort_preserving_merge(plan); - let required_orderings = plan.required_input_ordering(); - let flags = plan.maintains_input_order(); - // Add parent node to the tree if there is at least one - // child with a sort connection: - izip!(flags, required_orderings).any(|(maintains, required_ordering)| { - let propagates_ordering = - (maintains && required_ordering.is_none()) || is_spm; - let connected_to_sort = - node.children_nodes.iter().any(|item| item.sort_connection); - propagates_ordering && connected_to_sort - }) - } - } - - let children_plans = children_nodes - .iter() - .map(|item| item.plan.clone()) - .collect::>(); - let plan = with_new_children_if_necessary(parent_plan, children_plans)?.into(); - - Ok(Self { - plan, - sort_connection: false, - children_nodes, - }) - } -} - -impl TreeNode for PlanWithCorrespondingSort { - fn children_nodes(&self) -> Vec> { - self.children_nodes.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children_nodes.is_empty() { - self.children_nodes = self - .children_nodes - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? 
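
// Hedged sketch of the per-child flag that `update_sort_ctx_children`
// maintains above: a sort starts a connection, a limit cuts it, and any other
// operator only propagates the flag if it maintains its input order and at
// least one of its own children is connected. The toy type below stands in
// for `ExecutionPlan`; the real code also consults `required_input_ordering`
// and treats `SortPreservingMergeExec` as order-propagating.
struct Node {
    name: &'static str,
    maintains_input_order: bool,
    children: Vec<Node>,
}

fn sort_connected(node: &Node) -> bool {
    match node.name {
        "SortExec" => true,
        "GlobalLimitExec" | "LocalLimitExec" => false,
        _ => node.maintains_input_order && node.children.iter().any(sort_connected),
    }
}

fn main() {
    let sort = Node { name: "SortExec", maintains_input_order: false, children: vec![] };
    let coalesce_batches = Node {
        name: "CoalesceBatchesExec",
        maintains_input_order: true,
        children: vec![sort],
    };
    // An order-maintaining operator above the sort keeps the linkage alive ...
    assert!(sort_connected(&coalesce_batches));
    // ... but a limit in between severs it.
    let limit = Node {
        name: "GlobalLimitExec",
        maintains_input_order: true,
        children: vec![coalesce_batches],
    };
    assert!(!sort_connected(&limit));
}
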
- .into(); - } - Ok(self) - } + node.data = data; + node.update_plan_from_children() } /// This object is used within the [`EnforceSorting`] rule to track the closest -/// [`CoalescePartitionsExec`] descendant(s) for every child of a plan. -#[derive(Debug, Clone)] -struct PlanWithCorrespondingCoalescePartitions { - plan: Arc, - // Stores whether the plan is a `CoalescePartitionsExec` or it is connected to - // a `CoalescePartitionsExec` via its children. - coalesce_connection: bool, - children_nodes: Vec, -} - -impl PlanWithCorrespondingCoalescePartitions { - /// Creates an empty tree with empty connections. - fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - coalesce_connection: false, - children_nodes: children.into_iter().map(Self::new).collect(), - } - } - - fn update_children(mut self) -> Result { - self.coalesce_connection = if self.plan.children().is_empty() { - // Plan has no children, it cannot be a `CoalescePartitionsExec`. - false - } else if is_coalesce_partitions(&self.plan) { - // Initiate a connection - true - } else { - self.children_nodes - .iter() - .enumerate() - .map(|(idx, node)| { - // Only consider operators that don't require a - // single partition, and connected to any coalesce - node.coalesce_connection - && !matches!( - self.plan.required_input_distribution()[idx], - Distribution::SinglePartition - ) - // If all children are None. There is nothing to track, set connection false. - }) - .any(|c| c) - }; - - let children_plans = self - .children_nodes - .iter() - .map(|item| item.plan.clone()) - .collect(); - self.plan = with_new_children_if_necessary(self.plan, children_plans)?.into(); - Ok(self) - } -} - -impl TreeNode for PlanWithCorrespondingCoalescePartitions { - fn children_nodes(&self) -> Vec> { - self.children_nodes.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children_nodes.is_empty() { - self.children_nodes = self - .children_nodes - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); - } - Ok(self) - } +/// [`CoalescePartitionsExec`] descendant(s) for every child of a plan. The data +/// attribute stores whether the plan is a `CoalescePartitionsExec` or is +/// connected to a `CoalescePartitionsExec` via its children. +type PlanWithCorrespondingCoalescePartitions = PlanContext; + +fn update_coalesce_ctx_children( + coalesce_context: &mut PlanWithCorrespondingCoalescePartitions, +) { + let children = &coalesce_context.children; + coalesce_context.data = if children.is_empty() { + // Plan has no children, it cannot be a `CoalescePartitionsExec`. 
+ false + } else if is_coalesce_partitions(&coalesce_context.plan) { + // Initiate a connection: + true + } else { + children.iter().enumerate().any(|(idx, node)| { + // Only consider operators that don't require a single partition, + // and connected to some `CoalescePartitionsExec`: + node.data + && !matches!( + coalesce_context.plan.required_input_distribution()[idx], + Distribution::SinglePartition + ) + }) + }; } /// The boolean flag `repartition_sorts` defined in the config indicates @@ -261,13 +155,13 @@ impl PhysicalOptimizerRule for EnforceSorting { plan: Arc, config: &ConfigOptions, ) -> Result> { - let plan_requirements = PlanWithCorrespondingSort::new(plan); + let plan_requirements = PlanWithCorrespondingSort::new_default(plan); // Execute a bottom-up traversal to enforce sorting requirements, // remove unnecessary sorts, and optimize sort-sensitive operators: let adjusted = plan_requirements.transform_up(&ensure_sorting)?; let new_plan = if config.optimizer.repartition_sorts { let plan_with_coalesce_partitions = - PlanWithCorrespondingCoalescePartitions::new(adjusted.plan); + PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); let parallel = plan_with_coalesce_partitions.transform_up(¶llelize_sorts)?; parallel.plan @@ -275,7 +169,7 @@ impl PhysicalOptimizerRule for EnforceSorting { adjusted.plan }; - let plan_with_pipeline_fixer = OrderPreservationContext::new(new_plan); + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); let updated_plan = plan_with_pipeline_fixer.transform_up(&|plan_with_pipeline_fixer| { replace_with_order_preserving_variants( @@ -288,8 +182,8 @@ impl PhysicalOptimizerRule for EnforceSorting { // Execute a top-down traversal to exploit sort push-down opportunities // missed by the bottom-up traversal: - let mut sort_pushdown = SortPushDown::new(updated_plan.plan); - sort_pushdown.assign_initial_requirements(); + let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); + assign_initial_requirements(&mut sort_pushdown); let adjusted = sort_pushdown.transform_down(&pushdown_sorts)?; Ok(adjusted.plan) } @@ -318,414 +212,347 @@ impl PhysicalOptimizerRule for EnforceSorting { /// by following connections from [`CoalescePartitionsExec`]s to [`SortExec`]s. /// By performing sorting in parallel, we can increase performance in some scenarios. fn parallelize_sorts( - requirements: PlanWithCorrespondingCoalescePartitions, + mut requirements: PlanWithCorrespondingCoalescePartitions, ) -> Result> { - let PlanWithCorrespondingCoalescePartitions { - mut plan, - coalesce_connection, - mut children_nodes, - } = requirements.update_children()?; - - if plan.children().is_empty() || !children_nodes[0].coalesce_connection { - // We only take an action when the plan is either a SortExec, a - // SortPreservingMergeExec or a CoalescePartitionsExec, and they - // all have a single child. Therefore, if the first child is `None`, - // we can return immediately. - return Ok(Transformed::No(PlanWithCorrespondingCoalescePartitions { - plan, - coalesce_connection, - children_nodes, - })); - } else if (is_sort(&plan) || is_sort_preserving_merge(&plan)) - && plan.output_partitioning().partition_count() <= 1 + update_coalesce_ctx_children(&mut requirements); + + if requirements.children.is_empty() || !requirements.children[0].data { + // We only take an action when the plan is either a `SortExec`, a + // `SortPreservingMergeExec` or a `CoalescePartitionsExec`, and they + // all have a single child. 
Therefore, if the first child has no + // connection, we can return immediately. + Ok(Transformed::No(requirements)) + } else if (is_sort(&requirements.plan) + || is_sort_preserving_merge(&requirements.plan)) + && requirements.plan.output_partitioning().partition_count() <= 1 { - // If there is a connection between a CoalescePartitionsExec and a - // global sort that satisfy the requirements (i.e. intermediate - // executors don't require single partition), then we can replace - // the CoalescePartitionsExec + Sort cascade with a SortExec + - // SortPreservingMergeExec cascade to parallelize sorting. - let (sort_exprs, fetch) = get_sort_exprs(&plan)?; + // Take the initial sort expressions and requirements + let (sort_exprs, fetch) = get_sort_exprs(&requirements.plan)?; let sort_reqs = PhysicalSortRequirement::from_sort_exprs(sort_exprs); let sort_exprs = sort_exprs.to_vec(); - update_child_to_remove_coalesce(&mut plan, &mut children_nodes[0])?; - add_sort_above(&mut plan, &sort_reqs, fetch); - let spm = SortPreservingMergeExec::new(sort_exprs, plan).with_fetch(fetch); - - return Ok(Transformed::Yes( - PlanWithCorrespondingCoalescePartitions::new(Arc::new(spm)), - )); - } else if is_coalesce_partitions(&plan) { - // There is an unnecessary `CoalescePartitionsExec` in the plan. - update_child_to_remove_coalesce(&mut plan, &mut children_nodes[0])?; - let new_plan = Arc::new(CoalescePartitionsExec::new(plan)) as _; - return Ok(Transformed::Yes( - PlanWithCorrespondingCoalescePartitions::new(new_plan), - )); - } + // If there is a connection between a `CoalescePartitionsExec` and a + // global sort that satisfy the requirements (i.e. intermediate + // executors don't require single partition), then we can replace + // the `CoalescePartitionsExec` + `SortExec` cascade with a `SortExec` + // + `SortPreservingMergeExec` cascade to parallelize sorting. + requirements = remove_corresponding_coalesce_in_sub_plan(requirements)?; + // We also need to remove the self node since `remove_corresponding_coalesce_in_sub_plan` + // deals with the children and their children and so on. + requirements = requirements.children.swap_remove(0); + + if !requirements + .plan + .equivalence_properties() + .ordering_satisfy_requirement(&sort_reqs) + { + requirements = add_sort_above(requirements, sort_reqs, fetch); + } - Ok(Transformed::Yes(PlanWithCorrespondingCoalescePartitions { - plan, - coalesce_connection, - children_nodes, - })) + let spm = SortPreservingMergeExec::new(sort_exprs, requirements.plan.clone()); + Ok(Transformed::Yes( + PlanWithCorrespondingCoalescePartitions::new( + Arc::new(spm.with_fetch(fetch)), + false, + vec![requirements], + ), + )) + } else if is_coalesce_partitions(&requirements.plan) { + // There is an unnecessary `CoalescePartitionsExec` in the plan. + // This will handle the recursive `CoalescePartitionsExec` plans. + requirements = remove_corresponding_coalesce_in_sub_plan(requirements)?; + // For the removal of self node which is also a `CoalescePartitionsExec`. + requirements = requirements.children.swap_remove(0); + + Ok(Transformed::Yes( + PlanWithCorrespondingCoalescePartitions::new( + Arc::new(CoalescePartitionsExec::new(requirements.plan.clone())), + false, + vec![requirements], + ), + )) + } else { + Ok(Transformed::Yes(requirements)) + } } /// This function enforces sorting requirements and makes optimizations without /// violating these requirements whenever possible. 
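The shape of the `parallelize_sorts` rewrite above is easier to see on a toy plan type. The `MiniPlan` enum and `parallelize` function below are illustrative stand-ins only (not DataFusion types); they show the sort-over-coalesce pattern being replaced by a per-partition sort under a merge:

#[derive(Debug, PartialEq)]
enum MiniPlan {
    Scan,                               // multi-partition source
    Coalesce(Box<MiniPlan>),            // stands in for CoalescePartitionsExec
    Sort(Box<MiniPlan>),                // stands in for a single-partition SortExec
    SortPreservingMerge(Box<MiniPlan>), // stands in for SortPreservingMergeExec
}

// Rewrite `Sort <- Coalesce <- x` into `SortPreservingMerge <- Sort <- x`,
// so every partition is sorted in parallel and then merged.
fn parallelize(plan: MiniPlan) -> MiniPlan {
    match plan {
        MiniPlan::Sort(child) => match *child {
            MiniPlan::Coalesce(input) => {
                MiniPlan::SortPreservingMerge(Box::new(MiniPlan::Sort(input)))
            }
            other => MiniPlan::Sort(Box::new(other)),
        },
        other => other,
    }
}

fn main() {
    let plan = MiniPlan::Sort(Box::new(MiniPlan::Coalesce(Box::new(MiniPlan::Scan))));
    let expected =
        MiniPlan::SortPreservingMerge(Box::new(MiniPlan::Sort(Box::new(MiniPlan::Scan))));
    assert_eq!(parallelize(plan), expected);
}

Sorting before merging lets each partition be sorted in parallel; the merge then only interleaves already-sorted streams, which is why the cascade is usually cheaper than a global sort over coalesced input.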
fn ensure_sorting( - requirements: PlanWithCorrespondingSort, + mut requirements: PlanWithCorrespondingSort, ) -> Result> { - let requirements = PlanWithCorrespondingSort::update_children( - requirements.plan, - requirements.children_nodes, - )?; + requirements = update_sort_ctx_children(requirements, false)?; // Perform naive analysis at the beginning -- remove already-satisfied sorts: - if requirements.plan.children().is_empty() { + if requirements.children.is_empty() { return Ok(Transformed::No(requirements)); } - if let Some(result) = analyze_immediate_sort_removal(&requirements) { - return Ok(Transformed::Yes(result)); - } - - let plan = requirements.plan; - let mut children_nodes = requirements.children_nodes; + let maybe_requirements = analyze_immediate_sort_removal(requirements); + let Transformed::No(mut requirements) = maybe_requirements else { + return Ok(maybe_requirements); + }; - for (idx, (child_node, required_ordering)) in - izip!(children_nodes.iter_mut(), plan.required_input_ordering()).enumerate() + let plan = &requirements.plan; + let mut updated_children = vec![]; + for (idx, (required_ordering, mut child)) in plan + .required_input_ordering() + .into_iter() + .zip(requirements.children.into_iter()) + .enumerate() { - let mut child_plan = child_node.plan.clone(); - let physical_ordering = child_plan.output_ordering(); - match (required_ordering, physical_ordering) { - (Some(required_ordering), Some(_)) => { - if !child_plan - .equivalence_properties() - .ordering_satisfy_requirement(&required_ordering) - { - // Make sure we preserve the ordering requirements: - update_child_to_remove_unnecessary_sort(idx, child_node, &plan)?; - add_sort_above(&mut child_plan, &required_ordering, None); - if is_sort(&child_plan) { - *child_node = PlanWithCorrespondingSort::update_children( - child_plan, - vec![child_node.clone()], - )?; - child_node.sort_connection = true; - } + let physical_ordering = child.plan.output_ordering(); + + if let Some(required) = required_ordering { + let eq_properties = child.plan.equivalence_properties(); + if !eq_properties.ordering_satisfy_requirement(&required) { + // Make sure we preserve the ordering requirements: + if physical_ordering.is_some() { + child = update_child_to_remove_unnecessary_sort(idx, child, plan)?; } + child = add_sort_above(child, required, None); + child = update_sort_ctx_children(child, true)?; } - (Some(required), None) => { - // Ordering requirement is not met, we should add a `SortExec` to the plan. - add_sort_above(&mut child_plan, &required, None); - *child_node = PlanWithCorrespondingSort::update_children( - child_plan, - vec![child_node.clone()], - )?; - child_node.sort_connection = true; - } - (None, Some(_)) => { - // We have a `SortExec` whose effect may be neutralized by - // another order-imposing operator. Remove this sort. 
- if !plan.maintains_input_order()[idx] || is_union(&plan) { - update_child_to_remove_unnecessary_sort(idx, child_node, &plan)?; - } - } - (None, None) => { - update_child_to_remove_unnecessary_sort(idx, child_node, &plan)?; - } + } else if physical_ordering.is_none() + || !plan.maintains_input_order()[idx] + || is_union(plan) + { + // We have a `SortExec` whose effect may be neutralized by another + // order-imposing operator, remove this sort: + child = update_child_to_remove_unnecessary_sort(idx, child, plan)?; } + updated_children.push(child); } + requirements.children = updated_children; // For window expressions, we can remove some sorts when we can // calculate the result in reverse: - if is_window(&plan) && children_nodes[0].sort_connection { - if let Some(result) = analyze_window_sort_removal(&mut children_nodes[0], &plan)? - { - return Ok(Transformed::Yes(result)); - } - } else if is_sort_preserving_merge(&plan) - && children_nodes[0] - .plan - .output_partitioning() - .partition_count() - <= 1 + let child_node = &requirements.children[0]; + if is_window(plan) && child_node.data { + return adjust_window_sort_removal(requirements).map(Transformed::Yes); + } else if is_sort_preserving_merge(plan) + && child_node.plan.output_partitioning().partition_count() <= 1 { - // This SortPreservingMergeExec is unnecessary, input already has a + // This `SortPreservingMergeExec` is unnecessary, input already has a // single partition. - let child_node = children_nodes.swap_remove(0); + let child_node = requirements.children.swap_remove(0); return Ok(Transformed::Yes(child_node)); } - Ok(Transformed::Yes( - PlanWithCorrespondingSort::update_children(plan, children_nodes)?, - )) + + update_sort_ctx_children(requirements, false).map(Transformed::Yes) } /// Analyzes a given [`SortExec`] (`plan`) to determine whether its input /// already has a finer ordering than it enforces. fn analyze_immediate_sort_removal( - node: &PlanWithCorrespondingSort, -) -> Option { - let PlanWithCorrespondingSort { - plan, - children_nodes, - .. - } = node; - if let Some(sort_exec) = plan.as_any().downcast_ref::() { - let sort_input = sort_exec.input().clone(); + mut node: PlanWithCorrespondingSort, +) -> Transformed { + if let Some(sort_exec) = node.plan.as_any().downcast_ref::() { + let sort_input = sort_exec.input(); // If this sort is unnecessary, we should remove it: if sort_input .equivalence_properties() .ordering_satisfy(sort_exec.output_ordering().unwrap_or(&[])) { - // Since we know that a `SortExec` has exactly one child, - // we can use the zero index safely: - return Some( - if !sort_exec.preserve_partitioning() - && sort_input.output_partitioning().partition_count() > 1 - { - // Replace the sort with a sort-preserving merge: - let new_plan: Arc = - Arc::new(SortPreservingMergeExec::new( - sort_exec.expr().to_vec(), - sort_input, - )); - PlanWithCorrespondingSort { - plan: new_plan, - // SortPreservingMergeExec has single child. 
- sort_connection: false, - children_nodes: children_nodes - .iter() - .cloned() - .map(|mut node| { - node.sort_connection = false; - node - }) - .collect(), - } - } else { - // Remove the sort: - PlanWithCorrespondingSort { - plan: sort_input, - sort_connection: false, - children_nodes: children_nodes[0] - .children_nodes - .iter() - .cloned() - .map(|mut node| { - node.sort_connection = false; - node - }) - .collect(), - } - }, - ); + node.plan = if !sort_exec.preserve_partitioning() + && sort_input.output_partitioning().partition_count() > 1 + { + // Replace the sort with a sort-preserving merge: + let expr = sort_exec.expr().to_vec(); + Arc::new(SortPreservingMergeExec::new(expr, sort_input.clone())) as _ + } else { + // Remove the sort: + node.children = node.children.swap_remove(0).children; + sort_input.clone() + }; + for child in node.children.iter_mut() { + child.data = false; + } + node.data = false; + return Transformed::Yes(node); } } - None + Transformed::No(node) } -/// Analyzes a [`WindowAggExec`] or a [`BoundedWindowAggExec`] to determine +/// Adjusts a [`WindowAggExec`] or a [`BoundedWindowAggExec`] to determine /// whether it may allow removing a sort. -fn analyze_window_sort_removal( - sort_tree: &mut PlanWithCorrespondingSort, - window_exec: &Arc, -) -> Result> { - let requires_single_partition = matches!( - window_exec.required_input_distribution()[0], - Distribution::SinglePartition - ); - remove_corresponding_sort_from_sub_plan(sort_tree, requires_single_partition)?; - let mut window_child = sort_tree.plan.clone(); +fn adjust_window_sort_removal( + mut window_tree: PlanWithCorrespondingSort, +) -> Result { + // Window operators have a single child we need to adjust: + let child_node = remove_corresponding_sort_from_sub_plan( + window_tree.children.swap_remove(0), + matches!( + window_tree.plan.required_input_distribution()[0], + Distribution::SinglePartition + ), + )?; + window_tree.children.push(child_node); + + let plan = window_tree.plan.as_any(); + let child_plan = &window_tree.children[0].plan; let (window_expr, new_window) = - if let Some(exec) = window_exec.as_any().downcast_ref::() { - ( - exec.window_expr(), - get_best_fitting_window( - exec.window_expr(), - &window_child, - &exec.partition_keys, - )?, - ) - } else if let Some(exec) = window_exec.as_any().downcast_ref::() { - ( - exec.window_expr(), - get_best_fitting_window( - exec.window_expr(), - &window_child, - &exec.partition_keys, - )?, - ) + if let Some(exec) = plan.downcast_ref::() { + let window_expr = exec.window_expr(); + let new_window = + get_best_fitting_window(window_expr, child_plan, &exec.partition_keys)?; + (window_expr, new_window) + } else if let Some(exec) = plan.downcast_ref::() { + let window_expr = exec.window_expr(); + let new_window = + get_best_fitting_window(window_expr, child_plan, &exec.partition_keys)?; + (window_expr, new_window) } else { - return plan_err!( - "Expects to receive either WindowAggExec of BoundedWindowAggExec" - ); + return plan_err!("Expected WindowAggExec or BoundedWindowAggExec"); }; - let partitionby_exprs = window_expr[0].partition_by(); - if let Some(new_window) = new_window { + window_tree.plan = if let Some(new_window) = new_window { // We were able to change the window to accommodate the input, use it: - Ok(Some(PlanWithCorrespondingSort::new(new_window))) + new_window } else { // We were unable to change the window to accommodate the input, so we // will insert a sort. 
- let reqs = window_exec + let reqs = window_tree + .plan .required_input_ordering() .swap_remove(0) .unwrap_or_default(); + // Satisfy the ordering requirement so that the window can run: - add_sort_above(&mut window_child, &reqs, None); + let mut child_node = window_tree.children.swap_remove(0); + child_node = add_sort_above(child_node, reqs, None); + let child_plan = child_node.plan.clone(); + window_tree.children.push(child_node); - let uses_bounded_memory = window_expr.iter().all(|e| e.uses_bounded_memory()); - let new_window = if uses_bounded_memory { + if window_expr.iter().all(|e| e.uses_bounded_memory()) { Arc::new(BoundedWindowAggExec::try_new( window_expr.to_vec(), - window_child, - partitionby_exprs.to_vec(), + child_plan, + window_expr[0].partition_by().to_vec(), InputOrderMode::Sorted, )?) as _ } else { Arc::new(WindowAggExec::try_new( window_expr.to_vec(), - window_child, - partitionby_exprs.to_vec(), + child_plan, + window_expr[0].partition_by().to_vec(), )?) as _ - }; - Ok(Some(PlanWithCorrespondingSort::new(new_window))) - } -} + } + }; -/// Updates child to remove the unnecessary [`CoalescePartitionsExec`] below it. -fn update_child_to_remove_coalesce( - child: &mut Arc, - coalesce_onwards: &mut PlanWithCorrespondingCoalescePartitions, -) -> Result<()> { - if coalesce_onwards.coalesce_connection { - *child = remove_corresponding_coalesce_in_sub_plan(coalesce_onwards, child)?; - } - Ok(()) + window_tree.data = false; + Ok(window_tree) } -/// Removes the [`CoalescePartitionsExec`] from the plan in `coalesce_onwards`. +/// Removes the [`CoalescePartitionsExec`] from the plan in `node`. fn remove_corresponding_coalesce_in_sub_plan( - coalesce_onwards: &mut PlanWithCorrespondingCoalescePartitions, - parent: &Arc, -) -> Result> { - if is_coalesce_partitions(&coalesce_onwards.plan) { + mut requirements: PlanWithCorrespondingCoalescePartitions, +) -> Result { + let plan = &requirements.plan; + let children = &mut requirements.children; + if is_coalesce_partitions(&children[0].plan) { // We can safely use the 0th index since we have a `CoalescePartitionsExec`. - let mut new_plan = coalesce_onwards.plan.children()[0].clone(); - while new_plan.output_partitioning() == parent.output_partitioning() - && is_repartition(&new_plan) - && is_repartition(parent) + let mut new_child_node = children[0].children.swap_remove(0); + while new_child_node.plan.output_partitioning() == plan.output_partitioning() + && is_repartition(&new_child_node.plan) + && is_repartition(plan) { - new_plan = new_plan.children().swap_remove(0) + new_child_node = new_child_node.children.swap_remove(0) } - Ok(new_plan) + children[0] = new_child_node; } else { - let plan = coalesce_onwards.plan.clone(); - let mut children = plan.children(); - for (idx, node) in coalesce_onwards.children_nodes.iter_mut().enumerate() { - if node.coalesce_connection { - children[idx] = remove_corresponding_coalesce_in_sub_plan(node, &plan)?; - } - } - plan.with_new_children(children) + requirements.children = requirements + .children + .into_iter() + .map(|node| { + if node.data { + remove_corresponding_coalesce_in_sub_plan(node) + } else { + Ok(node) + } + }) + .collect::>()?; } + + requirements.update_plan_from_children() } /// Updates child to remove the unnecessary sort below it. 
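The helpers above all operate on the same shape of context tree: a node wrapping the plan, a rule-specific payload in `data`, and child contexts mirroring the plan's children. A condensed, self-contained model of that pattern (the `Exec` trait and `Ctx` type below are illustrative simplifications, not the actual `PlanContext` definition):

use std::sync::Arc;

// Minimal stand-in for an execution plan node.
trait Exec {
    fn children(&self) -> Vec<Arc<dyn Exec>>;
}

struct Leaf;
struct Union(Vec<Arc<dyn Exec>>);

impl Exec for Leaf {
    fn children(&self) -> Vec<Arc<dyn Exec>> {
        vec![]
    }
}
impl Exec for Union {
    fn children(&self) -> Vec<Arc<dyn Exec>> {
        self.0.clone()
    }
}

// Context tree mirroring the plan tree, carrying one payload per node.
struct Ctx<T> {
    plan: Arc<dyn Exec>,
    data: T,
    children: Vec<Ctx<T>>,
}

impl<T: Default> Ctx<T> {
    // Analogous to `new_default`: payloads start at their default value and
    // are filled in later by rule-specific helpers.
    fn new_default(plan: Arc<dyn Exec>) -> Self {
        let children = plan.children().into_iter().map(Self::new_default).collect();
        Self { plan, data: T::default(), children }
    }
}

fn main() {
    let a: Arc<dyn Exec> = Arc::new(Leaf);
    let b: Arc<dyn Exec> = Arc::new(Leaf);
    let plan: Arc<dyn Exec> = Arc::new(Union(vec![a, b]));
    let ctx: Ctx<bool> = Ctx::new_default(plan);
    assert_eq!(ctx.children.len(), 2);
    assert!(!ctx.data);
}

Keeping the payload generic is what allows one tree type to serve the sort-connection, coalesce-connection and unboundedness analyses in this patch instead of three bespoke structs.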
fn update_child_to_remove_unnecessary_sort( child_idx: usize, - sort_onwards: &mut PlanWithCorrespondingSort, + mut node: PlanWithCorrespondingSort, parent: &Arc, -) -> Result<()> { - if sort_onwards.sort_connection { +) -> Result { + if node.data { let requires_single_partition = matches!( parent.required_input_distribution()[child_idx], Distribution::SinglePartition ); - remove_corresponding_sort_from_sub_plan(sort_onwards, requires_single_partition)?; + node = remove_corresponding_sort_from_sub_plan(node, requires_single_partition)?; } - sort_onwards.sort_connection = false; - Ok(()) + node.data = false; + Ok(node) } -/// Removes the sort from the plan in `sort_onwards`. +/// Removes the sort from the plan in `node`. fn remove_corresponding_sort_from_sub_plan( - sort_onwards: &mut PlanWithCorrespondingSort, + mut node: PlanWithCorrespondingSort, requires_single_partition: bool, -) -> Result<()> { +) -> Result { // A `SortExec` is always at the bottom of the tree. - if is_sort(&sort_onwards.plan) { - *sort_onwards = sort_onwards.children_nodes.swap_remove(0); + if is_sort(&node.plan) { + node = node.children.swap_remove(0); } else { - let PlanWithCorrespondingSort { - plan, - sort_connection: _, - children_nodes, - } = sort_onwards; let mut any_connection = false; - for (child_idx, child_node) in children_nodes.iter_mut().enumerate() { - if child_node.sort_connection { - any_connection = true; - let requires_single_partition = matches!( - plan.required_input_distribution()[child_idx], - Distribution::SinglePartition - ); - remove_corresponding_sort_from_sub_plan( - child_node, - requires_single_partition, - )?; - } - } - if any_connection || children_nodes.is_empty() { - *sort_onwards = PlanWithCorrespondingSort::update_children( - plan.clone(), - children_nodes.clone(), - )?; + let required_dist = node.plan.required_input_distribution(); + node.children = node + .children + .into_iter() + .enumerate() + .map(|(idx, child)| { + if child.data { + any_connection = true; + remove_corresponding_sort_from_sub_plan( + child, + matches!(required_dist[idx], Distribution::SinglePartition), + ) + } else { + Ok(child) + } + }) + .collect::>()?; + if any_connection || node.children.is_empty() { + node = update_sort_ctx_children(node, false)?; } - let PlanWithCorrespondingSort { - plan, - children_nodes, - .. - } = sort_onwards; + // Replace with variants that do not preserve order. - if is_sort_preserving_merge(plan) { - children_nodes.swap_remove(0); - *plan = plan.children().swap_remove(0); - } else if let Some(repartition) = plan.as_any().downcast_ref::() + if is_sort_preserving_merge(&node.plan) { + node.children = node.children.swap_remove(0).children; + node.plan = node.plan.children().swap_remove(0); + } else if let Some(repartition) = + node.plan.as_any().downcast_ref::() { - *plan = Arc::new(RepartitionExec::try_new( - children_nodes[0].plan.clone(), + node.plan = Arc::new(RepartitionExec::try_new( + node.children[0].plan.clone(), repartition.output_partitioning(), )?) as _; } }; // Deleting a merging sort may invalidate distribution requirements. // Ensure that we stay compliant with such requirements: - if requires_single_partition - && sort_onwards.plan.output_partitioning().partition_count() > 1 + if requires_single_partition && node.plan.output_partitioning().partition_count() > 1 { - // If there is existing ordering, to preserve ordering use SortPreservingMergeExec - // instead of CoalescePartitionsExec. 
- if let Some(ordering) = sort_onwards.plan.output_ordering() { - let plan = Arc::new(SortPreservingMergeExec::new( - ordering.to_vec(), - sort_onwards.plan.clone(), - )) as _; - *sort_onwards = PlanWithCorrespondingSort::update_children( - plan, - vec![sort_onwards.clone()], - )?; + // If there is existing ordering, to preserve ordering use + // `SortPreservingMergeExec` instead of a `CoalescePartitionsExec`. + let plan = node.plan.clone(); + let plan = if let Some(ordering) = plan.output_ordering() { + Arc::new(SortPreservingMergeExec::new(ordering.to_vec(), plan)) as _ } else { - let plan = - Arc::new(CoalescePartitionsExec::new(sort_onwards.plan.clone())) as _; - *sort_onwards = PlanWithCorrespondingSort::update_children( - plan, - vec![sort_onwards.clone()], - )?; - } + Arc::new(CoalescePartitionsExec::new(plan)) as _ + }; + node = PlanWithCorrespondingSort::new(plan, false, vec![node]); + node = update_sort_ctx_children(node, false)?; } - Ok(()) + Ok(node) } /// Converts an [ExecutionPlan] trait object to a [PhysicalSortExpr] slice when possible. @@ -734,13 +561,9 @@ fn get_sort_exprs( ) -> Result<(&[PhysicalSortExpr], Option)> { if let Some(sort_exec) = sort_any.as_any().downcast_ref::() { Ok((sort_exec.expr(), sort_exec.fetch())) - } else if let Some(sort_preserving_merge_exec) = - sort_any.as_any().downcast_ref::() + } else if let Some(spm) = sort_any.as_any().downcast_ref::() { - Ok(( - sort_preserving_merge_exec.expr(), - sort_preserving_merge_exec.fetch(), - )) + Ok((spm.expr(), spm.fetch())) } else { plan_err!("Given ExecutionPlan is not a SortExec or a SortPreservingMergeExec") } @@ -753,7 +576,7 @@ mod tests { use super::*; use crate::physical_optimizer::enforce_distribution::EnforceDistribution; use crate::physical_optimizer::test_utils::{ - aggregate_exec, bounded_window_exec, coalesce_batches_exec, + aggregate_exec, bounded_window_exec, check_integrity, coalesce_batches_exec, coalesce_partitions_exec, filter_exec, global_limit_exec, hash_join_exec, limit_exec, local_limit_exec, memory_exec, parquet_exec, parquet_exec_sorted, repartition_exec, sort_exec, sort_expr, sort_expr_options, sort_merge_join_exec, @@ -776,7 +599,6 @@ mod tests { let nullable_column = Field::new("nullable_col", DataType::Int32, true); let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false); let schema = Arc::new(Schema::new(vec![nullable_column, non_nullable_column])); - Ok(schema) } @@ -812,6 +634,50 @@ mod tests { let session_ctx = SessionContext::new_with_config(config); let state = session_ctx.state(); + // This file has 4 rules that use tree node, apply these rules as in the + // EnforSorting::optimize implementation + // After these operations tree nodes should be in a consistent state. + // This code block makes sure that these rules doesn't violate tree node integrity. + { + let plan_requirements = PlanWithCorrespondingSort::new_default($PLAN.clone()); + let adjusted = plan_requirements + .transform_up(&ensure_sorting) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let new_plan = if state.config_options().optimizer.repartition_sorts { + let plan_with_coalesce_partitions = + PlanWithCorrespondingCoalescePartitions::new_default(adjusted.plan); + let parallel = plan_with_coalesce_partitions + .transform_up(¶llelize_sorts) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. 
+ parallel.plan + } else { + adjusted.plan + }; + + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(new_plan); + let updated_plan = plan_with_pipeline_fixer + .transform_up(&|plan_with_pipeline_fixer| { + replace_with_order_preserving_variants( + plan_with_pipeline_fixer, + false, + true, + state.config_options(), + ) + }) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + + let mut sort_pushdown = SortPushDown::new_default(updated_plan.plan); + assign_initial_requirements(&mut sort_pushdown); + sort_pushdown + .transform_down(&pushdown_sorts) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + } + let physical_plan = $PLAN; let formatted = displayable(physical_plan.as_ref()).indent(true).to_string(); let actual: Vec<&str> = formatted.trim().lines().collect(); @@ -858,6 +724,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -917,6 +784,7 @@ mod tests { " SortExec: expr=[non_nullable_col@1 DESC]", " MemoryExec: partitions=1, partition_sizes=[0]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -938,6 +806,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -964,6 +833,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1003,6 +873,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1047,6 +918,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1100,6 +972,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " MemoryExec: partitions=1, partition_sizes=[0]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1127,6 +1000,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1156,6 +1030,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1198,6 +1073,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1221,6 +1097,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1248,6 +1125,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1275,6 +1153,7 @@ mod tests { // should not add a sort at the output of the union, input plan should not be changed let expected_optimized = expected_input.clone(); assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1306,6 +1185,7 @@ mod tests { // should 
not add a sort at the output of the union, input plan should not be changed let expected_optimized = expected_input.clone(); assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1342,6 +1222,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1383,6 +1264,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1424,6 +1306,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1470,6 +1353,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1518,6 +1402,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1552,6 +1437,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_output, physical_plan, true); + Ok(()) } @@ -1600,6 +1486,7 @@ mod tests { " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1649,6 +1536,7 @@ mod tests { " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC, non_nullable_col@1 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1685,6 +1573,7 @@ mod tests { " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col], output_ordering=[nullable_col@0 ASC]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1736,6 +1625,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 DESC NULLS LAST]", " ParquetExec: file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -1979,6 +1869,7 @@ mod tests { " SortExec: expr=[nullable_col@0 ASC,non_nullable_col@1 ASC]", " MemoryExec: partitions=1, partition_sizes=[0]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -2013,6 +1904,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " ParquetExec: 
file_groups={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -2105,6 +1997,7 @@ mod tests { " MemoryExec: partitions=1, partition_sizes=[0]", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -2132,6 +2025,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], has_header=false"]; assert_optimized!(expected_input, expected_optimized, physical_plan, true); + Ok(()) } @@ -2221,6 +2115,7 @@ mod tests { physical_plan, true ); + Ok(()) } @@ -2242,6 +2137,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], has_header=false",]; assert_optimized!(expected_input, expected_optimized, physical_plan, false); + Ok(()) } @@ -2270,6 +2166,7 @@ mod tests { " RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1", " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC, b@1 ASC], has_header=false",]; assert_optimized!(expected_input, expected_optimized, physical_plan, false); + Ok(()) } @@ -2302,6 +2199,7 @@ mod tests { " CsvExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], has_header=false", ]; assert_optimized!(expected_input, expected_optimized, physical_plan, false); + Ok(()) } } diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index f9b9fdf85cfa..083cd5ecab8a 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -27,7 +27,9 @@ use std::sync::Arc; use crate::config::ConfigOptions; use crate::error::Result; -use crate::physical_optimizer::pipeline_checker::PipelineStatePropagator; +use crate::physical_optimizer::pipeline_checker::{ + children_unbounded, PipelineStatePropagator, +}; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::joins::utils::{ColumnIndex, JoinFilter}; use crate::physical_plan::joins::{ @@ -229,7 +231,7 @@ impl PhysicalOptimizerRule for JoinSelection { plan: Arc, config: &ConfigOptions, ) -> Result> { - let pipeline = PipelineStatePropagator::new(plan); + let pipeline = PipelineStatePropagator::new_default(plan); // First, we make pipeline-fixing modifications to joins so as to accommodate // unbounded inputs. Each pipeline-fixing subrule, which is a function // of type `PipelineFixerSubrule`, takes a single [`PipelineStatePropagator`] @@ -352,7 +354,7 @@ fn try_collect_left( )?))), (false, true) => { if supports_swap(*hash_join.join_type()) { - Ok(Some(swap_hash_join(hash_join, PartitionMode::CollectLeft)?)) + swap_hash_join(hash_join, PartitionMode::CollectLeft).map(Some) } else { Ok(None) } @@ -443,10 +445,8 @@ fn statistical_join_selection_subrule( } /// Pipeline-fixing join selection subrule. -pub type PipelineFixerSubrule = dyn Fn( - PipelineStatePropagator, - &ConfigOptions, -) -> Option>; +pub type PipelineFixerSubrule = + dyn Fn(PipelineStatePropagator, &ConfigOptions) -> Result; /// Converts a hash join to a symmetric hash join in the case of infinite inputs on both sides. 
/// @@ -466,16 +466,16 @@ pub type PipelineFixerSubrule = dyn Fn( fn hash_join_convert_symmetric_subrule( mut input: PipelineStatePropagator, config_options: &ConfigOptions, -) -> Option> { +) -> Result { // Check if the current plan node is a HashJoinExec. if let Some(hash_join) = input.plan.as_any().downcast_ref::() { // Determine if left and right children are unbounded. - let ub_flags = input.children_unbounded(); + let ub_flags = children_unbounded(&input); let (left_unbounded, right_unbounded) = (ub_flags[0], ub_flags[1]); // Update the unbounded flag of the input. - input.unbounded = left_unbounded || right_unbounded; + input.data = left_unbounded || right_unbounded; // Process only if both left and right sides are unbounded. - let result = if left_unbounded && right_unbounded { + if left_unbounded && right_unbounded { // Determine the partition mode based on configuration. let mode = if config_options.optimizer.repartition_joins { StreamJoinPartitionMode::Partitioned @@ -519,7 +519,7 @@ fn hash_join_convert_symmetric_subrule( let name = schema.field(*index).name(); let col = Arc::new(Column::new(name, *index)) as _; // Check if the column is ordered. - equivalence.get_expr_ordering(col).state + equivalence.get_expr_ordering(col).data != SortProperties::Unordered }, ) @@ -539,7 +539,7 @@ fn hash_join_convert_symmetric_subrule( let left_order = determine_order(JoinSide::Left); let right_order = determine_order(JoinSide::Right); - SymmetricHashJoinExec::try_new( + return SymmetricHashJoinExec::try_new( hash_join.left().clone(), hash_join.right().clone(), hash_join.on().to_vec(), @@ -553,14 +553,10 @@ fn hash_join_convert_symmetric_subrule( .map(|exec| { input.plan = Arc::new(exec) as _; input - }) - } else { - Ok(input) - }; - Some(result) - } else { - None + }); + } } + Ok(input) } /// This subrule will swap build/probe sides of a hash join depending on whether @@ -607,12 +603,12 @@ fn hash_join_convert_symmetric_subrule( fn hash_join_swap_subrule( mut input: PipelineStatePropagator, _config_options: &ConfigOptions, -) -> Option> { +) -> Result { if let Some(hash_join) = input.plan.as_any().downcast_ref::() { - let ub_flags = input.children_unbounded(); + let ub_flags = children_unbounded(&input); let (left_unbounded, right_unbounded) = (ub_flags[0], ub_flags[1]); - input.unbounded = left_unbounded || right_unbounded; - let result = if left_unbounded + input.data = left_unbounded || right_unbounded; + if left_unbounded && !right_unbounded && matches!( *hash_join.join_type(), @@ -620,18 +616,12 @@ fn hash_join_swap_subrule( | JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti - ) { - swap_join_according_to_unboundedness(hash_join).map(|plan| { - input.plan = plan; - input - }) - } else { - Ok(input) - }; - Some(result) - } else { - None + ) + { + input.plan = swap_join_according_to_unboundedness(hash_join)?; + } } + Ok(input) } /// This function swaps sides of a hash join to make it runnable even if one of @@ -669,13 +659,11 @@ fn apply_subrules( config_options: &ConfigOptions, ) -> Result> { for subrule in subrules { - if let Some(value) = subrule(input.clone(), config_options).transpose()? { - input = value; - } + input = subrule(input, config_options)?; } - let is_unbounded = input + input.data = input .plan - .unbounded_output(&input.children_unbounded()) + .unbounded_output(&children_unbounded(&input)) // Treat the case where an operator can not run on unbounded data as // if it can and it outputs unbounded data. Do not raise an error yet. 
// Such operators may be fixed, adjusted or replaced later on during @@ -683,7 +671,6 @@ fn apply_subrules( // etc. If this doesn't happen, the final `PipelineChecker` rule will // catch this and raise an error anyway. .unwrap_or(true); - input.unbounded = is_unbounded; Ok(Transformed::Yes(input)) } @@ -693,6 +680,7 @@ mod tests_statistical { use super::*; use crate::{ + physical_optimizer::test_utils::check_integrity, physical_plan::{ displayable, joins::PartitionMode, ColumnStatistics, Statistics, }, @@ -840,26 +828,51 @@ mod tests_statistical { (big, medium, small) } + pub(crate) fn crosscheck_plans(plan: Arc) -> Result<()> { + let pipeline = PipelineStatePropagator::new_default(plan); + let subrules: Vec> = vec![ + Box::new(hash_join_convert_symmetric_subrule), + Box::new(hash_join_swap_subrule), + ]; + let state = pipeline + .transform_up(&|p| apply_subrules(p, &subrules, &ConfigOptions::new())) + .and_then(check_integrity)?; + // TODO: End state payloads will be checked here. + let config = ConfigOptions::new().optimizer; + let collect_left_threshold = config.hash_join_single_partition_threshold; + let collect_threshold_num_rows = config.hash_join_single_partition_threshold_rows; + let _ = state.plan.transform_up(&|plan| { + statistical_join_selection_subrule( + plan, + collect_left_threshold, + collect_threshold_num_rows, + ) + })?; + Ok(()) + } + #[tokio::test] async fn test_join_with_swap() { let (big, small) = create_big_and_small(); - let join = HashJoinExec::try_new( - Arc::clone(&big), - Arc::clone(&small), - vec![( - Column::new_with_schema("big_col", &big.schema()).unwrap(), - Column::new_with_schema("small_col", &small.schema()).unwrap(), - )], - None, - &JoinType::Left, - PartitionMode::CollectLeft, - false, - ) - .unwrap(); + let join = Arc::new( + HashJoinExec::try_new( + Arc::clone(&big), + Arc::clone(&small), + vec![( + Column::new_with_schema("big_col", &big.schema()).unwrap(), + Column::new_with_schema("small_col", &small.schema()).unwrap(), + )], + None, + &JoinType::Left, + PartitionMode::CollectLeft, + false, + ) + .unwrap(), + ); let optimized_join = JoinSelection::new() - .optimize(Arc::new(join), &ConfigOptions::new()) + .optimize(join.clone(), &ConfigOptions::new()) .unwrap(); let swapping_projection = optimized_join @@ -889,28 +902,31 @@ mod tests_statistical { swapped_join.right().statistics().unwrap().total_byte_size, Precision::Inexact(2097152) ); + crosscheck_plans(join.clone()).unwrap(); } #[tokio::test] async fn test_left_join_with_swap() { let (big, small) = create_big_and_small(); // Left out join should alway swap when the mode is PartitionMode::CollectLeft, even left side is small and right side is large - let join = HashJoinExec::try_new( - Arc::clone(&small), - Arc::clone(&big), - vec![( - Column::new_with_schema("small_col", &small.schema()).unwrap(), - Column::new_with_schema("big_col", &big.schema()).unwrap(), - )], - None, - &JoinType::Left, - PartitionMode::CollectLeft, - false, - ) - .unwrap(); + let join = Arc::new( + HashJoinExec::try_new( + Arc::clone(&small), + Arc::clone(&big), + vec![( + Column::new_with_schema("small_col", &small.schema()).unwrap(), + Column::new_with_schema("big_col", &big.schema()).unwrap(), + )], + None, + &JoinType::Left, + PartitionMode::CollectLeft, + false, + ) + .unwrap(), + ); let optimized_join = JoinSelection::new() - .optimize(Arc::new(join), &ConfigOptions::new()) + .optimize(join.clone(), &ConfigOptions::new()) .unwrap(); let swapping_projection = optimized_join @@ -940,6 +956,7 @@ mod tests_statistical 
{ swapped_join.right().statistics().unwrap().total_byte_size, Precision::Inexact(8192) ); + crosscheck_plans(join.clone()).unwrap(); } #[tokio::test] @@ -948,24 +965,26 @@ mod tests_statistical { for join_type in join_types { let (big, small) = create_big_and_small(); - let join = HashJoinExec::try_new( - Arc::clone(&big), - Arc::clone(&small), - vec![( - Column::new_with_schema("big_col", &big.schema()).unwrap(), - Column::new_with_schema("small_col", &small.schema()).unwrap(), - )], - None, - &join_type, - PartitionMode::Partitioned, - false, - ) - .unwrap(); + let join = Arc::new( + HashJoinExec::try_new( + Arc::clone(&big), + Arc::clone(&small), + vec![( + Column::new_with_schema("big_col", &big.schema()).unwrap(), + Column::new_with_schema("small_col", &small.schema()).unwrap(), + )], + None, + &join_type, + PartitionMode::Partitioned, + false, + ) + .unwrap(), + ); let original_schema = join.schema(); let optimized_join = JoinSelection::new() - .optimize(Arc::new(join), &ConfigOptions::new()) + .optimize(join.clone(), &ConfigOptions::new()) .unwrap(); let swapped_join = optimized_join @@ -976,7 +995,6 @@ mod tests_statistical { ); assert_eq!(swapped_join.schema().fields().len(), 1); - assert_eq!( swapped_join.left().statistics().unwrap().total_byte_size, Precision::Inexact(8192) @@ -985,8 +1003,8 @@ mod tests_statistical { swapped_join.right().statistics().unwrap().total_byte_size, Precision::Inexact(2097152) ); - assert_eq!(original_schema, swapped_join.schema()); + crosscheck_plans(join).unwrap(); } } @@ -996,18 +1014,20 @@ mod tests_statistical { let expected_lines = $EXPECTED_LINES.iter().map(|s| *s).collect::>(); + let plan = Arc::new($PLAN); let optimized = JoinSelection::new() - .optimize(Arc::new($PLAN), &ConfigOptions::new()) + .optimize(plan.clone(), &ConfigOptions::new()) .unwrap(); - let plan = displayable(optimized.as_ref()).indent(true).to_string(); - let actual_lines = plan.split("\n").collect::>(); + let plan_string = displayable(optimized.as_ref()).indent(true).to_string(); + let actual_lines = plan_string.split("\n").collect::>(); assert_eq!( &expected_lines, &actual_lines, "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n", expected_lines, actual_lines ); + crosscheck_plans(plan).unwrap(); }; } @@ -1069,22 +1089,24 @@ mod tests_statistical { #[tokio::test] async fn test_join_no_swap() { let (big, small) = create_big_and_small(); - let join = HashJoinExec::try_new( - Arc::clone(&small), - Arc::clone(&big), - vec![( - Column::new_with_schema("small_col", &small.schema()).unwrap(), - Column::new_with_schema("big_col", &big.schema()).unwrap(), - )], - None, - &JoinType::Inner, - PartitionMode::CollectLeft, - false, - ) - .unwrap(); + let join = Arc::new( + HashJoinExec::try_new( + Arc::clone(&small), + Arc::clone(&big), + vec![( + Column::new_with_schema("small_col", &small.schema()).unwrap(), + Column::new_with_schema("big_col", &big.schema()).unwrap(), + )], + None, + &JoinType::Inner, + PartitionMode::CollectLeft, + false, + ) + .unwrap(), + ); let optimized_join = JoinSelection::new() - .optimize(Arc::new(join), &ConfigOptions::new()) + .optimize(join.clone(), &ConfigOptions::new()) .unwrap(); let swapped_join = optimized_join @@ -1100,6 +1122,7 @@ mod tests_statistical { swapped_join.right().statistics().unwrap().total_byte_size, Precision::Inexact(2097152) ); + crosscheck_plans(join).unwrap(); } #[tokio::test] @@ -1270,19 +1293,21 @@ mod tests_statistical { is_swapped: bool, expected_mode: PartitionMode, ) { - let join = HashJoinExec::try_new( - left, - right, - on, 
- None, - &JoinType::Inner, - PartitionMode::Auto, - false, - ) - .unwrap(); + let join = Arc::new( + HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::Inner, + PartitionMode::Auto, + false, + ) + .unwrap(), + ); let optimized_join = JoinSelection::new() - .optimize(Arc::new(join), &ConfigOptions::new()) + .optimize(join.clone(), &ConfigOptions::new()) .unwrap(); if !is_swapped { @@ -1306,6 +1331,7 @@ mod tests_statistical { assert_eq!(*swapped_join.partition_mode(), expected_mode); } + crosscheck_plans(join).unwrap(); } } @@ -1350,6 +1376,8 @@ mod util_tests { #[cfg(test)] mod hash_join_tests { + use self::tests_statistical::crosscheck_plans; + use super::*; use crate::physical_optimizer::join_selection::swap_join_type; use crate::physical_optimizer::test_utils::SourceType; @@ -1716,7 +1744,7 @@ mod hash_join_tests { 2, )) as Arc; - let join = HashJoinExec::try_new( + let join = Arc::new(HashJoinExec::try_new( Arc::clone(&left_exec), Arc::clone(&right_exec), vec![( @@ -1727,29 +1755,19 @@ mod hash_join_tests { &t.initial_join_type, t.initial_mode, false, - )?; + )?); + let left_child = Arc::new(EmptyExec::new(Arc::new(Schema::empty()))); + let right_child = Arc::new(EmptyExec::new(Arc::new(Schema::empty()))); let children = vec![ - PipelineStatePropagator { - plan: Arc::new(EmptyExec::new(Arc::new(Schema::empty()))), - unbounded: left_unbounded, - children: vec![], - }, - PipelineStatePropagator { - plan: Arc::new(EmptyExec::new(Arc::new(Schema::empty()))), - unbounded: right_unbounded, - children: vec![], - }, + PipelineStatePropagator::new(left_child, left_unbounded, vec![]), + PipelineStatePropagator::new(right_child, right_unbounded, vec![]), ]; - let initial_hash_join_state = PipelineStatePropagator { - plan: Arc::new(join), - unbounded: false, - children, - }; + let initial_hash_join_state = + PipelineStatePropagator::new(join.clone(), false, children); let optimized_hash_join = - hash_join_swap_subrule(initial_hash_join_state, &ConfigOptions::new()) - .unwrap()?; + hash_join_swap_subrule(initial_hash_join_state, &ConfigOptions::new())?; let optimized_join_plan = optimized_hash_join.plan; // If swap did happen @@ -1805,6 +1823,7 @@ mod hash_join_tests { ) ); }; + crosscheck_plans(plan).unwrap(); Ok(()) } } diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs index e281d0e7c23e..bb0665c10bcc 100644 --- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs +++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs @@ -19,19 +19,19 @@ //! infinite sources, if there are any. It will reject non-runnable query plans //! that use pipeline-breaking operators on infinite input(s). -use std::borrow::Cow; use std::sync::Arc; use crate::config::ConfigOptions; use crate::error::Result; use crate::physical_optimizer::PhysicalOptimizerRule; -use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; +use crate::physical_plan::ExecutionPlan; use datafusion_common::config::OptimizerOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported}; use datafusion_physical_plan::joins::SymmetricHashJoinExec; +use datafusion_physical_plan::tree_node::PlanContext; /// The PipelineChecker rule rejects non-runnable query plans that use /// pipeline-breaking operators on infinite input(s). 
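The `PipelineChecker` changes below follow the same scheme for unboundedness: each node's flag is computed from its children during a bottom-up traversal, and pipeline-breaking operators over infinite inputs are rejected. A rough standalone sketch of that propagation (the `Op` and `Node` types are hypothetical; real operators consult `unbounded_output` rather than a hard-coded table):

#[derive(Clone, Copy)]
enum Op {
    StreamingScan,
    FileScan,
    Filter,
    FinalAggregate,
}

struct Node {
    op: Op,
    children: Vec<Node>,
    unbounded: bool,
}

// Bottom-up propagation of the unboundedness flag; a blocking operator
// over an infinite input is rejected with an error.
fn propagate(node: &mut Node) -> Result<(), String> {
    for child in node.children.iter_mut() {
        propagate(child)?;
    }
    let child_unbounded = node.children.iter().any(|c| c.unbounded);
    node.unbounded = match node.op {
        Op::StreamingScan => true,
        Op::FileScan => false,
        Op::Filter => child_unbounded,
        Op::FinalAggregate if child_unbounded => {
            return Err("pipeline-breaking operator over an infinite input".to_string())
        }
        Op::FinalAggregate => false,
    };
    Ok(())
}

fn main() {
    // A blocking aggregation over a finite file scan is fine:
    let mut finite = Node {
        op: Op::FinalAggregate,
        children: vec![Node {
            op: Op::Filter,
            children: vec![Node { op: Op::FileScan, children: vec![], unbounded: false }],
            unbounded: false,
        }],
        unbounded: false,
    };
    assert!(propagate(&mut finite).is_ok());

    // The same aggregation over a streaming source is rejected:
    let mut infinite = Node {
        op: Op::FinalAggregate,
        children: vec![Node { op: Op::StreamingScan, children: vec![], unbounded: false }],
        unbounded: false,
    };
    assert!(propagate(&mut infinite).is_err());
}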
@@ -51,7 +51,7 @@ impl PhysicalOptimizerRule for PipelineChecker { plan: Arc, config: &ConfigOptions, ) -> Result> { - let pipeline = PipelineStatePropagator::new(plan); + let pipeline = PipelineStatePropagator::new_default(plan); let state = pipeline .transform_up(&|p| check_finiteness_requirements(p, &config.optimizer))?; Ok(state.plan) @@ -66,54 +66,12 @@ impl PhysicalOptimizerRule for PipelineChecker { } } -/// [PipelineStatePropagator] propagates the [ExecutionPlan] pipelining information. -#[derive(Clone, Debug)] -pub struct PipelineStatePropagator { - pub(crate) plan: Arc, - pub(crate) unbounded: bool, - pub(crate) children: Vec, -} - -impl PipelineStatePropagator { - /// Constructs a new, default pipelining state. - pub fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - unbounded: false, - children: children.into_iter().map(Self::new).collect(), - } - } +/// This object propagates the [`ExecutionPlan`] pipelining information. +pub type PipelineStatePropagator = PlanContext; - /// Returns the children unboundedness information. - pub fn children_unbounded(&self) -> Vec { - self.children.iter().map(|c| c.unbounded).collect() - } -} - -impl TreeNode for PipelineStatePropagator { - fn children_nodes(&self) -> Vec> { - self.children.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children.is_empty() { - self.children = self - .children - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); - } - Ok(self) - } +/// Collects unboundedness flags of all the children of the plan in `pipeline`. +pub fn children_unbounded(pipeline: &PipelineStatePropagator) -> Vec { + pipeline.children.iter().map(|c| c.data).collect() } /// This function propagates finiteness information and rejects any plan with @@ -126,16 +84,15 @@ pub fn check_finiteness_requirements( if !(optimizer_options.allow_symmetric_joins_without_pruning || (exec.check_if_order_information_available()? && is_prunable(exec))) { - const MSG: &str = "Join operation cannot operate on a non-prunable stream without enabling \ - the 'allow_symmetric_joins_without_pruning' configuration flag"; - return plan_err!("{}", MSG); + return plan_err!("Join operation cannot operate on a non-prunable stream without enabling \ + the 'allow_symmetric_joins_without_pruning' configuration flag"); } } input .plan - .unbounded_output(&input.children_unbounded()) + .unbounded_output(&children_unbounded(&input)) .map(|value| { - input.unbounded = value; + input.data = value; Transformed::Yes(input) }) } diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index e49b358608aa..4656b5b27067 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -19,121 +19,78 @@ //! order-preserving variants when it is helpful; either in terms of //! performance or to accommodate unbounded streams by fixing the pipeline. 
-use std::borrow::Cow; use std::sync::Arc; -use super::utils::is_repartition; +use super::utils::{is_repartition, is_sort_preserving_merge}; use crate::error::Result; use crate::physical_optimizer::utils::{is_coalesce_partitions, is_sort}; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::tree_node::Transformed; +use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion_physical_plan::tree_node::PlanContext; use datafusion_physical_plan::unbounded_output; +use itertools::izip; + /// For a given `plan`, this object carries the information one needs from its /// descendants to decide whether it is beneficial to replace order-losing (but /// somewhat faster) variants of certain operators with their order-preserving /// (but somewhat slower) cousins. -#[derive(Debug, Clone)] -pub(crate) struct OrderPreservationContext { - pub(crate) plan: Arc, - ordering_connection: bool, - children_nodes: Vec, -} +pub type OrderPreservationContext = PlanContext; + +/// Updates order-preservation data for all children of the given node. +pub fn update_children(opc: &mut OrderPreservationContext) { + for PlanContext { + plan, + children, + data, + } in opc.children.iter_mut() + { + let maintains_input_order = plan.maintains_input_order(); + let inspect_child = |idx| { + maintains_input_order[idx] + || is_coalesce_partitions(plan) + || is_repartition(plan) + }; -impl OrderPreservationContext { - /// Creates an empty context tree. Each node has `false` connections. - pub fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - ordering_connection: false, - children_nodes: children.into_iter().map(Self::new).collect(), + // We cut the path towards nodes that do not maintain ordering. + for (idx, c) in children.iter_mut().enumerate() { + c.data &= inspect_child(idx); } - } - - /// Creates a new order-preservation context from those of children nodes. - pub fn update_children(mut self) -> Result { - for node in self.children_nodes.iter_mut() { - let plan = node.plan.clone(); - let children = plan.children(); - let maintains_input_order = plan.maintains_input_order(); - let inspect_child = |idx| { - maintains_input_order[idx] - || is_coalesce_partitions(&plan) - || is_repartition(&plan) - }; - - // We cut the path towards nodes that do not maintain ordering. - for (idx, c) in node.children_nodes.iter_mut().enumerate() { - c.ordering_connection &= inspect_child(idx); - } - - node.ordering_connection = if children.is_empty() { - false - } else if !node.children_nodes[0].ordering_connection - && ((is_repartition(&plan) && !maintains_input_order[0]) - || (is_coalesce_partitions(&plan) - && children[0].output_ordering().is_some())) - { - // We either have a RepartitionExec or a CoalescePartitionsExec - // and they lose their input ordering, so initiate connection: - true - } else { - // Maintain connection if there is a child with a connection, - // and operator can possibly maintain that connection (either - // in its current form or when we replace it with the corresponding - // order preserving operator). 
- node.children_nodes - .iter() - .enumerate() - .any(|(idx, c)| c.ordering_connection && inspect_child(idx)) - } - } - - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); - self.ordering_connection = false; - Ok(self) - } -} - -impl TreeNode for OrderPreservationContext { - fn children_nodes(&self) -> Vec> { - self.children_nodes.iter().map(Cow::Borrowed).collect() - } - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children_nodes.is_empty() { - self.children_nodes = self - .children_nodes - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? - .into(); + let plan_children = plan.children(); + *data = if plan_children.is_empty() { + false + } else if !children[0].data + && ((is_repartition(plan) && !maintains_input_order[0]) + || (is_coalesce_partitions(plan) + && plan_children[0].output_ordering().is_some())) + { + // We either have a RepartitionExec or a CoalescePartitionsExec + // and they lose their input ordering, so initiate connection: + true + } else { + // Maintain connection if there is a child with a connection, + // and operator can possibly maintain that connection (either + // in its current form or when we replace it with the corresponding + // order preserving operator). + children + .iter() + .enumerate() + .any(|(idx, c)| c.data && inspect_child(idx)) } - Ok(self) } + opc.data = false; } /// Calculates the updated plan by replacing operators that lose ordering /// inside `sort_input` with their order-preserving variants. This will /// generate an alternative plan, which will be accepted or rejected later on /// depending on whether it helps us remove a `SortExec`. -fn get_updated_plan( +fn plan_with_order_preserving_variants( mut sort_input: OrderPreservationContext, // Flag indicating that it is desirable to replace `RepartitionExec`s with // `SortPreservingRepartitionExec`s: @@ -142,74 +99,115 @@ fn get_updated_plan( // with `SortPreservingMergeExec`s: is_spm_better: bool, ) -> Result { - let updated_children = sort_input - .children_nodes - .clone() + sort_input.children = sort_input + .children .into_iter() - .map(|item| { - // Update children and their descendants in the given tree if the connection is open: - if item.ordering_connection { - get_updated_plan(item, is_spr_better, is_spm_better) + .map(|node| { + // Update descendants in the given tree if there is a connection: + if node.data { + plan_with_order_preserving_variants(node, is_spr_better, is_spm_better) } else { - Ok(item) + Ok(node) } }) - .collect::>>()?; + .collect::>()?; + sort_input.data = false; - sort_input.plan = sort_input - .plan - .with_new_children(updated_children.iter().map(|c| c.plan.clone()).collect())?; - sort_input.ordering_connection = false; - sort_input.children_nodes = updated_children; - - // When a `RepartitionExec` doesn't preserve ordering, replace it with - // a sort-preserving variant if appropriate: if is_repartition(&sort_input.plan) && !sort_input.plan.maintains_input_order()[0] && is_spr_better { - let child = sort_input.plan.children().swap_remove(0); - let repartition = - RepartitionExec::try_new(child, sort_input.plan.output_partitioning())? 
- .with_preserve_order(); - sort_input.plan = Arc::new(repartition) as _; - sort_input.children_nodes[0].ordering_connection = true; + // When a `RepartitionExec` doesn't preserve ordering, replace it with + // a sort-preserving variant if appropriate: + let child = sort_input.children[0].plan.clone(); + let partitioning = sort_input.plan.output_partitioning(); + sort_input.plan = Arc::new( + RepartitionExec::try_new(child, partitioning)?.with_preserve_order(), + ) as _; + sort_input.children[0].data = true; + return Ok(sort_input); } else if is_coalesce_partitions(&sort_input.plan) && is_spm_better { - // When the input of a `CoalescePartitionsExec` has an ordering, replace it - // with a `SortPreservingMergeExec` if appropriate: - if let Some(ordering) = sort_input.children_nodes[0] - .plan - .output_ordering() - .map(|o| o.to_vec()) + let child = &sort_input.children[0].plan; + if let Some(ordering) = child.output_ordering().map(Vec::from) { + // When the input of a `CoalescePartitionsExec` has an ordering, + // replace it with a `SortPreservingMergeExec` if appropriate: + let spm = SortPreservingMergeExec::new(ordering, child.clone()); + sort_input.plan = Arc::new(spm) as _; + sort_input.children[0].data = true; + return Ok(sort_input); + } + } + + sort_input.update_plan_from_children() +} + +/// Calculates the updated plan by replacing operators that preserve ordering +/// inside `sort_input` with their order-breaking variants. This will restore +/// the original plan modified by [`plan_with_order_preserving_variants`]. +fn plan_with_order_breaking_variants( + mut sort_input: OrderPreservationContext, +) -> Result { + let plan = &sort_input.plan; + sort_input.children = izip!( + sort_input.children, + plan.maintains_input_order(), + plan.required_input_ordering() + ) + .map(|(node, maintains, required_ordering)| { + // Replace with non-order preserving variants as long as ordering is + // not required by intermediate operators: + if maintains + && (is_sort_preserving_merge(plan) + || !required_ordering.map_or(false, |required_ordering| { + node.plan + .equivalence_properties() + .ordering_satisfy_requirement(&required_ordering) + })) { - // Now we can mutate `new_node.children_nodes` safely - let child = sort_input.children_nodes.clone().swap_remove(0); - sort_input.plan = - Arc::new(SortPreservingMergeExec::new(ordering, child.plan)) as _; - sort_input.children_nodes[0].ordering_connection = true; + plan_with_order_breaking_variants(node) + } else { + Ok(node) } + }) + .collect::>()?; + sort_input.data = false; + + if is_repartition(plan) && plan.maintains_input_order()[0] { + // When a `RepartitionExec` preserves ordering, replace it with a + // non-sort-preserving variant: + let child = sort_input.children[0].plan.clone(); + let partitioning = plan.output_partitioning(); + sort_input.plan = Arc::new(RepartitionExec::try_new(child, partitioning)?) as _; + } else if is_sort_preserving_merge(plan) { + // Replace `SortPreservingMergeExec` with a `CoalescePartitionsExec`: + let child = sort_input.children[0].plan.clone(); + let coalesce = CoalescePartitionsExec::new(child); + sort_input.plan = Arc::new(coalesce) as _; + } else { + return sort_input.update_plan_from_children(); } + sort_input.children[0].data = false; Ok(sort_input) } /// The `replace_with_order_preserving_variants` optimizer sub-rule tries to /// remove `SortExec`s from the physical plan by replacing operators that do /// not preserve ordering with their order-preserving variants; i.e. 
by replacing -/// `RepartitionExec`s with `SortPreservingRepartitionExec`s or by replacing +/// ordinary `RepartitionExec`s with their sort-preserving variants or by replacing /// `CoalescePartitionsExec`s with `SortPreservingMergeExec`s. /// /// If this replacement is helpful for removing a `SortExec`, it updates the plan. /// Otherwise, it leaves the plan unchanged. /// -/// Note: this optimizer sub-rule will only produce `SortPreservingRepartitionExec`s -/// if the query is bounded or if the config option `bounded_order_preserving_variants` -/// is set to `true`. +/// NOTE: This optimizer sub-rule will only produce sort-preserving `RepartitionExec`s +/// if the query is bounded or if the config option `prefer_existing_sort` is +/// set to `true`. /// /// The algorithm flow is simply like this: /// 1. Visit nodes of the physical plan bottom-up and look for `SortExec` nodes. -/// 1_1. During the traversal, keep track of operators that maintain ordering -/// (or can maintain ordering when replaced by an order-preserving variant) until +/// During the traversal, keep track of operators that maintain ordering (or +/// can maintain ordering when replaced by an order-preserving variant) until /// a `SortExec` is found. /// 2. When a `SortExec` is found, update the child of the `SortExec` by replacing /// operators that do not preserve ordering in the tree with their order @@ -218,57 +216,56 @@ fn get_updated_plan( /// its input ordering with the output ordering it imposes. We do this because /// replacing operators that lose ordering with their order-preserving variants /// enables us to preserve the previously lost ordering at the input of `SortExec`. -/// 4. If the `SortExec` in question turns out to be unnecessary, remove it and use -/// updated plan. Otherwise, use the original plan. -/// 5. Continue the bottom-up traversal until another `SortExec` is seen, or the traversal -/// is complete. +/// 4. If the `SortExec` in question turns out to be unnecessary, remove it and +/// use updated plan. Otherwise, use the original plan. +/// 5. Continue the bottom-up traversal until another `SortExec` is seen, or the +/// traversal is complete. pub(crate) fn replace_with_order_preserving_variants( - requirements: OrderPreservationContext, - // A flag indicating that replacing `RepartitionExec`s with - // `SortPreservingRepartitionExec`s is desirable when it helps - // to remove a `SortExec` from the plan. If this flag is `false`, - // this replacement should only be made to fix the pipeline (streaming). + mut requirements: OrderPreservationContext, + // A flag indicating that replacing `RepartitionExec`s with sort-preserving + // variants is desirable when it helps to remove a `SortExec` from the plan. + // If this flag is `false`, this replacement should only be made to fix the + // pipeline (streaming). is_spr_better: bool, // A flag indicating that replacing `CoalescePartitionsExec`s with - // `SortPreservingMergeExec`s is desirable when it helps to remove - // a `SortExec` from the plan. If this flag is `false`, this replacement + // `SortPreservingMergeExec`s is desirable when it helps to remove a + // `SortExec` from the plan. If this flag is `false`, this replacement // should only be made to fix the pipeline (streaming). 
is_spm_better: bool, config: &ConfigOptions, ) -> Result> { - let mut requirements = requirements.update_children()?; - if !(is_sort(&requirements.plan) - && requirements.children_nodes[0].ordering_connection) - { + update_children(&mut requirements); + if !(is_sort(&requirements.plan) && requirements.children[0].data) { return Ok(Transformed::No(requirements)); } - // For unbounded cases, replace with the order-preserving variant in - // any case, as doing so helps fix the pipeline. - // Also do the replacement if opted-in via config options. + // For unbounded cases, we replace with the order-preserving variant in any + // case, as doing so helps fix the pipeline. Also replace if config allows. let use_order_preserving_variant = config.optimizer.prefer_existing_sort || unbounded_output(&requirements.plan); - let mut updated_sort_input = get_updated_plan( - requirements.children_nodes.clone().swap_remove(0), + // Create an alternate plan with order-preserving variants: + let mut alternate_plan = plan_with_order_preserving_variants( + requirements.children.swap_remove(0), is_spr_better || use_order_preserving_variant, is_spm_better || use_order_preserving_variant, )?; - // If this sort is unnecessary, we should remove it and update the plan: - if updated_sort_input + // If the alternate plan makes this sort unnecessary, accept the alternate: + if alternate_plan .plan .equivalence_properties() .ordering_satisfy(requirements.plan.output_ordering().unwrap_or(&[])) { - for child in updated_sort_input.children_nodes.iter_mut() { - child.ordering_connection = false; + for child in alternate_plan.children.iter_mut() { + child.data = false; } - Ok(Transformed::Yes(updated_sort_input)) + Ok(Transformed::Yes(alternate_plan)) } else { - for child in requirements.children_nodes.iter_mut() { - child.ordering_connection = false; - } + // The alternate plan does not help, use faster order-breaking variants: + alternate_plan = plan_with_order_breaking_variants(alternate_plan)?; + alternate_plan.data = false; + requirements.children = vec![alternate_plan]; Ok(Transformed::Yes(requirements)) } } @@ -280,6 +277,7 @@ mod tests { use crate::datasource::file_format::file_compression_type::FileCompressionType; use crate::datasource::listing::PartitionedFile; use crate::datasource::physical_plan::{CsvExec, FileScanConfig}; + use crate::physical_optimizer::test_utils::check_integrity; use crate::physical_plan::coalesce_batches::CoalesceBatchesExec; use crate::physical_plan::coalesce_partitions::CoalescePartitionsExec; use crate::physical_plan::filter::FilterExec; @@ -287,7 +285,9 @@ mod tests { use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; - use crate::physical_plan::{displayable, get_plan_string, Partitioning}; + use crate::physical_plan::{ + displayable, get_plan_string, ExecutionPlan, Partitioning, + }; use crate::prelude::SessionConfig; use crate::test::TestStreamPartition; @@ -394,8 +394,8 @@ mod tests { // Run the rule top-down let config = SessionConfig::new().with_prefer_existing_sort($PREFER_EXISTING_SORT); - let plan_with_pipeline_fixer = OrderPreservationContext::new(physical_plan); - let parallel = plan_with_pipeline_fixer.transform_up(&|plan_with_pipeline_fixer| replace_with_order_preserving_variants(plan_with_pipeline_fixer, false, false, config.options()))?; + let plan_with_pipeline_fixer = OrderPreservationContext::new_default(physical_plan); + let parallel = 
plan_with_pipeline_fixer.transform_up(&|plan_with_pipeline_fixer| replace_with_order_preserving_variants(plan_with_pipeline_fixer, false, false, config.options())).and_then(check_integrity)?; let optimized_physical_plan = parallel.plan; // Get string representation of the plan diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index f0a8c8cfd3cb..3413486c6b46 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use std::borrow::Cow; use std::sync::Arc; +use super::utils::add_sort_above; use crate::physical_optimizer::utils::{ - add_sort_above, is_limit, is_sort_preserving_merge, is_union, is_window, + is_limit, is_sort_preserving_merge, is_union, is_window, }; use crate::physical_plan::filter::FilterExec; use crate::physical_plan::joins::utils::calculate_join_output_ordering; @@ -27,9 +27,10 @@ use crate::physical_plan::joins::{HashJoinExec, SortMergeJoinExec}; use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; -use crate::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; +use crate::physical_plan::tree_node::PlanContext; +use crate::physical_plan::ExecutionPlan; -use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::tree_node::Transformed; use datafusion_common::{plan_err, DataFusionError, JoinSide, Result}; use datafusion_expr::JoinType; use datafusion_physical_expr::expressions::Column; @@ -39,60 +40,17 @@ use datafusion_physical_expr::{ /// This is a "data class" we use within the [`EnforceSorting`] rule to push /// down [`SortExec`] in the plan. In some cases, we can reduce the total -/// computational cost by pushing down `SortExec`s through some executors. +/// computational cost by pushing down `SortExec`s through some executors. The +/// object carries the parent required ordering as its data. /// /// [`EnforceSorting`]: crate::physical_optimizer::enforce_sorting::EnforceSorting -#[derive(Debug, Clone)] -pub(crate) struct SortPushDown { - /// Current plan - pub plan: Arc, - /// Parent required sort ordering - required_ordering: Option>, - children_nodes: Vec, -} +pub type SortPushDown = PlanContext>>; -impl SortPushDown { - /// Creates an empty tree with empty `required_ordering`'s. - pub fn new(plan: Arc) -> Self { - let children = plan.children(); - Self { - plan, - required_ordering: None, - children_nodes: children.into_iter().map(Self::new).collect(), - } - } - - /// Assigns the ordering requirement of the root node to the its children. - pub fn assign_initial_requirements(&mut self) { - let reqs = self.plan.required_input_ordering(); - for (child, requirement) in self.children_nodes.iter_mut().zip(reqs) { - child.required_ordering = requirement; - } - } -} - -impl TreeNode for SortPushDown { - fn children_nodes(&self) -> Vec> { - self.children_nodes.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children_nodes.is_empty() { - self.children_nodes = self - .children_nodes - .into_iter() - .map(transform) - .collect::>()?; - self.plan = with_new_children_if_necessary( - self.plan, - self.children_nodes.iter().map(|c| c.plan.clone()).collect(), - )? 
- .into(); - } - Ok(self) +/// Assigns the ordering requirement of the root node to the its children. +pub fn assign_initial_requirements(node: &mut SortPushDown) { + let reqs = node.plan.required_input_ordering(); + for (child, requirement) in node.children.iter_mut().zip(reqs) { + child.data = requirement; } } @@ -100,71 +58,61 @@ pub(crate) fn pushdown_sorts( mut requirements: SortPushDown, ) -> Result> { let plan = &requirements.plan; - let parent_required = requirements.required_ordering.as_deref().unwrap_or(&[]); + let parent_reqs = requirements.data.as_deref().unwrap_or(&[]); + let satisfy_parent = plan + .equivalence_properties() + .ordering_satisfy_requirement(parent_reqs); if let Some(sort_exec) = plan.as_any().downcast_ref::() { - if !plan - .equivalence_properties() - .ordering_satisfy_requirement(parent_required) - { - // If the current plan is a SortExec, modify it to satisfy parent requirements: - let mut new_plan = sort_exec.input().clone(); - add_sort_above(&mut new_plan, parent_required, sort_exec.fetch()); - requirements.plan = new_plan; - }; - - let required_ordering = requirements - .plan + let required_ordering = plan .output_ordering() .map(PhysicalSortRequirement::from_sort_exprs) .unwrap_or_default(); - // Since new_plan is a SortExec, we can safely get the 0th index. - let mut child = requirements.children_nodes.swap_remove(0); + + if !satisfy_parent { + // Make sure this `SortExec` satisfies parent requirements: + let fetch = sort_exec.fetch(); + let sort_reqs = requirements.data.unwrap_or_default(); + requirements = requirements.children.swap_remove(0); + requirements = add_sort_above(requirements, sort_reqs, fetch); + }; + + // We can safely get the 0th index as we are dealing with a `SortExec`. + let mut child = requirements.children.swap_remove(0); if let Some(adjusted) = pushdown_requirement_to_children(&child.plan, &required_ordering)? { - for (c, o) in child.children_nodes.iter_mut().zip(adjusted) { - c.required_ordering = o; + for (grand_child, order) in child.children.iter_mut().zip(adjusted) { + grand_child.data = order; } // Can push down requirements - child.required_ordering = None; - Ok(Transformed::Yes(child)) + child.data = None; + return Ok(Transformed::Yes(child)); } else { // Can not push down requirements - let mut empty_node = SortPushDown::new(requirements.plan); - empty_node.assign_initial_requirements(); - Ok(Transformed::Yes(empty_node)) + requirements.children = vec![child]; + assign_initial_requirements(&mut requirements); } - } else { - // Executors other than SortExec - if plan - .equivalence_properties() - .ordering_satisfy_requirement(parent_required) - { - // Satisfies parent requirements, immediately return. - let reqs = requirements.plan.required_input_ordering(); - for (child, order) in requirements.children_nodes.iter_mut().zip(reqs) { - child.required_ordering = order; - } - return Ok(Transformed::Yes(requirements)); + } else if satisfy_parent { + // For non-sort operators, immediately return if parent requirements are met: + let reqs = plan.required_input_ordering(); + for (child, order) in requirements.children.iter_mut().zip(reqs) { + child.data = order; } - // Can not satisfy the parent requirements, check whether the requirements can be pushed down: - if let Some(adjusted) = pushdown_requirement_to_children(plan, parent_required)? 
{ - for (c, o) in requirements.children_nodes.iter_mut().zip(adjusted) { - c.required_ordering = o; - } - requirements.required_ordering = None; - Ok(Transformed::Yes(requirements)) - } else { - // Can not push down requirements, add new SortExec: - let mut new_plan = requirements.plan; - add_sort_above(&mut new_plan, parent_required, None); - let mut new_empty = SortPushDown::new(new_plan); - new_empty.assign_initial_requirements(); - // Can not push down requirements - Ok(Transformed::Yes(new_empty)) + } else if let Some(adjusted) = pushdown_requirement_to_children(plan, parent_reqs)? { + // Can not satisfy the parent requirements, check whether we can push + // requirements down: + for (child, order) in requirements.children.iter_mut().zip(adjusted) { + child.data = order; } + requirements.data = None; + } else { + // Can not push down requirements, add new `SortExec`: + let sort_reqs = requirements.data.clone().unwrap_or_default(); + requirements = add_sort_above(requirements, sort_reqs, None); + assign_initial_requirements(&mut requirements); } + Ok(Transformed::Yes(requirements)) } fn pushdown_requirement_to_children( @@ -178,11 +126,7 @@ fn pushdown_requirement_to_children( let child_plan = plan.children().swap_remove(0); match determine_children_requirement(parent_required, request_child, child_plan) { RequirementsCompatibility::Satisfy => { - let req = if request_child.is_empty() { - None - } else { - Some(request_child.to_vec()) - }; + let req = (!request_child.is_empty()).then(|| request_child.to_vec()); Ok(Some(vec![req])) } RequirementsCompatibility::Compatible(adjusted) => Ok(Some(vec![adjusted])), @@ -191,20 +135,14 @@ fn pushdown_requirement_to_children( } else if is_union(plan) { // UnionExec does not have real sort requirements for its input. Here we change the adjusted_request_ordering to UnionExec's output ordering and // propagate the sort requirements down to correct the unnecessary descendant SortExec under the UnionExec - let req = if parent_required.is_empty() { - None - } else { - Some(parent_required.to_vec()) - }; + let req = (!parent_required.is_empty()).then(|| parent_required.to_vec()); Ok(Some(vec![req; plan.children().len()])) } else if let Some(smj) = plan.as_any().downcast_ref::() { // If the current plan is SortMergeJoinExec let left_columns_len = smj.left().schema().fields().len(); let parent_required_expr = PhysicalSortRequirement::to_sort_exprs(parent_required.iter().cloned()); - let expr_source_side = - expr_source_sides(&parent_required_expr, smj.join_type(), left_columns_len); - match expr_source_side { + match expr_source_side(&parent_required_expr, smj.join_type(), left_columns_len) { Some(JoinSide::Left) => try_pushdown_requirements_to_join( smj, parent_required, @@ -256,11 +194,7 @@ fn pushdown_requirement_to_children( } else { // Can push-down through SortPreservingMergeExec, because parent requirement is finer // than SortPreservingMergeExec output ordering. 
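Taken together, `SortPushDown` is an alias for `PlanContext<Option<Vec<PhysicalSortRequirement>>>`: the payload carries the ordering a parent requires from that node, or `None` when there is no requirement. A sketch of how the pass is typically driven (assumed driver code mirroring how `EnforceSorting` invokes this sub-rule; not part of this patch):

    use std::sync::Arc;
    use datafusion_common::tree_node::TreeNode;
    use datafusion_common::Result;
    use crate::physical_plan::ExecutionPlan;

    /// Sketch: run the sort push-down pass over `plan` using the new
    /// payload-carrying context.
    fn run_sort_pushdown(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
        // Payloads start as `None`: no requirement imposed by a parent yet.
        let mut context = SortPushDown::new_default(plan);
        // Seed the root's children with the root's own input ordering requirements.
        assign_initial_requirements(&mut context);
        // Walk top-down; at each node `pushdown_sorts` forwards the requirement,
        // absorbs it, or re-inserts a `SortExec` when it cannot be pushed down.
        let adjusted = context.transform_down(&pushdown_sorts)?;
        Ok(adjusted.plan)
    }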
- let req = if parent_required.is_empty() { - None - } else { - Some(parent_required.to_vec()) - }; + let req = (!parent_required.is_empty()).then(|| parent_required.to_vec()); Ok(Some(vec![req])) } } else { @@ -268,11 +202,8 @@ fn pushdown_requirement_to_children( maintains_input_order .into_iter() .map(|flag| { - if flag && !parent_required.is_empty() { - Some(parent_required.to_vec()) - } else { - None - } + (flag && !parent_required.is_empty()) + .then(|| parent_required.to_vec()) }) .collect(), )) @@ -350,7 +281,7 @@ fn try_pushdown_requirements_to_join( })) } -fn expr_source_sides( +fn expr_source_side( required_exprs: &[PhysicalSortExpr], join_type: JoinType, left_columns_len: usize, @@ -402,18 +333,14 @@ fn shift_right_required( parent_required: LexRequirementRef, left_columns_len: usize, ) -> Result> { - let new_right_required: Vec = parent_required + let new_right_required = parent_required .iter() .filter_map(|r| { let col = r.expr.as_any().downcast_ref::()?; - - if col.index() < left_columns_len { - return None; - } - - let new_col = - Arc::new(Column::new(col.name(), col.index() - left_columns_len)); - Some(r.clone().with_expr(new_col)) + col.index().checked_sub(left_columns_len).map(|offset| { + r.clone() + .with_expr(Arc::new(Column::new(col.name(), offset))) + }) }) .collect::>(); if new_right_required.len() == parent_required.len() { diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs index 68ac9598a335..5de6cff0b4fa 100644 --- a/datafusion/core/src/physical_optimizer/test_utils.rs +++ b/datafusion/core/src/physical_optimizer/test_utils.rs @@ -21,6 +21,7 @@ use std::sync::Arc; use crate::datasource::listing::PartitionedFile; use crate::datasource::physical_plan::{FileScanConfig, ParquetExec}; +use crate::datasource::stream::{StreamConfig, StreamTable}; use crate::error::Result; use crate::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use crate::physical_plan::coalesce_batches::CoalesceBatchesExec; @@ -39,13 +40,15 @@ use crate::physical_plan::{ExecutionPlan, InputOrderMode, Partitioning}; use crate::prelude::{CsvReadOptions, SessionContext}; use arrow_schema::{Schema, SchemaRef, SortOptions}; +use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{JoinType, Statistics}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunctionDefinition}; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion_physical_plan::displayable; +use datafusion_physical_plan::tree_node::PlanContext; -use crate::datasource::stream::{StreamConfig, StreamTable}; use async_trait::async_trait; async fn register_current_csv( @@ -361,3 +364,25 @@ pub fn sort_exec( let sort_exprs = sort_exprs.into_iter().collect(); Arc::new(SortExec::new(sort_exprs, input)) } + +/// A [`PlanContext`] object is susceptible to being left in an inconsistent state after +/// untested mutable operations. It is crucial that there be no discrepancies between a plan +/// associated with the root node and the plan generated after traversing all nodes +/// within the [`PlanContext`] tree. In addition to verifying the plans resulting from optimizer +/// rules, it is essential to ensure that the overall tree structure corresponds with the plans +/// contained within the node contexts. 
+/// TODO: Once [`ExecutionPlan`] implements [`PartialEq`], string comparisons should be +/// replaced with direct plan equality checks. +pub fn check_integrity(context: PlanContext) -> Result> { + context.transform_up(&|node| { + let children_plans = node.plan.children(); + assert_eq!(node.children.len(), children_plans.len()); + for (child_plan, child_node) in children_plans.iter().zip(node.children.iter()) { + assert_eq!( + displayable(child_plan.as_ref()).one_line().to_string(), + displayable(child_node.plan.as_ref()).one_line().to_string() + ); + } + Ok(Transformed::No(node)) + }) +} diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index f8063e969422..4f4b17345ef8 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -27,30 +27,23 @@ use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::ExecutionPlan; -use datafusion_physical_expr::{LexRequirementRef, PhysicalSortRequirement}; +use datafusion_physical_expr::{LexRequirement, PhysicalSortRequirement}; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; +use datafusion_physical_plan::tree_node::PlanContext; /// This utility function adds a `SortExec` above an operator according to the /// given ordering requirements while preserving the original partitioning. -pub fn add_sort_above( - node: &mut Arc, - sort_requirement: LexRequirementRef, +pub fn add_sort_above( + node: PlanContext, + sort_requirements: LexRequirement, fetch: Option, -) { - // If the ordering requirement is already satisfied, do not add a sort. - if !node - .equivalence_properties() - .ordering_satisfy_requirement(sort_requirement) - { - let sort_expr = PhysicalSortRequirement::to_sort_exprs(sort_requirement.to_vec()); - let new_sort = SortExec::new(sort_expr, node.clone()).with_fetch(fetch); - - *node = Arc::new(if node.output_partitioning().partition_count() > 1 { - new_sort.with_preserve_partitioning(true) - } else { - new_sort - }) as _ +) -> PlanContext { + let sort_expr = PhysicalSortRequirement::to_sort_exprs(sort_requirements); + let mut new_sort = SortExec::new(sort_expr, node.plan.clone()).with_fetch(fetch); + if node.plan.output_partitioning().partition_count() > 1 { + new_sort = new_sort.with_preserve_partitioning(true); } + PlanContext::new(Arc::new(new_sort), T::default(), vec![node]) } /// Checks whether the given operator is a limit; diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs index 56388be58b8a..05464c96d05e 100644 --- a/datafusion/expr/src/tree_node/expr.rs +++ b/datafusion/expr/src/tree_node/expr.rs @@ -23,15 +23,17 @@ use crate::expr::{ ScalarFunction, ScalarFunctionDefinition, Sort, TryCast, WindowFunction, }; use crate::{Expr, GetFieldAccess}; -use std::borrow::Cow; -use datafusion_common::tree_node::TreeNode; +use datafusion_common::tree_node::{TreeNode, VisitRecursion}; use datafusion_common::{internal_err, DataFusionError, Result}; impl TreeNode for Expr { - fn children_nodes(&self) -> Vec> { - match self { - Expr::Alias(Alias { expr, .. }) + fn apply_children Result>( + &self, + op: &mut F, + ) -> Result { + let children = match self { + Expr::Alias(Alias{expr, .. }) | Expr::Not(expr) | Expr::IsNotNull(expr) | Expr::IsTrue(expr) @@ -45,79 +47,67 @@ impl TreeNode for Expr { | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. 
}) | Expr::Sort(Sort { expr, .. }) - | Expr::InSubquery(InSubquery { expr, .. }) => vec![Cow::Borrowed(expr)], + | Expr::InSubquery(InSubquery{ expr, .. }) => vec![expr.as_ref()], Expr::GetIndexedField(GetIndexedField { expr, field }) => { - let expr = Cow::Borrowed(expr.as_ref()); + let expr = expr.as_ref(); match field { - GetFieldAccess::ListIndex { key } => { - vec![Cow::Borrowed(key.as_ref()), expr] - } - GetFieldAccess::ListRange { start, stop } => { - vec![Cow::Borrowed(start), Cow::Borrowed(stop), expr] - } - GetFieldAccess::NamedStructField { name: _name } => { - vec![expr] + GetFieldAccess::ListIndex {key} => vec![key.as_ref(), expr], + GetFieldAccess::ListRange {start, stop} => { + vec![start.as_ref(), stop.as_ref(), expr] } + GetFieldAccess::NamedStructField { .. } => vec![expr], } } Expr::GroupingSet(GroupingSet::Rollup(exprs)) - | Expr::GroupingSet(GroupingSet::Cube(exprs)) => exprs.iter().map(Cow::Borrowed).collect(), - Expr::ScalarFunction(ScalarFunction { args, .. }) => args.iter().map(Cow::Borrowed).collect(), + | Expr::GroupingSet(GroupingSet::Cube(exprs)) => exprs.iter().collect(), + Expr::ScalarFunction (ScalarFunction{ args, .. } ) => { + args.iter().collect() + } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { - lists_of_exprs.iter().flatten().map(Cow::Borrowed).collect() + lists_of_exprs.iter().flatten().collect() } Expr::Column(_) // Treat OuterReferenceColumn as a leaf expression | Expr::OuterReferenceColumn(_, _) | Expr::ScalarVariable(_, _) | Expr::Literal(_) - | Expr::Exists { .. } + | Expr::Exists {..} | Expr::ScalarSubquery(_) - | Expr::Wildcard { .. } - | Expr::Placeholder(_) => vec![], + | Expr::Wildcard {..} + | Expr::Placeholder (_) => vec![], Expr::BinaryExpr(BinaryExpr { left, right, .. }) => { - vec![Cow::Borrowed(left), Cow::Borrowed(right)] + vec![left.as_ref(), right.as_ref()] } Expr::Like(Like { expr, pattern, .. }) | Expr::SimilarTo(Like { expr, pattern, .. }) => { - vec![Cow::Borrowed(expr), Cow::Borrowed(pattern)] + vec![expr.as_ref(), pattern.as_ref()] } Expr::Between(Between { expr, low, high, .. - }) => vec![ - Cow::Borrowed(expr), - Cow::Borrowed(low), - Cow::Borrowed(high), - ], + }) => vec![expr.as_ref(), low.as_ref(), high.as_ref()], Expr::Case(case) => { let mut expr_vec = vec![]; if let Some(expr) = case.expr.as_ref() { - expr_vec.push(Cow::Borrowed(expr.as_ref())); + expr_vec.push(expr.as_ref()); }; for (when, then) in case.when_then_expr.iter() { - expr_vec.push(Cow::Borrowed(when)); - expr_vec.push(Cow::Borrowed(then)); + expr_vec.push(when.as_ref()); + expr_vec.push(then.as_ref()); } if let Some(else_expr) = case.else_expr.as_ref() { - expr_vec.push(Cow::Borrowed(else_expr)); + expr_vec.push(else_expr.as_ref()); } expr_vec } - Expr::AggregateFunction(AggregateFunction { - args, - filter, - order_by, - .. - }) => { - let mut expr_vec: Vec<_> = args.iter().map(Cow::Borrowed).collect(); - + Expr::AggregateFunction(AggregateFunction { args, filter, order_by, .. }) + => { + let mut expr_vec = args.iter().collect::>(); if let Some(f) = filter { - expr_vec.push(Cow::Borrowed(f)); + expr_vec.push(f.as_ref()); } - if let Some(o) = order_by { - expr_vec.extend(o.iter().map(Cow::Borrowed).collect::>()); + if let Some(order_by) = order_by { + expr_vec.extend(order_by); } - expr_vec } Expr::WindowFunction(WindowFunction { @@ -126,34 +116,47 @@ impl TreeNode for Expr { order_by, .. 
}) => { - let mut expr_vec: Vec<_> = args.iter().map(Cow::Borrowed).collect(); - expr_vec.extend(partition_by.iter().map(Cow::Borrowed).collect::>()); - expr_vec.extend(order_by.iter().map(Cow::Borrowed).collect::>()); + let mut expr_vec = args.iter().collect::>(); + expr_vec.extend(partition_by); + expr_vec.extend(order_by); expr_vec } Expr::InList(InList { expr, list, .. }) => { - let mut expr_vec = vec![Cow::Borrowed(expr.as_ref())]; - expr_vec.extend(list.iter().map(Cow::Borrowed).collect::>()); + let mut expr_vec = vec![expr.as_ref()]; + expr_vec.extend(list); expr_vec } + }; + + for child in children { + match op(child)? { + VisitRecursion::Continue => {} + VisitRecursion::Skip => return Ok(VisitRecursion::Continue), + VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + } } - } - fn map_children(self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - let mut transform = transform; + Ok(VisitRecursion::Continue) + } + fn map_children Result>( + self, + mut transform: F, + ) -> Result { Ok(match self { + Expr::Column(_) + | Expr::Wildcard { .. } + | Expr::Placeholder(Placeholder { .. }) + | Expr::OuterReferenceColumn(_, _) + | Expr::Exists { .. } + | Expr::ScalarSubquery(_) + | Expr::ScalarVariable(_, _) + | Expr::Literal(_) => self, Expr::Alias(Alias { expr, relation, name, }) => Expr::Alias(Alias::new(transform(*expr)?, relation, name)), - Expr::Column(_) => self, - Expr::OuterReferenceColumn(_, _) => self, - Expr::Exists { .. } => self, Expr::InSubquery(InSubquery { expr, subquery, @@ -163,9 +166,6 @@ impl TreeNode for Expr { subquery, negated, )), - Expr::ScalarSubquery(_) => self, - Expr::ScalarVariable(ty, names) => Expr::ScalarVariable(ty, names), - Expr::Literal(value) => Expr::Literal(value), Expr::BinaryExpr(BinaryExpr { left, op, right }) => { Expr::BinaryExpr(BinaryExpr::new( transform_boxed(left, &mut transform)?, @@ -244,7 +244,6 @@ impl TreeNode for Expr { )) }) .collect::>>()?; - let else_expr = transform_option_box(case.else_expr, &mut transform)?; Expr::Case(Case::new(expr, when_then_expr, else_expr)) @@ -273,9 +272,7 @@ impl TreeNode for Expr { ScalarFunction::new_udf(fun, transform_vec(args, &mut transform)?), ), ScalarFunctionDefinition::Name(_) => { - return internal_err!( - "Function `Expr` with name should be resolved." - ); + return internal_err!("Function `Expr` with name should be resolved.") } }, Expr::WindowFunction(WindowFunction { @@ -308,11 +305,9 @@ impl TreeNode for Expr { )) } AggregateFunctionDefinition::UDF(fun) => { - let order_by = if let Some(order_by) = order_by { - Some(transform_vec(order_by, &mut transform)?) - } else { - None - }; + let order_by = order_by + .map(|order_by| transform_vec(order_by, &mut transform)) + .transpose()?; Expr::AggregateFunction(AggregateFunction::new_udf( fun, transform_vec(args, &mut transform)?, @@ -322,9 +317,7 @@ impl TreeNode for Expr { )) } AggregateFunctionDefinition::Name(_) => { - return internal_err!( - "Function `Expr` with name should be resolved." 
- ); + return internal_err!("Function `Expr` with name should be resolved.") } }, Expr::GroupingSet(grouping_set) => match grouping_set { @@ -337,13 +330,12 @@ impl TreeNode for Expr { GroupingSet::GroupingSets(lists_of_exprs) => { Expr::GroupingSet(GroupingSet::GroupingSets( lists_of_exprs - .iter() - .map(|exprs| transform_vec(exprs.clone(), &mut transform)) + .into_iter() + .map(|exprs| transform_vec(exprs, &mut transform)) .collect::>>()?, )) } }, - Expr::InList(InList { expr, list, @@ -353,62 +345,47 @@ impl TreeNode for Expr { transform_vec(list, &mut transform)?, negated, )), - Expr::Wildcard { qualifier } => Expr::Wildcard { qualifier }, Expr::GetIndexedField(GetIndexedField { expr, field }) => { Expr::GetIndexedField(GetIndexedField::new( transform_boxed(expr, &mut transform)?, field, )) } - Expr::Placeholder(Placeholder { id, data_type }) => { - Expr::Placeholder(Placeholder { id, data_type }) - } }) } } -fn transform_boxed(boxed_expr: Box, transform: &mut F) -> Result> -where - F: FnMut(Expr) -> Result, -{ - // TODO: - // It might be possible to avoid an allocation (the Box::new) below by reusing the box. - let expr: Expr = *boxed_expr; - let rewritten_expr = transform(expr)?; - Ok(Box::new(rewritten_expr)) +fn transform_boxed Result>( + boxed_expr: Box, + transform: &mut F, +) -> Result> { + // TODO: It might be possible to avoid an allocation (the Box::new) below by reusing the box. + transform(*boxed_expr).map(Box::new) } -fn transform_option_box( +fn transform_option_box Result>( option_box: Option>, transform: &mut F, -) -> Result>> -where - F: FnMut(Expr) -> Result, -{ +) -> Result>> { option_box .map(|expr| transform_boxed(expr, transform)) .transpose() } /// &mut transform a Option<`Vec` of `Expr`s> -fn transform_option_vec( +fn transform_option_vec Result>( option_box: Option>, transform: &mut F, -) -> Result>> -where - F: FnMut(Expr) -> Result, -{ - Ok(if let Some(exprs) = option_box { - Some(transform_vec(exprs, transform)?) - } else { - None - }) +) -> Result>> { + option_box + .map(|exprs| transform_vec(exprs, transform)) + .transpose() } /// &mut transform a `Vec` of `Expr`s -fn transform_vec(v: Vec, transform: &mut F) -> Result> -where - F: FnMut(Expr) -> Result, -{ +fn transform_vec Result>( + v: Vec, + transform: &mut F, +) -> Result> { v.into_iter().map(transform).collect() } diff --git a/datafusion/expr/src/tree_node/plan.rs b/datafusion/expr/src/tree_node/plan.rs index 208a8b57d7b0..589bb917a953 100644 --- a/datafusion/expr/src/tree_node/plan.rs +++ b/datafusion/expr/src/tree_node/plan.rs @@ -18,33 +18,19 @@ //! Tree node implementation for logical plan use crate::LogicalPlan; -use datafusion_common::tree_node::{TreeNodeVisitor, VisitRecursion}; -use datafusion_common::{tree_node::TreeNode, Result}; -use std::borrow::Cow; -impl TreeNode for LogicalPlan { - fn children_nodes(&self) -> Vec> { - self.inputs().into_iter().map(Cow::Borrowed).collect() - } +use datafusion_common::tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}; +use datafusion_common::{handle_tree_recursion, Result}; - fn apply(&self, op: &mut F) -> Result - where - F: FnMut(&Self) -> Result, - { - // Note, - // +impl TreeNode for LogicalPlan { + fn apply Result>( + &self, + op: &mut F, + ) -> Result { // Compared to the default implementation, we need to invoke // [`Self::apply_subqueries`] before visiting its children - match op(self)? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. 
And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - }; - + handle_tree_recursion!(op(self)?); self.apply_subqueries(op)?; - self.apply_children(&mut |node| node.apply(op)) } @@ -74,26 +60,20 @@ impl TreeNode for LogicalPlan { ) -> Result { // Compared to the default implementation, we need to invoke // [`Self::visit_subqueries`] before visiting its children - - match visitor.pre_visit(self)? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), - }; - + handle_tree_recursion!(visitor.pre_visit(self)?); self.visit_subqueries(visitor)?; + handle_tree_recursion!(self.apply_children(&mut |node| node.visit(visitor))?); + visitor.post_visit(self) + } - match self.apply_children(&mut |node| node.visit(visitor))? { - VisitRecursion::Continue => {} - // If the recursion should skip, do not apply to its children. And let the recursion continue - VisitRecursion::Skip => return Ok(VisitRecursion::Continue), - // If the recursion should stop, do not apply to its children - VisitRecursion::Stop => return Ok(VisitRecursion::Stop), + fn apply_children Result>( + &self, + op: &mut F, + ) -> Result { + for child in self.inputs() { + handle_tree_recursion!(op(child)?) } - - visitor.post_visit(self) + Ok(VisitRecursion::Continue) } fn map_children(self, transform: F) -> Result diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 31c1cf61193a..94650c0a6226 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -15,28 +15,27 @@ // specific language governing permissions and limitations // under the License. -use crate::expressions::Column; -use arrow_schema::SchemaRef; -use datafusion_common::{JoinSide, JoinType}; -use indexmap::IndexSet; -use itertools::Itertools; use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; use std::sync::Arc; +use super::ordering::collapse_lex_ordering; use crate::equivalence::{ collapse_lex_req, EquivalenceGroup, OrderingEquivalenceClass, ProjectionMapping, }; - -use crate::expressions::Literal; +use crate::expressions::{Column, Literal}; use crate::sort_properties::{ExprOrdering, SortProperties}; use crate::{ physical_exprs_contains, LexOrdering, LexOrderingRef, LexRequirement, LexRequirementRef, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, }; + +use arrow_schema::SchemaRef; use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::{JoinSide, JoinType}; -use super::ordering::collapse_lex_ordering; +use indexmap::IndexSet; +use itertools::Itertools; /// A `EquivalenceProperties` object stores useful information related to a schema. /// Currently, it keeps track of: @@ -314,8 +313,8 @@ impl EquivalenceProperties { /// Returns `true` if the specified ordering is satisfied, `false` otherwise. fn ordering_satisfy_single(&self, req: &PhysicalSortRequirement) -> bool { let expr_ordering = self.get_expr_ordering(req.expr.clone()); - let ExprOrdering { expr, state, .. } = expr_ordering; - match state { + let ExprOrdering { expr, data, .. 
} = expr_ordering; + match data { SortProperties::Ordered(options) => { let sort_expr = PhysicalSortExpr { expr, options }; sort_expr.satisfy(req, self.schema()) @@ -708,9 +707,9 @@ impl EquivalenceProperties { let ordered_exprs = search_indices .iter() .flat_map(|&idx| { - let ExprOrdering { expr, state, .. } = + let ExprOrdering { expr, data, .. } = eq_properties.get_expr_ordering(exprs[idx].clone()); - if let SortProperties::Ordered(options) = state { + if let SortProperties::Ordered(options) = data { Some((PhysicalSortExpr { expr, options }, idx)) } else { None @@ -776,7 +775,7 @@ impl EquivalenceProperties { /// Returns an `ExprOrdering` object containing the ordering information for /// the given expression. pub fn get_expr_ordering(&self, expr: Arc) -> ExprOrdering { - ExprOrdering::new(expr.clone()) + ExprOrdering::new_default(expr.clone()) .transform_up(&|expr| Ok(update_ordering(expr, self))) // Guaranteed to always return `Ok`. .unwrap() @@ -802,18 +801,19 @@ fn update_ordering( // We have a Column, which is one of the two possible leaf node types: let normalized_expr = eq_properties.eq_group.normalize_expr(node.expr.clone()); if eq_properties.is_expr_constant(&normalized_expr) { - node.state = SortProperties::Singleton; + node.data = SortProperties::Singleton; } else if let Some(options) = eq_properties .normalized_oeq_class() .get_options(&normalized_expr) { - node.state = SortProperties::Ordered(options); + node.data = SortProperties::Ordered(options); } else if !node.expr.children().is_empty() { // We have an intermediate (non-leaf) node, account for its children: - node.state = node.expr.get_ordering(&node.children_state()); + let children_orderings = node.children.iter().map(|c| c.data).collect_vec(); + node.data = node.expr.get_ordering(&children_orderings); } else if node.expr.as_any().is::() { // We have a Literal, which is the other possible leaf node type: - node.state = node.expr.get_ordering(&[]); + node.data = node.expr.get_ordering(&[]); } else { return Transformed::No(node); } @@ -1683,9 +1683,9 @@ mod tests { let expr_ordering = eq_properties.get_expr_ordering(expr.clone()); let err_msg = format!( "expr:{:?}, expected: {:?}, actual: {:?}, leading_orderings: {leading_orderings:?}", - expr, expected, expr_ordering.state + expr, expected, expr_ordering.data ); - assert_eq!(expr_ordering.state, expected, "{}", err_msg); + assert_eq!(expr_ordering.data, expected, "{}", err_msg); } Ok(()) diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index 5064ad8d5c48..b2403dadf05a 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -180,7 +180,7 @@ impl ExprIntervalGraphNode { /// object. Literals are created with definite, singleton intervals while /// any other expression starts with an indefinite interval ([-∞, ∞]). pub fn make_node(node: &ExprTreeNode, schema: &Schema) -> Result { - let expr = node.expression().clone(); + let expr = node.expr.clone(); if let Some(literal) = expr.as_any().downcast_ref::() { let value = literal.value(); Interval::try_new(value.clone(), value.clone()) diff --git a/datafusion/physical-expr/src/sort_properties.rs b/datafusion/physical-expr/src/sort_properties.rs index 0205f85dced4..4df29ced2f01 100644 --- a/datafusion/physical-expr/src/sort_properties.rs +++ b/datafusion/physical-expr/src/sort_properties.rs @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. 
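With the rename from `state` to `data`, callers read the computed `SortProperties` from the generic payload slot of `ExprOrdering` (an alias for `ExprContext<SortProperties>`). A small consumer sketch (the import paths and function name are assumptions):

    use std::sync::Arc;
    use datafusion_physical_expr::sort_properties::SortProperties;
    use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr};

    /// Sketch: is `expr` known to be sorted (or constant) under `eq_properties`?
    fn expr_is_ordered(
        eq_properties: &EquivalenceProperties,
        expr: Arc<dyn PhysicalExpr>,
    ) -> bool {
        matches!(
            eq_properties.get_expr_ordering(expr).data,
            SortProperties::Ordered(_) | SortProperties::Singleton
        )
    }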
-use std::borrow::Cow; -use std::{ops::Neg, sync::Arc}; +use std::ops::Neg; -use arrow_schema::SortOptions; +use crate::tree_node::ExprContext; -use crate::PhysicalExpr; -use datafusion_common::tree_node::TreeNode; -use datafusion_common::Result; +use arrow_schema::SortOptions; -/// To propagate [`SortOptions`] across the [`PhysicalExpr`], it is insufficient +/// To propagate [`SortOptions`] across the `PhysicalExpr`, it is insufficient /// to simply use `Option`: There must be a differentiation between /// unordered columns and literal values, since literals may not break the ordering /// when they are used as a child of some binary expression when the other child has @@ -139,56 +136,13 @@ impl Neg for SortProperties { } /// The `ExprOrdering` struct is designed to aid in the determination of ordering (represented -/// by [`SortProperties`]) for a given [`PhysicalExpr`]. When analyzing the orderings -/// of a [`PhysicalExpr`], the process begins by assigning the ordering of its leaf nodes. +/// by [`SortProperties`]) for a given `PhysicalExpr`. When analyzing the orderings +/// of a `PhysicalExpr`, the process begins by assigning the ordering of its leaf nodes. /// By propagating these leaf node orderings upwards in the expression tree, the overall -/// ordering of the entire [`PhysicalExpr`] can be derived. +/// ordering of the entire `PhysicalExpr` can be derived. /// -/// This struct holds the necessary state information for each expression in the [`PhysicalExpr`]. -/// It encapsulates the orderings (`state`) associated with the expression (`expr`), and -/// orderings of the children expressions (`children_states`). The [`ExprOrdering`] of a parent +/// This struct holds the necessary state information for each expression in the `PhysicalExpr`. +/// It encapsulates the orderings (`data`) associated with the expression (`expr`), and +/// orderings of the children expressions (`children`). The [`ExprOrdering`] of a parent /// expression is determined based on the [`ExprOrdering`] states of its children expressions. -#[derive(Debug, Clone)] -pub struct ExprOrdering { - pub expr: Arc, - pub state: SortProperties, - pub children: Vec, -} - -impl ExprOrdering { - /// Creates a new [`ExprOrdering`] with [`SortProperties::Unordered`] states - /// for `expr` and its children. - pub fn new(expr: Arc) -> Self { - let children = expr.children(); - Self { - expr, - state: Default::default(), - children: children.into_iter().map(Self::new).collect(), - } - } - - /// Get a reference to each child state. - pub fn children_state(&self) -> Vec { - self.children.iter().map(|c| c.state).collect() - } -} - -impl TreeNode for ExprOrdering { - fn children_nodes(&self) -> Vec> { - self.children.iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - if !self.children.is_empty() { - self.children = self - .children - .into_iter() - .map(transform) - .collect::>()?; - } - Ok(self) - } -} +pub type ExprOrdering = ExprContext; diff --git a/datafusion/physical-expr/src/tree_node.rs b/datafusion/physical-expr/src/tree_node.rs index 742846cf56bb..42dc6673af6a 100644 --- a/datafusion/physical-expr/src/tree_node.rs +++ b/datafusion/physical-expr/src/tree_node.rs @@ -16,12 +16,15 @@ // under the License. //! This module provides common traits for visiting or rewriting tree nodes easily. 
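The `ExprContext<T>` node introduced below pairs each `PhysicalExpr` with an arbitrary payload `T`, and its `ConcreteTreeNode` implementation makes the usual `TreeNode` traversals available. A sketch of a custom bottom-up analysis using the payload slot (illustrative; written in crate-internal style, and it assumes the blanket `TreeNode` implementation for `ConcreteTreeNode` types):

    use std::sync::Arc;
    use crate::tree_node::ExprContext;
    use crate::PhysicalExpr;
    use datafusion_common::tree_node::{Transformed, TreeNode};
    use datafusion_common::Result;

    /// Sketch: annotate every node of an expression tree with its subtree size.
    fn annotate_sizes(expr: Arc<dyn PhysicalExpr>) -> Result<ExprContext<usize>> {
        ExprContext::<usize>::new_default(expr).transform_up(&|mut node| {
            // Children are visited first, so their payloads are already set.
            node.data = 1 + node.children.iter().map(|c| c.data).sum::<usize>();
            Ok(Transformed::Yes(node))
        })
    }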
-use crate::physical_expr::with_new_children_if_necessary; -use crate::PhysicalExpr; -use datafusion_common::tree_node::DynTreeNode; -use datafusion_common::Result; + +use std::fmt::{self, Display, Formatter}; use std::sync::Arc; +use crate::physical_expr::{with_new_children_if_necessary, PhysicalExpr}; + +use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode}; +use datafusion_common::Result; + impl DynTreeNode for dyn PhysicalExpr { fn arc_children(&self) -> Vec> { self.children() @@ -35,3 +38,63 @@ impl DynTreeNode for dyn PhysicalExpr { with_new_children_if_necessary(arc_self, new_children) } } + +/// A node object encapsulating a [`PhysicalExpr`] node with a payload. Since there are +/// two ways to access child plans—directly from the plan and through child nodes—it's +/// recommended to perform mutable operations via [`Self::update_expr_from_children`]. +#[derive(Debug)] +pub struct ExprContext { + /// The physical expression associated with this context. + pub expr: Arc, + /// Custom data payload of the node. + pub data: T, + /// Child contexts of this node. + pub children: Vec, +} + +impl ExprContext { + pub fn new(expr: Arc, data: T, children: Vec) -> Self { + Self { + expr, + data, + children, + } + } + + pub fn update_expr_from_children(mut self) -> Result { + let children_expr = self.children.iter().map(|c| c.expr.clone()).collect(); + self.expr = with_new_children_if_necessary(self.expr, children_expr)?; + Ok(self) + } +} + +impl ExprContext { + pub fn new_default(plan: Arc) -> Self { + let children = plan.children().into_iter().map(Self::new_default).collect(); + Self::new(plan, Default::default(), children) + } +} + +impl Display for ExprContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "expr: {:?}", self.expr)?; + write!(f, "data:{}", self.data)?; + write!(f, "") + } +} + +impl ConcreteTreeNode for ExprContext { + fn children(&self) -> Vec<&Self> { + self.children.iter().collect() + } + + fn take_children(mut self) -> (Self, Vec) { + let children = std::mem::take(&mut self.children); + (self, children) + } + + fn with_new_children(mut self, children: Vec) -> Result { + self.children = children; + self.update_expr_from_children() + } +} diff --git a/datafusion/physical-expr/src/utils/mod.rs b/datafusion/physical-expr/src/utils/mod.rs index 64a62dc7820d..e14ff2692146 100644 --- a/datafusion/physical-expr/src/utils/mod.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -18,11 +18,12 @@ mod guarantee; pub use guarantee::{Guarantee, LiteralGuarantee}; -use std::borrow::{Borrow, Cow}; +use std::borrow::Borrow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use crate::expressions::{BinaryExpr, Column}; +use crate::tree_node::ExprContext; use crate::{PhysicalExpr, PhysicalSortExpr}; use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData}; @@ -127,49 +128,7 @@ pub fn get_indices_of_exprs_strict>>( .collect() } -#[derive(Clone, Debug)] -pub struct ExprTreeNode { - expr: Arc, - data: Option, - child_nodes: Vec>, -} - -impl ExprTreeNode { - pub fn new(expr: Arc) -> Self { - let children = expr.children(); - ExprTreeNode { - expr, - data: None, - child_nodes: children.into_iter().map(Self::new).collect_vec(), - } - } - - pub fn expression(&self) -> &Arc { - &self.expr - } - - pub fn children(&self) -> &[ExprTreeNode] { - &self.child_nodes - } -} - -impl TreeNode for ExprTreeNode { - fn children_nodes(&self) -> Vec> { - self.children().iter().map(Cow::Borrowed).collect() - } - - fn map_children(mut self, 
transform: F) -> Result - where - F: FnMut(Self) -> Result, - { - self.child_nodes = self - .child_nodes - .into_iter() - .map(transform) - .collect::>>()?; - Ok(self) - } -} +pub type ExprTreeNode = ExprContext>; /// This struct facilitates the [TreeNodeRewriter] mechanism to convert a /// [PhysicalExpr] tree into a DAEG (i.e. an expression DAG) by collecting @@ -207,7 +166,7 @@ impl<'a, T, F: Fn(&ExprTreeNode) -> Result> TreeNodeRewriter // of visited expressions and return the newly created node index. None => { let node_idx = self.graph.add_node((self.constructor)(&node)?); - for expr_node in node.child_nodes.iter() { + for expr_node in node.children.iter() { self.graph.add_edge(node_idx, expr_node.data.unwrap(), 0); } self.visited_plans.push((expr.clone(), node_idx)); @@ -230,7 +189,7 @@ where F: Fn(&ExprTreeNode) -> Result, { // Create a new expression tree node from the input expression. - let init = ExprTreeNode::new(expr); + let init = ExprTreeNode::new_default(expr); // Create a new `PhysicalExprDAEGBuilder` instance. let mut builder = PhysicalExprDAEGBuilder { graph: StableGraph::::new(), @@ -388,7 +347,7 @@ mod tests { } fn make_dummy_node(node: &ExprTreeNode) -> Result { - let expr = node.expression().clone(); + let expr = node.expr.clone(); let dummy_property = if expr.as_any().is::() { "Binary" } else if expr.as_any().is::() { diff --git a/datafusion/physical-plan/src/tree_node.rs b/datafusion/physical-plan/src/tree_node.rs index bce906a00c4d..b8a5f95c5325 100644 --- a/datafusion/physical-plan/src/tree_node.rs +++ b/datafusion/physical-plan/src/tree_node.rs @@ -17,11 +17,14 @@ //! This module provides common traits for visiting or rewriting tree nodes easily. -use crate::{with_new_children_if_necessary, ExecutionPlan}; -use datafusion_common::tree_node::{DynTreeNode, Transformed}; -use datafusion_common::Result; +use std::fmt::{self, Display, Formatter}; use std::sync::Arc; +use crate::{displayable, with_new_children_if_necessary, ExecutionPlan}; + +use datafusion_common::tree_node::{ConcreteTreeNode, DynTreeNode, Transformed}; +use datafusion_common::Result; + impl DynTreeNode for dyn ExecutionPlan { fn arc_children(&self) -> Vec> { self.children() @@ -35,3 +38,64 @@ impl DynTreeNode for dyn ExecutionPlan { with_new_children_if_necessary(arc_self, new_children).map(Transformed::into) } } + +/// A node object beneficial for writing optimizer rules, encapsulating an [`ExecutionPlan`] node with a payload. +/// Since there are two ways to access child plans—directly from the plan and through child nodes—it's recommended +/// to perform mutable operations via [`Self::update_plan_from_children`]. +#[derive(Debug)] +pub struct PlanContext { + /// The execution plan associated with this context. + pub plan: Arc, + /// Custom data payload of the node. + pub data: T, + /// Child contexts of this node. 
+ pub children: Vec, +} + +impl PlanContext { + pub fn new(plan: Arc, data: T, children: Vec) -> Self { + Self { + plan, + data, + children, + } + } + + pub fn update_plan_from_children(mut self) -> Result { + let children_plans = self.children.iter().map(|c| c.plan.clone()).collect(); + self.plan = with_new_children_if_necessary(self.plan, children_plans)?.into(); + Ok(self) + } +} + +impl PlanContext { + pub fn new_default(plan: Arc) -> Self { + let children = plan.children().into_iter().map(Self::new_default).collect(); + Self::new(plan, Default::default(), children) + } +} + +impl Display for PlanContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let node_string = displayable(self.plan.as_ref()).one_line(); + write!(f, "Node plan: {}", node_string)?; + write!(f, "Node data: {}", self.data)?; + write!(f, "") + } +} + +impl ConcreteTreeNode for PlanContext { + fn children(&self) -> Vec<&Self> { + self.children.iter().collect() + } + + fn take_children(mut self) -> (Self, Vec) { + let children = std::mem::take(&mut self.children); + (self, children) + } + + fn with_new_children(mut self, children: Vec) -> Result { + self.children = children; + self.update_plan_from_children() + } +} From ff7dfc3787b46715fce2b4e5aa805bb9afb9ce3a Mon Sep 17 00:00:00 2001 From: comphead Date: Sat, 27 Jan 2024 14:40:01 -0800 Subject: [PATCH 27/27] feat: Disable client console highlight by default (#9013) * console highlight disabled by default --- datafusion-cli/src/exec.rs | 1 + datafusion-cli/src/helper.rs | 15 ++++++++++----- datafusion-cli/src/highlighter.rs | 10 ++++++---- datafusion-cli/src/lib.rs | 3 +-- datafusion-cli/src/main.rs | 18 +++++++++++------- datafusion-cli/src/print_options.rs | 1 + docs/source/user-guide/cli.md | 3 +++ 7 files changed, 33 insertions(+), 18 deletions(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index aabf69aac888..a175f99a90d8 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -128,6 +128,7 @@ pub async fn exec_from_repl( let mut rl = Editor::new()?; rl.set_helper(Some(CliHelper::new( &ctx.task_ctx().session_config().options().sql_parser.dialect, + print_options.color, ))); rl.load_history(".history").ok(); diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index 0e146d575718..a8e149b4c5c6 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -38,20 +38,25 @@ use rustyline::Context; use rustyline::Helper; use rustyline::Result; -use crate::highlighter::SyntaxHighlighter; +use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; pub struct CliHelper { completer: FilenameCompleter, dialect: String, - highlighter: SyntaxHighlighter, + highlighter: Box, } impl CliHelper { - pub fn new(dialect: &str) -> Self { + pub fn new(dialect: &str, color: bool) -> Self { + let highlighter: Box = if !color { + Box::new(NoSyntaxHighlighter {}) + } else { + Box::new(SyntaxHighlighter::new(dialect)) + }; Self { completer: FilenameCompleter::new(), dialect: dialect.into(), - highlighter: SyntaxHighlighter::new(dialect), + highlighter, } } @@ -102,7 +107,7 @@ impl CliHelper { impl Default for CliHelper { fn default() -> Self { - Self::new("generic") + Self::new("generic", false) } } diff --git a/datafusion-cli/src/highlighter.rs b/datafusion-cli/src/highlighter.rs index 28732d5b976f..0bb75510b524 100644 --- a/datafusion-cli/src/highlighter.rs +++ b/datafusion-cli/src/highlighter.rs @@ -30,20 +30,22 @@ use datafusion::sql::sqlparser::{ use 
rustyline::highlight::Highlighter; /// The syntax highlighter. +#[derive(Debug)] pub struct SyntaxHighlighter { dialect: Box, } impl SyntaxHighlighter { pub fn new(dialect: &str) -> Self { - let dialect = match dialect_from_str(dialect) { - Some(dialect) => dialect, - None => Box::new(GenericDialect {}), - }; + let dialect = dialect_from_str(dialect).unwrap_or(Box::new(GenericDialect {})); Self { dialect } } } +pub struct NoSyntaxHighlighter {} + +impl Highlighter for NoSyntaxHighlighter {} + impl Highlighter for SyntaxHighlighter { fn highlight<'l>(&self, line: &'l str, _: usize) -> Cow<'l, str> { let mut out_line = String::new(); diff --git a/datafusion-cli/src/lib.rs b/datafusion-cli/src/lib.rs index 61f9eae7dd53..139a60b8cf16 100644 --- a/datafusion-cli/src/lib.rs +++ b/datafusion-cli/src/lib.rs @@ -23,8 +23,7 @@ pub mod command; pub mod exec; pub mod functions; pub mod helper; +pub mod highlighter; pub mod object_storage; pub mod print_format; pub mod print_options; - -mod highlighter; diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index a9082f2e5351..38537dbd9238 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -136,6 +136,9 @@ struct Args { default_value = "40" )] maxrows: MaxRows, + + #[clap(long, help = "Enables console syntax highlighting")] + color: bool, } #[tokio::main] @@ -169,28 +172,28 @@ async fn main_inner() -> Result<()> { session_config = session_config.with_batch_size(batch_size); }; - let rn_config = RuntimeConfig::new(); - let rn_config = + let rt_config = RuntimeConfig::new(); + let rt_config = // set memory pool size if let Some(memory_limit) = args.memory_limit { let memory_limit = extract_memory_pool_size(&memory_limit).unwrap(); // set memory pool type if let Some(mem_pool_type) = args.mem_pool_type { match mem_pool_type { - PoolType::Greedy => rn_config + PoolType::Greedy => rt_config .with_memory_pool(Arc::new(GreedyMemoryPool::new(memory_limit))), - PoolType::Fair => rn_config + PoolType::Fair => rt_config .with_memory_pool(Arc::new(FairSpillPool::new(memory_limit))), } } else { - rn_config + rt_config .with_memory_pool(Arc::new(GreedyMemoryPool::new(memory_limit))) } } else { - rn_config + rt_config }; - let runtime_env = create_runtime_env(rn_config.clone())?; + let runtime_env = create_runtime_env(rt_config.clone())?; let mut ctx = SessionContext::new_with_config_rt(session_config.clone(), Arc::new(runtime_env)); @@ -207,6 +210,7 @@ async fn main_inner() -> Result<()> { format: args.format, quiet: args.quiet, maxrows: args.maxrows, + color: args.color, }; let commands = args.command; diff --git a/datafusion-cli/src/print_options.rs b/datafusion-cli/src/print_options.rs index b382eb34f62c..f8cd9b3258ef 100644 --- a/datafusion-cli/src/print_options.rs +++ b/datafusion-cli/src/print_options.rs @@ -70,6 +70,7 @@ pub struct PrintOptions { pub format: PrintFormat, pub quiet: bool, pub maxrows: MaxRows, + pub color: bool, } fn get_timing_info_str( diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index 95b3e7125cd4..30ab7d1495a5 100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -111,6 +111,9 @@ OPTIONS: -c, --command ... Execute the given command string(s), then exit + --color + Enables console syntax highlighting + -f, --file ... Execute commands from file(s), then exit
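For the CLI change above: syntax highlighting is now opt-in via `datafusion-cli --color`; without the flag the helper installs the no-op `NoSyntaxHighlighter`. A tiny illustrative sketch of the new constructor (hypothetical standalone usage, not part of this patch):

    use datafusion_cli::helper::CliHelper;

    fn main() {
        // Default behavior: no ANSI color codes are emitted while editing.
        let _plain = CliHelper::new("generic", false);
        // With `--color` (i.e. `color == true`), input is highlighted for the
        // configured SQL dialect, falling back to the generic dialect when the
        // dialect string is not recognized.
        let _colored = CliHelper::new("postgresql", true);
    }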