From 7b5a43ea61e9202c9a18758086cd7c871e3572b1 Mon Sep 17 00:00:00 2001 From: xzj7019 <131111794+xzj7019@users.noreply.github.com> Date: Thu, 24 Oct 2024 11:48:28 +0800 Subject: [PATCH] [opt](nereids) Refine stats derive (#42112) pick from master #40654 #40698 #40762 #42286 --- .../datasource/hive/HMSExternalTable.java | 3 +- .../exploration/mv/MaterializedViewUtils.java | 2 +- .../nereids/stats/ExpressionEstimation.java | 50 +- .../doris/nereids/stats/FilterEstimation.java | 499 ++++++++++-------- .../doris/nereids/stats/JoinEstimation.java | 170 ++++-- .../doris/nereids/stats/StatsCalculator.java | 55 +- .../doris/nereids/stats/StatsMathUtil.java | 5 + .../expressions/functions/table/Numbers.java | 2 +- .../doris/nereids/trees/plans/JoinType.java | 2 +- .../doris/nereids/types/StringType.java | 5 - .../nereids/types/coercion/CharacterType.java | 3 +- .../apache/doris/statistics/ColStatsData.java | 3 +- .../doris/statistics/ColumnStatistic.java | 35 +- .../statistics/ColumnStatisticBuilder.java | 22 +- .../doris/statistics/OlapScanStatsDerive.java | 3 - .../statistics/PartitionColumnStatistic.java | 11 +- .../PartitionColumnStatisticBuilder.java | 4 +- .../doris/statistics/StatisticRange.java | 68 ++- .../apache/doris/statistics/Statistics.java | 70 +-- .../statistics/StatisticsRepository.java | 3 +- .../doris/statistics/StatsDeriveResult.java | 21 - .../doris/statistics/util/StatisticsUtil.java | 32 +- .../stats/ExpressionEstimationTest.java | 63 +++ .../nereids/stats/FilterEstimationTest.java | 253 +++++---- .../doris/nereids/stats/JoinEstimateTest.java | 15 +- .../nereids/stats/StatsCalculatorTest.java | 4 +- .../statistics/StatsDeriveResultTest.java | 58 -- .../nereids_hint_tpcds_p0/shape/query74.out | 14 +- .../shape/query48.out | 10 +- .../shape/query58.out | 16 +- .../shape/query74.out | 14 +- .../noStatsRfPrune/query74.out | 14 +- .../no_stats_shape/query74.out | 14 +- .../rf_prune/query58.out | 24 +- .../shape/query58.out | 24 +- .../shape/query11.out | 14 
+- .../shape/query2.out | 10 +- .../shape/query4.out | 14 +- .../tpcds_sf100/noStatsRfPrune/query74.out | 14 +- .../tpcds_sf100/no_stats_shape/query74.out | 14 +- .../tpcds_sf100/rf_prune/query58.out | 16 +- .../tpcds_sf100/shape/query58.out | 16 +- .../tpcds_sf1000/shape/query58.out | 16 +- .../tpcds_sf1000/shape/query74.out | 14 +- 44 files changed, 906 insertions(+), 813 deletions(-) delete mode 100644 fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index 74d9e3c56e65d1..a215cba3f9cee6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -601,9 +601,8 @@ private Optional getHiveColumnStats(String colName) { if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) { return Optional.empty(); } - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); long count = Long.parseLong(parameters.get(NUM_ROWS)); - columnStatisticBuilder.setCount(count); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count); // The tableStats length is at most 1. 
for (ColumnStatisticsObj tableStat : tableStats) { if (!tableStat.isSetStatsData()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializedViewUtils.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializedViewUtils.java index 8a5110428b2764..2d483c45185322 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializedViewUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/exploration/mv/MaterializedViewUtils.java @@ -426,7 +426,7 @@ public Void visitLogicalJoin(LogicalJoin join, if (joinType.isInnerJoin() || joinType.isCrossJoin()) { return visit(join, context); } else if ((joinType.isLeftJoin() - || joinType.isLefSemiJoin() + || joinType.isLeftSemiJoin() || joinType.isLeftAntiJoin()) && useLeft) { return visit(join.left(), context); } else if ((joinType.isRightJoin() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java index 4068fc6b064660..2307a6dfba3525 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java @@ -128,6 +128,10 @@ public static ColumnStatistic estimate(Expression expression, Statistics stats) @Override public ColumnStatistic visit(Expression expr, Statistics context) { + ColumnStatistic stats = context.findColumnStatistics(expr); + if (stats != null) { + return stats; + } List childrenExpr = expr.children(); if (CollectionUtils.isEmpty(childrenExpr)) { return ColumnStatistic.UNKNOWN; @@ -135,26 +139,28 @@ public ColumnStatistic visit(Expression expr, Statistics context) { return expr.child(0).accept(this, context); } - //TODO: case-when need to re-implemented @Override public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) { double ndv = 
caseWhen.getWhenClauses().size(); + double width = 1; if (caseWhen.getDefaultValue().isPresent()) { ndv += 1; } for (WhenClause clause : caseWhen.getWhenClauses()) { ColumnStatistic colStats = ExpressionEstimation.estimate(clause.getResult(), context); ndv = Math.max(ndv, colStats.ndv); + width = Math.max(width, clause.getResult().getDataType().width()); } if (caseWhen.getDefaultValue().isPresent()) { ColumnStatistic colStats = ExpressionEstimation.estimate(caseWhen.getDefaultValue().get(), context); ndv = Math.max(ndv, colStats.ndv); + width = Math.max(width, caseWhen.getDefaultValue().get().getDataType().width()); } return new ColumnStatisticBuilder() .setNdv(ndv) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY) - .setAvgSizeByte(8) + .setAvgSizeByte(width) .setNumNulls(0) .build(); } @@ -162,15 +168,20 @@ public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) { @Override public ColumnStatistic visitIf(If ifClause, Statistics context) { double ndv = 2; + double width = 1; ColumnStatistic colStatsThen = ExpressionEstimation.estimate(ifClause.child(1), context); ndv = Math.max(ndv, colStatsThen.ndv); + width = Math.max(width, ifClause.child(1).getDataType().width()); + ColumnStatistic colStatsElse = ExpressionEstimation.estimate(ifClause.child(2), context); ndv = Math.max(ndv, colStatsElse.ndv); + width = Math.max(width, ifClause.child(2).getDataType().width()); + return new ColumnStatisticBuilder() .setNdv(ndv) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY) - .setAvgSizeByte(8) + .setAvgSizeByte(width) .setNumNulls(0) .build(); } @@ -242,9 +253,9 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) { return new ColumnStatisticBuilder() .setMaxValue(literalVal) .setMinValue(literalVal) - .setNdv(1) - .setNumNulls(1) - .setAvgSizeByte(1) + .setNdv(literal.isNullLiteral() ? 0 : 1) + .setNumNulls(literal.isNullLiteral() ? 
1 : 0) + .setAvgSizeByte(literal.getDataType().width()) .setMinExpr(literal.toLegacyLiteral()) .setMaxExpr(literal.toLegacyLiteral()) .build(); @@ -274,13 +285,13 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, int exprResultTypeWidth = binaryArithmetic.getDataType().width(); double dataSize = exprResultTypeWidth * rowCount; if (binaryArithmetic instanceof Add) { - return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) + return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin) .setMaxValue(leftMax + rightMax) .setMinExpr(null).setMaxExpr(null).build(); } if (binaryArithmetic instanceof Subtract) { - return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) + return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax) .setMaxValue(leftMax - rightMin).setMinExpr(null) .setMaxExpr(null).build(); @@ -297,7 +308,7 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, Math.max(leftMin * rightMin, leftMin * rightMax), leftMax * rightMin), leftMax * rightMax); - return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) + return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max) .setMaxExpr(null).setMinExpr(null).build(); } @@ -312,14 +323,14 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, Math.max(leftMin / noneZeroDivisor(rightMin), leftMin / noneZeroDivisor(rightMax)), leftMax / noneZeroDivisor(rightMin)), leftMax / noneZeroDivisor(rightMax)); - return new 
ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) + return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(binaryArithmetic.getDataType().width()).setMinValue(min) .setMaxValue(max).build(); } if (binaryArithmetic instanceof Mod) { double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax)); double max = -min; - return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv) + return new ColumnStatisticBuilder().setNdv(ndv) .setAvgSizeByte(exprResultTypeWidth) .setDataSize(dataSize) .setNumNulls(numNulls) @@ -343,8 +354,7 @@ public ColumnStatistic visitMin(Min min, Statistics context) { return ColumnStatistic.UNKNOWN; } // if this is scalar agg, we will update count and ndv to 1 when visiting group clause - return new ColumnStatisticBuilder(columnStat) - .build(); + return new ColumnStatisticBuilder(columnStat).build(); } @Override @@ -355,16 +365,14 @@ public ColumnStatistic visitMax(Max max, Statistics context) { return ColumnStatistic.UNKNOWN; } // if this is scalar agg, we will update count and ndv to 1 when visiting group clause - return new ColumnStatisticBuilder(columnStat) - .build(); + return new ColumnStatisticBuilder(columnStat).build(); } @Override public ColumnStatistic visitCount(Count count, Statistics context) { double width = count.getDataType().width(); // for scalar agg, ndv and row count will be normalized by 1 in StatsCalculator.computeAggregate() - return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(context.getRowCount()) - .setAvgSizeByte(width).build(); + return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setAvgSizeByte(width).build(); } // TODO: return a proper estimated stat after supports histogram @@ -382,14 +390,14 @@ public ColumnStatistic visitAvg(Avg avg, Statistics context) { @Override public ColumnStatistic visitYear(Year year, Statistics context) { ColumnStatistic childStat = 
year.child().accept(this, context); + double rowCount = context.getRowCount(); long minYear = 1970; long maxYear = 2038; return new ColumnStatisticBuilder() - .setCount(childStat.count) .setNdv(maxYear - minYear + 1) .setAvgSizeByte(4) .setNumNulls(childStat.numNulls) - .setDataSize(4 * childStat.count) + .setDataSize(4 * rowCount) .setMinValue(minYear) .setMaxValue(maxYear).setMinExpr(null).build(); } @@ -580,7 +588,7 @@ public ColumnStatistic visitToDate(ToDate toDate, Statistics context) { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats) .setAvgSizeByte(toDate.getDataType().width()) .setDataSize(toDate.getDataType().width() * context.getRowCount()); - if (childColumnStats.minOrMaxIsInf()) { + if (childColumnStats.isMinMaxInvalid()) { return columnStatisticBuilder.build(); } double minValue; @@ -611,7 +619,7 @@ public ColumnStatistic visitToDays(ToDays toDays, Statistics context) { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats) .setAvgSizeByte(toDays.getDataType().width()) .setDataSize(toDays.getDataType().width() * context.getRowCount()); - if (childColumnStats.minOrMaxIsInf()) { + if (childColumnStats.isMinMaxInvalid()) { return columnStatisticBuilder.build(); } double minValue; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index a65a07fea30bdf..8ca083c1bb20b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -103,7 +103,7 @@ public Statistics estimate(Expression expression, Statistics inputStats) { } outputStats = expression.accept(this, new EstimationContext(deltaStats.build())); } - outputStats.enforceValid(); + outputStats.normalizeColumnStatistics(); return outputStats; } @@ -117,13 +117,14 @@ public Statistics 
visitCompoundPredicate(CompoundPredicate predicate, Estimation Expression leftExpr = predicate.child(0); Expression rightExpr = predicate.child(1); Statistics leftStats = leftExpr.accept(this, context); - leftStats = leftStats.normalizeByRatio(context.statistics.getRowCount()); - Statistics andStats = rightExpr.accept(this, - new EstimationContext(leftStats)); + leftStats.normalizeColumnStatistics(context.statistics.getRowCount(), true); + Statistics andStats = rightExpr.accept(this, new EstimationContext(leftStats)); if (predicate instanceof And) { + andStats.normalizeColumnStatistics(context.statistics.getRowCount(), true); return andStats; } else if (predicate instanceof Or) { Statistics rightStats = rightExpr.accept(this, context); + rightStats.normalizeColumnStatistics(context.statistics.getRowCount(), true); double rowCount = leftStats.getRowCount() + rightStats.getRowCount() - andStats.getRowCount(); Statistics orStats = context.statistics.withRowCount(rowCount); Set leftInputSlots = leftExpr.getInputSlots(); @@ -171,42 +172,32 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon ColumnStatistic statsForLeft = ExpressionEstimation.estimate(left, context.statistics); ColumnStatistic statsForRight = ExpressionEstimation.estimate(right, context.statistics); if (!left.isConstant() && !right.isConstant()) { - return calculateWhenBothColumn(cp, context, statsForLeft, statsForRight); + return estimateColumnToColumn(cp, context, statsForLeft, statsForRight); } else { - // For literal, it's max min is same value. 
- return calculateWhenLiteralRight(cp, - statsForLeft, - statsForRight, - context); + return estimateColumnToConstant(cp, statsForLeft, statsForRight, context); } } - private Statistics updateLessThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, - ColumnStatistic statsForRight, EstimationContext context) { - StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, - statsForRight.maxValue, statsForRight.maxExpr, - statsForLeft.ndv, dataType); - return estimateBinaryComparisonFilter(leftExpr, dataType, - statsForLeft, - rightRange, context); + private Statistics estimateColumnLessThanConstant(Expression leftExpr, DataType dataType, + ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { + StatisticRange constantRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, + statsForRight.maxValue, statsForRight.maxExpr, statsForLeft.ndv, dataType); + return estimateColumnToConstantRange(leftExpr, dataType, statsForLeft, constantRange, context); } - private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft, - ColumnStatistic statsForRight, EstimationContext context) { - StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, - statsForLeft.maxValue, statsForLeft.maxExpr, - statsForLeft.ndv, dataType); - return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context); + private Statistics estimateColumnGreaterThanConstant(Expression leftExpr, DataType dataType, + ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { + StatisticRange constantRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, + statsForLeft.maxValue, statsForLeft.maxExpr, statsForLeft.ndv, dataType); + return estimateColumnToConstantRange(leftExpr, dataType, statsForLeft, constantRange, context); } - private 
Statistics calculateWhenLiteralRight(ComparisonPredicate cp, + private Statistics estimateColumnToConstant(ComparisonPredicate cp, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { if (statsForLeft.isUnKnown) { return context.statistics.withSel(DEFAULT_INEQUALITY_COEFFICIENT); - } - - if (cp instanceof EqualPredicate) { - return estimateEqualTo(cp, statsForLeft, statsForRight, context); + } else if (cp instanceof EqualPredicate) { + return estimateColumnEqualToConstant(cp, statsForLeft, statsForRight, context); } else { // literal Map used to covert dateLiteral back to stringLiteral Map literalMap = new HashMap<>(); @@ -229,12 +220,13 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, statsForLeftMayConverted = statsForLeftMayConvertedOpt.get(); statsForRightMayConverted = statsForRightMayConvertedOpt.get(); } - Statistics result = null; + + Statistics result; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + result = estimateColumnLessThanConstant(cp.left(), compareType, statsForLeftMayConverted, statsForRightMayConverted, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + result = estimateColumnGreaterThanConstant(cp.left(), compareType, statsForLeftMayConverted, statsForRightMayConverted, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); @@ -315,14 +307,15 @@ private Optional tryConvertStrLiteralToDateLiteral(LiteralExpr lite return dt == null ? 
Optional.empty() : Optional.of(dt); } - private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft, - ColumnStatistic statsForRight, - EstimationContext context) { + private Statistics estimateColumnEqualToConstant(ComparisonPredicate cp, ColumnStatistic statsForLeft, + ColumnStatistic statsForRight, EstimationContext context) { double selectivity; if (statsForLeft.isUnKnown) { selectivity = DEFAULT_INEQUALITY_COEFFICIENT; } else { double ndv = statsForLeft.ndv; + double numNulls = statsForLeft.numNulls; + double rowCount = context.statistics.getRowCount(); if (statsForRight.isUnKnown) { if (ndv >= 1.0) { selectivity = 1.0 / ndv; @@ -332,13 +325,15 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats } else { double val = statsForRight.maxValue; if (val > statsForLeft.maxValue || val < statsForLeft.minValue) { + // TODO: make sure left's stats is RangeScalable whose min/max is trustable. + // The equal to constant doesn't rely on the range, so maybe safe here. 
selectivity = 0.0; } else if (ndv >= 1.0) { selectivity = StatsMathUtil.minNonNaN(1.0, 1.0 / ndv); } else { selectivity = DEFAULT_INEQUALITY_COEFFICIENT; } - selectivity = getNotNullSelectivity(statsForLeft, selectivity); + selectivity = getNotNullSelectivity(numNulls, rowCount, ndv, selectivity); } } Statistics equalStats = context.statistics.withSel(selectivity); @@ -348,112 +343,132 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats if (!(left instanceof SlotReference)) { left.accept(new ColumnStatsAdjustVisitor(), equalStats); } + // normalize column statistics here to sync numNulls by proportion and ndv by current row count + equalStats.normalizeColumnStatistics(context.statistics.getRowCount(), true); return equalStats; } - private Statistics calculateWhenBothColumn(ComparisonPredicate cp, EstimationContext context, + private Statistics estimateColumnToColumn(ComparisonPredicate cp, EstimationContext context, ColumnStatistic statsForLeft, ColumnStatistic statsForRight) { Expression left = cp.left(); Expression right = cp.right(); if (cp instanceof EqualPredicate) { return estimateColumnEqualToColumn(left, statsForLeft, right, statsForRight, cp instanceof NullSafeEqual, context); - } - if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { + } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { return estimateColumnLessThanColumn(right, statsForRight, left, statsForLeft, context); - } - if (cp instanceof LessThan || cp instanceof LessThanEqual) { + } else if (cp instanceof LessThan || cp instanceof LessThanEqual) { return estimateColumnLessThanColumn(left, statsForLeft, right, statsForRight, context); + } else { + return context.statistics; } - return context.statistics; } - @Override - public Statistics visitInPredicate(InPredicate inPredicate, EstimationContext context) { - Expression compareExpr = inPredicate.getCompareExpr(); - ColumnStatistic compareExprStats = 
ExpressionEstimation.estimate(compareExpr, context.statistics); - if (compareExprStats.isUnKnown || compareExpr instanceof Function) { - return context.statistics.withSel(DEFAULT_IN_COEFFICIENT); - } + private ColumnStatistic updateInPredicateColumnStatistics(InPredicate inPredicate, EstimationContext context, + ColumnStatistic compareExprStats) { List options = inPredicate.getOptions(); - // init minOption and maxOption by compareExpr.max and compareExpr.min respectively, - // and then adjust min/max by options - double minOptionValue = compareExprStats.maxValue; - double maxOptionValue = compareExprStats.minValue; - LiteralExpr minOptionLiteral = compareExprStats.maxExpr; - LiteralExpr maxOptionLiteral = compareExprStats.minExpr; - /* suppose A.(min, max) = (0, 10), A.ndv=10 - A in ( 1, 2, 5, 100): - validInOptCount = 3, that is (1, 2, 5) - table selectivity = 3/10 - A.min = 1, A.max=5 - A.selectivity = 3/5 - A.ndv = 3 - A not in (1, 2, 3, 100): - validInOptCount = 10 - 3 - we assume that 1, 2, 3 exist in A - A.ndv = 10 - 3 = 7 - table selectivity = 7/10 - A.(min, max) not changed - A.selectivity = 7/10 - */ - int validInOptCount = 0; - double selectivity = 1.0; ColumnStatisticBuilder compareExprStatsBuilder = new ColumnStatisticBuilder(compareExprStats); - int nonLiteralOptionCount = 0; - for (Expression option : options) { - ColumnStatistic optionStats = ExpressionEstimation.estimate(option, context.statistics); - if (option instanceof Literal) { - // remove the options which is out of compareExpr.range - if (compareExprStats.minValue <= optionStats.maxValue - && optionStats.minValue <= compareExprStats.maxValue) { - validInOptCount++; - LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral(); - if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) { - maxOptionLiteral = optionLiteralExpr; - maxOptionValue = optionStats.maxValue; - } - if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) 
{ - minOptionLiteral = optionLiteralExpr; - minOptionValue = optionStats.minValue; + if (!compareExprStats.isMinMaxInvalid()) { + // init minOption and maxOption by compareExpr.max and compareExpr.min respectively, + // and then adjust min/max by options + double minOptionValue = compareExprStats.maxValue; + double maxOptionValue = compareExprStats.minValue; + LiteralExpr minOptionLiteral = compareExprStats.maxExpr; + LiteralExpr maxOptionLiteral = compareExprStats.minExpr; + /* suppose A.(min, max) = (0, 10), A.ndv=10 + A in ( 1, 2, 5, 100): + validInOptCount = 3, that is (1, 2, 5) + table selectivity = 3/10 + A.min = 1, A.max=5 + A.selectivity = 3/5 + A.ndv = 3 + A not in (1, 2, 3, 100): + validInOptCount = 10 - 3 + we assume that 1, 2, 3 exist in A + A.ndv = 10 - 3 = 7 + table selectivity = 7/10 + A.(min, max) not changed + A.selectivity = 7/10 + */ + int validInOptCount = 0; + int nonLiteralOptionCount = 0; + for (Expression option : options) { + ColumnStatistic optionStats = ExpressionEstimation.estimate(option, context.statistics); + if (option instanceof Literal) { + // remove the options which is out of compareExpr.range + Preconditions.checkState(Math.abs(optionStats.maxValue - optionStats.minValue) < 1e-06, + "literal's min/max doesn't equal"); + double constValue = optionStats.maxValue; + if (compareExprStats.minValue <= constValue && compareExprStats.maxValue >= constValue) { + validInOptCount++; + LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral(); + if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) { + maxOptionLiteral = optionLiteralExpr; + maxOptionValue = constValue; + } + + if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) { + minOptionLiteral = optionLiteralExpr; + minOptionValue = constValue; + } } + } else { + nonLiteralOptionCount++; } - } else { - nonLiteralOptionCount++; } - } - if (nonLiteralOptionCount > 0) { - // A in (x+1, ...) 
- // "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value. - // and hence, we do not adjust compareExpr.stats.range. - int newNdv = nonLiteralOptionCount + validInOptCount; - if (newNdv < compareExprStats.ndv) { - compareExprStatsBuilder.setNdv(newNdv); - selectivity = StatsMathUtil.divide(newNdv, compareExprStats.ndv); + if (nonLiteralOptionCount > 0) { + // A in (x+1, ...) + // "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value. + // and hence, we do not adjust compareExpr.stats.range. + int newNdv = nonLiteralOptionCount + validInOptCount; + if (newNdv < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(newNdv); + } } else { - selectivity = 1.0; + maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue); + minOptionValue = Math.max(minOptionValue, compareExprStats.minValue); + compareExprStatsBuilder.setMaxValue(maxOptionValue); + compareExprStatsBuilder.setMaxExpr(maxOptionLiteral); + compareExprStatsBuilder.setMinValue(minOptionValue); + compareExprStatsBuilder.setMinExpr(minOptionLiteral); + if (validInOptCount < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(validInOptCount); + } } } else { - maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue); - minOptionValue = Math.max(minOptionValue, compareExprStats.minValue); - compareExprStatsBuilder.setMaxValue(maxOptionValue); - compareExprStatsBuilder.setMaxExpr(maxOptionLiteral); - compareExprStatsBuilder.setMinValue(minOptionValue); - compareExprStatsBuilder.setMinExpr(minOptionLiteral); - if (validInOptCount < compareExprStats.ndv) { - compareExprStatsBuilder.setNdv(validInOptCount); - selectivity = StatsMathUtil.divide(validInOptCount, compareExprStats.ndv); - } else { - selectivity = 1.0; - } + // other types, such as string type, using option's size to estimate + // min/max will not be updated + compareExprStatsBuilder.setNdv(Math.min(options.size(), 
compareExprStats.getOriginalNdv())); } compareExprStatsBuilder.setNumNulls(0); + return compareExprStatsBuilder.build(); + } + + @Override + public Statistics visitInPredicate(InPredicate inPredicate, EstimationContext context) { + Expression compareExpr = inPredicate.getCompareExpr(); + ColumnStatistic compareExprStats = ExpressionEstimation.estimate(compareExpr, context.statistics); + if (compareExprStats.isUnKnown || compareExpr instanceof Function) { + return context.statistics.withSel(DEFAULT_IN_COEFFICIENT); + } + + List options = inPredicate.getOptions(); + ColumnStatistic newCompareExprStats = updateInPredicateColumnStatistics(inPredicate, context, compareExprStats); + double selectivity; + if (!newCompareExprStats.isMinMaxInvalid()) { + selectivity = Statistics.getValidSelectivity( + Math.min(StatsMathUtil.divide(newCompareExprStats.ndv, compareExprStats.ndv), 1)); + } else { + selectivity = Statistics.getValidSelectivity( + Math.min(options.size() / compareExprStats.getOriginalNdv(), 1)); + } + Statistics estimated = new StatisticsBuilder(context.statistics).build(); - ColumnStatistic stats = compareExprStatsBuilder.build(); - selectivity = getNotNullSelectivity(stats, selectivity); + selectivity = getNotNullSelectivity(compareExprStats.numNulls, estimated.getRowCount(), + compareExprStats.ndv, selectivity); estimated = estimated.withSel(selectivity); - estimated.addColumnStats(compareExpr, stats); + estimated.addColumnStats(compareExpr, newCompareExprStats); context.addKeyIfSlot(compareExpr); return estimated; } @@ -470,6 +485,7 @@ public Statistics visitNot(Not not, EstimationContext context) { } Expression child = not.child(); Statistics childStats = child.accept(this, context); + childStats.normalizeColumnStatistics(); //if estimated rowCount is 0, adjust to 1 to make upper join reorder reasonable. 
double rowCount = Math.max(context.statistics.getRowCount() - childStats.getRowCount(), 1); StatisticsBuilder statisticsBuilder = new StatisticsBuilder(context.statistics).setRowCount(rowCount); @@ -536,17 +552,19 @@ public Statistics visitIsNull(IsNull isNull, EstimationContext context) { double row = context.statistics.getRowCount() * DEFAULT_ISNULL_SELECTIVITY; return new StatisticsBuilder(context.statistics).setRowCount(row).build(); } - double outputRowCount = childColStats.numNulls; + double outputRowCount = Math.min(childColStats.numNulls, context.statistics.getRowCount()); if (!isOnBaseTable) { // for is null on base table, use the numNulls, otherwise // nulls will be generated such as outer join and then we do a protection Expression child = isNull.child(); Statistics childStats = child.accept(this, context); + childStats.normalizeColumnStatistics(); outputRowCount = Math.max(childStats.getRowCount() * DEFAULT_ISNULL_SELECTIVITY, outputRowCount); outputRowCount = Math.max(outputRowCount, 1); } + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(childColStats); - colBuilder.setCount(outputRowCount).setNumNulls(outputRowCount) + colBuilder.setNumNulls(outputRowCount) .setMaxValue(Double.POSITIVE_INFINITY) .setMinValue(Double.NEGATIVE_INFINITY) .setNdv(0); @@ -580,26 +598,27 @@ public boolean isKeySlot(Expression expr) { } } - private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, + private Statistics estimateColumnToConstantRange(Expression leftExpr, DataType dataType, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { - StatisticRange leftRange = - new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, - leftStats.ndv, dataType); - StatisticRange intersectRange = leftRange.cover(rightRange); - + StatisticRange leftRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, + leftStats.maxValue, 
leftStats.maxExpr, leftStats.ndv, dataType); ColumnStatisticBuilder leftColumnStatisticBuilder; Statistics updatedStatistics; + + StatisticRange intersectRange = leftRange.intersect(rightRange, true); + double sel = leftRange.getDistinctValues() == 0 + ? 1.0 + : intersectRange.getDistinctValues() / leftRange.getDistinctValues(); if (intersectRange.isEmpty()) { - updatedStatistics = context.statistics.withRowCount(0); leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) .setMinValue(Double.NEGATIVE_INFINITY) .setMinExpr(null) .setMaxValue(Double.POSITIVE_INFINITY) .setMaxExpr(null) .setNdv(0) - .setCount(0) .setNumNulls(0); - } else { + updatedStatistics = context.statistics.withRowCount(0); + } else if (dataType instanceof RangeScalable || sel == 0 || sel == 1) { leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) .setMinValue(intersectRange.getLow()) .setMinExpr(intersectRange.getLowExpr()) @@ -607,43 +626,63 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType .setMaxExpr(intersectRange.getHighExpr()) .setNdv(intersectRange.getDistinctValues()) .setNumNulls(0); - double sel = leftRange.getDistinctValues() == 0 - ? 
1.0 - : intersectRange.getDistinctValues() / leftRange.getDistinctValues(); - if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) { - sel = DEFAULT_INEQUALITY_COEFFICIENT; - } else { - sel = Math.max(sel, RANGE_SELECTIVITY_THRESHOLD); - } - sel = getNotNullSelectivity(leftStats, sel); + sel = Math.max(sel, RANGE_SELECTIVITY_THRESHOLD); + sel = getNotNullSelectivity(leftStats.numNulls, context.statistics.getRowCount(), leftStats.ndv, sel); + updatedStatistics = context.statistics.withSel(sel); + } else { + sel = DEFAULT_INEQUALITY_COEFFICIENT; + sel = getNotNullSelectivity(leftStats.numNulls, context.statistics.getRowCount(), leftStats.ndv, sel); + leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) + .setMinValue(intersectRange.getLow()) + .setMinExpr(intersectRange.getLowExpr()) + .setMaxValue(intersectRange.getHigh()) + .setMaxExpr(intersectRange.getHighExpr()) + .setNdv(Math.max(1, Math.min(leftStats.ndv * sel, intersectRange.getDistinctValues()))) + .setNumNulls(0); updatedStatistics = context.statistics.withSel(sel); - leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); } updatedStatistics.addColumnStats(leftExpr, leftColumnStatisticBuilder.build()); context.addKeyIfSlot(leftExpr); leftExpr.accept(new ColumnStatsAdjustVisitor(), updatedStatistics); + // normalize column statistics here to sync numNulls by proportion and ndv by current row count + updatedStatistics.normalizeColumnStatistics(context.statistics.getRowCount(), true); + return updatedStatistics; } private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatistic leftStats, Expression rightExpr, ColumnStatistic rightStats, boolean keepNull, EstimationContext context) { + ColumnStatisticBuilder intersectBuilder = new ColumnStatisticBuilder(leftStats); StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType()); StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType()); - 
StatisticRange leftIntersectRight = leftRange.intersect(rightRange); - StatisticRange intersect = rightRange.intersect(leftIntersectRight); - ColumnStatisticBuilder intersectBuilder = new ColumnStatisticBuilder(leftStats); - intersectBuilder.setNdv(intersect.getDistinctValues()); + StatisticRange intersect = leftRange.intersect(rightRange); intersectBuilder.setMinValue(intersect.getLow()); intersectBuilder.setMaxValue(intersect.getHigh()); - double numNull = 0; - if (keepNull) { - numNull = Math.min(leftStats.numNulls, rightStats.numNulls); + + if (leftExpr.getDataType() instanceof RangeScalable && rightExpr.getDataType() instanceof RangeScalable + && !leftStats.isMinMaxInvalid() && !rightStats.isMinMaxInvalid()) { + intersectBuilder.setNdv(intersect.getDistinctValues()); + } else { + // the intersect ndv uses the min ndv, but the selectivity computation uses the max + intersectBuilder.setNdv(Math.min(leftStats.ndv, rightStats.ndv)); } + double numNull = keepNull ? Math.min(leftStats.numNulls, rightStats.numNulls) : 0; intersectBuilder.setNumNulls(numNull); - double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); - Statistics updatedStatistics = context.statistics.withSel(sel, numNull); - updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build()); - updatedStatistics.addColumnStats(rightExpr, intersectBuilder.build()); + + // TODO: consider notNullSelectivity + //double origRowCount = context.statistics.getRowCount(); + double leftNotNullSel = 1.0; //Statistics.getValidSelectivity(1 - (leftStats.numNulls / origRowCount)); + double rightNotNullSel = 1.0; //Statistics.getValidSelectivity(1 - (rightStats.numNulls / origRowCount)); + double notNullSel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)) + * (keepNull ? 
1 : leftNotNullSel * rightNotNullSel); + + Statistics updatedStatistics = context.statistics.withSel(notNullSel, numNull); + ColumnStatistic newLeftStatistics = intersectBuilder + .setAvgSizeByte(leftStats.avgSizeByte).build(); + ColumnStatistic newRightStatistics = intersectBuilder + .setAvgSizeByte(rightStats.avgSizeByte).build(); + updatedStatistics.addColumnStats(leftExpr, newLeftStatistics); + updatedStatistics.addColumnStats(rightExpr, newRightStatistics); context.addKeyIfSlot(leftExpr); context.addKeyIfSlot(rightExpr); @@ -654,102 +693,116 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) { StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType()); StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType()); - Statistics statistics = null; - // Left always less than Right - if (leftRange.getHigh() < rightRange.getLow()) { - statistics = - context.statistics.withRowCount(Math.min(context.statistics.getRowCount() - leftStats.numNulls, - context.statistics.getRowCount() - rightStats.numNulls)); - statistics.addColumnStats(leftExpr, new ColumnStatisticBuilder(leftStats).setNumNulls(0.0).build()); - statistics.addColumnStats(rightExpr, new ColumnStatisticBuilder(rightStats).setNumNulls(0.0).build()); - context.addKeyIfSlot(leftExpr); - context.addKeyIfSlot(rightExpr); - return statistics; - } - if (leftRange.isInfinite() || rightRange.isInfinite()) { - return context.statistics.withSel(DEFAULT_INEQUALITY_COEFFICIENT); - } + StatisticRange intersect = leftRange.intersect(rightRange); + + if (leftExpr.getDataType() instanceof RangeScalable && rightExpr.getDataType() instanceof RangeScalable + && !leftStats.isMinMaxInvalid() && !rightStats.isMinMaxInvalid()) { + // TODO: use intersect interface to refine this to avoid this kind of left-dominating style + Statistics statistics; + // Left always less than Right 
+ if (leftRange.getHigh() < rightRange.getLow()) { + statistics = + context.statistics.withRowCount(Math.min(context.statistics.getRowCount() - leftStats.numNulls, + context.statistics.getRowCount() - rightStats.numNulls)); + statistics.addColumnStats(leftExpr, new ColumnStatisticBuilder(leftStats).setNumNulls(0.0).build()); + statistics.addColumnStats(rightExpr, new ColumnStatisticBuilder(rightStats).setNumNulls(0.0).build()); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); + return statistics; + } - double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); + double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); - if (leftOverlapPercent == 0.0) { - // Left always greater than right - return context.statistics.withRowCount(0.0); - } - StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, - rightStats.minValue, rightStats.minExpr, Double.NaN, leftExpr.getDataType()); - double leftAlwaysLessThanRightPercent = 0; - if (leftRange.getLow() < rightRange.getLow()) { - leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange); - } - ColumnStatistic leftColumnStatistic = new ColumnStatisticBuilder(leftStats) - .setMaxValue(Math.min(leftRange.getHigh(), rightRange.getHigh())) - .setMinValue(leftRange.getLow()) - .setNdv(leftStats.ndv * (leftAlwaysLessThanRightPercent + leftOverlapPercent)) - .setNumNulls(0) - .build(); - double rightOverlappingRangeFraction = rightRange.overlapPercentWith(leftRange); - double rightAlwaysGreaterRangeFraction = 0; - if (leftRange.getHigh() < rightRange.getHigh()) { - rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange( - leftRange.getHigh(), leftRange.getHighExpr(), - rightRange.getHigh(), rightRange.getHighExpr(), - Double.NaN, rightExpr.getDataType())); - } - ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats) - .setMinValue(Math.max(leftRange.getLow(), 
rightRange.getLow())) - .setMaxValue(rightRange.getHigh()) - .setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + rightOverlappingRangeFraction)) - .setNumNulls(0) - .build(); - double sel = DEFAULT_INEQUALITY_COEFFICIENT; - if (leftExpr.getDataType() instanceof RangeScalable) { - sel = leftAlwaysLessThanRightPercent - + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT - + leftOverlapPercent * rightAlwaysGreaterRangeFraction; - } else if (leftOverlapPercent == 1.0) { - sel = 1.0; + if (leftOverlapPercent == 0.0) { + // Left always greater than right + return context.statistics.withRowCount(0.0); + } + StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, + rightStats.minValue, rightStats.minExpr, Double.NaN, leftExpr.getDataType()); + double leftAlwaysLessThanRightPercent = 0; + if (leftRange.getLow() < rightRange.getLow()) { + leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange); + } + ColumnStatistic leftColumnStatistic = new ColumnStatisticBuilder(leftStats) + .setMaxValue(Math.min(leftRange.getHigh(), rightRange.getHigh())) + .setMinValue(leftRange.getLow()) + .setNdv(leftStats.ndv * (leftAlwaysLessThanRightPercent + leftOverlapPercent)) + .setNumNulls(0) + .build(); + double rightOverlappingRangeFraction = rightRange.overlapPercentWith(leftRange); + double rightAlwaysGreaterRangeFraction = 0; + if (leftRange.getHigh() < rightRange.getHigh()) { + rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange( + leftRange.getHigh(), leftRange.getHighExpr(), + rightRange.getHigh(), rightRange.getHighExpr(), + Double.NaN, rightExpr.getDataType())); + } + ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats) + .setMinValue(Math.max(leftRange.getLow(), rightRange.getLow())) + .setMaxValue(rightRange.getHigh()) + .setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + 
rightOverlappingRangeFraction)) + .setNumNulls(0) + .build(); + double sel; + if (leftExpr.getDataType() instanceof RangeScalable) { + sel = leftAlwaysLessThanRightPercent + + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT + + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + } else if (leftOverlapPercent == 1.0) { + sel = 1.0; + } else { + sel = DEFAULT_INEQUALITY_COEFFICIENT; + } + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); + return context.statistics.withSel(sel) + .addColumnStats(leftExpr, leftColumnStatistic) + .addColumnStats(rightExpr, rightColumnStatistic); + } else { + ColumnStatistic leftColumnStatistic = new ColumnStatisticBuilder(leftStats) + .setMaxValue(intersect.getHigh()) + .setMinValue(intersect.getLow()) + .setNumNulls(0) + .setNdv(Math.max(leftStats.ndv * DEFAULT_INEQUALITY_COEFFICIENT, 1)) + .build(); + ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats) + .setMaxValue(intersect.getHigh()) + .setMinValue(intersect.getLow()) + .setNumNulls(0) + .setNdv(Math.max(rightStats.ndv * DEFAULT_INEQUALITY_COEFFICIENT, 1)) + .build(); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); + return context.statistics.withSel(DEFAULT_INEQUALITY_COEFFICIENT) + .addColumnStats(leftExpr, leftColumnStatistic) + .addColumnStats(rightExpr, rightColumnStatistic); } - context.addKeyIfSlot(leftExpr); - context.addKeyIfSlot(rightExpr); - return context.statistics.withSel(sel) - .addColumnStats(leftExpr, leftColumnStatistic) - .addColumnStats(rightExpr, rightColumnStatistic); } @Override public Statistics visitLike(Like like, EstimationContext context) { StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics); - statsBuilder.setRowCount(context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY); + double rowCount = context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY; + statsBuilder.setRowCount(rowCount); if 
(like.left() instanceof Slot) { ColumnStatistic origin = context.statistics.findColumnStatistics(like.left()); Preconditions.checkArgument(origin != null, "col stats not found. slot=%s in %s", like.left().toSql(), like.toSql()); ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin); - double selectivity = StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv); - double notNullSel = getNotNullSelectivity(origin, selectivity); - colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY) - .setCount(notNullSel * context.statistics.getRowCount()).setNumNulls(0); + colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0); statsBuilder.putColumnStatistics(like.left(), colBuilder.build()); context.addKeyIfSlot(like.left()); } return statsBuilder.build(); } - private double getNotNullSelectivity(ColumnStatistic stats, double origSel) { - double rowCount = stats.count; - double numNulls = stats.numNulls; - - // comment following check since current rowCount and ndv may be inconsistant - // e.g, rowCount has been reduced by one filter but another filter column's - // ndv and numNull remains originally, which will unexpectedly go into the following - // normalization. - - //if (numNulls > rowCount - ndv) { - // numNulls = rowCount - ndv > 0 ? rowCount - ndv : 0; - //} - double notNullSel = rowCount <= 1.0 ? 1.0 : 1 - Statistics.getValidSelectivity(numNulls / rowCount); + private double getNotNullSelectivity(double origNumNulls, double origRowCount, double origNdv, double origSel) { + if (origNumNulls > origRowCount - origNdv) { + origNumNulls = origRowCount - origNdv > 0 ? origRowCount - origNdv : 0; + } + double notNullSel = origRowCount <= 1.0 ? 
1.0 : 1 - Statistics + .getValidSelectivity(origNumNulls / origRowCount); double validSel = origSel * notNullSel; return Statistics.getValidSelectivity(validSel); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index f8298871f0d632..a7430a41055971 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -46,9 +46,11 @@ public class JoinEstimation { private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3; private static double UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND = 0.5; + private static double TRUSTABLE_CONDITION_SELECTIVITY_POW_FACTOR = 2.0; + private static double UNTRUSTABLE_CONDITION_SELECTIVITY_LINEAR_FACTOR = 0.9; + private static double TRUSTABLE_UNIQ_THRESHOLD = 0.9; - private static EqualPredicate normalizeHashJoinCondition(EqualPredicate equal, Statistics leftStats, - Statistics rightStats) { + private static EqualPredicate normalizeEqualPredJoinCondition(EqualPredicate equal, Statistics rightStats) { boolean changeOrder = equal.left().getInputSlots().stream() .anyMatch(slot -> rightStats.findColumnStatistics(slot) != null); if (changeOrder) { @@ -58,7 +60,7 @@ private static EqualPredicate normalizeHashJoinCondition(EqualPredicate equal, S } } - private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics leftStats, + private static boolean joinConditionContainsUnknownColumnStats(Statistics leftStats, Statistics rightStats, Join join) { for (Expression expr : join.getEqualPredicates()) { for (Slot slot : expr.getInputSlots()) { @@ -74,7 +76,8 @@ private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics le return false; } - private static Statistics estimateHashJoin(Statistics leftStats, Statistics rightStats, Join join) { + private static Statistics 
estimateInnerJoinWithEqualPredicate(Statistics leftStats, + Statistics rightStats, Join join) { /* * When we estimate filter A=B, * if any side of equation, A or B, is almost unique, the confidence level of estimation is high. @@ -91,14 +94,13 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ .map(expression -> (EqualPredicate) expression) .filter( expression -> { - // since ndv is not accurate, if ndv/rowcount < almostUniqueThreshold, + // since ndv is not accurate, if ndv/rowcount > TRUSTABLE_UNIQ_THRESHOLD, // this column is regarded as unique. - double almostUniqueThreshold = 0.9; - EqualPredicate equal = normalizeHashJoinCondition(expression, leftStats, rightStats); + EqualPredicate equal = normalizeEqualPredJoinCondition(expression, rightStats); ColumnStatistic eqLeftColStats = ExpressionEstimation.estimate(equal.left(), leftStats); ColumnStatistic eqRightColStats = ExpressionEstimation.estimate(equal.right(), rightStats); - boolean trustable = eqRightColStats.ndv / rightStatsRowCount > almostUniqueThreshold - || eqLeftColStats.ndv / leftStatsRowCount > almostUniqueThreshold; + boolean trustable = eqRightColStats.ndv / rightStatsRowCount > TRUSTABLE_UNIQ_THRESHOLD + || eqLeftColStats.ndv / leftStatsRowCount > TRUSTABLE_UNIQ_THRESHOLD; if (!trustable) { double rNdv = StatsMathUtil.nonZeroDivisor(eqRightColStats.ndv); double lNdv = StatsMathUtil.nonZeroDivisor(eqLeftColStats.ndv); @@ -124,6 +126,8 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ double outputRowCount; if (!trustableConditions.isEmpty()) { + // TODO: strict pk-fk can use one-side stats instead of crossJoinStats + // in estimateJoinConditionSel, to get more accurate estimation. 
List joinConditionSels = trustableConditions.stream() .map(expression -> estimateJoinConditionSel(crossJoinStats, expression)) .sorted() @@ -133,10 +137,11 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ double denominator = 1.0; for (Double joinConditionSel : joinConditionSels) { sel *= Math.pow(joinConditionSel, 1 / denominator); - denominator *= 2; + denominator *= TRUSTABLE_CONDITION_SELECTIVITY_POW_FACTOR; } outputRowCount = Math.max(1, crossJoinStats.getRowCount() * sel); - outputRowCount = outputRowCount * Math.pow(0.9, unTrustableCondition.size()); + outputRowCount = outputRowCount * Math.pow(UNTRUSTABLE_CONDITION_SELECTIVITY_LINEAR_FACTOR, + unTrustableCondition.size()); } else { outputRowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); Optional ratio = unTrustEqualRatio.stream().min(Double::compareTo); @@ -148,8 +153,9 @@ private static Statistics estimateHashJoin(Statistics leftStats, Statistics righ return innerJoinStats; } - private static Statistics estimateNestLoopJoin(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + private static Statistics estimateInnerJoinWithoutEqualPredicate(Statistics leftStats, + Statistics rightStats, Join join) { + if (joinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { double rowCount = (leftStats.getRowCount() + rightStats.getRowCount()); // We do more like the nested loop join with one rows than inner join if (leftStats.getRowCount() == 1 || rightStats.getRowCount() == 1) { @@ -193,7 +199,7 @@ private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statisti } private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + if (joinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { double rowCount = 
Math.max(leftStats.getRowCount(), rightStats.getRowCount()); rowCount = Math.max(1, rowCount); return new StatisticsBuilder() @@ -205,9 +211,9 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig Statistics innerJoinStats; if (join.getEqualPredicates().isEmpty()) { - innerJoinStats = estimateNestLoopJoin(leftStats, rightStats, join); + innerJoinStats = estimateInnerJoinWithoutEqualPredicate(leftStats, rightStats, join); } else { - innerJoinStats = estimateHashJoin(leftStats, rightStats, join); + innerJoinStats = estimateInnerJoinWithEqualPredicate(leftStats, rightStats, join); } if (!join.getOtherJoinConjuncts().isEmpty()) { @@ -266,21 +272,25 @@ private static double estimateSemiOrAntiRowCountBySlotsEqual(Statistics leftStat return Math.max(1, rowCount); } - private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) { + private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, + Statistics innerJoinStats, Join join) { + if (joinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) { double sel = join.isMarkJoin() ? 
1.0 : computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); + Statistics result; if (join.getJoinType().isLeftSemiOrAntiJoin()) { - return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel) + result = new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) .putColumnStatistics(rightStats.columnStatistics()) .build(); } else { //right semi or anti - return new StatisticsBuilder().setRowCount(rightStats.getRowCount() * sel) + result = new StatisticsBuilder().setRowCount(rightStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) .putColumnStatistics(rightStats.columnStatistics()) .build(); } + result.normalizeColumnStatistics(); + return result; } double rowCount = Double.POSITIVE_INFINITY; for (Expression conjunct : join.getEqualPredicates()) { @@ -292,12 +302,41 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri } if (Double.isInfinite(rowCount)) { //slotsEqual estimation failed, fall back to original algorithm - Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double baseRowCount = join.getJoinType().isLeftSemiOrAntiJoin() ? 
leftStats.getRowCount() : rightStats.getRowCount(); rowCount = Math.min(innerJoinStats.getRowCount(), baseRowCount); return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else { + // TODO: tuning the new semi/anti estimation method + /*double crossRowCount = Math.max(1, leftStats.getRowCount()) * Math.max(1, rightStats.getRowCount()); + double selectivity = innerJoinStats.getRowCount() / crossRowCount; + selectivity = Statistics.getValidSelectivity(selectivity); + double outputRowCount; + StatisticsBuilder builder; + + if (join.getJoinType().isLeftSemiOrAntiJoin()) { + outputRowCount = leftStats.getRowCount(); + builder = new StatisticsBuilder(leftStats); + } else { + outputRowCount = rightStats.getRowCount(); + builder = new StatisticsBuilder(rightStats); + } + if (join.getJoinType().isLeftSemiJoin() || join.getJoinType().isRightSemiJoin()) { + outputRowCount *= selectivity; + } else { + outputRowCount *= 1 - selectivity; + if (join.getJoinType().isLeftAntiJoin() && rightStats.getRowCount() < 1) { + outputRowCount = leftStats.getRowCount(); + } else if (join.getJoinType().isRightAntiJoin() && leftStats.getRowCount() < 1) { + outputRowCount = rightStats.getRowCount(); + } else { + outputRowCount = StatsMathUtil.normalizeRowCountOrNdv(outputRowCount); + } + } + builder.setRowCount(outputRowCount); + Statistics outputStats = builder.build(); + outputStats.normalizeColumnStatistics(); + return outputStats;*/ StatisticsBuilder builder; if (join.getJoinType().isLeftSemiOrAntiJoin()) { builder = new StatisticsBuilder(leftStats); @@ -308,7 +347,7 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri builder.setRowCount(rowCount); } Statistics outputStats = builder.build(); - outputStats.enforceValid(); + outputStats.normalizeColumnStatistics(); return outputStats; } } @@ -323,49 +362,48 @@ public static Statistics estimate(Statistics leftStats, Statistics rightStats, J .putColumnStatistics(leftStats.columnStatistics()) 
.putColumnStatistics(rightStats.columnStatistics()) .build(); + Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); if (joinType.isSemiOrAntiJoin()) { - return estimateSemiOrAnti(leftStats, rightStats, join); + Statistics outputStats = estimateSemiOrAnti(leftStats, rightStats, innerJoinStats, join); + updateJoinConditionColumnStatistics(outputStats, join); + return outputStats; } else if (joinType == JoinType.INNER_JOIN) { - Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); - innerJoinStats = updateJoinResultStatsByHashJoinCondition(innerJoinStats, join); + updateJoinConditionColumnStatistics(innerJoinStats, join); return innerJoinStats; } else if (joinType == JoinType.LEFT_OUTER_JOIN) { - Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(leftStats.getRowCount(), innerJoinStats.getRowCount()); - rowCount = Math.max(leftStats.getRowCount(), rowCount); + updateJoinConditionColumnStatistics(crossJoinStats, join); return crossJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.RIGHT_OUTER_JOIN) { - Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(rightStats.getRowCount(), innerJoinStats.getRowCount()); - rowCount = Math.max(rowCount, rightStats.getRowCount()); + updateJoinConditionColumnStatistics(crossJoinStats, join); return crossJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.FULL_OUTER_JOIN) { - Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); - return crossJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount() - + rightStats.getRowCount() + innerJoinStats.getRowCount()); + double rowCount = Math.max(leftStats.getRowCount(), innerJoinStats.getRowCount()); + rowCount = Math.max(rightStats.getRowCount(), rowCount); + updateJoinConditionColumnStatistics(crossJoinStats, join); + return 
crossJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.CROSS_JOIN) { - return new StatisticsBuilder() - .setRowCount(leftStats.getRowCount() * rightStats.getRowCount()) - .putColumnStatistics(leftStats.columnStatistics()) - .putColumnStatistics(rightStats.columnStatistics()) - .build(); + updateJoinConditionColumnStatistics(crossJoinStats, join); + return crossJoinStats; } throw new AnalysisException("join type not supported: " + join.getJoinType()); } /** - * L join R on a = b - * after join, a.ndv and b.ndv should be equal to min(a.ndv, b.ndv) + * update join condition columns' ColumnStatistics, based on different join type. */ - private static Statistics updateJoinResultStatsByHashJoinCondition(Statistics innerStats, Join join) { + private static void updateJoinConditionColumnStatistics(Statistics inputStats, Join join) { Map updatedCols = new HashMap<>(); + JoinType joinType = join.getJoinType(); for (Expression expr : join.getEqualPredicates()) { EqualPredicate equalTo = (EqualPredicate) expr; - ColumnStatistic leftColStats = ExpressionEstimation.estimate(equalTo.left(), innerStats); - ColumnStatistic rightColStats = ExpressionEstimation.estimate(equalTo.right(), innerStats); - double minNdv = Math.min(leftColStats.ndv, rightColStats.ndv); - leftColStats = new ColumnStatisticBuilder(leftColStats).setNdv(minNdv).build(); - rightColStats = new ColumnStatisticBuilder(rightColStats).setNdv(minNdv).build(); + ColumnStatistic leftColStats = ExpressionEstimation.estimate(equalTo.left(), inputStats); + ColumnStatistic rightColStats = ExpressionEstimation.estimate(equalTo.right(), inputStats); + double leftNdv = 1.0; + double rightNdv = 1.0; + boolean updateLeft = false; + boolean updateRight = false; Expression eqLeft = equalTo.left(); if (eqLeft instanceof Cast) { eqLeft = eqLeft.child(0); @@ -374,13 +412,47 @@ private static Statistics updateJoinResultStatsByHashJoinCondition(Statistics in if (eqRight instanceof Cast) { eqRight = 
eqRight.child(0); } - updatedCols.put(eqLeft, leftColStats); - updatedCols.put(eqRight, rightColStats); + if (joinType == JoinType.INNER_JOIN) { + leftNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + rightNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + updateLeft = true; + updateRight = true; + } else if (joinType == JoinType.LEFT_OUTER_JOIN) { + leftNdv = leftColStats.ndv; + rightNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + updateLeft = true; + updateRight = true; + } else if (joinType == JoinType.LEFT_SEMI_JOIN + || joinType == JoinType.LEFT_ANTI_JOIN + || joinType == JoinType.NULL_AWARE_LEFT_ANTI_JOIN) { + leftNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + updateLeft = true; + } else if (joinType == JoinType.RIGHT_OUTER_JOIN) { + leftNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + rightNdv = rightColStats.ndv; + } else if (joinType == JoinType.RIGHT_SEMI_JOIN + || joinType == JoinType.RIGHT_ANTI_JOIN) { + rightNdv = Math.min(leftColStats.ndv, rightColStats.ndv); + updateRight = true; + } else if (joinType == JoinType.FULL_OUTER_JOIN || joinType == JoinType.CROSS_JOIN) { + leftNdv = leftColStats.ndv; + rightNdv = rightColStats.ndv; + updateLeft = true; + updateRight = true; + } + + if (updateLeft) { + leftColStats = new ColumnStatisticBuilder(leftColStats).setNdv(leftNdv).build(); + updatedCols.put(eqLeft, leftColStats); + } + if (updateRight) { + rightColStats = new ColumnStatisticBuilder(rightColStats).setNdv(rightNdv).build(); + updatedCols.put(eqRight, rightColStats); + } } updatedCols.entrySet().stream().forEach( - entry -> innerStats.addColumnStats(entry.getKey(), entry.getValue()) + entry -> inputStats.addColumnStats(entry.getKey(), entry.getValue()) ); - return innerStats; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 6bdd930b4326de..3c9baac01bed06 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -49,6 +49,7 @@ import org.apache.doris.nereids.trees.plans.algebra.EmptyRelation; import org.apache.doris.nereids.trees.plans.algebra.Filter; import org.apache.doris.nereids.trees.plans.algebra.Generate; +import org.apache.doris.nereids.trees.plans.algebra.Join; import org.apache.doris.nereids.trees.plans.algebra.Limit; import org.apache.doris.nereids.trees.plans.algebra.OlapScan; import org.apache.doris.nereids.trees.plans.algebra.PartitionTopN; @@ -253,7 +254,7 @@ public static void estimate(GroupExpression groupExpression, CascadesContext con private void estimate() { Plan plan = groupExpression.getPlan(); Statistics newStats = plan.accept(this, null); - newStats.enforceValid(); + newStats.normalizeColumnStatistics(); // We ensure that the rowCount remains unchanged in order to make the cost of each plan comparable. if (groupExpression.getOwnerGroup().getStatistics() == null) { @@ -423,8 +424,7 @@ private Statistics computeOlapScan(OlapScan olapScan) { for (Slot slot : ((Relation) olapScan).getOutput()) { if (derivedStats.findColumnStatistics(slot) == null) { derivedStats.addColumnStats(slot, - new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN) - .setCount(derivedRowCount).build()); + new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN, derivedRowCount).build()); } } return derivedStats; @@ -451,7 +451,7 @@ private Statistics computeOlapScan(OlapScan olapScan) { // get row count from any visible slotReference's colStats for (Slot slot : ((Plan) olapScan).getOutput()) { builder.putColumnStatistics(slot, - new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(tableRowCount).build()); + new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN, tableRowCount).build()); } setHasUnknownColStatsInStatementContext(); return builder.setRowCount(tableRowCount).build(); @@ -480,8 +480,8 @@ private Statistics 
computeOlapScan(OlapScan olapScan) { }); for (SlotReference slot : visibleOutputSlots) { ColumnStatistic cache = getColumnStatsFromPartitionCache(olapScan, slot, selectedPartitionNames); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - colStatsBuilder.setCount(selectedPartitionsRowCount); + ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, + selectedPartitionsRowCount); colStatsBuilder.normalizeAvgSizeByte(slot); builder.putColumnStatistics(slot, colStatsBuilder.build()); } @@ -491,8 +491,7 @@ private Statistics computeOlapScan(OlapScan olapScan) { // if partition row count is invalid (-1), fallback to table stats for (SlotReference slot : visibleOutputSlots) { ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, slot); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - colStatsBuilder.setCount(tableRowCount); + ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, tableRowCount); colStatsBuilder.normalizeAvgSizeByte(slot); builder.putColumnStatistics(slot, colStatsBuilder.build()); } @@ -503,8 +502,7 @@ private Statistics computeOlapScan(OlapScan olapScan) { // get table level stats for (SlotReference slot : visibleOutputSlots) { ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, slot); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - colStatsBuilder.setCount(tableRowCount); + ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, tableRowCount); colStatsBuilder.normalizeAvgSizeByte(slot); builder.putColumnStatistics(slot, colStatsBuilder.build()); } @@ -601,8 +599,9 @@ public Statistics visitLogicalPartitionTopN(LogicalPartitionTopN @Override public Statistics visitLogicalJoin(LogicalJoin join, Void context) { - Statistics joinStats = JoinEstimation.estimate(groupExpression.childStatistics(0), - groupExpression.childStatistics(1), join); + Statistics 
joinStats = computeJoin(join); + // NOTE: physical operator visiting doesn't need the following + // logic which will ONLY be used in no-stats estimation. joinStats = new StatisticsBuilder(joinStats).setWidthInJoinCluster( groupExpression.childStatistics(0).getWidthInJoinCluster() + groupExpression.childStatistics(1).getWidthInJoinCluster()).build(); @@ -746,16 +745,14 @@ public Statistics visitPhysicalDeferMaterializeTopN(PhysicalDeferMaterializeTopN @Override public Statistics visitPhysicalHashJoin( PhysicalHashJoin hashJoin, Void context) { - return JoinEstimation.estimate(groupExpression.childStatistics(0), - groupExpression.childStatistics(1), hashJoin); + return computeJoin(hashJoin); } @Override public Statistics visitPhysicalNestedLoopJoin( PhysicalNestedLoopJoin nestedLoopJoin, Void context) { - return JoinEstimation.estimate(groupExpression.childStatistics(0), - groupExpression.childStatistics(1), nestedLoopJoin); + return computeJoin(nestedLoopJoin); } // TODO: We should subtract those pruned column, and consider the expression transformations in the node. 
@@ -872,7 +869,7 @@ private Statistics computeFilter(Filter filter) { } builder.setRowCount(isNullStats.getRowCount()); stats = builder.build(); - stats.enforceValid(); + stats.normalizeColumnStatistics(); } } } @@ -944,7 +941,7 @@ false, getTotalColumnStatisticMap(), false, newStats = ((Plan) newJoin).accept(statsCalculator, null); } - newStats.enforceValid(); + newStats.normalizeColumnStatistics(); double selectivity = Statistics.getValidSelectivity( newStats.getRowCount() / (leftRowCount * rightRowCount)); @@ -1089,14 +1086,18 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { } else { cache = getColumnStatsFromTableCache(catalogRelation, slot); } - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache); - colStatsBuilder.setCount(tableRowCount); + ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, tableRowCount); builder.putColumnStatistics(slot, colStatsBuilder.build()); } checkIfUnknownStatsUsedAsKey(builder); return builder.setRowCount(tableRowCount).build(); } + private Statistics computeJoin(Join join) { + return JoinEstimation.estimate(groupExpression.childStatistics(0), + groupExpression.childStatistics(1), join); + } + private Statistics computeTopN(TopN topN) { Statistics stats = groupExpression.childStatistics(0); return stats.withRowCountAndEnforceValid(Math.min(stats.getRowCount(), topN.getLimit())); @@ -1200,7 +1201,7 @@ private Statistics computeAggregate(Aggregate aggregate) { slotToColumnStats.put(outputExpression.toSlot(), columnStat); } Statistics aggOutputStats = new Statistics(rowCount, 1, slotToColumnStats); - aggOutputStats.enforceValid(); + aggOutputStats.normalizeColumnStatistics(); return aggOutputStats; } @@ -1214,7 +1215,6 @@ private Statistics computeRepeat(Repeat repeat) { ColumnStatistic stats = kv.getValue(); ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(stats); columnStatisticBuilder - .setCount(stats.count < 0 ? 
stats.count : stats.count * groupingSetNum) .setNumNulls(stats.numNulls < 0 ? stats.numNulls : stats.numNulls * groupingSetNum) .setDataSize(stats.dataSize < 0 ? stats.dataSize : stats.dataSize * groupingSetNum); return Pair.of(kv.getKey(), columnStatisticBuilder.build()); @@ -1349,12 +1349,11 @@ private Statistics computeGenerate(Generate generate) { double count = stats.getRowCount() * generate.getGeneratorOutput().size() * statsFactor; Map columnStatsMap = Maps.newHashMap(); for (Map.Entry entry : stats.columnStatistics().entrySet()) { - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(entry.getValue()).setCount(count).build(); + ColumnStatistic columnStatistic = new ColumnStatisticBuilder(entry.getValue()).build(); columnStatsMap.put(entry.getKey(), columnStatistic); } for (Slot output : generate.getGeneratorOutput()) { ColumnStatistic columnStatistic = new ColumnStatisticBuilder() - .setCount(count) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY) .setNdv(count) @@ -1376,8 +1375,7 @@ private Statistics computeWindow(Window windowOperator) { "need WindowExpression, but we meet " + expr); WindowExpression windExpr = (WindowExpression) expr.child(0); ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(); - colStatsBuilder.setCount(childStats.getRowCount()) - .setOriginal(null); + colStatsBuilder.setOriginal(null); Double partitionCount = windExpr.getPartitionKeys().stream().map(key -> { ColumnStatistic keyStats = childStats.findColumnStatistics(key); @@ -1392,8 +1390,7 @@ private Statistics computeWindow(Window windowOperator) { if (partitionCount == -1.0) { // partition key stats are all unknown - colStatsBuilder.setCount(childStats.getRowCount()) - .setNdv(1) + colStatsBuilder.setNdv(1) .setMinValue(Double.NEGATIVE_INFINITY) .setMaxValue(Double.POSITIVE_INFINITY); } else { @@ -1438,7 +1435,7 @@ private Statistics computeWindow(Window windowOperator) { private ColumnStatistic unionColumn(ColumnStatistic 
leftStats, double leftRowCount, ColumnStatistic rightStats, double rightRowCount, DataType dataType) { if (leftStats.isUnKnown() || rightStats.isUnKnown()) { - return new ColumnStatisticBuilder(leftStats).setCount(leftRowCount + rightRowCount).build(); + return new ColumnStatisticBuilder(leftStats).build(); } ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsMathUtil.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsMathUtil.java index c56437f53bcb7a..49cc466b780ec3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsMathUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsMathUtil.java @@ -58,4 +58,9 @@ public static double divide(double a, double b) { } return a / nonZeroDivisor(b); } + + // TODO: add more protection at other stats estimation + public static double normalizeRowCountOrNdv(double value) { + return value >= 0 && value < 1 ? 
1 : value; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java index 845baa045cc041..26027d1049912f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java @@ -70,7 +70,7 @@ public Statistics computeStats(List slots) { Map columnToStatistics = Maps.newHashMap(); ColumnStatisticBuilder statBuilder = new ColumnStatisticBuilder() - .setCount(rowNum).setAvgSizeByte(8).setNumNulls(0).setDataSize(8); + .setAvgSizeByte(8).setNumNulls(0).setDataSize(8); if (numberTvf.getUseConst()) { // a column of const value long value = numberTvf.getConstValue(); statBuilder = statBuilder.setNdv(1).setMinValue(value).setMaxValue(value) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/JoinType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/JoinType.java index 3423b13168b428..112c1d98a98a1f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/JoinType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/JoinType.java @@ -164,7 +164,7 @@ public final boolean isLeftAntiJoin() { return this == LEFT_ANTI_JOIN; } - public final boolean isLefSemiJoin() { + public final boolean isLeftSemiJoin() { return this == LEFT_SEMI_JOIN; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java index 935716e42bf05f..8e92f83274e2f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/StringType.java @@ -31,11 +31,6 @@ private StringType() { super(-1); } - @Override - public int width() { - return len; - } - 
@Override public Type toCatalogDataType() { return Type.STRING; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java index 446ccc7fd002c2..0c5c115194de3b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/CharacterType.java @@ -26,14 +26,13 @@ */ public abstract class CharacterType extends PrimitiveType { - public static final int DEFAULT_SLOT_SIZE = 20; private static final int WIDTH = 16; + public static final int DEFAULT_WIDTH = WIDTH; protected final int len; // When defining SQL schemas, users often tend to set the length of string // fields much longer than actually needed for storage. - public CharacterType(int len) { this.len = len; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index bb8263994583fd..ed0b2effdb1125 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -137,8 +137,7 @@ public ColumnStatistic toColumnStatistic() { return ColumnStatistic.UNKNOWN; } try { - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); - columnStatisticBuilder.setCount(count); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count); columnStatisticBuilder.setNdv(ndv); columnStatisticBuilder.setNumNulls(nullCount); columnStatisticBuilder.setDataSize(dataSizeInBytes); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index 919d2018ba8f53..c740f774da4cfc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -47,20 +47,18 @@ public class ColumnStatistic { private static final Logger LOG = LogManager.getLogger(ColumnStatistic.class); - public static ColumnStatistic UNKNOWN = new ColumnStatisticBuilder().setAvgSizeByte(1).setNdv(1) - .setNumNulls(1).setCount(1).setMaxValue(Double.POSITIVE_INFINITY).setMinValue(Double.NEGATIVE_INFINITY) + public static ColumnStatistic UNKNOWN = new ColumnStatisticBuilder(1).setAvgSizeByte(1).setNdv(1) + .setNumNulls(1).setMaxValue(Double.POSITIVE_INFINITY).setMinValue(Double.NEGATIVE_INFINITY) .setIsUnknown(true).setUpdatedTime("") .build(); - public static ColumnStatistic ZERO = new ColumnStatisticBuilder().setAvgSizeByte(0).setNdv(0) - .setNumNulls(0).setCount(0).setMaxValue(Double.NaN).setMinValue(Double.NaN) - .build(); - public static final Set UNSUPPORTED_TYPE = Sets.newHashSet( Type.HLL, Type.BITMAP, Type.ARRAY, Type.STRUCT, Type.MAP, Type.QUANTILE_STATE, Type.JSONB, Type.VARIANT, Type.TIME, Type.TIMEV2, Type.LAMBDA_FUNCTION ); + // ATTENTION: Stats deriving WILL NOT use 'count' field any longer. + // Use 'rowCount' field in Statistics if needed. 
@SerializedName("count") public final double count; @SerializedName("ndv") @@ -127,9 +125,8 @@ public static ColumnStatistic fromResultRow(List resultRows) { // TODO: use thrift public static ColumnStatistic fromResultRow(ResultRow row) { - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); double count = Double.parseDouble(row.get(7)); - columnStatisticBuilder.setCount(count); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count); double ndv = Double.parseDouble(row.getWithDefault(8, "0")); columnStatisticBuilder.setNdv(ndv); String nullCount = row.getWithDefault(9, "0"); @@ -195,26 +192,6 @@ public static boolean isAlmostUnique(double ndv, double rowCount) { return rowCount * ALMOST_UNIQUE_FACTOR < ndv; } - public ColumnStatistic updateByLimit(long limit, double rowCount) { - double ratio = 0; - if (rowCount != 0) { - ratio = limit / rowCount; - } - double newNdv = Math.ceil(Math.min(ndv, limit)); - return new ColumnStatisticBuilder() - .setCount(Math.ceil(limit)) - .setNdv(newNdv) - .setAvgSizeByte(Math.ceil(avgSizeByte)) - .setNumNulls(Math.ceil(numNulls * ratio)) - .setDataSize(Math.ceil(dataSize * ratio)) - .setMinValue(minValue) - .setMaxValue(maxValue) - .setMinExpr(minExpr) - .setMaxExpr(maxExpr) - .setIsUnknown(isUnKnown) - .build(); - } - public boolean hasIntersect(ColumnStatistic other) { return Math.max(this.minValue, other.minValue) <= Math.min(this.maxValue, other.maxValue); } @@ -374,7 +351,7 @@ public static ColumnStatistic fromJson(String statJson) throws AnalysisException ); } - public boolean minOrMaxIsInf() { + public boolean isMinMaxInvalid() { return Double.isInfinite(maxValue) || Double.isInfinite(minValue); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index 4c8df0bf67751d..4e190ce388ee0a 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -56,9 +56,24 @@ public ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.updatedTime = columnStatistic.updatedTime; } - public ColumnStatisticBuilder setCount(double count) { + // ATTENTION: DON'T USE FOLLOWING TWO DURING STATS DERIVING EXCEPT FOR INITIALIZATION + public ColumnStatisticBuilder(double count) { this.count = count; - return this; + } + + public ColumnStatisticBuilder(ColumnStatistic columnStatistic, double count) { + this.count = count; + this.ndv = columnStatistic.ndv; + this.avgSizeByte = columnStatistic.avgSizeByte; + this.numNulls = columnStatistic.numNulls; + this.dataSize = columnStatistic.dataSize; + this.minValue = columnStatistic.minValue; + this.maxValue = columnStatistic.maxValue; + this.minExpr = columnStatistic.minExpr; + this.maxExpr = columnStatistic.maxExpr; + this.isUnknown = columnStatistic.isUnKnown; + this.original = columnStatistic.original; + this.updatedTime = columnStatistic.updatedTime; } public ColumnStatisticBuilder setNdv(double ndv) { @@ -184,8 +199,7 @@ public void normalizeAvgSizeByte(SlotReference slot) { // When defining SQL schemas, users often tend to set the length of string \ // fields much longer than actually needed for storage. 
if (slot.getDataType() instanceof CharacterType) { - avgSizeByte = Math.min(avgSizeByte, - CharacterType.DEFAULT_SLOT_SIZE); + avgSizeByte = Math.min(avgSizeByte, CharacterType.DEFAULT_WIDTH); } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapScanStatsDerive.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapScanStatsDerive.java index 7ac4b95d484e04..753167fb442f41 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapScanStatsDerive.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapScanStatsDerive.java @@ -67,9 +67,6 @@ public StatsDeriveResult deriveStats() { Env.getCurrentEnv().getStatisticsCache().getColumnStatistics( table.getDatabase().getCatalog().getId(), table.getDatabase().getId(), table.getId(), -1, colName); - if (!statistic.isUnKnown) { - rowCount = statistic.count; - } columnStatisticMap.put(entry.getKey(), statistic); } return new StatsDeriveResult(rowCount, columnStatisticMap); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatistic.java index eebe910d8b007b..7222dc8825831c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatistic.java @@ -38,16 +38,12 @@ public class PartitionColumnStatistic { private static final Logger LOG = LogManager.getLogger(PartitionColumnStatistic.class); - public static PartitionColumnStatistic UNKNOWN = new PartitionColumnStatisticBuilder().setAvgSizeByte(1) - .setNdv(new Hll128()).setNumNulls(1).setCount(1).setMaxValue(Double.POSITIVE_INFINITY) + public static PartitionColumnStatistic UNKNOWN = new PartitionColumnStatisticBuilder(1).setAvgSizeByte(1) + .setNdv(new Hll128()).setNumNulls(1).setMaxValue(Double.POSITIVE_INFINITY) .setMinValue(Double.NEGATIVE_INFINITY) .setIsUnknown(true).setUpdatedTime("") .build(); 
- public static PartitionColumnStatistic ZERO = new PartitionColumnStatisticBuilder().setAvgSizeByte(0) - .setNdv(new Hll128()).setNumNulls(0).setCount(0).setMaxValue(Double.NaN).setMinValue(Double.NaN) - .build(); - public final double count; public final Hll128 ndv; public final double numNulls; @@ -109,9 +105,8 @@ public static PartitionColumnStatistic fromResultRow(ResultRow row) throws IOExc return PartitionColumnStatistic.UNKNOWN; } - PartitionColumnStatisticBuilder partitionStatisticBuilder = new PartitionColumnStatisticBuilder(); double count = Double.parseDouble(row.get(6)); - partitionStatisticBuilder.setCount(count); + PartitionColumnStatisticBuilder partitionStatisticBuilder = new PartitionColumnStatisticBuilder(count); String ndv = row.get(7); Base64.Decoder decoder = Base64.getDecoder(); DataInputStream dis = new DataInputStream(new ByteArrayInputStream(decoder.decode(ndv))); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java index fe26396f21213a..b1dc7cdd0017d4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/PartitionColumnStatisticBuilder.java @@ -50,9 +50,9 @@ public PartitionColumnStatisticBuilder(PartitionColumnStatistic statistic) { this.updatedTime = statistic.updatedTime; } - public PartitionColumnStatisticBuilder setCount(double count) { + // ATTENTION: DON'T USE FOLLOWING TWO DURING STATS DERIVING EXCEPT FOR INITIALIZATION + public PartitionColumnStatisticBuilder(double count) { this.count = count; - return this; } public PartitionColumnStatisticBuilder setNdv(Hll128 ndv) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java index ca9735b56654b1..b8a7b0a9e2de4f 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java @@ -150,6 +150,10 @@ public double length() { } public StatisticRange intersect(StatisticRange other) { + return intersect(other, false); + } + + public StatisticRange intersect(StatisticRange other, boolean partial) { Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); double newLow = biggerLow.first; LiteralExpr newLowExpr = biggerLow.second; @@ -158,8 +162,8 @@ public StatisticRange intersect(StatisticRange other) { double newHigh = smallerHigh.first; LiteralExpr newHighExpr = smallerHigh.second; if (newLow <= newHigh) { - return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, - overlappingDistinctValues(other), dataType); + double distinctValues = overlappingDistinctValues(other, partial); + return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, distinctValues, dataType); } return empty(dataType); } @@ -178,33 +182,6 @@ public Pair maxPair(double r1, LiteralExpr e1, double r2, L return Pair.of(r2, e2); } - public StatisticRange cover(StatisticRange other) { - StatisticRange resultRange; - Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); - double newLow = biggerLow.first; - LiteralExpr newLowExpr = biggerLow.second; - Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); - double newHigh = smallerHigh.first; - LiteralExpr newHighExpr = smallerHigh.second; - - if (newLow <= newHigh) { - double overlapPercentOfLeft = overlapPercentWith(other); - double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; - double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft); - if (this.isBothInfinite() && other.isOneSideInfinite()) { - resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, - distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR, - dataType); - } 
else { - resultRange = new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, coveredDistinctValues, - dataType); - } - } else { - resultRange = empty(dataType); - } - return resultRange; - } - public StatisticRange union(StatisticRange other) { double overlapPercentThis = this.overlapPercentWith(other); double overlapPercentOther = other.overlapPercentWith(this); @@ -219,12 +196,33 @@ public StatisticRange union(StatisticRange other) { biggerHigh.first, biggerHigh.second, newNDV, dataType); } - private double overlappingDistinctValues(StatisticRange other) { - double overlapPercentOfLeft = overlapPercentWith(other); - double overlapPercentOfRight = other.overlapPercentWith(this); - double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; - double overlapDistinctValuesRight = overlapPercentOfRight * other.distinctValues; - return minExcludeNaN(overlapDistinctValuesLeft, overlapDistinctValuesRight); + private double overlappingDistinctValues(StatisticRange other, boolean partial) { + double overlapDistinctValuesLeft; + if (other.isInfinite() || this.isInfinite()) { + overlapDistinctValuesLeft = distinctValues * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; + } else if (Math.abs(this.low - this.high) < 1e-6) { + overlapDistinctValuesLeft = distinctValues; + } else { + double overlapPercentOfLeft = this.overlapPercentWith(other); + overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; + } + + if (partial) { + return overlapDistinctValuesLeft; + } else { + double overlapDistinctValuesRight; + if (this.isInfinite() || other.isInfinite()) { + overlapDistinctValuesRight = distinctValues + * INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; + } else if (Math.abs(other.low - other.high) < 1e-6) { + // other is constant + overlapDistinctValuesRight = distinctValues; + } else { + double overlapPercentOfRight = other.overlapPercentWith(this); + overlapDistinctValuesRight = overlapPercentOfRight * 
other.distinctValues; + } + return minExcludeNaN(overlapDistinctValuesLeft, overlapDistinctValuesRight); + } } public static double minExcludeNaN(double v1, double v2) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index f4552a2560d7e8..20da97df0205b8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -99,40 +99,54 @@ public Statistics withExpressionToColumnStats(Map e */ public Statistics withRowCountAndEnforceValid(double rowCount) { Statistics statistics = new Statistics(rowCount, widthInJoinCluster, expressionToColumnStats); - statistics.enforceValid(); + statistics.normalizeColumnStatistics(); return statistics; } - public void enforceValid() { + // IMPORTANT: it is suggested to do this action after each estimation critical visiting, + // since statistics will have serious deviation during the partial deriving. + public void normalizeColumnStatistics() { + normalizeColumnStatistics(this.rowCount); + } + + public void normalizeColumnStatistics(double inputRowCount) { + normalizeColumnStatistics(this.rowCount, false); + } + + public void normalizeColumnStatistics(double inputRowCount, boolean isNumNullsDecreaseByProportion) { + double factor = isNumNullsDecreaseByProportion ? rowCount / inputRowCount : 1.0; for (Entry entry : expressionToColumnStats.entrySet()) { ColumnStatistic columnStatistic = entry.getValue(); - if (!checkColumnStatsValid(columnStatistic) && !columnStatistic.isUnKnown()) { - double ndv = Math.min(columnStatistic.ndv, rowCount); + // the following columnStatistic.isUnKnown() judgment is loop inside since current doris + // supports partial stats deriving, i.e, allowing part of tables have stats and other parts don't, + // or part of columns have stats but other parts don't, especially join and filter estimation. 
+ if (!columnStatistic.isUnKnown() && (!checkColumnStatsValid(columnStatistic, rowCount) + || isNumNullsDecreaseByProportion && columnStatistic.numNulls != 0)) { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); + double ndv = Math.min(columnStatistic.ndv, rowCount); + double numNulls = Math.min(columnStatistic.numNulls * factor, rowCount - ndv); + columnStatisticBuilder.setNumNulls(numNulls); columnStatisticBuilder.setNdv(ndv); - columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls, rowCount - ndv)); - columnStatisticBuilder.setCount(rowCount); columnStatistic = columnStatisticBuilder.build(); expressionToColumnStats.put(entry.getKey(), columnStatistic); } } } - public boolean checkColumnStatsValid(ColumnStatistic columnStatistic) { - return columnStatistic.ndv <= rowCount - && columnStatistic.numNulls <= rowCount - columnStatistic.ndv; + public boolean checkColumnStatsValid(ColumnStatistic columnStatistic, double rowCount) { + return columnStatistic.ndv <= rowCount && columnStatistic.numNulls <= rowCount - columnStatistic.ndv; } public Statistics withSel(double sel) { return withSel(sel, 0); } - public Statistics withSel(double sel, double numNull) { - sel = StatsMathUtil.minNonNaN(sel, 1); + public Statistics withSel(double notNullSel, double numNull) { + notNullSel = StatsMathUtil.minNonNaN(notNullSel, 1); if (Double.isNaN(rowCount)) { return this; } - double newCount = rowCount * sel + numNull; + double newCount = rowCount * notNullSel + numNull; return new Statistics(newCount, widthInJoinCluster, new HashMap<>(expressionToColumnStats)); } @@ -153,7 +167,7 @@ public double computeTupleSize(List slots) { for (Slot slot : slots) { ColumnStatistic s = expressionToColumnStats.get(slot); if (s != null) { - tempSize += Math.max(1, Math.min(CharacterType.DEFAULT_SLOT_SIZE, s.avgSizeByte)); + tempSize += Math.max(1, Math.min(CharacterType.DEFAULT_WIDTH, s.avgSizeByte)); } } tupleSize = Math.max(1, tempSize); 
@@ -229,16 +243,8 @@ public int getBENumber() { return 1; } - public static Statistics zero(Statistics statistics) { - Statistics zero = new Statistics(0, new HashMap<>()); - for (Map.Entry entry : statistics.expressionToColumnStats.entrySet()) { - zero.addColumnStats(entry.getKey(), ColumnStatistic.ZERO); - } - return zero; - } - - public static double getValidSelectivity(double nullSel) { - return nullSel < 0 ? 0 : (nullSel > 1 ? 1 : nullSel); + public static double getValidSelectivity(double selectivity) { + return selectivity < 0 ? 0 : (selectivity > 1 ? 1 : selectivity); } /** @@ -273,24 +279,6 @@ public int getWidthInJoinCluster() { return widthInJoinCluster; } - public Statistics normalizeByRatio(double originRowCount) { - if (rowCount >= originRowCount || rowCount <= 0) { - return this; - } - StatisticsBuilder builder = new StatisticsBuilder(this); - double ratio = rowCount / originRowCount; - for (Entry entry : expressionToColumnStats.entrySet()) { - ColumnStatistic colStats = entry.getValue(); - if (colStats.numNulls != 0 || colStats.ndv > rowCount) { - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(colStats); - colStatsBuilder.setNumNulls(colStats.numNulls * ratio); - colStatsBuilder.setNdv(Math.min(rowCount - colStatsBuilder.getNumNulls(), colStats.ndv)); - builder.putColumnStatistics(entry.getKey(), colStatsBuilder.build()); - } - } - return builder.build(); - } - public double getDeltaRowCount() { return deltaRowCount; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index 0d9fa3674628c7..ba23ab84dc7a32 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -326,10 +326,9 @@ public static void alterColumnStatistics(AlterColumnStatsStmt alterColumnStatsSt if (rowCount == null) { throw 
new RuntimeException("Row count is null."); } - ColumnStatisticBuilder builder = new ColumnStatisticBuilder(); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(Double.parseDouble(rowCount)); String colName = alterColumnStatsStmt.getColumnName(); Column column = objects.table.getColumn(colName); - builder.setCount(Double.parseDouble(rowCount)); if (ndv != null) { double dNdv = Double.parseDouble(ndv); builder.setNdv(dNdv); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java index 8c301f911be95b..977518d47ed549 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java @@ -20,8 +20,6 @@ import org.apache.doris.common.Id; import org.apache.doris.nereids.trees.expressions.Slot; -import com.google.common.base.Preconditions; - import java.util.HashMap; import java.util.List; import java.util.Map; @@ -125,25 +123,6 @@ public StatsDeriveResult withSelectivity(double selectivity) { return statsDeriveResult; } - public StatsDeriveResult updateByLimit(long limit) { - Preconditions.checkArgument(limit >= 0); - limit = Math.min(limit, (long) rowCount); - StatsDeriveResult statsDeriveResult = new StatsDeriveResult(limit, width, penalty); - for (Entry entry : slotIdToColumnStats.entrySet()) { - statsDeriveResult.addColumnStats(entry.getKey(), entry.getValue().updateByLimit(limit, rowCount)); - } - // When the table is first created, rowCount is empty. - // This leads to NPE if there is SetOperation outside the limit. - // Therefore, when rowCount is empty, slotIdToColumnStats is also imported, - // but the possible problem is that the first query statistics are not derived accurately. 
- if (statsDeriveResult.slotIdToColumnStats.isEmpty()) { - for (Entry entry : slotIdToColumnStats.entrySet()) { - statsDeriveResult.addColumnStats(entry.getKey(), entry.getValue()); - } - } - return statsDeriveResult; - } - public StatsDeriveResult copy() { return new StatsDeriveResult(this); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 66f6bda6819a9e..d51281eb0e667c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -84,7 +84,6 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.text.StringSubstitutor; -import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.TableScan; @@ -667,20 +666,25 @@ public static long getTotalSizeFromHMS(HMSExternalTable table) { */ public static Optional getIcebergColumnStats(String colName, org.apache.iceberg.Table table) { TableScan tableScan = table.newScan().includeColumnStats(); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); - columnStatisticBuilder.setCount(0); - columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); - columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); - columnStatisticBuilder.setDataSize(0); - columnStatisticBuilder.setAvgSizeByte(0); - columnStatisticBuilder.setNumNulls(0); + double totalDataSize = 0; + double totalDataCount = 0; + double totalNumNull = 0; try (CloseableIterable fileScanTasks = tableScan.planFiles()) { for (FileScanTask task : fileScanTasks) { - processDataFile(task.file(), task.spec(), colName, columnStatisticBuilder); + int colId = getColId(task.spec(), colName); + totalDataSize += task.file().columnSizes().get(colId); + 
totalDataCount += task.file().recordCount(); + totalNumNull += task.file().nullValueCounts().get(colId); } } catch (IOException e) { LOG.warn("Error to close FileScanTask.", e); } + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(totalDataCount); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); + columnStatisticBuilder.setDataSize(totalDataSize); + columnStatisticBuilder.setAvgSizeByte(0); + columnStatisticBuilder.setNumNulls(totalNumNull); if (columnStatisticBuilder.getCount() > 0) { columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize() / columnStatisticBuilder.getCount()); @@ -688,8 +692,7 @@ public static Optional getIcebergColumnStats(String colName, or return Optional.of(columnStatisticBuilder.build()); } - private static void processDataFile(DataFile dataFile, PartitionSpec partitionSpec, - String colName, ColumnStatisticBuilder columnStatisticBuilder) { + private static int getColId(PartitionSpec partitionSpec, String colName) { int colId = -1; for (Types.NestedField column : partitionSpec.schema().columns()) { if (column.name().equals(colName)) { @@ -700,12 +703,7 @@ private static void processDataFile(DataFile dataFile, PartitionSpec partitionSp if (colId == -1) { throw new RuntimeException(String.format("Column %s not exist.", colName)); } - // Update the data size, count and num of nulls in columnStatisticBuilder. - // TODO: Get min max value. 
- columnStatisticBuilder.setDataSize(columnStatisticBuilder.getDataSize() + dataFile.columnSizes().get(colId)); - columnStatisticBuilder.setCount(columnStatisticBuilder.getCount() + dataFile.recordCount()); - columnStatisticBuilder.setNumNulls(columnStatisticBuilder.getNumNulls() - + dataFile.nullValueCounts().get(colId)); + return colId; } public static boolean isUnsupportedType(Type type) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java index 5f91e2b70d8ab0..91da5192b48c02 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/ExpressionEstimationTest.java @@ -31,7 +31,14 @@ import org.apache.doris.nereids.trees.expressions.functions.agg.Max; import org.apache.doris.nereids.trees.expressions.functions.agg.Min; import org.apache.doris.nereids.trees.expressions.functions.scalar.If; +import org.apache.doris.nereids.trees.expressions.literal.BigIntLiteral; import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DateTimeLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DateV2Literal; +import org.apache.doris.nereids.trees.expressions.literal.DecimalLiteral; +import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; +import org.apache.doris.nereids.trees.expressions.literal.NullLiteral; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; import org.apache.doris.nereids.types.DateType; import org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; @@ -44,6 +51,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.math.BigDecimal; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -357,6 +365,7 
@@ public void testCaseWhen() { CaseWhen caseWhen = new CaseWhen(whens); ColumnStatistic est = ExpressionEstimation.estimate(caseWhen, stats); Assertions.assertEquals(est.ndv, 100); + Assertions.assertEquals(est.avgSizeByte, 16); } @Test @@ -383,5 +392,59 @@ public void testIf() { If ifClause = new If(BooleanLiteral.TRUE, a, b); ColumnStatistic est = ExpressionEstimation.estimate(ifClause, stats); Assertions.assertEquals(est.ndv, 100); + Assertions.assertEquals(est.avgSizeByte, 16); + } + + @Test + public void testLiteral() { + Statistics stats = new Statistics(1000, new HashMap<>()); + + BigIntLiteral l1 = new BigIntLiteral(1000000); + ColumnStatistic est = ExpressionEstimation.estimate(l1, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 8); + Assertions.assertEquals(est.numNulls, 0); + + VarcharLiteral l2 = new VarcharLiteral("abcdefghij"); + est = ExpressionEstimation.estimate(l2, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 10); + Assertions.assertEquals(est.numNulls, 0); + + DoubleLiteral l3 = new DoubleLiteral(0.01); + est = ExpressionEstimation.estimate(l3, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 8); + Assertions.assertEquals(est.numNulls, 0); + + DateV2Literal l4 = new DateV2Literal("2024-09-10"); + est = ExpressionEstimation.estimate(l4, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 4); + Assertions.assertEquals(est.numNulls, 0); + + DateTimeLiteral l5 = new DateTimeLiteral("2024-09-10 00:00:00"); + est = ExpressionEstimation.estimate(l5, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 16); + Assertions.assertEquals(est.numNulls, 0); + + BooleanLiteral l6 = BooleanLiteral.TRUE; + est = ExpressionEstimation.estimate(l6, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 1); + 
Assertions.assertEquals(est.numNulls, 0); + + DecimalLiteral l7 = new DecimalLiteral(BigDecimal.valueOf(2024.0928)); + est = ExpressionEstimation.estimate(l7, stats); + Assertions.assertEquals(est.ndv, 1); + Assertions.assertEquals(est.avgSizeByte, 16); + Assertions.assertEquals(est.numNulls, 0); + + NullLiteral l8 = new NullLiteral(); + est = ExpressionEstimation.estimate(l8, stats); + Assertions.assertEquals(est.ndv, 0); + Assertions.assertEquals(est.avgSizeByte, 1); + Assertions.assertEquals(est.numNulls, 1); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 6e76c3f6a33d1c..4461cc375a151a 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -70,10 +70,10 @@ public void testOrNaN() { LessThan lessThan = new LessThan(b, int100); Or or = new Or(greaterThan1, lessThan); Map columnStat = new HashMap<>(); - ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4) + ColumnStatistic aStats = new ColumnStatisticBuilder(500).setNdv(500).setAvgSizeByte(4) .setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); - ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500).setAvgSizeByte(4) + ColumnStatistic bStats = new ColumnStatisticBuilder(500).setNdv(500).setAvgSizeByte(4) .setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build(); columnStat.put(a, aStats); @@ -99,10 +99,10 @@ public void testAndNaN() { LessThan lessThan = new LessThan(b, int100); And and = new And(greaterThan1, lessThan); Map columnStat = new HashMap<>(); - ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) + ColumnStatistic aStats = new 
ColumnStatisticBuilder(500).setNdv(500) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); - ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) + ColumnStatistic bStats = new ColumnStatisticBuilder(500).setNdv(500) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).setIsUnknown(true).build(); columnStat.put(a, aStats); @@ -165,7 +165,7 @@ public void testRelatedAnd() { LessThan le = new LessThan(a, int200); And and = new And(ge, le); Map slotToColumnStat = new HashMap<>(); - ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(300).setNdv(30) + ColumnStatistic aStats = new ColumnStatisticBuilder(300).setNdv(30) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(300).build(); slotToColumnStat.put(a, aStats); @@ -184,8 +184,7 @@ public void knownEqualToUnknown() { SlotReference ym = new SlotReference("a", new VarcharType(7)); double rowCount = 404962.0; double ndv = 14.0; - ColumnStatistic ymStats = new ColumnStatisticBuilder() - .setCount(rowCount) + ColumnStatistic ymStats = new ColumnStatisticBuilder(rowCount) .setNdv(ndv) .setMinExpr(new StringLiteral("2023-07")) .setMinValue(14126741000630328.000000) @@ -211,8 +210,7 @@ public void knownEqualToUnknownWithLittleNdv() { SlotReference ym = new SlotReference("a", new VarcharType(7)); double rowCount = 404962.0; double ndv = 0.5; - ColumnStatistic ymStats = new ColumnStatisticBuilder() - .setCount(rowCount) + ColumnStatistic ymStats = new ColumnStatisticBuilder(rowCount) .setNdv(ndv) .setMinExpr(new StringLiteral("2023-07")) .setMinValue(14126741000630328.000000) @@ -267,13 +265,13 @@ public void test1() { And and = new And(greaterThan1, lessThan); Or or = new Or(and, equalTo); Map slotToColumnStat = new HashMap<>(); - ColumnStatistic aStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) + ColumnStatistic aStats = new 
ColumnStatisticBuilder(500).setNdv(500) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); - ColumnStatistic bStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) + ColumnStatistic bStats = new ColumnStatisticBuilder(500).setNdv(500) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); - ColumnStatistic cStats = new ColumnStatisticBuilder().setCount(500).setNdv(500) + ColumnStatistic cStats = new ColumnStatisticBuilder(500).setNdv(500) .setAvgSizeByte(4).setNumNulls(0).setDataSize(0) .setMinValue(0).setMaxValue(1000).setMinExpr(null).build(); slotToColumnStat.put(a, aStats); @@ -312,7 +310,7 @@ public void test2() { FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(or, stat); Assertions.assertTrue( - Precision.equals(512.5, + Precision.equals(506.25, expected.getRowCount(), 0.01)); } @@ -334,7 +332,7 @@ public void test3() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(ge, stat); - Assertions.assertEquals(1000 * 1.0 / 500, expected.getRowCount()); + Assertions.assertEquals(1000 * (500.0 / 1000) * (1.0 / 500), expected.getRowCount()); } // a <= 500 @@ -355,7 +353,7 @@ public void test4() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(le, stat); - Assertions.assertEquals(1000 * 1.0 / 500, expected.getRowCount()); + Assertions.assertEquals(1000 * (500.0 / 1000) * (1.0 / 500), expected.getRowCount()); } // a < 500 @@ -376,7 +374,7 @@ public void test5() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(less, stat); - 
Assertions.assertEquals(2, expected.getRowCount()); + Assertions.assertEquals(1, expected.getRowCount()); } // a > 1000 @@ -397,7 +395,7 @@ public void test6() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(ge, stat); - Assertions.assertEquals(2, expected.getRowCount()); + Assertions.assertEquals(1, expected.getRowCount()); } // a > b @@ -659,27 +657,24 @@ public void testFilterOutofMinMax() { IntegerLiteral i300 = new IntegerLiteral(300); GreaterThan ge = new GreaterThan(c, i300); Map slotToColumnStat = new HashMap<>(); - ColumnStatisticBuilder builderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder builderA = new ColumnStatisticBuilder(1000) .setNdv(1000) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1000) - .setMaxValue(10000) - .setCount(1000); - ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() + .setMaxValue(10000); + ColumnStatisticBuilder builderB = new ColumnStatisticBuilder(1000) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setCount(1000); - ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() + .setMaxValue(500); + ColumnStatisticBuilder builderC = new ColumnStatisticBuilder(1000) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(200) - .setCount(1000); + .setMaxValue(200); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -799,22 +794,19 @@ public void testInPredicateEstimationForColumnsOutofRange() { IntegerLiteral i200 = new IntegerLiteral(200); Map slotToColumnStat = new HashMap<>(); - ColumnStatisticBuilder builderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder builderA = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setCount(100); - ColumnStatisticBuilder builderB 
= new ColumnStatisticBuilder() - .setCount(100) + .setMaxValue(100); + ColumnStatisticBuilder builderB = new ColumnStatisticBuilder(100) .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(500); - ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() - .setCount(100) + ColumnStatisticBuilder builderC = new ColumnStatisticBuilder(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) @@ -868,22 +860,19 @@ public void testFilterEstimationForColumnsNotChanged() { IntegerLiteral i10 = new IntegerLiteral(10); Map slotToColumnStat = new HashMap<>(); - ColumnStatisticBuilder builderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder builderA = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setCount(100); - ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() - .setCount(100) + .setMaxValue(100); + ColumnStatisticBuilder builderB = new ColumnStatisticBuilder(100) .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(500); - ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() - .setCount(100) + ColumnStatisticBuilder builderC = new ColumnStatisticBuilder(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) @@ -914,15 +903,14 @@ public void testFilterEstimationForColumnsNotChanged() { @Test public void testBetweenCastFilter() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMaxExpr(new IntLiteral(100)) .setMaxValue(100) .setMinExpr(new IntLiteral(0)) - .setMinValue(0) - .setCount(100); + .setMinValue(0); DoubleLiteral begin = new DoubleLiteral(40.0); DoubleLiteral end = new DoubleLiteral(50.0); LessThan less = new LessThan(new Cast(a, DoubleType.INSTANCE), end); @@ -943,13 +931,12 @@ public void testDateRangeSelectivity() { 
DateLiteral from = new DateLiteral("1990-01-01"); DateLiteral to = new DateLiteral("2000-01-01"); SlotReference a = new SlotReference("a", DateType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMaxValue(to.getDouble()) - .setMinValue(from.getDouble()) - .setCount(100); + .setMinValue(from.getDouble()); DateLiteral mid = new DateLiteral("1999-01-01"); GreaterThan greaterThan = new GreaterThan(a, mid); Statistics stats = new Statistics(100, new HashMap<>()); @@ -962,13 +949,12 @@ public void testDateRangeSelectivity() { @Test public void testIsNull() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(10) .setMaxValue(100) - .setMinValue(0) - .setCount(100); + .setMinValue(0); IsNull isNull = new IsNull(a); Statistics stats = new Statistics(100, new HashMap<>()); stats.addColumnStats(a, builder.build()); @@ -980,13 +966,12 @@ public void testIsNull() { @Test public void testIsNotNull() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(4) .setNumNulls(10) .setMaxValue(100) - .setMinValue(0) - .setCount(100); + .setMinValue(0); IsNull isNull = new IsNull(a); Not not = new Not(isNull); Statistics stats = new Statistics(100, new HashMap<>()); @@ -1002,13 +987,12 @@ public void testIsNotNull() { @Test public void testNumNullsEqualTo() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) 
.setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); EqualTo equalTo = new EqualTo(a, int1); Statistics stats = new Statistics(10, new HashMap<>()); @@ -1024,13 +1008,12 @@ public void testNumNullsEqualTo() { @Test public void testNumNullsComparable() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); GreaterThan greaterThan = new GreaterThan(a, int1); Statistics stats = new Statistics(10, new HashMap<>()); @@ -1046,13 +1029,12 @@ public void testNumNullsComparable() { @Test public void testNumNullsIn() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); IntegerLiteral int2 = new IntegerLiteral(2); InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2)); @@ -1060,7 +1042,7 @@ public void testNumNullsIn() { stats.addColumnStats(a, builder.build()); FilterEstimation filterEstimation = new FilterEstimation(); Statistics result = filterEstimation.estimate(in, stats); - Assertions.assertEquals(result.getRowCount(), 10.0, 0.01); + Assertions.assertEquals(result.getRowCount(), 2.0, 0.01); } /** @@ -1069,13 +1051,12 @@ public void testNumNullsIn() { @Test public void testNumNullsNotEqualTo() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) 
.setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); EqualTo equalTo = new EqualTo(a, int1); Not not = new Not(equalTo); @@ -1092,13 +1073,12 @@ public void testNumNullsNotEqualTo() { @Test public void testNumNullsNotIn() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); IntegerLiteral int2 = new IntegerLiteral(2); InPredicate in = new InPredicate(a, Lists.newArrayList(int1, int2)); @@ -1116,13 +1096,12 @@ public void testNumNullsNotIn() { @Test public void testNumNullsAnd() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); IntegerLiteral int2 = new IntegerLiteral(2); GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1); @@ -1141,23 +1120,21 @@ public void testNumNullsAnd() { @Test public void testNumNullsAndTwoCol() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder builderA = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(0) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); EqualTo equalTo = new EqualTo(a, int1); SlotReference b = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() + 
ColumnStatisticBuilder builderB = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); Not isNotNull = new Not(new IsNull(b)); And and = new And(equalTo, isNotNull); Statistics stats = new Statistics(10, new HashMap<>()); @@ -1165,7 +1142,9 @@ public void testNumNullsAndTwoCol() { stats.addColumnStats(b, builderB.build()); FilterEstimation filterEstimation = new FilterEstimation(); Statistics result = filterEstimation.estimate(and, stats); - Assertions.assertEquals(result.getRowCount(), 1.0, 0.01); + // the expected row count changes from 1.0 to 3.5 because of the normalization calculation + // "Math.min(columnStatistic.numNulls * factor, rowCount - ndv);" + Assertions.assertEquals(result.getRowCount(), 3.5, 0.01); } /** @@ -1174,13 +1153,12 @@ public void testNumNullsAndTwoCol() { @Test public void testNumNullsOr() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); IntegerLiteral int2 = new IntegerLiteral(2); GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int2); @@ -1199,13 +1177,12 @@ public void testNumNullsOr() { @Test public void testNumNullsOrIsNull() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); IntegerLiteral int1 = new IntegerLiteral(1); GreaterThanEqual greaterThanEqual = new GreaterThanEqual(a, int1); IsNull isNull = new IsNull(a); @@ -1219,23 +1196,22 @@ public void testNumNullsOrIsNull() { @Test
public void testNullSafeEqual() { - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(10) .setNdv(2) .setAvgSizeByte(4) .setNumNulls(8) .setMaxValue(2) - .setMinValue(1) - .setCount(10); + .setMinValue(1); ColumnStatistic aStats = columnStatisticBuilder.build(); SlotReference a = new SlotReference("a", IntegerType.INSTANCE); - columnStatisticBuilder.setNdv(2) + ColumnStatisticBuilder columnStatisticBuilder2 = new ColumnStatisticBuilder(10) + .setNdv(2) .setAvgSizeByte(4) .setNumNulls(7) .setMaxValue(2) - .setMinValue(1) - .setCount(10); - ColumnStatistic bStats = columnStatisticBuilder.build(); + .setMinValue(1); + ColumnStatistic bStats = columnStatisticBuilder2.build(); SlotReference b = new SlotReference("b", IntegerType.INSTANCE); StatisticsBuilder statsBuilder = new StatisticsBuilder(); @@ -1258,15 +1234,14 @@ public void testNullSafeEqual() { @Test public void testStringRangeColToLiteral() { SlotReference a = new SlotReference("a", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(25) .setNumNulls(0) .setMaxExpr(new StringLiteral("200")) .setMaxValue(new VarcharLiteral("200").getDouble()) .setMinExpr(new StringLiteral("100")) - .setMinValue(new VarcharLiteral("100").getDouble()) - .setCount(100); + .setMinValue(new VarcharLiteral("100").getDouble()); StatisticsBuilder statsBuilder = new StatisticsBuilder(); statsBuilder.setRowCount(100); statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); @@ -1287,15 +1262,14 @@ public void testStringRangeColToLiteral() { @Test public void testStringRangeColToDateLiteral() { SlotReference a = new SlotReference("a", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder() + ColumnStatisticBuilder 
columnStatisticBuilder = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(25) .setNumNulls(0) .setMaxExpr(new StringLiteral("2022-01-01")) .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) .setMinExpr(new StringLiteral("2020-01-01")) - .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) - .setCount(100); + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()); StatisticsBuilder statsBuilder = new StatisticsBuilder(); statsBuilder.setRowCount(100); statsBuilder.putColumnStatistics(a, columnStatisticBuilder.build()); @@ -1316,37 +1290,34 @@ public void testStringRangeColToDateLiteral() { @Test public void testStringRangeColToCol() { SlotReference a = new SlotReference("a", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(25) .setNumNulls(0) .setMaxExpr(new StringLiteral("2022-01-01")) .setMaxValue(new VarcharLiteral("2022-01-01").getDouble()) .setMinExpr(new StringLiteral("2020-01-01")) - .setMinValue(new VarcharLiteral("2020-01-01").getDouble()) - .setCount(100); + .setMinValue(new VarcharLiteral("2020-01-01").getDouble()); SlotReference b = new SlotReference("b", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(25) .setNumNulls(0) .setMaxExpr(new StringLiteral("2012-01-01")) .setMaxValue(new VarcharLiteral("2012-01-01").getDouble()) .setMinExpr(new StringLiteral("2010-01-01")) - .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) - .setCount(100); + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()); SlotReference c = new SlotReference("c", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilderC = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilderC = new 
ColumnStatisticBuilder(100) .setNdv(100) .setAvgSizeByte(25) .setNumNulls(0) .setMaxExpr(new StringLiteral("2021-01-01")) .setMaxValue(new VarcharLiteral("2021-01-01").getDouble()) .setMinExpr(new StringLiteral("2010-01-01")) - .setMinValue(new VarcharLiteral("2010-01-01").getDouble()) - .setCount(100); + .setMinValue(new VarcharLiteral("2010-01-01").getDouble()); StatisticsBuilder statsBuilder = new StatisticsBuilder(); statsBuilder.setRowCount(100); @@ -1356,25 +1327,77 @@ public void testStringRangeColToCol() { Statistics baseStats = statsBuilder.build(); // (2020-2022) > (2010,2012), sel=1 + // string type, use conservative way to do estimation: sel = DEFAULT (0.5) Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, b), baseStats); - Assertions.assertEquals(100, agrtb.getRowCount()); + Assertions.assertEquals(50, agrtb.getRowCount()); // (2020-2022) < (2010,2012), sel=0 + // string type, use conservative way to do estimation: sel = DEFAULT (0.5) Statistics alessb = new FilterEstimation().estimate(new LessThan(a, b), baseStats); - Assertions.assertEquals(0, alessb.getRowCount()); + Assertions.assertEquals(50, alessb.getRowCount()); // (2020-2022) > (2010-2021), sel = DEFAULT (0.5) Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); Assertions.assertEquals(50, agrtc.getRowCount()); } + @Test + public void testStringRangeColToColDateType() { + SlotReference a = new SlotReference("a", DateType.INSTANCE); + ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder(100) + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2022-01-01")) + .setMaxValue(new DateLiteral("2022-01-01").getDouble()) + .setMinExpr(new StringLiteral("2020-01-01")) + .setMinValue(new DateLiteral("2020-01-01").getDouble()); + + SlotReference b = new SlotReference("b", DateType.INSTANCE); + ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder(100) + .setNdv(100) + 
.setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2012-01-01")) + .setMaxValue(new DateLiteral("2012-01-01").getDouble()) + .setMinExpr(new StringLiteral("2010-01-01")) + .setMinValue(new DateLiteral("2010-01-01").getDouble()); + + SlotReference c = new SlotReference("c", DateType.INSTANCE); + ColumnStatisticBuilder columnStatisticBuilderC = new ColumnStatisticBuilder(100) + .setNdv(100) + .setAvgSizeByte(25) + .setNumNulls(0) + .setMaxExpr(new StringLiteral("2021-01-01")) + .setMaxValue(new DateLiteral("2021-01-01").getDouble()) + .setMinExpr(new StringLiteral("2010-01-01")) + .setMinValue(new DateLiteral("2010-01-01").getDouble()); + + StatisticsBuilder statsBuilder = new StatisticsBuilder(); + statsBuilder.setRowCount(100); + statsBuilder.putColumnStatistics(a, columnStatisticBuilderA.build()); + statsBuilder.putColumnStatistics(b, columnStatisticBuilderB.build()); + statsBuilder.putColumnStatistics(c, columnStatisticBuilderC.build()); + Statistics baseStats = statsBuilder.build(); + + // (2020-2022) > (2010,2012), sel=1 + Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, b), baseStats); + Assertions.assertEquals(100, agrtb.getRowCount()); + // (2020-2022) < (2010,2012), sel=0 + Statistics alessb = new FilterEstimation().estimate(new LessThan(a, b), baseStats); + Assertions.assertEquals(0, alessb.getRowCount()); + + // (2020-2022) > (2010-2021), sel = 97.72 + Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); + Assertions.assertTrue(Precision.equals(97.72, agrtc.getRowCount(), 0.01)); + } + @Test public void testLargeRange() { SlotReference a = new SlotReference("a", IntegerType.INSTANCE); long tenB = 1000000000; long row = 1600000000; - ColumnStatistic colStats = new ColumnStatisticBuilder() + ColumnStatistic colStats = new ColumnStatisticBuilder(row) .setAvgSizeByte(10) - .setCount(row) .setNdv(10000) .setMinExpr(new IntLiteral(0)) .setMinValue(0) @@ -1399,18 +1422,16 @@ public void 
testLargeRange() { void testAndWithInfinity() { Double row = 1000.0; SlotReference a = new SlotReference("a", new VarcharType(25)); - ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilderA = new ColumnStatisticBuilder(row) .setNdv(10) .setAvgSizeByte(4) - .setNumNulls(0) - .setCount(row); + .setNumNulls(0); SlotReference b = new SlotReference("b", IntegerType.INSTANCE); - ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder() + ColumnStatisticBuilder columnStatisticBuilderB = new ColumnStatisticBuilder(row) .setNdv(488) .setAvgSizeByte(25) - .setNumNulls(0) - .setCount(row); + .setNumNulls(0); StatisticsBuilder statsBuilder = new StatisticsBuilder(); statsBuilder.setRowCount(row); statsBuilder.putColumnStatistics(a, columnStatisticBuilderA.build()); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java index 168650c6351239..8e37234a0c6567 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/JoinEstimateTest.java @@ -55,15 +55,13 @@ public void testInnerJoinStats() { EqualTo eq = new EqualTo(a, b); Statistics leftStats = new StatisticsBuilder().setRowCount(100).build(); leftStats.addColumnStats(a, - new ColumnStatisticBuilder() - .setCount(100) + new ColumnStatisticBuilder(100) .setNdv(10) .build() ); Statistics rightStats = new StatisticsBuilder().setRowCount(80).build(); rightStats.addColumnStats(b, - new ColumnStatisticBuilder() - .setCount(80) + new ColumnStatisticBuilder(80) .setNdv(5) .build() ); @@ -101,20 +99,17 @@ public void testOuterJoinStats() { EqualTo eq = new EqualTo(a, b); Statistics leftStats = new StatisticsBuilder().setRowCount(100).build(); leftStats.addColumnStats(a, - new ColumnStatisticBuilder() - .setCount(100) + new 
ColumnStatisticBuilder(100) .setNdv(10) .build() ); Statistics rightStats = new StatisticsBuilder().setRowCount(80).build(); rightStats.addColumnStats(b, - new ColumnStatisticBuilder() - .setCount(80) + new ColumnStatisticBuilder(80) .setNdv(0) .build() ).addColumnStats(c, - new ColumnStatisticBuilder() - .setCount(80) + new ColumnStatisticBuilder(80) .setNdv(20) .build() ); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java index 84c162ac9cfbe8..cf91eacb51ca75 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/StatsCalculatorTest.java @@ -145,13 +145,13 @@ public void testFilter() { GroupExpression groupExpression = new GroupExpression(logicalFilter, ImmutableList.of(childGroup)); Group ownerGroup = new Group(null, groupExpression, null); StatsCalculator.estimate(groupExpression, null); - Assertions.assertEquals((10000 * 0.1 * 0.05), ownerGroup.getStatistics().getRowCount(), 0.001); + Assertions.assertEquals(49.945, ownerGroup.getStatistics().getRowCount(), 0.001); LogicalFilter logicalFilterOr = new LogicalFilter<>(or, groupPlan); GroupExpression groupExpressionOr = new GroupExpression(logicalFilterOr, ImmutableList.of(childGroup)); Group ownerGroupOr = new Group(null, groupExpressionOr, null); StatsCalculator.estimate(groupExpressionOr, null); - Assertions.assertEquals((long) (10000 * (0.1 + 0.05 - 0.1 * 0.05)), + Assertions.assertEquals(1448.555, ownerGroupOr.getStatistics().getRowCount(), 0.001); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java deleted file mode 100644 index c3f04bccfc8b28..00000000000000 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java +++ /dev/null @@ 
-1,58 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.common.Id; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import java.util.Date; - -public class StatsDeriveResultTest { - @Test - public void testUpdateRowCountByLimit() { - StatsDeriveResult stats = new StatsDeriveResult(100); - ColumnStatistic a = new ColumnStatistic(100, 10, null, 1, 5, 10, - 1, 100, null, null, false, - new Date().toString()); - Id id = new Id(1); - stats.addColumnStats(id, a); - StatsDeriveResult res = stats.updateByLimit(0); - Assertions.assertEquals(0, res.getRowCount()); - Assertions.assertEquals(1, res.getSlotIdToColumnStats().size()); - ColumnStatistic resColStats = res.getColumnStatsBySlotId(id); - Assertions.assertEquals(0, resColStats.ndv); - Assertions.assertEquals(1, resColStats.avgSizeByte); - Assertions.assertEquals(0, resColStats.numNulls); - Assertions.assertEquals(1, resColStats.dataSize); - Assertions.assertEquals(1, resColStats.minValue); - Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(false, resColStats.isUnKnown); - - res = stats.updateByLimit(1); - resColStats = 
res.getColumnStatsBySlotId(id); - Assertions.assertEquals(1, resColStats.ndv); - Assertions.assertEquals(1, resColStats.avgSizeByte); - Assertions.assertEquals(1, resColStats.numNulls); - Assertions.assertEquals(1, resColStats.dataSize); - Assertions.assertEquals(1, resColStats.minValue); - Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(false, resColStats.isUnKnown); - } -} diff --git a/regression-test/data/nereids_hint_tpcds_p0/shape/query74.out b/regression-test/data/nereids_hint_tpcds_p0/shape/query74.out index e6f3e10d22fbc4..8b171914ebd371 100644 --- a/regression-test/data/nereids_hint_tpcds_p0/shape/query74.out +++ b/regression-test/data/nereids_hint_tpcds_p0/shape/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 
customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.00)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.00)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query48.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query48.out index c20cd07f178f50..8efddd9c9f1a18 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query48.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query48.out @@ -7,23 +7,23 @@ PhysicalResultSink --------PhysicalProject ----------hashJoin[INNER_JOIN broadcast] hashCondition=((store.s_store_sk = store_sales.ss_store_sk)) otherCondition=() build RFs:RF3 s_store_sk->[ss_store_sk] ------------PhysicalProject ---------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((store_sales.ss_addr_sk = 
customer_address.ca_address_sk)) otherCondition=((((ca_state IN ('ND', 'NY', 'SD') AND ((store_sales.ss_net_profit >= 0.00) AND (store_sales.ss_net_profit <= 2000.00))) OR (ca_state IN ('GA', 'KS', 'MD') AND ((store_sales.ss_net_profit >= 150.00) AND (store_sales.ss_net_profit <= 3000.00)))) OR (ca_state IN ('CO', 'MN', 'NC') AND ((store_sales.ss_net_profit >= 50.00) AND (store_sales.ss_net_profit <= 25000.00))))) build RFs:RF2 ss_addr_sk->[ca_address_sk] -----------------PhysicalProject -------------------filter((customer_address.ca_country = 'United States') and ca_state IN ('CO', 'GA', 'KS', 'MD', 'MN', 'NC', 'ND', 'NY', 'SD')) ---------------------PhysicalOlapScan[customer_address] apply RFs: RF2 +--------------hashJoin[INNER_JOIN shuffle] hashCondition=((store_sales.ss_addr_sk = customer_address.ca_address_sk)) otherCondition=((((ca_state IN ('ND', 'NY', 'SD') AND ((store_sales.ss_net_profit >= 0.00) AND (store_sales.ss_net_profit <= 2000.00))) OR (ca_state IN ('GA', 'KS', 'MD') AND ((store_sales.ss_net_profit >= 150.00) AND (store_sales.ss_net_profit <= 3000.00)))) OR (ca_state IN ('CO', 'MN', 'NC') AND ((store_sales.ss_net_profit >= 50.00) AND (store_sales.ss_net_profit <= 25000.00))))) build RFs:RF2 ca_address_sk->[ss_addr_sk] ----------------PhysicalProject ------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((customer_demographics.cd_demo_sk = store_sales.ss_cdemo_sk)) otherCondition=((((((customer_demographics.cd_marital_status = 'S') AND (customer_demographics.cd_education_status = 'Secondary')) AND ((store_sales.ss_sales_price >= 100.00) AND (store_sales.ss_sales_price <= 150.00))) OR (((customer_demographics.cd_marital_status = 'M') AND (customer_demographics.cd_education_status = '2 yr Degree')) AND 
((store_sales.ss_sales_price >= 50.00) AND (store_sales.ss_sales_price <= 100.00)))) OR (((customer_demographics.cd_marital_status = 'D') AND (customer_demographics.cd_education_status = 'Advanced Degree')) AND ((store_sales.ss_sales_price >= 150.00) AND (store_sales.ss_sales_price <= 200.00))))) build RFs:RF0 cd_demo_sk->[ss_cdemo_sk] ------------------------PhysicalProject --------------------------filter((store_sales.ss_net_profit <= 25000.00) and (store_sales.ss_net_profit >= 0.00) and (store_sales.ss_sales_price <= 200.00) and (store_sales.ss_sales_price >= 50.00)) -----------------------------PhysicalOlapScan[store_sales] apply RFs: RF0 RF1 RF3 +----------------------------PhysicalOlapScan[store_sales] apply RFs: RF0 RF1 RF2 RF3 ------------------------PhysicalProject --------------------------filter(((((customer_demographics.cd_marital_status = 'S') AND (customer_demographics.cd_education_status = 'Secondary')) OR ((customer_demographics.cd_marital_status = 'M') AND (customer_demographics.cd_education_status = '2 yr Degree'))) OR ((customer_demographics.cd_marital_status = 'D') AND (customer_demographics.cd_education_status = 'Advanced Degree')))) ----------------------------PhysicalOlapScan[customer_demographics] --------------------PhysicalProject ----------------------filter((date_dim.d_year = 2001)) ------------------------PhysicalOlapScan[date_dim] +----------------PhysicalProject +------------------filter((customer_address.ca_country = 'United States') and ca_state IN ('CO', 'GA', 'KS', 'MD', 'MN', 'NC', 'ND', 'NY', 'SD')) +--------------------PhysicalOlapScan[customer_address] ------------PhysicalProject --------------PhysicalOlapScan[store] diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query58.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query58.out index 1958853f90fb2a..97b3a3af96ad02 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query58.out +++ 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query58.out @@ -5,17 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] ----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 
i_item_sk->[cs_item_sk] +----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[ws_item_sk] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 +------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -32,7 +32,7 @@ PhysicalResultSink ------------------------PhysicalProject --------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * 
cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,11 @@ PhysicalResultSink --------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[cs_item_sk] ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 RF3 +----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query74.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query74.out index e6f3e10d22fbc4..8b171914ebd371 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query74.out +++ 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 
0.00)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.00)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query74.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query74.out index 84771c7fe6a7e0..d8a82ca998ac09 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query74.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 
--------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.0)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.0)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query74.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query74.out index e48fc87588c83a..64a56e4e850db7 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query74.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] 
--------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.0)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 
1999) and (t_w_firstyear.year_total > 0.0)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query58.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query58.out index ddde38eeedf943..154cbf3229240b 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query58.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query58.out @@ -5,19 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev 
as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] ----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 ws_item_sk->[i_item_sk] +----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk] ------------------------PhysicalProject ---------------------------PhysicalOlapScan[item] apply RFs: RF12 RF13 -------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 +------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -31,8 +29,10 @@ PhysicalResultSink ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_date = '2001-03-24')) 
--------------------------------------------PhysicalOlapScan[date_dim] +------------------------PhysicalProject +--------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,13 @@ PhysicalResultSink --------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() +--------------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 ws_item_sk->[i_item_sk] ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] +------------------------------PhysicalOlapScan[item] apply RFs: 
RF3 +----------------------------PhysicalProject +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 +----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject @@ -81,6 +83,4 @@ PhysicalResultSink --------------------------------------------PhysicalProject ----------------------------------------------filter((date_dim.d_date = '2001-03-24')) ------------------------------------------------PhysicalOlapScan[date_dim] -----------------------------PhysicalProject -------------------------------PhysicalOlapScan[item] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query58.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query58.out index 92bc9775af58e0..154cbf3229240b 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query58.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query58.out @@ -5,19 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= 
cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] ----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 ws_item_sk->[i_item_sk] +----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk] ------------------------PhysicalProject ---------------------------PhysicalOlapScan[item] apply RFs: RF12 RF13 -------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = 
date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 +------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -31,8 +29,10 @@ PhysicalResultSink ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_date = '2001-03-24')) --------------------------------------------PhysicalOlapScan[date_dim] +------------------------PhysicalProject +--------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,13 @@ PhysicalResultSink 
--------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[cs_item_sk] +--------------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 ws_item_sk->[i_item_sk] ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] +------------------------------PhysicalOlapScan[item] apply RFs: RF3 +----------------------------PhysicalProject +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3 +----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject @@ -81,6 +83,4 @@ PhysicalResultSink --------------------------------------------PhysicalProject ----------------------------------------------filter((date_dim.d_date = '2001-03-24')) ------------------------------------------------PhysicalOlapScan[date_dim] -----------------------------PhysicalProject -------------------------------PhysicalOlapScan[item] diff --git a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query11.out 
b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query11.out index 2727eb396b3814..82db2123eb155c 100644 --- a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query11.out +++ b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query11.out @@ -38,20 +38,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(38, 8)) / year_total), 0.000000) > if((year_total > 0.00), (cast(year_total as DECIMALV3(38, 8)) / year_total), 0.000000))) build RFs:RF6 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(38, 8)) / year_total), 0.000000) > if((year_total > 0.00), (cast(year_total as DECIMALV3(38, 8)) / year_total), 0.000000))) build RFs:RF6 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.dyear = 2000) and (t_w_secyear.sale_type = 'w')) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF6 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF5 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.dyear = 2000) and (t_s_secyear.sale_type = 's')) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 RF6 
+------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 --------------------PhysicalProject ----------------------filter((t_s_firstyear.dyear = 1999) and (t_s_firstyear.sale_type = 's') and (t_s_firstyear.year_total > 0.00)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 RF6 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ------------------PhysicalProject --------------------filter((t_w_firstyear.dyear = 1999) and (t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year_total > 0.00)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF6 ---------------PhysicalProject -----------------filter((t_w_secyear.dyear = 2000) and (t_w_secyear.sale_type = 'w')) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query2.out b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query2.out index cc4538c16b7233..45d3bd88c05b96 100644 --- a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query2.out +++ b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query2.out @@ -23,16 +23,16 @@ PhysicalCteAnchor ( cteId=CTEId#1 ) ----------PhysicalProject ------------hashJoin[INNER_JOIN broadcast] hashCondition=((date_dim.d_week_seq = d_week_seq2)) otherCondition=() build RFs:RF3 d_week_seq->[d_week_seq] --------------PhysicalProject -----------------hashJoin[INNER_JOIN broadcast] hashCondition=((date_dim.d_week_seq = d_week_seq1)) otherCondition=() build RFs:RF2 d_week_seq->[d_week_seq] +----------------hashJoin[INNER_JOIN shuffle] hashCondition=((date_dim.d_week_seq = d_week_seq1)) otherCondition=() build RFs:RF2 d_week_seq1->[d_week_seq] +------------------PhysicalProject +--------------------filter((date_dim.d_year = 1998)) +----------------------PhysicalOlapScan[date_dim] apply RFs: RF2 ------------------PhysicalProject 
--------------------hashJoin[INNER_JOIN shuffle] hashCondition=((expr_cast(d_week_seq1 as BIGINT) = expr_(d_week_seq2 - 53))) otherCondition=() build RFs:RF1 expr_(d_week_seq2 - 53)->[cast(d_week_seq as BIGINT)] ----------------------PhysicalProject -------------------------PhysicalCteConsumer ( cteId=CTEId#1 ) apply RFs: RF1 RF2 +------------------------PhysicalCteConsumer ( cteId=CTEId#1 ) apply RFs: RF1 ----------------------PhysicalProject ------------------------PhysicalCteConsumer ( cteId=CTEId#1 ) apply RFs: RF3 -------------------PhysicalProject ---------------------filter((date_dim.d_year = 1998)) -----------------------PhysicalOlapScan[date_dim] --------------PhysicalProject ----------------filter((date_dim.d_year = 1999)) ------------------PhysicalOlapScan[date_dim] diff --git a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query4.out b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query4.out index 07c89db89688de..163b1cb7c71753 100644 --- a/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query4.out +++ b/regression-test/data/nereids_tpcds_shape_sf10t_orc/shape/query4.out @@ -60,22 +60,22 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF9 customer_id->[customer_id,customer_id,customer_id,customer_id] ------------------PhysicalProject ---------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_c_secyear.customer_id)) otherCondition=((if((year_total > 0.000000), (cast(year_total as DECIMALV3(38, 16)) / year_total), NULL) > if((year_total > 0.000000), (cast(year_total as DECIMALV3(38, 16)) / year_total), NULL))) build RFs:RF8 customer_id->[customer_id,customer_id,customer_id] +--------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_c_secyear.customer_id)) 
otherCondition=((if((year_total > 0.000000), (cast(year_total as DECIMALV3(38, 16)) / year_total), NULL) > if((year_total > 0.000000), (cast(year_total as DECIMALV3(38, 16)) / year_total), NULL))) build RFs:RF8 customer_id->[customer_id] +----------------------PhysicalProject +------------------------filter((t_c_secyear.dyear = 2000) and (t_c_secyear.sale_type = 'c')) +--------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF8 RF9 ----------------------PhysicalProject ------------------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_c_firstyear.customer_id)) otherCondition=() build RFs:RF7 customer_id->[customer_id,customer_id] --------------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF6 customer_id->[customer_id] ----------------------------PhysicalProject ------------------------------filter((t_s_secyear.dyear = 2000) and (t_s_secyear.sale_type = 's')) ---------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF6 RF7 RF8 RF9 +--------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF6 RF7 RF9 ----------------------------PhysicalProject ------------------------------filter((t_s_firstyear.dyear = 1999) and (t_s_firstyear.sale_type = 's') and (t_s_firstyear.year_total > 0.000000)) ---------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF7 RF8 RF9 +--------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF7 RF9 --------------------------PhysicalProject ----------------------------filter((t_c_firstyear.dyear = 1999) and (t_c_firstyear.sale_type = 'c') and (t_c_firstyear.year_total > 0.000000)) -------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF8 RF9 -----------------------PhysicalProject -------------------------filter((t_c_secyear.dyear = 2000) and (t_c_secyear.sale_type = 'c')) 
---------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF9 +------------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF9 ------------------PhysicalProject --------------------filter((t_w_firstyear.dyear = 1999) and (t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year_total > 0.000000)) ----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/noStatsRfPrune/query74.out b/regression-test/data/new_shapes_p0/tpcds_sf100/noStatsRfPrune/query74.out index 84771c7fe6a7e0..d8a82ca998ac09 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/noStatsRfPrune/query74.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/noStatsRfPrune/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.0)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.0)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/no_stats_shape/query74.out b/regression-test/data/new_shapes_p0/tpcds_sf100/no_stats_shape/query74.out index e48fc87588c83a..64a56e4e850db7 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/no_stats_shape/query74.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/no_stats_shape/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), 
NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.0), (year_total / year_total), NULL) > if((year_total > 0.0), (year_total / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.0)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.0)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) 
+----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query58.out b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query58.out index 1fbb2c10eeac41..8b1c60c8c19a1b 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query58.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query58.out @@ -5,17 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 
item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] ----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[ws_item_sk] +----------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 RF12 +------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -32,7 +32,7 @@ PhysicalResultSink ------------------------PhysicalProject --------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as 
DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,11 @@ PhysicalResultSink --------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() +--------------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 +----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject diff --git 
a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query58.out b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query58.out index 4734560f2ef112..62cf69cc400980 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query58.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query58.out @@ -5,17 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] 
----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[ws_item_sk] +----------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 RF12 +------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -32,7 +32,7 @@ PhysicalResultSink ------------------------PhysicalProject --------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN 
colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,11 @@ PhysicalResultSink --------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[cs_item_sk] +--------------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3 +----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 RF3 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject diff --git 
a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query58.out b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query58.out index 1958853f90fb2a..97b3a3af96ad02 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query58.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query58.out @@ -5,17 +5,17 @@ PhysicalResultSink ----PhysicalDistribute[DistributionSpecGather] ------PhysicalTopN[LOCAL_SORT] --------PhysicalProject -----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] +----------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF13 item_id->[i_item_id] ------------PhysicalProject --------------hashAgg[GLOBAL] 
----------------PhysicalDistribute[DistributionSpecHash] ------------------hashAgg[LOCAL] --------------------PhysicalProject -----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[cs_item_sk] +----------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF12 i_item_sk->[ws_item_sk] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[cs_sold_date_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF11 d_date_sk->[ws_sold_date_sk] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF11 RF12 +------------------------------PhysicalOlapScan[web_sales] apply RFs: RF11 RF12 ----------------------------PhysicalProject ------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF10 d_date->[d_date] --------------------------------PhysicalProject @@ -32,7 +32,7 @@ PhysicalResultSink ------------------------PhysicalProject --------------------------PhysicalOlapScan[item] apply RFs: RF13 ------------PhysicalProject ---------------hashJoin[INNER_JOIN colocated] hashCondition=((ss_items.item_id = ws_items.item_id)) otherCondition=((cast(ss_item_rev as DOUBLE) <= cast((1.1 * ws_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * ws_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(ws_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] +--------------hashJoin[INNER_JOIN 
colocated] hashCondition=((ss_items.item_id = cs_items.item_id)) otherCondition=((cast(cs_item_rev as DOUBLE) <= cast((1.1 * ss_item_rev) as DOUBLE)) and (cast(cs_item_rev as DOUBLE) >= cast((0.9 * ss_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) <= cast((1.1 * cs_item_rev) as DOUBLE)) and (cast(ss_item_rev as DOUBLE) >= cast((0.9 * cs_item_rev) as DOUBLE))) build RFs:RF8 item_id->[i_item_id] ----------------PhysicalProject ------------------hashAgg[GLOBAL] --------------------PhysicalDistribute[DistributionSpecHash] @@ -63,11 +63,11 @@ PhysicalResultSink --------------------PhysicalDistribute[DistributionSpecHash] ----------------------hashAgg[LOCAL] ------------------------PhysicalProject ---------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +--------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[cs_item_sk] ----------------------------PhysicalProject -------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ws_sold_date_sk] +------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[cs_sold_date_sk] --------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF2 RF3 +----------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF2 RF3 --------------------------------PhysicalProject ----------------------------------hashJoin[LEFT_SEMI_JOIN broadcast] hashCondition=((date_dim.d_date = date_dim.d_date)) otherCondition=() build RFs:RF1 d_date->[d_date] ------------------------------------PhysicalProject diff --git 
a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query74.out b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query74.out index e6f3e10d22fbc4..8b171914ebd371 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query74.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query74.out @@ -35,20 +35,20 @@ PhysicalCteAnchor ( cteId=CTEId#0 ) ------PhysicalDistribute[DistributionSpecGather] --------PhysicalTopN[LOCAL_SORT] ----------PhysicalProject -------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id,customer_id,customer_id] +------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((t_s_firstyear.customer_id = t_w_secyear.customer_id)) otherCondition=((if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL) > if((year_total > 0.00), (cast(year_total as DECIMALV3(13, 8)) / year_total), NULL))) build RFs:RF5 customer_id->[customer_id] +--------------PhysicalProject +----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) +------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 --------------PhysicalProject ----------------hashJoin[INNER_JOIN bucketShuffle] hashCondition=((t_s_firstyear.customer_id = t_w_firstyear.customer_id)) otherCondition=() build RFs:RF4 customer_id->[customer_id,customer_id] ------------------hashJoin[INNER_JOIN shuffle] hashCondition=((t_s_secyear.customer_id = t_s_firstyear.customer_id)) otherCondition=() build RFs:RF3 customer_id->[customer_id] --------------------PhysicalProject ----------------------filter((t_s_secyear.sale_type = 's') and (t_s_secyear.year = 2000)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 
RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF3 RF4 --------------------PhysicalProject ----------------------filter((t_s_firstyear.sale_type = 's') and (t_s_firstyear.year = 1999) and (t_s_firstyear.year_total > 0.00)) -------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 RF5 +------------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF4 ------------------PhysicalProject --------------------filter((t_w_firstyear.sale_type = 'w') and (t_w_firstyear.year = 1999) and (t_w_firstyear.year_total > 0.00)) -----------------------PhysicalCteConsumer ( cteId=CTEId#0 ) apply RFs: RF5 ---------------PhysicalProject -----------------filter((t_w_secyear.sale_type = 'w') and (t_w_secyear.year = 2000)) -------------------PhysicalCteConsumer ( cteId=CTEId#0 ) +----------------------PhysicalCteConsumer ( cteId=CTEId#0 )