Skip to content

Commit

Permalink
[opt](nereids) Refine stats derive (apache#42112)
Browse files Browse the repository at this point in the history
  • Loading branch information
xzj7019 authored Oct 24, 2024
1 parent a1690f0 commit 7b5a43e
Show file tree
Hide file tree
Showing 44 changed files with 906 additions and 813 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -601,9 +601,8 @@ private Optional<ColumnStatistic> getHiveColumnStats(String colName) {
if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
return Optional.empty();
}
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
long count = Long.parseLong(parameters.get(NUM_ROWS));
columnStatisticBuilder.setCount(count);
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count);
// The tableStats length is at most 1.
for (ColumnStatisticsObj tableStat : tableStats) {
if (!tableStat.isSetStatsData()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ public Void visitLogicalJoin(LogicalJoin<? extends Plan, ? extends Plan> join,
if (joinType.isInnerJoin() || joinType.isCrossJoin()) {
return visit(join, context);
} else if ((joinType.isLeftJoin()
|| joinType.isLefSemiJoin()
|| joinType.isLeftSemiJoin()
|| joinType.isLeftAntiJoin()) && useLeft) {
return visit(join.left(), context);
} else if ((joinType.isRightJoin()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,49 +128,60 @@ public static ColumnStatistic estimate(Expression expression, Statistics stats)

@Override
public ColumnStatistic visit(Expression expr, Statistics context) {
    // Statistics already recorded in the context for this exact expression take precedence.
    ColumnStatistic recorded = context.findColumnStatistics(expr);
    if (recorded != null) {
        return recorded;
    }
    // Otherwise borrow the estimate of the first child; an expression without
    // children has nothing to borrow from and is reported as UNKNOWN.
    return CollectionUtils.isEmpty(expr.children())
            ? ColumnStatistic.UNKNOWN
            : expr.child(0).accept(this, context);
}

// TODO: case-when needs to be re-implemented
// Estimates column statistics for a CASE WHEN expression.
// ndv starts at the number of WHEN clauses (plus one when a default value is present) and is
// raised to the largest ndv estimated for any result or default expression.
// width starts at 1 and is raised to the largest data-type width among those expressions.
// The returned statistic uses -infinity/+infinity as min/max and zero nulls.
@Override
public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) {
double ndv = caseWhen.getWhenClauses().size();
double width = 1;
// The default branch contributes one more possible value.
if (caseWhen.getDefaultValue().isPresent()) {
ndv += 1;
}
// Fold in the estimate of every WHEN clause's result expression.
for (WhenClause clause : caseWhen.getWhenClauses()) {
ColumnStatistic colStats = ExpressionEstimation.estimate(clause.getResult(), context);
ndv = Math.max(ndv, colStats.ndv);
width = Math.max(width, clause.getResult().getDataType().width());
}
// Fold in the estimate of the default expression, when there is one.
if (caseWhen.getDefaultValue().isPresent()) {
ColumnStatistic colStats = ExpressionEstimation.estimate(caseWhen.getDefaultValue().get(), context);
ndv = Math.max(ndv, colStats.ndv);
width = Math.max(width, caseWhen.getDefaultValue().get().getDataType().width());
}
return new ColumnStatisticBuilder()
.setNdv(ndv)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY)
// NOTE(review): avgSizeByte is set twice in a row (8, then width). This text is a diff view,
// so the first call is presumably the pre-change line and the second its replacement - confirm.
.setAvgSizeByte(8)
.setAvgSizeByte(width)
.setNumNulls(0)
.build();
}

// Estimates column statistics for an IF expression.
// ndv starts at 2 and is raised to the larger ndv estimated for child(1) and child(2);
// width starts at 1 and is raised to the larger data-type width of those two children.
// The returned statistic uses -infinity/+infinity as min/max and zero nulls.
@Override
public ColumnStatistic visitIf(If ifClause, Statistics context) {
double ndv = 2;
double width = 1;
// child(1) is presumably the "then" branch, going by the local variable name - confirm.
ColumnStatistic colStatsThen = ExpressionEstimation.estimate(ifClause.child(1), context);
ndv = Math.max(ndv, colStatsThen.ndv);
width = Math.max(width, ifClause.child(1).getDataType().width());

// child(2) is presumably the "else" branch, going by the local variable name - confirm.
ColumnStatistic colStatsElse = ExpressionEstimation.estimate(ifClause.child(2), context);
ndv = Math.max(ndv, colStatsElse.ndv);
width = Math.max(width, ifClause.child(2).getDataType().width());

return new ColumnStatisticBuilder()
.setNdv(ndv)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY)
// NOTE(review): avgSizeByte is set twice in a row (8, then width). This text is a diff view,
// so the first call is presumably the pre-change line and the second its replacement - confirm.
.setAvgSizeByte(8)
.setAvgSizeByte(width)
.setNumNulls(0)
.build();
}
Expand Down Expand Up @@ -242,9 +253,9 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) {
return new ColumnStatisticBuilder()
.setMaxValue(literalVal)
.setMinValue(literalVal)
.setNdv(1)
.setNumNulls(1)
.setAvgSizeByte(1)
.setNdv(literal.isNullLiteral() ? 0 : 1)
.setNumNulls(literal.isNullLiteral() ? 1 : 0)
.setAvgSizeByte(literal.getDataType().width())
.setMinExpr(literal.toLegacyLiteral())
.setMaxExpr(literal.toLegacyLiteral())
.build();
Expand Down Expand Up @@ -274,13 +285,13 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
int exprResultTypeWidth = binaryArithmetic.getDataType().width();
double dataSize = exprResultTypeWidth * rowCount;
if (binaryArithmetic instanceof Add) {
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin)
.setMaxValue(leftMax + rightMax)
.setMinExpr(null).setMaxExpr(null).build();
}
if (binaryArithmetic instanceof Subtract) {
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax)
.setMaxValue(leftMax - rightMin).setMinExpr(null)
.setMaxExpr(null).build();
Expand All @@ -297,7 +308,7 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
Math.max(leftMin * rightMin, leftMin * rightMax),
leftMax * rightMin),
leftMax * rightMax);
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max)
.setMaxExpr(null).setMinExpr(null).build();
}
Expand All @@ -312,14 +323,14 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
Math.max(leftMin / noneZeroDivisor(rightMin), leftMin / noneZeroDivisor(rightMax)),
leftMax / noneZeroDivisor(rightMin)),
leftMax / noneZeroDivisor(rightMax));
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(binaryArithmetic.getDataType().width()).setMinValue(min)
.setMaxValue(max).build();
}
if (binaryArithmetic instanceof Mod) {
double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax));
double max = -min;
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv)
return new ColumnStatisticBuilder().setNdv(ndv)
.setAvgSizeByte(exprResultTypeWidth)
.setDataSize(dataSize)
.setNumNulls(numNulls)
Expand All @@ -343,8 +354,7 @@ public ColumnStatistic visitMin(Min min, Statistics context) {
return ColumnStatistic.UNKNOWN;
}
// if this is scalar agg, we will update count and ndv to 1 when visiting group clause
return new ColumnStatisticBuilder(columnStat)
.build();
return new ColumnStatisticBuilder(columnStat).build();
}

@Override
Expand All @@ -355,16 +365,14 @@ public ColumnStatistic visitMax(Max max, Statistics context) {
return ColumnStatistic.UNKNOWN;
}
// if this is scalar agg, we will update count and ndv to 1 when visiting group clause
return new ColumnStatisticBuilder(columnStat)
.build();
return new ColumnStatisticBuilder(columnStat).build();
}

// Builds the statistic for a COUNT aggregate: starts from ColumnStatistic.UNKNOWN and sets the
// average size to the width of the COUNT result type.
@Override
public ColumnStatistic visitCount(Count count, Statistics context) {
double width = count.getDataType().width();
// for scalar agg, ndv and row count will be normalized by 1 in StatsCalculator.computeAggregate()
// NOTE(review): two return statements follow back to back, so the second is unreachable as
// written. This text is a diff view; the first return (which also sets count from
// context.getRowCount()) is presumably the pre-change line and the second its replacement - confirm.
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(context.getRowCount())
.setAvgSizeByte(width).build();
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setAvgSizeByte(width).build();
}

// TODO: return a proper estimated stat after supports histogram
Expand All @@ -382,14 +390,14 @@ public ColumnStatistic visitAvg(Avg avg, Statistics context) {
// Estimates column statistics for YEAR(child).
// ndv, min and max come from a fixed year range of 1970..2038; numNulls is copied from the
// child's statistics; the average size is 4 bytes.
@Override
public ColumnStatistic visitYear(Year year, Statistics context) {
ColumnStatistic childStat = year.child().accept(this, context);
double rowCount = context.getRowCount();
// NOTE(review): 1970..2038 presumably mirrors the 32-bit Unix timestamp range - confirm.
long minYear = 1970;
long maxYear = 2038;
return new ColumnStatisticBuilder()
// NOTE(review): this text is a diff view; setCount(childStat.count) and the first of the two
// setDataSize calls below are presumably pre-change lines, with setDataSize(4 * rowCount)
// being the replacement - confirm.
.setCount(childStat.count)
// Every year in the inclusive range counts as a distinct value.
.setNdv(maxYear - minYear + 1)
.setAvgSizeByte(4)
.setNumNulls(childStat.numNulls)
.setDataSize(4 * childStat.count)
.setDataSize(4 * rowCount)
.setMinValue(minYear)
.setMaxValue(maxYear).setMinExpr(null).build();
}
Expand Down Expand Up @@ -580,7 +588,7 @@ public ColumnStatistic visitToDate(ToDate toDate, Statistics context) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats)
.setAvgSizeByte(toDate.getDataType().width())
.setDataSize(toDate.getDataType().width() * context.getRowCount());
if (childColumnStats.minOrMaxIsInf()) {
if (childColumnStats.isMinMaxInvalid()) {
return columnStatisticBuilder.build();
}
double minValue;
Expand Down Expand Up @@ -611,7 +619,7 @@ public ColumnStatistic visitToDays(ToDays toDays, Statistics context) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats)
.setAvgSizeByte(toDays.getDataType().width())
.setDataSize(toDays.getDataType().width() * context.getRowCount());
if (childColumnStats.minOrMaxIsInf()) {
if (childColumnStats.isMinMaxInvalid()) {
return columnStatisticBuilder.build();
}
double minValue;
Expand Down
Loading

0 comments on commit 7b5a43e

Please sign in to comment.