Skip to content

Commit 26ca4e0

Browse files
committed
use better formula to estimate compoundPredicates
Signed-off-by: m.bogusz <m.bogusz@celonis.com>
1 parent 35e2b16 commit 26ca4e0

2 files changed

Lines changed: 73 additions & 58 deletions

File tree

fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/ExpressionStatisticCalculator.java

Lines changed: 69 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ public static ColumnStatistic calculate(ScalarOperator operator, Statistics inpu
7676
return operator.accept(new ExpressionStatisticVisitor(input, rowCount), null);
7777
}
7878

79+
/**
 * Estimated probabilities that a nullable boolean expression evaluates to TRUE, FALSE, or NULL.
 *
 * <p>NOTE(review): the three components are presumably expected to sum to 1.0 (the compound-predicate code
 * derives pNull as {@code 1.0 - pTrue - pFalse}), but nothing in this record enforces it — confirm with callers.
 *
 * @param pTrue  estimated fraction of rows for which the expression is TRUE
 * @param pFalse estimated fraction of rows for which the expression is FALSE
 * @param pNull  estimated fraction of rows for which the expression is NULL
 */
private record NullableBooleanProbabilities(double pTrue, double pFalse, double pNull) {
80+
}
81+
7982
private static class ExpressionStatisticVisitor extends ScalarOperatorVisitor<ColumnStatistic, Void> {
8083
private final Statistics inputStatistics;
8184
// Some functions estimate need plan node row count, such as COUNT
@@ -149,24 +152,76 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
149152
return ColumnStatistic.unknown();
150153
}
151154

152-
// Lambda arguments are synthetic refs, not input Statistics columns. PredicateStatisticsCalculator fetches
153-
// every ColumnRef from Statistics, so compound lambda predicates can fail on missing stats for lambda columns.
154-
// Return conservative boolean stats instead of estimating table-column selectivity for them.
155-
if (containsLambdaArgument(operator)) {
156-
return buildGenericBooleanStatistic(operator, estimateCompoundNullsFraction(operator, context));
155+
if (operator.isNot()) {
156+
// NOT only swaps pTrue and pFalse; pNull stays the same.
157+
return computeBooleanProbabilities(operator.getChild(0), context).stream()
158+
.map(p -> buildCompoundResult(operator, p.pFalse(), p.pTrue(), p.pNull())).findFirst()
159+
.orElseGet(ColumnStatistic::unknown);
157160
}
158161

159-
Statistics predicateStatistics = PredicateStatisticsCalculator.statisticsCalculate(operator, inputStatistics);
160-
if (predicateStatistics == null || Double.isNaN(predicateStatistics.getOutputRowCount())) {
162+
var left = computeBooleanProbabilities(operator.getChild(0), context);
163+
var right = computeBooleanProbabilities(operator.getChild(1), context);
164+
165+
if (left.isEmpty() || right.isEmpty()) {
161166
return ColumnStatistic.unknown();
162167
}
163168

164-
long inputRows = Math.max(1L, Math.round(rowCount));
165-
double nullsFraction = estimateCompoundNullsFraction(operator, context);
166-
long nullRows = Math.round(inputRows * nullsFraction);
167-
long nonNullRows = Math.max(0L, inputRows - nullRows);
168-
long trueRows = Math.min(nonNullRows, estimatePredicateTrueRows(predicateStatistics, inputRows));
169-
long falseRows = nonNullRows - trueRows;
169+
double pTrue;
170+
double pFalse;
171+
var l = left.get();
172+
var r = right.get();
173+
if (operator.isAnd()) {
174+
pTrue = l.pTrue() * r.pTrue();
175+
pFalse = (l.pFalse() + r.pFalse()) - (l.pFalse() * r.pFalse());
176+
} else {
177+
pTrue = (l.pTrue() + r.pTrue()) - (l.pTrue() * r.pTrue());
178+
pFalse = l.pFalse() * r.pFalse();
179+
}
180+
double pNull = 1.0 - pTrue - pFalse;
181+
182+
return buildCompoundResult(operator, pTrue, pFalse, pNull);
183+
}
184+
185+
private Optional<NullableBooleanProbabilities> computeBooleanProbabilities(ScalarOperator child, Void context) {
186+
ColumnStatistic childStat = child.accept(this, context);
187+
188+
// Extract from boolean MCV histogram if available. This is the most accurate strategy since it reflects the actual boolean value distribution.
189+
if (!childStat.isUnknown() && childStat.getHistogram() != null) {
190+
Map<String, Long> mcv = childStat.getHistogram().getMCV();
191+
if (mcv != null && (!mcv.isEmpty())) {
192+
String trueKey = booleanToMcvValue(true);
193+
String falseKey = booleanToMcvValue(false);
194+
if (mcv.containsKey(trueKey) || mcv.containsKey(falseKey)) {
195+
long trueCount = mcv.getOrDefault(trueKey, 0L);
196+
long falseCount = mcv.getOrDefault(falseKey, 0L);
197+
long inputRows = Math.max(1L, Math.round(rowCount));
198+
double pNull = childStat.getNullsFraction();
199+
double pTrue = clampFraction((double) trueCount / inputRows);
200+
double pFalse = clampFraction((double) falseCount / inputRows);
201+
return Optional.of(new NullableBooleanProbabilities(pTrue, pFalse, pNull));
202+
}
203+
}
204+
}
205+
206+
// Use PredicateStatisticsCalculator selectivity as a fallback.
207+
Statistics predicateStatistics = PredicateStatisticsCalculator.statisticsCalculate(child, inputStatistics);
208+
if (predicateStatistics != null && !Double.isNaN(predicateStatistics.getOutputRowCount())) {
209+
double inputStatisticsRows = inputStatistics.getOutputRowCount();
210+
if (!Double.isNaN(inputStatisticsRows) && inputStatisticsRows > 0) {
211+
double pTrue = clampFraction(predicateStatistics.getOutputRowCount() / inputStatisticsRows);
212+
double pNull = childStat.isUnknown() ? 0.0 : childStat.getNullsFraction();
213+
double pFalse = clampFraction(1.0 - pTrue - pNull);
214+
return Optional.of(new NullableBooleanProbabilities(pTrue, pFalse, pNull));
215+
}
216+
}
217+
218+
return Optional.empty();
219+
}
220+
221+
private ColumnStatistic buildCompoundResult(CompoundPredicateOperator operator, double pTrue, double pFalse,
222+
double pNull) {
223+
long trueRows = Math.round(rowCount * pTrue);
224+
long falseRows = Math.round(rowCount * pFalse);
170225

171226
Map<String, Long> mcvs = new HashMap<>();
172227
if (trueRows > 0) {
@@ -178,8 +233,7 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
178233

179234
ColumnStatistic.Builder builder = ColumnStatistic.builder()
180235
.setMinValue(0)
181-
.setMaxValue(1)
182-
.setNullsFraction(nullsFraction)
236+
.setMaxValue(1).setNullsFraction(pNull)
183237
.setAverageRowSize(operator.getType().getTypeSize())
184238
.setDistinctValuesCount(mcvs.size());
185239

@@ -190,44 +244,7 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
190244
return builder.build();
191245
}
192246

193-
private boolean containsLambdaArgument(ScalarOperator operator) {
194-
return operator.asStream().anyMatch(op -> op instanceof ColumnRefOperator column
195-
&& column.getOpType() == OperatorType.LAMBDA_ARGUMENT);
196-
}
197-
198-
private ColumnStatistic buildGenericBooleanStatistic(ScalarOperator operator, double nullsFraction) {
199-
return ColumnStatistic.builder()
200-
.setMinValue(0)
201-
.setMaxValue(1)
202-
.setNullsFraction(clampFraction(nullsFraction))
203-
.setAverageRowSize(operator.getType().getTypeSize())
204-
.setDistinctValuesCount(2)
205-
.build();
206-
}
207-
208-
private long estimatePredicateTrueRows(Statistics predicateStatistics, long inputRows) {
209-
double inputStatisticsRows = inputStatistics.getOutputRowCount();
210-
if (Double.isNaN(inputStatisticsRows) || inputStatisticsRows <= 0) {
211-
return Math.max(0L, Math.round(predicateStatistics.getOutputRowCount()));
212-
}
213-
double selectivity = clampFraction(predicateStatistics.getOutputRowCount() / inputStatisticsRows);
214-
return Math.max(0L, Math.round(inputRows * selectivity));
215-
}
216247

217-
private double estimateCompoundNullsFraction(CompoundPredicateOperator operator, Void context) {
218-
if (operator.isNot()) {
219-
return getExpressionNullsFraction(operator.getChild(0), context);
220-
}
221-
222-
double leftNullFraction = getExpressionNullsFraction(operator.getChild(0), context);
223-
double rightNullFraction = getExpressionNullsFraction(operator.getChild(1), context);
224-
return 1.0 - (1.0 - leftNullFraction) * (1.0 - rightNullFraction);
225-
}
226-
227-
private double getExpressionNullsFraction(ScalarOperator operator, Void context) {
228-
ColumnStatistic statistic = operator.accept(this, context);
229-
return statistic.isUnknown() ? 0.0 : clampFraction(statistic.getNullsFraction());
230-
}
231248

232249
private static double clampFraction(double value) {
233250
if (Double.isNaN(value)) {

fe/fe-core/src/test/java/com/starrocks/sql/optimizer/statistics/ExpressionStatisticsCalculatorTest.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1923,9 +1923,8 @@ public void testCompoundPredicateAndWithNulls() {
19231923
Assertions.assertEquals(0.0, stat.getMinValue(), 0.001);
19241924
Assertions.assertEquals(1.0, stat.getMaxValue(), 0.001);
19251925
Assertions.assertNotNull(stat.getHistogram());
1926-
// TRUE comes from the existing predicate calculator.
1927-
// NULL upper bound: 1 - (1 - 0.3) * (1 - 0.1) = 0.37.
1928-
assertBooleanDistribution(stat, 80L, 550L, 0.37);
1926+
1927+
assertBooleanDistribution(stat, 80L, 750L, 0.17);
19291928
}
19301929

19311930
@Test
@@ -1948,8 +1947,7 @@ public void testCompoundPredicateOrWithNulls() {
19481947
Assertions.assertEquals(1.0, stat.getMaxValue(), 0.001);
19491948
Assertions.assertNotNull(stat.getHistogram());
19501949
// TRUE comes from the existing predicate calculator.
1951-
// NULL uses the same upper bound as AND.
1952-
assertBooleanDistribution(stat, 520L, 110L, 0.37);
1950+
assertBooleanDistribution(stat, 520L, 250L, 0.23);
19531951
}
19541952

19551953
@Test
@@ -1969,7 +1967,7 @@ public void testCompoundPredicateNotWithNulls() {
19691967
Assertions.assertEquals(0.0, stat.getMinValue(), 0.001);
19701968
Assertions.assertEquals(1.0, stat.getMaxValue(), 0.001);
19711969
Assertions.assertNotNull(stat.getHistogram());
1972-
assertBooleanDistribution(stat, 700L, 0L, 0.3);
1970+
assertBooleanDistribution(stat, 500L, 200L, 0.3);
19731971
}
19741972

19751973
@Test

0 commit comments

Comments
 (0)