@@ -76,6 +76,9 @@ public static ColumnStatistic calculate(ScalarOperator operator, Statistics inpu
7676 return operator .accept (new ExpressionStatisticVisitor (input , rowCount ), null );
7777 }
7878
79+ private record NullableBooleanProbabilities (double pTrue , double pFalse , double pNull ) {
80+ }
81+
7982 private static class ExpressionStatisticVisitor extends ScalarOperatorVisitor <ColumnStatistic , Void > {
8083 private final Statistics inputStatistics ;
8184 // Some functions estimate need plan node row count, such as COUNT
@@ -149,24 +152,76 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
149152 return ColumnStatistic .unknown ();
150153 }
151154
152- // Lambda arguments are synthetic refs, not input Statistics columns. PredicateStatisticsCalculator fetches
153- // every ColumnRef from Statistics, so compound lambda predicates can fail on missing stats for lambda columns .
154- // Return conservative boolean stats instead of estimating table-column selectivity for them.
155- if ( containsLambdaArgument (operator )) {
156- return buildGenericBooleanStatistic ( operator , estimateCompoundNullsFraction ( operator , context ) );
155+ if ( operator . isNot ()) {
156+ // Logical NOT simply swaps pTrue and pFalse; pNull stays the same.
157+ return computeBooleanProbabilities ( operator . getChild ( 0 ), context ). stream ()
158+ . map ( p -> buildCompoundResult (operator , p . pFalse (), p . pTrue (), p . pNull ())). findFirst ()
159+ . orElseGet ( ColumnStatistic :: unknown );
157160 }
158161
159- Statistics predicateStatistics = PredicateStatisticsCalculator .statisticsCalculate (operator , inputStatistics );
160- if (predicateStatistics == null || Double .isNaN (predicateStatistics .getOutputRowCount ())) {
162+ var left = computeBooleanProbabilities (operator .getChild (0 ), context );
163+ var right = computeBooleanProbabilities (operator .getChild (1 ), context );
164+
165+ if (left .isEmpty () || right .isEmpty ()) {
161166 return ColumnStatistic .unknown ();
162167 }
163168
164- long inputRows = Math .max (1L , Math .round (rowCount ));
165- double nullsFraction = estimateCompoundNullsFraction (operator , context );
166- long nullRows = Math .round (inputRows * nullsFraction );
167- long nonNullRows = Math .max (0L , inputRows - nullRows );
168- long trueRows = Math .min (nonNullRows , estimatePredicateTrueRows (predicateStatistics , inputRows ));
169- long falseRows = nonNullRows - trueRows ;
169+ double pTrue ;
170+ double pFalse ;
171+ var l = left .get ();
172+ var r = right .get ();
173+ if (operator .isAnd ()) {
174+ pTrue = l .pTrue () * r .pTrue ();
175+ pFalse = (l .pFalse () + r .pFalse ()) - (l .pFalse () * r .pFalse ());
176+ } else {
177+ pTrue = (l .pTrue () + r .pTrue ()) - (l .pTrue () * r .pTrue ());
178+ pFalse = l .pFalse () * r .pFalse ();
179+ }
180+ double pNull = 1.0 - pTrue - pFalse ;
181+
182+ return buildCompoundResult (operator , pTrue , pFalse , pNull );
183+ }
184+
185+ private Optional <NullableBooleanProbabilities > computeBooleanProbabilities (ScalarOperator child , Void context ) {
186+ ColumnStatistic childStat = child .accept (this , context );
187+
188+ // Extract from boolean MCV histogram if available. This is the most accurate strategy since it reflects the actual boolean value distribution.
189+ if (!childStat .isUnknown () && childStat .getHistogram () != null ) {
190+ Map <String , Long > mcv = childStat .getHistogram ().getMCV ();
191+ if (mcv != null && (!mcv .isEmpty ())) {
192+ String trueKey = booleanToMcvValue (true );
193+ String falseKey = booleanToMcvValue (false );
194+ if (mcv .containsKey (trueKey ) || mcv .containsKey (falseKey )) {
195+ long trueCount = mcv .getOrDefault (trueKey , 0L );
196+ long falseCount = mcv .getOrDefault (falseKey , 0L );
197+ long inputRows = Math .max (1L , Math .round (rowCount ));
198+ double pNull = childStat .getNullsFraction ();
199+ double pTrue = clampFraction ((double ) trueCount / inputRows );
200+ double pFalse = clampFraction ((double ) falseCount / inputRows );
201+ return Optional .of (new NullableBooleanProbabilities (pTrue , pFalse , pNull ));
202+ }
203+ }
204+ }
205+
206+ // Use PredicateStatisticsCalculator selectivity as a fallback.
207+ Statistics predicateStatistics = PredicateStatisticsCalculator .statisticsCalculate (child , inputStatistics );
208+ if (predicateStatistics != null && !Double .isNaN (predicateStatistics .getOutputRowCount ())) {
209+ double inputStatisticsRows = inputStatistics .getOutputRowCount ();
210+ if (!Double .isNaN (inputStatisticsRows ) && inputStatisticsRows > 0 ) {
211+ double pTrue = clampFraction (predicateStatistics .getOutputRowCount () / inputStatisticsRows );
212+ double pNull = childStat .isUnknown () ? 0.0 : childStat .getNullsFraction ();
213+ double pFalse = clampFraction (1.0 - pTrue - pNull );
214+ return Optional .of (new NullableBooleanProbabilities (pTrue , pFalse , pNull ));
215+ }
216+ }
217+
218+ return Optional .empty ();
219+ }
220+
221+ private ColumnStatistic buildCompoundResult (CompoundPredicateOperator operator , double pTrue , double pFalse ,
222+ double pNull ) {
223+ long trueRows = Math .round (rowCount * pTrue );
224+ long falseRows = Math .round (rowCount * pFalse );
170225
171226 Map <String , Long > mcvs = new HashMap <>();
172227 if (trueRows > 0 ) {
@@ -178,8 +233,7 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
178233
179234 ColumnStatistic .Builder builder = ColumnStatistic .builder ()
180235 .setMinValue (0 )
181- .setMaxValue (1 )
182- .setNullsFraction (nullsFraction )
236+ .setMaxValue (1 ).setNullsFraction (pNull )
183237 .setAverageRowSize (operator .getType ().getTypeSize ())
184238 .setDistinctValuesCount (mcvs .size ());
185239
@@ -190,44 +244,7 @@ public ColumnStatistic visitCompoundPredicate(CompoundPredicateOperator operator
190244 return builder .build ();
191245 }
192246
193- private boolean containsLambdaArgument (ScalarOperator operator ) {
194- return operator .asStream ().anyMatch (op -> op instanceof ColumnRefOperator column
195- && column .getOpType () == OperatorType .LAMBDA_ARGUMENT );
196- }
197-
198- private ColumnStatistic buildGenericBooleanStatistic (ScalarOperator operator , double nullsFraction ) {
199- return ColumnStatistic .builder ()
200- .setMinValue (0 )
201- .setMaxValue (1 )
202- .setNullsFraction (clampFraction (nullsFraction ))
203- .setAverageRowSize (operator .getType ().getTypeSize ())
204- .setDistinctValuesCount (2 )
205- .build ();
206- }
207-
208- private long estimatePredicateTrueRows (Statistics predicateStatistics , long inputRows ) {
209- double inputStatisticsRows = inputStatistics .getOutputRowCount ();
210- if (Double .isNaN (inputStatisticsRows ) || inputStatisticsRows <= 0 ) {
211- return Math .max (0L , Math .round (predicateStatistics .getOutputRowCount ()));
212- }
213- double selectivity = clampFraction (predicateStatistics .getOutputRowCount () / inputStatisticsRows );
214- return Math .max (0L , Math .round (inputRows * selectivity ));
215- }
216247
217- private double estimateCompoundNullsFraction (CompoundPredicateOperator operator , Void context ) {
218- if (operator .isNot ()) {
219- return getExpressionNullsFraction (operator .getChild (0 ), context );
220- }
221-
222- double leftNullFraction = getExpressionNullsFraction (operator .getChild (0 ), context );
223- double rightNullFraction = getExpressionNullsFraction (operator .getChild (1 ), context );
224- return 1.0 - (1.0 - leftNullFraction ) * (1.0 - rightNullFraction );
225- }
226-
227- private double getExpressionNullsFraction (ScalarOperator operator , Void context ) {
228- ColumnStatistic statistic = operator .accept (this , context );
229- return statistic .isUnknown () ? 0.0 : clampFraction (statistic .getNullsFraction ());
230- }
231248
232249 private static double clampFraction (double value ) {
233250 if (Double .isNaN (value )) {
0 commit comments