On 21.11.2014 19:38, Jeff Janes wrote: > > When I run this patch on the regression database, I get a case where > the current method is exact but the adaptive one is off: > > WARNING: ndistinct estimate current=676.00 adaptive=906.00 > > select count(distinct stringu1) from onek; > 676 > > It should be seeing every single row, so I don't know why the > adaptive method is off. Seems like a bug. Thanks for noticing this. I wouldn't call it a bug, but there's clearly room for improvement. The estimator, as described in the original paper, does not expect the sampling to be done "our" way (using a fixed number of rows) but assumes it gets a fixed percentage of rows. Thus it does not expect the number of sampled rows to get so close (or equal) to the total number of rows. I think the only way to fix this is by checking if samplerows is close to totalrows, and use a straightforward estimate in that case (instead of a more sophisticated one). Something along these lines: if (samplerows >= 0.95 * totalrows) stats->stadistinct = (d + d/0.95) / 2; which means "if we sampled >= 95% of the table, use the number of observed distinct values directly". I have modified the estimator to do the adaptive estimation, and then do this correction too (and print the values). And with that in place I get these results WARNING: ndistinct estimate current=676.00 adaptive=996.00 WARNING: corrected ndistinct estimate current=676.00 adaptive=693.79 So it gets fairly close to the original estimate (and exact value). In the end, this check should be performed before calling the adaptive estimator at all (and not calling it in case we sampled most of the rows). I also discovered an actual bug in the optimize_estimate() function, using 'f_max' instead of the number of sampled rows. Attached is a patch fixing the bug, and implementing the sample size check. regards Tomas
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 732ab22..3975fb6 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -110,6 +110,8 @@ static void update_attstats(Oid relid, bool inh, static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static double optimize_estimate(int total_rows, int sample_rows, int ndistinct, + int *f, int f_max); /* * analyze_rel() -- analyze one relation @@ -2369,6 +2371,11 @@ compute_scalar_stats(VacAttrStatsP stats, int slot_idx = 0; CompareScalarsContext cxt; + /* f values for the estimator - messy and we likely need much + * less memory, but who cares */ + int f_max = 0; /* max number of duplicates */ + int *f_count = (int*)palloc0(sizeof(int)*values_cnt); + /* Sort the collected values */ cxt.ssup = &ssup; cxt.tupnoLink = tupnoLink; @@ -2410,6 +2417,7 @@ compute_scalar_stats(VacAttrStatsP stats, ndistinct++; if (dups_cnt > 1) { + nmultiple++; if (track_cnt < num_mcv || dups_cnt > track[track_cnt - 1].count) @@ -2435,6 +2443,12 @@ compute_scalar_stats(VacAttrStatsP stats, track[j].first = i + 1 - dups_cnt; } } + + /* increment the number of values with this number of + * repetitions and the largest number of repetitions */ + f_count[dups_cnt] += 1; + f_max = (f_max < dups_cnt) ? 
dups_cnt : f_max; + dups_cnt = 0; } } @@ -2481,6 +2495,7 @@ compute_scalar_stats(VacAttrStatsP stats, double numer, denom, stadistinct; + double adaptdistinct; /* adaptive estimate */ numer = (double) samplerows *(double) d; @@ -2494,6 +2509,20 @@ compute_scalar_stats(VacAttrStatsP stats, if (stadistinct > totalrows) stadistinct = totalrows; stats->stadistinct = floor(stadistinct + 0.5); + + /* compute the adaptive estimate */ + adaptdistinct = optimize_estimate(totalrows, samplerows, d, f_count, f_max); + + elog(WARNING, "ndistinct estimate current=%.2f adaptive=%.2f", stadistinct, adaptdistinct); + + /* if we've seen 'almost' all rows, use the estimate instead */ + if (samplerows >= 0.95 * totalrows) + { + adaptdistinct = (d + d/0.95)/2; + elog(WARNING, "corrected ndistinct estimate current=%.2f adaptive=%.2f", + stadistinct, adaptdistinct); + } + } /* @@ -2819,3 +2848,84 @@ compare_mcvs(const void *a, const void *b) return da - db; } + + +/* + * We need to minimize this equality (find "m" solving it) + * + * m - f1 - f2 = f1 * (A + A(m)) / (B + B(m)) + * + * where A, B are effectively constants (not depending on m), and A(m) + * and B(m) are functions. This is equal to solving + * + * 0 = f1 * (A + A(m)) / (B + B(m)) - (m - f1 - f2) + * + * Instead of looking for the exact solution to this equation (which + * might be fractional), we'll look for a natural number minimizing + * the absolute difference. Number of (distinct) elements is a natural + * number, and we don't mind if the number is slightly wrong. It's + * just an estimate, after all. The error from sampling will be much + * worse in most cases. + * + * We know the acceptable values of 'm' are [d,N] where 'd' is the number + * of distinct elements in the sample, and N is the number of rows in + * the table (not just the sample).
For large tables (billions of rows) + * that'd be quite time-consuming to compute, so we'll approximate the + * solution by gradually increasing the step to ~1% of the current value + * of 'm'. This will make it much faster and yet very accurate. + * + * All this of course assumes the function behaves reasonably (not + * oscillating etc.), but that's a safe assumption as the estimator + * would perform terribly otherwise. + */ +static double +optimize_estimate(int total_rows, int sample_rows, int d, int *f, int f_max) +{ + int i, m; + double A = 0.0, B = 0.0; + int opt_m = 0; + double opt_diff = total_rows; + int step = 1; + int ndistinct; + + /* compute the 'constant' parts of the equality (A, B) */ + for (i = 3; i <= f_max; i++) + { + double p = i / (double)sample_rows; + A += pow((1.0 - p), sample_rows ) * f[i]; + B += i * pow((1.0 - p), sample_rows-1) * f[i]; + } + + /* find the 'm' value minimizing the difference */ + for (m = 1; m <= total_rows; m += step) + { + int k = (f[1] + 2*f[2]); + double q = k / ((double)(sample_rows * 2 * m)); + + double A_m = A + m * pow((1 - q), sample_rows ); + double B_m = B + k * pow((1 - q), sample_rows-1); + + double diff = fabs(f[1] * (A_m / B_m) - (m - f[1] - f[2])); + + /* if this is a better solution */ + if (diff < opt_diff) + { + opt_diff = diff; + opt_m = m; + } + + /* tweak the step to 1% to make it faster */ + step = ((int)(0.01 * m) > 1) ? (int)(0.01 * m) : 1; + } + + /* compute the final estimate */ + ndistinct = d + opt_m - f[1] - f[2]; + + /* sanity checks that the estimate is within [d,total_rows] */ + if (ndistinct < d) + ndistinct = d; + else if (ndistinct > total_rows) + ndistinct = total_rows; + + return ndistinct; +}
-- Sent via pgsql-performance mailing list (pgsql-performance@xxxxxxxxxxxxxx) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-performance