I never heard back but was hoping for some feedback/discussion about this
2nd problem/patch; just a reminder.

Thanks

On Thu, Jun 08, 2017 at 11:05:38AM -0500, Justin Pryzby wrote:
> On Mon, Jun 05, 2017 at 05:02:32PM -0400, Tom Lane wrote:
> > Justin Pryzby <pryzby@xxxxxxxxxxxxx> writes:
> > > diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
> > > + if (nd1 > vardata1->rel->rows) nd1 = vardata1->rel->rows;
> > > + if (nd2 > vardata2->rel->rows) nd2 = vardata2->rel->rows;
> >
> > I don't like this change too much.
>
> Thanks for your analysis ;)
>
> I have a 2nd patch which improves the 2nd case I mentioned.
>
> > I note for instance that this patch would do nothing at all for the toy
> > example.
>
> >> There's still a 2nd issue which this doesn't address, having to do with joins
> >> of tables with full/complete MCV lists, and selective queries on those tables,
> >> as demonstrated by the artificial test:
> >>
> >> > postgres=# CREATE TABLE t(i INT);
> >> > postgres=# TRUNCATE t; INSERT INTO t SELECT i FROM generate_series(1,99) i, generate_series(1,99); ANALYZE t;
> >> > postgres=# SELECT (SELECT sum(x) FROM unnest(most_common_freqs) x) frac_MCV, tablename, attname, n_distinct, array_length(most_common_vals,1) n_mcv, array_length(histogram_bounds,1) n_hist, (SELECT MAX(x) FROM unnest(most_common_vals::text::text[]) x) maxmcv, (histogram_bounds::text::text[])[array_length(histogram_bounds,1)] maxhist FROM pg_stats WHERE attname~'i' AND tablename='t' GROUP BY 1,2,3,4,5,6,7,8 ORDER BY 1 DESC;
>
> I pointed out that there were two issues, both involving underestimates when
> querying a fraction of a table using an inequality condition: one due to the
> join estimate being based on "nd" (and not substantially on the MCV lists),
> and one due to the frequencies associated with the MCV lists (and not
> substantially falling back to the estimate from "nd").
>
> I made another patch to address the 2nd issue, which affects our pre-aggregated
> tables (which are partitioned by month, same as the raw tables).  The
> aggregated tables are the result of something like
>
>     SELECT start_time::date, k1, k2, ..., sum(a), avg(b) ... GROUP BY 1,2,3,
>
> so they have many fewer rows, and nd for the start_time::date column would be
> at most 31, so the MCV list would be expected to be complete, same as the
> "toy" example I gave.
>
> Sometimes when we query the aggregated tables for a small number of days we
> get an underestimate leading to nested loops.
>
> Without patch:
>  Merge Join  (cost=339.59..341.57 rows=99 width=4) (actual time=10.190..17.430 rows=9801 loops=1)
>
> With patch:
> DEBUG:  ndfactor 99.000000 99.000000
> DEBUG:  nmatches 99 matchprodfreq 1.000000
> DEBUG:  nmatches 99 matchprodfreq 1.000000
> DEBUG:  matchfreq1 99.000000 unmatchfreq1 0.000000
> DEBUG:  matchfreq1 1.000000 unmatchfreq1 0.000000
> DEBUG:  matchfreq2 99.000000 unmatchfreq2 0.000000
> DEBUG:  matchfreq2 1.000000 unmatchfreq2 0.000000
> DEBUG:  otherfreq1 0.000000 otherfreq2 0.000000
> DEBUG:  select(1) 1.000000
>  Hash Join  (cost=167.75..444.77 rows=9801 width=4) (actual time=4.706..13.892 rows=9801 loops=1)
>
> diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
> index 6a4f7b1..bc88423 100644
> --- a/src/backend/utils/adt/selfuncs.c
> +++ b/src/backend/utils/adt/selfuncs.c
> @@ -2279,6 +2279,14 @@ eqjoinsel_inner(Oid operator,
>  
>  	nd1 = get_variable_numdistinct(vardata1, &isdefault1);
>  	nd2 = get_variable_numdistinct(vardata2, &isdefault2);
> +	float	ndfactor1 = 1;
> +	float	ndfactor2 = 1;
> +	if (vardata1->rel->rows)
> +		ndfactor1 = vardata1->rel->tuples / vardata1->rel->rows;
> +	if (vardata2->rel->rows)
> +		ndfactor2 = vardata2->rel->tuples / vardata2->rel->rows;
> +	/* ndfactor1 = ndfactor2 = 1; */
> +	elog(DEBUG4, "ndfactor %lf %lf", ndfactor1, ndfactor2);
>  
>  	opfuncoid = get_opcode(operator);
>  
> @@ -2375,7 +2383,19 @@ eqjoinsel_inner(Oid operator,
>  			}
>  		}
>  	}
> +
> +	/* you might think we should multiply by ndfactor1*ndfactor2,
> +	 * but that gives serious overestimates... */
> +	/* matchprodfreq *= ndfactor1 > ndfactor2 ? ndfactor1 : ndfactor2; */
> +	/* matchprodfreq *= ndfactor1; */
> +	/* matchprodfreq *= ndfactor2; */
> +	/* matchprodfreq *= ndfactor1 < ndfactor2 ? ndfactor1 : ndfactor2; */
> +	matchprodfreq *= ndfactor1 < ndfactor2 ? ndfactor1 : ndfactor2;
> +
> +	elog(DEBUG4, "nmatches %d matchprodfreq %lf", nmatches, matchprodfreq);
>  	CLAMP_PROBABILITY(matchprodfreq);
> +	elog(DEBUG4, "nmatches %d matchprodfreq %lf", nmatches, matchprodfreq);
> +
>  	/* Sum up frequencies of matched and unmatched MCVs */
>  	matchfreq1 = unmatchfreq1 = 0.0;
>  	for (i = 0; i < nvalues1; i++)
> @@ -2385,8 +2405,14 @@ eqjoinsel_inner(Oid operator,
>  		else
>  			unmatchfreq1 += numbers1[i];
>  	}
> +
> +	matchfreq1 *= ndfactor1;
> +	unmatchfreq1 *= ndfactor1;
> +	elog(DEBUG4, "matchfreq1 %lf unmatchfreq1 %lf", matchfreq1, unmatchfreq1);
>  	CLAMP_PROBABILITY(matchfreq1);
>  	CLAMP_PROBABILITY(unmatchfreq1);
> +	elog(DEBUG4, "matchfreq1 %lf unmatchfreq1 %lf", matchfreq1, unmatchfreq1);
> +
>  	matchfreq2 = unmatchfreq2 = 0.0;
>  	for (i = 0; i < nvalues2; i++)
>  	{
> @@ -2395,8 +2421,12 @@ eqjoinsel_inner(Oid operator,
>  		else
>  			unmatchfreq2 += numbers2[i];
>  	}
> +	matchfreq2 *= ndfactor2;
> +	unmatchfreq2 *= ndfactor2;
> +	elog(DEBUG4, "matchfreq2 %lf unmatchfreq2 %lf", matchfreq2, unmatchfreq2);
>  	CLAMP_PROBABILITY(matchfreq2);
>  	CLAMP_PROBABILITY(unmatchfreq2);
> +	elog(DEBUG4, "matchfreq2 %lf unmatchfreq2 %lf", matchfreq2, unmatchfreq2);
>  	pfree(hasmatch1);
>  	pfree(hasmatch2);
>  
>  	if (have_mcvs1)

Justin

--
Sent via pgsql-performance mailing list (pgsql-performance@xxxxxxxxxxxxxx)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-performance