Slow index scan backward.

"John van Breda" <john.vanbreda@xxxxxxxxxxxxxxxxxx> · Mon, 5 Mar 2018 15:45:09 -0000

I have a single table with 45 columns and 6.5 million records with a roughly random distribution of data of a variety of types. I am trying to implement a data table with pagination in a web user interface, where the data table can be filtered in a very flexible way on pretty much any combination of columns. I have indexes covering the most frequently filtered columns. The data table shows 30 records at a time, sorted to put the most recent records first. The database version is 9.3.5.

The problem occurs if I have a filter which results in less than 30 records, or where the 30 records that are returned are distributed through the dataset (it's OK if the page of 30 records are found in the relatively recent records). Basically, because of the ORDER BY id DESC LIMIT 30 the query planner is opting to use an index scan backward on the primary key, applying a filter to each record, until it finds 30 records. If it finds these relatively quickly then all is good. However sometimes the filter results in < 30 records in the final result set, in which case the index scan runs through the whole table and takes several minutes. A better plan in these cases would be to use the indexes available on the other fields to limit the results set, then filter, sort and limit. But, the planner is presumably not able to work this out because the statistics aren't detailed enough. 

Here's the table schema:

-- Table: cache_occurrences_functional

-- DROP TABLE cache_occurrences_functional;

CREATE TABLE cache_occurrences_functional
(
  id integer NOT NULL,
  sample_id integer,
  website_id integer,
  survey_id integer,
  input_form character varying,
  location_id integer,
  location_name character varying,
  public_geom geometry(Geometry,900913),
  map_sq_1km_id integer,
  map_sq_2km_id integer,
  map_sq_10km_id integer,
  date_start date,
  date_end date,
  date_type character varying(2),
  created_on timestamp without time zone,
  updated_on timestamp without time zone,
  verified_on timestamp without time zone,
  created_by_id integer,
  group_id integer,
  taxa_taxon_list_id integer,
  preferred_taxa_taxon_list_id integer,
  taxon_meaning_id integer,
  taxa_taxon_list_external_key character varying(50),
  family_taxa_taxon_list_id integer,
  taxon_group_id integer,
  taxon_rank_sort_order integer,
  record_status character(1),
  record_substatus smallint,
  certainty character(1),
  query character(1),
  sensitive boolean,
  release_status character(1),
  marine_flag boolean,
  data_cleaner_result boolean,
  media_count integer DEFAULT 0,
  training boolean NOT NULL DEFAULT false,
  zero_abundance boolean,
  licence_id integer,
  location_id_vice_county integer,
  location_id_lrc_boundary integer,
  location_id_country integer,
  identification_difficulty integer, -- Identification difficulty assigned by the data_cleaner module, on a scale from 1 (easy) to 5 (difficult)
  import_guid character varying, -- Globally unique identifier of the import batch.
  confidential boolean DEFAULT false,
  external_key character varying,
  CONSTRAINT pk_cache_occurrences_functional PRIMARY KEY (id)
)
WITH (
  OIDS=FALSE
);
ALTER TABLE cache_occurrences_functional
  OWNER TO indicia_user;
GRANT ALL ON TABLE cache_occurrences_functional TO indicia_user;
GRANT SELECT ON TABLE cache_occurrences_functional TO indicia_report_user;
GRANT SELECT ON TABLE cache_occurrences_functional TO naturespot;
GRANT SELECT ON TABLE cache_occurrences_functional TO brc_read_only;
COMMENT ON COLUMN cache_occurrences_functional.identification_difficulty IS 'Identification difficulty assigned by the data_cleaner module, on a scale from 1 (easy) to 5 (difficult)';
COMMENT ON COLUMN cache_occurrences_functional.import_guid IS 'Globally unique identifier of the import batch.';

-- Index: ix_cache_occurrences_functional_created_by_id

-- DROP INDEX ix_cache_occurrences_functional_created_by_id;

CREATE INDEX ix_cache_occurrences_functional_created_by_id
  ON cache_occurrences_functional
  USING btree
  (created_by_id);

-- Index: ix_cache_occurrences_functional_date_end

-- DROP INDEX ix_cache_occurrences_functional_date_end;

CREATE INDEX ix_cache_occurrences_functional_date_end
  ON cache_occurrences_functional
  USING btree
  (date_end);

-- Index: ix_cache_occurrences_functional_date_start

-- DROP INDEX ix_cache_occurrences_functional_date_start;

CREATE INDEX ix_cache_occurrences_functional_date_start
  ON cache_occurrences_functional
  USING btree
  (date_start);

-- Index: ix_cache_occurrences_functional_family_taxa_taxon_list_id

-- DROP INDEX ix_cache_occurrences_functional_family_taxa_taxon_list_id;

CREATE INDEX ix_cache_occurrences_functional_family_taxa_taxon_list_id
  ON cache_occurrences_functional
  USING btree
  (family_taxa_taxon_list_id);

-- Index: ix_cache_occurrences_functional_group_id

-- DROP INDEX ix_cache_occurrences_functional_group_id;

CREATE INDEX ix_cache_occurrences_functional_group_id
  ON cache_occurrences_functional
  USING btree
  (group_id);

-- Index: ix_cache_occurrences_functional_location_id

-- DROP INDEX ix_cache_occurrences_functional_location_id;

CREATE INDEX ix_cache_occurrences_functional_location_id
  ON cache_occurrences_functional
  USING btree
  (location_id);

-- Index: ix_cache_occurrences_functional_location_id_country

-- DROP INDEX ix_cache_occurrences_functional_location_id_country;

CREATE INDEX ix_cache_occurrences_functional_location_id_country
  ON cache_occurrences_functional
  USING btree
  (location_id_country);

-- Index: ix_cache_occurrences_functional_location_id_lrc_boundary

-- DROP INDEX ix_cache_occurrences_functional_location_id_lrc_boundary;

CREATE INDEX ix_cache_occurrences_functional_location_id_lrc_boundary
  ON cache_occurrences_functional
  USING btree
  (location_id_lrc_boundary);

-- Index: ix_cache_occurrences_functional_location_id_vice_county

-- DROP INDEX ix_cache_occurrences_functional_location_id_vice_county;

CREATE INDEX ix_cache_occurrences_functional_location_id_vice_county
  ON cache_occurrences_functional
  USING btree
  (location_id_vice_county);

-- Index: ix_cache_occurrences_functional_map_sq_10km_id

-- DROP INDEX ix_cache_occurrences_functional_map_sq_10km_id;

CREATE INDEX ix_cache_occurrences_functional_map_sq_10km_id
  ON cache_occurrences_functional
  USING btree
  (map_sq_10km_id);

-- Index: ix_cache_occurrences_functional_map_sq_1km_id

-- DROP INDEX ix_cache_occurrences_functional_map_sq_1km_id;

CREATE INDEX ix_cache_occurrences_functional_map_sq_1km_id
  ON cache_occurrences_functional
  USING btree
  (map_sq_1km_id);

-- Index: ix_cache_occurrences_functional_map_sq_2km_id

-- DROP INDEX ix_cache_occurrences_functional_map_sq_2km_id;

CREATE INDEX ix_cache_occurrences_functional_map_sq_2km_id
  ON cache_occurrences_functional
  USING btree
  (map_sq_2km_id);

-- Index: ix_cache_occurrences_functional_public_geom

-- DROP INDEX ix_cache_occurrences_functional_public_geom;

CREATE INDEX ix_cache_occurrences_functional_public_geom
  ON cache_occurrences_functional
  USING gist
  (public_geom);

-- Index: ix_cache_occurrences_functional_status

-- DROP INDEX ix_cache_occurrences_functional_status;

CREATE INDEX ix_cache_occurrences_functional_status
  ON cache_occurrences_functional
  USING btree
  (record_status COLLATE pg_catalog."default", record_substatus);

-- Index: ix_cache_occurrences_functional_submission

-- DROP INDEX ix_cache_occurrences_functional_submission;

CREATE INDEX ix_cache_occurrences_functional_submission
  ON cache_occurrences_functional
  USING btree
  (website_id, survey_id, sample_id);
ALTER TABLE cache_occurrences_functional CLUSTER ON ix_cache_occurrences_functional_submission;

-- Index: ix_cache_occurrences_functional_taxa_taxon_list_external_key

-- DROP INDEX ix_cache_occurrences_functional_taxa_taxon_list_external_key;

CREATE INDEX ix_cache_occurrences_functional_taxa_taxon_list_external_key
  ON cache_occurrences_functional
  USING btree
  (taxa_taxon_list_external_key COLLATE pg_catalog."default");

-- Index: ix_cache_occurrences_functional_taxon_group_id

-- DROP INDEX ix_cache_occurrences_functional_taxon_group_id;

CREATE INDEX ix_cache_occurrences_functional_taxon_group_id
  ON cache_occurrences_functional
  USING btree
  (taxon_group_id);

-- Index: ix_cache_occurrences_functional_updated_on

-- DROP INDEX ix_cache_occurrences_functional_updated_on;

CREATE INDEX ix_cache_occurrences_functional_updated_on
  ON cache_occurrences_functional
  USING btree
  (updated_on);

-- Index: ix_cache_occurrences_functional_verified_on

-- DROP INDEX ix_cache_occurrences_functional_verified_on;

CREATE INDEX ix_cache_occurrences_functional_verified_on
  ON cache_occurrences_functional
  USING btree
  (verified_on);

Here's an example query:

SELECT o.id
  FROM cache_occurrences_functional o
  WHERE o.website_id in (101,12,24,14,8,6,17,25,11,3,7,30,40,16,27,34,5,43,13,41,29,33,44,32,42,47,54,28,51,49,59,65,68,73,75,9,71,83,87,72,97,69,23,10)
AND o.record_status='C' and o.record_substatus is null and (o.query<>'Q' or o.query is null)
AND o.taxa_taxon_list_external_key in ('NBNSYS0000008324')
AND o.media_count>0
 ORDER BY o.id DESC LIMIT 30

and a link to a query plan:
https://explain.depesz.com/s/LuK7

Interestingly if I deliberately prevent the index being scanned by sorting by o.id+0, then I get good performance because the planner uses the column indexes to filter first:
SELECT o.id
  FROM cache_occurrences_functional o
  WHERE o.website_id in (101,12,24,14,8,6,17,25,11,3,7,30,40,16,27,34,5,43,13,41,29,33,44,32,42,47,54,28,51,49,59,65,68,73,75,9,71,83,87,72,97,69,23,10)
AND o.record_status='C' and o.record_substatus is null and (o.query<>'Q' or o.query is null)
AND o.taxa_taxon_list_external_key in ('NBNSYS0000008324')
AND o.media_count>0
 ORDER BY o.id+0 DESC LIMIT 30

The "fixed" plan:
https://explain.depesz.com/s/7KAy

Unfortunately this way of hacking the query to prevent the index scan backward makes other filters with more than 30 records in the results set much slower so it is not an option.

Any ideas on indexing strategies or ways of restructuring the database schema to cope with this scenario would be much appreciated.

Regards
John