Efficient rows filter for array inclusion with gin index

Shanti-Dominique <shanti@xxxxxxxxxx> · Wed, 28 Feb 2024 12:05:27 +0100

Hello,

I have a problem writing performant queries that requires to test value 
inclusion within an array.

First: the schema consists of an "items" table with a "ref_id" primary 
key (uuid, primary key). The items are hierarchical and can have a 
"parent_ref_id" (uuid) that references their parent. In order to avoid 
recursive queries,there is a secondary table "item_paths" populated via 
triggers that have two columns "ref_id" (uuid that references a row in 
"items") and "item_path" (uuid[] which contains the path of items from 
the root to the item included, a gin index).

On order to test if an item i2 isa descendant of another item i1, I see 
two ways to query the database:

1)
    SELECT  *
    FROM    items i1
            JOIN item_paths p1 ON i1.ref_id = p1.ref_id
            JOIN items i2 ON i2.ref_id = ANY (p1.item_path)
    WHERE   ...

2)
    SELECT  *
    FROM    items i1
            JOIN item_paths p1 ON i1.ref_id = p1.ref_id
            JOIN items i2 ON ARRAY[i2.ref_id] <@ p1.item_path
    WHERE   ...

The is that neither of these two solutions seems good for the general case:

1) does not make use of the gin index as the "= ANY(...)" construct is 
not part of the supported operators. What it seems to be doing is that 
it runs a sequential scan against p1 while it uses the index to find the 
item i2.

2) uses the operator <@ which is supported by the gin index, the test 
for inclusion is fast and the query does not run a sequential scan over 
the whole "item_paths" table. However, because of the ARRAY[i2.ref_id] 
construct, it performs a sequential scan on i2.

I use this kind of construct in many places in my code and I'd welcome a 
solution that would use the index in all cases. Is it possible?

Thank you.

Here below a SQL file that demonstrate the problem using EXPLAIN:

-- Database Schema

SET enable_seqscan  = off;

CREATE TABLE items (
    ref_id uuid DEFAULT public.gen_random_uuid() NOT NULL,
    name character varying,
    parent_ref_id uuid
);

CREATE TABLE item_paths (
    ref_id uuid NOT NULL,
    item_path uuid[]
);

CREATE INDEX items_ref_id_idx ON items USING btree (ref_id);
CREATE INDEX items_name_idx ON items USING btree (name);
CREATE INDEX items_parent_ref_id_idx ON items USING btree (parent_ref_id);
CREATE INDEX item_paths_ref_id_idx ON item_paths USING btree (ref_id);
CREATE INDEX item_paths_item_path_idx ON item_paths USING gin (item_path);

--------------------------------------------------------------------------------

-- Fast: select the small number of items i1, join with their paths and 
from the
-- path values take all the values from i2 using the index on i2

EXPLAIN
SELECT  i2.*
FROM    items i1
        JOIN item_paths p1 ON i1.ref_id = p1.ref_id
        JOIN items i2 ON i2.ref_id = ANY (p1.item_path)
WHERE   i1.name = 'a';

-- Nested Loop  (cost=5.52..102.86 rows=903 width=64)
--   ->  Nested Loop  (cost=5.37..46.14 rows=21 width=32)
--         ->  Bitmap Heap Scan on items i1 (cost=4.18..12.64 rows=4 
width=16)
--               Recheck Cond: ((name)::text = 'a'::text)
--               ->  Bitmap Index Scan on items_name_idx 
(cost=0.00..4.18 rows=4 width=0)
--                     Index Cond: ((name)::text = 'a'::text)
--         ->  Bitmap Heap Scan on item_paths p1 (cost=1.19..8.32 rows=5 
width=48)
--               Recheck Cond: (ref_id = i1.ref_id)
--               ->  Bitmap Index Scan on item_paths_ref_id_idx  
(cost=0.00..1.19 rows=5 width=0)
--                     Index Cond: (ref_id = i1.ref_id)
--   ->  Index Scan using items_ref_id_idx on items i2 (cost=0.15..2.27 
rows=43 width=64)
--         Index Cond: (ref_id = ANY (p1.item_path))

--------------------------------------------------------------------------------

-- Slow: select the small number of items then performs a sequantial 
scan on the
-- item_paths to find the paths that have this item ref_id in their path.

EXPLAIN
SELECT  item_paths.*
FROM    items
        JOIN item_paths ON items.ref_id = ANY (item_paths.item_path)
WHERE   items.name = 'a';

-- Nested Loop  (cost=10000000004.18..10000000140.35 rows=209 width=48)
--   Join Filter: (items.ref_id = ANY (item_paths.item_path))
--   ->  Seq Scan on item_paths (cost=10000000000.00..10000000020.70 
rows=1070 width=48)
--   ->  Materialize  (cost=4.18..12.66 rows=4 width=16)
--         ->  Bitmap Heap Scan on items  (cost=4.18..12.64 rows=4 width=16)
--               Recheck Cond: ((name)::text = 'a'::text)
--               ->  Bitmap Index Scan on items_name_idx 
(cost=0.00..4.18 rows=4 width=0)
--                     Index Cond: ((name)::text = 'a'::text)

--------------------------------------------------------------------------------

-- Slow: Find by index i1 and p1, then perform a sequantial scan on i2 
(probably
-- caused by the construct ARRAY[i1.ref_id]) to find the matching item

EXPLAIN
SELECT  i2.*
FROM    items i1
        JOIN item_paths p1 ON i1.ref_id = p1.ref_id
        JOIN items i2 ON ARRAY[i2.ref_id] <@ p1.item_path
WHERE   i1.name = 'a';

-- Nested Loop  (cost=10000000005.37..10000000342.19 rows=92 width=64)
--   Join Filter: (ARRAY[i2.ref_id] <@ p1.item_path)
--   ->  Seq Scan on items i2 (cost=10000000000.00..10000000018.80 
rows=880 width=64)
--   ->  Materialize  (cost=5.37..46.24 rows=21 width=32)
--         ->  Nested Loop  (cost=5.37..46.14 rows=21 width=32)
--               ->  Bitmap Heap Scan on items i1 (cost=4.18..12.64 
rows=4 width=16)
--                     Recheck Cond: ((name)::text = 'a'::text)
--                     ->  Bitmap Index Scan on items_name_idx  
(cost=0.00..4.18 rows=4 width=0)
--                           Index Cond: ((name)::text = 'a'::text)
--               ->  Bitmap Heap Scan on item_paths p1 (cost=1.19..8.32 
rows=5 width=48)
--                     Recheck Cond: (ref_id = i1.ref_id)
--                     ->  Bitmap Index Scan on item_paths_ref_id_idx  
(cost=0.00..1.19 rows=5 width=0)
--                           Index Cond: (ref_id = i1.ref_id)

--------------------------------------------------------------------------------

-- Fast: Use the index to find the item and then use the gin index to 
find the
-- path.

EXPLAIN
SELECT  item_paths.*
FROM    items
        JOIN item_paths ON ARRAY[items.ref_id] <@ item_paths.item_path
WHERE   items.name = 'a';

-- Nested Loop  (cost=9.22..61.54 rows=21 width=48)
--   ->  Bitmap Heap Scan on items  (cost=4.18..12.64 rows=4 width=16)
--         Recheck Cond: ((name)::text = 'a'::text)
--         ->  Bitmap Index Scan on items_name_idx (cost=0.00..4.18 
rows=4 width=0)
--               Index Cond: ((name)::text = 'a'::text)
--   ->  Bitmap Heap Scan on item_paths  (cost=5.04..12.17 rows=5 width=48)
--         Recheck Cond: (ARRAY[items.ref_id] <@ item_path)
--         ->  Bitmap Index Scan on item_paths_item_path_idx 
(cost=0.00..5.04 rows=5 width=0)
--               Index Cond: (item_path @> ARRAY[items.ref_id])

--------------------------------------------------------------------------------

DROP TABLE items;
DROP TABLE item_paths;