Re: More than one UNIQUE key when matching items..

Ron Clarke <rclarkeai@xxxxxxxxx> · Mon, 22 Mar 2021 13:22:43 +0000

Hey thanks for working out a solution to this deceptive problem.   One of those you expect to be simple, but then all of a sudden it isn't.
Best regards
Ron

On Sat, 20 Mar 2021 at 19:01, Allan Kamau <kamauallan@xxxxxxxxx> wrote:

On Sat, Mar 20, 2021 at 6:52 PM Ron Clarke <rclarkeai@xxxxxxxxx> wrote:
/*I'm trying to port a system from SQL server, and at the same time better learn postgreSQL.

I've come across a problem that is easily solved in that world, but I am struggling to find an approach in postgres that works.

We have 2 sets of events A and B (sets), they have a shared number (ncode), both have unique Id's

We want to link items of set A to those of set B, but each item of each set can only be linked once.    That is we do not want to link all set 'A' items to all set 'B' Items with the same code.		  

	 In SQL Server this is easy, we insert the records into a temporary table with separate Unique indexes on the id for set a and the ids for set b and put the 'ignore_dup_key' on which tells SQL Server to ignore duplicate rows and carry on.

The nearest to this at first appears to be the 

ON CONFLICT ON CONSTRAINT IGNORE in Postgres. But this only works with a single constraint, at a time i.e. we can't set the ON CONFLICT ON CONSTRAINT IGNORE to work with multiple UNIQUE indexes.

To show the problem:

	 I'm using PostgreSQL version 11.

*/
-- source data
WITH sd AS (
	SELECT iid, s, ncode FROM (
		VALUES (1, 'A', 10), 
       (2, 'A', 30), 
       (3, 'A', 10),
       (4, 'B', 10),
       (5, 'B', 20),
       (6, 'B', 10)
		) 
	AS tx (iid, s, ncode))
SELECT iid, s, ncode FROM sd		   

/*     The target result would be : 

		   id:1, A, 10 this matches id:4, B, 10
  id:3, A, 10 this matches id:6, B, 10
*/

--  Example to get the wrong answer, i.e. both sets of links

WITH 
	sd (i, s, n ) AS (
	SELECT iid, s, ncode FROM (
		VALUES (1, 'A', 10), 
			   (2, 'A', 30), 
			   (3, 'A', 10),
			   (4, 'B', 10),
			   (5, 'B', 20),
			   (6, 'B', 10)
		) 
	AS tx (iid, s, ncode))
	,
	x AS ( SELECT 
   		ax.i as ia,
    ax.s as sa,
    ax.n as na,
    bx.i as ib,
    bx.s as sb,
    bx.n as nb,
ROW_NUMBER () OVER (
			PARTITION BY bx.i
			ORDER BY
				ax.i ) as rx
   		FROM sd AS ax  
   		INNER JOIN sd AS bx ON ax.n = bx.n and ax.i != bx.i and bx.s = 'B'
   		WHERE ax.s = 'A'
		)
		SELECT ia,ib, na, rx FROM x
		;		   

/*	   I've tried using a recursive CTE where I'm trying to exclude results from the result set that have already been identified, but I can't get an allowed syntax. 
	   Doesn't seem to allow joins to the recursive term to exclude results.
*/	

/* 	   I've tried Unique and Exclusion constraints on temporary table, e.g */
--     similar Example to get the wrong answer, i.e. both sets of links

DROP TABLE IF EXISTS links ;

CREATE TEMPORARY TABLE links 
	 (    mid serial ,
			ia int ,
			-- ia int UNIQUE,
			ib int ,
			-- ib int UNIQUE,
	    	EXCLUDE USING gist (ia WITH =, ib WITH =)

	  	) ;		   

WITH 
	sd (i, s, n ) AS (
	SELECT iid, side, ncode FROM (
		VALUES (1, 'A', 10), 
			   (2, 'A', 30), 
			   (3, 'A', 10),
			   (4, 'B', 10),
			   (5, 'B', 20),
			   (6, 'B', 10)
		) 
	AS tx (iid, side, ncode))
	,
	x AS (		   
	SELECT 
   		ax.i as ia,
		ax.s as sa,
		ax.n as na,
		bx.i as ib,
		bx.s as sb,
		bx.n as nb,
		ROW_NUMBER () OVER (
			PARTITION BY bx.i
			ORDER BY
				ax.i
		) as rx
   		FROM sd AS ax  
   		INNER JOIN sd AS bx ON ax.n = bx.n and ax.i != bx.i and bx.s = 'B'
   		WHERE ax.s = 'A'
		)
		-- SELECT * FROM x
		INSERT INTO links(ia,ib)
		SELECT ia, ib FROM x
		ON CONFLICT ON CONSTRAINT links_ia_ib_excl DO NOTHING;		   	   

 		--
		SELECT * from links;		   	   

/* 	   I've also tried and failed to use array(ia,ib) within or as computed column of an Exclusion constraint of && s on temporary table, e.g 
	   but can't find any syntax that doesn't result in an error    
		   */

DROP TABLE IF EXISTS links ;

CREATE TEMPORARY TABLE links 
	 (    mid serial ,
			ia int ,
			-- ia int UNIQUE,
			ib int ,
			-- ib int UNIQUE,
	    	 ix int[],
	   		EXCLUDE USING gist (ix WITH &&)
	  	) ;		   

-- This gives me:
-- ERROR:  data type integer[] has no default operator class for access method "gist"

-- I have the btree_gist extension installed		   

/*

	I appreciate I could create a cursor from a list of proposed links and step through each one, checking if the id value has been "used up"
	but I am trying to keep this as a set based operation to give me the results in one statement.

	There are some similar questions w.r.t. duplicate detection, but these again seem to be solved by evaluating each proposed record individually.
    If that's just what I have to do then so be it. There is probably a simple 'postgreSQL' freindly approach I'm still yet to discover having spent
	too long in Sybase and SQL Server worlds.
		  Thanks for looking at this

*/		   

Hi Ron,

How about the code below.
It may require testing with more data.

WITH _sd AS (
    SELECT iid, s, ncode FROM (
    VALUES (1, 'A', 10),
           (2, 'A', 30),
           (3, 'A', 10),
           (4, 'B', 10),
           (5, 'B', 20),
           (6, 'B', 10)
    )
    AS tx (iid, s, ncode)
)
--SELECT a.iid, a.s, a.ncode FROM _sd a;
,_sd__ab AS
(
    SELECT
        a.iid as iid__a, a.s AS s__a, a.ncode AS ncode__a
        ,b.iid as iid__b, b.s AS s__b, b.ncode AS ncode__b
    FROM _sd a
    JOIN _sd b ON b.ncode=a.ncode AND b.s>a.s
)
,_sd__ab__dist AS
(
    SELECT
        DISTINCT ON(
            a.iid__a
            ,a.iid__b
        )
        a.iid__a, a.s__a, a.ncode__a
        ,a.iid__b, a.s__b, a.ncode__b
    FROM _sd__ab a
    ORDER BY
        a.iid__a, a.iid__b, a.s__a, a.ncode__a
        , a.s__b, a.ncode__b
)
,_sd__ab__dist2 AS
(
    SELECT
        a.iid__a, a.s__a, a.ncode__a
        ,a.iid__b, a.s__b, a.ncode__b
        ,a.iid__a__b__row_number1
        ,a.iid__a__b__row_number2
    FROM
    (
        SELECT
            a.iid__a, a.s__a, a.ncode__a
            ,a.iid__b, a.s__b, a.ncode__b
            ,ROW_NUMBER()OVER(PARTITION BY a.ncode__a,a.iid__a ORDER BY a.iid__b)AS iid__a__b__row_number1
            ,ROW_NUMBER()OVER(PARTITION BY a.ncode__a,a.iid__b ORDER BY a.iid__a)AS iid__a__b__row_number2
        FROM _sd__ab__dist a
    )a
)
SELECT a.*,ROW_NUMBER()OVER(ORDER BY a.iid__a)AS row_number FROM _sd__ab__dist2 a WHERE a.iid__a__b__row_number1=iid__a__b__row_number2
;

Yields
 iid__a | s__a | ncode__a | iid__b | s__b | ncode__b | iid__a__b__row_number1 | iid__a__b__row_number2 | row_number 
--------+------+----------+--------+------+----------+------------------------+------------------------+------------
      1 | A    |       10 |      4 | B    |       10 |                      1 |                      1 |          1
      3 | A    |       10 |      6 | B    |       10 |                      2 |                      2 |          2
(2 rows)

Time: 2.394 ms

-Allan.