
Re: Batch update million records in prd DB

Hi Michael

Here are the script and the EXPLAIN ANALYZE output, please have a look. It seems the later runs take more time because the Filter steps remove far more rows.

DO $MAIN$
DECLARE
    affect_count integer := 1000;
    processed_row_count integer := 0;
BEGIN
    LOOP
        EXIT WHEN affect_count = 0;
        -- Soft-delete the next batch of up to 1000 matching rows.
        UPDATE
            app gaa
        SET
            deleted_at = (
                SELECT
                    CAST(extract(epoch FROM now() at time zone 'utc') * 1000000000 AS bigint))
        WHERE
            gaa.id IN (
                SELECT
                    gab.id
                FROM
                    app gab
                LEFT JOIN pol gp ON gab.policy_id = gp.id
                WHERE
                    gab.policy_type = 'policy.protection.total'
                    AND gp.name LIKE 'Mobile backup%'
                    AND gab.deleted_at IS NULL
                    AND gp.deleted_at IS NOT NULL
                LIMIT 1000);
        GET DIAGNOSTICS affect_count = ROW_COUNT;
        -- Commit each batch so locks and row versions don't pile up.
        COMMIT;
        processed_row_count := processed_row_count + affect_count;
    END LOOP;
    RAISE NOTICE 'total processed rows %', processed_row_count;
END;
$MAIN$;
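--The plans below were presumably captured by running one batch of the UPDATE on its
--own under EXPLAIN ANALYZE; a minimal sketch of how to reproduce that is shown here
--(the BUFFERS option is only a suggestion, not something used above):
EXPLAIN (ANALYZE, BUFFERS)
UPDATE app gaa
SET deleted_at = CAST(extract(epoch FROM now() at time zone 'utc') * 1000000000 AS bigint)
WHERE gaa.id IN (
    SELECT gab.id
    FROM app gab
    LEFT JOIN pol gp ON gab.policy_id = gp.id
    WHERE gab.policy_type = 'policy.protection.total'
      AND gp.name LIKE 'Mobile backup%'
      AND gab.deleted_at IS NULL
      AND gp.deleted_at IS NOT NULL
    LIMIT 1000);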

--early explain plan: a 1000-record batch update takes 156.488 ms
--------------------------------------------------------------------
 Update on app gaa  (cost=3307.57..6085.41 rows=1000 width=3943) (actual time=156.347..156.347 rows=0 loops=1)
   InitPlan 1 (returns $0)
     ->  Result  (cost=0.00..0.02 rows=1 width=8) (actual time=0.007..0.008 rows=1 loops=1)
   ->  Nested Loop  (cost=3307.54..6085.39 rows=1000 width=3943) (actual time=18.599..33.987 rows=1000 loops=1)
         ->  HashAggregate  (cost=3306.99..3316.99 rows=1000 width=98) (actual time=18.554..19.085 rows=1000 loops=1)
               Group Key: ("ANY_subquery".id)::text
               ->  Subquery Scan on "ANY_subquery"  (cost=2.17..3304.49 rows=1000 width=98) (actual time=0.041..18.052 rows=1000 loops=1)
                     ->  Limit  (cost=2.17..3294.49 rows=1000 width=37) (actual time=0.030..17.827 rows=1000 loops=1)
                           ->  Merge Join  (cost=2.17..877396.03 rows=266497 width=37) (actual time=0.029..17.764 rows=1000 loops=1)
                                 Merge Cond: ((gab.policy_id)::text = (gp.id)::text)
                                 ->  Index Scan using tmp_uq_policy_id_context2 on app gab  (cost=0.56..487631.06 rows=3151167 width=74) (actual time=0.018..9.192 rows=3542 loops=1)
                                       Filter: ((policy_type)::text = 'policy.protection.total'::text)
                                       Rows Removed by Filter: 2064
                                 ->  Index Scan using pol_pkey on pol gp  (cost=0.56..378322.78 rows=361105 width=37) (actual time=0.008..7.380 rows=1006 loops=1)
                                       Filter: ((deleted_at IS NOT NULL) AND (name ~~ 'Mobile backup%'::text))
                                       Rows Removed by Filter: 3502
         ->  Index Scan using app2_pkey on app gaa  (cost=0.56..2.77 rows=1 width=3874) (actual time=0.014..0.014 rows=1 loops=1000)
               Index Cond: ((id)::text = ("ANY_subquery".id)::text)
 Planning Time: 0.852 ms
 Execution Time: 156.488 ms

--later explain plan: a 1000-record batch update takes 13301.600 ms
--------------------------------------------------------------------------
 Update on app gaa  (cost=3789.35..6567.19 rows=1000 width=3980) (actual time=13301.466..13301.466 rows=0 loops=1)
   InitPlan 1 (returns $0)
     ->  Result  (cost=0.00..0.02 rows=1 width=8) (actual time=0.007..0.007 rows=1 loops=1)
   ->  Nested Loop  (cost=3789.32..6567.17 rows=1000 width=3980) (actual time=12881.004..12896.440 rows=1000 loops=1)
         ->  HashAggregate  (cost=3788.77..3798.77 rows=1000 width=98) (actual time=12880.958..12881.378 rows=1000 loops=1)
               Group Key: ("ANY_subquery".id)::text
               ->  Subquery Scan on "ANY_subquery"  (cost=2.17..3786.27 rows=1000 width=98) (actual time=12850.663..12880.505 rows=1000 loops=1)
                     ->  Limit  (cost=2.17..3776.27 rows=1000 width=37) (actual time=12850.656..12880.233 rows=1000 loops=1)
                           ->  Merge Join  (cost=2.17..862421.74 rows=228510 width=37) (actual time=12850.655..12880.162 rows=1000 loops=1)
                                 Merge Cond: ((gab.policy_id)::text = (gp.id)::text)
                                 ->  Index Scan using tmp_uq_policy_id_context2 on app gab  (cost=0.56..474159.31 rows=2701994 width=74) (actual time=0.017..6054.269 rows=2302988 loops=1)
                                       Filter: ((policy_type)::text = 'policy.protection.total'::text)
                                       Rows Removed by Filter: 1822946
                                 ->  Index Scan using pol_pkey on pol gp  (cost=0.56..378322.78 rows=361105 width=37) (actual time=0.007..5976.346 rows=936686 loops=1)
                                       Filter: ((deleted_at IS NOT NULL) AND (name ~~ 'Mobile backup%'::text))
                                       Rows Removed by Filter: 3152553
         ->  Index Scan using app2_pkey on app gaa  (cost=0.56..2.77 rows=1 width=3911) (actual time=0.014..0.014 rows=1 loops=1000)
               Index Cond: ((id)::text = ("ANY_subquery".id)::text)
 Planning Time: 0.785 ms
 Execution Time: 13301.600 ms
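
--In the later plan the two index scans walk and discard millions of already-processed
--rows before the merge join finds the next 1000 matches, which is where the extra time
--goes. A partial index matching the batch predicate might keep each scan short; this is
--only a sketch, and the index name and column choice are assumptions:
CREATE INDEX CONCURRENTLY tmp_app_pending_total_idx
    ON app (policy_id)
    WHERE policy_type = 'policy.protection.total'
      AND deleted_at IS NULL;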
 

--we also tried a temporary-table solution; script below
DO $MAIN$
DECLARE
    affect_count integer := 1000;        -- initialised so the first EXIT WHEN test is not NULL
    offset_count integer := 0;
    chunk_size CONSTANT integer := 1000;
    sleep_sec CONSTANT numeric := 0.1;
BEGIN
    -- Collect the ids to soft-delete once, then update them in chunks.
    DROP TABLE IF EXISTS tmp_usage_tbl;
    CREATE TEMPORARY TABLE tmp_usage_tbl(id character varying(36));

    INSERT INTO tmp_usage_tbl(id)
    SELECT
        gab.id
    FROM
        app gab
    LEFT JOIN pol gp ON gab.policy_id = gp.id
    WHERE
        gab.policy_type = 'policy.protection.total'
        AND gp.name LIKE 'Mobile backup%'
        AND gab.deleted_at IS NULL
        AND gp.deleted_at IS NOT NULL;

    LOOP
        EXIT WHEN affect_count = 0;

        UPDATE
            app gaa
        SET
            deleted_at = (
                SELECT
                    CAST(extract(epoch FROM now() at time zone 'utc') * 1000000000 AS bigint))
        WHERE
            gaa.id IN (
                SELECT id
                FROM tmp_usage_tbl
                ORDER BY id
                LIMIT chunk_size OFFSET offset_count);

        GET DIAGNOSTICS affect_count = ROW_COUNT;
        COMMIT;

        offset_count := offset_count + chunk_size;

        -- Pause between batches to reduce load on the production DB.
        PERFORM pg_sleep(sleep_sec);
    END LOOP;
END;
$MAIN$;

--each 1000-record batch takes around 2000 ms (roughly the same every time with the temporary table)
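
--One thing to watch with the temporary-table loop: LIMIT ... OFFSET re-sorts the temp
--table and skips offset_count rows on every pass, so late batches do progressively more
--work. A sketch of an alternative loop body that consumes the temp table instead of
--offsetting into it (assuming the ids in tmp_usage_tbl are unique) is:
WITH batch AS (
    DELETE FROM tmp_usage_tbl
    WHERE id IN (SELECT id FROM tmp_usage_tbl LIMIT chunk_size)
    RETURNING id)
UPDATE app gaa
SET deleted_at = CAST(extract(epoch FROM now() at time zone 'utc') * 1000000000 AS bigint)
FROM batch
WHERE gaa.id = batch.id;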

Which solution is better, please?

Michael Lewis <mlewis@xxxxxxxxxxx> wrote on Sat, Feb 27, 2021 at 1:46 AM:
It might be a concern, but generally that should be a row-level lock and only block other update/delete operations on those rows. It might be helpful to look at the explain analyze output early on vs later in the process. It might be that you are getting very few HOT updates and indexes are being updated constantly.
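
--(For the HOT-update point above, a quick check against pg_stat_user_tables shows how
--many of the updates are HOT; the table name here is an assumption:)
SELECT relname, n_tup_upd, n_tup_hot_upd
FROM pg_stat_user_tables
WHERE relname = 'app';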
