From: Johannes Weiner <hannes@xxxxxxxxxxx> Subject: mm: vmscan: replace shrink_node() loop with a retry jump Most of the function body is inside a loop, which imposes an additional indentation and scoping level that makes the code a bit hard to follow and modify. The looping only happens in case of reclaim-compaction, which isn't the common case. So rather than adding yet another function level to the reclaim path and have every reclaim invocation go through a level that only exists for one specific cornercase, use a retry goto. Link: http://lkml.kernel.org/r/20191022144803.302233-6-hannes@xxxxxxxxxxx Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> Reviewed-by: Roman Gushchin <guro@xxxxxx> Reviewed-by: Shakeel Butt <shakeelb@xxxxxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/vmscan.c | 237 ++++++++++++++++++++++++-------------------------- 1 file changed, 118 insertions(+), 119 deletions(-) --- a/mm/vmscan.c~mm-vmscan-replace-shrink_node-loop-with-a-retry-jump +++ a/mm/vmscan.c @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_dat static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; + struct mem_cgroup *root = sc->target_mem_cgroup; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; + struct mem_cgroup *memcg; +again: + memset(&sc->nr, 0, sizeof(sc->nr)); - do { - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup *memcg; - - memset(&sc->nr, 0, sizeof(sc->nr)); - - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - - memcg = mem_cgroup_iter(root, NULL, NULL); - do { - unsigned long reclaimed; - unsigned long scanned; - - switch (mem_cgroup_protected(root, memcg)) { - case MEMCG_PROT_MIN: - /* - * Hard protection. - * If there is no reclaimable memory, OOM. - */ - continue; - case MEMCG_PROT_LOW: - /* - * Soft protection. - * Respect the protection only as long as - * there is an unprotected supply - * of reclaimable memory from other cgroups. - */ - if (!sc->memcg_low_reclaim) { - sc->memcg_low_skipped = 1; - continue; - } - memcg_memory_event(memcg, MEMCG_LOW); - break; - case MEMCG_PROT_NONE: - /* - * All protection thresholds breached. We may - * still choose to vary the scan pressure - * applied based on by how much the cgroup in - * question has exceeded its protection - * thresholds (see get_scan_count). - */ - break; - } + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - reclaimed = sc->nr_reclaimed; - scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc); - - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); - - /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); - - } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); - - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - - if (sc->nr_reclaimed - nr_reclaimed) - reclaimable = true; + memcg = mem_cgroup_iter(root, NULL, NULL); + do { + unsigned long reclaimed; + unsigned long scanned; - if (current_is_kswapd()) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: /* - * If reclaim is isolating dirty pages under writeback, - * it implies that the long-lived page allocation rate - * is exceeding the page laundering rate. Either the - * global limits are not being effective at throttling - * processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing - * device. The only option is to throttle from reclaim - * context which is not ideal as there is no guarantee - * the dirtying process is throttled in the same way - * balance_dirty_pages() manages. - * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will - * count the number of pages under pages flagged for - * immediate reclaim and stall if any are encountered - * in the nr_immediate check below. + * Hard protection. + * If there is no reclaimable memory, OOM. */ - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - + continue; + case MEMCG_PROT_LOW: /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). */ - if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); + break; } + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + shrink_node_memcg(pgdat, memcg, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. */ - if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, root, true); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it - * starts encountering unqueued dirty pages or cycling through - * the LRU too quickly. + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && pgdat_memcg_congested(pgdat, root)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } - } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc)); + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)) + goto again; /* * Kswapd gives up on balancing particular nodes after too _