On Fri, Apr 23, 2021 at 02:57:07PM +0800, Xing Zhengjun wrote:
> On 4/23/2021 1:13 AM, Yu Zhao wrote:
> > On Thu, Apr 22, 2021 at 04:36:19PM +0800, Xing Zhengjun wrote:
> > > Hi,
> > >
> > > On a system with very few file pages (nr_active_file + nr_inactive_file
> > > < 100), it is easy to reproduce "nr_isolated_file > nr_inactive_file";
> > > too_many_isolated() then returns true, shrink_inactive_list() enters
> > > "msleep(100)", and long latency follows.
> > >
> > > The test case to reproduce it is very simple: allocate many huge pages
> > > (near the DRAM size), then free them, and repeat the same operation many
> > > times. For this test case, on a system with very few file pages
> > > (nr_active_file + nr_inactive_file < 100), I have dumped the numbers of
> > > active/inactive/isolated file pages during the whole test (see the
> > > attachments). In shrink_inactive_list(), "too_many_isolated" very easily
> > > returns true and the task enters "msleep(100)". In "too_many_isolated",
> > > sc->gfp_mask is 0x342cca (__GFP_IO and __GFP_FS are both set), so
> > > "inactive >>= 3" is applied and "isolated > inactive" easily becomes true.
> > >
> > > So I propose to set a threshold on the total number of file pages: on a
> > > system with fewer file pages than the threshold, bypass the 100 ms sleep.
> > > It is hard to pick a perfect number for the threshold, so I just give
> > > "256" as an example.
> > >
> > > I would appreciate your suggestions/comments. Thanks.
> >
> > Hi Zhengjun,
> >
> > It seems to me that using the number of isolated pages to keep a lid on
> > direct reclaimers is not a good solution. We shouldn't keep going in
> > that direction if we really want to fix the problem, because migration
> > can isolate many pages too, which in turn blocks page reclaim.
> >
> > Here is something that works a lot better. Please give it a try. Thanks.
>
> Thanks, I will try it with my test cases.

Thanks. I took care of my sloppiness from yesterday and tested the
following. It should apply cleanly and work well. Please let me know.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..48bb2b77389e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -832,6 +832,7 @@ typedef struct pglist_data {
 #endif
 
 	/* Fields commonly accessed by the page reclaim scanner */
+	atomic_t nr_reclaimers;
 
 	/*
 	 * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..3fcdfbee89c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1775,43 +1775,6 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
-/*
- * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get rescheduled. When there are massive number of tasks doing page
- * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
- * the LRU list will go small and be scanned faster than necessary, leading to
- * unnecessary swapping, thrashing and OOM.
- */
-static int too_many_isolated(struct pglist_data *pgdat, int file,
-		struct scan_control *sc)
-{
-	unsigned long inactive, isolated;
-
-	if (current_is_kswapd())
-		return 0;
-
-	if (!writeback_throttling_sane(sc))
-		return 0;
-
-	if (file) {
-		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
-		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
-	} else {
-		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
-		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
-	}
-
-	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
-	 */
-	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
-		inactive >>= 3;
-
-	return isolated > inactive;
-}
-
 /*
  * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
  * On return, @list is reused as a list of pages to be freed by the caller.
@@ -1911,20 +1874,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	bool file = is_file_lru(lru);
 	enum vm_event_item item;
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-	bool stalled = false;
-
-	while (unlikely(too_many_isolated(pgdat, file, sc))) {
-		if (stalled)
-			return 0;
-
-		/* wait a bit for the reclaimer. */
-		msleep(100);
-		stalled = true;
-
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
-	}
 
 	lru_add_drain();
 
@@ -2903,6 +2852,8 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	pg_data_t *last_pgdat = NULL;
+	bool should_retry = false;
+	int nr_cpus = num_online_cpus();
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2914,9 +2865,18 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		sc->gfp_mask |= __GFP_HIGHMEM;
 		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
 	}
-
+retry:
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					sc->reclaim_idx, sc->nodemask) {
+		/*
+		 * Shrink each node in the zonelist once. If the zonelist is
+		 * ordered by zone (not the default) then a node may be shrunk
+		 * multiple times but in that case the user prefers lower zones
+		 * being preserved.
+		 */
+		if (zone->zone_pgdat == last_pgdat)
+			continue;
+
 		/*
 		 * Take care memory controller reclaiming has small influence
 		 * to global LRU.
@@ -2941,16 +2901,28 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 				sc->compaction_ready = true;
 				continue;
 			}
+		}
 
-			/*
-			 * Shrink each node in the zonelist once. If the
-			 * zonelist is ordered by zone (not the default) then a
-			 * node may be shrunk multiple times but in that case
-			 * the user prefers lower zones being preserved.
-			 */
-			if (zone->zone_pgdat == last_pgdat)
-				continue;
+		/*
+		 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from
+		 * the LRU list and then get rescheduled. When there are massive
+		 * number of tasks doing page allocation, such sleeping direct
+		 * reclaimers may keep piling up on each CPU, the LRU list will
+		 * go small and be scanned faster than necessary, leading to
+		 * unnecessary swapping, thrashing and OOM.
+		 */
+		VM_BUG_ON(current_is_kswapd());
+		if (!atomic_add_unless(&zone->zone_pgdat->nr_reclaimers, 1, nr_cpus)) {
+			should_retry = true;
+			continue;
+		}
+
+		if (last_pgdat)
+			atomic_dec(&last_pgdat->nr_reclaimers);
+		last_pgdat = zone->zone_pgdat;
+
+		if (!cgroup_reclaim(sc)) {
 
 			/*
 			 * This steals pages from memory cgroups over softlimit
 			 * and returns the number of reclaimed pages and
@@ -2966,13 +2938,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		/* See comment about same check for global reclaim above */
-		if (zone->zone_pgdat == last_pgdat)
-			continue;
-		last_pgdat = zone->zone_pgdat;
 		shrink_node(zone->zone_pgdat, sc);
 	}
 
+	if (last_pgdat)
+		atomic_dec(&last_pgdat->nr_reclaimers);
+	else if (should_retry) {
+		/* wait a bit for the reclaimer. */
+		if (!schedule_timeout_killable(HZ / 10))
+			goto retry;
+
+		/* We are about to die and free our memory. Return now. */
+		sc->nr_reclaimed += SWAP_CLUSTER_MAX;
+	}
+
 	/*
 	 * Restore to original mask to avoid the impact on the caller if we
 	 * promoted it to __GFP_HIGHMEM.
@@ -4189,6 +4168,15 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	set_task_reclaim_state(p, &sc.reclaim_state);
 
 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+		int nr_cpus = num_online_cpus();
+
+		VM_BUG_ON(current_is_kswapd());
+
+		if (!atomic_add_unless(&pgdat->nr_reclaimers, 1, nr_cpus)) {
+			schedule_timeout_killable(HZ / 10);
+			goto out;
+		}
+
 		/*
 		 * Free memory by calling shrink node with increasing
 		 * priorities until we have enough memory freed.
@@ -4196,8 +4184,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 		do {
 			shrink_node(pgdat, &sc);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
-	}
 
+		atomic_dec(&pgdat->nr_reclaimers);
+	}
+out:
 	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
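
For context on why the old throttle trips so easily in the reported setup:
with fewer than 100 inactive file pages and both __GFP_IO and __GFP_FS set,
"inactive >>= 3" leaves at most 12 pages, while a single direct reclaimer can
have up to SWAP_CLUSTER_MAX (32) pages isolated, so "isolated > inactive" is
almost guaranteed. For comparison, below is a minimal sketch of the
threshold-bypass idea from the quoted message, applied to the old
too_many_isolated(); the "256" cutoff and the exact placement of the check
are illustrative assumptions, not the patch that was originally posted:

static int too_many_isolated(struct pglist_data *pgdat, int file,
		struct scan_control *sc)
{
	unsigned long inactive, isolated;

	if (current_is_kswapd())
		return 0;

	if (!writeback_throttling_sane(sc))
		return 0;

	if (file) {
		/*
		 * Illustrative bypass: with almost no file pages, the
		 * isolated-vs-inactive comparison below is meaningless and
		 * only produces msleep(100) stalls.
		 */
		if (node_page_state(pgdat, NR_ACTIVE_FILE) +
		    node_page_state(pgdat, NR_INACTIVE_FILE) < 256)
			return 0;

		inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
		isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
	} else {
		inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
		isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
	}

	/* GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages. */
	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		inactive >>= 3;

	return isolated > inactive;
}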