On Fri, 21 Jun 2019 18:14:46 +0800 Yafang Shao <laoar.shao@xxxxxxxxx> wrote:

> There're six different reclaim paths by now,
>  - kswapd reclaim path
>  - node reclaim path
>  - hibernate preallocate memory reclaim path
>  - direct reclaim path
>  - memcg reclaim path
>  - memcg softlimit reclaim path
>
> The slab caches reclaimed in these paths are only calculated in the
> first three paths.
>
> There're some drawbacks if we don't calculate the reclaimed slab caches.
>  - The sc->nr_reclaimed isn't correct if there're some slab caches
>    reclaimed in this path.
>  - The slab caches may be reclaimed thoroughly if there're lots of
>    reclaimable slab caches and few page caches.
>    Let's take an easy example for this case.
>    If one memcg is full of slab caches and the limit of it is 512M, in
>    other words there're approximately 512M slab caches in this memcg.
>    Then the limit of the memcg is reached and the memcg reclaim begins,
>    and then in this memcg reclaim path it will continuously reclaim the
>    slab caches until the sc->priority drops to 0.
>    After this reclaim stops, you will find there're few slab caches left,
>    which is less than 20M in my test case.
>    While after this patch is applied the number is greater than 300M and
>    the sc->priority only drops to 3.

I got a bit exhausted checking that none of these six callsites can
scribble on some caller's value of current->reclaim_state.  How about
we do it at runtime?

From: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Subject: mm/vmscan.c: add checks for incorrect handling of current->reclaim_state

Six sites are presently altering current->reclaim_state.  There is a
risk that one function stomps on a caller's value.  Use a helper
function to catch such errors.

Cc: Yafang Shao <laoar.shao@xxxxxxxxx>
Cc: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 mm/vmscan.c |   37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

--- a/mm/vmscan.c~mm-vmscanc-add-checks-for-incorrect-handling-of-current-reclaim_state
+++ a/mm/vmscan.c
@@ -177,6 +177,18 @@ unsigned long vm_total_pages;
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 
 /*
@@ -3194,13 +3206,13 @@ unsigned long try_to_free_pages(struct z
 	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
 		return 1;
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
 }
@@ -3223,7 +3235,7 @@ unsigned long mem_cgroup_shrink_node(str
 	};
 	unsigned long lru_pages;
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
@@ -3245,7 +3257,7 @@ unsigned long mem_cgroup_shrink_node(str
 					cgroup_ino(memcg->css.cgroup),
 					sc.nr_reclaimed);
 
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
@@ -3274,7 +3286,7 @@ unsigned long try_to_free_mem_cgroup_pag
 		.may_shrinkslab = 1,
 	};
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -3299,7 +3311,7 @@ unsigned long try_to_free_mem_cgroup_pag
 	trace_mm_vmscan_memcg_reclaim_end(
 				cgroup_ino(memcg->css.cgroup),
 				nr_reclaimed);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
 }
@@ -3501,7 +3513,7 @@ static int balance_pgdat(pg_data_t *pgda
 		.may_unmap = 1,
 	};
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();
 
@@ -3683,7 +3695,7 @@ out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	/*
 	 * Return the order kswapd stopped reclaiming at as
@@ -3945,17 +3957,16 @@ unsigned long shrink_all_memory(unsigned
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
 
 	fs_reclaim_acquire(sc.gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	p->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
-	p->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
 
@@ -4144,7 +4155,7 @@ static int __node_reclaim(struct pglist_
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	p->flags |= PF_SWAPWRITE;
-	p->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(p, &sc.reclaim_state);
 
 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 		/*
@@ -4156,7 +4167,7 @@ static int __node_reclaim(struct pglist_
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
-	p->reclaim_state = NULL;
+	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
_
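
For anyone who wants to poke at the guard's semantics without building
a kernel, here is a minimal standalone sketch of the same pattern.  The
stub structs and the warn_once() macro below are illustrative stand-ins
for the kernel's task_struct, reclaim_state and WARN_ON_ONCE(), not the
real definitions (the real WARN_ON_ONCE() also dumps a stack trace,
which this stand-in doesn't attempt):

#include <stdio.h>

struct reclaim_state {
	unsigned long reclaimed_slab;
};

struct task_struct {
	struct reclaim_state *reclaim_state;
};

/* Userspace stand-in for WARN_ON_ONCE(): report at most once per site */
#define warn_once(cond, msg) do {				\
	static int warned;					\
	if ((cond) && !warned) {				\
		warned = 1;					\
		fprintf(stderr, "WARNING: %s\n", msg);		\
	}							\
} while (0)

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite: a callee would clobber its caller's state */
	warn_once(rs && task->reclaim_state, "reclaim_state overwritten");

	/* Check for the nulling of an already-nulled member */
	warn_once(!rs && !task->reclaim_state, "reclaim_state already NULL");

	task->reclaim_state = rs;
}

int main(void)
{
	struct task_struct tsk = { .reclaim_state = NULL };
	struct reclaim_state outer = { 0 }, inner = { 0 };

	set_task_reclaim_state(&tsk, &outer);	/* OK: NULL -> &outer */
	set_task_reclaim_state(&tsk, &inner);	/* fires: stomps on &outer */
	set_task_reclaim_state(&tsk, NULL);	/* OK: &inner -> NULL */
	set_task_reclaim_state(&tsk, NULL);	/* fires: already NULL */
	return 0;
}

Compiled and run, the second and fourth calls each print a warning: the
second models a callee stomping on its caller's reclaim_state, the
fourth a double clear, which is exactly the misuse the runtime checks
are meant to catch.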