On Fri, 21 Jun 2019 18:14:46 +0800 Yafang Shao <laoar.shao@xxxxxxxxx> wrote:

> There're six different reclaim paths by now,
>  - kswapd reclaim path
>  - node reclaim path
>  - hibernate preallocate memory reclaim path
>  - direct reclaim path
>  - memcg reclaim path
>  - memcg softlimit reclaim path
>
> The slab caches reclaimed in these paths are only calculated in the
> first three paths.
>
> There're some drawbacks if we don't calculate the reclaimed slab caches.
>  - The sc->nr_reclaimed isn't correct if there're some slab caches
>    reclaimed in this path.
>  - The slab caches may be reclaimed thoroughly if there're lots of
>    reclaimable slab caches and few page caches.
>    Let's take an easy example for this case.
>    If one memcg is full of slab caches and the limit of it is 512M, in
>    other words there're approximately 512M slab caches in this memcg.
>    Then the limit of the memcg is reached and the memcg reclaim begins,
>    and then in this memcg reclaim path it will continuously reclaim the
>    slab caches until the sc->priority drops to 0.
>    After this reclaim stops, you will find there're few slab caches left,
>    which is less than 20M in my test case.
>    While after this patch is applied the number is greater than 300M and
>    the sc->priority only drops to 3.

I got a bit exhausted checking that none of these six callsites can
scribble on some caller's value of current->reclaim_state.  How about
we do it at runtime?

From: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Subject: mm/vmscan.c: add checks for incorrect handling of current->reclaim_state

Six sites are presently altering current->reclaim_state.  There is a
risk that one function stomps on a caller's value.  Use a helper
function to catch such errors.

Cc: Yafang Shao <laoar.shao@xxxxxxxxx>
Cc: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 mm/vmscan.c |   37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

--- a/mm/vmscan.c~mm-vmscanc-add-checks-for-incorrect-handling-of-current-reclaim_state
+++ a/mm/vmscan.c
@@ -177,6 +177,18 @@ unsigned long vm_total_pages;
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 
 /*
@@ -3194,13 +3206,13 @@ unsigned long try_to_free_pages(struct z
 	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
 		return 1;
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
 }
@@ -3223,7 +3235,7 @@ unsigned long mem_cgroup_shrink_node(str
 	};
 	unsigned long lru_pages;
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
@@ -3245,7 +3257,7 @@ unsigned long mem_cgroup_shrink_node(str
 					cgroup_ino(memcg->css.cgroup),
 					sc.nr_reclaimed);
 
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
@@ -3274,7 +3286,7 @@ unsigned long try_to_free_mem_cgroup_pag
 		.may_shrinkslab = 1,
 	};
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -3299,7 +3311,7 @@ unsigned long try_to_free_mem_cgroup_pag
 	trace_mm_vmscan_memcg_reclaim_end(
 				cgroup_ino(memcg->css.cgroup),
 				nr_reclaimed);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	return nr_reclaimed;
 }
@@ -3501,7 +3513,7 @@ static int balance_pgdat(pg_data_t *pgda
 		.may_unmap = 1,
 	};
 
-	current->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 	psi_memstall_enter(&pflags);
 	__fs_reclaim_acquire();
 
@@ -3683,7 +3695,7 @@ out:
 	snapshot_refaults(NULL, pgdat);
 	__fs_reclaim_release();
 	psi_memstall_leave(&pflags);
-	current->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 
 	/*
 	 * Return the order kswapd stopped reclaiming at as
@@ -3945,17 +3957,16 @@ unsigned long shrink_all_memory(unsigned
 		.hibernation_mode = 1,
 	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
 	unsigned int noreclaim_flag;
 
 	fs_reclaim_acquire(sc.gfp_mask);
 	noreclaim_flag = memalloc_noreclaim_save();
-	p->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(current, &sc.reclaim_state);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
-	p->reclaim_state = NULL;
+	set_task_reclaim_state(current, NULL);
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
 
@@ -4144,7 +4155,7 @@ static int __node_reclaim(struct pglist_
 	 */
 	noreclaim_flag = memalloc_noreclaim_save();
 	p->flags |= PF_SWAPWRITE;
-	p->reclaim_state = &sc.reclaim_state;
+	set_task_reclaim_state(p, &sc.reclaim_state);
 
 	if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 		/*
@@ -4156,7 +4167,7 @@ static int __node_reclaim(struct pglist_
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
-	p->reclaim_state = NULL;
+	set_task_reclaim_state(p, NULL);
 	current->flags &= ~PF_SWAPWRITE;
 	memalloc_noreclaim_restore(noreclaim_flag);
 	fs_reclaim_release(sc.gfp_mask);
_
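
For anyone who wants to poke at the guard's semantics without building
a kernel, here is a minimal standalone sketch of the same pattern.  The
stub structs and the warn_once() macro below are illustrative stand-ins
for the kernel's task_struct, reclaim_state and WARN_ON_ONCE(), not the
real definitions (the real WARN_ON_ONCE() also dumps a stack trace,
which this stand-in doesn't attempt):

#include <stdio.h>

struct reclaim_state {
	unsigned long reclaimed_slab;
};

struct task_struct {
	struct reclaim_state *reclaim_state;
};

/* Userspace stand-in for WARN_ON_ONCE(): report at most once per site */
#define warn_once(cond, msg) do {				\
	static int warned;					\
	if ((cond) && !warned) {				\
		warned = 1;					\
		fprintf(stderr, "WARNING: %s\n", msg);		\
	}							\
} while (0)

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite: a callee would clobber its caller's state */
	warn_once(rs && task->reclaim_state, "reclaim_state overwritten");

	/* Check for the nulling of an already-nulled member */
	warn_once(!rs && !task->reclaim_state, "reclaim_state already NULL");

	task->reclaim_state = rs;
}

int main(void)
{
	struct task_struct tsk = { .reclaim_state = NULL };
	struct reclaim_state outer = { 0 }, inner = { 0 };

	set_task_reclaim_state(&tsk, &outer);	/* OK: NULL -> &outer */
	set_task_reclaim_state(&tsk, &inner);	/* fires: stomps on &outer */
	set_task_reclaim_state(&tsk, NULL);	/* OK: &inner -> NULL */
	set_task_reclaim_state(&tsk, NULL);	/* fires: already NULL */
	return 0;
}

Compiled and run, the second and fourth calls each print a warning: the
second models a callee stomping on its caller's reclaim_state, the
fourth a double clear, which is exactly the misuse the runtime checks
are meant to catch.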