On Thu, Aug 24, 2017 at 11:39:36AM -0400, josef@xxxxxxxxxxxxxx wrote:
> From: Josef Bacik <jbacik@xxxxxx>
>
> Previously we were using the ratio of the number of lru pages scanned to
> the number of eligible lru pages to determine the number of slab objects
> to scan. The problem with this is that these two things have nothing to
> do with each other, so in slab-heavy workloads where there is little to
> no page cache we can end up with the pages scanned being a very low
> number. This means that we reclaim next to no slab pages and waste a
> lot of time reclaiming small amounts of space.
>
> Consider the following scenario, where we have these values and the
> rest of the memory usage is in slab:
>
>   Active:   58840 kB
>   Inactive: 46860 kB
>
> Every time we do a get_scan_count() we do this:
>
>   scan = size >> sc->priority
>
> where sc->priority starts at DEF_PRIORITY, which is 12. The first loop
> through reclaim would result in a scan target of 2 pages out of 11715
> total inactive pages, and 3 pages out of 14710 total active pages.
> This is a really, really small target for a system that is entirely
> slab pages. And this is super optimistic; it assumes we even get to
> scan these pages. We don't increment sc->nr_scanned unless we 1)
> isolate the page, which assumes it's not in use, and 2) can lock the
> page. Under pressure these numbers could probably go down; I'm sure
> there are some random pages from daemons that aren't actually in use,
> so the targets get even smaller.
>
> Instead, use sc->priority in the same way we use it to determine scan
> amounts for the LRUs. This generally equates to pages. Consider the
> following:
>
>   slab_pages = (nr_objects * object_size) / PAGE_SIZE
>
> What we would like to do is
>
>   scan = slab_pages >> sc->priority
>
> but we don't know the number of slab pages each shrinker controls, only
> the objects. However, even if we theoretically knew how many pages a
> shrinker controlled, we'd still have to convert that to objects, which
> would look like the following:
>
>   scan = shrinker_pages >> sc->priority
>   scan_objects = (PAGE_SIZE / object_size) * scan
>
> or written another way:
>
>   scan_objects = (shrinker_pages >> sc->priority) *
>                  (PAGE_SIZE / object_size)
>
> which can thus be written as
>
>   scan_objects = ((shrinker_pages * PAGE_SIZE) / object_size) >>
>                  sc->priority
>
> which is just
>
>   scan_objects = nr_objects >> sc->priority
>
> We don't need to know exactly how many pages each shrinker represents;
> its object count is all the information we need. Making this change
> allows us to place an amount of pressure on the shrinker pools that is
> appropriate to their relative size.
>
> Signed-off-by: Josef Bacik <jbacik@xxxxxx>

Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>

This looks good to me, thanks for persisting, Josef.

There is a small cleanup possible on top of this, as the slab shrinker
was the only thing that used the lru_pages accumulation when the scan
targets are calculated.
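As a purely illustrative aside (not part of either patch): the
page/object equivalence in the changelog can be sanity-checked with a
trivial userspace program. The page size, priority, object size and
object count below are made-up stand-ins for what the kernel actually
tracks:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_size   = 4096;	/* PAGE_SIZE */
		int priority              = 12;		/* DEF_PRIORITY */
		unsigned long object_size = 256;	/* made up */
		unsigned long nr_objects  = 1UL << 24;	/* made up */

		unsigned long slab_pages = nr_objects * object_size / page_size;

		/* "what we would like to do", converted back to objects */
		unsigned long via_pages = (slab_pages >> priority) *
					  (page_size / object_size);

		/* what the changelog arrives at */
		unsigned long via_objects = nr_objects >> priority;

		printf("slab pages:               %lu\n", slab_pages);
		printf("scan target via pages:    %lu objects\n", via_pages);
		printf("scan target via objects:  %lu objects\n", via_objects);
		return 0;
	}

The two targets agree exactly here because everything is a power of
two; with other object sizes they differ only by integer rounding.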
---
From 818548a03404101d0a1e80ea500643105671b69b Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@xxxxxxxxxxx>
Date: Tue, 29 Aug 2017 16:22:41 -0400
Subject: [PATCH] mm: use sc->priority for slab shrink targets fix

The slab shrinker code was the only user of the lru_pages counter in
get_scan_count() and shrink_node_memcg(). Now that it's switched its
pressure notion to sc->priority, the counter is dead code. Remove it.

Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
---
 mm/vmscan.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bf7890964f7..8a6603d94e0d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2143,8 +2143,7 @@ enum scan_balance {
  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
-			   struct scan_control *sc, unsigned long *nr,
-			   unsigned long *lru_pages)
+			   struct scan_control *sc, unsigned long *nr)
 {
 	int swappiness = mem_cgroup_swappiness(memcg);
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
@@ -2295,7 +2294,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	fraction[1] = fp;
 	denominator = ap + fp + 1;
 out:
-	*lru_pages = 0;
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
 		unsigned long size;
@@ -2325,17 +2323,14 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 		case SCAN_FILE:
 		case SCAN_ANON:
 			/* Scan one type exclusively */
-			if ((scan_balance == SCAN_FILE) != file) {
-				size = 0;
+			if ((scan_balance == SCAN_FILE) != file)
 				scan = 0;
-			}
 			break;
 		default:
 			/* Look ma, no brain */
 			BUG();
 		}
 
-		*lru_pages += size;
 		nr[lru] = scan;
 	}
 }
@@ -2344,7 +2339,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
  * This is a basic per-node page freer. Used by both kswapd and direct reclaim.
  */
 static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
-			      struct scan_control *sc, unsigned long *lru_pages)
+			      struct scan_control *sc)
 {
 	struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	unsigned long nr[NR_LRU_LISTS];
@@ -2356,7 +2351,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	struct blk_plug plug;
 	bool scan_adjusted;
 
-	get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+	get_scan_count(lruvec, memcg, sc, nr);
 
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
@@ -2555,7 +2550,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			.pgdat = pgdat,
 			.priority = sc->priority,
 		};
-		unsigned long node_lru_pages = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2563,7 +2557,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
-			unsigned long lru_pages;
 			unsigned long reclaimed;
 			unsigned long scanned;
 
@@ -2577,8 +2570,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			reclaimed = sc->nr_reclaimed;
 			scanned = sc->nr_scanned;
 
-			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
-			node_lru_pages += lru_pages;
+
+			shrink_node_memcg(pgdat, memcg, sc);
 
 			if (memcg)
 				shrink_slab(sc->gfp_mask, pgdat->node_id,
@@ -3043,7 +3036,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 		.reclaim_idx = MAX_NR_ZONES - 1,
 		.may_swap = !noswap,
 	};
-	unsigned long lru_pages;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -3060,7 +3052,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+	shrink_node_memcg(pgdat, memcg, &sc);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
-- 
2.14.1