[RFC 2/3] mm/page_alloc: Access lists in 'struct per_cpu_pages' indirectly

Nicolas Saenz Julienne <nsaenzju@xxxxxxxxxx> · Fri, 8 Oct 2021 18:19:21 +0200

In preparation to adding remote pcplists drain support, let's bundle
'struct per_cpu_pages' list heads and page count into a new structure,
'struct pcplists', and have all code access it indirectly through a
pointer. It'll be used by upcoming patches, which will maintain multiple
versions of pcplists and switch the pointer atomically.

free_pcppages_bulk() also gains a new argument, since we want to avoid
dereferencing the pcplists pointer twice per critical section (delimited
by the pagevec local locks).

'struct pcplists' data is marked as __private, so as to make sure nobody
accesses it directly, except for the initialization code. Note that
'struct per_cpu_pages' is used during boot, when no allocation is
possible.

Signed-off-by: Nicolas Saenz Julienne <nsaenzju@xxxxxxxxxx>
---
 include/linux/mmzone.h | 10 +++++--
 mm/page_alloc.c        | 66 +++++++++++++++++++++++++-----------------
 mm/vmstat.c            |  6 ++--
 3 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6a1d79d84675..fb023da9a181 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -358,7 +358,6 @@ enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
-	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	short free_factor;	/* batch scaling factor during free */
@@ -366,8 +365,13 @@ struct per_cpu_pages {
 	short expire;		/* When 0, remote pagesets are drained */
 #endif
 
-	/* Lists of pages, one per migrate type stored on the pcp-lists */
-	struct list_head lists[NR_PCP_LISTS];
+	struct pcplists *lp;
+	struct pcplists {
+		/* Number of pages in the lists */
+		int count;
+		/* Lists of pages, one per migrate type stored on the pcp-lists */
+		struct list_head lists[NR_PCP_LISTS];
+	} __private pcplists;
 };
 
 struct per_cpu_zonestat {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd89933503b4..842816f269da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1438,7 +1438,8 @@ static inline void prefetch_buddy(struct page *page)
  * pinned" detection logic.
  */
 static void free_pcppages_bulk(struct zone *zone, int count,
-					struct per_cpu_pages *pcp)
+			       struct per_cpu_pages *pcp,
+			       struct pcplists *lp)
 {
 	int pindex = 0;
 	int batch_free = 0;
@@ -1453,7 +1454,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	 * Ensure proper count is passed which otherwise would stuck in the
 	 * below while (list_empty(list)) loop.
 	 */
-	count = min(pcp->count, count);
+	count = min(lp->count, count);
 	while (count > 0) {
 		struct list_head *list;
 
@@ -1468,7 +1469,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			batch_free++;
 			if (++pindex == NR_PCP_LISTS)
 				pindex = 0;
-			list = &pcp->lists[pindex];
+			list = &lp->lists[pindex];
 		} while (list_empty(list));
 
 		/* This is the only non-empty list. Free them all. */
@@ -1508,7 +1509,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			}
 		} while (count > 0 && --batch_free && !list_empty(list));
 	}
-	pcp->count -= nr_freed;
+	lp->count -= nr_freed;
 
 	/*
 	 * local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -3069,14 +3070,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
+	struct pcplists *lp;
 	unsigned long flags;
 	int to_drain, batch;
 
 	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
-	to_drain = min(pcp->count, batch);
+	lp = pcp->lp;
+	to_drain = min(lp->count, batch);
 	if (to_drain > 0)
-		free_pcppages_bulk(zone, to_drain, pcp);
+		free_pcppages_bulk(zone, to_drain, pcp, lp);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 #endif
@@ -3092,12 +3095,14 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
 	unsigned long flags;
 	struct per_cpu_pages *pcp;
+	struct pcplists *lp;
 
 	local_lock_irqsave(&pagesets.lock, flags);
 
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp);
+	lp = pcp->lp;
+	if (lp->count)
+		free_pcppages_bulk(zone, lp->count, pcp, lp);
 
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
@@ -3158,7 +3163,7 @@ static void drain_local_pages_wq(struct work_struct *work)
  *
  * drain_all_pages() is optimized to only execute on cpus where pcplists are
  * not empty. The check for non-emptiness can however race with a free to
- * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * pcplist that has not yet increased the lp->count from 0 to 1. Callers
  * that need the guarantee that every CPU has drained can disable the
  * optimizing racy check.
  */
@@ -3200,21 +3205,22 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
 		struct per_cpu_pages *pcp;
 		struct zone *z;
 		bool has_pcps = false;
+		struct pcplists *lp;
 
 		if (force_all_cpus) {
 			/*
-			 * The pcp.count check is racy, some callers need a
+			 * The lp->count check is racy, some callers need a
 			 * guarantee that no cpu is missed.
 			 */
 			has_pcps = true;
 		} else if (zone) {
-			pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-			if (pcp->count)
+			lp = per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp;
+			if (lp->count)
 				has_pcps = true;
 		} else {
 			for_each_populated_zone(z) {
-				pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
-				if (pcp->count) {
+				lp = per_cpu_ptr(z->per_cpu_pageset, cpu)->lp;
+				if (lp->count) {
 					has_pcps = true;
 					break;
 				}
@@ -3366,19 +3372,21 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
+	struct pcplists *lp;
 	int high;
 	int pindex;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
+	lp = pcp->lp;
 	pindex = order_to_pindex(migratetype, order);
-	list_add(&page->lru, &pcp->lists[pindex]);
-	pcp->count += 1 << order;
+	list_add(&page->lru, &lp->lists[pindex]);
+	lp->count += 1 << order;
 	high = nr_pcp_high(pcp, zone);
-	if (pcp->count >= high) {
+	if (lp->count >= high) {
 		int batch = READ_ONCE(pcp->batch);
 
-		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp, lp);
 	}
 }
 
@@ -3603,9 +3611,11 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			struct per_cpu_pages *pcp)
 {
 	struct list_head *list;
+	struct pcplists *lp;
 	struct page *page;
 
-	list = &pcp->lists[order_to_pindex(migratetype, order)];
+	lp = pcp->lp;
+	list = &lp->lists[order_to_pindex(migratetype, order)];
 
 	do {
 		if (list_empty(list)) {
@@ -3625,14 +3635,14 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 					batch, list,
 					migratetype, alloc_flags);
 
-			pcp->count += alloced << order;
+			lp->count += alloced << order;
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
 
 		page = list_first_entry(list, struct page, lru);
 		list_del(&page->lru);
-		pcp->count -= 1 << order;
+		lp->count -= 1 << order;
 	} while (check_new_pcp(page));
 
 	return page;
@@ -5877,7 +5887,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			continue;
 
 		for_each_online_cpu(cpu)
-			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp->count;
 	}
 
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
@@ -5971,7 +5981,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 
 		free_pcp = 0;
 		for_each_online_cpu(cpu)
-			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count;
+			free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->lp->count;
 
 		show_node(zone);
 		printk(KERN_CONT
@@ -6012,7 +6022,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(free_pcp),
-			K(this_cpu_read(zone->per_cpu_pageset->count)),
+			K(this_cpu_read(zone->per_cpu_pageset)->lp->count),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6848,7 +6858,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 
 /*
  * pcp->high and pcp->batch values are related and generally batch is lower
- * than high. They are also related to pcp->count such that count is lower
+ * than high. They are also related to pcp->lp->count such that count is lower
  * than high, and as soon as it reaches high, the pcplist is flushed.
  *
  * However, guaranteeing these relations at all times would require e.g. write
@@ -6856,7 +6866,7 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
  * thus be prone to error and bad for performance. Thus the update only prevents
  * store tearing. Any new users of pcp->batch and pcp->high should ensure they
  * can cope with those fields changing asynchronously, and fully trust only the
- * pcp->count field on the local CPU with interrupts disabled.
+ * pcp->lp->count field on the local CPU with interrupts disabled.
  *
  * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
  * outside of boot time (or some other assurance that no concurrent updaters
@@ -6876,8 +6886,10 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	pcp->lp = &ACCESS_PRIVATE(pcp, pcplists);
+
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
-		INIT_LIST_HEAD(&pcp->lists[pindex]);
+		INIT_LIST_HEAD(&pcp->lp->lists[pindex]);
 
 	/*
 	 * Set batch and high values safe for a boot pageset. A true percpu
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce2620344b2..5279d3f34e0b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -856,7 +856,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			 * if not then there is nothing to expire.
 			 */
 			if (!__this_cpu_read(pcp->expire) ||
-			       !__this_cpu_read(pcp->count))
+			       !this_cpu_ptr(pcp)->lp->count)
 				continue;
 
 			/*
@@ -870,7 +870,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			if (__this_cpu_dec_return(pcp->expire))
 				continue;
 
-			if (__this_cpu_read(pcp->count)) {
+			if (this_cpu_ptr(pcp)->lp->count) {
 				drain_zone_pages(zone, this_cpu_ptr(pcp));
 				changes++;
 			}
@@ -1707,7 +1707,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 			   "\n              high:  %i"
 			   "\n              batch: %i",
 			   i,
-			   pcp->count,
+			   pcp->lp->count,
 			   pcp->high,
 			   pcp->batch);
 #ifdef CONFIG_SMP
-- 
2.31.1