On 6/13/22 14:56, Mel Gorman wrote:
> struct per_cpu_pages is no longer strictly local as PCP lists can be
> drained remotely using a lock for protection. While the use of local_lock
> works, it goes against the intent of local_lock which is for "pure
> CPU local concurrency control mechanisms and not suited for inter-CPU
> concurrency control" (Documentation/locking/locktypes.rst)
>
> local_lock protects against migration between when the percpu pointer is
> accessed and the pcp->lock acquired. The lock acquisition is a preemption
> point so in the worst case, a task could migrate to another NUMA node
> and accidentally allocate remote memory. The main requirement is to pin
> the task to a CPU that is suitable for PREEMPT_RT and !PREEMPT_RT.
>
> Replace local_lock with helpers that pin a task to a CPU, lookup the
> per-cpu structure and acquire the embedded lock. It's similar to local_lock
> without breaking the intent behind the API. It is not a complete API
> as only the parts needed for PCP-alloc are implemented but in theory,
> the generic helpers could be promoted to a general API if there was
> demand for an embedded lock within a per-cpu struct with a guarantee
> that the per-cpu structure locked matches the running CPU and cannot use
> get_cpu_var due to RT concerns. PCP requires these semantics to avoid
> accidentally allocating remote memory.
>
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>

...

> @@ -3367,30 +3429,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
>  	return min(READ_ONCE(pcp->batch) << 2, high);
>  }
>
> -/* Returns true if the page was committed to the per-cpu list. */
> -static bool free_unref_page_commit(struct page *page, int migratetype,
> -				   unsigned int order, bool locked)
> +static void free_unref_page_commit(struct per_cpu_pages *pcp, struct zone *zone,
> +				   struct page *page, int migratetype,
> +				   unsigned int order)

Hmm, given that this drops the "bool locked" parameter and the bool return
value again, my suggestion for patch 5/7 would result in less churn, as those
wouldn't need to be introduced in the first place?

...

> @@ -3794,19 +3805,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>  	struct list_head *list;
>  	struct page *page;
>  	unsigned long flags;
> +	unsigned long __maybe_unused UP_flags;
>
> -	local_lock_irqsave(&pagesets.lock, flags);
> +	/*
> +	 * spin_trylock_irqsave is not necessary right now as it'll only be
> +	 * true when contending with a remote drain. It's in place as a
> +	 * preparation step before converting pcp locking to spin_trylock
> +	 * to protect against IRQ reentry.
> +	 */
> +	pcp_trylock_prepare(UP_flags);
> +	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
> +	if (!pcp)

Besides the missing unpin that Andrew fixed, I think this is also missing a
pcp_trylock_finish(UP_flags)?

> +		return NULL;
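Something like the following, I think (untested sketch; the unpin from
Andrew's fix would also need to go on this early-return path, I'm only
showing the trylock side here):

	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
	if (!pcp) {
		/* Undo pcp_trylock_prepare() before bailing out. */
		pcp_trylock_finish(UP_flags);
		return NULL;
	}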
>
>  	/*
>  	 * On allocation, reduce the number of pages that are batch freed.
>  	 * See nr_pcp_free() where free_factor is increased for subsequent
>  	 * frees.
>  	 */
> -	pcp = this_cpu_ptr(zone->per_cpu_pageset);
>  	pcp->free_factor >>= 1;
>  	list = &pcp->lists[order_to_pindex(migratetype, order)];
> -	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list, false);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
> +	pcp_spin_unlock_irqrestore(pcp, flags);
> +	pcp_trylock_finish(UP_flags);
>  	if (page) {
>  		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
>  		zone_statistics(preferred_zone, zone, 1);
> @@ -5410,10 +5431,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  		goto failed;
>
>  	/* Attempt the batch allocation */
> -	local_lock_irqsave(&pagesets.lock, flags);
> -	pcp = this_cpu_ptr(zone->per_cpu_pageset);
> +	pcp = pcp_spin_lock_irqsave(zone->per_cpu_pageset, flags);
>  	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
> -	spin_lock(&pcp->lock);
>
>  	while (nr_populated < nr_pages) {
>
> @@ -5424,13 +5443,11 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  		}
>
>  		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
> -					pcp, pcp_list, true);
> +					pcp, pcp_list);
>  		if (unlikely(!page)) {
>  			/* Try and allocate at least one page */
> -			if (!nr_account) {
> -				spin_unlock(&pcp->lock);
> +			if (!nr_account)
>  				goto failed_irq;
> -			}
>  			break;
>  		}
>  		nr_account++;
> @@ -5443,8 +5460,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  		nr_populated++;
>  	}
>
> -	spin_unlock(&pcp->lock);
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +	pcp_spin_unlock_irqrestore(pcp, flags);
>
>  	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
>  	zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
> @@ -5453,7 +5469,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>  	return nr_populated;
>
>  failed_irq:
> -	local_unlock_irqrestore(&pagesets.lock, flags);
> +	pcp_spin_unlock_irqrestore(pcp, flags);
>
>  failed:
>  	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
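For anyone else reading along: my mental model of the "pin the task, look up
the per-cpu structure, take the embedded lock" scheme the changelog describes
is roughly the sketch below. The helper names are made up and I'm using
migrate_disable() for the pinning; the pcp_spin_lock_irqsave() variants above
additionally save/restore IRQ flags, so treat this only as an illustration of
the ordering, not the actual patch code:

/* Illustration only, not the helpers from the patch. */
static inline struct per_cpu_pages *pcp_pin_and_lock(struct zone *zone)
{
	struct per_cpu_pages *pcp;

	/*
	 * Pin the task before the per-cpu lookup so the structure we lock
	 * is guaranteed to belong to the CPU we are running on, and the
	 * lock acquisition (a preemption point) cannot migrate us to a
	 * remote node. Works on both PREEMPT_RT and !PREEMPT_RT.
	 */
	migrate_disable();
	pcp = this_cpu_ptr(zone->per_cpu_pageset);
	spin_lock(&pcp->lock);

	return pcp;
}

static inline void pcp_unlock_and_unpin(struct per_cpu_pages *pcp)
{
	spin_unlock(&pcp->lock);
	migrate_enable();
}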