On Fri, 4 Nov 2022, Mel Gorman wrote:

> Changelog since v1
> o Use trylock in free_unref_page_list due to IO completion from softirq
>   context
>
> The pcp_spin_lock_irqsave protecting the PCP lists is IRQ-safe as a task
> allocating from the PCP must not re-enter the allocator from IRQ context.
> In each instance where IRQ-reentrancy is possible, the lock is acquired using
> pcp_spin_trylock_irqsave() even though IRQs are disabled and re-entrancy
> is impossible.
>
> Demoting the lock to pcp_spin_lock avoids an IRQ disable/enable in the common
> case at the cost of some IRQ allocations taking a slower path. If the PCP
> lists need to be refilled, the zone lock still needs to disable IRQs but
> that will only happen on PCP refill and drain. If an IRQ is raised when
> a PCP allocation is in progress, the trylock will fail and fall back to
> using the buddy lists directly. Note that this may not be a universal win
> if an interrupt-intensive workload also allocates heavily from interrupt
> context and contends heavily on the zone->lock as a result.
>
> [yuzhao@xxxxxxxxxx: Reported lockdep issue on IO completion from softirq]
> Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>

Hi Mel,

I think you Cc'ed me for the purpose of giving this patch a run, and
reporting if it's not good. That is the case, I'm afraid.

I first tried it on a v6.1-rc3, and very soon crashed under load with
something about PageBuddy in the output. When I reverted, no problem;
I thought maybe it was dependent on other commits in akpm's tree.

Later I tried on current mm-unstable, which is living up to the name in
other ways too; but once the other issues were patched, it soon crashed
under load: a GPF, probably for non-canonical address 0xdead0000000000f8,
in compact_zone < compact_zone_order < try_to_compact_pages < .... <
shmem_alloc_hugefolio < ...

I do try to exercise compaction as hard as I can, even to the point of
having a hack in what used to be called shmem_getpage_gfp(), reverting
to the stronger attempt to get huge pages, before Rik weakened the effect
of huge=always with vma_thp_gfp_mask() in 5.12: so shmem is probably
applying stronger flags for compaction than it would in your tree - I'm
using GFP_TRANSHUGE_LIGHT | __GFP_RECLAIM | __GFP_NORETRY there.

Sorry for not giving you more info; I'm rather hoping that compaction is
relevant, and will give you a clue (maybe that capture code, which
surprised us once before??).

What I'm really trying to do is fix the bug in Linus's rmap/TLB series,
and its interaction with my rmap series, and report back on his series
(asking for a temporary drop), before next-20221107 goes down in flames.
I'd advocate for dropping this patch of yours too; but if it's giving
nobody else any trouble, I can easily continue to patch it out.
Hugh

> ---
>  mm/page_alloc.c | 122 ++++++++++++++++++++++++--------------------------
>  1 file changed, 53 insertions(+), 69 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e20ade858e71..ae410adf36fb 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -170,21 +170,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
>          _ret; \
>  })
>
> -#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \
> +#define pcpu_spin_trylock(type, member, ptr) \
>  ({ \
>          type *_ret; \
>          pcpu_task_pin(); \
>          _ret = this_cpu_ptr(ptr); \
> -        spin_lock_irqsave(&_ret->member, flags); \
> -        _ret; \
> -})
> -
> -#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \
> -({ \
> -        type *_ret; \
> -        pcpu_task_pin(); \
> -        _ret = this_cpu_ptr(ptr); \
> -        if (!spin_trylock_irqsave(&_ret->member, flags)) { \
> +        if (!spin_trylock(&_ret->member)) { \
>                  pcpu_task_unpin(); \
>                  _ret = NULL; \
>          } \
> @@ -197,27 +188,16 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
>          pcpu_task_unpin(); \
>  })
>
> -#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \
> -({ \
> -        spin_unlock_irqrestore(&ptr->member, flags); \
> -        pcpu_task_unpin(); \
> -})
> -
>  /* struct per_cpu_pages specific helpers. */
>  #define pcp_spin_lock(ptr) \
>          pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
>
> -#define pcp_spin_lock_irqsave(ptr, flags) \
> -        pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
> -
> -#define pcp_spin_trylock_irqsave(ptr, flags) \
> -        pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
> +#define pcp_spin_trylock(ptr) \
> +        pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
>
>  #define pcp_spin_unlock(ptr) \
>          pcpu_spin_unlock(lock, ptr)
>
> -#define pcp_spin_unlock_irqrestore(ptr, flags) \
> -        pcpu_spin_unlock_irqrestore(lock, ptr, flags)
>  #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
>  DEFINE_PER_CPU(int, numa_node);
>  EXPORT_PER_CPU_SYMBOL(numa_node);
> @@ -1546,6 +1526,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>                                  struct per_cpu_pages *pcp,
>                                  int pindex)
>  {
> +        unsigned long flags;
>          int min_pindex = 0;
>          int max_pindex = NR_PCP_LISTS - 1;
>          unsigned int order;
> @@ -1561,8 +1542,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>          /* Ensure requested pindex is drained first. */
>          pindex = pindex - 1;
>
> -        /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
> -        spin_lock(&zone->lock);
> +        spin_lock_irqsave(&zone->lock, flags);
>          isolated_pageblocks = has_isolate_pageblock(zone);
>
>          while (count > 0) {
> @@ -1610,7 +1590,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>                  } while (count > 0 && !list_empty(list));
>          }
>
> -        spin_unlock(&zone->lock);
> +        spin_unlock_irqrestore(&zone->lock, flags);
>  }
>
>  static void free_one_page(struct zone *zone,
> @@ -3124,10 +3104,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>                          unsigned long count, struct list_head *list,
>                          int migratetype, unsigned int alloc_flags)
>  {
> +        unsigned long flags;
>          int i, allocated = 0;
>
> -        /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
> -        spin_lock(&zone->lock);
> +        spin_lock_irqsave(&zone->lock, flags);
>          for (i = 0; i < count; ++i) {
>                  struct page *page = __rmqueue(zone, order, migratetype,
>                                                          alloc_flags);
> @@ -3161,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>           * pages added to the pcp list.
>           */
>          __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
> -        spin_unlock(&zone->lock);
> +        spin_unlock_irqrestore(&zone->lock, flags);
>          return allocated;
>  }
>
> @@ -3178,16 +3158,9 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
>          batch = READ_ONCE(pcp->batch);
>          to_drain = min(pcp->count, batch);
>          if (to_drain > 0) {
> -                unsigned long flags;
> -
> -                /*
> -                 * free_pcppages_bulk expects IRQs disabled for zone->lock
> -                 * so even though pcp->lock is not intended to be IRQ-safe,
> -                 * it's needed in this context.
> -                 */
> -                spin_lock_irqsave(&pcp->lock, flags);
> +                spin_lock(&pcp->lock);
>                  free_pcppages_bulk(zone, to_drain, pcp, 0);
> -                spin_unlock_irqrestore(&pcp->lock, flags);
> +                spin_unlock(&pcp->lock);
>          }
>  }
>  #endif
> @@ -3201,12 +3174,9 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
>
>          pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
>          if (pcp->count) {
> -                unsigned long flags;
> -
> -                /* See drain_zone_pages on why this is disabling IRQs */
> -                spin_lock_irqsave(&pcp->lock, flags);
> +                spin_lock(&pcp->lock);
>                  free_pcppages_bulk(zone, pcp->count, pcp, 0);
> -                spin_unlock_irqrestore(&pcp->lock, flags);
> +                spin_unlock(&pcp->lock);
>          }
>  }
>
> @@ -3472,7 +3442,6 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
>   */
>  void free_unref_page(struct page *page, unsigned int order)
>  {
> -        unsigned long flags;
>          unsigned long __maybe_unused UP_flags;
>          struct per_cpu_pages *pcp;
>          struct zone *zone;
> @@ -3500,10 +3469,10 @@ void free_unref_page(struct page *page, unsigned int order)
>
>          zone = page_zone(page);
>          pcp_trylock_prepare(UP_flags);
> -        pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
> +        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
>          if (pcp) {
>                  free_unref_page_commit(zone, pcp, page, migratetype, order);
> -                pcp_spin_unlock_irqrestore(pcp, flags);
> +                pcp_spin_unlock(pcp);
>          } else {
>                  free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
>          }
> @@ -3515,10 +3484,10 @@ void free_unref_page(struct page *page, unsigned int order)
>   */
>  void free_unref_page_list(struct list_head *list)
>  {
> +        unsigned long __maybe_unused UP_flags;
>          struct page *page, *next;
>          struct per_cpu_pages *pcp = NULL;
>          struct zone *locked_zone = NULL;
> -        unsigned long flags;
>          int batch_count = 0;
>          int migratetype;
>
> @@ -3547,11 +3516,26 @@ void free_unref_page_list(struct list_head *list)
>
>                  /* Different zone, different pcp lock. */
>                  if (zone != locked_zone) {
> -                        if (pcp)
> -                                pcp_spin_unlock_irqrestore(pcp, flags);
> +                        if (pcp) {
> +                                pcp_spin_unlock(pcp);
> +                                pcp_trylock_finish(UP_flags);
> +                        }
>
> +                        /*
> +                         * trylock is necessary as pages may be getting freed
> +                         * from IRQ or SoftIRQ context after an IO completion.
> +                         */
> +                        pcp_trylock_prepare(UP_flags);
> +                        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
> +                        if (!pcp) {
> +                                pcp_trylock_finish(UP_flags);
> +                                list_del(&page->lru);
> +                                free_one_page(page_zone(page), page,
> +                                              page_to_pfn(page), 0, migratetype,
> +                                              FPI_NONE);
> +                                continue;
> +                        }
>                          locked_zone = zone;
> -                        pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
>                  }
>
>                  /*
> @@ -3566,18 +3550,23 @@ void free_unref_page_list(struct list_head *list)
>                  free_unref_page_commit(zone, pcp, page, migratetype, 0);
>
>                  /*
> -                 * Guard against excessive IRQ disabled times when we get
> -                 * a large list of pages to free.
> +                 * Guard against excessive IRQ disabled times when freeing
> +                 * a large list of pages. Lock will be reacquired if
> +                 * necessary on the next iteration.
>                   */
>                  if (++batch_count == SWAP_CLUSTER_MAX) {
> -                        pcp_spin_unlock_irqrestore(pcp, flags);
> +                        pcp_spin_unlock(pcp);
> +                        pcp_trylock_finish(UP_flags);
>                          batch_count = 0;
> -                        pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
> +                        pcp = NULL;
> +                        locked_zone = NULL;
>                  }
>          }
>
> -        if (pcp)
> -                pcp_spin_unlock_irqrestore(pcp, flags);
> +        if (pcp) {
> +                pcp_spin_unlock(pcp);
> +                pcp_trylock_finish(UP_flags);
> +        }
>  }
>
>  /*
> @@ -3778,15 +3767,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>          struct per_cpu_pages *pcp;
>          struct list_head *list;
>          struct page *page;
> -        unsigned long flags;
>          unsigned long __maybe_unused UP_flags;
>
> -        /*
> -         * spin_trylock may fail due to a parallel drain. In the future, the
> -         * trylock will also protect against IRQ reentrancy.
> -         */
> +        /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
>          pcp_trylock_prepare(UP_flags);
> -        pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
> +        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
>          if (!pcp) {
>                  pcp_trylock_finish(UP_flags);
>                  return NULL;
> @@ -3800,7 +3785,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>          pcp->free_factor >>= 1;
>          list = &pcp->lists[order_to_pindex(migratetype, order)];
>          page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
> -        pcp_spin_unlock_irqrestore(pcp, flags);
> +        pcp_spin_unlock(pcp);
>          pcp_trylock_finish(UP_flags);
>          if (page) {
>                  __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
> @@ -5368,7 +5353,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>                                  struct page **page_array)
>  {
>          struct page *page;
> -        unsigned long flags;
>          unsigned long __maybe_unused UP_flags;
>          struct zone *zone;
>          struct zoneref *z;
> @@ -5450,9 +5434,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>          if (unlikely(!zone))
>                  goto failed;
>
> -        /* Is a parallel drain in progress? */
> +        /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
>          pcp_trylock_prepare(UP_flags);
> -        pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
> +        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
>          if (!pcp)
>                  goto failed_irq;
>
> @@ -5471,7 +5455,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>                  if (unlikely(!page)) {
>                          /* Try and allocate at least one page */
>                          if (!nr_account) {
> -                                pcp_spin_unlock_irqrestore(pcp, flags);
> +                                pcp_spin_unlock(pcp);
>                                  goto failed_irq;
>                          }
>                          break;
> @@ -5486,7 +5470,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
>                  nr_populated++;
>          }
>
> -        pcp_spin_unlock_irqrestore(pcp, flags);
> +        pcp_spin_unlock(pcp);
>          pcp_trylock_finish(UP_flags);
>
>          __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
>
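For reference, the free-path change quoted above reduces to a trylock-or-fall-back
pattern: take pcp->lock opportunistically and, if the trylock fails (for example
on re-entry from IRQ or SoftIRQ context), free straight to the buddy lists rather
than spinning with IRQs disabled. The following is only a minimal userspace sketch
of that pattern, not kernel code: a pthread spinlock stands in for pcp->lock, and
free_to_pcp()/free_to_buddy() are invented stand-ins for
free_unref_page_commit()/free_one_page(). Build with: cc -pthread sketch.c

/*
 * Userspace analogue of the trylock-or-fall-back pattern described in the
 * changelog above.  All names here are illustrative, not from mm/page_alloc.c.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t pcp_lock;     /* stand-in for the per-CPU pcp->lock */
static int pcp_count;                   /* stand-in for pcp->count */

/* Slow path: plays the role of free_one_page() under zone->lock. */
static void free_to_buddy(void *page)
{
        printf("slow path: %p freed directly\n", page);
}

/* Fast path: plays the role of free_unref_page_commit() under pcp->lock. */
static void free_to_pcp(void *page)
{
        pcp_count++;
        printf("fast path: %p cached, count=%d\n", page, pcp_count);
}

static void free_page_sketch(void *page)
{
        /*
         * Trylock instead of lock: if the lock is already held (the
         * IRQ/SoftIRQ re-entry case in the kernel), do not spin and do not
         * disable interrupts -- take the slower direct path instead.
         */
        if (pthread_spin_trylock(&pcp_lock) == 0) {
                free_to_pcp(page);
                pthread_spin_unlock(&pcp_lock);
        } else {
                free_to_buddy(page);
        }
}

int main(void)
{
        int page;

        pthread_spin_init(&pcp_lock, PTHREAD_PROCESS_PRIVATE);
        free_page_sketch(&page);
        pthread_spin_destroy(&pcp_lock);
        return 0;
}

In the kernel the fallback is more expensive because free_one_page() takes
zone->lock with IRQs disabled, which is why the changelog warns that
interrupt-heavy allocation workloads may not see a win.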