On 2023/08/10 18:58, Tetsuo Handa wrote:
> If __build_all_zonelists() can run without being switched to other threads
> (except interrupt handlers), I consider that this approach works.

If there is no way to make sure that the section between
write_seqlock(&zonelist_update_seq) and write_sequnlock(&zonelist_update_seq)
runs without context switching (interrupt handlers are fine), something like
the patch below could be used in order to keep the spin_lock(s->lock);
spin_unlock(s->lock); in seqprop_sequence() away from atomic allocations.
But I think that loses the reason to replace read_seqbegin() with
raw_seqcount_begin(); it would be essentially the same as
https://lkml.kernel.org/r/dfdb9da6-ca8f-7a81-bfdd-d74b4c401f11@xxxxxxxxxxxxxxxxxxx .

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..f2f79caab2cf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3644,20 +3644,20 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
  */
 static DEFINE_SEQLOCK(zonelist_update_seq);
 
-static unsigned int zonelist_iter_begin(void)
+static unsigned int zonelist_iter_begin(gfp_t gfp)
 {
-	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqbegin(&zonelist_update_seq);
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE) && (gfp & __GFP_DIRECT_RECLAIM))
+		return data_race(raw_seqcount_begin(&zonelist_update_seq.seqcount));
 
 	return 0;
 }
 
-static unsigned int check_retry_zonelist(unsigned int seq)
+static unsigned int check_retry_zonelist(gfp_t gfp, unsigned int seq)
 {
-	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqretry(&zonelist_update_seq, seq);
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE) && (gfp & __GFP_DIRECT_RECLAIM))
+		return data_race(read_seqcount_retry(&zonelist_update_seq.seqcount, seq));
 
-	return seq;
+	return 0;
 }
 
 /* Perform direct synchronous page reclaim */
@@ -3968,7 +3968,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	no_progress_loops = 0;
 	compact_priority = DEF_COMPACT_PRIORITY;
 	cpuset_mems_cookie = read_mems_allowed_begin();
-	zonelist_iter_cookie = zonelist_iter_begin();
+	zonelist_iter_cookie = zonelist_iter_begin(gfp_mask);
 
 	/*
 	 * The fast path uses conservative alloc_flags to succeed only until
@@ -4146,7 +4146,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/* Reclaim has failed us, start killing things */
@@ -4172,7 +4172,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/*
@@ -5138,20 +5138,7 @@ static void __build_all_zonelists(void *data)
 	pg_data_t *self = data;
 	unsigned long flags;
 
-	/*
-	 * Explicitly disable this CPU's interrupts before taking seqlock
-	 * to prevent any IRQ handler from calling into the page allocator
-	 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
-	 */
-	local_irq_save(flags);
-	/*
-	 * Explicitly disable this CPU's synchronous printk() before taking
-	 * seqlock to prevent any printk() from trying to hold port->lock, for
-	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
-	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
-	 */
-	printk_deferred_enter();
-	write_seqlock(&zonelist_update_seq);
+	write_seqlock_irqsave(&zonelist_update_seq, flags);
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5175,7 @@ static void __build_all_zonelists(void *data)
 #endif
 	}
 
-	write_sequnlock(&zonelist_update_seq);
-	printk_deferred_exit();
-	local_irq_restore(flags);
+	write_sequnlock_irqrestore(&zonelist_update_seq, flags);
 }
 
 static noinline void __init
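For anyone not familiar with seqcounts, below is a minimal userspace sketch
(not kernel code; C11 atomics stand in for the kernel's seqcount, and every
sketch_* name is invented for illustration) of the read/retry pattern the
patch above relies on: a "begin" that waits for an in-flight writer (the
spin_lock(s->lock); spin_unlock(s->lock); behaviour mentioned above, which
must not happen from atomic allocation context) versus a "begin" that never
waits and instead lets the later retry check fail so the caller can fall
back or restart, as __alloc_pages_slowpath() does with goto restart.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int seq;	/* stand-in for a kernel seqcount */

/* Waiting flavour: loops until no writer is active (sequence is even). */
static unsigned int sketch_read_seqbegin(void)
{
	unsigned int s;

	while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1)
		;	/* a reader in atomic context must never wait here */
	return s;
}

/* Non-waiting flavour: clearing the low bit guarantees that the retry
 * check reports "stale" whenever a writer was active at begin time. */
static unsigned int sketch_raw_seqcount_begin(void)
{
	return atomic_load_explicit(&seq, memory_order_acquire) & ~1u;
}

/* Retry check: true if the snapshot taken at begin time is stale. */
static bool sketch_read_seqretry(unsigned int start)
{
	atomic_thread_fence(memory_order_acquire);	/* like smp_rmb() */
	return atomic_load_explicit(&seq, memory_order_relaxed) != start;
}

int main(void)
{
	unsigned int c1 = sketch_read_seqbegin();	/* may wait */
	unsigned int c2 = sketch_raw_seqcount_begin();	/* never waits */

	/* ... read the seqcount-protected data here ... */

	printf("retry? waiting=%d non-waiting=%d\n",
	       sketch_read_seqretry(c1), sketch_read_seqretry(c2));
	return 0;
}

With no concurrent writer both checks report "no retry"; the difference
only matters while a writer is mid-update, which is exactly the window
__build_all_zonelists() opens between write_seqlock() and
write_sequnlock().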