On 2023/07/29 14:31, Tetsuo Handa wrote:
> On 2023/07/28 0:10, Sebastian Andrzej Siewior wrote:
>> On 2023-06-28 21:14:16 [+0900], Tetsuo Handa wrote:
>>>> Anyway, please do not do this change only because of printk().
>>>> IMHO, the current ordering is more logical and the printk() problem
>>>> should be solved another way.
>>>
>>> Then, since [PATCH 1/2] cannot be applied, [PATCH 2/2] is automatically
>>> rejected.
>>
>> My understanding is that this patch gets applied and your objection will
>> be noted.
>
> My preference is that zonelist_update_seq is not checked by !__GFP_DIRECT_RECLAIM
> allocations, which is a low-hanging fruit towards GFP_LOCKLESS mentioned at
> https://lkml.kernel.org/r/ZG3+l4qcCWTPtSMD@xxxxxxxxxxxxxx and
> https://lkml.kernel.org/r/ZJWWpGZMJIADQvRS@xxxxxxxxxxxxxx .
>
> Maybe we can defer checking zonelist_update_seq till retry check like below,
> for this is really an infrequent event.

An updated version with comments added.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d3460c7a480..92ecf5f2f76b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3638,26 +3638,44 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
 
 /*
  * Zonelists may change due to hotplug during allocation. Detect when zonelists
- * have been rebuilt so allocation retries. Reader side does not lock and
- * retries the allocation if zonelist changes. Writer side is protected by the
- * embedded spin_lock.
+ * have been rebuilt so __GFP_DIRECT_RECLAIM allocation retries. Writer side
+ * makes this sequence odd before rebuilding zonelists and makes this sequence
+ * even after rebuilding zonelists. Since writer side disables context switching
+ * when rebuilding zonelists, and !__GFP_DIRECT_RECLAIM allocation will not
+ * retry when zonelists changed, reader side does not need to hold a lock (but
+ * needs to use data_race() annotation), making locking dependency simpler.
  */
-static DEFINE_SEQLOCK(zonelist_update_seq);
+static unsigned int zonelist_update_seq;
 
 static unsigned int zonelist_iter_begin(void)
 {
 	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqbegin(&zonelist_update_seq);
+		/* See comment above. */
+		return data_race(READ_ONCE(zonelist_update_seq));
 
 	return 0;
 }
 
-static unsigned int check_retry_zonelist(unsigned int seq)
+static unsigned int check_retry_zonelist(gfp_t gfp, unsigned int seq)
 {
-	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
-		return read_seqretry(&zonelist_update_seq, seq);
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE) && (gfp & __GFP_DIRECT_RECLAIM)) {
+		/* See comment above. */
+		unsigned int seq2 = data_race(READ_ONCE(zonelist_update_seq));
 
-	return seq;
+		/*
+		 * "seq != seq2" indicates that __build_all_zonelists() has
+		 * started or has finished rebuilding zonelists, hence retry.
+		 * "seq == seq2 && (seq2 & 1)" indicates that
+		 * __build_all_zonelists() is still rebuilding zonelists
+		 * with context switching disabled, hence retry.
+		 * "seq == seq2 && !(seq2 & 1)" indicates that
+		 * __build_all_zonelists() did not rebuild zonelists, hence
+		 * no retry.
+		 */
+		return unlikely(seq != seq2 || (seq2 & 1));
+	}
+
+	return 0;
 }
 
 /* Perform direct synchronous page reclaim */
@@ -4146,7 +4164,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/* Reclaim has failed us, start killing things */
@@ -4172,7 +4190,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * a unnecessary OOM kill.
 	 */
 	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
-	    check_retry_zonelist(zonelist_iter_cookie))
+	    check_retry_zonelist(gfp_mask, zonelist_iter_cookie))
 		goto restart;
 
 	/*
@@ -5136,22 +5154,17 @@ static void __build_all_zonelists(void *data)
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
+	static DEFINE_SPINLOCK(lock);
 	unsigned long flags;
 
-	/*
-	 * Explicitly disable this CPU's interrupts before taking seqlock
-	 * to prevent any IRQ handler from calling into the page allocator
-	 * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock.
-	 */
-	local_irq_save(flags);
-	/*
-	 * Explicitly disable this CPU's synchronous printk() before taking
-	 * seqlock to prevent any printk() from trying to hold port->lock, for
-	 * tty_insert_flip_string_and_push_buffer() on other CPU might be
-	 * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held.
-	 */
-	printk_deferred_enter();
-	write_seqlock(&zonelist_update_seq);
+#ifdef CONFIG_PREEMPT_RT
+	migrate_disable();
+#endif
+	/* Serialize increments of zonelist_update_seq. */
+	spin_lock_irqsave(&lock, flags);
+	zonelist_update_seq++;
+	/* Tell check_retry_zonelist() that we started rebuilding zonelists. */
+	smp_wmb();
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -5188,9 +5201,13 @@ static void __build_all_zonelists(void *data)
 #endif
 	}
 
-	write_sequnlock(&zonelist_update_seq);
-	printk_deferred_exit();
-	local_irq_restore(flags);
+	/* Tell check_retry_zonelist() that we finished rebuilding zonelists. */
+	smp_wmb();
+	zonelist_update_seq++;
+	spin_unlock_irqrestore(&lock, flags);
+#ifdef CONFIG_PREEMPT_RT
+	migrate_enable();
+#endif
 }
 
 static noinline void __init
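For readers unfamiliar with the pattern, the odd/even retry protocol the patch
implements can be modeled in userspace. The following is a minimal sketch only,
not kernel code: C11 seq_cst atomics and a pthread mutex stand in for
READ_ONCE()/smp_wmb() and the spinlock (stronger ordering than the patch needs),
and all names (update_seq, iter_begin, check_retry, rebuild) are hypothetical.
An odd counter value means a rebuild is in flight; any change between the two
reads forces a retry.

/* seqretry_sketch.c: build with "gcc -O2 -pthread seqretry_sketch.c" */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint update_seq;		/* models zonelist_update_seq */
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int generation;		/* models the zonelists being rebuilt */

/* Reader side: models zonelist_iter_begin(). */
static unsigned int iter_begin(void)
{
	return atomic_load(&update_seq);
}

/* Reader side: models check_retry_zonelist() with __GFP_DIRECT_RECLAIM set. */
static bool check_retry(unsigned int seq)
{
	unsigned int seq2 = atomic_load(&update_seq);

	/* Counter moved, or is odd (a rebuild is in flight): retry. */
	return seq != seq2 || (seq2 & 1);
}

/* Writer side: models __build_all_zonelists(). */
static void rebuild(void)
{
	pthread_mutex_lock(&update_lock);	/* serialize writers */
	atomic_fetch_add(&update_seq, 1);	/* odd: rebuild started */
	atomic_fetch_add(&generation, 1);	/* "rebuild the zonelists" */
	atomic_fetch_add(&update_seq, 1);	/* even: rebuild finished */
	pthread_mutex_unlock(&update_lock);
}

static void *writer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++)
		rebuild();
	return NULL;
}

int main(void)
{
	pthread_t t;
	long retries = 0;

	pthread_create(&t, NULL, writer, NULL);
	for (long i = 0; i < 10000000; i++) {
		unsigned int seq = iter_begin();
		/* An allocation attempt would run here. */
		if (check_retry(seq))
			retries++;	/* the kernel would "goto restart" */
	}
	pthread_join(&t, NULL);
	printf("retries observed: %ld\n", retries);
	return 0;
}

The property the sketch exercises is the one the patch relies on: a reader may
conclude "no rebuild happened" only when both samples are equal and even;
everything else retries. That is also why !__GFP_DIRECT_RECLAIM allocations can
skip the check entirely, since by design they never retry.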