On 2023-08-09 20:03:00 [+0900], Tetsuo Handa wrote: > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7d3460c7a480..5557d9a2ff2c 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -3638,26 +3638,44 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release); … > -static DEFINE_SEQLOCK(zonelist_update_seq); > +static unsigned int zonelist_update_seq; > > static unsigned int zonelist_iter_begin(void) > { > if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) > - return read_seqbegin(&zonelist_update_seq); > + /* See comment above. */ > + return data_race(READ_ONCE(zonelist_update_seq)); This is an open-coded raw_read_seqcount(), while it should have been raw_seqcount_begin(). > return 0; > } > > -static unsigned int check_retry_zonelist(unsigned int seq) > +static unsigned int check_retry_zonelist(gfp_t gfp, unsigned int seq) > { > - if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) > - return read_seqretry(&zonelist_update_seq, seq); > + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE) && (gfp & __GFP_DIRECT_RECLAIM)) { > + /* See comment above. */ > + unsigned int seq2 = data_race(READ_ONCE(zonelist_update_seq)); > > - return seq; > + /* > + * "seq != seq2" indicates that __build_all_zonelists() has > + * started or has finished rebuilding zonelists, hence retry. > + * "seq == seq2 && (seq2 & 1)" indicates that > + * __build_all_zonelists() is still rebuilding zonelists > + * with context switching disabled, hence retry. > + * "seq == seq2 && !(seq2 & 1)" indicates that > + * __build_all_zonelists() did not rebuild zonelists, hence > + * no retry. > + */ > + return unlikely(seq != seq2 || (seq2 & 1)); This is an open-coded read_seqcount_retry().
> + } > + > + return 0; > } > > /* Perform direct synchronous page reclaim */ > @@ -5136,22 +5154,17 @@ static void __build_all_zonelists(void *data) > int nid; > int __maybe_unused cpu; > pg_data_t *self = data; > + static DEFINE_SPINLOCK(lock); > unsigned long flags; > > - /* > - * Explicitly disable this CPU's interrupts before taking seqlock > - * to prevent any IRQ handler from calling into the page allocator > - * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. > - */ > - local_irq_save(flags); > - /* > - * Explicitly disable this CPU's synchronous printk() before taking > - * seqlock to prevent any printk() from trying to hold port->lock, for > - * tty_insert_flip_string_and_push_buffer() on other CPU might be > - * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. > - */ > - printk_deferred_enter(); > - write_seqlock(&zonelist_update_seq); > +#ifdef CONFIG_PREEMPT_RT > + migrate_disable() > +#endif There is no justification/explanation of why migrate_disable() is needed here on PREEMPT_RT, and I don't see one. There are two changes here: - the replacement of seqlock_t with something open-coded - a logic change in when a retry is needed (the gfp mask is considered). I am not a big fan of open-coding things, especially when not needed, and then there is this ifdef, which is not needed as well. I don't comment on the logic change. Can we please put an end to this? > + /* Serialize increments of zonelist_update_seq. */ > + spin_lock_irqsave(&lock, flags); > + zonelist_update_seq++; > + /* Tell check_retry_zonelist() that we started rebuilding zonelists. */ > + smp_wmb(); > > #ifdef CONFIG_NUMA > memset(node_load, 0, sizeof(node_load)); Sebastian