On Mon, Sep 20, 2021 at 1:55 AM Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> wrote: > > Page reclaim throttles on congestion if too many parallel reclaim instances > have isolated too many pages. This makes no sense, excessive parallelisation > has nothing to do with writeback or congestion. > > This patch creates an additional workqueue to sleep on when too many > pages are isolated. The throttled tasks are woken when the number > of isolated pages is reduced or a timeout occurs. There may be > some false positive wakeups for GFP_NOIO/GFP_NOFS callers but > the tasks will throttle again if necessary. > > Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> > --- > include/linux/mmzone.h | 4 +++- > include/trace/events/vmscan.h | 4 +++- > mm/compaction.c | 2 +- > mm/internal.h | 2 ++ > mm/page_alloc.c | 6 +++++- > mm/vmscan.c | 22 ++++++++++++++++------ > 6 files changed, 30 insertions(+), 10 deletions(-) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index ef0a63ebd21d..ca65d6a64bdd 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -275,6 +275,8 @@ enum lru_list { > > enum vmscan_throttle_state { > VMSCAN_THROTTLE_WRITEBACK, > + VMSCAN_THROTTLE_ISOLATED, > + NR_VMSCAN_THROTTLE, > }; > > #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) > @@ -846,7 +848,7 @@ typedef struct pglist_data { > int node_id; > wait_queue_head_t kswapd_wait; > wait_queue_head_t pfmemalloc_wait; > - wait_queue_head_t reclaim_wait; /* wq for throttling reclaim */ > + wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE]; > atomic_t nr_reclaim_throttled; /* nr of throtted tasks */ > unsigned long nr_reclaim_start; /* nr pages written while throttled > * when throttling started. 
*/ > diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h > index c317f9fe0d17..d4905bd9e9c4 100644 > --- a/include/trace/events/vmscan.h > +++ b/include/trace/events/vmscan.h > @@ -28,10 +28,12 @@ > ) : "RECLAIM_WB_NONE" > > #define _VMSCAN_THROTTLE_WRITEBACK (1 << VMSCAN_THROTTLE_WRITEBACK) > +#define _VMSCAN_THROTTLE_ISOLATED (1 << VMSCAN_THROTTLE_ISOLATED) > > #define show_throttle_flags(flags) \ > (flags) ? __print_flags(flags, "|", \ > - {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"} \ > + {_VMSCAN_THROTTLE_WRITEBACK, "VMSCAN_THROTTLE_WRITEBACK"}, \ > + {_VMSCAN_THROTTLE_ISOLATED, "VMSCAN_THROTTLE_ISOLATED"} \ > ) : "VMSCAN_THROTTLE_NONE" > > > diff --git a/mm/compaction.c b/mm/compaction.c > index bfc93da1c2c7..221c9c10ad7e 100644 > --- a/mm/compaction.c > +++ b/mm/compaction.c > @@ -822,7 +822,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, > if (cc->mode == MIGRATE_ASYNC) > return -EAGAIN; > > - congestion_wait(BLK_RW_ASYNC, HZ/10); > + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); It seems that waking up throttled tasks is missing from compaction's too_many_isolated(). There are two too_many_isolated() implementations: one for compaction and one for the reclaimer. I see the wakeup code was added to the reclaimer's version below, but not to the compaction one. Was the compaction side left out intentionally? 
> > if (fatal_signal_pending(current)) > return -EINTR; > diff --git a/mm/internal.h b/mm/internal.h > index e25b3686bfab..e6cd22fb5a43 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -118,6 +118,8 @@ extern unsigned long highest_memmap_pfn; > */ > extern int isolate_lru_page(struct page *page); > extern void putback_lru_page(struct page *page); > +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, > + long timeout); > > /* > * in mm/rmap.c: > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index d849ddfc1e51..78e538067651 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -7389,6 +7389,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} > > static void __meminit pgdat_init_internals(struct pglist_data *pgdat) > { > + int i; > + > pgdat_resize_init(pgdat); > > pgdat_init_split_queue(pgdat); > @@ -7396,7 +7398,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) > > init_waitqueue_head(&pgdat->kswapd_wait); > init_waitqueue_head(&pgdat->pfmemalloc_wait); > - init_waitqueue_head(&pgdat->reclaim_wait); > + > + for (i = 0; i < NR_VMSCAN_THROTTLE; i++) > + init_waitqueue_head(&pgdat->reclaim_wait[i]); > > pgdat_page_ext_init(pgdat); > lruvec_init(&pgdat->__lruvec); > diff --git a/mm/vmscan.c b/mm/vmscan.c > index b58ea0b13286..eb81dcac15b2 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -1006,11 +1006,10 @@ static void handle_write_error(struct address_space *mapping, > unlock_page(page); > } > > -static void > -reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, > +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason, > long timeout) > { > - wait_queue_head_t *wqh = &pgdat->reclaim_wait; > + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; > unsigned long start = jiffies; > long ret; > DEFINE_WAIT(wait); > @@ -1044,7 +1043,7 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct page *page) > READ_ONCE(pgdat->nr_reclaim_start); > > 
if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) > - wake_up_interruptible_all(&pgdat->reclaim_wait); > + wake_up_interruptible_all(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); > } > > /* possible outcome of pageout() */ > @@ -2159,6 +2158,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, > struct scan_control *sc) > { > unsigned long inactive, isolated; > + bool too_many; > > if (current_is_kswapd()) > return 0; > @@ -2182,6 +2182,17 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, > if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) > inactive >>= 3; > > + too_many = isolated > inactive; > + > + /* Wake up tasks throttled due to too_many_isolated. */ > + if (!too_many) { > + wait_queue_head_t *wqh; > + > + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; > + if (waitqueue_active(wqh)) > + wake_up_interruptible_all(wqh); > + } > + > return isolated > inactive; Since too_many already holds this result, shouldn't this simply be "return too_many;"? > } > > @@ -2291,8 +2302,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, > return 0; > > /* wait a bit for the reclaimer. */ > - msleep(100); > - stalled = true; > + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED, HZ/10); > > /* We are about to die and free our memory. Return now. */ > if (fatal_signal_pending(current)) > -- > 2.31.1 > >