Johannes Weiner <hannes@xxxxxxxxxxx> writes: > Kswapd currently bails on higher-order allocations with an open-coded > check for whether it's reclaimed the compaction gap. > > compaction_suitable() is the customary interface to coordinate reclaim > with compaction. > > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> > --- > mm/vmscan.c | 67 ++++++++++++++++++----------------------------------- > 1 file changed, 23 insertions(+), 44 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index ee8c8ca2e7b5..723705b9e4d9 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) > if (!managed_zone(zone)) > continue; > > + /* Allocation can succeed in any zone, done */ > if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) > mark = wmark_pages(zone, WMARK_PROMO); > else > mark = high_wmark_pages(zone); > if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) > return true; > + > + /* Allocation can't succeed, but enough order-0 to compact */ > + if (compaction_suitable(zone, order, > + highest_zoneidx) == COMPACT_CONTINUE) > + return true; Should we check the following first? order > 0 && zone_watermark_ok_safe(zone, 0, mark, highest_zoneidx) Best Regards, Huang, Ying > } > > /* > @@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, > */ > shrink_node(pgdat, sc); > > - /* > - * Fragmentation may mean that the system cannot be rebalanced for > - * high-order allocations. If twice the allocation size has been > - * reclaimed then recheck watermarks only at order-0 to prevent > - * excessive reclaim. Assume that a process requested a high-order > - * can direct reclaim/compact. > - */ > - if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) > - sc->order = 0; > - > return sc->nr_scanned >= sc->nr_to_reclaim; > } > > @@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) > * that are eligible for use by the caller until at least one zone is > * balanced. > * > - * Returns the order kswapd finished reclaiming at. > - * > * kswapd scans the zones in the highmem->normal->dma direction. It skips > * zones which have free_pages > high_wmark_pages(zone), but once a zone is > * found to have free_pages <= high_wmark_pages(zone), any page in that zone > * or lower is eligible for reclaim until at least one usable zone is > * balanced. > */ > -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) > +static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) > { > int i; > unsigned long nr_soft_reclaimed; > @@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) > __fs_reclaim_release(_THIS_IP_); > psi_memstall_leave(&pflags); > set_task_reclaim_state(current, NULL); > - > - /* > - * Return the order kswapd stopped reclaiming at as > - * prepare_kswapd_sleep() takes it into account. If another caller > - * entered the allocator slow path while kswapd was awake, order will > - * remain at the higher level. > - */ > - return sc.order; > } > > /* > @@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, > return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; > } > > -static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, > +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, > unsigned int highest_zoneidx) > { > long remaining = 0; > @@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o > * eligible zone balanced that it's also unlikely that compaction will > * succeed. > */ > - if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { > + if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { > /* > * Compaction records what page blocks it recently failed to > * isolate pages from and skips them in the future scanning. > @@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o > * We have freed the memory, now we should compact it to make > * allocation of the requested order possible. > */ > - wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); > + wakeup_kcompactd(pgdat, order, highest_zoneidx); > > remaining = schedule_timeout(HZ/10); > > @@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o > kswapd_highest_zoneidx(pgdat, > highest_zoneidx)); > > - if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) > - WRITE_ONCE(pgdat->kswapd_order, reclaim_order); > + if (READ_ONCE(pgdat->kswapd_order) < order) > + WRITE_ONCE(pgdat->kswapd_order, order); > } > > finish_wait(&pgdat->kswapd_wait, &wait); > @@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o > * After a short sleep, check if it was a premature sleep. If not, then > * go fully to sleep until explicitly woken up. > */ > - if (!remaining && > - prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { > + if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { > trace_mm_vmscan_kswapd_sleep(pgdat->node_id); > > /* > @@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o > */ > static int kswapd(void *p) > { > - unsigned int alloc_order, reclaim_order; > - unsigned int highest_zoneidx = MAX_NR_ZONES - 1; > + unsigned int order, highest_zoneidx; > pg_data_t *pgdat = (pg_data_t *)p; > struct task_struct *tsk = current; > const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); > @@ -7374,22 +7358,20 @@ static int kswapd(void *p) > tsk->flags |= PF_MEMALLOC | PF_KSWAPD; > set_freezable(); > > - WRITE_ONCE(pgdat->kswapd_order, 0); > + order = 0; > + highest_zoneidx = MAX_NR_ZONES - 1; > + WRITE_ONCE(pgdat->kswapd_order, order); > WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); > + > atomic_set(&pgdat->nr_writeback_throttled, 0); > + > for ( ; ; ) { > bool ret; > > - alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); > - highest_zoneidx = kswapd_highest_zoneidx(pgdat, > - highest_zoneidx); > - > -kswapd_try_sleep: > - kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, > - highest_zoneidx); > + kswapd_try_to_sleep(pgdat, order, highest_zoneidx); > > /* Read the new order and highest_zoneidx */ > - alloc_order = READ_ONCE(pgdat->kswapd_order); > + order = READ_ONCE(pgdat->kswapd_order); > highest_zoneidx = kswapd_highest_zoneidx(pgdat, > highest_zoneidx); > WRITE_ONCE(pgdat->kswapd_order, 0); > @@ -7415,11 +7397,8 @@ static int kswapd(void *p) > * request (alloc_order). > */ > trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, > - alloc_order); > - reclaim_order = balance_pgdat(pgdat, alloc_order, > - highest_zoneidx); > - if (reclaim_order < alloc_order) > - goto kswapd_try_sleep; > + order); > + balance_pgdat(pgdat, order, highest_zoneidx); > } > > tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);