Kswapd currently bails on higher-order allocations with an open-coded check for whether it has reclaimed the compaction gap. compaction_suitable() is the customary interface to coordinate reclaim with compaction, so use it in pgdat_balanced() instead and drop the open-coded check from kswapd_shrink_node(). Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> --- mm/vmscan.c | 67 ++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 44 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index ee8c8ca2e7b5..723705b9e4d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6872,12 +6872,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) if (!managed_zone(zone)) continue; + /* Allocation can succeed in any zone, done */ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = wmark_pages(zone, WMARK_PROMO); else mark = high_wmark_pages(zone); if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) return true; + + /* Allocation can't succeed, but enough order-0 to compact */ + if (compaction_suitable(zone, order, + highest_zoneidx) == COMPACT_CONTINUE) + return true; } /* @@ -6968,16 +6974,6 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, */ shrink_node(pgdat, sc); - /* - * Fragmentation may mean that the system cannot be rebalanced for - * high-order allocations. If twice the allocation size has been - * reclaimed then recheck watermarks only at order-0 to prevent - * excessive reclaim. Assume that a process requested a high-order - * can direct reclaim/compact. - */ - if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) - sc->order = 0; - return sc->nr_scanned >= sc->nr_to_reclaim; } @@ -7018,15 +7014,13 @@ clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) * that are eligible for use by the caller until at least one zone is * balanced. * - * Returns the order kswapd finished reclaiming at. - * * kswapd scans the zones in the highmem->normal->dma direction. 
It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is * found to have free_pages <= high_wmark_pages(zone), any page in that zone * or lower is eligible for reclaim until at least one usable zone is * balanced. */ -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) +static void balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) { int i; unsigned long nr_soft_reclaimed; @@ -7226,14 +7220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) __fs_reclaim_release(_THIS_IP_); psi_memstall_leave(&pflags); set_task_reclaim_state(current, NULL); - - /* - * Return the order kswapd stopped reclaiming at as - * prepare_kswapd_sleep() takes it into account. If another caller - * entered the allocator slow path while kswapd was awake, order will - * remain at the higher level. - */ - return sc.order; } /* @@ -7251,7 +7237,7 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, unsigned int highest_zoneidx) { long remaining = 0; @@ -7269,7 +7255,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * eligible zone balanced that it's also unlikely that compaction will * succeed. */ - if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + if (prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -7282,7 +7268,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * We have freed the memory, now we should compact it to make * allocation of the requested order possible. 
*/ - wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); + wakeup_kcompactd(pgdat, order, highest_zoneidx); remaining = schedule_timeout(HZ/10); @@ -7296,8 +7282,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o kswapd_highest_zoneidx(pgdat, highest_zoneidx)); - if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) - WRITE_ONCE(pgdat->kswapd_order, reclaim_order); + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order); } finish_wait(&pgdat->kswapd_wait, &wait); @@ -7308,8 +7294,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (!remaining && - prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + if (!remaining && prepare_kswapd_sleep(pgdat, order, highest_zoneidx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -7350,8 +7335,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order; - unsigned int highest_zoneidx = MAX_NR_ZONES - 1; + unsigned int order, highest_zoneidx; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -7374,22 +7358,20 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_KSWAPD; set_freezable(); - WRITE_ONCE(pgdat->kswapd_order, 0); + order = 0; + highest_zoneidx = MAX_NR_ZONES - 1; + WRITE_ONCE(pgdat->kswapd_order, order); WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); + for ( ; ; ) { bool ret; - alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); - highest_zoneidx = kswapd_highest_zoneidx(pgdat, - highest_zoneidx); - -kswapd_try_sleep: - kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, - highest_zoneidx); + 
kswapd_try_to_sleep(pgdat, order, highest_zoneidx); /* Read the new order and highest_zoneidx */ - alloc_order = READ_ONCE(pgdat->kswapd_order); + order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); @@ -7415,11 +7397,8 @@ static int kswapd(void *p) * request (alloc_order). */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, - alloc_order); - reclaim_order = balance_pgdat(pgdat, alloc_order, - highest_zoneidx); - if (reclaim_order < alloc_order) - goto kswapd_try_sleep; + order); + balance_pgdat(pgdat, order, highest_zoneidx); } tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); -- 2.39.2