On Tue 09-02-21 16:23:13, zhou wrote:
> From: zhou xianrong <xianrong.zhou@xxxxxxxxxxxxx>
> 
> For the purpose of better migration, CMA pages are allocated as a
> fallback after movable allocations fail, and they are then used
> normally for file pages or anonymous pages.
> 
> In the reclaim path, many CMA pages (if CMA is configured) are
> reclaimed from the LRU lists, mainly in kswapd, or in direct reclaim
> triggered by unmovable or reclaimable allocations. But these CMA
> pages cannot be used by the original unmovable or reclaimable
> allocations, so reclaiming them is unnecessary.
> 
> On the same system, if the CMA area is configured to be large, more
> unmovable (vmalloc etc.) or reclaimable (slab etc.) allocations fail,
> which triggers more kswapd rounds, which in turn reclaim more CMA
> pages.

Could you be more specific? Do you have any numbers and an example
configuration when this is visible?

> This may cause a vicious cycle: even under low memory there are still
> many CMA pages which cannot be allocated, due to unnecessary CMA
> reclaim and CMA fallback allocations. So CMA pages are not used
> sufficiently.
> 
> The modification is straightforward: skip reclaiming CMA pages when
> reclaim is triggered only by unmovable or reclaimable allocations.
> This optimization can avoid ~3% unnecessary CMA isolations (CMA
> isolated / total isolated).
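If I read the patch correctly, the core of the idea is to derive a
"may reclaim touch CMA" bit from the migratetype of the allocation
that triggered reclaim. A simplified sketch, paraphrasing the patch
with the existing gfp_migratetype()/is_migrate_movable() helpers
rather than quoting it verbatim:

	/* only movable allocations can be served from CMA pageblocks */
	static bool movable_reclaim(gfp_t gfp_mask)
	{
		return is_migrate_movable(gfp_migratetype(gfp_mask));
	}

	/* direct reclaim */
	sc.may_cma = movable_reclaim(gfp_mask);
	/* kswapd, using the migratetype recorded at wakeup time */
	sc.may_cma = is_migrate_movable(migratetype);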
Joonsoo used to have a patch series to drop many of the hacks we have
for CMA and make it part of the movable zone. That would solve many
problems, including this one. I am not sure where the work stands now,
but it would probably be better to revive that rather than add more
special casing on top of what we have right now.

> Signed-off-by: zhou xianrong <xianrong.zhou@xxxxxxxxxxxxx>
> ---
>  include/linux/mmzone.h        |  6 ++--
>  include/trace/events/vmscan.h | 20 +++++++----
>  mm/page_alloc.c               |  5 +--
>  mm/vmscan.c                   | 63 +++++++++++++++++++++++++++++------
>  4 files changed, 73 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index b593316bff3d..7dd38d7372b9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -301,6 +301,8 @@ struct lruvec {
>  #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
>  /* Isolate unevictable pages */
>  #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
> +/* Isolate non-CMA pages */
> +#define ISOLATE_NONCMA		((__force isolate_mode_t)0x10)
> 
>  /* LRU Isolation modes. */
>  typedef unsigned __bitwise isolate_mode_t;
> @@ -756,7 +758,7 @@ typedef struct pglist_data {
>  	wait_queue_head_t pfmemalloc_wait;
>  	struct task_struct *kswapd;	/* Protected by
>  					   mem_hotplug_begin/end() */
> -	int kswapd_order;
> +	int kswapd_order, kswapd_migratetype;
>  	enum zone_type kswapd_highest_zoneidx;
> 
>  	int kswapd_failures;	/* Number of 'reclaimed == 0' runs */
> @@ -840,7 +842,7 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
> 
>  void build_all_zonelists(pg_data_t *pgdat);
>  void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
> -		   enum zone_type highest_zoneidx);
> +		   int migratetype, enum zone_type highest_zoneidx);
>  bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
>  			 int highest_zoneidx, unsigned int alloc_flags,
>  			 long free_pages);
> diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
> index 2070df64958e..41bbafdfde84 100644
> --- a/include/trace/events/vmscan.h
> +++ b/include/trace/events/vmscan.h
> @@ -51,37 +51,41 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
> 
>  TRACE_EVENT(mm_vmscan_kswapd_wake,
> 
> -	TP_PROTO(int nid, int zid, int order),
> +	TP_PROTO(int nid, int zid, int order, int mt),
> 
> -	TP_ARGS(nid, zid, order),
> +	TP_ARGS(nid, zid, order, mt),
> 
>  	TP_STRUCT__entry(
>  		__field( int, nid )
>  		__field( int, zid )
>  		__field( int, order )
> +		__field( int, mt )
>  	),
> 
>  	TP_fast_assign(
>  		__entry->nid = nid;
>  		__entry->zid = zid;
>  		__entry->order = order;
> +		__entry->mt = mt;
>  	),
> 
> -	TP_printk("nid=%d order=%d",
> -		__entry->nid,
> -		__entry->order)
> +	TP_printk("nid=%d order=%d migratetype=%d",
> +		__entry->nid,
> +		__entry->order,
> +		__entry->mt)
>  );
> 
>  TRACE_EVENT(mm_vmscan_wakeup_kswapd,
> 
> -	TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
> +	TP_PROTO(int nid, int zid, int order, int mt, gfp_t gfp_flags),
> 
> -	TP_ARGS(nid, zid, order, gfp_flags),
> +	TP_ARGS(nid, zid, order, mt, gfp_flags),
> 
>  	TP_STRUCT__entry(
>  		__field( int, nid )
>  		__field( int, zid )
>  		__field( int, order )
> +		__field( int, mt )
>  		__field( gfp_t, gfp_flags )
>  	),
> 
> @@ -89,12 +93,14 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
>  		__entry->nid = nid;
>  		__entry->zid = zid;
>  		__entry->order = order;
> +		__entry->mt = mt;
>  		__entry->gfp_flags = gfp_flags;
>  	),
> 
> -	TP_printk("nid=%d order=%d gfp_flags=%s",
> -		__entry->nid,
> -		__entry->order,
> +	TP_printk("nid=%d order=%d migratetype=%d gfp_flags=%s",
> +		__entry->nid,
> +		__entry->order,
> +		__entry->mt,
>  		show_gfp_flags(__entry->gfp_flags))
>  );
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 519a60d5b6f7..45ceb15721b8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3517,7 +3517,7 @@ struct page *rmqueue(struct zone *preferred_zone,
>  	/* Separate test+clear to avoid unnecessary atomics */
>  	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
>  		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
> -		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
> +		wakeup_kswapd(zone, 0, 0, migratetype, zone_idx(zone));
>  	}
> 
>  	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
> @@ -4426,11 +4426,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
>  	struct zone *zone;
>  	pg_data_t *last_pgdat = NULL;
>  	enum zone_type highest_zoneidx = ac->highest_zoneidx;
> +	int migratetype = ac->migratetype;
> 
>  	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
>  					ac->nodemask) {
>  		if (last_pgdat != zone->zone_pgdat)
> -			wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
> +			wakeup_kswapd(zone, gfp_mask, order, migratetype, highest_zoneidx);
>  		last_pgdat = zone->zone_pgdat;
>  	}
>  }
> 
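For illustration, the migratetype recorded at wakeup comes from the
allocation's gfp mask (ac->migratetype is gfp_migratetype(gfp_mask)),
so the two interesting cases would look roughly like this (my own
examples, not from the patch):

	/* slab/vmalloc style allocation: the kswapd round skips CMA */
	wakeup_kswapd(zone, GFP_KERNEL, order,
			gfp_migratetype(GFP_KERNEL), highest_zoneidx);

	/* page cache or anonymous fault: CMA pages stay eligible */
	wakeup_kswapd(zone, GFP_HIGHUSER_MOVABLE, order,
			gfp_migratetype(GFP_HIGHUSER_MOVABLE), highest_zoneidx);

where gfp_migratetype(GFP_KERNEL) is MIGRATE_UNMOVABLE and
gfp_migratetype(GFP_HIGHUSER_MOVABLE) is MIGRATE_MOVABLE.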
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b1b574ad199d..e61ec8747a40 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -99,6 +99,9 @@ struct scan_control {
>  	/* Can pages be swapped as part of reclaim? */
>  	unsigned int may_swap:1;
> 
> +	/* Can cma pages be reclaimed? */
> +	unsigned int may_cma:1;
> +
>  	/*
>  	 * Cgroups are not reclaimed below their configured memory.low,
>  	 * unless we threaten to OOM. If any cgroups are skipped due to
> @@ -286,6 +289,11 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  }
>  #endif
> 
> +static bool movable_reclaim(gfp_t gfp_mask)
> +{
> +	return is_migrate_movable(gfp_migratetype(gfp_mask));
> +}
> +
>  /*
>   * This misses isolated pages which are not accounted for to save counters.
>   * As the data only determines if reclaim or compaction continues, it is
> @@ -1499,6 +1507,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>  		.gfp_mask = GFP_KERNEL,
>  		.priority = DEF_PRIORITY,
>  		.may_unmap = 1,
> +		.may_cma = 1,
>  	};
>  	struct reclaim_stat stat;
>  	unsigned int nr_reclaimed;
> @@ -1593,6 +1602,9 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
>  	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
>  		return ret;
> 
> +	if ((mode & ISOLATE_NONCMA) && is_migrate_cma(get_pageblock_migratetype(page)))
> +		return ret;
> +
>  	return 0;
>  }
> 
> @@ -1647,7 +1659,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
>  	unsigned long skipped = 0;
>  	unsigned long scan, total_scan, nr_pages;
>  	LIST_HEAD(pages_skipped);
> -	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
> +	isolate_mode_t mode;
> +
> +	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
> +	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);
> 
>  	total_scan = 0;
>  	scan = 0;
> @@ -2125,6 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
>  		.may_writepage = 1,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = 1,
>  	};
> 
>  	while (!list_empty(page_list)) {
> @@ -3253,6 +3269,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = movable_reclaim(gfp_mask),
>  	};
> 
>  	/*
> @@ -3298,6 +3315,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>  		.may_unmap = 1,
>  		.reclaim_idx = MAX_NR_ZONES - 1,
>  		.may_swap = !noswap,
> +		.may_cma = 1,
>  	};
> 
>  	WARN_ON_ONCE(!current->reclaim_state);
> @@ -3341,6 +3359,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>  		.may_writepage = !laptop_mode,
>  		.may_unmap = 1,
>  		.may_swap = may_swap,
> +		.may_cma = 1,
>  	};
>  	/*
>  	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> @@ -3548,7 +3567,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
>   * or lower is eligible for reclaim until at least one usable zone is
>   * balanced.
>   */
> -static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
> +static int balance_pgdat(pg_data_t *pgdat, int order, int migratetype, int highest_zoneidx)
>  {
>  	int i;
>  	unsigned long nr_soft_reclaimed;
> @@ -3650,6 +3669,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>  	 */
>  	sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
>  	sc.may_swap = !nr_boost_reclaim;
> +	sc.may_cma = is_migrate_movable(migratetype);
> 
>  	/*
>  	 * Do some background aging of the anon list, to give
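So the mechanism, as I understand it, is: the new scan_control bit is
translated into an isolation mode, and the per-page check then leaves
pages sitting in CMA pageblocks on the LRU:

	mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
	mode |= (sc->may_cma ? 0 : ISOLATE_NONCMA);

	/* later, in __isolate_lru_page_prepare() */
	if ((mode & ISOLATE_NONCMA) &&
	    is_migrate_cma(get_pageblock_migratetype(page)))
		return ret;	/* CMA cannot serve the triggering allocation */

Note that get_pageblock_migratetype() means an additional pageblock
flags lookup for every page scanned in this mode.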
> @@ -3771,8 +3791,15 @@ static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
>  	return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
>  }
> 
> +static int kswapd_migratetype(pg_data_t *pgdat, int prev_migratetype)
> +{
> +	int curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
> +
> +	return curr_migratetype == MIGRATE_TYPES ? prev_migratetype : curr_migratetype;
> +}
> +
>  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
> -				unsigned int highest_zoneidx)
> +				int migratetype, unsigned int highest_zoneidx)
>  {
>  	long remaining = 0;
>  	DEFINE_WAIT(wait);
> @@ -3807,8 +3834,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>  		remaining = schedule_timeout(HZ/10);
> 
>  		/*
> -		 * If woken prematurely then reset kswapd_highest_zoneidx and
> -		 * order. The values will either be from a wakeup request or
> +		 * If woken prematurely then reset kswapd_highest_zoneidx, order
> +		 * and migratetype. The values will either be from a wakeup request or
>  		 * the previous request that slept prematurely.
>  		 */
>  		if (remaining) {
> @@ -3818,6 +3845,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
> 
>  			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
>  				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
> +
> +			if (!is_migrate_movable(READ_ONCE(pgdat->kswapd_migratetype)))
> +				WRITE_ONCE(pgdat->kswapd_migratetype,
> +					   kswapd_migratetype(pgdat, migratetype));
>  		}
> 
>  		finish_wait(&pgdat->kswapd_wait, &wait);
> @@ -3870,6 +3901,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
>   */
>  static int kswapd(void *p)
>  {
> +	int migratetype = 0;
>  	unsigned int alloc_order, reclaim_order;
>  	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
>  	pg_data_t *pgdat = (pg_data_t*)p;
> @@ -3895,23 +3927,27 @@ static int kswapd(void *p)
>  	set_freezable();
> 
>  	WRITE_ONCE(pgdat->kswapd_order, 0);
> +	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>  	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
>  	for ( ; ; ) {
>  		bool ret;
> 
>  		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							 highest_zoneidx);
> 
>  kswapd_try_sleep:
>  		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
> -				    highest_zoneidx);
> +				    migratetype, highest_zoneidx);
> 
>  		/* Read the new order and highest_zoneidx */
>  		alloc_order = READ_ONCE(pgdat->kswapd_order);
> +		migratetype = kswapd_migratetype(pgdat, migratetype);
>  		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
>  							 highest_zoneidx);
>  		WRITE_ONCE(pgdat->kswapd_order, 0);
> +		WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);
>  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
> 
>  		ret = try_to_freeze();
> @@ -3934,8 +3970,8 @@ static int kswapd(void *p)
>  		 * request (alloc_order).
>  		 */
>  		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
> -					    alloc_order);
> -		reclaim_order = balance_pgdat(pgdat, alloc_order,
> +					    alloc_order, migratetype);
> +		reclaim_order = balance_pgdat(pgdat, alloc_order, migratetype,
>  					      highest_zoneidx);
>  		if (reclaim_order < alloc_order)
>  			goto kswapd_try_sleep;
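IIUC kswapd consumes the requested migratetype the same way it handles
kswapd_order and kswapd_highest_zoneidx: read the request, reset the
pgdat field to the MIGRATE_TYPES "no request recorded" sentinel, and
fall back to the previous value when nothing new was recorded.
Paraphrased:

	/* per kswapd loop iteration (not the patch verbatim) */
	migratetype = kswapd_migratetype(pgdat, migratetype);	/* consume */
	WRITE_ONCE(pgdat->kswapd_migratetype, MIGRATE_TYPES);	/* reset */

And one movable wakeup is enough to keep CMA reclaim enabled for the
whole balancing round, because wakeup_kswapd() below lets movable
requests overwrite the recorded value.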
> @@ -3954,10 +3990,11 @@ static int kswapd(void *p)
>   * needed.
>   */
>  void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> -		   enum zone_type highest_zoneidx)
> +		   int migratetype, enum zone_type highest_zoneidx)
>  {
>  	pg_data_t *pgdat;
>  	enum zone_type curr_idx;
> +	int curr_migratetype;
> 
>  	if (!managed_zone(zone))
>  		return;
> @@ -3967,6 +4004,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
> 
>  	pgdat = zone->zone_pgdat;
>  	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
> +	curr_migratetype = READ_ONCE(pgdat->kswapd_migratetype);
> 
>  	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
>  		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
> @@ -3974,6 +4012,9 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>  	if (READ_ONCE(pgdat->kswapd_order) < order)
>  		WRITE_ONCE(pgdat->kswapd_order, order);
> 
> +	if (curr_migratetype == MIGRATE_TYPES || is_migrate_movable(migratetype))
> +		WRITE_ONCE(pgdat->kswapd_migratetype, migratetype);
> +
>  	if (!waitqueue_active(&pgdat->kswapd_wait))
>  		return;
> 
> @@ -3994,7 +4035,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
>  	}
> 
>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
> -				      gfp_flags);
> +				      migratetype, gfp_flags);
>  	wake_up_interruptible(&pgdat->kswapd_wait);
>  }
> 
> @@ -4017,6 +4058,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>  		.may_writepage = 1,
>  		.may_unmap = 1,
>  		.may_swap = 1,
> +		.may_cma = 1,
>  		.hibernation_mode = 1,
>  	};
>  	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
> @@ -4176,6 +4218,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>  		.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
>  		.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
>  		.may_swap = 1,
> +		.may_cma = movable_reclaim(gfp_mask),
>  		.reclaim_idx = gfp_zone(gfp_mask),
>  	};
> 
> -- 
> 2.25.1

-- 
Michal Hocko
SUSE Labs