On 2024/3/15 15:41, 黄朝阳 (Zhaoyang Huang) wrote: > > > On Thu, Mar 14, 2024 at 10:15 AM <liuhailong@xxxxxxxx> wrote: >> >> From: "Hailong.Liu" <liuhailong@xxxxxxxx> >> >> This reverts commit 5da226dbfce3a2f44978c2c7cf88166e69a6788b. >> >> patch may cause system not responding. if cma pages is large in lru_list >> and system is in lowmemory, many tasks would enter direct reclaim and waste >> cpu time to isolate and return. Test this patch on android-5.15 device >> and tasks call stack as below. >> >> Task name: UsbFfs-worker [affinity: 0xff] pid: 3374 cpu: 7 prio: 120 start: ffffff8897a35c80 >> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc01eaa0000 >> Last_enqueued_ts: 0.000000000 Last_sleep_ts: 0.000000000 >> Stack: >> [<ffffffd32ee7d910>] __switch_to+0x180 >> [<ffffffd3302022fc>] __schedule+0x4dc >> [<ffffffd330201e08>] preempt_schedule+0x5c >> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54 >> [<ffffffd32f14906c>] shrink_inactive_list+0x1d0 >> [<ffffffd32f143998>] shrink_lruvec+0x1bc >> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184 >> [<ffffffd32f147414>] shrink_node+0x2d0 >> [<ffffffd32f146d38>] shrink_zones+0x14c >> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8 >> [<ffffffd32f142b08>] try_to_free_pages+0x2e0 >> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84 >> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0 >> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124 >> [<ffffffd32f19a220>] __vmalloc_area_node+0x188 >> [<ffffffd32f19a540>] __vmalloc_node+0x148 >> [<ffffffd32f19a60c>] vmalloc+0x4c >> [<ffffffd32f910218>] ffs_epfile_io+0x258 >> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0 >> [<ffffffd330033780>] kretprobe_trampoline[jt]+0x0 >> [<ffffffd32f28129c>] __io_submit_one+0x1c0 >> [<ffffffd32f280e38>] io_submit_one+0x88 >> [<ffffffd32f280c88>] __do_sys_io_submit+0x178 >> [<ffffffd32f27eac0>] __arm64_sys_io_submit+0x20 >> [<ffffffd32eeabb74>] el0_svc_common.llvm.9961749221945255377+0xd0 >> [<ffffffd32eeaba34>] do_el0_svc+0x28 >> 
[<ffffffd32ff21be8>] el0_svc+0x14 >> [<ffffffd32ff21b70>] el0_sync_handler+0x88 >> [<ffffffd32ee128b8>] el0_sync+0x1b8 >> >> Task name: kthreadd [affinity: 0xff] pid: 2 cpu: 7 prio: 120 start: ffffff87808c0000 >> state: 0x0[R] exit_state: 0x0 stack base: 0xffffffc008078000 >> Last_enqueued_ts: 0.000000000 Last_sleep_ts: 0.000000000 >> Stack: >> [<ffffffd32ee7d910>] __switch_to+0x180 >> [<ffffffd3302022fc>] __schedule+0x4dc >> [<ffffffd330201e08>] preempt_schedule+0x5c >> [<ffffffd33020a4d0>] _raw_spin_unlock_irq+0x54 >> [<ffffffd32f149168>] shrink_inactive_list+0x2cc >> [<ffffffd32f143998>] shrink_lruvec+0x1bc >> [<ffffffd32f147c0c>] shrink_node_memcgs+0x184 >> [<ffffffd32f147414>] shrink_node+0x2d0 >> [<ffffffd32f146d38>] shrink_zones+0x14c >> [<ffffffd32f142e84>] do_try_to_free_pages+0xe8 >> [<ffffffd32f142b08>] try_to_free_pages+0x2e0 >> [<ffffffd32f1a8e44>] __alloc_pages_direct_reclaim+0x84 >> [<ffffffd32f1a2d58>] __alloc_pages_slowpath+0x4d0 >> [<ffffffd32f1a23bc>] __alloc_pages_nodemask[jt]+0x124 >> [<ffffffd32f19a220>] __vmalloc_area_node+0x188 >> [<ffffffd32f19a044>] __vmalloc_node_range+0x88 >> [<ffffffd32f0fb430>] scs_alloc+0x1b8 >> [<ffffffd32f0fb62c>] scs_prepare+0x20 >> [<ffffffd32ef2ce04>] dup_task_struct+0xd4 >> [<ffffffd32ef2a77c>] copy_process+0x144 >> [<ffffffd32ef2bae4>] kernel_clone+0xb4 >> [<ffffffd32ef2c040>] kernel_thread+0x5c >> [<ffffffd32ef618d0>] kthreadd+0x184 >> >> without this patch, the tasks will reclaim cma pages and wakeup >> oom-killer or not spin on cpus. 
>> >> Signed-off-by: Hailong.Liu <liuhailong@xxxxxxxx> >> --- >> mm/vmscan.c | 22 +--------------------- >> 1 file changed, 1 insertion(+), 21 deletions(-) >> >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index 2fe4a11d63f4..197ddf62019f 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -2261,25 +2261,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, >> >> } >> >> -#ifdef CONFIG_CMA >> -/* >> - * It is waste of effort to scan and reclaim CMA pages if it is not available >> - * for current allocation context. Kswapd can not be enrolled as it can not >> - * distinguish this scenario by using sc->gfp_mask = GFP_KERNEL >> - */ >> -static bool skip_cma(struct folio *folio, struct scan_control *sc) >> -{ >> - return !current_is_kswapd() && >> - gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && >> - get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; >> -} >> -#else >> -static bool skip_cma(struct folio *folio, struct scan_control *sc) >> -{ >> - return false; >> -} >> -#endif >> - > >> NAK. > >> +Charan Teja Kalla -- This can cause build errors when CONFIG_LRU_GEN=y. > >> If you plan to post a v2, please include a reproducer. Thanks. > > Could you please retest the case with bellow patch, which has not been in the aosp yet. > > From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx> > > According to current CMA utilization policy, an alloc_pages(GFP_USER) > could 'steal' UNMOVABLE & RECLAIMABLE page blocks via the help of > CMA(pass zone_watermark_ok by counting CMA in but use U&R in rmqueue), > which could lead to following alloc_pages(GFP_KERNEL) fail. > Solving this by introducing second watermark checking for GFP_MOVABLE, > which could have the allocation use CMA when proper. 
> > -- Free_pages(30MB) > | > | > -- WMARK_LOW(25MB) > | > -- Free_CMA(12MB) > | > | > -- > > Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx> > --- > v6: update comments > --- > --- > mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 40 insertions(+), 4 deletions(-) > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 452459836b71..5a146aa7c0aa 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -2078,6 +2078,43 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, > > } > > +#ifdef CONFIG_CMA > +/* > + * GFP_MOVABLE allocation could drain UNMOVABLE & RECLAIMABLE page blocks via > + * the help of CMA which makes GFP_KERNEL failed. Checking if zone_watermark_ok > + * again without ALLOC_CMA to see if to use CMA first. > + */ > +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) > +{ > + unsigned long watermark; > + bool cma_first = false; > + > + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); > + /* check if GFP_MOVABLE pass previous zone_watermark_ok via the help of CMA */ > + if (zone_watermark_ok(zone, order, watermark, 0, alloc_flags & (~ALLOC_CMA))) { > + /* > + * Balance movable allocations between regular and CMA areas by > + * allocating from CMA when over half of the zone's free memory > + * is in the CMA area. > + */ > + cma_first = (zone_page_state(zone, NR_FREE_CMA_PAGES) > > + zone_page_state(zone, NR_FREE_PAGES) / 2); > + } else { > + /* > + * watermark failed means UNMOVABLE & RECLAIMBLE is not enough > + * now, we should use cma first to keep them stay around the > + * corresponding watermark > + */ > + cma_first = true; > + } > + return cma_first; > +} > +#else > +static bool use_cma_first(struct zone *zone, unsigned int order, unsigned int alloc_flags) > +{ > + return false; > +} > +#endif > /* > * Do the hard work of removing an element from the buddy allocator. > * Call me with the zone->lock already held. 
> @@ -2091,12 +2128,11 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, > if (IS_ENABLED(CONFIG_CMA)) { > /* > * Balance movable allocations between regular and CMA areas by > - * allocating from CMA when over half of the zone's free memory > - * is in the CMA area. > + * allocating from CMA base on judging zone_watermark_ok again > + * to see if the latest check got pass via the help of CMA > */ > if (alloc_flags & ALLOC_CMA && > - zone_page_state(zone, NR_FREE_CMA_PAGES) > > - zone_page_state(zone, NR_FREE_PAGES) / 2) { > + use_cma_first(zone, order, alloc_flags)) { > page = __rmqueue_cma_fallback(zone, order); > if (page) > return page; > -- > Hi Zhaoyang: I wrote a reproducer in the v2 patch. Your patch may not solve this case, because when the system is low on memory and the lru_list holds only CMA pages, direct reclaim would still waste time scanning and skipping them. For now we cannot know how many CMA pages are on the LRU, and adding a heuristic for that seems odd. Brs, Hailong.