On 14 Feb 2022, at 2:59, Christophe Leroy wrote: > Le 11/02/2022 à 17:41, Zi Yan a écrit : >> From: Zi Yan <ziy@xxxxxxxxxx> >> >> alloc_contig_range() worked at MAX_ORDER-1 granularity to avoid merging >> pageblocks with different migratetypes. It might unnecessarily convert >> extra pageblocks at the beginning and at the end of the range. Change >> alloc_contig_range() to work at pageblock granularity. >> >> Special handling is needed for free pages and in-use pages across the >> boundaries of the range specified alloc_contig_range(). Because these >> partially isolated pages causes free page accounting issues. The free >> pages will be split and freed into separate migratetype lists; the >> in-use pages will be migrated then the freed pages will be handled. >> >> Signed-off-by: Zi Yan <ziy@xxxxxxxxxx> >> --- >> include/linux/page-isolation.h | 2 +- >> mm/internal.h | 3 + >> mm/memory_hotplug.c | 3 +- >> mm/page_alloc.c | 235 +++++++++++++++++++++++++-------- >> mm/page_isolation.c | 33 ++++- >> 5 files changed, 211 insertions(+), 65 deletions(-) >> >> diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h >> index 4ef7be6def83..78ff940cc169 100644 >> --- a/include/linux/page-isolation.h >> +++ b/include/linux/page-isolation.h >> @@ -54,7 +54,7 @@ int move_freepages_block(struct zone *zone, struct page *page, >> */ >> int >> start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, >> - unsigned migratetype, int flags); >> + unsigned migratetype, int flags, gfp_t gfp_flags); >> >> /* >> * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. >> diff --git a/mm/internal.h b/mm/internal.h >> index 0d240e876831..509cbdc25992 100644 >> --- a/mm/internal.h >> +++ b/mm/internal.h >> @@ -319,6 +319,9 @@ isolate_freepages_range(struct compact_control *cc, >> int >> isolate_migratepages_range(struct compact_control *cc, >> unsigned long low_pfn, unsigned long end_pfn); >> + >> +int >> +isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, int isolate_before_boundary); >> #endif >> int find_suitable_fallback(struct free_area *area, unsigned int order, >> int migratetype, bool only_stealable, bool *can_steal); >> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c >> index ce68098832aa..82406d2f3e46 100644 >> --- a/mm/memory_hotplug.c >> +++ b/mm/memory_hotplug.c >> @@ -1863,7 +1863,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, >> /* set above range as isolated */ >> ret = start_isolate_page_range(start_pfn, end_pfn, >> MIGRATE_MOVABLE, >> - MEMORY_OFFLINE | REPORT_FAILURE); >> + MEMORY_OFFLINE | REPORT_FAILURE, >> + GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); >> if (ret) { >> reason = "failure to isolate range"; >> goto failed_removal_pcplists_disabled; >> diff --git a/mm/page_alloc.c b/mm/page_alloc.c >> index 62ef78f3d771..7a4fa21aea5c 100644 >> --- a/mm/page_alloc.c >> +++ b/mm/page_alloc.c >> @@ -8985,7 +8985,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list) >> #endif >> >> /* [start, end) must belong to a single zone. */ >> -static int __alloc_contig_migrate_range(struct compact_control *cc, >> +int __alloc_contig_migrate_range(struct compact_control *cc, >> unsigned long start, unsigned long end) >> { >> /* This function is based on compact_zone() from compaction.c. */ >> @@ -9043,6 +9043,167 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, >> return 0; >> } >> >> +/** >> + * split_free_page() -- split a free page at split_pfn_offset >> + * @free_page: the original free page >> + * @order: the order of the page >> + * @split_pfn_offset: split offset within the page >> + * >> + * It is used when the free page crosses two pageblocks with different migratetypes >> + * at split_pfn_offset within the page. The split free page will be put into >> + * separate migratetype lists afterwards. Otherwise, the function achieves >> + * nothing. >> + */ >> +static inline void split_free_page(struct page *free_page, >> + int order, unsigned long split_pfn_offset) >> +{ >> + struct zone *zone = page_zone(free_page); >> + unsigned long free_page_pfn = page_to_pfn(free_page); >> + unsigned long pfn; >> + unsigned long flags; >> + int free_page_order; >> + >> + spin_lock_irqsave(&zone->lock, flags); >> + del_page_from_free_list(free_page, zone, order); >> + for (pfn = free_page_pfn; >> + pfn < free_page_pfn + (1UL << order);) { >> + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); >> + >> + free_page_order = order_base_2(split_pfn_offset); >> + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, >> + mt, FPI_NONE); >> + pfn += 1UL << free_page_order; >> + split_pfn_offset -= (1UL << free_page_order); >> + /* we have done the first part, now switch to second part */ >> + if (split_pfn_offset == 0) >> + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); >> + } >> + spin_unlock_irqrestore(&zone->lock, flags); >> +} >> + >> +/** >> + * isolate_single_pageblock() -- tries to isolate a pageblock that might be >> + * within a free or in-use page. >> + * @boundary_pfn: pageblock-aligned pfn that a page might cross >> + * @gfp_flags: GFP flags used for migrating pages >> + * @isolate_before_boundary: isolate the pageblock before (1) or after (0) >> + * the boundary_pfn >> + * >> + * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one >> + * pageblock. When not all pageblocks within a page are isolated at the same >> + * time, free page accounting can go wrong. For example, in the case of >> + * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks. >> + * [ MAX_ORDER-1 ] >> + * [ pageblock0 | pageblock1 ] >> + * When either pageblock is isolated, if it is a free page, the page is not >> + * split into separate migratetype lists, which is supposed to; if it is an >> + * in-use page and freed later, __free_one_page() does not split the free page >> + * either. The function handles this by splitting the free page or migrating >> + * the in-use page then splitting the free page. >> + */ >> +int isolate_single_pageblock(unsigned long boundary_pfn, gfp_t gfp_flags, >> + int isolate_before_boundary) > > Do you need such big param name ? I am happy to take any suggestion. > > See > https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#naming > > isolate_before_boundary could probably be shorter. isolate_before instead? > > And should be a bool by the way. Sure. > >> +{ >> + unsigned char saved_mt; >> + /* >> + * scan at max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) aligned range to >> + * avoid isolate pageblocks belonging to a bigger free or in-use page >> + */ >> + unsigned long start_pfn = pfn_max_align_down(boundary_pfn); >> + unsigned long isolated_pageblock_pfn; > > Variable name too long. > >> + unsigned long pfn; >> + >> + VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); >> + >> + if (isolate_before_boundary) >> + isolated_pageblock_pfn = boundary_pfn - pageblock_nr_pages; >> + else >> + isolated_pageblock_pfn = boundary_pfn; >> + >> + saved_mt = get_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn)); >> + set_pageblock_migratetype(pfn_to_page(isolated_pageblock_pfn), MIGRATE_ISOLATE); >> + >> + for (pfn = start_pfn; pfn < boundary_pfn;) { > > This loop is a bit long a deep. Isn't there a way to put what's in "if > (PageHuge(page) || PageTransCompound(page))" into a sub-function ? > Let me give it a try. > See > https://www.kernel.org/doc/html/latest/process/coding-style.html?highlight=style#functions > Thanks for the review. >> + struct page *page = pfn_to_page(pfn); >> + >> + /* >> + * start_pfn is max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) >> + * aligned, if there is any free pages in [start_pfn, boundary_pfn), >> + * its head page will always be in the range. >> + */ >> + if (PageBuddy(page)) { >> + int order = buddy_order(page); >> + >> + if (pfn + (1UL << order) > boundary_pfn) >> + split_free_page(page, order, boundary_pfn - pfn); >> + pfn += (1UL << order); >> + continue; >> + } >> + /* >> + * migrate compound pages then let the free page handling code >> + * above do the rest >> + */ >> + if (PageHuge(page) || PageTransCompound(page)) { >> + unsigned long nr_pages = compound_nr(page); >> + int order = compound_order(page); >> + struct page *head = compound_head(page); >> + unsigned long head_pfn = page_to_pfn(head); >> + >> + if (head_pfn + nr_pages >= boundary_pfn) { >> + int ret; >> + struct compact_control cc = { >> + .nr_migratepages = 0, >> + .order = -1, >> + .zone = page_zone(pfn_to_page(head_pfn)), >> + .mode = MIGRATE_SYNC, >> + .ignore_skip_hint = true, >> + .no_set_skip_hint = true, >> + .gfp_mask = current_gfp_context(gfp_flags), >> + .alloc_contig = true, >> + }; >> + >> + INIT_LIST_HEAD(&cc.migratepages); >> + >> + ret = __alloc_contig_migrate_range(&cc, head_pfn, >> + head_pfn + nr_pages); >> + >> + if (ret) { >> + /* restore the original migratetype */ >> + set_pageblock_migratetype( >> + pfn_to_page(isolated_pageblock_pfn), >> + saved_mt); >> + return -EBUSY; >> + } >> + /* >> + * reset pfn, let the free page handling code >> + * above split the free page to the right >> + * migratetype list. >> + * >> + * head_pfn is not used here as a hugetlb page >> + * order can be bigger than MAX_ORDER-1, but >> + * after it is freed, the free page order is not. >> + * Use pfn within the range to find the head of >> + * the free page and reset order to 0 if a hugetlb >> + * page with >MAX_ORDER-1 order is encountered. >> + */ >> + if (order > MAX_ORDER-1) >> + order = 0; >> + while (!PageBuddy(pfn_to_page(pfn))) { >> + order++; >> + pfn &= ~0UL << order; >> + } >> + continue; >> + } >> + pfn += nr_pages; >> + continue; >> + } >> + >> + pfn++; >> + } >> + return 0; >> +} >> + >> + >> /** >> * alloc_contig_range() -- tries to allocate given range of pages >> * @start: start PFN to allocate >> @@ -9067,8 +9228,9 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, >> int alloc_contig_range(unsigned long start, unsigned long end, >> unsigned migratetype, gfp_t gfp_mask) >> { >> - unsigned long outer_start, outer_end; >> - unsigned int order; >> + unsigned long outer_end; >> + unsigned long alloc_start = ALIGN_DOWN(start, pageblock_nr_pages); >> + unsigned long alloc_end = ALIGN(end, pageblock_nr_pages); >> int ret = 0; >> >> struct compact_control cc = { >> @@ -9087,14 +9249,11 @@ int alloc_contig_range(unsigned long start, unsigned long end, >> * What we do here is we mark all pageblocks in range as >> * MIGRATE_ISOLATE. Because pageblock and max order pages may >> * have different sizes, and due to the way page allocator >> - * work, we align the range to biggest of the two pages so >> - * that page allocator won't try to merge buddies from >> - * different pageblocks and change MIGRATE_ISOLATE to some >> - * other migration type. >> + * work, start_isolate_page_range() has special handlings for this. >> * >> * Once the pageblocks are marked as MIGRATE_ISOLATE, we >> * migrate the pages from an unaligned range (ie. pages that >> - * we are interested in). This will put all the pages in >> + * we are interested in). This will put all the pages in >> * range back to page allocator as MIGRATE_ISOLATE. >> * >> * When this is done, we take the pages in range from page >> @@ -9107,9 +9266,9 @@ int alloc_contig_range(unsigned long start, unsigned long end, >> * put back to page allocator so that buddy can use them. >> */ >> >> - ret = start_isolate_page_range(start, end, migratetype, 0); >> + ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask); >> if (ret) >> - return ret; >> + goto done; >> >> drain_all_pages(cc.zone); >> >> @@ -9128,68 +9287,28 @@ int alloc_contig_range(unsigned long start, unsigned long end, >> goto done; >> ret = 0; >> >> - /* >> - * Pages from [start, end) are within a MAX_ORDER_NR_PAGES >> - * aligned blocks that are marked as MIGRATE_ISOLATE. What's >> - * more, all pages in [start, end) are free in page allocator. >> - * What we are going to do is to allocate all pages from >> - * [start, end) (that is remove them from page allocator). >> - * >> - * The only problem is that pages at the beginning and at the >> - * end of interesting range may be not aligned with pages that >> - * page allocator holds, ie. they can be part of higher order >> - * pages. Because of this, we reserve the bigger range and >> - * once this is done free the pages we are not interested in. >> - * >> - * We don't have to hold zone->lock here because the pages are >> - * isolated thus they won't get removed from buddy. >> - */ >> - >> - order = 0; >> - outer_start = start; >> - while (!PageBuddy(pfn_to_page(outer_start))) { >> - if (++order >= MAX_ORDER) { >> - outer_start = start; >> - break; >> - } >> - outer_start &= ~0UL << order; >> - } >> - >> - if (outer_start != start) { >> - order = buddy_order(pfn_to_page(outer_start)); >> - >> - /* >> - * outer_start page could be small order buddy page and >> - * it doesn't include start page. Adjust outer_start >> - * in this case to report failed page properly >> - * on tracepoint in test_pages_isolated() >> - */ >> - if (outer_start + (1UL << order) <= start) >> - outer_start = start; >> - } >> - >> /* Make sure the range is really isolated. */ >> - if (test_pages_isolated(outer_start, end, 0)) { >> + if (test_pages_isolated(alloc_start, alloc_end, 0)) { >> ret = -EBUSY; >> goto done; >> } >> >> /* Grab isolated pages from freelists. */ >> - outer_end = isolate_freepages_range(&cc, outer_start, end); >> + outer_end = isolate_freepages_range(&cc, alloc_start, alloc_end); >> if (!outer_end) { >> ret = -EBUSY; >> goto done; >> } >> >> /* Free head and tail (if any) */ >> - if (start != outer_start) >> - free_contig_range(outer_start, start - outer_start); >> - if (end != outer_end) >> - free_contig_range(end, outer_end - end); >> + if (start != alloc_start) >> + free_contig_range(alloc_start, start - alloc_start); >> + if (end != alloc_end) >> + free_contig_range(end, alloc_end - end); >> >> done: >> - undo_isolate_page_range(pfn_max_align_down(start), >> - pfn_max_align_up(end), migratetype); >> + undo_isolate_page_range(alloc_start, >> + alloc_end, migratetype); >> return ret; >> } >> EXPORT_SYMBOL(alloc_contig_range); >> diff --git a/mm/page_isolation.c b/mm/page_isolation.c >> index 64d093ab83ec..0256d5e1032c 100644 >> --- a/mm/page_isolation.c >> +++ b/mm/page_isolation.c >> @@ -285,6 +285,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) >> * and PageOffline() pages. >> * REPORT_FAILURE - report details about the failure to >> * isolate the range >> + * @gfp_flags: GFP flags used for migrating pages that sit across the >> + * range boundaries. >> * >> * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in >> * the range will never be allocated. Any free pages and pages freed in the >> @@ -293,6 +295,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) >> * pages in the range finally, the caller have to free all pages in the range. >> * test_page_isolated() can be used for test it. >> * >> + * The function first tries to isolate the pageblocks at the beginning and end >> + * of the range, since there might be pages across the range boundaries. >> + * Afterwards, it isolates the rest of the range. >> + * >> * There is no high level synchronization mechanism that prevents two threads >> * from trying to isolate overlapping ranges. If this happens, one thread >> * will notice pageblocks in the overlapping range already set to isolate. >> @@ -313,21 +319,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) >> * Return: 0 on success and -EBUSY if any part of range cannot be isolated. >> */ >> int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, >> - unsigned migratetype, int flags) >> + unsigned migratetype, int flags, gfp_t gfp_flags) >> { >> unsigned long pfn; >> struct page *page; >> + /* isolation is done at page block granularity */ >> + unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); >> + unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); >> + int ret; >> >> - unsigned long isolate_start = pfn_max_align_down(start_pfn); >> - unsigned long isolate_end = pfn_max_align_up(end_pfn); >> + /* isolate [isolate_start, isolate_start + pageblock_nr_pages] pageblock */ >> + ret = isolate_single_pageblock(isolate_start, gfp_flags, 0); >> + if (ret) >> + return ret; >> + >> + /* isolate [isolate_end - pageblock_nr_pages, isolate_end] pageblock */ >> + ret = isolate_single_pageblock(isolate_end, gfp_flags, 1); >> + if (ret) { >> + unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype); >> + return ret; >> + } >> >> - for (pfn = isolate_start; >> - pfn < isolate_end; >> + /* skip isolated pageblocks at the beginning and end */ >> + for (pfn = isolate_start + pageblock_nr_pages; >> + pfn < isolate_end - pageblock_nr_pages; >> pfn += pageblock_nr_pages) { >> page = __first_valid_page(pfn, pageblock_nr_pages); >> if (page && set_migratetype_isolate(page, migratetype, flags, >> start_pfn, end_pfn)) { >> undo_isolate_page_range(isolate_start, pfn, migratetype); >> + unset_migratetype_isolate( >> + pfn_to_page(isolate_end - pageblock_nr_pages), >> + migratetype); >> return -EBUSY; >> } >> } -- Best Regards, Yan, Zi
Attachment:
signature.asc
Description: OpenPGP digital signature