On Mon, Dec 21, 2020 at 11:47 PM Liang Li <liliang.opensource@xxxxxxxxx> wrote: > > Free page reporting only supports buddy pages; it can't report the > free pages reserved for the hugetlbfs case. On the other hand, hugetlbfs > is a good choice for a system with a huge amount of RAM, because it > can help to reduce the memory management overhead and improve system > performance. > This patch adds support for reporting hugepages in the free list > of hugetlb; it can be used by the virtio_balloon driver for memory > overcommit and for pre-zeroing free pages to speed up memory population. > > Cc: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx> > Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> > Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> > Cc: Dan Williams <dan.j.williams@xxxxxxxxx> > Cc: Dave Hansen <dave.hansen@xxxxxxxxx> > Cc: David Hildenbrand <david@xxxxxxxxxx> > Cc: Michal Hocko <mhocko@xxxxxxxx> > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Cc: Alex Williamson <alex.williamson@xxxxxxxxxx> > Cc: Michael S. Tsirkin <mst@xxxxxxxxxx> > Cc: Jason Wang <jasowang@xxxxxxxxxx> > Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> > Cc: Liang Li <liliang324@xxxxxxxxx> > Signed-off-by: Liang Li <liliangleo@xxxxxxxxxxxxxx> > --- > include/linux/hugetlb.h | 3 + > include/linux/page_reporting.h | 5 + > mm/hugetlb.c | 29 ++++ > mm/page_reporting.c | 287 +++++++++++++++++++++++++++++++++ > mm/page_reporting.h | 34 ++++ > 5 files changed, 358 insertions(+) > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index ebca2ef02212..a72ad25501d3 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -11,6 +11,7 @@ > #include <linux/kref.h> > #include <linux/pgtable.h> > #include <linux/gfp.h> > +#include <linux/page_reporting.h> > > struct ctl_table; > struct user_struct; > @@ -114,6 +115,8 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, > int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, > loff_t *); > > +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid); > +void putback_isolate_huge_page(struct hstate *h, struct page *page); > int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); > long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, > struct page **, struct vm_area_struct **, > diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h > index 63e1e9fbcaa2..0da3d1a6f0cc 100644 > --- a/include/linux/page_reporting.h > +++ b/include/linux/page_reporting.h > @@ -7,6 +7,7 @@ > > /* This value should always be a power of 2, see page_reporting_cycle() */ > #define PAGE_REPORTING_CAPACITY 32 > +#define HUGEPAGE_REPORTING_CAPACITY 1 > > struct page_reporting_dev_info { > /* function that alters pages to make them "reported" */ > @@ -26,4 +27,8 @@ struct page_reporting_dev_info { > /* Tear-down and bring-up for page reporting devices */ > void page_reporting_unregister(struct page_reporting_dev_info *prdev); > int page_reporting_register(struct page_reporting_dev_info *prdev); > + > +/* Tear-down and bring-up for hugepage reporting devices */ > +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev); > +int hugepage_reporting_register(struct page_reporting_dev_info *prdev); > #endif /*_LINUX_PAGE_REPORTING_H */ > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index cbf32d2824fd..de6ce147dfe2 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -41,6 +41,7 @@ > #include <linux/node.h> > #include 
<linux/userfaultfd_k.h> > #include <linux/page_owner.h> > +#include "page_reporting.h" > #include "internal.h" > > int hugetlb_max_hstate __read_mostly; > @@ -1028,6 +1029,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) > list_move(&page->lru, &h->hugepage_freelists[nid]); > h->free_huge_pages++; > h->free_huge_pages_node[nid]++; > + if (hugepage_reported(page)) { > + __ClearPageReported(page); > + pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages); > + } > + hugepage_reporting_notify_free(h->order); > } > > static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) > @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla > return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); > } > > +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid) > +{ > + bool ret = true; > + > + VM_BUG_ON_PAGE(!PageHead(page), page); > + > + list_move(&page->lru, &h->hugepage_activelist); > + set_page_refcounted(page); > + h->free_huge_pages--; > + h->free_huge_pages_node[nid]--; > + > + return ret; > +} > + > +void putback_isolate_huge_page(struct hstate *h, struct page *page) > +{ > + int nid = page_to_nid(page); > + pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages); > + list_move(&page->lru, &h->hugepage_freelists[nid]); > + h->free_huge_pages++; > + h->free_huge_pages_node[nid]++; > +} > + > bool isolate_huge_page(struct page *page, struct list_head *list) > { > bool ret = true; > diff --git a/mm/page_reporting.c b/mm/page_reporting.c > index 20ec3fb1afc4..15d4b5372df8 100644 > --- a/mm/page_reporting.c > +++ b/mm/page_reporting.c > @@ -7,6 +7,7 @@ > #include <linux/delay.h> > #include <linux/scatterlist.h> > #include <linux/sched.h> > +#include <linux/hugetlb.h> > > #include "page_reporting.h" > #include "internal.h" > @@ -16,6 +17,10 @@ static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; > int page_report_mini_order = pageblock_order; > unsigned long page_report_batch_size = 32 * 1024 * 1024; > > +static struct page_reporting_dev_info __rcu *hgpr_dev_info __read_mostly; > +int hugepage_report_mini_order = pageblock_order; > +unsigned long hugepage_report_batch_size = 64 * 1024 * 1024; > + > enum { > PAGE_REPORTING_IDLE = 0, > PAGE_REPORTING_REQUESTED, > @@ -67,6 +72,24 @@ void __page_reporting_notify(void) > rcu_read_unlock(); > } > > +/* notify prdev of free hugepage reporting request */ > +void __hugepage_reporting_notify(void) > +{ > + struct page_reporting_dev_info *prdev; > + > + /* > + * We use RCU to protect the pr_dev_info pointer. In almost all > + * cases this should be present, however in the unlikely case of > + * a shutdown this will be NULL and we should exit. > + */ > + rcu_read_lock(); > + prdev = rcu_dereference(hgpr_dev_info); > + if (likely(prdev)) > + __page_reporting_request(prdev); > + > + rcu_read_unlock(); > +} > + > static void > page_reporting_drain(struct page_reporting_dev_info *prdev, > struct scatterlist *sgl, unsigned int nents, bool reported) > @@ -103,6 +126,213 @@ page_reporting_drain(struct page_reporting_dev_info *prdev, > sg_init_table(sgl, nents); > } > > +static void > +hugepage_reporting_drain(struct page_reporting_dev_info *prdev, > + struct hstate *h, struct scatterlist *sgl, > + unsigned int nents, bool reported) > +{ > + struct scatterlist *sg = sgl; > + > + /* > + * Drain the now reported pages back into their respective > + * free lists/areas. 
We assume at least one page is populated. > + */ > + do { > + struct page *page = sg_page(sg); > + > + putback_isolate_huge_page(h, page); > + > + /* If the pages were not reported due to error skip flagging */ > + if (!reported) > + continue; > + > + __SetPageReported(page); > + } while ((sg = sg_next(sg))); > + > + /* reinitialize scatterlist now that it is empty */ > + sg_init_table(sgl, nents); > +} > + > +/* > + * The page reporting cycle consists of 4 stages, fill, report, drain, and > + * idle. We will cycle through the first 3 stages until we cannot obtain a > + * full scatterlist of pages, in that case we will switch to idle. > + */ > +static int > +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev, > + struct hstate *h, unsigned int nid, > + struct scatterlist *sgl, unsigned int *offset) > +{ > + struct list_head *list = &h->hugepage_freelists[nid]; > + unsigned int page_len = PAGE_SIZE << h->order; > + struct page *page, *next; > + long budget; > + int ret = 0, scan_cnt = 0; > + > + /* > + * Perform early check, if free area is empty there is > + * nothing to process so we can skip this free_list. > + */ > + if (list_empty(list)) > + return ret; > + > + spin_lock_irq(&hugetlb_lock); > + > + if (huge_page_order(h) > MAX_ORDER) > + budget = HUGEPAGE_REPORTING_CAPACITY; > + else > + budget = HUGEPAGE_REPORTING_CAPACITY * 32; Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we don't even really need budget since this should probably be pulling out no more than one hugepage at a time. > + /* loop through free list adding unreported pages to sg list */ > + list_for_each_entry_safe(page, next, list, lru) { > + /* We are going to skip over the reported pages. */ > + if (PageReported(page)) { > + if (++scan_cnt >= MAX_SCAN_NUM) { > + ret = scan_cnt; > + break; > + } > + continue; > + } > + It would probably have been better to place this set before your new set. I don't see your new set necessarily being the best use for page reporting. > + /* > + * If we fully consumed our budget then update our > + * state to indicate that we are requesting additional > + * processing and exit this list. > + */ > + if (budget < 0) { > + atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); > + next = page; > + break; > + } > + If budget is only ever going to be 1 then we probably could just look at making this the default case for any time we find a non-reported page. > + /* Attempt to pull page from list and place in scatterlist */ > + if (*offset) { > + isolate_free_huge_page(page, h, nid); > + /* Add page to scatter list */ > + --(*offset); > + sg_set_page(&sgl[*offset], page, page_len, 0); > + > + continue; > + } > + There is no point in the continue case if we only have a budget of 1. We should probably just tighten up the loop so that all it does is search until it finds the 1 page it can pull, pull it, and then return it. The scatterlist doesn't serve much purpose and could be reduced to just a single entry. > + /* > + * Make the first non-processed page in the free list > + * the new head of the free list before we release the > + * zone lock. 
> + */ > + if (&page->lru != list && !list_is_first(&page->lru, list)) > + list_rotate_to_front(&page->lru, list); > + > + /* release lock before waiting on report processing */ > + spin_unlock_irq(&hugetlb_lock); > + > + /* begin processing pages in local list */ > + ret = prdev->report(prdev, sgl, HUGEPAGE_REPORTING_CAPACITY); > + > + /* reset offset since the full list was reported */ > + *offset = HUGEPAGE_REPORTING_CAPACITY; > + > + /* update budget to reflect call to report function */ > + budget--; > + > + /* reacquire zone lock and resume processing */ > + spin_lock_irq(&hugetlb_lock); > + > + /* flush reported pages from the sg list */ > + hugepage_reporting_drain(prdev, h, sgl, > + HUGEPAGE_REPORTING_CAPACITY, !ret); > + > + /* > + * Reset next to first entry, the old next isn't valid > + * since we dropped the lock to report the pages > + */ > + next = list_first_entry(list, struct page, lru); > + > + /* exit on error */ > + if (ret) > + break; > + } > + > + /* Rotate any leftover pages to the head of the freelist */ > + if (&next->lru != list && !list_is_first(&next->lru, list)) > + list_rotate_to_front(&next->lru, list); > + > + spin_unlock_irq(&hugetlb_lock); > + > + return ret; > +} > + > +static int > +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev, > + struct scatterlist *sgl, struct hstate *h) > +{ > + unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY; > + int ret = 0, nid; > + > + for (nid = 0; nid < MAX_NUMNODES; nid++) { > + ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset); > + > + if (ret < 0) > + return ret; > + } > + > + /* report the leftover pages before going idle */ > + leftover = HUGEPAGE_REPORTING_CAPACITY - offset; > + if (leftover) { > + sgl = &sgl[offset]; > + ret = prdev->report(prdev, sgl, leftover); > + > + /* flush any remaining pages out from the last report */ > + spin_lock_irq(&hugetlb_lock); > + hugepage_reporting_drain(prdev, h, sgl, leftover, !ret); > + spin_unlock_irq(&hugetlb_lock); > + } > + > + return ret; > +} > + If HUGEPAGE_REPORTING_CAPACITY is 1 it would make more sense to rewrite this code to just optimize for a find and process a page approach rather than trying to batch pages. > +static void hugepage_reporting_process(struct work_struct *work) > +{ > + struct delayed_work *d_work = to_delayed_work(work); > + struct page_reporting_dev_info *prdev = container_of(d_work, > + struct page_reporting_dev_info, work); > + int err = 0, state = PAGE_REPORTING_ACTIVE; > + struct scatterlist *sgl; > + struct hstate *h; > + > + /* > + * Change the state to "Active" so that we can track if there is > + * anyone requests page reporting after we complete our pass. If > + * the state is not altered by the end of the pass we will switch > + * to idle and quit scheduling reporting runs. > + */ > + atomic_set(&prdev->state, state); > + > + /* allocate scatterlist to store pages being reported on */ > + sgl = kmalloc_array(HUGEPAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); > + if (!sgl) > + goto err_out; > + > + sg_init_table(sgl, HUGEPAGE_REPORTING_CAPACITY); > + > + for_each_hstate(h) { > + err = hugepage_reporting_process_hstate(prdev, sgl, h); > + if (err) > + break; > + } > + > + kfree(sgl); > +err_out: > + /* > + * If the state has reverted back to requested then there may be > + * additional pages to be processed. We will defer for 2s to allow > + * more pages to accumulate. 
> + */ > + state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); > + if (state == PAGE_REPORTING_REQUESTED) > + schedule_delayed_work(&prdev->work, prdev->delay_jiffies); > +} > + > /* > * The page reporting cycle consists of 4 stages, fill, report, drain, and > * idle. We will cycle through the first 3 stages until we cannot obtain a > @@ -341,6 +571,9 @@ static void page_reporting_process(struct work_struct *work) > static DEFINE_MUTEX(page_reporting_mutex); > DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); > > +static DEFINE_MUTEX(hugepage_reporting_mutex); > +DEFINE_STATIC_KEY_FALSE(hugepage_reporting_enabled); > + > int page_reporting_register(struct page_reporting_dev_info *prdev) > { > int err = 0; > @@ -395,3 +628,57 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev) > mutex_unlock(&page_reporting_mutex); > } > EXPORT_SYMBOL_GPL(page_reporting_unregister); > + > +int hugepage_reporting_register(struct page_reporting_dev_info *prdev) > +{ > + int err = 0; > + > + mutex_lock(&hugepage_reporting_mutex); > + > + /* nothing to do if already in use */ > + if (rcu_access_pointer(hgpr_dev_info)) { > + err = -EBUSY; > + goto err_out; > + } > + > + /* initialize state and work structures */ > + atomic_set(&prdev->state, PAGE_REPORTING_IDLE); > + INIT_DELAYED_WORK(&prdev->work, &hugepage_reporting_process); > + > + /* Begin initial flush of zones */ > + __page_reporting_request(prdev); > + > + /* Assign device to allow notifications */ > + rcu_assign_pointer(hgpr_dev_info, prdev); > + > + hugepage_report_mini_order = prdev->mini_order; > + hugepage_report_batch_size = prdev->batch_size; > + > + /* enable hugepage reporting notification */ > + if (!static_key_enabled(&hugepage_reporting_enabled)) { > + static_branch_enable(&hugepage_reporting_enabled); > + pr_info("Free hugepage reporting enabled\n"); > + } > +err_out: > + mutex_unlock(&hugepage_reporting_mutex); > + > + return err; > +} > +EXPORT_SYMBOL_GPL(hugepage_reporting_register); > + > +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev) > +{ > + mutex_lock(&hugepage_reporting_mutex); > + > + if (rcu_access_pointer(hgpr_dev_info) == prdev) { > + /* Disable page reporting notification */ > + RCU_INIT_POINTER(hgpr_dev_info, NULL); > + synchronize_rcu(); > + > + /* Flush any existing work, and lock it out */ > + cancel_delayed_work_sync(&prdev->work); > + } > + > + mutex_unlock(&hugepage_reporting_mutex); > +} > +EXPORT_SYMBOL_GPL(hugepage_reporting_unregister); > diff --git a/mm/page_reporting.h b/mm/page_reporting.h > index 86ac6ffad970..271c64c3c3cb 100644 > --- a/mm/page_reporting.h > +++ b/mm/page_reporting.h > @@ -18,12 +18,24 @@ extern unsigned long page_report_batch_size; > DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); > void __page_reporting_notify(void); > > +extern int hugepage_report_mini_order; > +extern unsigned long hugepage_report_batch_size; > + > +DECLARE_STATIC_KEY_FALSE(hugepage_reporting_enabled); > +void __hugepage_reporting_notify(void); > + > static inline bool page_reported(struct page *page) > { > return static_branch_unlikely(&page_reporting_enabled) && > PageReported(page); > } > > +static inline bool hugepage_reported(struct page *page) > +{ > + return static_branch_unlikely(&hugepage_reporting_enabled) && > + PageReported(page); > +} > + > /** > * page_reporting_notify_free - Free page notification to start page processing > * > @@ -52,11 +64,33 @@ static inline void page_reporting_notify_free(unsigned int order) > __page_reporting_notify(); > } > 
} > + > +static inline void hugepage_reporting_notify_free(unsigned int order) > +{ > + static long batch_size = 0; > + > + if (!static_branch_unlikely(&hugepage_reporting_enabled)) > + return; > + > + /* Determine if we have crossed reporting threshold */ > + if (order < hugepage_report_mini_order) > + return; > + > + batch_size += (1 << order) << PAGE_SHIFT; > + if (batch_size >= hugepage_report_batch_size) { > + batch_size = 0; > + __hugepage_reporting_notify(); > + } > +} > #else /* CONFIG_PAGE_REPORTING */ > #define page_reported(_page) false > > static inline void page_reporting_notify_free(unsigned int order) > { > } > + > +static inline void hugepage_reporting_notify_free(unsigned int order) > +{ > +} > #endif /* CONFIG_PAGE_REPORTING */ > #endif /*_MM_PAGE_REPORTING_H */ > -- > 2.18.2 > >
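To make the "find one page and process it" idea a bit more concrete, here is a rough sketch of how the cycle could look once the budget, the offset tracking and the multi-entry scatterlist are dropped. This is only my guess at the shape of it and is not even compile tested; it reuses the helpers from your patch (isolate_free_huge_page(), putback_isolate_huge_page(), prdev->report()), and the "return 1 when we reported something" convention is just there so the caller knows whether to keep calling:

static int
hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
                         struct hstate *h, unsigned int nid,
                         struct scatterlist *sgl)
{
        struct list_head *list = &h->hugepage_freelists[nid];
        unsigned int page_len = PAGE_SIZE << h->order;
        struct page *page;
        int ret;

        spin_lock_irq(&hugetlb_lock);

        /* find the first hugepage in the free list we haven't reported yet */
        list_for_each_entry(page, list, lru) {
                if (!PageReported(page))
                        goto found;
        }

        /* nothing left to report on this node */
        spin_unlock_irq(&hugetlb_lock);
        return 0;

found:
        /* pull the page off the free list and drop the lock to report it */
        isolate_free_huge_page(page, h, nid);
        spin_unlock_irq(&hugetlb_lock);

        sg_init_table(sgl, 1);
        sg_set_page(sgl, page, page_len, 0);

        ret = prdev->report(prdev, sgl, 1);

        /* put the page back, flagging it as reported if the report succeeded */
        spin_lock_irq(&hugetlb_lock);
        putback_isolate_huge_page(h, page);
        if (!ret) {
                __SetPageReported(page);
                ret = 1;        /* tell the caller we did some work */
        }
        spin_unlock_irq(&hugetlb_lock);

        return ret;
}

One thing this glosses over is that reported pages pile up at the head of the free list and get walked over again on every call, so you would probably still want your MAX_SCAN_NUM style cap, or to rotate reported pages toward the tail.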
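With the cycle reduced to one page per call, hugepage_reporting_process_hstate() would not need the offset/leftover handling either. Again, just an untested sketch built on the reworked cycle above:

static int
hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
                                  struct scatterlist *sgl, struct hstate *h)
{
        int ret = 0, nid;

        for (nid = 0; nid < MAX_NUMNODES; nid++) {
                /* keep pulling one hugepage at a time until the node is clean */
                do {
                        ret = hugepage_reporting_cycle(prdev, h, nid, sgl);
                } while (ret > 0);

                if (ret < 0)
                        return ret;
        }

        return ret;
}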