Re: [RFC PATCH 1/3] mm: support hugetlb free page reporting

Alexander Duyck <alexander.duyck@xxxxxxxxx> · Tue, 22 Dec 2020 11:59:15 -0800

On Mon, Dec 21, 2020 at 11:47 PM Liang Li <liliang.opensource@xxxxxxxxx> wrote:
>
> Free page reporting only supports buddy pages, it can't report the
> free pages reserved for hugetlbfs case. On the other hand, hugetlbfs
> is a good choice for a system with a huge amount of RAM, because it
> can help to reduce the memory management overhead and improve system
> performance.
> This patch add the support for reporting hugepages in the free list
> of hugetlb, it canbe used by virtio_balloon driver for memory
> overcommit and pre zero out free pages for speeding up memory population.
>
> Cc: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx>
> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
> Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
> Cc: David Hildenbrand <david@xxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Alex Williamson <alex.williamson@xxxxxxxxxx>
> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
> Cc: Jason Wang <jasowang@xxxxxxxxxx>
> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: Liang Li <liliang324@xxxxxxxxx>
> Signed-off-by: Liang Li <liliangleo@xxxxxxxxxxxxxx>
> ---
>  include/linux/hugetlb.h        |   3 +
>  include/linux/page_reporting.h |   5 +
>  mm/hugetlb.c                   |  29 ++++
>  mm/page_reporting.c            | 287 +++++++++++++++++++++++++++++++++
>  mm/page_reporting.h            |  34 ++++
>  5 files changed, 358 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ebca2ef02212..a72ad25501d3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -11,6 +11,7 @@
>  #include <linux/kref.h>
>  #include <linux/pgtable.h>
>  #include <linux/gfp.h>
> +#include <linux/page_reporting.h>
>
>  struct ctl_table;
>  struct user_struct;
> @@ -114,6 +115,8 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
>  int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
>                 loff_t *);
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid);
> +void putback_isolate_huge_page(struct hstate *h, struct page *page);
>  int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
>  long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>                          struct page **, struct vm_area_struct **,
> diff --git a/include/linux/page_reporting.h b/include/linux/page_reporting.h
> index 63e1e9fbcaa2..0da3d1a6f0cc 100644
> --- a/include/linux/page_reporting.h
> +++ b/include/linux/page_reporting.h
> @@ -7,6 +7,7 @@
>
>  /* This value should always be a power of 2, see page_reporting_cycle() */
>  #define PAGE_REPORTING_CAPACITY                32
> +#define HUGEPAGE_REPORTING_CAPACITY    1
>
>  struct page_reporting_dev_info {
>         /* function that alters pages to make them "reported" */
> @@ -26,4 +27,8 @@ struct page_reporting_dev_info {
>  /* Tear-down and bring-up for page reporting devices */
>  void page_reporting_unregister(struct page_reporting_dev_info *prdev);
>  int page_reporting_register(struct page_reporting_dev_info *prdev);
> +
> +/* Tear-down and bring-up for hugepage reporting devices */
> +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev);
> +int hugepage_reporting_register(struct page_reporting_dev_info *prdev);
>  #endif /*_LINUX_PAGE_REPORTING_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index cbf32d2824fd..de6ce147dfe2 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -41,6 +41,7 @@
>  #include <linux/node.h>
>  #include <linux/userfaultfd_k.h>
>  #include <linux/page_owner.h>
> +#include "page_reporting.h"
>  #include "internal.h"
>
>  int hugetlb_max_hstate __read_mostly;
> @@ -1028,6 +1029,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
>         list_move(&page->lru, &h->hugepage_freelists[nid]);
>         h->free_huge_pages++;
>         h->free_huge_pages_node[nid]++;
> +       if (hugepage_reported(page)) {
> +               __ClearPageReported(page);
> +               pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
> +       }
> +       hugepage_reporting_notify_free(h->order);
>  }
>
>  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
> @@ -5531,6 +5537,29 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
>         return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
>  }
>
> +bool isolate_free_huge_page(struct page *page, struct hstate *h, int nid)
> +{
> +       bool ret = true;
> +
> +       VM_BUG_ON_PAGE(!PageHead(page), page);
> +
> +       list_move(&page->lru, &h->hugepage_activelist);
> +       set_page_refcounted(page);
> +       h->free_huge_pages--;
> +       h->free_huge_pages_node[nid]--;
> +
> +       return ret;
> +}
> +
> +void putback_isolate_huge_page(struct hstate *h, struct page *page)
> +{
> +       int nid = page_to_nid(page);
> +       pr_info("%s, free_huge_pages=%ld\n", __func__, h->free_huge_pages);
> +       list_move(&page->lru, &h->hugepage_freelists[nid]);
> +       h->free_huge_pages++;
> +       h->free_huge_pages_node[nid]++;
> +}
> +
>  bool isolate_huge_page(struct page *page, struct list_head *list)
>  {
>         bool ret = true;
> diff --git a/mm/page_reporting.c b/mm/page_reporting.c
> index 20ec3fb1afc4..15d4b5372df8 100644
> --- a/mm/page_reporting.c
> +++ b/mm/page_reporting.c
> @@ -7,6 +7,7 @@
>  #include <linux/delay.h>
>  #include <linux/scatterlist.h>
>  #include <linux/sched.h>
> +#include <linux/hugetlb.h>
>
>  #include "page_reporting.h"
>  #include "internal.h"
> @@ -16,6 +17,10 @@ static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;
>  int page_report_mini_order = pageblock_order;
>  unsigned long page_report_batch_size = 32 * 1024 * 1024;
>
> +static struct page_reporting_dev_info __rcu *hgpr_dev_info __read_mostly;
> +int hugepage_report_mini_order = pageblock_order;
> +unsigned long hugepage_report_batch_size = 64 * 1024 * 1024;
> +
>  enum {
>         PAGE_REPORTING_IDLE = 0,
>         PAGE_REPORTING_REQUESTED,
> @@ -67,6 +72,24 @@ void __page_reporting_notify(void)
>         rcu_read_unlock();
>  }
>
> +/* notify prdev of free hugepage reporting request */
> +void __hugepage_reporting_notify(void)
> +{
> +       struct page_reporting_dev_info *prdev;
> +
> +       /*
> +        * We use RCU to protect the pr_dev_info pointer. In almost all
> +        * cases this should be present, however in the unlikely case of
> +        * a shutdown this will be NULL and we should exit.
> +        */
> +       rcu_read_lock();
> +       prdev = rcu_dereference(hgpr_dev_info);
> +       if (likely(prdev))
> +               __page_reporting_request(prdev);
> +
> +       rcu_read_unlock();
> +}
> +
>  static void
>  page_reporting_drain(struct page_reporting_dev_info *prdev,
>                      struct scatterlist *sgl, unsigned int nents, bool reported)
> @@ -103,6 +126,213 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
>         sg_init_table(sgl, nents);
>  }
>
> +static void
> +hugepage_reporting_drain(struct page_reporting_dev_info *prdev,
> +                        struct hstate *h, struct scatterlist *sgl,
> +                        unsigned int nents, bool reported)
> +{
> +       struct scatterlist *sg = sgl;
> +
> +       /*
> +        * Drain the now reported pages back into their respective
> +        * free lists/areas. We assume at least one page is populated.
> +        */
> +       do {
> +               struct page *page = sg_page(sg);
> +
> +               putback_isolate_huge_page(h, page);
> +
> +               /* If the pages were not reported due to error skip flagging */
> +               if (!reported)
> +                       continue;
> +
> +               __SetPageReported(page);
> +       } while ((sg = sg_next(sg)));
> +
> +       /* reinitialize scatterlist now that it is empty */
> +       sg_init_table(sgl, nents);
> +}
> +
> +/*
> + * The page reporting cycle consists of 4 stages, fill, report, drain, and
> + * idle. We will cycle through the first 3 stages until we cannot obtain a
> + * full scatterlist of pages, in that case we will switch to idle.
> + */
> +static int
> +hugepage_reporting_cycle(struct page_reporting_dev_info *prdev,
> +                        struct hstate *h, unsigned int nid,
> +                        struct scatterlist *sgl, unsigned int *offset)
> +{
> +       struct list_head *list = &h->hugepage_freelists[nid];
> +       unsigned int page_len = PAGE_SIZE << h->order;
> +       struct page *page, *next;
> +       long budget;
> +       int ret = 0, scan_cnt = 0;
> +
> +       /*
> +        * Perform early check, if free area is empty there is
> +        * nothing to process so we can skip this free_list.
> +        */
> +       if (list_empty(list))
> +               return ret;
> +
> +       spin_lock_irq(&hugetlb_lock);
> +
> +       if (huge_page_order(h) > MAX_ORDER)
> +               budget = HUGEPAGE_REPORTING_CAPACITY;
> +       else
> +               budget = HUGEPAGE_REPORTING_CAPACITY * 32;

Wouldn't huge_page_order always be more than MAX_ORDER? Seems like we
don't even really need budget since this should probably be pulling
out no more than one hugepage at a time.

> +       /* loop through free list adding unreported pages to sg list */
> +       list_for_each_entry_safe(page, next, list, lru) {
> +               /* We are going to skip over the reported pages. */
> +               if (PageReported(page)) {
> +                       if (++scan_cnt >= MAX_SCAN_NUM) {
> +                               ret = scan_cnt;
> +                               break;
> +                       }
> +                       continue;
> +               }
> +

It would probably have been better to place this set before your new
set. I don't see your new set necessarily being the best use for page
reporting.

> +               /*
> +                * If we fully consumed our budget then update our
> +                * state to indicate that we are requesting additional
> +                * processing and exit this list.
> +                */
> +               if (budget < 0) {
> +                       atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
> +                       next = page;
> +                       break;
> +               }
> +

If budget is only ever going to be 1 then we probably could just look
at making this the default case for any time we find a non-reported
page.

> +               /* Attempt to pull page from list and place in scatterlist */
> +               if (*offset) {
> +                       isolate_free_huge_page(page, h, nid);
> +                       /* Add page to scatter list */
> +                       --(*offset);
> +                       sg_set_page(&sgl[*offset], page, page_len, 0);
> +
> +                       continue;
> +               }
> +

There is no point in the continue case if we only have a budget of 1.
We should probably just tighten up the loop so that all it does is
search until it finds the 1 page it can pull, pull it, and then return
it. The scatterlist doesn't serve much purpose and could be reduced to
just a single entry.

> +               /*
> +                * Make the first non-processed page in the free list
> +                * the new head of the free list before we release the
> +                * zone lock.
> +                */
> +               if (&page->lru != list && !list_is_first(&page->lru, list))
> +                       list_rotate_to_front(&page->lru, list);
> +
> +               /* release lock before waiting on report processing */
> +               spin_unlock_irq(&hugetlb_lock);
> +
> +               /* begin processing pages in local list */
> +               ret = prdev->report(prdev, sgl, HUGEPAGE_REPORTING_CAPACITY);
> +
> +               /* reset offset since the full list was reported */
> +               *offset = HUGEPAGE_REPORTING_CAPACITY;
> +
> +               /* update budget to reflect call to report function */
> +               budget--;
> +
> +               /* reacquire zone lock and resume processing */
> +               spin_lock_irq(&hugetlb_lock);
> +
> +               /* flush reported pages from the sg list */
> +               hugepage_reporting_drain(prdev, h, sgl,
> +                                        HUGEPAGE_REPORTING_CAPACITY, !ret);
> +
> +               /*
> +                * Reset next to first entry, the old next isn't valid
> +                * since we dropped the lock to report the pages
> +                */
> +               next = list_first_entry(list, struct page, lru);
> +
> +               /* exit on error */
> +               if (ret)
> +                       break;
> +       }
> +
> +       /* Rotate any leftover pages to the head of the freelist */
> +       if (&next->lru != list && !list_is_first(&next->lru, list))
> +               list_rotate_to_front(&next->lru, list);
> +
> +       spin_unlock_irq(&hugetlb_lock);
> +
> +       return ret;
> +}
> +
> +static int
> +hugepage_reporting_process_hstate(struct page_reporting_dev_info *prdev,
> +                           struct scatterlist *sgl, struct hstate *h)
> +{
> +       unsigned int leftover, offset = HUGEPAGE_REPORTING_CAPACITY;
> +       int ret = 0, nid;
> +
> +       for (nid = 0; nid < MAX_NUMNODES; nid++) {
> +               ret = hugepage_reporting_cycle(prdev, h, nid, sgl, &offset);
> +
> +               if (ret < 0)
> +                       return ret;
> +       }
> +
> +       /* report the leftover pages before going idle */
> +       leftover = HUGEPAGE_REPORTING_CAPACITY - offset;
> +       if (leftover) {
> +               sgl = &sgl[offset];
> +               ret = prdev->report(prdev, sgl, leftover);
> +
> +               /* flush any remaining pages out from the last report */
> +               spin_lock_irq(&hugetlb_lock);
> +               hugepage_reporting_drain(prdev, h, sgl, leftover, !ret);
> +               spin_unlock_irq(&hugetlb_lock);
> +       }
> +
> +       return ret;
> +}
> +

If HUGEPAGE_REPORTING_CAPACITY is 1 it would make more sense to
rewrite this code to just optimize for a find and process a page
approach rather than trying to batch pages.

> +static void hugepage_reporting_process(struct work_struct *work)
> +{
> +       struct delayed_work *d_work = to_delayed_work(work);
> +       struct page_reporting_dev_info *prdev = container_of(d_work,
> +                                       struct page_reporting_dev_info, work);
> +       int err = 0, state = PAGE_REPORTING_ACTIVE;
> +       struct scatterlist *sgl;
> +       struct hstate *h;
> +
> +       /*
> +        * Change the state to "Active" so that we can track if there is
> +        * anyone requests page reporting after we complete our pass. If
> +        * the state is not altered by the end of the pass we will switch
> +        * to idle and quit scheduling reporting runs.
> +        */
> +       atomic_set(&prdev->state, state);
> +
> +       /* allocate scatterlist to store pages being reported on */
> +       sgl = kmalloc_array(HUGEPAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
> +       if (!sgl)
> +               goto err_out;
> +
> +       sg_init_table(sgl, HUGEPAGE_REPORTING_CAPACITY);
> +
> +       for_each_hstate(h) {
> +               err = hugepage_reporting_process_hstate(prdev, sgl, h);
> +               if (err)
> +                       break;
> +       }
> +
> +       kfree(sgl);
> +err_out:
> +       /*
> +        * If the state has reverted back to requested then there may be
> +        * additional pages to be processed. We will defer for 2s to allow
> +        * more pages to accumulate.
> +        */
> +       state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
> +       if (state == PAGE_REPORTING_REQUESTED)
> +               schedule_delayed_work(&prdev->work, prdev->delay_jiffies);
> +}
> +
>  /*
>   * The page reporting cycle consists of 4 stages, fill, report, drain, and
>   * idle. We will cycle through the first 3 stages until we cannot obtain a
> @@ -341,6 +571,9 @@ static void page_reporting_process(struct work_struct *work)
>  static DEFINE_MUTEX(page_reporting_mutex);
>  DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);
>
> +static DEFINE_MUTEX(hugepage_reporting_mutex);
> +DEFINE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
> +
>  int page_reporting_register(struct page_reporting_dev_info *prdev)
>  {
>         int err = 0;
> @@ -395,3 +628,57 @@ void page_reporting_unregister(struct page_reporting_dev_info *prdev)
>         mutex_unlock(&page_reporting_mutex);
>  }
>  EXPORT_SYMBOL_GPL(page_reporting_unregister);
> +
> +int hugepage_reporting_register(struct page_reporting_dev_info *prdev)
> +{
> +       int err = 0;
> +
> +       mutex_lock(&hugepage_reporting_mutex);
> +
> +       /* nothing to do if already in use */
> +       if (rcu_access_pointer(hgpr_dev_info)) {
> +               err = -EBUSY;
> +               goto err_out;
> +       }
> +
> +       /* initialize state and work structures */
> +       atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
> +       INIT_DELAYED_WORK(&prdev->work, &hugepage_reporting_process);
> +
> +       /* Begin initial flush of zones */
> +       __page_reporting_request(prdev);
> +
> +       /* Assign device to allow notifications */
> +       rcu_assign_pointer(hgpr_dev_info, prdev);
> +
> +       hugepage_report_mini_order = prdev->mini_order;
> +       hugepage_report_batch_size = prdev->batch_size;
> +
> +       /* enable hugepage reporting notification */
> +       if (!static_key_enabled(&hugepage_reporting_enabled)) {
> +               static_branch_enable(&hugepage_reporting_enabled);
> +               pr_info("Free hugepage reporting enabled\n");
> +       }
> +err_out:
> +       mutex_unlock(&hugepage_reporting_mutex);
> +
> +       return err;
> +}
> +EXPORT_SYMBOL_GPL(hugepage_reporting_register);
> +
> +void hugepage_reporting_unregister(struct page_reporting_dev_info *prdev)
> +{
> +       mutex_lock(&hugepage_reporting_mutex);
> +
> +       if (rcu_access_pointer(hgpr_dev_info) == prdev) {
> +               /* Disable page reporting notification */
> +               RCU_INIT_POINTER(hgpr_dev_info, NULL);
> +               synchronize_rcu();
> +
> +               /* Flush any existing work, and lock it out */
> +               cancel_delayed_work_sync(&prdev->work);
> +       }
> +
> +       mutex_unlock(&hugepage_reporting_mutex);
> +}
> +EXPORT_SYMBOL_GPL(hugepage_reporting_unregister);
> diff --git a/mm/page_reporting.h b/mm/page_reporting.h
> index 86ac6ffad970..271c64c3c3cb 100644
> --- a/mm/page_reporting.h
> +++ b/mm/page_reporting.h
> @@ -18,12 +18,24 @@ extern unsigned long page_report_batch_size;
>  DECLARE_STATIC_KEY_FALSE(page_reporting_enabled);
>  void __page_reporting_notify(void);
>
> +extern int hugepage_report_mini_order;
> +extern unsigned long hugepage_report_batch_size;
> +
> +DECLARE_STATIC_KEY_FALSE(hugepage_reporting_enabled);
> +void __hugepage_reporting_notify(void);
> +
>  static inline bool page_reported(struct page *page)
>  {
>         return static_branch_unlikely(&page_reporting_enabled) &&
>                PageReported(page);
>  }
>
> +static inline bool hugepage_reported(struct page *page)
> +{
> +       return static_branch_unlikely(&hugepage_reporting_enabled) &&
> +              PageReported(page);
> +}
> +
>  /**
>   * page_reporting_notify_free - Free page notification to start page processing
>   *
> @@ -52,11 +64,33 @@ static inline void page_reporting_notify_free(unsigned int order)
>                 __page_reporting_notify();
>         }
>  }
> +
> +static inline void hugepage_reporting_notify_free(unsigned int order)
> +{
> +       static long batch_size = 0;
> +
> +       if (!static_branch_unlikely(&hugepage_reporting_enabled))
> +               return;
> +
> +       /* Determine if we have crossed reporting threshold */
> +       if (order < hugepage_report_mini_order)
> +               return;
> +
> +       batch_size += (1 << order) << PAGE_SHIFT;
> +       if (batch_size >= hugepage_report_batch_size) {
> +               batch_size = 0;
> +               __hugepage_reporting_notify();
> +       }
> +}
>  #else /* CONFIG_PAGE_REPORTING */
>  #define page_reported(_page)   false
>
>  static inline void page_reporting_notify_free(unsigned int order)
>  {
>  }
> +
> +static inline void hugepage_reporting_notify_free(unsigned int order)
> +{
> +}
>  #endif /* CONFIG_PAGE_REPORTING */
>  #endif /*_MM_PAGE_REPORTING_H */
> --
> 2.18.2
>
>