Re: [PATCH v1 1/3] mm/memory-failure: userspace controls soft-offlining hugetlb pages

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



+CC Jane.

On Fri, May 31, 2024 at 2:34 PM Jiaqi Yan <jiaqiyan@xxxxxxxxxx> wrote:
>
> Correctable memory errors are very common on servers with large
> amounts of memory, and are corrected by ECC. Soft offline is the kernel's
> additional recovery handling for memory pages having (excessive)
> corrected memory errors. Impacted page is migrated to a healthy page
> if mapped/inuse; the original page is discarded for any future use.
>
> The actual policy on whether (and when) to soft offline should be
> maintained by userspace, especially in case of HugeTLB hugepages.
> Soft-offline dissolves a hugepage, either in-use or free, into
> chunks of 4K pages, reducing HugeTLB pool capacity by 1 hugepage.
> If userspace has not acknowledged such behavior, it may be surprised
> when a later mmap of hugepages returns MAP_FAILED due to lack of hugepages.
> In addition, discarding an entire 1G memory page only because of
> corrected memory errors sounds very costly, and the kernel had better
> not do it under the hood. But today there are at least 2 such cases:
> 1. GHES driver sees both GHES_SEV_CORRECTED and
>    CPER_SEC_ERROR_THRESHOLD_EXCEEDED after parsing CPER.
> 2. RAS Correctable Errors Collector counts correctable errors per
>    PFN and soft offlines the page when the counter for a PFN reaches
>    the threshold.
> In both cases, userspace has no control of the soft offline performed
> by kernel's memory failure recovery.
>
> This commit gives userspace the control of soft-offlining HugeTLB
> pages: the kernel only soft offlines a hugepage if userspace has opted
> in for that specific hugepage size. The interface to userspace is a
> new sysfs entry called softoffline_corrected_errors under the
> /sys/kernel/mm/hugepages/hugepages-${size}kB directory:
> * When softoffline_corrected_errors=0, skip soft offlining for all
>   hugepages of size ${size}kB.
> * When softoffline_corrected_errors=1, soft offline as before this
>   patch series.
>
> So the granularity of the control is per hugepage size, and is kept
> in corresponding hstate. By default softoffline_corrected_errors is
> 1 to preserve existing behavior in kernel.
>
> Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
> ---
>  include/linux/hugetlb.h | 17 +++++++++++++++++
>  mm/hugetlb.c            | 34 ++++++++++++++++++++++++++++++++++
>  mm/memory-failure.c     |  7 +++++++
>  3 files changed, 58 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 2b3c3a404769..55f9e9593cce 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -685,6 +685,7 @@ struct hstate {
>         int next_nid_to_free;
>         unsigned int order;
>         unsigned int demote_order;
> +       unsigned int softoffline_corrected_errors;
>         unsigned long mask;
>         unsigned long max_huge_pages;
>         unsigned long nr_huge_pages;
> @@ -1029,6 +1030,16 @@ void hugetlb_unregister_node(struct node *node);
>   */
>  bool is_raw_hwpoison_page_in_hugepage(struct page *page);
>
> +/*
> + * For certain hugepage size, when a hugepage has corrected memory error(s):
> + * - Return 0 if userspace wants to disable soft offlining the hugepage.
> + * - Return > 0 if userspace allows soft offlining the hugepage.
> + */
> +static inline int hugetlb_softoffline_corrected_errors(struct folio *folio)
> +{
> +       return folio_hstate(folio)->softoffline_corrected_errors;
> +}
> +
>  #else  /* CONFIG_HUGETLB_PAGE */
>  struct hstate {};
>
> @@ -1226,6 +1237,12 @@ static inline bool hugetlbfs_pagecache_present(
>  {
>         return false;
>  }
> +
> +static inline int hugetlb_softoffline_corrected_errors(struct folio *folio)
> +{
> +       return 1;
> +}
> +
>  #endif /* CONFIG_HUGETLB_PAGE */
>
>  static inline spinlock_t *huge_pte_lock(struct hstate *h,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6be78e7d4f6e..a184e28ce592 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4325,6 +4325,38 @@ static ssize_t demote_size_store(struct kobject *kobj,
>  }
>  HSTATE_ATTR(demote_size);
>
> +static ssize_t softoffline_corrected_errors_show(struct kobject *kobj,
> +                                                struct kobj_attribute *attr,
> +                                                char *buf)
> +{
> +       struct hstate *h = kobj_to_hstate(kobj, NULL);
> +
> +       return sysfs_emit(buf, "%d\n", h->softoffline_corrected_errors);
> +}
> +
> +static ssize_t softoffline_corrected_errors_store(struct kobject *kobj,
> +                                                 struct kobj_attribute *attr,
> +                                                 const char *buf,
> +                                                 size_t count)
> +{
> +       int err;
> +       unsigned long input;
> +       struct hstate *h = kobj_to_hstate(kobj, NULL);
> +
> +       err = kstrtoul(buf, 10, &input);
> +       if (err)
> +               return err;
> +
> +       /* softoffline_corrected_errors is either 0 or 1. */
> +       if (input > 1)
> +               return -EINVAL;
> +
> +       h->softoffline_corrected_errors = input;
> +
> +       return count;
> +}
> +HSTATE_ATTR(softoffline_corrected_errors);
> +
>  static struct attribute *hstate_attrs[] = {
>         &nr_hugepages_attr.attr,
>         &nr_overcommit_hugepages_attr.attr,
> @@ -4334,6 +4366,7 @@ static struct attribute *hstate_attrs[] = {
>  #ifdef CONFIG_NUMA
>         &nr_hugepages_mempolicy_attr.attr,
>  #endif
> +       &softoffline_corrected_errors_attr.attr,
>         NULL,
>  };
>
> @@ -4655,6 +4688,7 @@ void __init hugetlb_add_hstate(unsigned int order)
>         h = &hstates[hugetlb_max_hstate++];
>         mutex_init(&h->resize_lock);
>         h->order = order;
> +       h->softoffline_corrected_errors = 1;
>         h->mask = ~(huge_page_size(h) - 1);
>         for (i = 0; i < MAX_NUMNODES; ++i)
>                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 16ada4fb02b7..7094fc4c62e2 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -2776,6 +2776,13 @@ int soft_offline_page(unsigned long pfn, int flags)
>                 return -EIO;
>         }
>
> +       if (PageHuge(page) &&
> +           !hugetlb_softoffline_corrected_errors(page_folio(page))) {
> +               pr_info("soft offline: %#lx: hugetlb page is ignored\n", pfn);
> +               put_ref_page(pfn, flags);
> +               return -EINVAL;
> +       }
> +
>         mutex_lock(&mf_mutex);
>
>         if (PageHWPoison(page)) {
> --
> 2.45.1.288.g0e0cd299f1-goog
>





[Index of Archives]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]

  Powered by Linux