Patch "mm/hwpoison: mf_mutex for soft offline and unpoison" has been added to the 5.15-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    mm/hwpoison: mf_mutex for soft offline and unpoison

to the 5.15-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     mm-hwpoison-mf_mutex-for-soft-offline-and-unpoison.patch
and it can be found in the queue-5.15 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 37d6e57e30a76da61bd50ea469d272175c5ea977
Author: Naoya Horiguchi <naoya.horiguchi@xxxxxxx>
Date:   Fri Jan 14 14:09:02 2022 -0800

    mm/hwpoison: mf_mutex for soft offline and unpoison
    
    [ Upstream commit 91d005479e06392617bacc114509d611b705eaac ]
    
    Patch series "mm/hwpoison: fix unpoison_memory()", v4.
    
    The main purpose of this series is to sync unpoison code to recent
    changes around how hwpoison code takes page refcount.  Unpoison should
    work or simply fail (without crash) if impossible.
    
    The recent works of keeping hwpoison pages in shmem pagecache introduce
    a new state of hwpoisoned pages, but unpoison for such pages is not
    supported yet with this series.
    
    It seems that soft-offline and unpoison can be used as general purpose
    page offline/online mechanism (not in the context of memory error).  I
    think that we need some additional works to realize it because currently
    soft-offline and unpoison are assumed not to happen so frequently (print
    out too many messages for aggressive usecases).  But anyway this could
    be another interesting next topic.
    
    v1: https://lore.kernel.org/linux-mm/20210614021212.223326-1-nao.horiguchi@xxxxxxxxx/
    v2: https://lore.kernel.org/linux-mm/20211025230503.2650970-1-naoya.horiguchi@xxxxxxxxx/
    v3: https://lore.kernel.org/linux-mm/20211105055058.3152564-1-naoya.horiguchi@xxxxxxxxx/
    
    This patch (of 3):
    
    Originally mf_mutex is introduced to serialize multiple MCE events, but
    it is not that useful to allow unpoison to run in parallel with
    memory_failure() and soft offline.  So apply mf_mutex to soft offline
    and unpoison.  The memory failure handler and soft offline handler get
    simpler with this.
    
    Link: https://lkml.kernel.org/r/20211115084006.3728254-1-naoya.horiguchi@xxxxxxxxx
    Link: https://lkml.kernel.org/r/20211115084006.3728254-2-naoya.horiguchi@xxxxxxxxx
    Signed-off-by: Naoya Horiguchi <naoya.horiguchi@xxxxxxx>
    Reviewed-by: Yang Shi <shy828301@xxxxxxxxx>
    Cc: "Aneesh Kumar K.V" <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
    Cc: David Hildenbrand <david@xxxxxxxxxx>
    Cc: Ding Hui <dinghui@xxxxxxxxxxxxxx>
    Cc: Miaohe Lin <linmiaohe@xxxxxxxxxx>
    Cc: Michal Hocko <mhocko@xxxxxxxx>
    Cc: Oscar Salvador <osalvador@xxxxxxx>
    Cc: Peter Xu <peterx@xxxxxxxxxx>
    Cc: Tony Luck <tony.luck@xxxxxxxxx>
    Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
    Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c3ceb7436933..e6425d959fa9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1463,14 +1463,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 	lock_page(head);
 	page_flags = head->flags;
 
-	if (!PageHWPoison(head)) {
-		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
-		num_poisoned_pages_dec();
-		unlock_page(head);
-		put_page(head);
-		return 0;
-	}
-
 	/*
 	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
 	 * simply disable it. In order to make it work properly, we need
@@ -1584,6 +1576,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	return rc;
 }
 
+static DEFINE_MUTEX(mf_mutex);
+
 /**
  * memory_failure - Handle memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -1610,7 +1604,6 @@ int memory_failure(unsigned long pfn, int flags)
 	int res = 0;
 	unsigned long page_flags;
 	bool retry = true;
-	static DEFINE_MUTEX(mf_mutex);
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure on page %lx", pfn);
@@ -1744,16 +1737,6 @@ int memory_failure(unsigned long pfn, int flags)
 	 */
 	page_flags = p->flags;
 
-	/*
-	 * unpoison always clear PG_hwpoison inside page lock
-	 */
-	if (!PageHWPoison(p)) {
-		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
-		num_poisoned_pages_dec();
-		unlock_page(p);
-		put_page(p);
-		goto unlock_mutex;
-	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
 			num_poisoned_pages_dec();
@@ -1934,6 +1917,7 @@ int unpoison_memory(unsigned long pfn)
 	struct page *page;
 	struct page *p;
 	int freeit = 0;
+	int ret = 0;
 	unsigned long flags = 0;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
 					DEFAULT_RATELIMIT_BURST);
@@ -1944,39 +1928,30 @@ int unpoison_memory(unsigned long pfn)
 	p = pfn_to_page(pfn);
 	page = compound_head(p);
 
+	mutex_lock(&mf_mutex);
+
 	if (!PageHWPoison(p)) {
 		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_count(page) > 1) {
 		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_mapped(page)) {
 		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (page_mapping(page)) {
 		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
-	}
-
-	/*
-	 * unpoison_memory() can encounter thp only when the thp is being
-	 * worked by memory_failure() and the page lock is not held yet.
-	 * In such case, we yield to memory_failure() and make unpoison fail.
-	 */
-	if (!PageHuge(page) && PageTransHuge(page)) {
-		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
-				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
 	if (!get_hwpoison_page(p, flags)) {
@@ -1984,29 +1959,23 @@ int unpoison_memory(unsigned long pfn)
 			num_poisoned_pages_dec();
 		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
 				 pfn, &unpoison_rs);
-		return 0;
+		goto unlock_mutex;
 	}
 
-	lock_page(page);
-	/*
-	 * This test is racy because PG_hwpoison is set outside of page lock.
-	 * That's acceptable because that won't trigger kernel panic. Instead,
-	 * the PG_hwpoison page will be caught and isolated on the entrance to
-	 * the free buddy page pool.
-	 */
 	if (TestClearPageHWPoison(page)) {
 		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 pfn, &unpoison_rs);
 		num_poisoned_pages_dec();
 		freeit = 1;
 	}
-	unlock_page(page);
 
 	put_page(page);
 	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
 		put_page(page);
 
-	return 0;
+unlock_mutex:
+	mutex_unlock(&mf_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(unpoison_memory);
 
@@ -2187,9 +2156,12 @@ int soft_offline_page(unsigned long pfn, int flags)
 		return -EIO;
 	}
 
+	mutex_lock(&mf_mutex);
+
 	if (PageHWPoison(page)) {
 		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
 		put_ref_page(ref_page);
+		mutex_unlock(&mf_mutex);
 		return 0;
 	}
 
@@ -2208,5 +2180,7 @@ int soft_offline_page(unsigned long pfn, int flags)
 		}
 	}
 
+	mutex_unlock(&mf_mutex);
+
 	return ret;
 }



[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux