On 4/11/2022 4:27 PM, Dan Williams wrote: > On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@xxxxxxxxxx> wrote: >> >> The set_memory_uc() approach doesn't work well in all cases. >> For example, when "The VMM unmapped the bad page from guest >> physical space and passed the machine check to the guest." >> "The guest gets virtual #MC on an access to that page. >> When the guest tries to do set_memory_uc() and instructs >> cpa_flush() to do clean caches that results in taking another >> fault / exception perhaps because the VMM unmapped the page >> from the guest." >> >> Since the driver has special knowledge to handle NP or UC, > > I think a patch is needed before this one to make this statement true? I.e.: > > diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c > index ee8d9973f60b..11641f55025a 100644 > --- a/drivers/acpi/nfit/mce.c > +++ b/drivers/acpi/nfit/mce.c > @@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block > *nb, unsigned long val, > */ > mutex_lock(&acpi_desc_lock); > list_for_each_entry(acpi_desc, &acpi_descs, list) { > + unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc); > struct device *dev = acpi_desc->dev; > int found_match = 0; > > @@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block > *nb, unsigned long val, > > /* If this fails due to an -ENOMEM, there is little we can do */ > nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus, > - ALIGN(mce->addr, L1_CACHE_BYTES), > - L1_CACHE_BYTES); > + ALIGN(mce->addr, align), align); > nvdimm_region_notify(nfit_spa->nd_region, > NVDIMM_REVALIDATE_POISON); > Dan, I tried the above change, and this is what I got after injecting 8 back-to-back poisons, then read them and received SIGBUS/BUS_MCEERR_AR, then repair via the v7 patch which works until this change is added. [ 6240.955331] nfit ACPI0012:00: XXX, align = 100 [ 6240.960300] nfit ACPI0012:00: XXX, ALIGN(mce->addr, L1_CACHE_BYTES)=1851600400, L1_CACHE_BYTES=40, ALIGN(mce->addr, align)=1851600400 [..] [ 6242.052277] nfit ACPI0012:00: XXX, align = 100 [ 6242.057243] nfit ACPI0012:00: XXX, ALIGN(mce->addr, L1_CACHE_BYTES)=1851601000, L1_CACHE_BYTES=40, ALIGN(mce->addr, align)=1851601000 [..] [ 6244.917198] nfit ACPI0012:00: XXX, align = 1000 [ 6244.922258] nfit ACPI0012:00: XXX, ALIGN(mce->addr, L1_CACHE_BYTES)=1851601200, L1_CACHE_BYTES=40, ALIGN(mce->addr, align)=1851602000 [..] All 8 poisons remain uncleared. Without further investigation, I don't know why the failure. Could we mark this change to a follow-on task? The driver knows a lot about how to clear poisons besides hardcoding poison alignment to 0x40 bytes. > >> let's mark the poisoned page with NP and let driver handle it >> when it comes down to repair. >> >> Please refer to discussions here for more details. >> https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@xxxxxxxxxxxxxx/ >> >> Now since poisoned page is marked as not-present, in order to >> avoid writing to a 'np' page and trigger kernel Oops, also fix >> pmem_do_write(). >> >> Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") >> Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx> >> --- >> arch/x86/kernel/cpu/mce/core.c | 6 +++--- >> arch/x86/mm/pat/set_memory.c | 18 ++++++------------ >> drivers/nvdimm/pmem.c | 31 +++++++------------------------ >> include/linux/set_memory.h | 4 ++-- >> 4 files changed, 18 insertions(+), 41 deletions(-) >> >> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c >> index 981496e6bc0e..fa67bb9d1afe 100644 >> --- a/arch/x86/kernel/cpu/mce/core.c >> +++ b/arch/x86/kernel/cpu/mce/core.c >> @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, >> >> pfn = mce->addr >> PAGE_SHIFT; >> if (!memory_failure(pfn, 0)) { >> - set_mce_nospec(pfn, whole_page(mce)); >> + set_mce_nospec(pfn); >> mce->kflags |= MCE_HANDLED_UC; >> } >> >> @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb) >> >> ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); >> if (!ret) { >> - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); >> + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); >> sync_core(); >> return; >> } >> @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb) >> p->mce_count = 0; >> pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); >> if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) >> - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); >> + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); >> } >> >> static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) >> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c >> index 93dde949f224..404ffcb3f2cb 100644 >> --- a/arch/x86/mm/pat/set_memory.c >> +++ b/arch/x86/mm/pat/set_memory.c >> @@ -1926,13 +1926,8 @@ int set_memory_wb(unsigned long addr, int numpages) >> EXPORT_SYMBOL(set_memory_wb); >> >> #ifdef CONFIG_X86_64 >> -/* >> - * Prevent speculative access to the page by either unmapping >> - * it (if we do not require access to any part of the page) or >> - * marking it uncacheable (if we want to try to retrieve data >> - * from non-poisoned lines in the page). >> - */ >> -int set_mce_nospec(unsigned long pfn, bool unmap) >> +/* Prevent speculative access to a page by marking it not-present */ >> +int set_mce_nospec(unsigned long pfn) >> { >> unsigned long decoy_addr; >> int rc; >> @@ -1954,10 +1949,7 @@ int set_mce_nospec(unsigned long pfn, bool unmap) >> */ >> decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); >> >> - if (unmap) >> - rc = set_memory_np(decoy_addr, 1); >> - else >> - rc = set_memory_uc(decoy_addr, 1); >> + rc = set_memory_np(decoy_addr, 1); >> if (rc) >> pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); >> return rc; >> @@ -1966,7 +1958,9 @@ int set_mce_nospec(unsigned long pfn, bool unmap) >> /* Restore full speculative operation to the pfn. */ >> int clear_mce_nospec(unsigned long pfn) >> { >> - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); >> + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); >> + >> + return change_page_attr_set(&addr, 1, __pgprot(_PAGE_PRESENT), 0); > > This probably warrants a set_memory_present() helper. I had a set_memory_present() helper in an earlier version, but as there is no other caller, also if I'm not mis-remembering, thought Christoph had suggested to simplify the code and I agreed. If you feel strong about adding the helper back, I can do that to improve readibility. > >> } >> EXPORT_SYMBOL_GPL(clear_mce_nospec); >> >> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c >> index 58d95242a836..30c71a68175b 100644 >> --- a/drivers/nvdimm/pmem.c >> +++ b/drivers/nvdimm/pmem.c >> @@ -158,36 +158,19 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem, >> struct page *page, unsigned int page_off, >> sector_t sector, unsigned int len) >> { >> - blk_status_t rc = BLK_STS_OK; >> - bool bad_pmem = false; >> phys_addr_t pmem_off = sector * 512 + pmem->data_offset; >> void *pmem_addr = pmem->virt_addr + pmem_off; >> >> - if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) >> - bad_pmem = true; >> + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { >> + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); >> >> - /* >> - * Note that we write the data both before and after >> - * clearing poison. The write before clear poison >> - * handles situations where the latest written data is >> - * preserved and the clear poison operation simply marks >> - * the address range as valid without changing the data. >> - * In this case application software can assume that an >> - * interrupted write will either return the new good >> - * data or an error. >> - * >> - * However, if pmem_clear_poison() leaves the data in an >> - * indeterminate state we need to perform the write >> - * after clear poison. >> - */ >> + if (rc != BLK_STS_OK) >> + pr_warn_ratelimited("%s: failed to clear poison\n", __func__); > > This should be either "dev_warn_ratelimited(to_dev(pmem), ...", or a > trace point similar to trace_block_rq_complete() that tells userspace > about adverse I/O completion results. > > However, that's probably a discussion for another patch, so I would > just drop this new addition for now and we can discuss the logging in > a follow-on patch. Okay, drop the warning message. If the unexpected pathological scenario happens, user will see I/O failure. > > >> + return rc; >> + } >> flush_dcache_page(page); >> write_pmem(pmem_addr, page, page_off, len); >> - if (unlikely(bad_pmem)) { >> - rc = pmem_clear_poison(pmem, pmem_off, len); >> - write_pmem(pmem_addr, page, page_off, len); >> - } >> - >> - return rc; >> + return BLK_STS_OK; >> } >> >> static void pmem_submit_bio(struct bio *bio) >> diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h >> index d6263d7afb55..cde2d8687a7b 100644 >> --- a/include/linux/set_memory.h >> +++ b/include/linux/set_memory.h >> @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void) >> #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ >> >> #ifdef CONFIG_X86_64 >> -int set_mce_nospec(unsigned long pfn, bool unmap); >> +int set_mce_nospec(unsigned long pfn); >> int clear_mce_nospec(unsigned long pfn); >> #else >> -static inline int set_mce_nospec(unsigned long pfn, bool unmap) >> +static inline int set_mce_nospec(unsigned long pfn) > > Looks like after this change the "whole_page()" helper can be deleted > and the ->mce_whole_page flag in the task_struct can also go, right? > In a follow-on of course. Okay, will do in a follow-on patch. > > Other than those small issues, this looks good! thanks! -jane