On Tue, Apr 5, 2022 at 12:48 PM Jane Chu <jane.chu@xxxxxxxxxx> wrote: > > The set_memory_uc() approach doesn't work well in all cases. > For example, when "The VMM unmapped the bad page from guest > physical space and passed the machine check to the guest." > "The guest gets virtual #MC on an access to that page. > When the guest tries to do set_memory_uc() and instructs > cpa_flush() to do clean caches that results in taking another > fault / exception perhaps because the VMM unmapped the page > from the guest." > > Since the driver has special knowledge to handle NP or UC, I think a patch is needed before this one to make this statement true? I.e.: diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index ee8d9973f60b..11641f55025a 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -32,6 +32,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, */ mutex_lock(&acpi_desc_lock); list_for_each_entry(acpi_desc, &acpi_descs, list) { + unsigned int align = 1UL << MCI_MISC_ADDR_LSB(mce->misc); struct device *dev = acpi_desc->dev; int found_match = 0; @@ -63,8 +64,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, /* If this fails due to an -ENOMEM, there is little we can do */ nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus, - ALIGN(mce->addr, L1_CACHE_BYTES), - L1_CACHE_BYTES); + ALIGN(mce->addr, align), align); nvdimm_region_notify(nfit_spa->nd_region, NVDIMM_REVALIDATE_POISON); > let's mark the poisoned page with NP and let driver handle it > when it comes down to repair. > > Please refer to discussions here for more details. > https://lore.kernel.org/all/CAPcyv4hrXPb1tASBZUg-GgdVs0OOFKXMXLiHmktg_kFi7YBMyQ@xxxxxxxxxxxxxx/ > > Now since poisoned page is marked as not-present, in order to > avoid writing to a 'np' page and trigger kernel Oops, also fix > pmem_do_write(). > > Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()") > Signed-off-by: Jane Chu <jane.chu@xxxxxxxxxx> > --- > arch/x86/kernel/cpu/mce/core.c | 6 +++--- > arch/x86/mm/pat/set_memory.c | 18 ++++++------------ > drivers/nvdimm/pmem.c | 31 +++++++------------------------ > include/linux/set_memory.h | 4 ++-- > 4 files changed, 18 insertions(+), 41 deletions(-) > > diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c > index 981496e6bc0e..fa67bb9d1afe 100644 > --- a/arch/x86/kernel/cpu/mce/core.c > +++ b/arch/x86/kernel/cpu/mce/core.c > @@ -579,7 +579,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, > > pfn = mce->addr >> PAGE_SHIFT; > if (!memory_failure(pfn, 0)) { > - set_mce_nospec(pfn, whole_page(mce)); > + set_mce_nospec(pfn); > mce->kflags |= MCE_HANDLED_UC; > } > > @@ -1316,7 +1316,7 @@ static void kill_me_maybe(struct callback_head *cb) > > ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); > if (!ret) { > - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); > + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); > sync_core(); > return; > } > @@ -1342,7 +1342,7 @@ static void kill_me_never(struct callback_head *cb) > p->mce_count = 0; > pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); > if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) > - set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page); > + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); > } > > static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) > diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c > index 93dde949f224..404ffcb3f2cb 100644 > --- a/arch/x86/mm/pat/set_memory.c > +++ b/arch/x86/mm/pat/set_memory.c > @@ -1926,13 +1926,8 @@ int set_memory_wb(unsigned long addr, int numpages) > EXPORT_SYMBOL(set_memory_wb); > > #ifdef CONFIG_X86_64 > -/* > - * Prevent speculative access to the page by either unmapping > - * it (if we do not require access to any part of the page) or > - * marking it uncacheable (if we want to try to retrieve data > - * from non-poisoned lines in the page). > - */ > -int set_mce_nospec(unsigned long pfn, bool unmap) > +/* Prevent speculative access to a page by marking it not-present */ > +int set_mce_nospec(unsigned long pfn) > { > unsigned long decoy_addr; > int rc; > @@ -1954,10 +1949,7 @@ int set_mce_nospec(unsigned long pfn, bool unmap) > */ > decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); > > - if (unmap) > - rc = set_memory_np(decoy_addr, 1); > - else > - rc = set_memory_uc(decoy_addr, 1); > + rc = set_memory_np(decoy_addr, 1); > if (rc) > pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); > return rc; > @@ -1966,7 +1958,9 @@ int set_mce_nospec(unsigned long pfn, bool unmap) > /* Restore full speculative operation to the pfn. */ > int clear_mce_nospec(unsigned long pfn) > { > - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); > + unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); > + > + return change_page_attr_set(&addr, 1, __pgprot(_PAGE_PRESENT), 0); This probably warrants a set_memory_present() helper. > } > EXPORT_SYMBOL_GPL(clear_mce_nospec); > > diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c > index 58d95242a836..30c71a68175b 100644 > --- a/drivers/nvdimm/pmem.c > +++ b/drivers/nvdimm/pmem.c > @@ -158,36 +158,19 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem, > struct page *page, unsigned int page_off, > sector_t sector, unsigned int len) > { > - blk_status_t rc = BLK_STS_OK; > - bool bad_pmem = false; > phys_addr_t pmem_off = sector * 512 + pmem->data_offset; > void *pmem_addr = pmem->virt_addr + pmem_off; > > - if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) > - bad_pmem = true; > + if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) { > + blk_status_t rc = pmem_clear_poison(pmem, pmem_off, len); > > - /* > - * Note that we write the data both before and after > - * clearing poison. The write before clear poison > - * handles situations where the latest written data is > - * preserved and the clear poison operation simply marks > - * the address range as valid without changing the data. > - * In this case application software can assume that an > - * interrupted write will either return the new good > - * data or an error. > - * > - * However, if pmem_clear_poison() leaves the data in an > - * indeterminate state we need to perform the write > - * after clear poison. > - */ > + if (rc != BLK_STS_OK) > + pr_warn_ratelimited("%s: failed to clear poison\n", __func__); This should be either "dev_warn_ratelimited(to_dev(pmem), ...", or a trace point similar to trace_block_rq_complete() that tells userspace about adverse I/O completion results. However, that's probably a discussion for another patch, so I would just drop this new addition for now and we can discuss the logging in a follow-on patch. > + return rc; > + } > flush_dcache_page(page); > write_pmem(pmem_addr, page, page_off, len); > - if (unlikely(bad_pmem)) { > - rc = pmem_clear_poison(pmem, pmem_off, len); > - write_pmem(pmem_addr, page, page_off, len); > - } > - > - return rc; > + return BLK_STS_OK; > } > > static void pmem_submit_bio(struct bio *bio) > diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h > index d6263d7afb55..cde2d8687a7b 100644 > --- a/include/linux/set_memory.h > +++ b/include/linux/set_memory.h > @@ -43,10 +43,10 @@ static inline bool can_set_direct_map(void) > #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ > > #ifdef CONFIG_X86_64 > -int set_mce_nospec(unsigned long pfn, bool unmap); > +int set_mce_nospec(unsigned long pfn); > int clear_mce_nospec(unsigned long pfn); > #else > -static inline int set_mce_nospec(unsigned long pfn, bool unmap) > +static inline int set_mce_nospec(unsigned long pfn) Looks like after this change the "whole_page()" helper can be deleted and the ->mce_whole_page flag in the task_struct can also go, right? In a follow-on of course. Other than those small issues, this looks good!