In the HWPoison processing code, not only is the struct page corresponding to the faulty physical memory page marked as HWPoison, but the virtual addresses in the processes mapping that page are marked as HWPoison as well, so that any further access to those virtual addresses kills the corresponding processes with SIGBUS. If the faulty physical memory page is used by a KVM guest, the SIGBUS is sent to QEMU, and QEMU simulates an MCE to report the memory error to the guest OS. If the guest OS cannot recover from the error (for example, because the page is accessed by kernel code), the guest OS reboots the system. But because the underlying host virtual address backing the guest physical memory is still poisoned, if the guest accesses the corresponding guest physical memory after rebooting, the SIGBUS is still sent to QEMU and an MCE is simulated again. That is, the guest cannot recover by rebooting. In fact, the contents of a guest physical memory page need not be preserved across a reboot, so we can allocate a new host physical page to back the corresponding guest physical address. To do that, this patch provides a mechanism in KVM to "unpoison" a poisoned virtual address by clearing the corresponding PTE. When rebooting, QEMU can then unpoison the poisoned virtual address, and when the unpoisoned memory is next accessed, a new physical memory page is allocated if possible.
Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
---
 include/linux/kvm.h |    1 +
 include/linux/mm.h  |    8 ++++++++
 mm/memory-failure.c |   53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c |   18 ++++++++++++++++++
 4 files changed, 80 insertions(+)

--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -676,6 +676,7 @@ struct kvm_clock_data {
 #define KVM_SET_PIT2              _IOW(KVMIO, 0xa0, struct kvm_pit_state2)
 /* Available with KVM_CAP_PPC_GET_PVINFO */
 #define KVM_PPC_GET_PVINFO	  _IOW(KVMIO, 0xa1, struct kvm_ppc_pvinfo)
+#define KVM_UNPOISON_ADDRESS	  _IO(KVMIO, 0xa2)
 
 /*
  * ioctls for vcpu fds
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1512,6 +1512,14 @@ extern int sysctl_memory_failure_recover
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int unpoison_address(unsigned long addr);
+#else
+static inline int unpoison_address(unsigned long addr)
+{
+	return -EINVAL;
+}
+#endif
 
 extern void dump_page(struct page *page);
 
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1433,3 +1433,56 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+/**
+ * unpoison_address - clear the HWPoison marker from a PTE of the current task
+ * @addr: virtual address in current->mm to unpoison
+ *
+ * Walk the page tables of current->mm down to the PTE for @addr and, if
+ * that PTE holds a hwpoison swap entry, clear it so that a later access
+ * can be backed by a newly allocated page.  Huge mappings are rejected.
+ * kvm_unpoison_address() calls this with current->mm->mmap_sem held for
+ * read.
+ *
+ * Return: 0 if a hwpoison entry was cleared, -EINVAL otherwise.
+ */
+int unpoison_address(unsigned long addr)
+{
+	struct mm_struct *mm;
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	spinlock_t *ptl;
+	swp_entry_t entry;
+	int rc;
+
+	mm = current->mm;
+	pgdp = pgd_offset(mm, addr);
+	if (!pgd_present(*pgdp))
+		return -EINVAL;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return -EINVAL;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	/* can not unpoison huge page yet */
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return -EINVAL;
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	pte = *ptep;
+	rc = -EINVAL;
+	if (!is_swap_pte(pte))
+		goto out;
+	entry = pte_to_swp_entry(pte);
+	if (!is_hwpoison_entry(entry))
+		goto out;
+	pte_clear(mm, addr, ptep);
+	/* The hwpoison entry is gone; report success to the caller. */
+	rc = 0;
+out:
+	pte_unmap_unlock(ptep, ptl);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(unpoison_address);
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -774,6 +774,21 @@ int kvm_vm_ioctl_set_memory_region(struc
 	return kvm_set_memory_region(kvm, mem, user_alloc);
 }
 
+/*
+ * Handler for the KVM_UNPOISON_ADDRESS vm ioctl: unpoison @address in the
+ * calling process, holding mmap_sem for read around the page table walk.
+ */
+static int kvm_unpoison_address(struct kvm *kvm, unsigned long address)
+{
+	int r;
+
+	down_read(&current->mm->mmap_sem);
+	r = unpoison_address(address);
+	up_read(&current->mm->mmap_sem);
+
+	return r;
+}
+
 int kvm_get_dirty_log(struct kvm *kvm,
 			struct kvm_dirty_log *log, int *is_dirty)
 {
@@ -1728,6 +1743,9 @@ static long kvm_vm_ioctl(struct file *fi
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_UNPOISON_ADDRESS:
+		r = kvm_unpoison_address(kvm, arg);
+		break;
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html