On Wed, May 12, 2010 at 02:44:03PM +0800, Huang Ying wrote:
> In common cases, guest SRAO MCE will cause corresponding poisoned page
> be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
> the MCE to guest OS.
>
> But it is reported that if the poisoned page is accessed in guest
> after un-mapped and before MCE is relayed to guest OS, QEMU-KVM will
> be killed.
>
> The reason is as follow. Because poisoned page has been un-mapped,
> guest access will cause guest exit and kvm_mmu_page_fault will be
> called. kvm_mmu_page_fault can not get the poisoned page for fault
> address, so kernel and user space MMIO processing is tried in turn. In
> user MMIO processing, poisoned page is accessed again, then QEMU-KVM
> is killed by force_sig_info.
>
> To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
> and do not try kernel and user space MMIO processing for poisoned
> page.
>
>
> Changelog:
>
> v2:
>
> - Use page table walker to determine whether the virtual address is
>   poisoned to avoid change user space interface (via changing
>   get_user_pages).
>
> - Wrap bad page processing into kvm_handle_bad_page to avoid code
>   duplicating.
>
> Reported-by: Max Asbock <masbock@xxxxxxxxxxxxxxxxxx>
> Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
> ---
>  arch/x86/kvm/mmu.c         | 34 ++++++++++++++++++++++++++--------
>  arch/x86/kvm/paging_tmpl.h |  7 ++-----
>  include/linux/kvm_host.h   |  1 +
>  include/linux/mm.h         |  8 ++++++++
>  mm/memory-failure.c        | 28 ++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c        | 30 ++++++++++++++++++++++++++++--
>  6 files changed, 93 insertions(+), 15 deletions(-)
>
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -32,6 +32,7 @@
>  #include <linux/compiler.h>
>  #include <linux/srcu.h>
>  #include <linux/slab.h>
> +#include <linux/uaccess.h>
>
>  #include <asm/page.h>
>  #include <asm/cmpxchg.h>
> @@ -1975,6 +1976,27 @@ static int __direct_map(struct kvm_vcpu
>  	return pt_write;
>  }
>
> +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
> +{
> +	char buf[1];
> +	void __user *hva;
> +	int r;
> +
> +	/* Touch the page, so send SIGBUS */
> +	hva = (void __user *)gfn_to_hva(kvm, gfn);
> +	r = copy_from_user(buf, hva, 1);
> +}

A SIGBUS signal has already been raised by memory poisoning, so I don't
see why this is needed. To avoid the MMIO processing in userspace before
the MCE is sent to the guest, you can just return -EAGAIN from the page
fault handlers back to kvm_mmu_page_fault.
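Something along these lines should be enough (untested sketch, just to
illustrate the idea; check_hwpoison_pfn() is a made-up helper name, the
real check would sit in the tdp/shadow fault paths right after
gfn_to_pfn()):

	/*
	 * Sketch: if gfn_to_pfn() handed back the special hwpoison pfn,
	 * tell kvm_mmu_page_fault() to bail out instead of falling
	 * through to kernel and user space MMIO emulation.  The SIGBUS
	 * that memory poisoning already raised against the qemu-kvm
	 * process is then free to do its job.
	 */
	static int check_hwpoison_pfn(pfn_t pfn)
	{
		if (!is_hwpoison_pfn(pfn))
			return 0;

		kvm_release_pfn_clean(pfn);
		return -EAGAIN;	/* propagated up by kvm_mmu_page_fault() */
	}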
> +int is_hwpoison_pfn(pfn_t pfn)
> +{
> +	return pfn == hwpoison_pfn;
> +}
> +EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
> +
>  static inline unsigned long bad_hva(void)
>  {
>  	return PAGE_OFFSET;
> @@ -939,6 +948,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm,
>  	if (unlikely(npages != 1)) {
>  		struct vm_area_struct *vma;
>
> +		if (is_hwpoison_address(addr)) {
> +			get_page(hwpoison_page);
> +			return page_to_pfn(hwpoison_page);
> +		}
> +
>  		down_read(&current->mm->mmap_sem);
>  		vma = find_vma(current->mm, addr);
>
> @@ -2198,6 +2212,15 @@ int kvm_init(void *opaque, unsigned int
>
>  	bad_pfn = page_to_pfn(bad_page);
>
> +	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +
> +	if (hwpoison_page == NULL) {
> +		r = -ENOMEM;
> +		goto out_free_0;
> +	}
> +
> +	hwpoison_pfn = page_to_pfn(hwpoison_page);
> +
>  	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
>  		r = -ENOMEM;
>  		goto out_free_0;
> @@ -2269,6 +2292,8 @@ out_free_1:
>  out_free_0a:
>  	free_cpumask_var(cpus_hardware_enabled);
>  out_free_0:
> +	if (hwpoison_page)
> +		__free_page(hwpoison_page);
>  	__free_page(bad_page);
>  out:
>  	kvm_arch_exit();
> @@ -2291,6 +2316,7 @@ void kvm_exit(void)
>  	kvm_arch_hardware_unsetup();
>  	kvm_arch_exit();
>  	free_cpumask_var(cpus_hardware_enabled);
> +	__free_page(hwpoison_page);
>  	__free_page(bad_page);
>  }
>  EXPORT_SYMBOL_GPL(kvm_exit);
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -45,6 +45,7 @@
>  #include <linux/page-isolation.h>
>  #include <linux/suspend.h>
>  #include <linux/slab.h>
> +#include <linux/swapops.h>
>  #include "internal.h"
>
>  int sysctl_memory_failure_early_kill __read_mostly = 0;
> @@ -1296,3 +1297,30 @@ done:
>  	/* keep elevated page count for bad page */
>  	return ret;
>  }
> +
> +int is_hwpoison_address(unsigned long addr)
> +{
> +	pgd_t *pgdp;
> +	pud_t *pudp;
> +	pmd_t *pmdp;
> +	pte_t pte, *ptep;
> +	swp_entry_t entry;
> +
> +	pgdp = pgd_offset(current->mm, addr);
> +	if (!pgd_present(*pgdp))
> +		return 0;
> +	pudp = pud_offset(pgdp, addr);
> +	if (!pud_present(*pudp))
> +		return 0;
> +	pmdp = pmd_offset(pudp, addr);
> +	if (!pmd_present(*pmdp))
> +		return 0;

Need to bail out if pmd is huge (see the sketch after the quoted patch).

> +	ptep = pte_offset_map(pmdp, addr);
> +	pte = *ptep;
> +	pte_unmap(ptep);
> +	if (!is_swap_pte(pte))
> +		return 0;
> +	entry = pte_to_swp_entry(pte);
> +	return is_hwpoison_entry(entry);
> +}
> +EXPORT_SYMBOL_GPL(is_hwpoison_address);
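Something like this should do for the huge pmd case (untested sketch;
pmd_large() is the x86 predicate, a generic check may need pmd_huge()
from hugetlb instead):

	pmdp = pmd_offset(pudp, addr);
	if (!pmd_present(*pmdp))
		return 0;
	/*
	 * A huge pmd is a leaf entry with no pte page below it, so
	 * pte_offset_map() must not be called on it.
	 */
	if (pmd_large(*pmdp))
		return 0;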