On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote: > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs, > where some hardware error such as some memory error can be reported > without PCC (processor context corrupted). To recover from such MCE, > the corresponding memory will be unmapped, and all processes accessing > the memory will be killed via SIGBUS. > > For KVM, if QEMU/KVM is killed, all guest processes will be killed > too. So we relay SIGBUS from host OS to guest system via a UCR MCE > injection. Then guest OS can isolate corresponding memory and kill > necessary guest processes only. SIGBUS sent to main thread (not VCPU > threads) will be broadcast to all VCPU threads as UCR MCE. > > v2: > > - Use qemu_ram_addr_from_host instead of self made one to convert from > host address to guest RAM address. Thanks Anthony Liguori. > > Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx> > > --- > cpu-common.h | 1 > exec.c | 20 +++++-- > qemu-kvm.c | 154 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > target-i386/cpu.h | 20 ++++++- > 4 files changed, 178 insertions(+), 17 deletions(-) > > --- a/qemu-kvm.c > +++ b/qemu-kvm.c > @@ -27,10 +27,23 @@ > #include <sys/mman.h> > #include <sys/ioctl.h> > #include <signal.h> > +#include <sys/signalfd.h> > +#include <sys/prctl.h> > > #define false 0 > #define true 1 > > +#ifndef PR_MCE_KILL > +#define PR_MCE_KILL 33 > +#endif > + > +#ifndef BUS_MCEERR_AR > +#define BUS_MCEERR_AR 4 > +#endif > +#ifndef BUS_MCEERR_AO > +#define BUS_MCEERR_AO 5 > +#endif > + > #define EXPECTED_KVM_API_VERSION 12 > > #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION > @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n) > { > } > > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx) > +{ > + if (siginfo->ssi_code == BUS_MCEERR_AO) { > + uint64_t status; > + unsigned long paddr; > + CPUState *cenv; > + > + /* Hope we are lucky for AO MCE */ > + if (do_qemu_ram_addr_from_host((void 
*)siginfo->ssi_addr, &paddr)) { > + fprintf(stderr, "Hardware memory error for memory used by " > + "QEMU itself instead of guest system!: %llx\n", > + (unsigned long long)siginfo->ssi_addr); > + return; qemu-kvm should die here? > + } > + status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | 0xc0; > + kvm_inject_x86_mce(first_cpu, 9, status, > + MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr, > + (MCM_ADDR_PHYS << 6) | 0xc); > + for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) > + kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC, > + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0); > + return; Should abort if kvm_inject_x86_mce fails? > + } else if (siginfo->ssi_code == BUS_MCEERR_AR) > + fprintf(stderr, "Hardware memory error!\n"); > + else > + fprintf(stderr, "Internal error in QEMU!\n"); Can you re-raise SIGBUS so we get a coredump on non-MCE SIGBUS as usual? > + exit(1); > +} > + > static void on_vcpu(CPUState *env, void (*func)(void *data), void *data) > { > struct qemu_work_item wi; > @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState * > pthread_cond_broadcast(&qemu_work_cond); > } > > +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo) > +{ > +#if defined(KVM_CAP_MCE) && defined(TARGET_I386) > + struct kvm_x86_mce mce = { > + .bank = 9, > + }; > + unsigned long paddr; > + int r; > + > + if (env->mcg_cap && siginfo->si_addr > + && (siginfo->si_code == BUS_MCEERR_AR > + || siginfo->si_code == BUS_MCEERR_AO)) { > + if (siginfo->si_code == BUS_MCEERR_AR) { > + /* Fake an Intel architectural Data Load SRAR UCR */ > + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | MCI_STATUS_AR | 0x134; > + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; > + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; > + } else { > + /* Fake an Intel architectural Memory scrubbing UCR */ > + mce.status = 
MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | 0xc0; > + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; > + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; > + } > + if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) { > + fprintf(stderr, "Hardware memory error for memory used by " > + "QEMU itself instaed of guest system!\n"); > + /* Hope we are lucky for AO MCE */ > + if (siginfo->si_code == BUS_MCEERR_AO) > + return; Should die? > + else > + exit(1); > + } > + mce.addr = paddr; > + r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce); > + if (r < 0) { > + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); > + exit(1); > + } > + } else > +#endif > + { > + if (siginfo->si_code == BUS_MCEERR_AO) > + return; > + if (siginfo->si_code == BUS_MCEERR_AR) > + fprintf(stderr, "Hardware memory error!\n"); > + else > + fprintf(stderr, "Internal error in QEMU!\n"); > + exit(1); > + } > +} -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html