On Fri, Sep 18, 2009 at 03:58:59PM +0800, Huang Ying wrote: > UCR (uncorrected recovery) MCE is supported in recent Intel CPUs, > where some hardware error such as some memory error can be reported > without PCC (processor context corrupted). To recover from such MCE, > the corresponding memory will be unmapped, and all processes accessing > the memory will be killed via SIGBUS. > > For KVM, if QEMU/KVM is killed, all guest processes will be killed > too. So we relay SIGBUS from host OS to guest system via a UCR MCE > injection. Then guest OS can isolate corresponding memory and kill > necessary guest processes only. SIGBUS sent to main thread (not VCPU > threads) will be broadcast to all VCPU threads as UCR MCE. > > v3: > > - Re-raise SIGBUS for SIGBUS not from MCE > - Kill itself for error in kvm_inject_x86_mce This is broken, non-MCE SIGBUS causes qemu-kvm to call the sigbus handler in a loop. BTW, how are you testing this and what guests have been tested? > > v2: > > - Use qemu_ram_addr_from_host instead of self made one to covert from > host address to guest RAM address. Thanks Anthony Liguori. > > Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx> > > --- > cpu-common.h | 1 > exec.c | 20 ++++- > qemu-kvm.c | 178 +++++++++++++++++++++++++++++++++++++++++++++++---- > qemu-kvm.h | 9 ++ > target-i386/cpu.h | 20 +++++ > target-i386/helper.c | 2 > 6 files changed, 208 insertions(+), 22 deletions(-) > > --- a/qemu-kvm.c > +++ b/qemu-kvm.c > @@ -27,10 +27,23 @@ > #include <sys/mman.h> > #include <sys/ioctl.h> > #include <signal.h> > +#include <sys/signalfd.h> > +#include <sys/prctl.h> > > #define false 0 > #define true 1 > > +#ifndef PR_MCE_KILL > +#define PR_MCE_KILL 33 > +#endif > + > +#ifndef BUS_MCEERR_AR > +#define BUS_MCEERR_AR 4 > +#endif > +#ifndef BUS_MCEERR_AO > +#define BUS_MCEERR_AO 5 > +#endif > + > #define EXPECTED_KVM_API_VERSION 12 > > #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION > @@ -1509,6 +1522,49 @@ static void sig_ipi_handler(int n) > { > } > > +static void hardware_memory_error(void) > +{ > + fprintf(stderr, "Hardware memory error!\n"); > + exit(1); > +} > + > +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx) > +{ > +#if defined(KVM_CAP_MCE) && defined(TARGET_I386) > + if (first_cpu->mcg_cap && siginfo->ssi_addr > + && siginfo->ssi_code == BUS_MCEERR_AO) { > + uint64_t status; > + unsigned long paddr; > + CPUState *cenv; > + > + /* Hope we are lucky for AO MCE */ > + if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) { > + fprintf(stderr, "Hardware memory error for memory used by " > + "QEMU itself instead of guest system!: %llx\n", > + (unsigned long long)siginfo->ssi_addr); > + return; > + } > + status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | 0xc0; > + kvm_inject_x86_mce(first_cpu, 9, status, > + MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr, > + (MCM_ADDR_PHYS << 6) | 0xc, 1); > + for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) > + kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC, > + MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1); > + } else > +#endif > + { > + if (siginfo->ssi_code == BUS_MCEERR_AO) > + return; > + else if (siginfo->ssi_code == BUS_MCEERR_AR) > + hardware_memory_error(); > + else > + raise(SIGBUS); > + } > +} > + > static void on_vcpu(CPUState *env, void (*func)(void *data), void *data) > { > struct qemu_work_item wi; > @@ -1666,29 +1722,101 @@ static void flush_queued_work(CPUState * > pthread_cond_broadcast(&qemu_work_cond); > } > > +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo) > +{ > +#if defined(KVM_CAP_MCE) && defined(TARGET_I386) > + struct kvm_x86_mce mce = { > + .bank = 9, > + }; > + unsigned long paddr; > + int r; > + > + if (env->mcg_cap && siginfo->si_addr > + && (siginfo->si_code == BUS_MCEERR_AR > + || siginfo->si_code == BUS_MCEERR_AO)) { > + if (siginfo->si_code == BUS_MCEERR_AR) { > + /* Fake an Intel architectural Data Load SRAR UCR */ > + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | MCI_STATUS_AR | 0x134; > + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; > + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; > + } else { > + /* Fake an Intel architectural Memory scrubbing UCR */ > + mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN > + | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S > + | 0xc0; > + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; > + mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; > + } > + if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) { > + fprintf(stderr, "Hardware memory error for memory used by " > + "QEMU itself instaed of guest system!\n"); > + /* Hope we are lucky for AO MCE */ > + if (siginfo->si_code == BUS_MCEERR_AO) > + return; > + else > + hardware_memory_error(); > + } > + mce.addr = paddr; > + r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce); > + if (r < 0) { > + fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno)); > + abort(); > + } > + } else > +#endif > + { > + if (siginfo->si_code == BUS_MCEERR_AO) > + return; > + else if (siginfo->si_code == BUS_MCEERR_AR) > + hardware_memory_error(); > + else > + raise(SIGBUS); > + } > +} > + > static void kvm_main_loop_wait(CPUState *env, int timeout) > { > struct timespec ts; > int r, e; > siginfo_t siginfo; > sigset_t waitset; > - > - pthread_mutex_unlock(&qemu_mutex); > + sigset_t chkset; > > ts.tv_sec = timeout / 1000; > ts.tv_nsec = (timeout % 1000) * 1000000; > sigemptyset(&waitset); > sigaddset(&waitset, SIG_IPI); > + sigaddset(&waitset, SIGBUS); > > - r = sigtimedwait(&waitset, &siginfo, &ts); > - e = errno; > + do { > + pthread_mutex_unlock(&qemu_mutex); > > - pthread_mutex_lock(&qemu_mutex); > + r = sigtimedwait(&waitset, &siginfo, &ts); > + e = errno; > > - if (r == -1 && !(e == EAGAIN || e == EINTR)) { > - printf("sigtimedwait: %s\n", strerror(e)); > - exit(1); > - } > + pthread_mutex_lock(&qemu_mutex); > + > + if (r == -1 && !(e == EAGAIN || e == EINTR)) { > + printf("sigtimedwait: %s\n", strerror(e)); > + exit(1); > + } > + > + switch (r) { > + case SIGBUS: > + kvm_on_sigbus(env, &siginfo); > + break; > + default: > + break; > + } > + > + r = sigpending(&chkset); > + if (r == -1) { > + printf("sigpending: %s\n", strerror(e)); > + exit(1); > + } > + } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); > > cpu_single_env = env; > flush_queued_work(env); > @@ -1769,6 +1897,7 @@ static void setup_kernel_sigmask(CPUStat > > sigprocmask(SIG_BLOCK, NULL, &set); > sigdelset(&set, SIG_IPI); > + sigdelset(&set, SIGBUS); > > kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set); > } > @@ -1896,12 +2025,20 @@ void kvm_hpet_enable_kpit(void) > > int kvm_init_ap(void) > { > + struct sigaction action; > + > #ifdef TARGET_I386 > kvm_tpr_opt_setup(); > #endif > qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL); > > signal(SIG_IPI, sig_ipi_handler); > + > + memset(&action, 0, sizeof(action)); > + action.sa_flags = SA_SIGINFO; > + action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; > + sigaction(SIGBUS, &action, NULL); > + prctl(PR_MCE_KILL, 1, 1); > return 0; > } > > @@ -1962,7 +2099,10 @@ static void sigfd_handler(void *opaque) > } > > sigaction(info.ssi_signo, NULL, &action); > - if (action.sa_handler) > + if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) > + action.sa_sigaction(info.ssi_signo, > + (siginfo_t *)&info, NULL); > + else if (action.sa_handler) > action.sa_handler(info.ssi_signo); > > } > @@ -2012,6 +2152,7 @@ int kvm_main_loop(void) > sigemptyset(&mask); > sigaddset(&mask, SIGIO); > sigaddset(&mask, SIGALRM); > + sigaddset(&mask, SIGBUS); > sigprocmask(SIG_BLOCK, &mask, NULL); > > sigfd = qemu_signalfd(&mask); > @@ -2508,6 +2649,7 @@ int kvm_set_boot_cpu_id(uint32_t id) > struct kvm_x86_mce_data { > CPUState *env; > struct kvm_x86_mce *mce; > + int abort_on_error; > }; > > static void kvm_do_inject_x86_mce(void *_data) > @@ -2516,13 +2658,17 @@ static void kvm_do_inject_x86_mce(void * > int r; > > r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce); > - if (r < 0) > + if (r < 0) { > perror("kvm_set_mce FAILED"); > + if (data->abort_on_error) > + abort(); > + } > } > #endif > > void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, > - uint64_t mcg_status, uint64_t addr, uint64_t misc) > + uint64_t mcg_status, uint64_t addr, uint64_t misc, > + int abort_on_error) > { > #ifdef KVM_CAP_MCE > struct kvm_x86_mce mce = { > @@ -2535,9 +2681,17 @@ void kvm_inject_x86_mce(CPUState *cenv, > struct kvm_x86_mce_data data = { > .env = cenv, > .mce = &mce, > + .abort_on_error = abort_on_error, > }; > > + if (!cenv->mcg_cap) { > + fprintf(stderr, "MCE support is not enabled!\n"); > + return; > + } > on_vcpu(cenv, kvm_do_inject_x86_mce, &data); > +#else > + if (abort_on_error) > + abort(); > #endif > } > #endif > --- a/target-i386/cpu.h > +++ b/target-i386/cpu.h > @@ -250,16 +250,32 @@ > #define PG_ERROR_RSVD_MASK 0x08 > #define PG_ERROR_I_D_MASK 0x10 > > -#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */ > +#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ > +#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ > > -#define MCE_CAP_DEF MCG_CTL_P > +#define MCE_CAP_DEF (MCG_CTL_P|MCG_SER_P) > #define MCE_BANKS_DEF 10 > > +#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ > +#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ > #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ > > #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ > #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ > #define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ > +#define MCI_STATUS_EN (1ULL<<60) /* error enabled */ > +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */ > +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */ > +#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */ > +#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ > +#define MCI_STATUS_AR (1ULL<<55) /* Action required */ > + > +/* MISC register defines */ > +#define MCM_ADDR_SEGOFF 0 /* segment offset */ > +#define MCM_ADDR_LINEAR 1 /* linear address */ > +#define MCM_ADDR_PHYS 2 /* physical address */ > +#define MCM_ADDR_MEM 3 /* memory address */ > +#define MCM_ADDR_GENERIC 7 /* generic */ > > #define MSR_IA32_TSC 0x10 > #define MSR_IA32_APICBASE 0x1b > --- a/cpu-common.h > +++ b/cpu-common.h > @@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr); > /* This should only be used for ram local to a device. */ > void *qemu_get_ram_ptr(ram_addr_t addr); > /* This should not be used by devices. */ > +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr); > ram_addr_t qemu_ram_addr_from_host(void *ptr); > > int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read, > --- a/exec.c > +++ b/exec.c > @@ -2600,9 +2600,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr) > return block->host + (addr - block->offset); > } > > -/* Some of the softmmu routines need to translate from a host pointer > - (typically a TLB entry) back to a ram offset. */ > -ram_addr_t qemu_ram_addr_from_host(void *ptr) > +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) > { > RAMBlock *prev; > RAMBlock **prevp; > @@ -2619,11 +2617,23 @@ ram_addr_t qemu_ram_addr_from_host(void > prev = block; > block = block->next; > } > - if (!block) { > + if (!block) > + return -1; > + *ram_addr = block->offset + (host - block->host); > + return 0; > +} > + > +/* Some of the softmmu routines need to translate from a host pointer > + (typically a TLB entry) back to a ram offset. */ > +ram_addr_t qemu_ram_addr_from_host(void *ptr) > +{ > + ram_addr_t ram_addr; > + > + if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) { > fprintf(stderr, "Bad ram pointer %p\n", ptr); > abort(); > } > - return block->offset + (host - block->host); > + return ram_addr; > } > > static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr) > --- a/qemu-kvm.h > +++ b/qemu-kvm.h > @@ -625,9 +625,11 @@ int kvm_inject_nmi(kvm_vcpu_context_t vc > * \param mcg_status MSR_MCG_STATUS > * \param addr MSR_MCI_ADDR > * \param misc MSR_MCI_MISC > + * \param abort_on_error abort on error > */ > void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status, > - uint64_t mcg_status, uint64_t addr, uint64_t misc); > + uint64_t mcg_status, uint64_t addr, uint64_t misc, > + int abort_on_error); > > /*! > * \brief Query wheather in kernel pit is used > @@ -943,8 +945,11 @@ static inline int kvm_init(int smp_cpus) > > static inline void kvm_inject_x86_mce(CPUState *cenv, int bank, > uint64_t status, uint64_t mcg_status, > - uint64_t addr, uint64_t misc) > + uint64_t addr, uint64_t misc, > + int abort_on_error) > { > + if (abort_on_error) > + abort(); > } > > > --- a/target-i386/helper.c > +++ b/target-i386/helper.c > @@ -1547,7 +1547,7 @@ void cpu_inject_x86_mce(CPUState *cenv, > uint64_t *banks = cenv->mce_banks; > > if (kvm_enabled()) { > - kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc); > + kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0); > return; > } > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html