Re: [PATCH -v3] QEMU-KVM: MCE: Relay UCR MCE to guest

Marcelo Tosatti <mtosatti@xxxxxxxxxx> · Fri, 18 Sep 2009 14:05:35 -0300

On Fri, Sep 18, 2009 at 03:58:59PM +0800, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
> 
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
> 
> v3:
> 
> - Re-raise SIGBUS for SIGBUS not from MCE
> - Kill itself for error in kvm_inject_x86_mce

This is broken, non-MCE SIGBUS causes qemu-kvm to call the sigbus
handler in a loop.

BTW, how are you testing this and what guests have been tested?

> 
> v2:
> 
> - Use qemu_ram_addr_from_host instead of self made one to covert from
>   host address to guest RAM address. Thanks Anthony Liguori.
> 
> Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
> 
> ---
>  cpu-common.h         |    1 
>  exec.c               |   20 ++++-
>  qemu-kvm.c           |  178 +++++++++++++++++++++++++++++++++++++++++++++++----
>  qemu-kvm.h           |    9 ++
>  target-i386/cpu.h    |   20 +++++
>  target-i386/helper.c |    2 
>  6 files changed, 208 insertions(+), 22 deletions(-)
> 
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
>  #include <sys/mman.h>
>  #include <sys/ioctl.h>
>  #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>  
>  #define false 0
>  #define true 1
>  
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
>  #define EXPECTED_KVM_API_VERSION 12
>  
>  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -1509,6 +1522,49 @@ static void sig_ipi_handler(int n)
>  {
>  }
>  
> +static void hardware_memory_error(void)
> +{
> +    fprintf(stderr, "Hardware memory error!\n");
> +    exit(1);
> +}
> +
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> +    if (first_cpu->mcg_cap && siginfo->ssi_addr
> +        && siginfo->ssi_code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        unsigned long paddr;
> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!: %llx\n",
> +                    (unsigned long long)siginfo->ssi_addr);
> +            return;
> +        }
> +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +            | 0xc0;
> +        kvm_inject_x86_mce(first_cpu, 9, status,
> +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> +                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
> +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
> +    } else
> +#endif
> +    {
> +        if (siginfo->ssi_code == BUS_MCEERR_AO)
> +            return;
> +        else if (siginfo->ssi_code == BUS_MCEERR_AR)
> +            hardware_memory_error();
> +        else
> +            raise(SIGBUS);
> +    }
> +}
> +
>  static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
>  {
>      struct qemu_work_item wi;
> @@ -1666,29 +1722,101 @@ static void flush_queued_work(CPUState *
>      pthread_cond_broadcast(&qemu_work_cond);
>  }
>  
> +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> +    struct kvm_x86_mce mce = {
> +            .bank = 9,
> +    };
> +    unsigned long paddr;
> +    int r;
> +
> +    if (env->mcg_cap && siginfo->si_addr
> +        && (siginfo->si_code == BUS_MCEERR_AR
> +            || siginfo->si_code == BUS_MCEERR_AO)) {
> +        if (siginfo->si_code == BUS_MCEERR_AR) {
> +            /* Fake an Intel architectural Data Load SRAR UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | MCI_STATUS_AR | 0x134;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> +        } else {
> +            /* Fake an Intel architectural Memory scrubbing UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | 0xc0;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> +        }
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instaed of guest system!\n");
> +            /* Hope we are lucky for AO MCE */
> +            if (siginfo->si_code == BUS_MCEERR_AO)
> +                return;
> +            else
> +                hardware_memory_error();
> +        }
> +        mce.addr = paddr;
> +        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
> +        if (r < 0) {
> +            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> +            abort();
> +        }
> +    } else
> +#endif
> +    {
> +        if (siginfo->si_code == BUS_MCEERR_AO)
> +            return;
> +        else if (siginfo->si_code == BUS_MCEERR_AR)
> +            hardware_memory_error();
> +        else
> +            raise(SIGBUS);
> +    }
> +}
> +
>  static void kvm_main_loop_wait(CPUState *env, int timeout)
>  {
>      struct timespec ts;
>      int r, e;
>      siginfo_t siginfo;
>      sigset_t waitset;
> -
> -    pthread_mutex_unlock(&qemu_mutex);
> +    sigset_t chkset;
>  
>      ts.tv_sec = timeout / 1000;
>      ts.tv_nsec = (timeout % 1000) * 1000000;
>      sigemptyset(&waitset);
>      sigaddset(&waitset, SIG_IPI);
> +    sigaddset(&waitset, SIGBUS);
>  
> -    r = sigtimedwait(&waitset, &siginfo, &ts);
> -    e = errno;
> +    do {
> +        pthread_mutex_unlock(&qemu_mutex);
>  
> -    pthread_mutex_lock(&qemu_mutex);
> +        r = sigtimedwait(&waitset, &siginfo, &ts);
> +        e = errno;
>  
> -    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> -        printf("sigtimedwait: %s\n", strerror(e));
> -        exit(1);
> -    }
> +        pthread_mutex_lock(&qemu_mutex);
> +
> +        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
> +            printf("sigtimedwait: %s\n", strerror(e));
> +            exit(1);
> +        }
> +
> +        switch (r) {
> +        case SIGBUS:
> +            kvm_on_sigbus(env, &siginfo);
> +            break;
> +        default:
> +            break;
> +        }
> +
> +        r = sigpending(&chkset);
> +        if (r == -1) {
> +            printf("sigpending: %s\n", strerror(e));
> +            exit(1);
> +        }
> +    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
>  
>      cpu_single_env = env;
>      flush_queued_work(env);
> @@ -1769,6 +1897,7 @@ static void setup_kernel_sigmask(CPUStat
>  
>      sigprocmask(SIG_BLOCK, NULL, &set);
>      sigdelset(&set, SIG_IPI);
> +    sigdelset(&set, SIGBUS);
>  
>      kvm_set_signal_mask(env->kvm_cpu_state.vcpu_ctx, &set);
>  }
> @@ -1896,12 +2025,20 @@ void kvm_hpet_enable_kpit(void)
>  
>  int kvm_init_ap(void)
>  {
> +    struct sigaction action;
> +
>  #ifdef TARGET_I386
>      kvm_tpr_opt_setup();
>  #endif
>      qemu_add_vm_change_state_handler(kvm_vm_state_change_handler, NULL);
>  
>      signal(SIG_IPI, sig_ipi_handler);
> +
> +    memset(&action, 0, sizeof(action));
> +    action.sa_flags = SA_SIGINFO;
> +    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
> +    sigaction(SIGBUS, &action, NULL);
> +    prctl(PR_MCE_KILL, 1, 1);
>      return 0;
>  }
>  
> @@ -1962,7 +2099,10 @@ static void sigfd_handler(void *opaque)
>          }
>  
>          sigaction(info.ssi_signo, NULL, &action);
> -        if (action.sa_handler)
> +        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction)
> +            action.sa_sigaction(info.ssi_signo,
> +                                (siginfo_t *)&info, NULL);
> +	else if (action.sa_handler)
>              action.sa_handler(info.ssi_signo);
>  
>      }
> @@ -2012,6 +2152,7 @@ int kvm_main_loop(void)
>      sigemptyset(&mask);
>      sigaddset(&mask, SIGIO);
>      sigaddset(&mask, SIGALRM);
> +    sigaddset(&mask, SIGBUS);
>      sigprocmask(SIG_BLOCK, &mask, NULL);
>  
>      sigfd = qemu_signalfd(&mask);
> @@ -2508,6 +2649,7 @@ int kvm_set_boot_cpu_id(uint32_t id)
>  struct kvm_x86_mce_data {
>      CPUState *env;
>      struct kvm_x86_mce *mce;
> +    int abort_on_error;
>  };
>  
>  static void kvm_do_inject_x86_mce(void *_data)
> @@ -2516,13 +2658,17 @@ static void kvm_do_inject_x86_mce(void *
>      int r;
>  
>      r = kvm_set_mce(data->env->kvm_cpu_state.vcpu_ctx, data->mce);
> -    if (r < 0)
> +    if (r < 0) {
>          perror("kvm_set_mce FAILED");
> +        if (data->abort_on_error)
> +            abort();
> +    }
>  }
>  #endif
>  
>  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> -                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
> +                        int abort_on_error)
>  {
>  #ifdef KVM_CAP_MCE
>      struct kvm_x86_mce mce = {
> @@ -2535,9 +2681,17 @@ void kvm_inject_x86_mce(CPUState *cenv, 
>      struct kvm_x86_mce_data data = {
>          .env = cenv,
>          .mce = &mce,
> +        .abort_on_error = abort_on_error,
>      };
>  
> +    if (!cenv->mcg_cap) {
> +        fprintf(stderr, "MCE support is not enabled!\n");
> +        return;
> +    }
>      on_vcpu(cenv, kvm_do_inject_x86_mce, &data);
> +#else
> +    if (abort_on_error)
> +        abort();
>  #endif
>  }
>  #endif
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -250,16 +250,32 @@
>  #define PG_ERROR_RSVD_MASK 0x08
>  #define PG_ERROR_I_D_MASK  0x10
>  
> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>  
> -#define MCE_CAP_DEF	MCG_CTL_P
> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>  #define MCE_BANKS_DEF	10
>  
> +#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
> +#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
>  #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
>  
>  #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
>  #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
>  #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
> +#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
> +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
> +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
> +#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
> +#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
> +#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
> +
> +/* MISC register defines */
> +#define MCM_ADDR_SEGOFF	0	/* segment offset */
> +#define MCM_ADDR_LINEAR	1	/* linear address */
> +#define MCM_ADDR_PHYS	2	/* physical address */
> +#define MCM_ADDR_MEM	3	/* memory address */
> +#define MCM_ADDR_GENERIC 7	/* generic */
>  
>  #define MSR_IA32_TSC                    0x10
>  #define MSR_IA32_APICBASE               0x1b
> --- a/cpu-common.h
> +++ b/cpu-common.h
> @@ -34,6 +34,7 @@ void qemu_ram_free(ram_addr_t addr);
>  /* This should only be used for ram local to a device.  */
>  void *qemu_get_ram_ptr(ram_addr_t addr);
>  /* This should not be used by devices.  */
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
>  ram_addr_t qemu_ram_addr_from_host(void *ptr);
>  
>  int cpu_register_io_memory(CPUReadMemoryFunc * const *mem_read,
> --- a/exec.c
> +++ b/exec.c
> @@ -2600,9 +2600,7 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>      return block->host + (addr - block->offset);
>  }
>  
> -/* Some of the softmmu routines need to translate from a host pointer
> -   (typically a TLB entry) back to a ram offset.  */
> -ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +int do_qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
>  {
>      RAMBlock *prev;
>      RAMBlock **prevp;
> @@ -2619,11 +2617,23 @@ ram_addr_t qemu_ram_addr_from_host(void 
>          prev = block;
>          block = block->next;
>      }
> -    if (!block) {
> +    if (!block)
> +        return -1;
> +    *ram_addr = block->offset + (host - block->host);
> +    return 0;
> +}
> +
> +/* Some of the softmmu routines need to translate from a host pointer
> +   (typically a TLB entry) back to a ram offset.  */
> +ram_addr_t qemu_ram_addr_from_host(void *ptr)
> +{
> +    ram_addr_t ram_addr;
> +
> +    if (do_qemu_ram_addr_from_host(ptr, &ram_addr)) {
>          fprintf(stderr, "Bad ram pointer %p\n", ptr);
>          abort();
>      }
> -    return block->offset + (host - block->host);
> +    return ram_addr;
>  }
>  
>  static uint32_t unassigned_mem_readb(void *opaque, target_phys_addr_t addr)
> --- a/qemu-kvm.h
> +++ b/qemu-kvm.h
> @@ -625,9 +625,11 @@ int kvm_inject_nmi(kvm_vcpu_context_t vc
>   * \param mcg_status MSR_MCG_STATUS
>   * \param addr MSR_MCI_ADDR
>   * \param misc MSR_MCI_MISC
> + * \param abort_on_error abort on error
>   */
>  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> -                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
> +                        int abort_on_error);
>  
>  /*!
>   * \brief Query wheather in kernel pit is used
> @@ -943,8 +945,11 @@ static inline int kvm_init(int smp_cpus)
>  
>  static inline void kvm_inject_x86_mce(CPUState *cenv, int bank,
>                                        uint64_t status, uint64_t mcg_status,
> -                                      uint64_t addr, uint64_t misc)
> +                                      uint64_t addr, uint64_t misc,
> +                                      int abort_on_error)
>  {
> +    if (abort_on_error)
> +        abort();
>  }
>  
> 
> --- a/target-i386/helper.c
> +++ b/target-i386/helper.c
> @@ -1547,7 +1547,7 @@ void cpu_inject_x86_mce(CPUState *cenv, 
>      uint64_t *banks = cenv->mce_banks;
>  
>      if (kvm_enabled()) {
> -        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
> +        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
>          return;
>      }
>  
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html