Re: [PATCH -v2] QEMU-KVM: MCE: Relay UCR MCE to guest

Marcelo Tosatti <mtosatti@xxxxxxxxxx> · Wed, 16 Sep 2009 14:59:32 -0300

On Wed, Sep 09, 2009 at 10:28:02AM +0800, Huang Ying wrote:
> UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
> where some hardware error such as some memory error can be reported
> without PCC (processor context corrupted). To recover from such MCE,
> the corresponding memory will be unmapped, and all processes accessing
> the memory will be killed via SIGBUS.
> 
> For KVM, if QEMU/KVM is killed, all guest processes will be killed
> too. So we relay SIGBUS from host OS to guest system via a UCR MCE
> injection. Then guest OS can isolate corresponding memory and kill
> necessary guest processes only. SIGBUS sent to main thread (not VCPU
> threads) will be broadcast to all VCPU threads as UCR MCE.
> 
> v2:
> 
> - Use qemu_ram_addr_from_host instead of a self-made one to convert from
>   host address to guest RAM address. Thanks Anthony Liguori.
> 
> Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
> 
> ---
>  cpu-common.h      |    1 
>  exec.c            |   20 +++++--
>  qemu-kvm.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  target-i386/cpu.h |   20 ++++++-
>  4 files changed, 178 insertions(+), 17 deletions(-)
> 
> --- a/qemu-kvm.c
> +++ b/qemu-kvm.c
> @@ -27,10 +27,23 @@
>  #include <sys/mman.h>
>  #include <sys/ioctl.h>
>  #include <signal.h>
> +#include <sys/signalfd.h>
> +#include <sys/prctl.h>
>  
>  #define false 0
>  #define true 1
>  
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
>  #define EXPECTED_KVM_API_VERSION 12
>  
>  #if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
> @@ -1507,6 +1520,37 @@ static void sig_ipi_handler(int n)
>  {
>  }
>  
> +static void sigbus_handler(int n, struct signalfd_siginfo *siginfo, void *ctx)
> +{
> +    if (siginfo->ssi_code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        unsigned long paddr;
> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->ssi_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!: %llx\n",
> +                    (unsigned long long)siginfo->ssi_addr);
> +            return;

qemu-kvm should die here?

> +        }
> +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +            | 0xc0;
> +        kvm_inject_x86_mce(first_cpu, 9, status,
> +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> +                           (MCM_ADDR_PHYS << 6) | 0xc);
> +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu)
> +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0);
> +        return;

Should abort if kvm_inject_x86_mce fails?

> +    } else if (siginfo->ssi_code == BUS_MCEERR_AR)
> +        fprintf(stderr, "Hardware memory error!\n");
> +    else
> +        fprintf(stderr, "Internal error in QEMU!\n");

Can you re-raise SIGBUS so we get a coredump on non-MCE SIGBUS as
usual?

> +    exit(1);
> +}
> +
>  static void on_vcpu(CPUState *env, void (*func)(void *data), void *data)
>  {
>      struct qemu_work_item wi;
> @@ -1649,29 +1693,102 @@ static void flush_queued_work(CPUState *
>      pthread_cond_broadcast(&qemu_work_cond);
>  }
>  
> +static void kvm_on_sigbus(CPUState *env, siginfo_t *siginfo)
> +{
> +#if defined(KVM_CAP_MCE) && defined(TARGET_I386)
> +    struct kvm_x86_mce mce = {
> +            .bank = 9,
> +    };
> +    unsigned long paddr;
> +    int r;
> +
> +    if (env->mcg_cap && siginfo->si_addr
> +        && (siginfo->si_code == BUS_MCEERR_AR
> +            || siginfo->si_code == BUS_MCEERR_AO)) {
> +        if (siginfo->si_code == BUS_MCEERR_AR) {
> +            /* Fake an Intel architectural Data Load SRAR UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | MCI_STATUS_AR | 0x134;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> +        } else {
> +            /* Fake an Intel architectural Memory scrubbing UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | 0xc0;
> +            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> +        }
> +        if (do_qemu_ram_addr_from_host((void *)siginfo->si_addr, &paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instaed of guest system!\n");
> +            /* Hope we are lucky for AO MCE */
> +            if (siginfo->si_code == BUS_MCEERR_AO)
> +                return;

Should die?

> +            else
> +                exit(1);
> +        }
> +        mce.addr = paddr;
> +        r = kvm_set_mce(env->kvm_cpu_state.vcpu_ctx, &mce);
> +        if (r < 0) {
> +            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> +            exit(1);
> +        }
> +    } else
> +#endif
> +    {
> +        if (siginfo->si_code == BUS_MCEERR_AO)
> +            return;
> +        if (siginfo->si_code == BUS_MCEERR_AR)
> +            fprintf(stderr, "Hardware memory error!\n");
> +        else
> +            fprintf(stderr, "Internal error in QEMU!\n");
> +        exit(1);
> +    }
> +}

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html