Re: [PATCH v4 6/8] KVM: x86: Add emulation for MSR_IA32_MCx_CTL2 MSRs.

David Matlack <dmatlack@xxxxxxxxxx> · Fri, 3 Jun 2022 13:41:26 -0700

On Fri, May 20, 2022 at 10:36 AM Jue Wang <juew@xxxxxxxxxx> wrote:
>
> Corrected Machine Check Interrupt (CMCI) can be configured via the per
> Machine Check bank registers: IA32_MCi_CTL2. This patch adds the
> emulation of IA32_MCi_CTL2 registers to KVM. A separate mci_ctl2_banks
> array is used to keep the existing mce_banks register layout intact.
>
> In Machine Check Architecture (MCA), MCG_CMCI_P (bit 10 of MCG_CAP) is
> the corrected MC error counting/signaling extension present flag. When
> this bit is set, it does not imply CMCI reported corrected error or UCNA
> error is supported across all MCA banks. Software should check on a bank
> by bank basis (i.e. if bit 30 in each IA32_MCi_CTL2 register is set).
>
> Signed-off-by: Jue Wang <juew@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/kvm/x86.c              | 130 ++++++++++++++++++++++----------
>  2 files changed, 92 insertions(+), 39 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 4ff36610af6a..178b7e01bf8f 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -806,6 +806,7 @@ struct kvm_vcpu_arch {
>         u64 mcg_ctl;
>         u64 mcg_ext_ctl;
>         u64 *mce_banks;
> +       u64 *mci_ctl2_banks;
>
>         /* Cache MMIO info */
>         u64 mmio_gva;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 0e839077ce52..f8ab592f519b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3174,6 +3174,16 @@ static void kvmclock_sync_fn(struct work_struct *work)
>                                         KVMCLOCK_SYNC_PERIOD);
>  }
>
> +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
> +static bool is_mci_control_msr(u32 msr)
> +{
> +       return (msr & 3) == 0;
> +}
> +static bool is_mci_status_msr(u32 msr)
> +{
> +       return (msr & 3) == 1;
> +}
> +
>  /*
>   * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
>   */
> @@ -3192,6 +3202,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         unsigned bank_num = mcg_cap & 0xff;
>         u32 msr = msr_info->index;
>         u64 data = msr_info->data;
> +       u32 offset, last_msr;
>
>         switch (msr) {
>         case MSR_IA32_MCG_STATUS:
> @@ -3205,32 +3216,50 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         return 1;
>                 vcpu->arch.mcg_ctl = data;
>                 break;
> -       default:
> -               if (msr >= MSR_IA32_MC0_CTL &&
> -                   msr < MSR_IA32_MCx_CTL(bank_num)) {
> -                       u32 offset = array_index_nospec(
> -                               msr - MSR_IA32_MC0_CTL,
> -                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> -
> -                       /* only 0 or all 1s can be written to IA32_MCi_CTL
> -                        * some Linux kernels though clear bit 10 in bank 4 to
> -                        * workaround a BIOS/GART TBL issue on AMD K8s, ignore
> -                        * this to avoid an uncatched #GP in the guest
> -                        */
> -                       if ((offset & 0x3) == 0 &&
> -                           data != 0 && (data | (1 << 10)) != ~(u64)0)
> -                               return -1;
> -
> -                       /* MCi_STATUS */
> -                       if (!msr_info->host_initiated &&
> -                           (offset & 0x3) == 1 && data != 0) {
> -                               if (!can_set_mci_status(vcpu))
> -                                       return -1;
> -                       }
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
> +               if (msr > last_msr)
> +                       return 1;
>
> -                       vcpu->arch.mce_banks[offset] = data;
> -                       break;
> -               }
> +               if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
> +                       return 1;
> +               /* An attempt to write a 1 to a reserved bit raises #GP */
> +               if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
> +                       return 1;
> +               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
> +                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
> +               vcpu->arch.mci_ctl2_banks[offset] = data;

There's a lot of emulation in this commit that would be great to have
test coverage for. e.g. Testing that writing to MSR_IA32_MC0_CTL2 when
mcg_cap.MCG_CMCI_P=0 results in a #GP, writing to reserved bits, etc.

> +               break;
> +       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
> +               if (msr > last_msr)
> +                       return 1;
> +
> +               /*
> +                * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
> +                * values are architecturally undefined.  But, some Linux
> +                * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
> +                * issue on AMD K8s, allow bit 10 to be clear when setting all
> +                * other bits in order to avoid an uncaught #GP in the guest.
> +                */
> +               if (is_mci_control_msr(msr) &&
> +                   data != 0 && (data | (1 << 10)) != ~(u64)0)
> +                       return 1;
> +
> +               /*
> +                * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
> +                * AMD-based CPUs allow non-zero values, but if and only if
> +                * HWCR[McStatusWrEn] is set.
> +                */
> +               if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
> +                   data != 0 && !can_set_mci_status(vcpu))
> +                       return 1;
> +
> +               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
> +                                           last_msr + 1 - MSR_IA32_MC0_CTL);
> +               vcpu->arch.mce_banks[offset] = data;
> +               break;
> +       default:
>                 return 1;
>         }
>         return 0;
> @@ -3514,7 +3543,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         return 1;
>                 }
>                 break;
> -       case 0x200 ... 0x2ff:
> +       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>                 return kvm_mtrr_set_msr(vcpu, msr, data);
>         case MSR_IA32_APICBASE:
>                 return kvm_set_apic_base(vcpu, msr_info);
> @@ -3671,6 +3701,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         case MSR_IA32_MCG_CTL:
>         case MSR_IA32_MCG_STATUS:
>         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>                 return set_msr_mce(vcpu, msr_info);
>
>         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3775,6 +3806,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>         u64 data;
>         u64 mcg_cap = vcpu->arch.mcg_cap;
>         unsigned bank_num = mcg_cap & 0xff;
> +       u32 offset, last_msr;
>
>         switch (msr) {
>         case MSR_IA32_P5_MC_ADDR:
> @@ -3792,16 +3824,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>         case MSR_IA32_MCG_STATUS:
>                 data = vcpu->arch.mcg_status;
>                 break;
> -       default:
> -               if (msr >= MSR_IA32_MC0_CTL &&
> -                   msr < MSR_IA32_MCx_CTL(bank_num)) {
> -                       u32 offset = array_index_nospec(
> -                               msr - MSR_IA32_MC0_CTL,
> -                               MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +               last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
> +               if (msr > last_msr)
> +                       return 1;
>
> -                       data = vcpu->arch.mce_banks[offset];
> -                       break;
> -               }
> +               if (!(mcg_cap & MCG_CMCI_P) && !host)
> +                       return 1;
> +               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
> +                                           last_msr + 1 - MSR_IA32_MC0_CTL2);
> +               data = vcpu->arch.mci_ctl2_banks[offset];
> +               break;
> +       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +               last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
> +               if (msr > last_msr)
> +                       return 1;
> +
> +               offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
> +                                           last_msr + 1 - MSR_IA32_MC0_CTL);
> +               data = vcpu->arch.mce_banks[offset];
> +               break;
> +       default:
>                 return 1;
>         }
>         *pdata = data;
> @@ -3898,7 +3941,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                 break;
>         }
>         case MSR_MTRRcap:
> -       case 0x200 ... 0x2ff:
> +       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
>         case 0xcd: /* fsb frequency */
>                 msr_info->data = 3;
> @@ -4014,6 +4058,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         case MSR_IA32_MCG_CTL:
>         case MSR_IA32_MCG_STATUS:
>         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
>                                    msr_info->host_initiated);
>         case MSR_IA32_XSS:
> @@ -4769,9 +4814,12 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>         /* Init IA32_MCG_CTL to all 1s */
>         if (mcg_cap & MCG_CTL_P)
>                 vcpu->arch.mcg_ctl = ~(u64)0;
> -       /* Init IA32_MCi_CTL to all 1s */
> -       for (bank = 0; bank < bank_num; bank++)
> +       /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
> +       for (bank = 0; bank < bank_num; bank++) {
>                 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> +               if (mcg_cap & MCG_CMCI_P)
> +                       vcpu->arch.mci_ctl2_banks[bank] = 0;
> +       }
>
>         static_call(kvm_x86_setup_mce)(vcpu);
>  out:
> @@ -11226,7 +11274,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>
>         vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
>                                        GFP_KERNEL_ACCOUNT);
> -       if (!vcpu->arch.mce_banks)
> +       vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
> +                                           GFP_KERNEL_ACCOUNT);
> +       if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
>                 goto fail_free_pio_data;
>         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
>
> @@ -11279,6 +11329,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>         free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
>  fail_free_mce_banks:
>         kfree(vcpu->arch.mce_banks);
> +       kfree(vcpu->arch.mci_ctl2_banks);
>  fail_free_pio_data:
>         free_page((unsigned long)vcpu->arch.pio_data);
>  fail_free_lapic:
> @@ -11323,6 +11374,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
>         kvm_hv_vcpu_uninit(vcpu);
>         kvm_pmu_destroy(vcpu);
>         kfree(vcpu->arch.mce_banks);
> +       kfree(vcpu->arch.mci_ctl2_banks);
>         kvm_free_lapic(vcpu);
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
>         kvm_mmu_destroy(vcpu);
> --
> 2.36.1.124.g0e6072fb45-goog
>