Actually sending to KVM/x86 maintainers.

Thanks,
-Jue

On Wed, Mar 23, 2022 at 11:28 AM Jue Wang <juew@xxxxxxxxxx> wrote:
>
> CMCI has been supported since Nehalem. UCNA (uncorrectable no action
> required) errors signaled via CMCI allow a guest to be notified as soon
> as uncorrectable memory errors are detected by background threads,
> e.g., threads that migrate guest memory across hosts.
>
> Upon receiving a UCNA, the guest kernel isolates the poisoned pages
> from future accesses much earlier than a potential fatal Machine Check
> Exception caused by an access from a guest thread.
>
> Add CMCI signaling based on the per-vCPU opt-in of MCG_CMCI_P.
>
> Signed-off-by: Jue Wang <juew@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  11 +++
>  arch/x86/kvm/lapic.c            |  65 ++++++++++++++----
>  arch/x86/kvm/lapic.h            |   2 +-
>  arch/x86/kvm/vmx/vmx.c          |   1 +
>  arch/x86/kvm/x86.c              | 115 +++++++++++++++++++++++++++++---
>  5 files changed, 171 insertions(+), 23 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ec9830d2aabf..d57f3d1284a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
>         unsigned long evtchn_pending_sel;
>  };
>
> +#define KVM_MCA_REG_PER_BANK 5
> +
>  struct kvm_vcpu_arch {
>         /*
>          * rip and regs accesses must go through
> @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
>         u64 mcg_status;
>         u64 mcg_ctl;
>         u64 mcg_ext_ctl;
> +       /*
> +        * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> +        * Register order within each bank:
> +        * mce_banks[5 * bank]     - IA32_MCi_CTL
> +        * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> +        * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> +        * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> +        * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> +        */
>         u64 *mce_banks;
>
>         /* Cache MMIO info */
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9322e6340a74..b388eb82308a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -27,6 +27,7 @@
>  #include <linux/math64.h>
>  #include <linux/slab.h>
>  #include <asm/processor.h>
> +#include <asm/mce.h>
>  #include <asm/msr.h>
>  #include <asm/page.h>
>  #include <asm/current.h>
> @@ -53,8 +54,6 @@
>  #define PRIu64 "u"
>  #define PRIo64 "o"
>
> -/* 14 is the version for Xeon and Pentium 8.4.8*/
> -#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
>  #define LAPIC_MMIO_LENGTH (1 << 12)
>  /* followed define is not in apicdef.h */
>  #define MAX_APIC_VECTOR 256
> @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu)
>  {
>         struct kvm_lapic *apic = vcpu->arch.apic;
> -       u32 v = APIC_VERSION;
> +       int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> +                                                       KVM_APIC_LVT_NUM - 1;
> +       /* 14 is the version for Xeon and Pentium 8.4.8 */
> +       u32 v = 0x14UL | ((lvt_num - 1) << 16);
>
>         if (!lapic_in_kernel(vcpu))
>                 return;
> @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
>         LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
>         LVT_MASK | APIC_MODE_MASK,      /* LVTPC */
>         LINT_MASK, LINT_MASK,           /* LVT0-1 */
> -       LVT_MASK                        /* LVTERR */
> +       LVT_MASK,                       /* LVTERR */
> +       LVT_MASK | APIC_MODE_MASK       /* LVTCMCI */
>  };
>
>  static int find_highest_vector(void *bitmap)
> @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
>                               APIC_REG_MASK(APIC_TMCCT) |
>                               APIC_REG_MASK(APIC_TDCR);
>
> +       if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)
> +               valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> +
>         /* ARBPRI is not valid on x2APIC */
>         if (!apic_x2apic_mode(apic))
>                 valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
>         }
>  }
>
> +static int kvm_lvt_reg_by_index(int i)
> +{
> +       if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> +               pr_warn("lvt register index out of bounds: %i\n", i);
> +               return 0;
> +       }
> +
> +       if (i < KVM_APIC_LVT_NUM - 1)
> +               return APIC_LVTT + 0x10 * i;
> +       return APIC_LVTCMCI;
> +}
> +
>  int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  {
>         int ret = 0;
> @@ -2038,12 +2056,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>                 if (!(val & APIC_SPIV_APIC_ENABLED)) {
>                         int i;
>                         u32 lvt_val;
> -
> -                       for (i = 0; i < KVM_APIC_LVT_NUM; i++) {
> -                               lvt_val = kvm_lapic_get_reg(apic,
> -                                               APIC_LVTT + 0x10 * i);
> -                               kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i,
> -                                            lvt_val | APIC_LVT_MASKED);
> +                       int lvt_reg;
> +                       int lvt_num = apic->vcpu->arch.mcg_cap & MCG_CMCI_P ?
> +                                       KVM_APIC_LVT_NUM : KVM_APIC_LVT_NUM - 1;
> +
> +                       for (i = 0; i < lvt_num; i++) {
> +                               lvt_reg = kvm_lvt_reg_by_index(i);
> +                               if (lvt_reg) {
> +                                       lvt_val = kvm_lapic_get_reg(apic, lvt_reg);
> +                                       kvm_lapic_set_reg(apic, lvt_reg,
> +                                               lvt_val | APIC_LVT_MASKED);
> +                               }
>                         }
>                         apic_update_lvtt(apic);
>                         atomic_set(&apic->lapic_timer.pending, 0);
> @@ -2093,6 +2116,17 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>                 apic_update_lvtt(apic);
>                 break;
>
> +       case APIC_LVTCMCI:
> +               if (!(apic->vcpu->arch.mcg_cap & MCG_CMCI_P)) {
> +                       ret = 1;
> +                       break;
> +               }
> +               if (!kvm_apic_sw_enabled(apic))
> +                       val |= APIC_LVT_MASKED;
> +               val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];
> +               kvm_lapic_set_reg(apic, APIC_LVTCMCI, val);
> +               break;
> +
>         case APIC_TMICT:
>                 if (apic_lvtt_tscdeadline(apic))
>                         break;
> @@ -2322,6 +2356,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>         struct kvm_lapic *apic = vcpu->arch.apic;
>         u64 msr_val;
>         int i;
> +       int lvt_num;
>
>         if (!init_event) {
>                 msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
> @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>         kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
>         kvm_apic_set_version(apic->vcpu);
>
> -       for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> -               kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> +       lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> +                                                   KVM_APIC_LVT_NUM - 1;
> +       for (i = 0; i < lvt_num; i++) {
> +               int lvt_reg = kvm_lvt_reg_by_index(i);
> +
> +               if (lvt_reg)
> +                       kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> +       }
>         apic_update_lvtt(apic);
>         if (kvm_vcpu_is_reset_bsp(vcpu) &&
>             kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2b44e533fc8d..e2ae097613ca 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -10,7 +10,7 @@
>
>  #define KVM_APIC_INIT           0
>  #define KVM_APIC_SIPI           1
> -#define KVM_APIC_LVT_NUM        6
> +#define KVM_APIC_LVT_NUM        7
>
>  #define APIC_SHORT_MASK         0xc0000
>  #define APIC_DEST_NOSHORT       0x0
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b730d799c26e..63aa2b3d30ca 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
>         }
>
>         kvm_mce_cap_supported |= MCG_LMCE_P;
> +       kvm_mce_cap_supported |= MCG_CMCI_P;
>
>         if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
>                 return -EINVAL;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eb4029660bd9..6626723bf51b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         return 1;
>                 vcpu->arch.mcg_ctl = data;
>                 break;
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +       {
> +               u32 offset;
> +               /* BIT[30]    - CMCI_ENABLE */
> +               /* BITS[14:0] - CMCI_THRESHOLD */
> +               u64 mask = (1 << 30) | 0x7fff;
> +
> +               if (!(mcg_cap & MCG_CMCI_P) &&
> +                   (data || !msr_info->host_initiated))
> +                       return 1;
> +               /* An attempt to write a 1 to a reserved bit raises #GP */
> +               if (data & ~mask)
> +                       return 1;
> +               offset = array_index_nospec(
> +                               msr - MSR_IA32_MC0_CTL2,
> +                               MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> +               vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);
> +       }
> +       break;
>         default:
>                 if (msr >= MSR_IA32_MC0_CTL &&
>                     msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                                 return -1;
>                         }
>
> -                       vcpu->arch.mce_banks[offset] = data;
> +                       /* MSR_IA32_MCi_CTL MSR numbers are incremented by 4
> +                        * per bank.
> +                        * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> +                        * register layout details in kvm_host.h.
> +                        * MSR_IA32_MCi_CTL is the first register in each bank
> +                        * within kvm_vcpu_arch.mce_banks.
> +                        */
> +                       vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;
>                         break;
>                 }
>                 return 1;
> @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                         return 1;
>                 }
>                 break;
> -       case 0x200 ... 0x2ff:
> +       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>                 return kvm_mtrr_set_msr(vcpu, msr, data);
>         case MSR_IA32_APICBASE:
>                 return kvm_set_apic_base(vcpu, msr_info);
> @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         case MSR_IA32_MCG_CTL:
>         case MSR_IA32_MCG_STATUS:
>         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>                 return set_msr_mce(vcpu, msr_info);
>
>         case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>         case MSR_IA32_MCG_STATUS:
>                 data = vcpu->arch.mcg_status;
>                 break;
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +       {
> +               u32 offset;
> +
> +               if (!(mcg_cap & MCG_CMCI_P) && !host)
> +                       return 1;
> +               offset = array_index_nospec(
> +                               msr - MSR_IA32_MC0_CTL2,
> +                               MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> +               data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];
> +       }
> +       break;
>         default:
>                 if (msr >= MSR_IA32_MC0_CTL &&
>                     msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>                                 msr - MSR_IA32_MC0_CTL,
>                                 MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>
> -                       data = vcpu->arch.mce_banks[offset];
> +                       data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
>                         break;
>                 }
>                 return 1;
> @@ -3873,7 +3913,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>                 break;
>         }
>         case MSR_MTRRcap:
> -       case 0x200 ... 0x2ff:
> +       case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +       case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>                 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
>         case 0xcd: /* fsb frequency */
>                 msr_info->data = 3;
> @@ -3989,6 +4030,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>         case MSR_IA32_MCG_CTL:
>         case MSR_IA32_MCG_STATUS:
>         case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>                 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
>                                    msr_info->host_initiated);
>         case MSR_IA32_XSS:
> @@ -4740,15 +4782,64 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
>         /* Init IA32_MCG_CTL to all 1s */
>         if (mcg_cap & MCG_CTL_P)
>                 vcpu->arch.mcg_ctl = ~(u64)0;
> -       /* Init IA32_MCi_CTL to all 1s */
> -       for (bank = 0; bank < bank_num; bank++)
> -               vcpu->arch.mce_banks[bank*4] = ~(u64)0;
> +       /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
> +       for (bank = 0; bank < bank_num; bank++) {
> +               vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK] = ~(u64)0;
> +               if (mcg_cap & MCG_CMCI_P)
> +                       vcpu->arch.mce_banks[bank * KVM_MCA_REG_PER_BANK + 4] = 0;
> +       }
>
>         static_call(kvm_x86_setup_mce)(vcpu);
> out:
>         return r;
>  }
>
> +static bool is_ucna(u64 mcg_status, u64 mci_status)
> +{
> +       return !mcg_status &&
> +              !(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));
> +}
> +
> +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> +                                struct kvm_x86_mce *mce)
> +{
> +       u64 mcg_cap = vcpu->arch.mcg_cap;
> +       unsigned int bank_num = mcg_cap & 0xff;
> +       u64 *banks = vcpu->arch.mce_banks;
> +
> +       /* Check for a legal bank number in the guest */
> +       if (mce->bank >= bank_num)
> +               return -EINVAL;
> +
> +       /* Disallow bits that are used for machine check signalling */
> +       if (mce->mcg_status ||
> +           (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> +               return -EINVAL;
> +
> +       /* A UCNA must have the VAL and UC bits set */
> +       if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> +           (MCI_STATUS_VAL|MCI_STATUS_UC))
> +               return -EINVAL;
> +
> +       banks += KVM_MCA_REG_PER_BANK * mce->bank;
> +       banks[1] = mce->status;
> +       banks[2] = mce->addr;
> +       banks[3] = mce->misc;
> +       vcpu->arch.mcg_status = mce->mcg_status;
> +
> +       /*
> +        * If MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI
> +        * signaling is disabled for the bank.
> +        */
> +       if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))
> +               return 0;
> +
> +       if (lapic_in_kernel(vcpu))
> +               kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> +
> +       return 0;
> +}
> +
>  static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>                                       struct kvm_x86_mce *mce)
>  {
> @@ -4758,14 +4849,18 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
>
>         if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
>                 return -EINVAL;
> -       /*
> +
> +       if (is_ucna(mce->mcg_status, mce->status))
> +               return kvm_vcpu_x86_set_ucna(vcpu, mce);
> +
> +       /*
>          * if IA32_MCG_CTL is not all 1s, the uncorrected error
>          * reporting is disabled
>          */
>         if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
>             vcpu->arch.mcg_ctl != ~(u64)0)
>                 return 0;
> -       banks += 4 * mce->bank;
> +       banks += KVM_MCA_REG_PER_BANK * mce->bank;
>         /*
>          * if IA32_MCi_CTL is not all 1s, the uncorrected error
>          * reporting is disabled for the bank
>          */
> @@ -11126,7 +11221,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>                 goto fail_free_lapic;
>         vcpu->arch.pio_data = page_address(page);
>
> -       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
> +       vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * KVM_MCA_REG_PER_BANK,
>                                        GFP_KERNEL_ACCOUNT);
>         if (!vcpu->arch.mce_banks)
>                 goto fail_free_pio_data;
> --
> 2.35.1.1021.g381101b075-goog
>
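
A note on the mce_banks indexing in set_msr_mce()/get_msr_mce() above: the
legacy IA32_MCi_{CTL,STATUS,ADDR,MISC} MSR numbers advance by 4 per bank,
while mce_banks now stores KVM_MCA_REG_PER_BANK (5) u64 values per bank, so
the patch scales the MSR offset with offset * KVM_MCA_REG_PER_BANK / 4. A
small standalone sketch (not kernel code; mca_bank_index() is a made-up
name) showing that the truncating division always lands on the intended
slot and can never reach the new CTL2 slot from the legacy MSR range:

#include <assert.h>

#define KVM_MCA_REG_PER_BANK 5

/* msr_offset = msr - MSR_IA32_MC0_CTL = 4 * bank + reg, with reg in 0..3 */
static unsigned int mca_bank_index(unsigned int msr_offset)
{
        return msr_offset * KVM_MCA_REG_PER_BANK / 4;
}

int main(void)
{
        /* Bank 2, IA32_MC2_STATUS: offset 4*2 + 1 = 9 -> 45/4 = 11 = 5*2 + 1 */
        assert(mca_bank_index(4 * 2 + 1) == KVM_MCA_REG_PER_BANK * 2 + 1);
        /* (20*bank + 5*reg) / 4 == 5*bank + reg for reg in 0..3, so slot 4
         * (IA32_MCi_CTL2) is only ever touched via the new CTL2 MSR cases. */
        assert(mca_bank_index(4 * 7 + 3) == KVM_MCA_REG_PER_BANK * 7 + 3);
        return 0;
}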
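
For completeness, how this path is meant to be driven from userspace: a VMM
that learns of an uncorrectable error in guest memory (e.g., from a
live-migration thread) injects it as a UCNA through the existing
KVM_X86_SET_MCE ioctl. A minimal, untested sketch, assuming struct
kvm_x86_mce from the uapi <linux/kvm.h>; the MCI_STATUS_* values follow the
SDM encoding, and inject_ucna()/vcpu_fd/bank/paddr are illustrative names:

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* MCi_STATUS bits per the SDM; VAL+UC with S/AR/PCC clear makes a UCNA. */
#define MCI_STATUS_VAL   (1ULL << 63)
#define MCI_STATUS_UC    (1ULL << 61)
#define MCI_STATUS_MISCV (1ULL << 59)
#define MCI_STATUS_ADDRV (1ULL << 58)

static int inject_ucna(int vcpu_fd, uint8_t bank, uint64_t paddr)
{
        struct kvm_x86_mce mce = {
                .status     = MCI_STATUS_VAL | MCI_STATUS_UC |
                              MCI_STATUS_MISCV | MCI_STATUS_ADDRV,
                .addr       = paddr,
                .misc       = 0,
                .mcg_status = 0,        /* is_ucna() requires mcg_status == 0 */
                .bank       = bank,
        };

        return ioctl(vcpu_fd, KVM_X86_SET_MCE, &mce);
}

The guest only observes a CMCI if it has set BIT[30] (CMCI_ENABLE) of the
bank's IA32_MCi_CTL2 and unmasked LVTCMCI; otherwise the UCNA is merely
latched in the bank's STATUS/ADDR/MISC registers, matching the checks at
the end of kvm_vcpu_x86_set_ucna().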