Re: [PATCH] KVM: x86: Add support for CMCI and UCNA.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Mar 23, 2022, Jue Wang wrote:
> CMCI 

Please write Corrected Machine Check Interrupt at least once.

> is supported since Nehalem.

While possibly interesting, this info is defitely not the most interesting tidbit
in the changelong, i.e. shouldn't be the opening line.

>  UCNA (uncorrectable no action required) errors signaled via CMCI allows a
>  guest to be notified as soon as uncorrectable memory errors get detected by
>  some background threads, e.g., threads that migrate guest memory across
>  hosts.
> 
> Upon receiving UCNAs, guest kernel isolates the poisoned pages from
> future accesses much earlier than a potential fatal Machine Check
> Exception due to accesses from a guest thread.
> 
> Add CMCI signaling based on the per vCPU opt-in of MCG_CMCI_P.

This changelog needs much longer explanation of what exactly is being added, e.g. I
had to read the code to find out that this introduces new userspace functionality
to allow injecting UNCA #MCs and thus CMCI IRQs.

That's also a symptom of this patch needing to be split into a proper series, e.g.
exposing the UNCA injection point to userspace needs to be a separate patch.

Looking through this, 5 or 6 patches is probably appropriate:

  1. Replace existing magic numbers with #defines
  2. Clean up the existing LVT mess
  3. Add in-kernel LVTCMCI support (unreachable until #, but easier to review)
  4. Add support for MSR_IA32_MCx_CTL2 MSRs
  5. Add CMCI support
  6. Add UNCA injection support

I can't tell if #4 is necessary as a separate patch, it might belong with #3.

> Signed-off-by: Jue Wang <juew@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  11 +++
>  arch/x86/kvm/lapic.c            |  65 ++++++++++++++----
>  arch/x86/kvm/lapic.h            |   2 +-
>  arch/x86/kvm/vmx/vmx.c          |   1 +
>  arch/x86/kvm/x86.c              | 115 +++++++++++++++++++++++++++++---
>  5 files changed, 171 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index ec9830d2aabf..d57f3d1284a3 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -613,6 +613,8 @@ struct kvm_vcpu_xen {
>  	unsigned long evtchn_pending_sel;
>  };
>  
> +#define KVM_MCA_REG_PER_BANK 5
> +
>  struct kvm_vcpu_arch {
>  	/*
>  	 * rip and regs accesses must go through
> @@ -799,6 +801,15 @@ struct kvm_vcpu_arch {
>  	u64 mcg_status;
>  	u64 mcg_ctl;
>  	u64 mcg_ext_ctl;
> +	/*
> +	 * 5 registers per bank for up to KVM_MAX_MCE_BANKS.
> +	 * Register order within each bank:
> +	 * mce_banks[5 * bank]   - IA32_MCi_CTL
> +	 * mce_banks[5 * bank + 1] - IA32_MCi_STATUS
> +	 * mce_banks[5 * bank + 2] - IA32_MCi_ADDR
> +	 * mce_banks[5 * bank + 3] - IA32_MCi_MISC
> +	 * mce_banks[5 * bank + 4] - IA32_MCi_CTL2
> +	 */
>  	u64 *mce_banks;

Why shove CTL2 into mce_banks?  AFAICT, it just makes everything harder.  Adding
a new "u64 *mce_ctl2_banks" or whatever would simplify everything except the
memory allocation.

>  	/* Cache MMIO info */
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9322e6340a74..b388eb82308a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -27,6 +27,7 @@
>  #include <linux/math64.h>
>  #include <linux/slab.h>
>  #include <asm/processor.h>
> +#include <asm/mce.h>
>  #include <asm/msr.h>
>  #include <asm/page.h>
>  #include <asm/current.h>
> @@ -53,8 +54,6 @@
>  #define PRIu64 "u"
>  #define PRIo64 "o"
>  
> -/* 14 is the version for Xeon and Pentium 8.4.8*/
> -#define APIC_VERSION			(0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))

Eh, probably worth keeping the APIC_VERSION #define and just move out the LVT crud.

>  #define LAPIC_MMIO_LENGTH		(1 << 12)
>  /* followed define is not in apicdef.h */
>  #define MAX_APIC_VECTOR			256
> @@ -367,7 +366,10 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val)
>  void kvm_apic_set_version(struct kvm_vcpu *vcpu)
>  {
>  	struct kvm_lapic *apic = vcpu->arch.apic;
> -	u32 v = APIC_VERSION;
> +	int lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> +			KVM_APIC_LVT_NUM - 1;

Retrieving the number of LVT entries belongs in a helper, and this is rather gross
absuse of KVM_APIC_LVT_NUM as there's zero indication that it's pseudo-dynamic.
The code that handles accesses to APIC_LVTCMCI is even worse:

	val &= apic_lvt_mask[KVM_APIC_LVT_NUM - 1];

The easiest and best way to handle this is to define an enum, especially since
the LVT entries aren't exactly intuitive (e.g. LVT_LINT0 isn't entry 0)

enum lapic_lvt_entry {
	LVT_TIMER,
	LVT_THERMAL_MONITOR,
	LVT_PERFORMANCE_COUNTER,
	LVT_LINT0,
	LVT_LINT1,
	LVT_ERROR,
	LVT_CMCI,

	KVM_APIC_MAX_NR_LVT_ENTRIES,
}

And use those for the initialization of apic_lvt_mask[] and drop the comments:

static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
	[LVT_TIMER] = LVT_MASK ,      /* timer mode mask added at runtime */
	[LVT_TERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK,

	and so on and so forth
};

Then there's no need for the ugly KVM_APIC_LVT_NUM - 1 shenanigans to access the
CMCI entry, and the only place that needs to be aware at all is the helper to
query the number of LVT entries.  Heh, and if we wanted to be clever/supid...

static inline kvm_apic_get_nr_lvt_entries(struct kvm_vcpu *vcpu)
{
	return KVM_APIC_MAX_NR_LVT_ENTRIES - !kvm_is_cmci_supported(vcpu);
}

> +	/* 14 is the version for Xeon and Pentium 8.4.8*/
> +	u32 v = 0x14UL | ((lvt_num - 1) << 16);
>  
>  	if (!lapic_in_kernel(vcpu))
>  		return;
> @@ -390,7 +392,8 @@ static const unsigned int apic_lvt_mask[KVM_APIC_LVT_NUM] = {
>  	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
>  	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
>  	LINT_MASK, LINT_MASK,	/* LVT0-1 */
> -	LVT_MASK		/* LVTERR */
> +	LVT_MASK,		/* LVTERR */
> +	LVT_MASK | APIC_MODE_MASK	/* LVTCMCI */
>  };
>  
>  static int find_highest_vector(void *bitmap)
> @@ -1405,6 +1408,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
>  		APIC_REG_MASK(APIC_TMCCT) |
>  		APIC_REG_MASK(APIC_TDCR);
>  
> +	if (apic->vcpu->arch.mcg_cap & MCG_CMCI_P)

As alluded to above, this belongs in a helper too.

> +		valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
> +
>  	/* ARBPRI is not valid on x2APIC */
>  	if (!apic_x2apic_mode(apic))
>  		valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
> @@ -1993,6 +1999,18 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
>  	}
>  }
>  
> +static int kvm_lvt_reg_by_index(int i)
> +{
> +	if (i < 0 || i >= KVM_APIC_LVT_NUM) {
> +		pr_warn("lvt register index out of bound: %i\n", i);

This sanity check is unnecessary, @i is fully KVM controlled.  And a pr_warn() in
that case is nowhere near strong enough, e.g. at minimum this should be WARN_ON,
though again, I don't think that's necessary.

Actually, if we really wanted to sanity check @i, we could make this __always_inline
and turn it into a BUILD_BUG_ON(), though I bet there's a config option that will
result in the compiler not unrolling the callers and ruining that idea.

> +		return 0;
> +	}
> +
> +	if (i < KVM_APIC_LVT_NUM - 1)

Far better is

	if (i == LVT_CMCI)
		return APIC_LVTCMCI;

	return return APIC_LVTT + 0x10 * i;

Though given the nature of the usage, it might actually be better to bury this in
a macro (or a helper function masquerading as a macro by having a wierd name).

#define APIC_LVTx(x)							\
({									\
	int __apic_reg;							\
									\
	if ((x) != LVT_CMCI)						\
		__apic_reg = APIC_LVTCMCI;				\
	else								\
		__apic_reg = APIC_LVTT + 0x10 * (x);			\
	__apic_reg;							\
}


Then the usage stays quite readable and doesn't need temp variables, e.g.

			for (i = 0; i < KVM_APIC_MAX_NR_LVT_ENTRIES; i++) {
				lvt_val = kvm_lapic_get_reg(apic, APIC_LVTx(i));
				kvm_lapic_set_reg(apic, APIC_LVTx(i),
						  lvt_val | APIC_LVT_MASKED);
			}

> +		return APIC_LVTT + 0x10 * i;
> +	return APIC_LVTCMCI;
> +}
> +

...

> @@ -2341,8 +2376,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
>  		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
>  	kvm_apic_set_version(apic->vcpu);
>  
> -	for (i = 0; i < KVM_APIC_LVT_NUM; i++)
> -		kvm_lapic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
> +	lvt_num = vcpu->arch.mcg_cap & MCG_CMCI_P ? KVM_APIC_LVT_NUM :
> +			KVM_APIC_LVT_NUM - 1;
> +	for (i = 0; i < lvt_num; i++) {
> +		int lvt_reg = kvm_lvt_reg_by_index(i);
> +
> +		if (lvt_reg)
> +			kvm_lapic_set_reg(apic, lvt_reg, APIC_LVT_MASKED);
> +	}
>  	apic_update_lvtt(apic);
>  	if (kvm_vcpu_is_reset_bsp(vcpu) &&
>  	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2b44e533fc8d..e2ae097613ca 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -10,7 +10,7 @@
>  
>  #define KVM_APIC_INIT		0
>  #define KVM_APIC_SIPI		1
> -#define KVM_APIC_LVT_NUM	6
> +#define KVM_APIC_LVT_NUM	7
>  
>  #define APIC_SHORT_MASK			0xc0000
>  #define APIC_DEST_NOSHORT		0x0
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index b730d799c26e..63aa2b3d30ca 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -8035,6 +8035,7 @@ static __init int hardware_setup(void)
>  	}
>  
>  	kvm_mce_cap_supported |= MCG_LMCE_P;
> +	kvm_mce_cap_supported |= MCG_CMCI_P;
>  
>  	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
>  		return -EINVAL;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index eb4029660bd9..6626723bf51b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3180,6 +3180,25 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  			return 1;
>  		vcpu->arch.mcg_ctl = data;
>  		break;
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +		{

Kernel style when curly braces is needed for a case statement is to put the opening
braces with the case and not indent.  Though my vote is to hoist "offset" to be
declared at the function level so that each statement doesn't need curly braces
just to define a fairly common varaible.

> +			u32 offset;
> +			/* BIT[30] - CMCI_ENABLE */
> +			/* BIT[0:14] - CMCI_THRESHOLD */
> +			u64 mask = (1 << 30) | 0x7fff;

Add proper #defines, not comments.

> +
> +			if (!(mcg_cap & MCG_CMCI_P) &&
> +			    (data || !msr_info->host_initiated))

This looks wrong, userspace should either be able to write the MSR or not, '0'
isn't special.  Unless there's a danger to KVM, which I don't think there is,
userspace should be allowed to ignore architectural restrictions, i.e. bypass
the MCG_CMCI_P check, so that KVM doesn't create an unnecessary dependency between
ioctls.  I.e. this should be:

		if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
			return 1;

> +				return 1;
> +			/* An attempt to write a 1 to a reserved bit raises #GP*/
> +			if (data & ~mask)
> +				return 1;
> +			offset = array_index_nospec(
> +				msr - MSR_IA32_MC0_CTL2,
> +				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);

The existing code is gross, don't copy it :-)  I'd rather this run over the 80 char
soft limit.

> +			vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4] = (data & mask);

The AND with the mask is pointless, @data has already been verified.

With all of the above, this becomes:

	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
		if (!(mcg_cap & MCG_CMCI_P) && !msr_info->host_initiated)
			return 1;

		if (data & ~(MCx_CTL2_CMCI_ENABLE | MCx_CTL2_CMCI_THRESHOLD))
			return 1;

		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
					    MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
		vcpu->arch.mce_ctl2_banks = [offset] = data;
		break;


> +		}
> +		break;
>  	default:
>  		if (msr >= MSR_IA32_MC0_CTL &&
>  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3203,7 +3222,14 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  					return -1;
>  			}
>  
> -			vcpu->arch.mce_banks[offset] = data;
> +			/* MSR_IA32_MCi_CTL addresses are incremented by 4 bytes
> +			 * per bank.
> +			 * kvm_vcpu_arch.mce_banks has 5 registers per bank, see
> +			 * register layout details in kvm_host.h.
> +			 * MSR_IA32_MCi_CTL is the first register in each bank
> +			 * within kvm_vcpu_arch.mce_banks.
> +			 */
> +			vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4] = data;

This mess goes away if CTL2 gets a separate array.

>  			break;
>  		}
>  		return 1;
> @@ -3489,7 +3515,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  			return 1;
>  		}
>  		break;
> -	case 0x200 ... 0x2ff:
> +	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
> +	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
>  		return kvm_mtrr_set_msr(vcpu, msr, data);
>  	case MSR_IA32_APICBASE:
>  		return kvm_set_apic_base(vcpu, msr_info);
> @@ -3646,6 +3673,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
>  	case MSR_IA32_MCG_CTL:
>  	case MSR_IA32_MCG_STATUS:
>  	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
>  		return set_msr_mce(vcpu, msr_info);
>  
>  	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
> @@ -3767,6 +3795,18 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>  	case MSR_IA32_MCG_STATUS:
>  		data = vcpu->arch.mcg_status;
>  		break;
> +	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
> +		{
> +			u32 offset;
> +
> +			if (!(mcg_cap & MCG_CMCI_P) && !host)
> +				return 1;
> +			offset = array_index_nospec(
> +				msr - MSR_IA32_MC0_CTL2,
> +				MSR_IA32_MCx_CTL2(bank_num) - MSR_IA32_MC0_CTL2);
> +			data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK + 4];

Same comments as the write path.

> +		}
> +		break;
>  	default:
>  		if (msr >= MSR_IA32_MC0_CTL &&
>  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
> @@ -3774,7 +3814,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
>  				msr - MSR_IA32_MC0_CTL,
>  				MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
>  
> -			data = vcpu->arch.mce_banks[offset];
> +			data = vcpu->arch.mce_banks[offset * KVM_MCA_REG_PER_BANK / 4];
>  			break;
>  		}
>  		return 1;

...
  
> +static bool is_ucna(u64 mcg_status, u64 mci_status)
> +{
> +	return !mcg_status &&
> +		!(mci_status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR));

Spaces around the '|'.

> +}
> +
> +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu,
> +				       struct kvm_x86_mce *mce)
> +{
> +	u64 mcg_cap = vcpu->arch.mcg_cap;
> +	unsigned int bank_num = mcg_cap & 0xff;
> +	u64 *banks = vcpu->arch.mce_banks;
> +
> +	/* Check for legal bank number in guest */
> +	if (mce->bank >= bank_num)
> +		return -EINVAL;
> +
> +	/* Disallow bits that are used for machine check signalling */

This needs a more verbose comment/explanation.  I can kinda sorta piece things
together, but the intent is unclear.

> +	if (mce->mcg_status ||
> +	    (mce->status & (MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR)))
> +		return -EINVAL;
> +
> +	 /* UCNA must have VAL and UC bits set */
> +	if ((mce->status & (MCI_STATUS_VAL|MCI_STATUS_UC)) !=
> +	    (MCI_STATUS_VAL|MCI_STATUS_UC))

Spaces again, though my personal preference would be:

	if (!(mce->status & MCI_STATUS_VAL) || !(mce->status & MCI_STATUC_UC))

> +		return -EINVAL;
> +
> +	banks += KVM_MCA_REG_PER_BANK * mce->bank;
> +	banks[1] = mce->status;
> +	banks[2] = mce->addr;
> +	banks[3] = mce->misc;
> +	vcpu->arch.mcg_status = mce->mcg_status;
> +
> +	/*
> +	 * if MCG_CMCI_P is 0 or BIT[30] of IA32_MCi_CTL2 is 0, CMCI signaling
> +	 * is disabled for the bank
> +	 */
> +	if (!(mcg_cap & MCG_CMCI_P) || !(banks[4] & (1 << 30)))

#defines, not comments for the BIT 32 thing.

> +		return 0;
> +
> +	if (lapic_in_kernel(vcpu))

Any reason to support UNCA injection without an in-kernel APIC?

> +		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
> +
> +	return 0;
> +}



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux