Eugene Korenevsky <ekorenevsky@xxxxxxxxx> writes:

> Several hypervisors use MSR loading/storing to run guests.
> This patch implements emulation of this feature and allows these
> hypervisors to work in L1.
>
> The following is emulated:
> - Loading MSRs on VM-entries
> - Saving MSRs on VM-exits
> - Loading MSRs on VM-exits
>
> Actions taken on loading MSRs:
> - MSR load area is verified
> - For each MSR entry from load area:
>   - MSR load entry is read and verified
>   - MSR value is safely written
> Actions taken on storing MSRs:
> - MSR store area is verified
> - For each MSR entry from store area:
>   - MSR entry is read and verified
>   - MSR value is safely read using MSR index from MSR entry
>   - MSR value is written to MSR entry
>
> The code performs checks required by Intel Software Developer Manual.
>

Thank you for the commit message.

> This patch is partially based on Wincy Wan's work.

Typo in the name above, I believe it should be "Wincy Van".

I have added Jan and Wincy to the CC list since they reviewed your earlier
proposal. I think it would be better to split this up as I mentioned earlier;
however, if the other reviewers and the maintainer don't have objections,
I am ok :)

>
> Signed-off-by: Eugene Korenevsky <ekorenevsky@xxxxxxxxx>
> ---
>  arch/x86/include/asm/vmx.h            |   6 +
>  arch/x86/include/uapi/asm/msr-index.h |   3 +
>  arch/x86/include/uapi/asm/vmx.h       |   2 +
>  arch/x86/kvm/vmx.c                    | 210 ++++++++++++++++++++++++++++++++--
>  virt/kvm/kvm_main.c                   |   1 +
>  5 files changed, 215 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 45afaee..8bdb247 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -457,6 +457,12 @@ struct vmx_msr_entry {
>  #define ENTRY_FAIL_VMCS_LINK_PTR        4
>
>  /*
> + * VMX Abort codes
> + */
> +#define VMX_ABORT_MSR_STORE_FAILURE     1
> +#define VMX_ABORT_MSR_LOAD_FAILURE      4
> +
> +/*
>   * VM-instruction error numbers
>   */
>  enum vm_instruction_error_number {
> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> index e21331c..3c9c601 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -316,6 +316,9 @@
>  #define MSR_IA32_UCODE_WRITE            0x00000079
>  #define MSR_IA32_UCODE_REV              0x0000008b
>
> +#define MSR_IA32_SMM_MONITOR_CTL        0x0000009b
> +#define MSR_IA32_SMBASE                 0x0000009e
> +
>  #define MSR_IA32_PERF_STATUS            0x00000198
>  #define MSR_IA32_PERF_CTL               0x00000199
>  #define MSR_AMD_PSTATE_DEF_BASE         0xc0010064
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index b813bf9..52ad8e2 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -56,6 +56,7 @@
>  #define EXIT_REASON_MSR_READ            31
>  #define EXIT_REASON_MSR_WRITE           32
>  #define EXIT_REASON_INVALID_STATE       33
> +#define EXIT_REASON_MSR_LOAD_FAILURE    34
>  #define EXIT_REASON_MWAIT_INSTRUCTION   36
>  #define EXIT_REASON_MONITOR_INSTRUCTION 39
>  #define EXIT_REASON_PAUSE_INSTRUCTION   40
> @@ -116,6 +117,7 @@
>          { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
>          { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
>          { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
> +        { EXIT_REASON_MSR_LOAD_FAILURE,      "MSR_LOAD_FAILURE" }, \
>          { EXIT_REASON_INVD,                  "INVD" }, \
>          { EXIT_REASON_INVVPID,               "INVVPID" }, \
>          { EXIT_REASON_INVPCID,               "INVPCID" }, \
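
One general note before the vmx.c changes: the load and store helpers below
all operate on guest memory laid out as struct vmx_msr_entry, and the
reserved-bits checks and the partial kvm_read_guest() in the store path only
make sense with that layout in mind. It might be worth a short comment above
the new helpers spelling it out. For reference, this is the definition that
already lives in asm/vmx.h (quoting from memory, please double-check the
alignment attribute):

/*
 * Layout of one entry in the VM-entry/VM-exit MSR load and store areas.
 */
struct vmx_msr_entry {
        u32 index;      /* MSR index to load or store */
        u32 reserved;   /* must be zero; rejected by the *_verify() helpers */
        u64 value;      /* MSR value; the only field written back on store */
} __aligned(16);
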
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 9bcc871..86dc7db 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -8571,6 +8571,168 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>          kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
>  }
>
> +static bool vmx_msr_switch_area_verify(struct kvm_vcpu *vcpu,
> +                                       unsigned long count_field,
> +                                       unsigned long addr_field,
> +                                       int maxphyaddr)
> +{
> +        u64 count, addr;
> +
> +        BUG_ON(vmcs12_read_any(vcpu, count_field, &count));
> +        BUG_ON(vmcs12_read_any(vcpu, addr_field, &addr));

BUG_ON() is a bit harsh here; please use WARN_ON() and bail out instead
(see the sketch after this hunk for what I have in mind).

> +        if (!IS_ALIGNED(addr, 16))
> +                goto fail;
> +        if (addr >> maxphyaddr)
> +                goto fail;
> +        if ((addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
> +                goto fail;
> +        return true;
> +fail:
> +        pr_warn_ratelimited(
> +                "nVMX: invalid MSR switch (0x%lx, 0x%lx, %d, %llu, 0x%08llx)",
> +                count_field, addr_field, maxphyaddr, count, addr);
> +        return false;
> +}
> +
> +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu,
> +                                         struct vmcs12 *vmcs12)
> +{
> +        int maxphyaddr;
> +
> +        if (vmcs12->vm_exit_msr_load_count == 0 &&
> +            vmcs12->vm_exit_msr_store_count == 0 &&
> +            vmcs12->vm_entry_msr_load_count == 0)
> +                return true; /* Fast path */
> +        maxphyaddr = cpuid_maxphyaddr(vcpu);
> +        return vmx_msr_switch_area_verify(vcpu, VM_EXIT_MSR_LOAD_COUNT,
> +                                          VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) &&
> +               vmx_msr_switch_area_verify(vcpu, VM_EXIT_MSR_STORE_COUNT,
> +                                          VM_EXIT_MSR_STORE_ADDR,
> +                                          maxphyaddr) &&
> +               vmx_msr_switch_area_verify(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
> +                                          VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr);
> +}
> +
> +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu,
> +                                      struct vmx_msr_entry *e)
> +{
> +        if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE)
> +                return false;
> +        /* SMM is not supported */
> +        if (e->index == MSR_IA32_SMM_MONITOR_CTL)
> +                return false;
> +        /* x2APIC MSR accesses are not allowed */
> +        if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800)
> +                return false;
> +        if (e->reserved != 0)
> +                return false;
> +        return true;
> +}
> +
> +static bool vmx_msr_switch_is_protected_msr(u32 msr_index)
> +{
> +        /* Intel SDM: a processor MAY prevent writing to these MSRs when
> +         * loading guest states on VM entries or saving guest states on VM
> +         * exits.
> +         * Assume our emulated processor DOES prevent writing */

Just a nit: please use the usual kernel comment style here (open with a
bare "/*" and put the closing "*/" on its own line) to be consistent.

> +        return msr_index == MSR_IA32_UCODE_WRITE ||
> +               msr_index == MSR_IA32_UCODE_REV;
> +}
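
To expand on the BUG_ON() comment above: both fields are compile-time
constants, so a vmcs12_read_any() failure there can only mean a KVM bug,
not a misbehaving L1, and it should not be able to take the whole host
down. Untested sketch of what I have in mind:

static bool vmx_msr_switch_area_verify(struct kvm_vcpu *vcpu,
                                       unsigned long count_field,
                                       unsigned long addr_field,
                                       int maxphyaddr)
{
        u64 count, addr;

        /* Reading a known vmcs12 field must not fail; warn and bail out. */
        if (WARN_ON(vmcs12_read_any(vcpu, count_field, &count)) ||
            WARN_ON(vmcs12_read_any(vcpu, addr_field, &addr)))
                return false;
        if (!IS_ALIGNED(addr, 16))
                goto fail;
        if (addr >> maxphyaddr)
                goto fail;
        if ((addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
                goto fail;
        return true;
fail:
        pr_warn_ratelimited(
                "nVMX: invalid MSR switch (0x%lx, 0x%lx, %d, %llu, 0x%08llx)",
                count_field, addr_field, maxphyaddr, count, addr);
        return false;
}
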
> +
> +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu,
> +                                         u32 count, u64 addr)
> +{
> +        unsigned int i;
> +        struct vmx_msr_entry msr_entry;
> +        struct msr_data msr;
> +
> +        for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) {
> +                if (kvm_read_guest(vcpu->kvm, addr,
> +                                   &msr_entry, sizeof(msr_entry))) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: cannot read GPA: 0x%llx\n",
> +                                __func__, i, addr);
> +                        return i;
> +                }
> +                if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: invalid MSR entry 0x%x 0x%x\n",
> +                                __func__, i, msr_entry.index,
> +                                msr_entry.reserved);
> +                        return i;
> +                }
> +                msr.host_initiated = false;
> +                msr.index = msr_entry.index;
> +                msr.data = msr_entry.value;
> +                if (vmx_msr_switch_is_protected_msr(msr.index)) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: prevent writing to MSR 0x%x\n",
> +                                __func__, i, msr.index);
> +                        return i;
> +                }
> +                if (kvm_set_msr(vcpu, &msr)) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: cannot write 0x%llx to MSR 0x%x\n",
> +                                __func__, i, msr.data, msr.index);
> +                        return i;
> +                }
> +        }
> +        return 0;
> +}
> +
> +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu,
> +                                       struct vmx_msr_entry *e)
> +{
> +        /* x2APIC MSR accesses are not allowed */
> +        if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800)
> +                return false;
> +        /* SMM is not supported */
> +        if (e->index == MSR_IA32_SMBASE)
> +                return false;
> +        if (e->reserved != 0)
> +                return false;
> +        return true;
> +}
> +
> +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu,
> +                                          u32 count, u64 addr)
> +{
> +        unsigned int i;
> +        struct vmx_msr_entry msr_entry;
> +
> +        for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) {
> +                if (kvm_read_guest(vcpu->kvm, addr,
> +                                   &msr_entry, 2 * sizeof(u32))) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: cannot read GPA: 0x%llx\n",
> +                                __func__, i, addr);
> +                        return i;
> +                }
> +                if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: invalid MSR entry 0x%x 0x%x\n",
> +                                __func__, i, msr_entry.index,
> +                                msr_entry.reserved);
> +                        return i;
> +                }
> +                if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.value)) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: cannot read MSR 0x%x\n",
> +                                __func__, i, msr_entry.index);
> +                        return i;
> +                }
> +                if (kvm_write_guest(vcpu->kvm,
> +                                addr + offsetof(struct vmx_msr_entry, value),
> +                                &msr_entry.value, sizeof(msr_entry.value))) {
> +                        pr_warn_ratelimited(
> +                                "%s MSR %u: cannot write to GPA: 0x%llx\n",
> +                                __func__, i, addr);
> +                        return i;
> +                }
> +        }
> +        return 0;
> +}
> +
>  /*
>   * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
>   * for running an L2 nested guest.
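
Not a blocker: vmx_load_msr_entry_verify() and vmx_store_msr_entry_verify()
share the x2APIC and reserved-bits checks. If you respin anyway, a small
common helper would keep the two lists of special cases from drifting apart.
Rough idea, name made up, untested:

/* Checks common to VM-entry MSR loads and VM-exit MSR stores. */
static bool vmx_msr_entry_common_verify(struct kvm_vcpu *vcpu,
                                        struct vmx_msr_entry *e)
{
        /* x2APIC MSR accesses are not allowed */
        if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800)
                return false;
        return e->reserved == 0;
}

static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu,
                                      struct vmx_msr_entry *e)
{
        if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE)
                return false;
        /* SMM is not supported */
        if (e->index == MSR_IA32_SMM_MONITOR_CTL)
                return false;
        return vmx_msr_entry_common_verify(vcpu, e);
}

static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu,
                                       struct vmx_msr_entry *e)
{
        /* SMM is not supported */
        if (e->index == MSR_IA32_SMBASE)
                return false;
        return vmx_msr_entry_common_verify(vcpu, e);
}
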
> @@ -8580,6 +8742,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>          struct vmcs12 *vmcs12;
>          struct vcpu_vmx *vmx = to_vmx(vcpu);
>          int cpu;
> +        unsigned int msr;
>          struct loaded_vmcs *vmcs02;
>          bool ia32e;
>
> @@ -8629,11 +8792,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>                  return 1;
>          }
>
> -        if (vmcs12->vm_entry_msr_load_count > 0 ||
> -            vmcs12->vm_exit_msr_load_count > 0 ||
> -            vmcs12->vm_exit_msr_store_count > 0) {
> -                pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
> -                                    __func__);
> +        if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) {
>                  nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>                  return 1;
>          }
> @@ -8739,10 +8898,20 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>
>          vmx_segment_cache_clear(vmx);
>
> -        vmcs12->launch_state = 1;
> -
>          prepare_vmcs02(vcpu, vmcs12);
>
> +        msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count,
> +                                   vmcs12->vm_entry_msr_load_addr);
> +        if (msr) {
> +                leave_guest_mode(vcpu);
> +                vmx_load_vmcs01(vcpu);
> +                nested_vmx_entry_failure(vcpu, vmcs12,
> +                                         EXIT_REASON_MSR_LOAD_FAILURE, msr);
> +                return 1;
> +        }
> +
> +        vmcs12->launch_state = 1;
> +
>          if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
>                  return kvm_emulate_halt(vcpu);
>
> @@ -9040,6 +9209,15 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
>          kvm_clear_interrupt_queue(vcpu);
>  }
>
> +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code)
> +{
> +        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +        vmcs12->abort = abort_code;
> +        pr_warn("Nested VMX abort %u\n", abort_code);
> +        kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> +}
> +
>  /*
>   * A part of what we need to when the nested L2 guest exits and we want to
>   * run its L1 parent, is to reset L1's guest state to the host state specified
> @@ -9172,6 +9350,19 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>
>          kvm_set_dr(vcpu, 7, 0x400);
>          vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
> +
> +        if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count,
> +                                 vmcs12->vm_exit_msr_load_addr))
> +                nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD_FAILURE);
> +}
> +
> +static bool vmx_is_entry_failure(u32 exit_reason)
> +{
> +        if ((exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) == 0)
> +                return false;
> +        exit_reason &= ~VMX_EXIT_REASONS_FAILED_VMENTRY;
> +        return exit_reason == EXIT_REASON_INVALID_STATE ||
> +               exit_reason == EXIT_REASON_MSR_LOAD_FAILURE;
>  }
>
>  /*
> @@ -9193,6 +9384,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>          prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
>                         exit_qualification);
>
> +        if (nested_vmx_store_msrs(vcpu,
> +                                  vmcs12->vm_exit_msr_store_count,
> +                                  vmcs12->vm_exit_msr_store_addr))
> +                nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE_FAILURE);
> +
>          vmx_load_vmcs01(vcpu);
>
>          if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c5c186a..77179c5 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1579,6 +1579,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
>          }
>          return 0;
>  }
> +EXPORT_SYMBOL_GPL(kvm_write_guest);
>
>  int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>                                gpa_t gpa, unsigned long len)
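
One last question: I do not see a caller of vmx_is_entry_failure() in these
hunks; am I missing it, or does it get used by a later patch? My guess is
that it is meant to skip the VM-exit MSR store when the exit is itself a
failed VM entry (if I remember the SDM correctly, no MSRs are saved to the
VM-exit MSR-store area in that case). If so, wiring it up in
nested_vmx_vmexit() could look roughly like this (untested, and the
placement is just a guess):

        /* Skip the VM-exit MSR store when the exit is a failed VM entry. */
        if (!vmx_is_entry_failure(exit_reason) &&
            nested_vmx_store_msrs(vcpu,
                                  vmcs12->vm_exit_msr_store_count,
                                  vmcs12->vm_exit_msr_store_addr))
                nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE_FAILURE);
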