[Ccing a few others who might be interested]

Hi Eugene,

Did you look at the other patch that was posted for this functionality
by Wincy Van? https://lkml.org/lkml/2014/11/21/749

Eugene Korenevsky <ekorenevsky@xxxxxxxxx> writes:

> BitVisor hypervisor fails to start a nested guest due to lack of MSR
> load/store support in KVM.
>
> This patch fixes this problem according to Intel SDM.

It would be helpful for the commit message to be a little more
descriptive.

>
> Signed-off-by: Eugene Korenevsky <ekorenevsky@xxxxxxxxx>
> ---
>  arch/x86/include/asm/vmx.h            |   6 ++
>  arch/x86/include/uapi/asm/msr-index.h |   3 +
>  arch/x86/include/uapi/asm/vmx.h       |   1 +
>  arch/x86/kvm/vmx.c                    | 191 +++++++++++++++++++++++++++++++++-
>  virt/kvm/kvm_main.c                   |   1 +
>  5 files changed, 197 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index bcbfade..e07414c 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -454,6 +454,12 @@ struct vmx_msr_entry {
>  #define ENTRY_FAIL_VMCS_LINK_PTR	4
>
>  /*
> + * VMX Abort codes
> + */
> +#define VMX_ABORT_MSR_STORE	1
> +#define VMX_ABORT_MSR_LOAD	4
> +
> +/*
>   * VM-instruction error numbers
>   */
>  enum vm_instruction_error_number {
> diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
> index e21331c..3c9c601 100644
> --- a/arch/x86/include/uapi/asm/msr-index.h
> +++ b/arch/x86/include/uapi/asm/msr-index.h
> @@ -316,6 +316,9 @@
>  #define MSR_IA32_UCODE_WRITE		0x00000079
>  #define MSR_IA32_UCODE_REV		0x0000008b
>
> +#define MSR_IA32_SMM_MONITOR_CTL	0x0000009b
> +#define MSR_IA32_SMBASE			0x0000009e
> +

Extra tab here?

>  #define MSR_IA32_PERF_STATUS		0x00000198
>  #define MSR_IA32_PERF_CTL		0x00000199
>  #define MSR_AMD_PSTATE_DEF_BASE		0xc0010064
> diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
> index 990a2fe..f5f8a62 100644
> --- a/arch/x86/include/uapi/asm/vmx.h
> +++ b/arch/x86/include/uapi/asm/vmx.h
> @@ -56,6 +56,7 @@
>  #define EXIT_REASON_MSR_READ            31
>  #define EXIT_REASON_MSR_WRITE           32
>  #define EXIT_REASON_INVALID_STATE       33
> +#define EXIT_REASON_MSR_LOAD_FAILURE    34
>  #define EXIT_REASON_MWAIT_INSTRUCTION   36
>  #define EXIT_REASON_MONITOR_INSTRUCTION 39
>  #define EXIT_REASON_PAUSE_INSTRUCTION   40
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6a951d8..deebc96 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -8498,6 +8498,163 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
>  }
>
> +struct msr_entry {
> +	u32 index;
> +	u32 reserved;
> +	u64 data;
> +} __packed;
> +

We already have struct vmx_msr_entry in vmx.h.

> +static bool vmx_msr_switch_area_verify(u32 count, u64 addr, int maxphyaddr)
> +{
> +	if (count == 0)
> +		return true;
> +	if ((addr & 0xf) != 0)
> +		return false;
> +	if ((addr >> maxphyaddr) != 0)
> +		return false;
> +	if (((addr + count * sizeof(struct msr_entry) - 1) >> maxphyaddr) != 0)
> +		return false;
> +	return true;
> +}
> +

Shouldn't we also check whether any of bits 63:32 are set?
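Something along these lines is what I have in mind (untested sketch; I'm
assuming the extra check only matters when the guest does not report
Intel 64 support, and that a long-mode CPUID helper such as
guest_cpuid_has_longmode() is usable from here -- adjust to whatever
helper we actually have):

	/* Bits 63:32 must be zero if the guest has no Intel 64 support */
	if (!guest_cpuid_has_longmode(vcpu) && (addr >> 32) != 0)
		return false;

That would also mean passing the vcpu down to
vmx_msr_switch_area_verify().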
> +static bool nested_vmx_msr_switch_verify(struct kvm_vcpu *vcpu,
> +					 struct vmcs12 *vmcs12)
> +{
> +	int maxphyaddr = cpuid_maxphyaddr(vcpu);
> +
> +#define VMCS12_MSR_SWITCH_VERIFY(name) {				\
> +	if (!vmx_msr_switch_area_verify(vmcs12->name##_count,		\
> +					vmcs12->name##_addr, maxphyaddr)) { \
> +		pr_warn_ratelimited(					\
> +			"%s: invalid MSR_{LOAD,STORE} parameters",	\
> +			#name);						\
> +		return false;						\
> +	}								\
> +	}

Just a personal opinion: replacing the macro with a switch statement is
probably a little less clumsy.

> +	VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_load);
> +	VMCS12_MSR_SWITCH_VERIFY(vm_exit_msr_store);
> +	VMCS12_MSR_SWITCH_VERIFY(vm_entry_msr_load);
> +	return true;
> +}
> +
> +static bool vmx_load_msr_entry_verify(struct kvm_vcpu *vcpu,
> +				      struct msr_entry *e)
> +{
> +	if (e->index == MSR_FS_BASE || e->index == MSR_GS_BASE)
> +		return false;
> +	/* SMM is not supported */
> +	if (e->index == MSR_IA32_SMM_MONITOR_CTL)
> +		return false;
> +	/* x2APIC MSR accesses are not allowed */
> +	if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800)
> +		return false;
> +	if (e->reserved != 0)
> +		return false;
> +	return true;
> +}
> +
> +static bool vmx_msr_switch_is_protected_msr(u32 msr_index)
> +{
> +	/* Intel SDM: a processor MAY prevent writing to these MSRs when
> +	 * loading guest states on VM entries or saving guest states on VM
> +	 * exits.
> +	 * Assume our emulated processor DOES prevent writing */
> +	return msr_index == MSR_IA32_UCODE_WRITE ||
> +	       msr_index == MSR_IA32_UCODE_REV;
> +}
> +
> +static unsigned int nested_vmx_load_msrs(struct kvm_vcpu *vcpu,
> +					 u32 count, u64 addr)
> +{
> +	unsigned int i;
> +	struct msr_entry msr_entry;
> +	struct msr_data msr;
> +
> +	for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) {
> +		if (kvm_read_guest(vcpu->kvm, addr,
> +				   &msr_entry, sizeof(msr_entry))) {
> +			pr_warn_ratelimited(
> +				"Load MSR %d: cannot read GPA: 0x%llx\n",
> +				i, addr);
> +			return i;
> +		}
> +		if (!vmx_load_msr_entry_verify(vcpu, &msr_entry)) {
> +			pr_warn_ratelimited(
> +				"Load MSR %d: invalid MSR entry 0x%x 0x%x\n",
> +				i, msr_entry.index, msr_entry.reserved);
> +			return i;
> +		}
> +		msr.host_initiated = 0;
> +		msr.index = msr_entry.index;
> +		msr.data = msr_entry.data;
> +		if (vmx_msr_switch_is_protected_msr(msr.index)) {
> +			pr_warn_ratelimited(
> +				"Load MSR %d: prevent writing to MSR 0x%x\n",
> +				i, msr.index);
> +			return i;
> +		}
> +		if (kvm_set_msr(vcpu, &msr)) {
> +			pr_warn_ratelimited(
> +				"Load MSR %d: cannot write 0x%llx to MSR 0x%x\n",
> +				i, msr.data, msr.index);
> +			return i;
> +		}
> +	}
> +	return 0;
> +}
> +

To ease debugging, these warning messages should also specify that they
are nested VMX specific.

Also, I think the interfaces/functions you have introduced here could be
split out into an introductory patch; a second patch could then use
these interfaces to provide the load/store functionality.
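On the warning messages, something as simple as this would do (just a
sketch, the exact prefix is up to you):

	pr_warn_ratelimited("nVMX: load MSR %u: cannot read GPA 0x%llx\n",
			    i, addr);

or keep the __func__ prefix that the old "VMCS MSR_{LOAD,STORE}
unsupported" message you are removing below already used.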
> +static bool vmx_store_msr_entry_verify(struct kvm_vcpu *vcpu,
> +				       struct msr_entry *e)
> +{
> +	/* x2APIC MSR accesses are not allowed */
> +	if (apic_x2apic_mode(vcpu->arch.apic) && (e->index >> 24) == 0x800)
> +		return false;
> +	/* SMM is not supported */
> +	if (e->index == MSR_IA32_SMBASE)
> +		return false;
> +	if (e->reserved != 0)
> +		return false;
> +	return true;
> +}
> +
> +static unsigned int nested_vmx_store_msrs(struct kvm_vcpu *vcpu,
> +					  u32 count, u64 addr)
> +{
> +	unsigned int i;
> +	struct msr_entry msr_entry;
> +
> +	for (i = 1; i <= count; i++, addr += sizeof(msr_entry)) {
> +		if (kvm_read_guest(vcpu->kvm, addr,
> +				   &msr_entry, 2 * sizeof(u32))) {
> +			pr_warn_ratelimited(
> +				"Store MSR %d: cannot read GPA: 0x%llx\n",
> +				i, addr);
> +			return i;
> +		}
> +		if (!vmx_store_msr_entry_verify(vcpu, &msr_entry)) {
> +			pr_warn_ratelimited(
> +				"Store MSR %d: invalid MSR entry 0x%x 0x%x\n",
> +				i, msr_entry.index, msr_entry.reserved);
> +			return i;
> +		}
> +		if (vmx_get_msr(vcpu, msr_entry.index, &msr_entry.data)) {
> +			pr_warn_ratelimited(
> +				"Store MSR %d: cannot read MSR 0x%x\n",
> +				i, msr_entry.index);
> +			return i;
> +		}
> +		if (kvm_write_guest(vcpu->kvm,
> +				    addr + offsetof(struct msr_entry, data),
> +				    &msr_entry.data, sizeof(msr_entry.data))) {
> +			pr_warn_ratelimited(
> +				"Store MSR %d: cannot write to GPA: 0x%llx\n",
> +				i, addr);
> +			return i;
> +		}
> +	}
> +	return 0;
> +}
> +
>  /*
>   * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
>   * for running an L2 nested guest.
> @@ -8507,6 +8664,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  	struct vmcs12 *vmcs12;
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>  	int cpu;
> +	unsigned int msr;
>  	struct loaded_vmcs *vmcs02;
>  	bool ia32e;
>
> @@ -8556,11 +8714,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>  		return 1;
>  	}
>
> -	if (vmcs12->vm_entry_msr_load_count > 0 ||
> -	    vmcs12->vm_exit_msr_load_count > 0 ||
> -	    vmcs12->vm_exit_msr_store_count > 0) {
> -		pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
> -				    __func__);
> +	if (!nested_vmx_msr_switch_verify(vcpu, vmcs12)) {
>  		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
>  		return 1;
>  	}
> @@ -8670,6 +8824,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>
>  	prepare_vmcs02(vcpu, vmcs12);
>
> +	msr = nested_vmx_load_msrs(vcpu, vmcs12->vm_entry_msr_load_count,
> +				   vmcs12->vm_entry_msr_load_addr);
> +	if (msr)
> +		nested_vmx_entry_failure(vcpu, vmcs12,
> +			EXIT_REASON_MSR_LOAD_FAILURE, msr);
> +
>  	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
>  		return kvm_emulate_halt(vcpu);
>
> @@ -9099,6 +9259,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>  		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
>  }
>
> +static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 abort_code)
> +{
> +	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +	vmcs12->abort = abort_code;
> +	pr_warn("Nested VMX abort %u\n", abort_code);
> +	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> +}
> +
>  /*
>   * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
>   * and modify vmcs12 to make it see what it would expect to see there if
> @@ -9118,8 +9287,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>  	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
>  		       exit_qualification);
>
> +	if (exit_reason != EXIT_REASON_INVALID_STATE &&
> +	    exit_reason != EXIT_REASON_MSR_LOAD_FAILURE) {
> +		if (nested_vmx_store_msrs(vcpu,
> +					  vmcs12->vm_exit_msr_store_count,
> +					  vmcs12->vm_exit_msr_store_addr))
> +			nested_vmx_abort(vcpu, VMX_ABORT_MSR_STORE);
> +	}
> +
>  	vmx_load_vmcs01(vcpu);
>
> +	if (nested_vmx_load_msrs(vcpu, vmcs12->vm_exit_msr_load_count,
> +				 vmcs12->vm_exit_msr_load_addr))
> +		nested_vmx_abort(vcpu, VMX_ABORT_MSR_LOAD);
> +
>  	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
>  	    && nested_exit_intr_ack_set(vcpu)) {
>  		int irq = kvm_cpu_get_interrupt(vcpu);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 5b45330..c9d1c4a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1582,6 +1582,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
>  	}
>  	return 0;
>  }
> +EXPORT_SYMBOL_GPL(kvm_write_guest);
>
>  int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
>  			      gpa_t gpa, unsigned long len)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html