> From: Nadav Har'El
> Sent: Tuesday, May 24, 2011 8:26 PM
>
> Hi Avi, here is an updated version of the loaded_vmcs patch which you asked me
> to send before the rest of the nvmx patches.
>
> Please let me know when you'd like me to send you an updated version of
> the rest of the patches.
>
> ----------------------------------------------------------------------------
> Subject: [PATCH 01/31] nVMX: Keep list of loaded VMCSs, instead of vcpus.
>
> In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it
> because (at least in theory) the processor might not have written all of its
> content back to memory. Since a patch from June 26, 2008, this is done using
> a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU.
>
> The problem is that with nested VMX, we no longer have the concept of a
> vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for
> L2s), and each of those may have been last loaded on a different cpu.
>
> So instead of linking the vcpus, we link the VMCSs, using a new structure
> loaded_vmcs. This structure contains the VMCS, and the information pertaining
> to its loading on a specific cpu (namely, the cpu number, and whether it
> was already launched on this cpu once). In nested VMX we will also use the
> same structure to hold L2 VMCSs, and vmx->loaded_vmcs is a pointer to the
> currently active VMCS.
>
> Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx>

Acked-by: Kevin Tian <kevin.tian@xxxxxxxxx>

Thanks
Kevin

> ---
>  arch/x86/kvm/vmx.c |  150 ++++++++++++++++++++++++-------------------
>  1 file changed, 86 insertions(+), 64 deletions(-)
>
> --- .before/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
> +++ .after/arch/x86/kvm/vmx.c	2011-05-24 15:12:22.000000000 +0300
> @@ -116,6 +116,18 @@ struct vmcs {
>  	char data[0];
>  };
>
> +/*
> + * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
> + * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
> + * loaded on this CPU (so we can clear them if the CPU goes down).
> + */
> +struct loaded_vmcs {
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +	struct list_head loaded_vmcss_on_cpu_link;
> +};
> +
>  struct shared_msr_entry {
>  	unsigned index;
>  	u64 data;
> @@ -124,9 +136,7 @@ struct shared_msr_entry {
>
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
> -	struct list_head      local_vcpus_link;
>  	unsigned long         host_rsp;
> -	int                   launched;
>  	u8                    fail;
>  	u8                    cpl;
>  	bool                  nmi_known_unmasked;
> @@ -140,7 +150,14 @@ struct vcpu_vmx {
>  	u64		      msr_host_kernel_gs_base;
>  	u64		      msr_guest_kernel_gs_base;
>  #endif
> -	struct vmcs *vmcs;
> +	/*
> +	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
> +	 * non-nested (L1) guest, it always points to vmcs01. For a nested
> +	 * guest (L2), it points to a different VMCS.
> +	 */
> +	struct loaded_vmcs    vmcs01;
> +	struct loaded_vmcs   *loaded_vmcs;
> +	bool                  __launched; /* temporary, used in vmx_vcpu_run */
>  	struct msr_autoload {
>  		unsigned nr;
>  		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
> @@ -200,7 +217,11 @@ static int vmx_set_tss_addr(struct kvm *
>
>  static DEFINE_PER_CPU(struct vmcs *, vmxarea);
>  static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
> -static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
> +/*
> + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
> + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
> + */
> +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
>  static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
>
>  static unsigned long *vmx_io_bitmap_a;
> @@ -501,6 +522,13 @@ static void vmcs_clear(struct vmcs *vmcs
>  		       vmcs, phys_addr);
>  }
>
> +static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
> +{
> +	vmcs_clear(loaded_vmcs->vmcs);
> +	loaded_vmcs->cpu = -1;
> +	loaded_vmcs->launched = 0;
> +}
> +
>  static void vmcs_load(struct vmcs *vmcs)
>  {
>  	u64 phys_addr = __pa(vmcs);
> @@ -514,25 +542,24 @@ static void vmcs_load(struct vmcs *vmcs)
>  		       vmcs, phys_addr);
>  }
>
> -static void __vcpu_clear(void *arg)
> +static void __loaded_vmcs_clear(void *arg)
>  {
> -	struct vcpu_vmx *vmx = arg;
> +	struct loaded_vmcs *loaded_vmcs = arg;
>  	int cpu = raw_smp_processor_id();
>
> -	if (vmx->vcpu.cpu == cpu)
> -		vmcs_clear(vmx->vmcs);
> -	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
> +	if (loaded_vmcs->cpu != cpu)
> +		return; /* vcpu migration can race with cpu offline */
> +	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
>  		per_cpu(current_vmcs, cpu) = NULL;
> -	list_del(&vmx->local_vcpus_link);
> -	vmx->vcpu.cpu = -1;
> -	vmx->launched = 0;
> +	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
> +	loaded_vmcs_init(loaded_vmcs);
>  }
>
> -static void vcpu_clear(struct vcpu_vmx *vmx)
> +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
>  {
> -	if (vmx->vcpu.cpu == -1)
> -		return;
> -	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
> +	if (loaded_vmcs->cpu != -1)
> +		smp_call_function_single(
> +			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
>  }
>
>  static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
> @@ -971,22 +998,22 @@ static void vmx_vcpu_load(struct kvm_vcp
>
>  	if (!vmm_exclusive)
>  		kvm_cpu_vmxon(phys_addr);
> -	else if (vcpu->cpu != cpu)
> -		vcpu_clear(vmx);
> +	else if (vmx->loaded_vmcs->cpu != cpu)
> +		loaded_vmcs_clear(vmx->loaded_vmcs);
>
> -	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
> -		per_cpu(current_vmcs, cpu) = vmx->vmcs;
> -		vmcs_load(vmx->vmcs);
> +	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
> +		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
> +		vmcs_load(vmx->loaded_vmcs->vmcs);
>  	}
>
> -	if (vcpu->cpu != cpu) {
> +	if (vmx->loaded_vmcs->cpu != cpu) {
>  		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>  		unsigned long sysenter_esp;
>
>  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>  		local_irq_disable();
> -		list_add(&vmx->local_vcpus_link,
> -			 &per_cpu(vcpus_on_cpu, cpu));
> +		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
> +			 &per_cpu(loaded_vmcss_on_cpu, cpu));
>  		local_irq_enable();
>
>  		/*
> @@ -998,6 +1025,7 @@ static void vmx_vcpu_load(struct kvm_vcp
>
>  		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
>  		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
> +		vmx->loaded_vmcs->cpu = cpu;
>  	}
>  }
>
> @@ -1005,7 +1033,8 @@ static void vmx_vcpu_put(struct kvm_vcpu
>  {
>  	__vmx_load_host_state(to_vmx(vcpu));
>  	if (!vmm_exclusive) {
> -		__vcpu_clear(to_vmx(vcpu));
> +		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
> +		vcpu->cpu = -1;
>  		kvm_cpu_vmxoff();
>  	}
>  }
> @@ -1469,7 +1498,7 @@ static int hardware_enable(void *garbage
>  	if (read_cr4() & X86_CR4_VMXE)
>  		return -EBUSY;
>
> -	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
> +	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
>  	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
>
>  	test_bits = FEATURE_CONTROL_LOCKED;
> @@ -1493,14 +1522,14 @@ static int hardware_enable(void *garbage
>  	return 0;
>  }
>
> -static void vmclear_local_vcpus(void)
> +static void vmclear_local_loaded_vmcss(void)
>  {
>  	int cpu = raw_smp_processor_id();
> -	struct vcpu_vmx *vmx, *n;
> +	struct loaded_vmcs *v, *n;
>
> -	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
> -				 local_vcpus_link)
> -		__vcpu_clear(vmx);
> +	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
> +				 loaded_vmcss_on_cpu_link)
> +		__loaded_vmcs_clear(v);
>  }
>
>
> @@ -1515,7 +1544,7 @@ static void kvm_cpu_vmxoff(void)
>  static void hardware_disable(void *garbage)
>  {
>  	if (vmm_exclusive) {
> -		vmclear_local_vcpus();
> +		vmclear_local_loaded_vmcss();
>  		kvm_cpu_vmxoff();
>  	}
>  	write_cr4(read_cr4() & ~X86_CR4_VMXE);
> @@ -1696,6 +1725,18 @@ static void free_vmcs(struct vmcs *vmcs)
>  	free_pages((unsigned long)vmcs, vmcs_config.order);
>  }
>
> +/*
> + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
> + */
> +static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
> +{
> +	if (!loaded_vmcs->vmcs)
> +		return;
> +	loaded_vmcs_clear(loaded_vmcs);
> +	free_vmcs(loaded_vmcs->vmcs);
> +	loaded_vmcs->vmcs = NULL;
> +}
> +
>  static void free_kvm_area(void)
>  {
>  	int cpu;
> @@ -4166,6 +4207,7 @@ static void __noclone vmx_vcpu_run(struc
>  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
>  		vmx_set_interrupt_shadow(vcpu, 0);
>
> +	vmx->__launched = vmx->loaded_vmcs->launched;
>  	asm(
>  		/* Store host registers */
>  		"push %%"R"dx; push %%"R"bp;"
> @@ -4236,7 +4278,7 @@ static void __noclone vmx_vcpu_run(struc
>  		"pop %%"R"bp; pop %%"R"dx \n\t"
>  		"setbe %c[fail](%0) \n\t"
>  	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
> -		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
> +		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
>  		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
>  		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
>  		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
> @@ -4276,7 +4318,7 @@ static void __noclone vmx_vcpu_run(struc
>  	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
>
>  	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
> -	vmx->launched = 1;
> +	vmx->loaded_vmcs->launched = 1;
>
>  	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
>
> @@ -4288,41 +4330,17 @@ static void __noclone vmx_vcpu_run(struc
>  #undef R
>  #undef Q
>
> -static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
> -{
> -	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -
> -	if (vmx->vmcs) {
> -		vcpu_clear(vmx);
> -		free_vmcs(vmx->vmcs);
> -		vmx->vmcs = NULL;
> -	}
> -}
> -
>  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
>  	free_vpid(vmx);
> -	vmx_free_vmcs(vcpu);
> +	free_loaded_vmcs(vmx->loaded_vmcs);
>  	kfree(vmx->guest_msrs);
>  	kvm_vcpu_uninit(vcpu);
>  	kmem_cache_free(kvm_vcpu_cache, vmx);
>  }
>
> -static inline void vmcs_init(struct vmcs *vmcs)
> -{
> -	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
> -
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxon(phys_addr);
> -
> -	vmcs_clear(vmcs);
> -
> -	if (!vmm_exclusive)
> -		kvm_cpu_vmxoff();
> -}
> -
>  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  {
>  	int err;
> @@ -4344,11 +4362,15 @@ static struct kvm_vcpu *vmx_create_vcpu(
>  		goto uninit_vcpu;
>  	}
>
> -	vmx->vmcs = alloc_vmcs();
> -	if (!vmx->vmcs)
> +	vmx->loaded_vmcs = &vmx->vmcs01;
> +	vmx->loaded_vmcs->vmcs = alloc_vmcs();
> +	if (!vmx->loaded_vmcs->vmcs)
>  		goto free_msrs;
> -
> -	vmcs_init(vmx->vmcs);
> +	if (!vmm_exclusive)
> +		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
> +	loaded_vmcs_init(vmx->loaded_vmcs);
> +	if (!vmm_exclusive)
> +		kvm_cpu_vmxoff();
>
>  	cpu = get_cpu();
>  	vmx_vcpu_load(&vmx->vcpu, cpu);
> @@ -4377,7 +4399,7 @@ static struct kvm_vcpu *vmx_create_vcpu(
>  	return &vmx->vcpu;
>
>  free_vmcs:
> -	free_vmcs(vmx->vmcs);
> +	free_vmcs(vmx->loaded_vmcs->vmcs);
>  free_msrs:
>  	kfree(vmx->guest_msrs);
>  uninit_vcpu:
>
> --
> Nadav Har'El                        |      Tuesday, May 24 2011, 20 Iyyar 5771
> nyh@xxxxxxxxxxxxxxxxxxx             |-----------------------------------------
> Phone +972-523-790466, ICQ 13349191 |I am thinking about a new signature. Stay
> http://nadav.harel.org.il           |tuned.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
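For readers who want to see the bookkeeping in isolation, here is a minimal, self-contained
userspace sketch of the pattern the patch introduces: each CPU keeps a list of the contexts
last loaded on it, and taking a CPU down walks that list and clears every entry. This is only
an illustration under assumed names (fake_vmcs, loaded_ctx, ctx_load, cpu_offline and
loaded_on_cpu are hypothetical stand-ins, and a plain singly-linked list replaces the kernel's
struct list_head); it is not the kernel code above.

/* Illustrative userspace model of the per-CPU loaded-VMCS list (assumed names). */
#include <stdio.h>

#define NR_CPUS 4

struct fake_vmcs { int id; };

/* Stand-in for struct loaded_vmcs: a VMCS plus the CPU it was last loaded on. */
struct loaded_ctx {
	struct fake_vmcs *vmcs;
	int cpu;                  /* -1 means "not loaded on any CPU" */
	int launched;
	struct loaded_ctx *next;  /* simplified per-CPU list linkage */
};

/* One list head per CPU, analogous to per_cpu(loaded_vmcss_on_cpu). */
static struct loaded_ctx *loaded_on_cpu[NR_CPUS];

/* "Loading" a context on a CPU records the CPU and links it onto that CPU's list. */
static void ctx_load(struct loaded_ctx *c, int cpu)
{
	if (c->cpu == cpu)
		return;
	c->cpu = cpu;
	c->next = loaded_on_cpu[cpu];
	loaded_on_cpu[cpu] = c;
}

/* "CPU offline": clear every context last loaded on that CPU. */
static void cpu_offline(int cpu)
{
	struct loaded_ctx *c = loaded_on_cpu[cpu];

	while (c) {
		struct loaded_ctx *next = c->next;
		printf("clearing vmcs %d (was on cpu %d)\n", c->vmcs->id, c->cpu);
		c->cpu = -1;       /* mirrors what loaded_vmcs_init() does */
		c->launched = 0;
		c->next = NULL;
		c = next;
	}
	loaded_on_cpu[cpu] = NULL;
}

int main(void)
{
	struct fake_vmcs v1 = { 1 }, v2 = { 2 };
	struct loaded_ctx a = { &v1, -1, 0, NULL };
	struct loaded_ctx b = { &v2, -1, 0, NULL };

	/* Two VMCSs of the same vcpu (think vmcs01 and an L2 VMCS) on one CPU. */
	ctx_load(&a, 2);
	ctx_load(&b, 2);
	cpu_offline(2);
	return 0;
}

The real patch differs in the details that matter inside the kernel: it uses per-CPU variables,
struct list_head linkage, and an IPI via smp_call_function_single() so that the VMCLEAR runs on
the CPU that last held the VMCS. But the list-per-CPU bookkeeping is the same idea.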