On Mon, May 16, 2011 at 10:48:01PM +0300, Nadav Har'El wrote: > In VMX, before we bring down a CPU we must VMCLEAR all VMCSs loaded on it > because (at least in theory) the processor might not have written all of its > content back to memory. Since a patch from June 26, 2008, this is done using > a per-cpu "vcpus_on_cpu" linked list of vcpus loaded on each CPU. > > The problem is that with nested VMX, we no longer have the concept of a > vcpu being loaded on a cpu: A vcpu has multiple VMCSs (one for L1, a pool for > L2s), and each of those may be have been last loaded on a different cpu. > > Our solution is to hold, in addition to vcpus_on_cpu, a second linked list > saved_vmcss_on_cpu, which holds the current list of "saved" VMCSs, i.e., > VMCSs which are loaded on this CPU but are not the vmx->vmcs of any of > the vcpus. These saved VMCSs include L1's VMCS while L2 is running > (saved_vmcs01), and L2 VMCSs not currently used - because L1 is running or > because the vmcs02_pool contains more than one entry. > > When we will switch between L1's and L2's VMCSs, they need to be moved > between vcpus_on_cpu and saved_vmcs_on_cpu lists and vice versa. A new > function, nested_maintain_per_cpu_lists(), takes care of that. > > Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx> > --- > arch/x86/kvm/vmx.c | 67 +++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 67 insertions(+) > > --- .before/arch/x86/kvm/vmx.c 2011-05-16 22:36:47.000000000 +0300 > +++ .after/arch/x86/kvm/vmx.c 2011-05-16 22:36:47.000000000 +0300 > @@ -181,6 +181,7 @@ struct saved_vmcs { > struct vmcs *vmcs; > int cpu; > int launched; > + struct list_head local_saved_vmcss_link; /* see saved_vmcss_on_cpu */ > }; > > /* Used to remember the last vmcs02 used for some recently used vmcs12s */ > @@ -315,7 +316,20 @@ static int vmx_set_tss_addr(struct kvm * > > static DEFINE_PER_CPU(struct vmcs *, vmxarea); > static DEFINE_PER_CPU(struct vmcs *, current_vmcs); > +/* > + * We maintain a per-CPU linked-list vcpus_on_cpu, holding for each CPU a list > + * of vcpus whose VMCS are loaded on that CPU. This is needed when a CPU is > + * brought down, and we need to VMCLEAR all VMCSs loaded on it. > + * > + * With nested VMX, we have additional VMCSs which are not the current > + * vmx->vmcs of any vcpu, but may also be loaded on some CPU: While L2 is > + * running, L1's VMCS is loaded but not the VMCS of any vcpu; While L1 is > + * running, a previously used L2 VMCS might still be around and loaded on some > + * CPU, somes even more than one such L2 VMCSs is kept (see VMCS02_POOL_SIZE). > + * The list of these additional VMCSs is kept on cpu saved_vmcss_on_cpu. > + */ > static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); > +static DEFINE_PER_CPU(struct list_head, saved_vmcss_on_cpu); > static DEFINE_PER_CPU(struct desc_ptr, host_gdt); > > static unsigned long *vmx_io_bitmap_a; > @@ -1818,6 +1832,7 @@ static int hardware_enable(void *garbage > return -EBUSY; > > INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); > + INIT_LIST_HEAD(&per_cpu(saved_vmcss_on_cpu, cpu)); > rdmsrl(MSR_IA32_FEATURE_CONTROL, old); > > test_bits = FEATURE_CONTROL_LOCKED; > @@ -1860,10 +1875,13 @@ static void kvm_cpu_vmxoff(void) > asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); > } > > +static void vmclear_local_saved_vmcss(void); > + > static void hardware_disable(void *garbage) > { > if (vmm_exclusive) { > vmclear_local_vcpus(); > + vmclear_local_saved_vmcss(); > kvm_cpu_vmxoff(); > } > write_cr4(read_cr4() & ~X86_CR4_VMXE); > @@ -4248,6 +4266,8 @@ static void __nested_free_saved_vmcs(voi > vmcs_clear(saved_vmcs->vmcs); > if (per_cpu(current_vmcs, saved_vmcs->cpu) == saved_vmcs->vmcs) > per_cpu(current_vmcs, saved_vmcs->cpu) = NULL; > + list_del(&saved_vmcs->local_saved_vmcss_link); > + saved_vmcs->cpu = -1; > } > > /* > @@ -4265,6 +4285,21 @@ static void nested_free_saved_vmcs(struc > free_vmcs(saved_vmcs->vmcs); > } > > +/* > + * VMCLEAR all the currently unused (not vmx->vmcs on any vcpu) saved_vmcss > + * which were loaded on the current CPU. See also vmclear_load_vcpus(), which > + * does the same for VMCS currently used in vcpus. > + */ > +static void vmclear_local_saved_vmcss(void) > +{ > + int cpu = raw_smp_processor_id(); > + struct saved_vmcs *v, *n; > + > + list_for_each_entry_safe(v, n, &per_cpu(saved_vmcss_on_cpu, cpu), > + local_saved_vmcss_link) > + __nested_free_saved_vmcs(v); > +} > + > /* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ > static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) > { > @@ -5143,6 +5178,38 @@ static void vmx_set_supported_cpuid(u32 > { > } > > +/* > + * Maintain the vcpus_on_cpu and saved_vmcss_on_cpu lists of vcpus and > + * inactive saved_vmcss on nested entry (L1->L2) or nested exit (L2->L1). > + * > + * nested_maintain_per_cpu_lists should be called after the VMCS was switched > + * to the new one, with parameters giving both the new on (after the entry > + * or exit) and the old one, in that order. > + */ > +static void nested_maintain_per_cpu_lists(struct vcpu_vmx *vmx, > + struct saved_vmcs *new_vmcs, > + struct saved_vmcs *old_vmcs) > +{ > + /* > + * When a vcpus's old vmcs is saved, we need to drop it from > + * vcpus_on_cpu and put it on saved_vmcss_on_cpu. > + */ > + if (old_vmcs->cpu != -1) { > + list_del(&vmx->local_vcpus_link); > + list_add(&old_vmcs->local_saved_vmcss_link, > + &per_cpu(saved_vmcss_on_cpu, old_vmcs->cpu)); > + } This new handling of vmcs could be simplified (local_vcpus_link must be manipulated with interrupts disabled, BTW). What about having a per-CPU VMCS list instead of per-CPU vcpu list? "local_vmcs_link" list node could be in "struct saved_vmcs" (and a current_saved_vmcs pointer in "struct vcpu_vmx"). vmx_vcpu_load would then add to this list at if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { per_cpu(current_vmcs, cpu) = vmx->vmcs; vmcs_load(vmx->vmcs); } -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html