On 12/10/2009 08:38 PM, oritw@xxxxxxxxxx wrote:
From: Orit Wasserman <oritw@xxxxxxxxxx>
(changelog)
@@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
new_offset = vmcs_read64(TSC_OFFSET) + delta;
vmcs_write64(TSC_OFFSET, new_offset);
}
+
+ if (l1_shadow_vmcs != NULL) {
+ l1_shadow_vmcs->host_tr_base =
+ vmcs_readl(HOST_TR_BASE);
+ l1_shadow_vmcs->host_gdtr_base =
+ vmcs_readl(HOST_GDTR_BASE);
+ l1_shadow_vmcs->host_ia32_sysenter_esp =
+ vmcs_readl(HOST_IA32_SYSENTER_ESP);
+
+ if (tsc_this < vcpu->arch.host_tsc)
+ l1_shadow_vmcs->tsc_offset =
+ vmcs_read64(TSC_OFFSET);
+
+ if (vmx->nested.nested_mode)
+ load_vmcs_host_state(l1_shadow_vmcs);
+ }
Please share this code with non-nested vmcs setup.
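The non-nested code just above this hunk already computes these values for its
vmcs_writel()s, so the nested path could reuse them instead of vmread-ing them
back. A rough sketch, assuming the locals vmx_vcpu_load() has at this point
(dt, sysenter_esp):

	vmcs_writel(HOST_TR_BASE, kvm_read_tr_base());
	kvm_get_gdt(&dt);
	vmcs_writel(HOST_GDTR_BASE, dt.base);
	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
	vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);

	/* nested path reuses the same values instead of vmcs_readl(): */
	if (l1_shadow_vmcs) {
		l1_shadow_vmcs->host_tr_base = kvm_read_tr_base();
		l1_shadow_vmcs->host_gdtr_base = dt.base;
		l1_shadow_vmcs->host_ia32_sysenter_esp = sysenter_esp;
	}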
@@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ nested_vmx_intr(vcpu);
+ return;
+ }
I think this happens too late? enable_irq_window() is called after
we've given up on injecting the interrupt because interrupts are
disabled. But if we're running a guest, we can vmexit and inject the
interrupt. This code will only vmexit.
Hm, I see the vmexit code has an in_interrupt case, but I'd like this to
be more regular: adjust vmx_interrupt_allowed() to allow interrupts if
in a guest, and vmx_inject_irq() to force the vmexit. This way
interrupts have a single code path.
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ if (!nested_vmx_intr(vcpu))
+ return 0;
+ }
... and you do that... so I wonder why the changes to
enable_irq_window() are needed?
+
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
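Regarding that single code path: the vmx_inject_irq() side of the suggestion
could look roughly like the sketch below. nested_vmx_vmexit() here is just a
stand-in for whatever this series uses to reflect an exit into L1, I didn't
check the exact name:

	static void vmx_inject_irq(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		if (vmx->nested.nested_mode) {
			/* interrupt belongs to L1 while L2 is running:
			 * force an exit to L1 instead of injecting into L2 */
			nested_vmx_vmexit(vcpu);
			return;
		}

		/* ... existing injection via VM_ENTRY_INTR_INFO_FIELD ... */
	}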
@@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
not interested (exception bitmap 12 does not include NM_VECTOR)
enable fpu and resume l2 (avoid switching to l1)
*/
+
+ if (vmx->nested.nested_mode)
+ vmx->nested.nested_run_pending = 1; /* removing this line causes a hang on boot of l2 */
+
This indicates a hack?
vmx_fpu_activate(vcpu);
return 1;
@@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, val);
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ /* assume only X86_CR0_TS is handled by l0 */
+ long new_cr0 = vmcs_readl(GUEST_CR0);
+ long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+
+ vmx_fpu_deactivate(vcpu);
+
+ if (val & X86_CR0_TS) {
+ new_cr0 |= X86_CR0_TS;
+ new_cr0_read_shadow |= X86_CR0_TS;
+ vcpu->arch.cr0 |= X86_CR0_TS;
+ } else {
+ new_cr0 &= ~X86_CR0_TS;
+ new_cr0_read_shadow &= ~X86_CR0_TS;
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ }
+
+ vmcs_writel(GUEST_CR0, new_cr0);
+ vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on
constraint, or if it changes bits in cr0 that the guest intercepts?
+
+ if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
+ vmx_fpu_activate(vcpu);
+
+ to_vmx(vcpu)->nested.nested_run_pending = 1;
Please split into a function.
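Something like the sketch below, which also shows where the cr0_guest_host_mask
check asked about above would go (nested_vmx_vmexit() and the shadow-vmcs
mapping are assumed from the rest of the series, so take this as shape only):

	static int nested_handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);
		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
		unsigned long new_cr0 = vmcs_readl(GUEST_CR0);
		unsigned long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);

		/* bits L1 intercepts (and violations of the always-on cr0
		 * bits, still missing here) must be reflected back to L1 */
		if (l2svmcs->cr0_guest_host_mask &
		    (val ^ l2svmcs->cr0_read_shadow))
			return nested_vmx_vmexit(vcpu);

		vmx_fpu_deactivate(vcpu);

		if (val & X86_CR0_TS) {
			new_cr0 |= X86_CR0_TS;
			new_cr0_read_shadow |= X86_CR0_TS;
			vcpu->arch.cr0 |= X86_CR0_TS;
		} else {
			new_cr0 &= ~X86_CR0_TS;
			new_cr0_read_shadow &= ~X86_CR0_TS;
			vcpu->arch.cr0 &= ~X86_CR0_TS;
		}

		vmcs_writel(GUEST_CR0, new_cr0);
		vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);

		if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
			vmx_fpu_activate(vcpu);

		vmx->nested.nested_run_pending = 1;
		return 1;
	}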
+ } else
+ kvm_set_cr0(vcpu, val);
+
skip_emulated_instruction(vcpu);
return 1;
case 3:
@@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 2: /* clts */
vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+ vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ to_vmx(vcpu)->nested.nested_run_pending = 1;
Won't the guest want to intercept this some time?
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
if (enable_ept && is_paging(vcpu))
@@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
I asked for this to be renamed.
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
@@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
u32 nested_exception_bitmap = 0;
+ if (vmx->nested.nested_mode) {
+ r = nested_handle_valid_idt(vcpu);
This will cause vmread()s, before the launch, of state that is not saved.
This means it is broken on migration or after set_regs().
In general we follow this pattern:
read from memory
vmwrite
vmlaunch/vmresume
vmread
write to memory
loop
There are exceptions where we allow state to be cached, mostly
registers. But we keep accessors for them so that save/restore works.
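For instance, RIP already follows that accessor pattern; kvm_register_read()
does roughly this, so the vmread is deferred and the cached value still
round-trips through save/restore (illustrative only; the real code goes
through kvm_x86_ops->cache_reg):

	if (!test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_avail)) {
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		__set_bit(VCPU_REGS_RIP,
			  (unsigned long *)&vcpu->arch.regs_avail);
	}
	return vcpu->arch.regs[VCPU_REGS_RIP];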
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ struct page *msr_page = NULL;
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+ if (!cpu_has_vmx_msr_bitmap()
+ || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+ return 1;
+
+ msr_page = nested_get_page(vcpu,
+ l2svmcs->msr_bitmap);
+
+ if (!msr_page) {
+ printk(KERN_INFO "%s error in nested_get_page\n",
+ __func__);
+ return 0;
+ }
+
+ switch (exit_code) {
+ case EXIT_REASON_MSR_READ:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x000)))
+ return 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x400)))
+ return 1;
+ }
+ break;
+ case EXIT_REASON_MSR_WRITE:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x800)))
+ return 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0xc00)))
+ return 1;
+ }
+ break;
+ }
+ }
+
Please refactor with a single test_bit; just calculate the offsets
differently (+400*8 for high MSRs, +800*8 for writes).
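I.e. something along these lines (the helper name is made up; bitmap is the
kmapped msr bitmap page):

	static int nested_msr_intercepted(unsigned long *bitmap, u32 msr_index,
					  bool write)
	{
		unsigned long bit = 0;

		if (write)
			bit += 0x800 * 8;	/* write bitmaps start at byte 0x800 */
		if (msr_index >= 0xc0000000 && msr_index <= 0xc0001fff) {
			bit += 0x400 * 8;	/* high-MSR bitmaps start 0x400 bytes later */
			msr_index &= 0x1fff;
		} else if (msr_index > 0x1fff)
			return 1;		/* outside both ranges: always exit */

		return test_bit(bit + msr_index, bitmap);
	}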
+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct shadow_vmcs *l2svmcs;
+
+ int r = 0;
+
+ if (vmx->nested.nested_run_pending)
+ return 0;
+
+ if (unlikely(vmx->fail)) {
+ printk(KERN_INFO "%s failed vm entry %x\n",
+ __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ return 1;
+ }
+
+ if (kvm_override) {
What's kvm_override?
+ switch (exit_code) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (!is_exception(intr_info))
+ return 0;
+
+ if (is_page_fault(intr_info) && (!enable_ept))
+ return 0;
+
+ break;
+ case EXIT_REASON_EPT_VIOLATION:
+ if (enable_ept)
+ return 0;
+
+ break;
+ }
+ }
+
+
+ if (!nested_map_current(vcpu))
+ return 0;
+
+ l2svmcs = get_shadow_vmcs(vcpu);
+
+ switch (exit_code) {
+ case EXIT_REASON_INVLPG:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_INVLPG_EXITING)
+ r = 1;
+ break;
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ r = nested_vmx_exit_handled_msr(vcpu);
+ break;
+ case EXIT_REASON_CR_ACCESS: {
+ unsigned long exit_qualification =
+ vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg = (exit_qualification >> 8) & 15;
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ switch (cr) {
+ case 0:
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ r = 1;
+ break;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_LOAD_EXITING)
+ r = 1;
+ break;
+ case 4:
+ if (l2svmcs->cr4_guest_host_mask &
+ (l2svmcs->cr4_read_shadow ^ val))
+ r = 1;
+ break;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_LOAD_EXITING)
+ r = 1;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS)
+ r = 1;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 0:
+ r = 1;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ r = 1;
+ break;
+ case 4:
+ r = 1;
+ break;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ r = 1;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ r = 1;
+ break;
+ }
+ break;
+ }
+ case EXIT_REASON_DR_ACCESS: {
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_MOV_DR_EXITING)
+ r = 1;
+ break;
+ }
+
+ case EXIT_REASON_EXCEPTION_NMI: {
+
+ if (is_external_interrupt(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK))
+ r = 1;
+ else if (is_nmi(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_NMI_EXITING))
+ r = 1;
+ else if (is_exception(intr_info) &&
+ (l2svmcs->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+ r = 1;
+ else if (is_page_fault(intr_info))
+ r = 1;
+ break;
+ }
+
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ if (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK)
+ r = 1;
+ break;
+ default:
+ r = 1;
+ }
+ nested_unmap_current(vcpu);
+
Please move these to the normal handlers so it is possible to follow the
code.
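For example, handle_invlpg() could do its own check and reflect the exit to
L1, roughly like this (nested_vmx_vmexit() and the shadow-vmcs map/unmap are
assumed; this only shows the shape):

	static int handle_invlpg(struct kvm_vcpu *vcpu)
	{
		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

		if (to_vmx(vcpu)->nested.nested_mode &&
		    (get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
		     CPU_BASED_INVLPG_EXITING))
			return nested_vmx_vmexit(vcpu);	/* L1 wants this exit */

		kvm_mmu_invlpg(vcpu, exit_qualification);
		skip_emulated_instruction(vcpu);
		return 1;
	}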
--
error compiling committee.c: too many arguments to function