Re: [PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit

Avi Kivity <avi@xxxxxxxxxx> · Mon, 14 Jun 2010 15:24:02 +0300




On 06/13/2010 03:32 PM, Nadav Har'El wrote:
This patch contains the logic of whether an L2 exit should be handled by L0
and then L2 should be resumed, or whether L1 should be run to handle this
exit (using the nested_vmx_vmexit() function of the previous patch).

The basic idea is to let L1 handle the exit only if it actually asked to
trap this sort of event. For example, when L2 exits on a change to CR0,
we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
bit which changed; if it did, we exit to L1. But if it didn't, it means that
it is we (L0) who wished to trap this event, so we handle it ourselves.

The next two patches add additional logic of what to do when an interrupt or
exception is injected: Does L0 need to do it, should we exit to L1 to do it,
or should we resume L2 and keep the exception to be injected later.

We keep a new flag, "nested_run_pending", which can override the decision of
which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
L2 next, not L1. This is necessary in several situations where, had L1 run on
bare metal, it would not have expected to be resumed at this stage. One
example is when L1 did a VMLAUNCH of L2 and therefore expects L2 to be run.
Another example is when L2 exits on an #NM exception that L0 asked for
(because of lazy FPU loading), and L0 must deal with the exception and resume
L2, which was in the middle of an instruction, and not resume L1, which does not
expect to see an exit from L2 at this point. nested_run_pending is especially
intended to avoid switching to L1 in the injection decision-point described
above.

@@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v

  	if (is_no_device(intr_info)) {
  		vmx_fpu_activate(vcpu);
+		if (vmx->nested.nested_mode)
+			vmx->nested.nested_run_pending = 1;
  		return 1;
  	}
   

Isn't this true for many other exceptions?  #UD which we emulate (but 
the guest doesn't trap), page faults which we handle completely...


+
+/* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+	struct shadow_vmcs *l2svmcs)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int cr = exit_qualification&  15;
+	int reg = (exit_qualification>>  8)&  15;
+	unsigned long val = kvm_register_read(vcpu, reg);
+
+	switch ((exit_qualification>>  4)&  3) {
+	case 0: /* mov to cr */
+		switch (cr) {
+		case 0:
+			if (l2svmcs->cr0_guest_host_mask&
+			    (val ^ l2svmcs->cr0_read_shadow))
+				return 1;
+			break;
+		case 3:
+			if (l2svmcs->cpu_based_vm_exec_control&
+			    CPU_BASED_CR3_LOAD_EXITING)
+				return 1;
+			break;
+		case 4:
+			if (l2svmcs->cr4_guest_host_mask&
+			    (l2svmcs->cr4_read_shadow ^ val))
+				return 1;
+			break;
+		case 8:
+			if (l2svmcs->cpu_based_vm_exec_control&
+			    CPU_BASED_CR8_LOAD_EXITING)
+				return 1;
   

Should check TPR threshold here too if enabled.


+	case 3: /* lmsw */
+		if (l2svmcs->cr0_guest_host_mask&
+		    (val ^ l2svmcs->cr0_read_shadow))
+			return 1;
   

Need to mask off bit 0 (cr0.pe) of val, since lmsw can't clear it.

+		break;
+	}
+	return 0;
+}
+
+/* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0. Only call this when in nested_mode (L2).
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool afterexit)
+{
+	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct shadow_vmcs *l2svmcs;
+	int r = 0;
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	if (afterexit) {
+		/* There are some cases where we should let L1 handle certain
+		 * events when these are injected (afterexit==0) but we should
+		 * handle them in L0 on an exit (afterexit==1).
+		 */
+		switch (exit_code) {
+		case EXIT_REASON_EXTERNAL_INTERRUPT:
+			return 0;
+		case EXIT_REASON_EXCEPTION_NMI:
+			if (!is_exception(intr_info))
+				return 0;
+			if (is_page_fault(intr_info)&&  (!enable_ept))
+				return 0;
   

Some page faults do need an L2->L1 transition.  Maybe I'll see this later.

+			break;
+		case EXIT_REASON_EPT_VIOLATION:
+			if (enable_ept)
+				return 0;
+			break;
+		}
+	}
+
+	if (!nested_map_current(vcpu))
+		return 0;
+	l2svmcs = get_shadow_vmcs(vcpu);
+
+	switch (exit_code) {
+	case EXIT_REASON_INVLPG:
+		if (l2svmcs->cpu_based_vm_exec_control&
+		    CPU_BASED_INVLPG_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		r = nested_vmx_exit_handled_msr(vcpu, l2svmcs, exit_code);
+		break;
+	case EXIT_REASON_CR_ACCESS:
+		r = nested_vmx_exit_handled_cr(vcpu, l2svmcs);
+		break;
+	case EXIT_REASON_DR_ACCESS:
+		if (l2svmcs->cpu_based_vm_exec_control&
+		    CPU_BASED_MOV_DR_EXITING)
+			r = 1;
+		break;
+	case EXIT_REASON_EXCEPTION_NMI:
+		if (is_external_interrupt(intr_info)&&
+		    (l2svmcs->pin_based_vm_exec_control&
+		     PIN_BASED_EXT_INTR_MASK))
+			r = 1;
   

A real external interrupt should never be handled by the guest, only a 
virtual external interrupt.

+		else if (is_nmi(intr_info)&&
+		    (l2svmcs->pin_based_vm_exec_control&
+		     PIN_BASED_NMI_EXITING))
+			r = 1;
   

Ditto for nmi.

+		else if (is_exception(intr_info)&&
+		    (l2svmcs->exception_bitmap&
+		     (1u<<  (intr_info&  INTR_INFO_VECTOR_MASK))))
+			r = 1;
   

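Bit 14 of the exception bitmap (#PF) is special and needs special treatment:
whether a page fault causes an exit also depends on the page-fault error-code
mask and match fields, not on the bitmap bit alone.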

+		else if (is_page_fault(intr_info))
+			r = 1;
   

Still looking for magic page fault handling...

+		break;
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		if (l2svmcs->pin_based_vm_exec_control&
+		    PIN_BASED_EXT_INTR_MASK)
+			r = 1;
+		break;
+	default:
+		r = 1;
+	}
+	nested_unmap_current(vcpu);
+
+	return r;
+}
+
  /*
   * The guest has exited.  See if we can fix it or if we need userspace
   * assistance.
   

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html