On 12/10/2009 08:38 PM, oritw@xxxxxxxxxx wrote:
From: Orit Wasserman <oritw@xxxxxxxxxx>
(changelog)
@@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
new_offset = vmcs_read64(TSC_OFFSET) + delta;
vmcs_write64(TSC_OFFSET, new_offset);
}
+
+ if (l1_shadow_vmcs != NULL) {
+ l1_shadow_vmcs->host_tr_base =
+ vmcs_readl(HOST_TR_BASE);
+ l1_shadow_vmcs->host_gdtr_base =
+ vmcs_readl(HOST_GDTR_BASE);
+ l1_shadow_vmcs->host_ia32_sysenter_esp =
+ vmcs_readl(HOST_IA32_SYSENTER_ESP);
+
+ if (tsc_this < vcpu->arch.host_tsc)
+ l1_shadow_vmcs->tsc_offset =
+ vmcs_read64(TSC_OFFSET);
+
+ if (vmx->nested.nested_mode)
+ load_vmcs_host_state(l1_shadow_vmcs);
+ }
Please share this code with non-nested vmcs setup.
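The non-nested code just above this hunk already computes these values for its
vmcs_writel()s, so the nested path could reuse them instead of vmread-ing them
back. A rough sketch, assuming the locals vmx_vcpu_load() has at this point
(dt, sysenter_esp):

	vmcs_writel(HOST_TR_BASE, kvm_read_tr_base());
	kvm_get_gdt(&dt);
	vmcs_writel(HOST_GDTR_BASE, dt.base);
	rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
	vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp);

	/* nested path reuses the same values instead of vmcs_readl(): */
	if (l1_shadow_vmcs) {
		l1_shadow_vmcs->host_tr_base = kvm_read_tr_base();
		l1_shadow_vmcs->host_gdtr_base = dt.base;
		l1_shadow_vmcs->host_ia32_sysenter_esp = sysenter_esp;
	}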
@@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ nested_vmx_intr(vcpu);
+ return;
+ }
I think this happens too late? enable_irq_window() is called after
we've given up on injecting the interrupt because interrupts are
disabled. But if we're running a guest, we can vmexit and inject the
interrupt. This code will only vmexit.
Hm, I see the vmexit code has an in_interrupt case, but I'd like this to
be more regular: adjust vmx_interrupt_allowed() to allow interrupts if
in a guest, and vmx_inject_irq() to force the vmexit. This way
interrupts have a single code path.
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ if (!nested_vmx_intr(vcpu))
+ return 0;
+ }
... and you do that... so I wonder why the changes to
enable_irq_window() are needed?
+
return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
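Regarding that single code path: the vmx_inject_irq() side of the suggestion
could look roughly like the sketch below. nested_vmx_vmexit() here is just a
stand-in for whatever this series uses to reflect an exit into L1, I didn't
check the exact name:

	static void vmx_inject_irq(struct kvm_vcpu *vcpu)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);

		if (vmx->nested.nested_mode) {
			/* interrupt belongs to L1 while L2 is running:
			 * force an exit to L1 instead of injecting into L2 */
			nested_vmx_vmexit(vcpu);
			return;
		}

		/* ... existing injection via VM_ENTRY_INTR_INFO_FIELD ... */
	}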
@@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
not interested (exception bitmap 12 does not include NM_VECTOR)
enable fpu and resume l2 (avoid switching to l1)
*/
+
+ if (vmx->nested.nested_mode)
+ vmx->nested.nested_run_pending = 1; /* removing this line causes a hang on boot of l2 */
+
This indicates a hack?
vmx_fpu_activate(vcpu);
return 1;
@@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, val);
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ /* assume only X86_CR0_TS is handled by l0 */
+ long new_cr0 = vmcs_readl(GUEST_CR0);
+ long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
+
+ vmx_fpu_deactivate(vcpu);
+
+ if (val & X86_CR0_TS) {
+ new_cr0 |= X86_CR0_TS;
+ new_cr0_read_shadow |= X86_CR0_TS;
+ vcpu->arch.cr0 |= X86_CR0_TS;
+ } else {
+ new_cr0 &= ~X86_CR0_TS;
+ new_cr0_read_shadow &= ~X86_CR0_TS;
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ }
+
+ vmcs_writel(GUEST_CR0, new_cr0);
+ vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on
constraint, or if it changes bits in cr0 that the guest intercepts?
+
+ if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
+ vmx_fpu_activate(vcpu);
+
+ to_vmx(vcpu)->nested.nested_run_pending = 1;
Please split into a function.
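Something like the sketch below, which also shows where the cr0_guest_host_mask
check asked about above would go (nested_vmx_vmexit() and the shadow-vmcs
mapping are assumed from the rest of the series, so take this as shape only):

	static int nested_handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
	{
		struct vcpu_vmx *vmx = to_vmx(vcpu);
		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
		unsigned long new_cr0 = vmcs_readl(GUEST_CR0);
		unsigned long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);

		/* bits L1 intercepts (and violations of the always-on cr0
		 * bits, still missing here) must be reflected back to L1 */
		if (l2svmcs->cr0_guest_host_mask &
		    (val ^ l2svmcs->cr0_read_shadow))
			return nested_vmx_vmexit(vcpu);

		vmx_fpu_deactivate(vcpu);

		if (val & X86_CR0_TS) {
			new_cr0 |= X86_CR0_TS;
			new_cr0_read_shadow |= X86_CR0_TS;
			vcpu->arch.cr0 |= X86_CR0_TS;
		} else {
			new_cr0 &= ~X86_CR0_TS;
			new_cr0_read_shadow &= ~X86_CR0_TS;
			vcpu->arch.cr0 &= ~X86_CR0_TS;
		}

		vmcs_writel(GUEST_CR0, new_cr0);
		vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);

		if (!(val & X86_CR0_TS) || !(val & X86_CR0_PE))
			vmx_fpu_activate(vcpu);

		vmx->nested.nested_run_pending = 1;
		return 1;
	}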
+ } else
+ kvm_set_cr0(vcpu, val);
+
skip_emulated_instruction(vcpu);
return 1;
case 3:
@@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 2: /* clts */
vmx_fpu_deactivate(vcpu);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS);
+ vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+ vcpu->arch.cr0 &= ~X86_CR0_TS;
+ to_vmx(vcpu)->nested.nested_run_pending = 1;
Won't the guest want to intercept this some time?
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
if (enable_ept && is_paging(vcpu))
@@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
| vmx->rmode.irq.vector;
}
+static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
I asked for this to be renamed.
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
@@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int r;
u32 nested_exception_bitmap = 0;
+ if (vmx->nested.nested_mode) {
+ r = nested_handle_valid_idt(vcpu);
This will cause vmread()s, before the launch, of state that is not saved.
This means it is broken on migration or after set_regs().
In general we follow this pattern:
read from memory
vmwrite
vmlaunch/vmresume
vmread
write to memory
loop
There are exceptions where we allow state to be cached, mostly
registers. But we keep accessors for them so that save/restore works.
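For instance, RIP already follows that accessor pattern; kvm_register_read()
does roughly this, so the vmread is deferred and the cached value still
round-trips through save/restore (illustrative only; the real code goes
through kvm_x86_ops->cache_reg):

	if (!test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_avail)) {
		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
		__set_bit(VCPU_REGS_RIP,
			  (unsigned long *)&vcpu->arch.regs_avail);
	}
	return vcpu->arch.regs[VCPU_REGS_RIP];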
+
+static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
+{
+ if (to_vmx(vcpu)->nested.nested_mode) {
+ struct page *msr_page = NULL;
+ u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
+
+ if (!cpu_has_vmx_msr_bitmap()
+ || !nested_cpu_has_vmx_msr_bitmap(vcpu))
+ return 1;
+
+ msr_page = nested_get_page(vcpu,
+ l2svmcs->msr_bitmap);
+
+ if (!msr_page) {
+ printk(KERN_INFO "%s error in nested_get_page\n",
+ __func__);
+ return 0;
+ }
+
+ switch (exit_code) {
+ case EXIT_REASON_MSR_READ:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x000)))
+ return 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x400)))
+ return 1;
+ }
+ break;
+ case EXIT_REASON_MSR_WRITE:
+ if (msr_index <= 0x1fff) {
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0x800)))
+ return 1;
+ } else if ((msr_index >= 0xc0000000) &&
+ (msr_index <= 0xc0001fff)) {
+ msr_index &= 0x1fff;
+ if (test_bit(msr_index,
+ (unsigned long *)(msr_page +
+ 0xc00)))
+ return 1;
+ }
+ break;
+ }
+ }
+
Please refactor with a single test_bit; just calculate the offsets
differently (+400*8 for high MSRs, +800*8 for writes).
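I.e. something along these lines (the helper name is made up; bitmap is the
kmapped msr bitmap page):

	static int nested_msr_intercepted(unsigned long *bitmap, u32 msr_index,
					  bool write)
	{
		unsigned long bit = 0;

		if (write)
			bit += 0x800 * 8;	/* write bitmaps start at byte 0x800 */
		if (msr_index >= 0xc0000000 && msr_index <= 0xc0001fff) {
			bit += 0x400 * 8;	/* high-MSR bitmaps start 0x400 bytes later */
			msr_index &= 0x1fff;
		} else if (msr_index > 0x1fff)
			return 1;		/* outside both ranges: always exit */

		return test_bit(bit + msr_index, bitmap);
	}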
+
+static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
+{
+ u32 exit_code = vmcs_read32(VM_EXIT_REASON);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct shadow_vmcs *l2svmcs;
+
+ int r = 0;
+
+ if (vmx->nested.nested_run_pending)
+ return 0;
+
+ if (unlikely(vmx->fail)) {
+ printk(KERN_INFO "%s failed vm entry %x\n",
+ __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ return 1;
+ }
+
+ if (kvm_override) {
What's kvm_override?
+ switch (exit_code) {
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ if (!is_exception(intr_info))
+ return 0;
+
+ if (is_page_fault(intr_info) && (!enable_ept))
+ return 0;
+
+ break;
+ case EXIT_REASON_EPT_VIOLATION:
+ if (enable_ept)
+ return 0;
+
+ break;
+ }
+ }
+
+
+ if (!nested_map_current(vcpu))
+ return 0;
+
+ l2svmcs = get_shadow_vmcs(vcpu);
+
+ switch (exit_code) {
+ case EXIT_REASON_INVLPG:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_INVLPG_EXITING)
+ r = 1;
+ break;
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ r = nested_vmx_exit_handled_msr(vcpu);
+ break;
+ case EXIT_REASON_CR_ACCESS: {
+ unsigned long exit_qualification =
+ vmcs_readl(EXIT_QUALIFICATION);
+ int cr = exit_qualification & 15;
+ int reg = (exit_qualification >> 8) & 15;
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ switch ((exit_qualification >> 4) & 3) {
+ case 0: /* mov to cr */
+ switch (cr) {
+ case 0:
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ r = 1;
+ break;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_LOAD_EXITING)
+ r = 1;
+ break;
+ case 4:
+ if (l2svmcs->cr4_guest_host_mask &
+ (l2svmcs->cr4_read_shadow ^ val))
+ r = 1;
+ break;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_LOAD_EXITING)
+ r = 1;
+ break;
+ }
+ break;
+ case 2: /* clts */
+ if (l2svmcs->cr0_guest_host_mask & X86_CR0_TS)
+ r = 1;
+ break;
+ case 1: /* mov from cr */
+ switch (cr) {
+ case 0:
+ r = 1;
+ case 3:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR3_STORE_EXITING)
+ r = 1;
+ break;
+ case 4:
+ r = 1;
+ break;
+ case 8:
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_CR8_STORE_EXITING)
+ r = 1;
+ break;
+ }
+ break;
+ case 3: /* lmsw */
+ if (l2svmcs->cr0_guest_host_mask &
+ (val ^ l2svmcs->cr0_read_shadow))
+ r = 1;
+ break;
+ }
+ break;
+ }
+ case EXIT_REASON_DR_ACCESS: {
+ if (l2svmcs->cpu_based_vm_exec_control &
+ CPU_BASED_MOV_DR_EXITING)
+ r = 1;
+ break;
+ }
+
+ case EXIT_REASON_EXCEPTION_NMI: {
+
+ if (is_external_interrupt(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK))
+ r = 1;
+ else if (is_nmi(intr_info) &&
+ (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_NMI_EXITING))
+ r = 1;
+ else if (is_exception(intr_info) &&
+ (l2svmcs->exception_bitmap &
+ (1u << (intr_info & INTR_INFO_VECTOR_MASK))))
+ r = 1;
+ else if (is_page_fault(intr_info))
+ r = 1;
+ break;
+ }
+
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ if (l2svmcs->pin_based_vm_exec_control &
+ PIN_BASED_EXT_INTR_MASK)
+ r = 1;
+ break;
+ default:
+ r = 1;
+ }
+ nested_unmap_current(vcpu);
+
Please move these to the normal handlers so it is possible to follow the
code.
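For example, handle_invlpg() could do its own check and reflect the exit to
L1, roughly like this (nested_vmx_vmexit() and the shadow-vmcs map/unmap are
assumed; this only shows the shape):

	static int handle_invlpg(struct kvm_vcpu *vcpu)
	{
		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

		if (to_vmx(vcpu)->nested.nested_mode &&
		    (get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control &
		     CPU_BASED_INVLPG_EXITING))
			return nested_vmx_vmexit(vcpu);	/* L1 wants this exit */

		kvm_mmu_invlpg(vcpu, exit_qualification);
		skip_emulated_instruction(vcpu);
		return 1;
	}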
--
error compiling committee.c: too many arguments to function