KVM's "Lazy FPU loading" means that sometimes L0 needs to set CR0.TS, even if a guest didn't set it. Moreover, L0 must also trap CR0.TS changes and NM exceptions, even if we have a guest hypervisor (L1) who didn't want these traps. And of course, conversely: If L1 wanted to trap these events, we must let it, even if L0 is not interested in them. This patch fixes some existing KVM code (in update_exception_bitmap(), vmx_fpu_activate(), vmx_fpu_deactivate(), handle_cr()) to do the correct merging of L0's and L1's needs. Note that new code in introduced in previous patches already handles CR0 correctly (see prepare_vmcs02(), prepare_vmcs12(), and nested_vmx_vmexit()). Signed-off-by: Nadav Har'El <nyh@xxxxxxxxxx> --- arch/x86/kvm/vmx.c | 90 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 5 deletions(-) --- .before/arch/x86/kvm/vmx.c 2010-10-17 11:52:03.000000000 +0200 +++ .after/arch/x86/kvm/vmx.c 2010-10-17 11:52:03.000000000 +0200 @@ -1098,6 +1098,17 @@ static void update_exception_bitmap(stru eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ if (vcpu->fpu_active) eb &= ~(1u << NM_VECTOR); + + /* When we are running a nested L2 guest and L1 specified for it a + * certain exception bitmap, we must trap the same exceptions and pass + * them to L1. When running L2, we will only handle the exceptions + * specified above if L1 did not want them. + */ + if (to_vmx(vcpu)->nested.nested_mode) { + u32 nested_eb = get_vmcs12_fields(vcpu)->exception_bitmap; + eb |= nested_eb; + } + vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -1422,8 +1433,19 @@ static void vmx_fpu_activate(struct kvm_ cr0 &= ~(X86_CR0_TS | X86_CR0_MP); cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); vmcs_writel(GUEST_CR0, cr0); - update_exception_bitmap(vcpu); vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + if (to_vmx(vcpu)->nested.nested_mode) { + /* While we (L0) no longer care about NM exceptions or cr0.TS + * changes, our guest hypervisor (L1) might care in which case + * we must trap them for it. + */ + u32 eb = vmcs_read32(EXCEPTION_BITMAP) & ~(1u << NM_VECTOR); + struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu); + eb |= vmcs12->exception_bitmap; + vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; + vmcs_write32(EXCEPTION_BITMAP, eb); + } else + update_exception_bitmap(vcpu); vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } @@ -1431,12 +1453,24 @@ static void vmx_decache_cr0_guest_bits(s static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { + /* Note that there is no vcpu->fpu_active = 0 here. The caller must + * set this *before* calling this function. + */ vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); - update_exception_bitmap(vcpu); + vmcs_write32(EXCEPTION_BITMAP, + vmcs_read32(EXCEPTION_BITMAP) | (1u << NM_VECTOR)); vcpu->arch.cr0_guest_owned_bits = 0; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); + if (to_vmx(vcpu)->nested.nested_mode) + /* Unfortunately in nested mode we play with arch.cr0's PG + * bit, so we musn't copy it all, just the relevant TS bit + */ + vmcs_writel(CR0_READ_SHADOW, + (vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS) | + (vcpu->arch.cr0 & X86_CR0_TS)); + else + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) @@ -3876,6 +3910,52 @@ static void complete_insn_gp(struct kvm_ skip_emulated_instruction(vcpu); } +/* called to set cr0 as approriate for a mov-to-cr0 exit. */ +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (to_vmx(vcpu)->nested.nested_mode) { + /* When running L2, we usually do what L1 wants: it decides + * which cr0 bits to intercept, we forward it cr0-change events + * (see nested_vmx_exit_handled()). We only get here when a cr0 + * bit was changed that L1 did not ask to intercept, but L0 + * nevertheless did. Currently this can only happen with the TS + * bit (see CR0_GUEST_HOST_MASK in prepare_vmcs02()). + * We must change only this bit in GUEST_CR0 and CR0_READ_SHADOW + * and not call kvm_set_cr0 because it enforces a relationship + * between the two that is specific to KVM (i.e., only the TS + * bit might differ) and with which L1 might not agree. + */ + long new_cr0 = vmcs_readl(GUEST_CR0); + long new_cr0_rs = vmcs_readl(CR0_READ_SHADOW); + if (val & X86_CR0_TS) { + new_cr0 |= X86_CR0_TS; + new_cr0_rs |= X86_CR0_TS; + vcpu->arch.cr0 |= X86_CR0_TS; + } else { + new_cr0 &= ~X86_CR0_TS; + new_cr0_rs &= ~X86_CR0_TS; + vcpu->arch.cr0 &= ~X86_CR0_TS; + } + vmcs_writel(GUEST_CR0, new_cr0); + vmcs_writel(CR0_READ_SHADOW, new_cr0_rs); + return 0; + } else + return kvm_set_cr0(vcpu, val); +} + +/* called to set cr0 as approriate for clts instruction exit. */ +static void handle_clts(struct kvm_vcpu *vcpu) +{ + if (to_vmx(vcpu)->nested.nested_mode) { + /* As in handle_set_cr0(), we can't call vmx_set_cr0 here */ + vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0) & ~X86_CR0_TS); + vmcs_writel(CR0_READ_SHADOW, + vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); + vcpu->arch.cr0 &= ~X86_CR0_TS; + } else + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -3892,7 +3972,7 @@ static int handle_cr(struct kvm_vcpu *vc trace_kvm_cr_write(cr, val); switch (cr) { case 0: - err = kvm_set_cr0(vcpu, val); + err = handle_set_cr0(vcpu, val); complete_insn_gp(vcpu, err); return 1; case 3: @@ -3918,7 +3998,7 @@ static int handle_cr(struct kvm_vcpu *vc }; break; case 2: /* clts */ - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html