On Fri, Apr 24, 2015 at 05:13:03AM +0800, Liang Li wrote: >Romove lazy FPU logic and use eager FPU entirely. Eager FPU does >not have performance regression, and it can simplify the code. > >When compiling kernel on westmere, the performance of eager FPU >is about 0.4% faster than lazy FPU. > >Signed-off-by: Liang Li <liang.z.li@xxxxxxxxx> >Signed-off-by: Xudong Hao <xudong.hao@xxxxxxxxx> >--- > arch/x86/include/asm/kvm_host.h | 1 - > arch/x86/kvm/svm.c | 22 ++---------- > arch/x86/kvm/vmx.c | 74 +++-------------------------------------- > arch/x86/kvm/x86.c | 8 +---- > include/linux/kvm_host.h | 2 -- > 5 files changed, 9 insertions(+), 98 deletions(-) > >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h >index dea2e7e..5d84cc9 100644 >--- a/arch/x86/include/asm/kvm_host.h >+++ b/arch/x86/include/asm/kvm_host.h >@@ -743,7 +743,6 @@ struct kvm_x86_ops { > void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); > unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); > void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); >- void (*fpu_deactivate)(struct kvm_vcpu *vcpu); > > void (*tlb_flush)(struct kvm_vcpu *vcpu); > >diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >index ce741b8..1b3b29b 100644 >--- a/arch/x86/kvm/svm.c >+++ b/arch/x86/kvm/svm.c >@@ -1087,7 +1087,6 @@ static void init_vmcb(struct vcpu_svm *svm) > struct vmcb_control_area *control = &svm->vmcb->control; > struct vmcb_save_area *save = &svm->vmcb->save; > >- svm->vcpu.fpu_active = 1; > svm->vcpu.arch.hflags = 0; > > set_cr_intercept(svm, INTERCEPT_CR0_READ); >@@ -1529,15 +1528,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm) > ulong gcr0 = svm->vcpu.arch.cr0; > u64 *hcr0 = &svm->vmcb->save.cr0; > >- if (!svm->vcpu.fpu_active) >- *hcr0 |= SVM_CR0_SELECTIVE_MASK; >- else >- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) >- | (gcr0 & SVM_CR0_SELECTIVE_MASK); >+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) >+ | (gcr0 & SVM_CR0_SELECTIVE_MASK); > > mark_dirty(svm->vmcb, VMCB_CR); > >- if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { >+ if (gcr0 == *hcr0) { > clr_cr_intercept(svm, INTERCEPT_CR0_READ); > clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); > } else { >@@ -1568,8 +1564,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) > if (!npt_enabled) > cr0 |= X86_CR0_PG | X86_CR0_WP; > >- if (!vcpu->fpu_active) >- cr0 |= X86_CR0_TS; > /* > * re-enable caching here because the QEMU bios > * does not do it - this results in some delay at >@@ -1795,7 +1789,6 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu) > > clr_exception_intercept(svm, NM_VECTOR); > >- svm->vcpu.fpu_active = 1; > update_cr0_intercept(svm); > } > >@@ -4139,14 +4132,6 @@ static bool svm_has_wbinvd_exit(void) > return true; > } > >-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) >-{ >- struct vcpu_svm *svm = to_svm(vcpu); >- >- set_exception_intercept(svm, NM_VECTOR); >- update_cr0_intercept(svm); >-} Do you test it on AMD cpu? What's the performance you get? Regards, Wanpeng Li >- > #define PRE_EX(exit) { .exit_code = (exit), \ > .stage = X86_ICPT_PRE_EXCEPT, } > #define POST_EX(exit) { .exit_code = (exit), \ >@@ -4381,7 +4366,6 @@ static struct kvm_x86_ops svm_x86_ops = { > .cache_reg = svm_cache_reg, > .get_rflags = svm_get_rflags, > .set_rflags = svm_set_rflags, >- .fpu_deactivate = svm_fpu_deactivate, > > .tlb_flush = svm_flush_tlb, > >diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >index f5e8dce..811a666 100644 >--- a/arch/x86/kvm/vmx.c >+++ b/arch/x86/kvm/vmx.c >@@ -1567,7 +1567,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) > u32 eb; > > eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | >- (1u << NM_VECTOR) | (1u << DB_VECTOR); >+ (1u << DB_VECTOR); > if ((vcpu->guest_debug & > (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == > (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) >@@ -1576,8 +1576,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) > eb = ~0; > if (enable_ept) > eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ >- if (vcpu->fpu_active) >- eb &= ~(1u << NM_VECTOR); > > /* When we are running a nested L2 guest and L1 specified for it a > * certain exception bitmap, we must trap the same exceptions and pass >@@ -1961,9 +1959,6 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu) > { > ulong cr0; > >- if (vcpu->fpu_active) >- return; >- vcpu->fpu_active = 1; > cr0 = vmcs_readl(GUEST_CR0); > cr0 &= ~(X86_CR0_TS | X86_CR0_MP); > cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); >@@ -1994,33 +1989,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields) > (fields->cr4_read_shadow & fields->cr4_guest_host_mask); > } > >-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) >-{ >- /* Note that there is no vcpu->fpu_active = 0 here. The caller must >- * set this *before* calling this function. >- */ >- vmx_decache_cr0_guest_bits(vcpu); >- vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); >- update_exception_bitmap(vcpu); >- vcpu->arch.cr0_guest_owned_bits = 0; >- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); >- if (is_guest_mode(vcpu)) { >- /* >- * L1's specified read shadow might not contain the TS bit, >- * so now that we turned on shadowing of this bit, we need to >- * set this bit of the shadow. Like in nested_vmx_run we need >- * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet >- * up-to-date here because we just decached cr0.TS (and we'll >- * only update vmcs12->guest_cr0 on nested exit). >- */ >- struct vmcs12 *vmcs12 = get_vmcs12(vcpu); >- vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | >- (vcpu->arch.cr0 & X86_CR0_TS); >- vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); >- } else >- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); >-} >- > static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) > { > unsigned long rflags, save_rflags; >@@ -3575,9 +3543,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) > if (enable_ept) > ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); > >- if (!vcpu->fpu_active) >- hw_cr0 |= X86_CR0_TS | X86_CR0_MP; >- > vmcs_writel(CR0_READ_SHADOW, cr0); > vmcs_writel(GUEST_CR0, hw_cr0); > vcpu->arch.cr0 = cr0; >@@ -5066,11 +5031,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) > if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) > return 1; /* already handled by vmx_vcpu_run() */ > >- if (is_no_device(intr_info)) { >- vmx_fpu_activate(vcpu); >- return 1; >- } >- > if (is_invalid_opcode(intr_info)) { > if (is_guest_mode(vcpu)) { > kvm_queue_exception(vcpu, UD_VECTOR); >@@ -5263,22 +5223,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) > return kvm_set_cr4(vcpu, val); > } > >-/* called to set cr0 as approriate for clts instruction exit. */ >-static void handle_clts(struct kvm_vcpu *vcpu) >-{ >- if (is_guest_mode(vcpu)) { >- /* >- * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS >- * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, >- * just pretend it's off (also in arch.cr0 for fpu_activate). >- */ >- vmcs_writel(CR0_READ_SHADOW, >- vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); >- vcpu->arch.cr0 &= ~X86_CR0_TS; >- } else >- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); >-} >- > static int handle_cr(struct kvm_vcpu *vcpu) > { > unsigned long exit_qualification, val; >@@ -5321,10 +5265,6 @@ static int handle_cr(struct kvm_vcpu *vcpu) > } > break; > case 2: /* clts */ >- handle_clts(vcpu); >- trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); >- skip_emulated_instruction(vcpu); >- vmx_fpu_activate(vcpu); > return 1; > case 1: /*mov from cr*/ > switch (cr) { >@@ -9856,19 +9796,16 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, > kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); > vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); > /* >- * Note that calling vmx_set_cr0 is important, even if cr0 hasn't >- * actually changed, because it depends on the current state of >- * fpu_active (which may have changed). > * Note that vmx_set_cr0 refers to efer set above. > */ > vmx_set_cr0(vcpu, vmcs12->host_cr0); > /* >- * If we did fpu_activate()/fpu_deactivate() during L2's run, we need >- * to apply the same changes to L1's vmcs. We just set cr0 correctly, >- * but we also need to update cr0_guest_host_mask and exception_bitmap. >+ * If we did fpu_activate() during L2's run, we need to apply the same >+ * changes to L1's vmcs. We just set cr0 correctly, but we also need >+ * to update cr0_guest_host_mask and exception_bitmap. > */ > update_exception_bitmap(vcpu); >- vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); >+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; > vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); > > /* >@@ -10177,7 +10114,6 @@ static struct kvm_x86_ops vmx_x86_ops = { > .cache_reg = vmx_cache_reg, > .get_rflags = vmx_get_rflags, > .set_rflags = vmx_set_rflags, >- .fpu_deactivate = vmx_fpu_deactivate, > > .tlb_flush = vmx_flush_tlb, > >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >index e1a8126..f7597fe 100644 >--- a/arch/x86/kvm/x86.c >+++ b/arch/x86/kvm/x86.c >@@ -6236,10 +6236,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > r = 0; > goto out; > } >- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { >- vcpu->fpu_active = 0; >- kvm_x86_ops->fpu_deactivate(vcpu); >- } > if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { > /* Page is swapped out. Do synthetic halt */ > vcpu->arch.apf.halted = true; >@@ -6296,8 +6292,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) > preempt_disable(); > > kvm_x86_ops->prepare_guest_switch(vcpu); >- if (vcpu->fpu_active) >- kvm_load_guest_fpu(vcpu); >+ kvm_load_guest_fpu(vcpu); > kvm_load_guest_xcr0(vcpu); > > vcpu->mode = IN_GUEST_MODE; >@@ -7038,7 +7033,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) > fpu_save_init(&vcpu->arch.guest_fpu); > __kernel_fpu_end(); > ++vcpu->stat.fpu_reload; >- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); > trace_kvm_fpu(0); > } > >diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >index 82af5d0..c82f798 100644 >--- a/include/linux/kvm_host.h >+++ b/include/linux/kvm_host.h >@@ -118,7 +118,6 @@ static inline bool is_error_page(struct page *page) > #define KVM_REQ_MMU_SYNC 7 > #define KVM_REQ_CLOCK_UPDATE 8 > #define KVM_REQ_KICK 9 >-#define KVM_REQ_DEACTIVATE_FPU 10 > #define KVM_REQ_EVENT 11 > #define KVM_REQ_APF_HALT 12 > #define KVM_REQ_STEAL_UPDATE 13 >@@ -228,7 +227,6 @@ struct kvm_vcpu { > struct mutex mutex; > struct kvm_run *run; > >- int fpu_active; > int guest_fpu_loaded, guest_xcr0_loaded; > wait_queue_head_t wq; > struct pid *pid; >-- >1.9.1 > >-- >To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >the body of a message to majordomo@xxxxxxxxxxxxxxx >More majordomo info at http://vger.kernel.org/majordomo-info.html >Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html