XFD allows the kernel to enable a feature state in XCR0 and still receive
a #NM trap when a task uses instructions that access that state. The
kernel defines "struct fpu.state_mask" to indicate which xstate is saved
and to interact with the XFD hardware, when needed, via a simple
conversion. Once a dynamic feature is detected, "state_mask" is expanded
and "state_ptr" is dynamically allocated to hold the whole state. Once
that state is no longer in its INIT state, the corresponding XFD bit must
not be armed anymore.

In KVM, "guest_fpu" serves any guest context running on this vcpu across
vmexit and vmenter. Provide a pre-allocated guest_fpu buffer with the
full "guest_fpu.state_mask", so that dynamic features do not have to be
detected separately for each vcpu task. To ensure XSAVES/XRSTORS save and
restore the guest state correctly, set IA32_XFD to zero across vmexit and
vmenter. Since host and guest likely have a different state and mask,
"current->thread.fpu" must also be switched to the right context on fpu
load and put.

Signed-off-by: Jing Liu <jing2.liu@xxxxxxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  3 ++
 arch/x86/kernel/fpu/init.c      |  1 +
 arch/x86/kernel/fpu/xstate.c    |  2 +
 arch/x86/kvm/vmx/vmx.c          | 76 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/vmx.h          |  1 +
 arch/x86/kvm/x86.c              | 69 +++++++++++++++++++++++++-----
 6 files changed, 141 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e5f33a0d0e2..6dedf3d22659 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1203,6 +1203,9 @@ struct kvm_x86_ops {
                                             struct x86_exception *exception);
         void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
 
+        void (*xfd_load)(struct kvm_vcpu *vcpu);
+        void (*xfd_put)(struct kvm_vcpu *vcpu);
+
         void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
 
         void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7e0c68043ce3..fbb761fc13ec 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -145,6 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_kernel_xstate_min_size);
  * can be dynamically expanded to include some states up to this size.
  */
 unsigned int fpu_kernel_xstate_max_size;
+EXPORT_SYMBOL_GPL(fpu_kernel_xstate_max_size);
 
 /* Get alignment of the TYPE. */
 #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 080f3be9a5e6..9c471a0364e2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -77,12 +77,14 @@ static struct xfeature_capflag_info xfeature_capflags[] __initdata = {
  * XSAVE buffer, both supervisor and user xstates.
  */
 u64 xfeatures_mask_all __read_mostly;
+EXPORT_SYMBOL_GPL(xfeatures_mask_all);
 
 /*
  * This represents user xstates, a subset of xfeatures_mask_all, saved in a
  * dynamic kernel XSAVE buffer.
  */
 u64 xfeatures_mask_user_dynamic __read_mostly;
+EXPORT_SYMBOL_GPL(xfeatures_mask_user_dynamic);
 
 static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
 static unsigned int xstate_sizes[XFEATURE_MAX]   = { [ 0 ... XFEATURE_MAX - 1] = -1};
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7fa54e78c45c..be3cc0f3ec6d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1167,6 +1167,75 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
         wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
 }
 
+static void vmx_xfd_load(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) {
+                vmx->host_ia32_xfd = xfirstuse_not_detected(vcpu->arch.user_fpu);
+                /*
+                 * Keep IA32_XFD as zero in the hypervisor.
+                 * A non-zero guest IA32_XFD is not restored until kvm_x86_ops.run.
+                 */
+                if (vmx->host_ia32_xfd)
+                        wrmsrl(MSR_IA32_XFD, 0);
+        }
+}
+
+static void vmx_xfd_put(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        if (guest_cpuid_has(vcpu, X86_FEATURE_XFD)) {
+                /* The IA32_XFD register is kept as zero in the hypervisor. */
+                if (vmx->host_ia32_xfd)
+                        wrmsrl(MSR_IA32_XFD, vmx->host_ia32_xfd);
+                /* User (qemu) IA32_XFD_ERR should be zero. */
+                if (vmx->msr_ia32_xfd_err)
+                        wrmsrl(MSR_IA32_XFD_ERR, 0);
+        }
+}
+
+/* Load the guest XFD MSRs before VM-entry. */
+static void xfd_guest_enter(struct vcpu_vmx *vmx)
+{
+        if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) {
+                if (vmx->msr_ia32_xfd)
+                        wrmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd);
+                /*
+                 * We do not rdmsr here since in most cases
+                 * IA32_XFD_ERR is zero. One rare exception is that
+                 * this vmenter follows a vmexit with a non-zero
+                 * MSR_IA32_XFD_ERR and it doesn't change during
+                 * this interval.
+                 *
+                 * So simply load the non-zero guest value.
+                 */
+                if (vmx->msr_ia32_xfd_err)
+                        wrmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err);
+        }
+}
+
+/*
+ * Save the guest XFD MSRs on vmexit since the registers may be changed
+ * when control is transferred out of KVM, e.g. on preemption.
+ */
+static void xfd_guest_exit(struct vcpu_vmx *vmx)
+{
+        if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_XFD)) {
+                rdmsrl(MSR_IA32_XFD, vmx->msr_ia32_xfd);
+                rdmsrl(MSR_IA32_XFD_ERR, vmx->msr_ia32_xfd_err);
+                /*
+                 * Clear MSR_IA32_XFD so that the guest fpu context is
+                 * correctly protected in the hypervisor.
+                 * No need to reset MSR_IA32_XFD_ERR in the hypervisor since
+                 * it has no impact on others.
+                 */
+                if (vmx->msr_ia32_xfd)
+                        wrmsrl(MSR_IA32_XFD, 0);
+        }
+}
+
 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
                         unsigned long fs_base, unsigned long gs_base)
 {
@@ -6735,6 +6804,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
         kvm_load_guest_xsave_state(vcpu);
 
+        xfd_guest_enter(vmx);
+
         pt_guest_enter(vmx);
 
         atomic_switch_perf_msrs(vmx);
@@ -6804,6 +6875,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
         pt_guest_exit(vmx);
 
+        xfd_guest_exit(vmx);
+
         kvm_load_host_xsave_state(vcpu);
 
         vmx->nested.nested_run_pending = 0;
@@ -7644,6 +7717,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
         .vcpu_load = vmx_vcpu_load,
         .vcpu_put = vmx_vcpu_put,
 
+        .xfd_load = vmx_xfd_load,
+        .xfd_put = vmx_xfd_put,
+
         .update_exception_bitmap = update_exception_bitmap,
         .get_msr_feature = vmx_get_msr_feature,
         .get_msr = vmx_get_msr,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index d487f5a53a08..9a9ea37a29b1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -288,6 +288,7 @@ struct vcpu_vmx {
         } shadow_msr_intercept;
 
         /* eXtended Feature Disabling (XFD) MSRs */
+        u64 host_ia32_xfd;
         u64 msr_ia32_xfd;
         u64 msr_ia32_xfd_err;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9ca8b1e58afa..15908bc65d1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9220,22 +9220,44 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 static void kvm_save_current_fpu(struct fpu *fpu)
 {
-        struct fpu *src_fpu = &current->thread.fpu;
+        struct fpu *cur_fpu = &current->thread.fpu;
 
+        fpu->state_ptr = cur_fpu->state_ptr;
+        fpu->state_mask = cur_fpu->state_mask;
         /*
          * If the target FPU state is not resident in the CPU registers, just
          * memcpy() from current, else save CPU state directly to the target.
          */
         if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
-                memcpy(&fpu->state, &src_fpu->state,
-                       fpu_kernel_xstate_min_size);
+                /*
+                 * No need to copy if a dynamic feature is in use, because
+                 * both contexts simply point to the same, most recent state.
+                 */
+                if (!cur_fpu->state_ptr)
+                        memcpy(&fpu->state, &cur_fpu->state,
+                               fpu_kernel_xstate_min_size);
         } else {
-                if (fpu->state_mask != src_fpu->state_mask)
-                        fpu->state_mask = src_fpu->state_mask;
                 copy_fpregs_to_fpstate(fpu);
         }
 }
 
+/*
+ * Swap the fpu context to the next fpu role.
+ *
+ * The "current" fpu plays two roles: the user context and the guest
+ * context. Swap "current" to the next role so that dynamic state
+ * buffers are handled correctly, e.g. across preemption.
+ */
+static void kvm_load_next_fpu(struct fpu *next_fpu, u64 mask)
+{
+        struct fpu *cur_fpu = &current->thread.fpu;
+
+        cur_fpu->state_ptr = next_fpu->state_ptr;
+        cur_fpu->state_mask = next_fpu->state_mask;
+
+        __copy_kernel_to_fpregs(__xstate(next_fpu), mask);
+}
+
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
@@ -9243,9 +9265,11 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
         kvm_save_current_fpu(vcpu->arch.user_fpu);
 
+        if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_load)
+                kvm_x86_ops.xfd_load(vcpu);
+
         /* PKRU is separately restored in kvm_x86_ops.run.  */
-        __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
-                                ~XFEATURE_MASK_PKRU);
+        kvm_load_next_fpu(vcpu->arch.guest_fpu, ~XFEATURE_MASK_PKRU);
 
         fpregs_mark_activate();
         fpregs_unlock();
@@ -9260,7 +9284,10 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
         kvm_save_current_fpu(vcpu->arch.guest_fpu);
 
-        copy_kernel_to_fpregs(vcpu->arch.user_fpu);
+        if (static_cpu_has(X86_FEATURE_XFD) && kvm_x86_ops.xfd_put)
+                kvm_x86_ops.xfd_put(vcpu);
+
+        kvm_load_next_fpu(vcpu->arch.user_fpu, -1);
 
         fpregs_mark_activate();
         fpregs_unlock();
@@ -9840,11 +9867,13 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 
 static void fx_init(struct kvm_vcpu *vcpu)
 {
+        struct xregs_state *xsave;
+
+        xsave = __xsave(vcpu->arch.guest_fpu);
         fpstate_init(vcpu->arch.guest_fpu);
         if (boot_cpu_has(X86_FEATURE_XSAVES))
-                vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
+                xsave->header.xcomp_bv =
                         host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
         /*
          * Ensure guest xcr0 is valid for loading
          */
@@ -9920,6 +9949,21 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
                 pr_err("kvm: failed to allocate vcpu's fpu\n");
                 goto free_user_fpu;
         }
+
+        vcpu->arch.guest_fpu->state_mask = xfeatures_mask_all &
+                                           ~xfeatures_mask_user_dynamic;
+
+        /* If there are dynamic features, allocate the full context. */
+        if (xfeatures_mask_user_dynamic) {
+                vcpu->arch.guest_fpu->state_ptr =
+                        kmalloc(fpu_kernel_xstate_max_size, GFP_KERNEL);
+                if (!vcpu->arch.guest_fpu->state_ptr)
+                        goto free_guest_fpu;
+
+                vcpu->arch.guest_fpu->state_mask |=
+                        xfeatures_mask_user_dynamic;
+        }
+
         fx_init(vcpu);
 
         vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
@@ -9936,7 +9980,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
         r = kvm_x86_ops.vcpu_create(vcpu);
         if (r)
-                goto free_guest_fpu;
+                goto free_guest_fpu_exp;
 
         vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
         vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
@@ -9947,6 +9991,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
         vcpu_put(vcpu);
         return 0;
 
+free_guest_fpu_exp:
+        kfree(vcpu->arch.guest_fpu->state_ptr);
 free_guest_fpu:
         kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
 free_user_fpu:
@@ -10002,6 +10048,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
         kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
         free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
         kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+        kfree(vcpu->arch.guest_fpu->state_ptr);
         kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
         kvm_hv_vcpu_uninit(vcpu);
 
-- 
2.18.4
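
Note for reviewers (not part of the patch): to make the "two roles of
current->thread.fpu" idea in the commit message easier to follow, below
is a small, untested userspace sketch in plain C. It is not kernel code;
the struct layout, sizes, feature bit and helper names are illustrative
placeholders, not the kernel's real fpu API. It only models how the
load/put paths hand the state mask and the dynamically allocated buffer
over to "current" before the registers are restored.

/*
 * fpu_roleswap_demo.c - standalone model of the fpu "role swap".
 * All names and sizes are simplified stand-ins for the kernel's
 * struct fpu, state_mask/state_ptr and kvm_load_next_fpu().
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define XSTATE_MIN_SIZE 512        /* stand-in for fpu_kernel_xstate_min_size */
#define XSTATE_MAX_SIZE (8 * 1024) /* stand-in for fpu_kernel_xstate_max_size */

struct fpu {
        uint64_t state_mask;             /* xfeatures tracked by this context */
        void    *state_ptr;              /* full buffer, only for dynamic features */
        uint8_t  state[XSTATE_MIN_SIZE]; /* pre-allocated minimum-size area */
};

/* Pick the dynamically allocated buffer when one exists. */
static void *fpu_active_buffer(struct fpu *fpu)
{
        return fpu->state_ptr ? fpu->state_ptr : fpu->state;
}

/*
 * Model of kvm_load_next_fpu(): "current" takes over the next role's mask
 * and buffer pointer; the kernel would then restore the registers from it
 * with __copy_kernel_to_fpregs().
 */
static void load_next_fpu(struct fpu *cur, struct fpu *next, uint64_t restore_mask)
{
        cur->state_mask = next->state_mask;
        cur->state_ptr  = next->state_ptr;

        printf("restore from %s buffer %p, mask 0x%llx\n",
               next->state_ptr ? "full-size" : "min-size",
               fpu_active_buffer(next),
               (unsigned long long)(next->state_mask & restore_mask));
}

int main(void)
{
        struct fpu current_fpu = { 0 }, user_fpu = { 0 }, guest_fpu = { 0 };

        user_fpu.state_mask  = 0x7;                /* e.g. x87 | SSE | AVX */
        guest_fpu.state_mask = 0x7 | (1ULL << 18); /* plus one dynamic feature bit */
        guest_fpu.state_ptr  = calloc(1, XSTATE_MAX_SIZE);

        /* kvm_load_guest_fpu(): switch "current" from the user role to the guest role. */
        load_next_fpu(&current_fpu, &guest_fpu, ~0ULL);

        /* ...guest runs, vmexits... kvm_put_guest_fpu(): switch back to the user role. */
        load_next_fpu(&current_fpu, &user_fpu, ~0ULL);

        free(guest_fpu.state_ptr);
        return 0;
}

Swapping the pointer and mask rather than copying avoids touching the
large dynamic buffer on every switch; the actual patch additionally keeps
IA32_XFD at zero around the switch so that XSAVES/XRSTORS can reach the
dynamic state.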