On 26/05/2017 17:30, Paolo Bonzini wrote:
>
>
> On 26/05/2017 02:47, Andy Lutomirski wrote:
>> When PCID is enabled, CR3's PCID bits can change during context
>> switches, so KVM won't be able to treat CR3 as a per-mm constant any
>> more.
>>
>> I structured this like the existing CR4 handling. Under ordinary
>> circumstances (PCID disabled or if the current PCID and the value
>> that's already in the VMCS match), then we won't do an extra VMCS
>> write, and we'll never do an extra direct CR3 read. The overhead
>> should be minimal.
>>
>> I disallowed using the new helper in non-atomic context because
>> PCID support will cause CR3 to stop being constant in non-atomic
>> process context.
>>
>> (Frankly, it also scares me a bit that KVM ever treated CR3 as
>> constant, but it looks like it was okay before.)
>>
>> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
>> Cc: kvm@xxxxxxxxxxxxxxx
>> Cc: Rik van Riel <riel@xxxxxxxxxx>
>> Cc: Dave Hansen <dave.hansen@xxxxxxxxx>
>> Cc: Nadav Amit <namit@xxxxxxxxxx>
>> Cc: Michal Hocko <mhocko@xxxxxxxx>
>> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
>> Cc: Arjan van de Ven <arjan@xxxxxxxxxxxxxxx>
>> Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
>> ---
>>  arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
>>  arch/x86/kvm/vmx.c | 21 ++++++++++++++++++---
>>  2 files changed, 37 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
>> index 187c39470a0b..f20d7ea47095 100644
>> --- a/arch/x86/include/asm/mmu_context.h
>> +++ b/arch/x86/include/asm/mmu_context.h
>> @@ -266,4 +266,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
>>  	return __pkru_allows_pkey(vma_pkey(vma), write);
>>  }
>>
>> +
>> +/*
>> + * This can be used from process context to figure out what the value of
>> + * CR3 is without needing to do a (slow) read_cr3().
>> + *
>> + * It's intended to be used for code like KVM that sneakily changes CR3
>> + * and needs to restore it. It needs to be used very carefully.
>> + */
>> +static inline unsigned long __get_current_cr3_fast(void)
>> +{
>> +	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
>> +
>> +	/* For now, be very restrictive about when this can be called. */
>> +	VM_WARN_ON(in_nmi() || !in_atomic());
>> +
>> +	VM_BUG_ON(cr3 != read_cr3());
>> +	return cr3;
>> +}
>> +
>>  #endif /* _ASM_X86_MMU_CONTEXT_H */
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index 72f78396bc09..b7b36c9ffa3d 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -48,6 +48,7 @@
>>  #include <asm/kexec.h>
>>  #include <asm/apic.h>
>>  #include <asm/irq_remapping.h>
>> +#include <asm/mmu_context.h>
>>
>>  #include "trace.h"
>>  #include "pmu.h"
>> @@ -596,6 +597,7 @@ struct vcpu_vmx {
>>  		int gs_ldt_reload_needed;
>>  		int fs_reload_needed;
>>  		u64 msr_host_bndcfgs;
>> +		unsigned long vmcs_host_cr3; /* May not match real cr3 */
>>  		unsigned long vmcs_host_cr4; /* May not match real cr4 */
>>  	} host_state;
>>  	struct {
>> @@ -5012,12 +5014,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
>>  	u32 low32, high32;
>>  	unsigned long tmpl;
>>  	struct desc_ptr dt;
>> -	unsigned long cr0, cr4;
>> +	unsigned long cr0, cr3, cr4;
>>
>>  	cr0 = read_cr0();
>>  	WARN_ON(cr0 & X86_CR0_TS);
>>  	vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
>> -	vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
>> +
>> +	/*
>> +	 * Save the most likely value for this task's CR3 in the VMCS.
>> +	 * We can't use __get_current_cr3_fast() because we're not atomic.
>> +	 */
>> +	cr3 = read_cr3();
>> +	vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
>> +	vmx->host_state.vmcs_host_cr3 = cr3;
>>
>>  	/* Save the most likely value for this task's CR4 in the VMCS. */
>>  	cr4 = cr4_read_shadow();
>> @@ -8843,7 +8852,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
>>  static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>  {
>>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> -	unsigned long debugctlmsr, cr4;
>> +	unsigned long debugctlmsr, cr3, cr4;
>>
>>  	/* Don't enter VMX if guest state is invalid, let the exit handler
>>  	   start emulation until we arrive back to a valid state */
>> @@ -8865,6 +8874,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>>  	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
>>  		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
>>
>> +	cr3 = __get_current_cr3_fast();
>> +	if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
>> +		vmcs_writel(HOST_CR3, cr3);
>> +		vmx->host_state.vmcs_host_cr3 = cr3;
>> +	}
>> +
>>  	cr4 = cr4_read_shadow();
>>  	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
>>  		vmcs_writel(HOST_CR4, cr4);
>>
>
> Queued, thanks. If anybody needs a topic branch, please holler.

Ah, no, it depends on the others. Note to self, compile first, answer
second.

Paolo