----- junaids@xxxxxxxxxx wrote:

> When PCIDs are enabled, the MSb of the source operand for a MOV-to-CR3
> instruction indicates that the TLB doesn't need to be flushed.
>
> This change enables this optimization for MOV-to-CR3s in the guest
> that have been intercepted by KVM for shadow paging and are handled
> within the fast CR3 switch path.
>
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  2 +-
>  arch/x86/kvm/mmu.c              | 10 ++++++----
>  arch/x86/kvm/x86.c              | 11 ++++++++---
>  3 files changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 9b30003b4429..9f97ed12c951 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1305,7 +1305,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
>                         void *insn, int insn_len);
>  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
>  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3);
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3, bool skip_tlb_flush);
>
>  void kvm_enable_tdp(void);
>  void kvm_disable_tdp(void);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 1258b8e4718b..7ecd25a21534 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -4014,7 +4014,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
>          context->nx = false;
>  }
>
> -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3,
> +                            bool skip_tlb_flush)
>  {
>          struct kvm_mmu *mmu = &vcpu->arch.mmu;
>          gpa_t new_cr3 = kvm_read_cr3(vcpu);
> @@ -4050,7 +4051,8 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>                          kvm_mmu_sync_roots(vcpu);
>                          __clear_sp_write_flooding_count(
>                                  page_header(mmu->root_hpa));
> -                        mmu->set_cr3(vcpu, mmu->root_hpa | pcid, false);
> +                        mmu->set_cr3(vcpu, mmu->root_hpa | pcid,
> +                                     skip_tlb_flush);
>                          return true;
>                  }
>          }
> @@ -4058,9 +4060,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t old_cr3)
>          return false;
>  }
>
> -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3)
> +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t old_cr3, bool skip_tlb_flush)
>  {
> -        if (!fast_cr3_switch(vcpu, old_cr3))
> +        if (!fast_cr3_switch(vcpu, old_cr3, skip_tlb_flush))
>                  kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
>
>          vcpu->arch.mmu.prev_cr3 = old_cr3;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 5a19d220a9c3..baeb8447ede2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -843,17 +843,22 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
>  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>  {
>          unsigned long old_cr3 = kvm_read_cr3(vcpu);
> +        bool skip_tlb_flush = false;
>
>  #ifdef CONFIG_X86_64
>          bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
>
> -        if (pcid_enabled)
> +        if (pcid_enabled) {
> +                skip_tlb_flush = cr3 & CR3_PCID_INVD;
>                  cr3 &= ~CR3_PCID_INVD;
> +        }
>  #endif
>
>          if (cr3 == old_cr3 && !pdptrs_changed(vcpu)) {
>                  kvm_mmu_sync_roots(vcpu);
> -                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> +
> +                if (!skip_tlb_flush)
> +                        kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>                  return 0;
>          }
>
> @@ -866,7 +871,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
>
>          vcpu->arch.cr3 = cr3;
>          __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
> -        kvm_mmu_new_cr3(vcpu, old_cr3);
> +        kvm_mmu_new_cr3(vcpu, old_cr3, skip_tlb_flush);
>
>          return 0;
>  }
> --
> 2.17.0.441.gb46fe60e1d-goog

I think this patch breaks the INVPCID handling of a previous patch in the series.

In patch 7 ("kvm: vmx: Support INVPCID in shadow paging mode"), INVPCID was implemented such that it only syncs SPTs (shadow page tables) and flushes the TLB when INVPCID is invoked with the currently active PCID. This is because it assumes that on every guest CR3 switch the host will both sync SPTs and flush the TLB, since the guest CR3 switch causes the host to switch CR3 as well. However, this patch partially breaks that assumption by not flushing TLB entries when switching between "current_cr3" and "prev_cr3" with bit CR3_PCID_INVD set.

I am thinking about the following scenario. Let's assume {current_cr3=X, current_pcid=A, prev_cr3=Y, prev_pcid=B}. What if the guest now executes INVPCID for address G and PCID B (individual-address invalidation), and then immediately switches CR3 to value Y (==prev_cr3) with bit CR3_PCID_INVD set?

In this case, the INVPCID handling does nothing because current_pcid != B, but we also don't flush the TLB when the CR3 switch to prev_cr3=Y is done. Therefore, the physical TLB could still hold a stale entry that maps G under PCID B, in contrast to a real CPU, which would honor the INVPCID.
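
To make the sequence concrete, here is a rough guest-side sketch of what I have in mind. This is only an illustration, not code from the series: invpcid_flush_one()/native_write_cr3()/X86_CR3_PCID_NOFLUSH are stand-ins for the raw INVPCID and MOV-to-CR3 operations, and prev_cr3_y/addr_g/pcid_b correspond to the Y/G/B placeholders above.

#include <asm/tlbflush.h>          /* invpcid_flush_one() */
#include <asm/special_insns.h>     /* native_write_cr3() */
#include <asm/processor-flags.h>   /* X86_CR3_PCID_NOFLUSH (bit 63) */

/*
 * Illustrative guest-side sequence.  The guest is currently running
 * with CR3 = X / PCID = A, while prev_cr3_y carries PCID B and addr_g
 * still has a translation cached in the TLB under PCID B.
 */
static void guest_stale_tlb_sequence(unsigned long prev_cr3_y,
                                     unsigned long addr_g,
                                     unsigned long pcid_b)
{
        /*
         * Individual-address invalidation for (PCID B, G).  Since
         * current_pcid == A != B, the INVPCID handler from patch 7
         * neither syncs SPTs nor flushes the TLB.
         */
        invpcid_flush_one(pcid_b, addr_g);

        /*
         * MOV to CR3 back to Y with bit 63 set (CR3_PCID_INVD in KVM,
         * X86_CR3_PCID_NOFLUSH on the guest side).  With this patch
         * the fast CR3 switch path passes skip_tlb_flush == true, so
         * the host does not flush here either, and the stale
         * translation for G under PCID B remains usable.
         */
        native_write_cr3(prev_cr3_y | X86_CR3_PCID_NOFLUSH);
}

If that reading is right, the fast CR3 switch path probably needs to either flush when returning to prev_cr3, or the INVPCID handler needs to also act on the PCID cached in prev_cr3.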