Currently we flush the TLB while holding mmu_lock. This increases the lock hold time by the IPI round-trip time, increasing contention, and makes dropping the lock (for latency reasons) harder. This patch changes TLB management to be usable locklessly, introducing the following APIs: kvm_mark_tlb_dirty() - mark the TLB as containing stale entries kvm_cond_flush_remote_tlbs() - flush the TLB if it was marked as dirty These APIs can be used without holding mmu_lock (though if the TLB became stale due to shadow page table modifications, typically it will need to be called with the lock held to prevent other threads from seeing the modified page tables with the TLB unmarked and unflushed). Signed-off-by: Avi Kivity <avi@xxxxxxxxxx> --- Documentation/virtual/kvm/locking.txt | 14 ++++++++++++++ arch/x86/kvm/paging_tmpl.h | 4 ++-- include/linux/kvm_host.h | 22 +++++++++++++++++++++- virt/kvm/kvm_main.c | 29 ++++++++++++++++++++--------- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index 3b4cd3b..f6c90479 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -23,3 +23,17 @@ Arch: x86 Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - tsc offset in vmcb Comment: 'raw' because updating the tsc offsets must not be preempted. + +3. TLB control +-------------- + +The following APIs should be used for TLB control: + + - kvm_mark_tlb_dirty() - indicates that the TLB is out of sync wrt + either guest or host page tables. + - kvm_flush_remote_tlbs() - unconditionally flush the tlbs + - kvm_cond_flush_remote_tlbs() - flush the TLBs if previously marked + +These may be used without mmu_lock, though kvm_mark_tlb_dirty() needs to be +used while holding mmu_lock if it is called due to host page table changes +(contrast to guest page table changes). 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f9709..97e2a81 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -793,7 +793,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - vcpu->kvm->tlbs_dirty++; + kvm_mark_tlb_dirty(vcpu->kvm); continue; } @@ -806,7 +806,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - vcpu->kvm->tlbs_dirty++; + kvm_mark_tlb_dirty(vcpu->kvm); continue; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6f34330..4bff05d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -310,7 +310,14 @@ struct kvm { unsigned long mmu_notifier_seq; long mmu_notifier_count; #endif - long tlbs_dirty; + struct { + /* + * When these two are different, a TLB somewhere holds a + * stale TLB entry. Clean with kvm_[cond_]flush_remote_tlbs(). + */ + atomic_long_t dirtied_count; + atomic_long_t flushed_count; + } tlb_state; }; /* The guest did something we don't support. */ @@ -467,6 +474,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); +void kvm_cond_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, @@ -888,5 +896,17 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } } +/* + * Mark the TLB as dirty, for kvm_cond_flush_remote_tlbs(). + */ +static inline void kvm_mark_tlb_dirty(struct kvm *kvm) +{ + /* + * Make any changes to the page tables visible to remote flushers. 
+ */ + smp_mb__before_atomic_inc(); + atomic_long_inc(&kvm->tlb_state.dirtied_count); +} + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1847c76..643ce01 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -203,12 +203,21 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) void kvm_flush_remote_tlbs(struct kvm *kvm) { - long dirty_count = kvm->tlbs_dirty; + long dirty_count = atomic_long_read(&kvm->tlb_state.dirtied_count); + long flushed_count = atomic_long_read(&kvm->tlb_state.flushed_count); smp_mb(); if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; - cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); + atomic_long_cmpxchg(&kvm->tlb_state.flushed_count, + flushed_count, dirty_count); +} + +void kvm_cond_flush_remote_tlbs(struct kvm *kvm) +{ + if (atomic_long_read(&kvm->tlb_state.dirtied_count) + != atomic_long_read(&kvm->tlb_state.flushed_count)) + kvm_flush_remote_tlbs(kvm); } void kvm_reload_remote_mmus(struct kvm *kvm) @@ -267,7 +276,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, unsigned long address) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush, idx; + int idx; /* * When ->invalidate_page runs, the linux pte has been zapped @@ -291,10 +300,10 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; + if (kvm_unmap_hva(kvm, address)) + kvm_mark_tlb_dirty(kvm); /* we've to flush the tlb before the pages can be freed */ - if (need_tlb_flush) - kvm_flush_remote_tlbs(kvm); + kvm_cond_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); @@ -334,10 +343,12 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mmu_notifier_count++; for (; start < end; start += PAGE_SIZE) need_tlb_flush |= kvm_unmap_hva(kvm, start); - need_tlb_flush |= 
kvm->tlbs_dirty; - /* we've to flush the tlb before the pages can be freed */ + if (need_tlb_flush) - kvm_flush_remote_tlbs(kvm); + kvm_mark_tlb_dirty(kvm); + + /* we've to flush the tlb before the pages can be freed */ + kvm_cond_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); -- 1.7.10 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html