Introduce kvm_mmu_notifier_test_and_clear_dirty(), kvm_mmu_notifier_dirty_update() and their mmu_notifier interfaces to support KSM dirty bit tracking, which brings a significant performance gain to volatile page scanning in KSM.

Currently, kvm_mmu_notifier_dirty_update() returns 0 if and only if Intel EPT is enabled, to indicate that the dirty bits of the underlying sptes are not updated by hardware.

Signed-off-by: Nai Xia <nai.xia@xxxxxxxxx>
Acked-by: Izik Eidus <izik.eidus@xxxxxxxxxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/mmu.c              |   36 +++++++++++++++++++++++++++++
 arch/x86/kvm/mmu.h              |    3 +-
 arch/x86/kvm/vmx.c              |    1 +
 include/linux/kvm_host.h        |    2 +-
 include/linux/mmu_notifier.h    |   48 +++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c               |   33 ++++++++++++++++++++++++++
 virt/kvm/kvm_main.c             |   27 ++++++++++++++++++++++
 8 files changed, 149 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2..f0d7aa0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -848,6 +848,7 @@ extern bool kvm_rebooting;
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee3862..a5a0c51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -979,6 +979,37 @@ out:
 	return young;
 }
 
+/*
+ * Caller is supposed to SetPageDirty(); it is not done in this function.
+ */
+static
+int kvm_test_and_clear_dirty_rmapp(struct kvm *kvm, unsigned long *rmapp,
+				   unsigned long data)
+{
+	u64 *spte;
+	int dirty = 0;
+
+	if (!shadow_dirty_mask) {
+		WARN(1, "KVM: do NOT try to test dirty bit in EPT\n");
+		goto out;
+	}
+
+	spte = rmap_next(kvm, rmapp, NULL);
+	while (spte) {
+		int _dirty;
+		u64 _spte = *spte;
+		BUG_ON(!(_spte & PT_PRESENT_MASK));
+		_dirty = _spte & PT_DIRTY_MASK;
+		if (_dirty) {
+			dirty = 1;
+			clear_bit(PT_DIRTY_SHIFT, (unsigned long *)spte);
+		}
+		spte = rmap_next(kvm, rmapp, spte);
+	}
+out:
+	return dirty;
+}
+
 #define RMAP_RECYCLE_THRESHOLD 1000
 
 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1004,6 +1035,11 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
 }
 
+int kvm_test_and_clear_dirty_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, 0, kvm_test_and_clear_dirty_rmapp);
+}
+
 #ifdef MMU_DEBUG
 static int is_empty_shadow_page(u64 *spt)
 {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..b8d01c3 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -18,7 +18,8 @@
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
-#define PT_DIRTY_MASK (1ULL << 6)
+#define PT_DIRTY_SHIFT 6
+#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
 #define PT_PAGE_SIZE_MASK (1ULL << 7)
 #define PT_PAT_MASK (1ULL << 7)
 #define PT_GLOBAL_MASK (1ULL << 8)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60..b407a69 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4674,6 +4674,7 @@ static int __init vmx_init(void)
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
+		kvm_dirty_update = 0;
 	} else
 		kvm_disable_tdp();
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..2036bae 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -53,7 +53,7 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
-
+extern int kvm_dirty_update;
 /*
  * It would be nice to use something smarter than a linear search, TBD...
  * Thankfully we dont expect many devices to register (famous last words :),
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1d1b1e1..bd6ba2d 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -24,6 +24,9 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+	int (*dirty_update)(struct mmu_notifier *mn,
+			    struct mm_struct *mm);
+
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
@@ -72,6 +75,16 @@ struct mmu_notifier_ops {
 			   unsigned long address);
 
 	/*
+	 * test_and_clear_dirty is called when the VM is
+	 * test-and-clearing the dirty/modified bitflag in the
+	 * pte. This way the VM can provide proper volatile page
+	 * testing to ksm.
+	 */
+	int (*test_and_clear_dirty)(struct mmu_notifier *mn,
+				    struct mm_struct *mm,
+				    unsigned long address);
+
+	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
 	 */
@@ -170,11 +183,14 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
 				    struct mm_struct *mm);
 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
+extern int __mmu_notifier_dirty_update(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
+extern int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+					       unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
 				      unsigned long address, pte_t pte);
 extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -184,6 +200,19 @@ extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
 
+/*
+ * For KSM to make use of the dirty bit, it must be sure that the dirty bits
+ * in the sptes really carry the dirty information. Currently only Intel EPT
+ * fails to do so, and is therefore excluded from KSM dirty bit tracking.
+ */
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_dirty_update(mm);
+
+	return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 	if (mm_has_notifiers(mm))
@@ -206,6 +235,14 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+						    unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_and_clear_dirty(mm, address);
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
@@ -323,6 +360,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+static inline int mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	return 1;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 }
@@ -339,6 +381,12 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+						    unsigned long address)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_change_pte(struct mm_struct *mm,
 					   unsigned long address, pte_t pte)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de..a4a1467 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -18,6 +18,22 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 
+int __mmu_notifier_dirty_update(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int dirty_update = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->dirty_update)
+			dirty_update |= mn->ops->dirty_update(mn, mm);
+	}
+	rcu_read_unlock();
+
+	return dirty_update;
+}
+
 /*
  * This function can't run concurrently against mmu_notifier_register
  * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -120,6 +136,23 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 	return young;
 }
 
+int __mmu_notifier_test_and_clear_dirty(struct mm_struct *mm,
+					unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int dirty = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->test_and_clear_dirty)
+			dirty |= mn->ops->test_and_clear_dirty(mn, mm, address);
+	}
+	rcu_read_unlock();
+
+	return dirty;
+}
+
 void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 			       pte_t pte)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 96ebc06..22967c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -78,6 +78,8 @@ static atomic_t hardware_enable_failed;
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
 
+int kvm_dirty_update = 1;
+
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
 struct dentry *kvm_debugfs_dir;
@@ -398,6 +400,23 @@ static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 	return young;
 }
 
+/* Caller should SetPageDirty(), no need to flush tlb */
+static int kvm_mmu_notifier_test_and_clear_dirty(struct mmu_notifier *mn,
+						 struct mm_struct *mm,
+						 unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int dirty, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	dirty = kvm_test_and_clear_dirty_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return dirty;
+}
+
 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
@@ -409,14 +428,22 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+static int kvm_mmu_notifier_dirty_update(struct mmu_notifier *mn,
+					 struct mm_struct *mm)
+{
+	return kvm_dirty_update;
+}
+
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.test_young		= kvm_mmu_notifier_test_young,
+	.test_and_clear_dirty	= kvm_mmu_notifier_test_and_clear_dirty,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
+	.dirty_update		= kvm_mmu_notifier_dirty_update,
 };
 
 static int kvm_init_mmu_notifier(struct kvm *kvm)
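
For context, a minimal sketch of how a KSM-side caller might consume the two new hooks. This is illustrative only and not part of the patch: ksm_hva_was_written() and its placement are hypothetical; only mmu_notifier_dirty_update() and mmu_notifier_test_and_clear_dirty() come from the interfaces above.

#include <linux/mm_types.h>
#include <linux/mmu_notifier.h>

/*
 * Hypothetical KSM-side helper (illustration only): decide whether the
 * page mapped at @addr was written since the last scan.
 */
static int ksm_hva_was_written(struct mm_struct *mm, unsigned long addr)
{
	/*
	 * If a registered notifier cannot keep dirty bits up to date in
	 * its secondary ptes (kvm_mmu_notifier_dirty_update() returns 0
	 * when Intel EPT is enabled), the report below would be
	 * unreliable, so conservatively treat the page as written.
	 */
	if (!mmu_notifier_dirty_update(mm))
		return 1;

	/*
	 * Test and clear the dirty bits of all sptes mapping this hva.
	 * The caller is expected to SetPageDirty() on the struct page
	 * when this returns nonzero; no tlb flush is needed.
	 */
	return mmu_notifier_test_and_clear_dirty(mm, addr);
}

The dirty_update() check matters because under EPT the sptes carry no hardware dirty bits (shadow_dirty_mask is 0), so test_and_clear_dirty() would always report clean and a scanner relying on it alone would wrongly treat written pages as stable.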