Re: [PATCH -mm v9 5/8] mmu-notifier: add clear_young callback

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On Sun, Jul 19, 2015 at 5:31 AM, Vladimir Davydov <vdavydov@xxxxxxxxxxxxx> wrote:
In the scope of the idle memory tracking feature, which is introduced by
the following patch, we need to clear the referenced/accessed bit not
only in primary, but also in secondary ptes. The latter is required in
order to estimate wss of KVM VMs. At the same time we want to avoid
flushing tlb, because it is quite expensive and it won't really affect
the final result.

Currently, there is no function for clearing pte young bit that would
meet our requirements, so this patch introduces one. To achieve that we
have to add a new mmu-notifier callback, clear_young, since there is no
method for testing-and-clearing a secondary pte w/o flushing tlb. The
new method is not mandatory and currently only implemented by KVM.

Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
Reviewed-by: Andres Lagar-Cavilla <andreslc@xxxxxxxxxx>
Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
 include/linux/mmu_notifier.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
 mm/mmu_notifier.c            | 17 +++++++++++++++++
 virt/kvm/kvm_main.c          | 18 ++++++++++++++++++
 3 files changed, 79 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 61cd67f4d788..a5b17137c683 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -66,6 +66,16 @@ struct mmu_notifier_ops {
                                 unsigned long end);

        /*
+        * clear_young is a lightweight version of clear_flush_young. Like the
+        * latter, it is supposed to test-and-clear the young/accessed bitflag
+        * in the secondary pte, but it may omit flushing the secondary tlb.
+        */
+       int (*clear_young)(struct mmu_notifier *mn,
+                          struct mm_struct *mm,
+                          unsigned long start,
+                          unsigned long end);
+
+       /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
@@ -203,6 +213,9 @@ extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+                                     unsigned long start,
+                                     unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -231,6 +244,15 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return 0;
 }

+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+                                          unsigned long start,
+                                          unsigned long end)
+{
+       if (mm_has_notifiers(mm))
+               return __mmu_notifier_clear_young(mm, start, end);
+       return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
 {
@@ -311,6 +333,28 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
        __young;                                                        \
 })

+#define ptep_clear_young_notify(__vma, __address, __ptep)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PAGE_SIZE);    \
+       __young;                                                        \
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)              \
+({                                                                     \
+       int __young;                                                    \
+       struct vm_area_struct *___vma = __vma;                          \
+       unsigned long ___address = __address;                           \
+       __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+       __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,  \
+                                           ___address + PMD_SIZE);     \
+       __young;                                                        \
+})
+
 #define        ptep_clear_flush_notify(__vma, __address, __ptep)               \
 ({                                                                     \
        unsigned long ___addr = __address & PAGE_MASK;                  \
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 3b9b3d0741b2..5fbdd367bbed 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
        return young;
 }

+int __mmu_notifier_clear_young(struct mm_struct *mm,
+                              unsigned long start,
+                              unsigned long end)
+{
+       struct mmu_notifier *mn;
+       int young = 0, id;
+
+       id = srcu_read_lock(&srcu);
+       hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+               if (mn->ops->clear_young)
+                       young |= mn->ops->clear_young(mn, mm, start, end);
+       }
+       srcu_read_unlock(&srcu, id);
+
+       return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b8a44453670..ff4173ce6924 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -387,6 +387,23 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
        return young;
 }

+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long start,
+                                       unsigned long end)
+{
+       struct kvm *kvm = mmu_notifier_to_kvm(mn);
+       int young, idx;
+
If you need to cut out another version please add comments as to the two issues raised:
- This doesn't proactively flush TLBs -- not obvious if it should.
- This adversely affects performance in Pre_haswell Intel EPT.

Thanks
Andres 
+       idx = srcu_read_lock(&kvm->srcu);
+       spin_lock(&kvm->mmu_lock);
+       young = kvm_age_hva(kvm, start, end);
+       spin_unlock(&kvm->mmu_lock);
+       srcu_read_unlock(&kvm->srcu, idx);
+
+       return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
@@ -419,6 +436,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
+       .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
--
2.1.4




--
Andres Lagar-Cavilla | Google Kernel Team | andreslc@xxxxxxxxxx 

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]