Because inside the mmu_notifier callback we do not have access to the vma, nor do we know which lock we are holding (the mmap semaphore or the i_mmap_lock), we cannot rely on the regular page table walk (nor do we want to, as we have to be careful not to split huge pages). So this patch introduces a helper to iterate over the CPU page table content in an efficient way for the situation we are in, which is that we know none of the page table entries can vanish from under us, and thus it is safe to walk the page table. The only added value of the iterator is that it keeps the page table entry level mapped across calls, which fits well with the HMM mirror page table update code. Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx> --- mm/hmm.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/mm/hmm.c b/mm/hmm.c index a9bdab5..74e429a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -406,6 +406,107 @@ static struct mmu_notifier_ops hmm_notifier_ops = { }; +struct mm_pt_iter { + struct mm_struct *mm; + pte_t *ptep; + unsigned long addr; +}; + +static void mm_pt_iter_init(struct mm_pt_iter *pt_iter, struct mm_struct *mm) +{ + pt_iter->mm = mm; + pt_iter->ptep = NULL; + pt_iter->addr = -1UL; +} + +static void mm_pt_iter_fini(struct mm_pt_iter *pt_iter) +{ + pte_unmap(pt_iter->ptep); + pt_iter->ptep = NULL; + pt_iter->addr = -1UL; + pt_iter->mm = NULL; +} + +static inline bool mm_pt_iter_in_range(struct mm_pt_iter *pt_iter, + unsigned long addr) +{ + return (addr >= pt_iter->addr && addr < (pt_iter->addr + PMD_SIZE)); +} + +static struct page *mm_pt_iter_page(struct mm_pt_iter *pt_iter, + unsigned long addr) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + +again: + /* + * What we are doing here is only valid if we hold either the mmap + * semaphore or the i_mmap_lock of vma->address_space the address + * belongs to. Sadly because we can not easily get the vma struct + * we can not sanity test that either of those lock is taken. 
+ * + * We have to rely on people using this code knowing what they do. + */ + if (mm_pt_iter_in_range(pt_iter, addr) && likely(pt_iter->ptep)) { + pte_t pte = *(pt_iter->ptep + pte_index(addr)); + unsigned long pfn; + + if (pte_none(pte) || !pte_present(pte)) + return NULL; + if (unlikely(pte_special(pte))) + return NULL; + + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn)) + return NULL; + return pfn_to_page(pfn); + } + + if (pt_iter->ptep) { + pte_unmap(pt_iter->ptep); + pt_iter->ptep = NULL; + pt_iter->addr = -1UL; + } + + pgdp = pgd_offset(pt_iter->mm, addr); + if (pgd_none_or_clear_bad(pgdp)) + return NULL; + pudp = pud_offset(pgdp, addr); + if (pud_none_or_clear_bad(pudp)) + return NULL; + pmdp = pmd_offset(pudp, addr); + /* + * Because we either have the mmap semaphore or the i_mmap_lock we know + * that pmd can not vanish from under us, thus if pmd exist then it is + * either a huge page or a valid pmd. It might also be in the splitting + * transitory state. + */ + if (pmd_none(*pmdp) || unlikely(pmd_bad(*pmdp))) + return NULL; + if (pmd_trans_huge(*pmdp)) { + spinlock_t *ptl; + + ptl = pmd_lock(pt_iter->mm, pmdp); + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + page = pmd_page(*pmdp) + pte_index(addr); + spin_unlock(ptl); + return page; + } + /* It was morphing from thp to regular, try again. */ + spin_unlock(ptl); + goto again; + } + /* Regular pmd and it can not morph. */ + pt_iter->ptep = pte_offset_map(pmdp, addr & PMD_MASK); + pt_iter->addr = addr & PMD_MASK; + goto again; +} + + /* hmm_mirror - per device mirroring functions. * * Each device that mirror a process has a uniq hmm_mirror struct. A process -- 2.4.3 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>