On Thu, Sep 03, 2020 at 05:55:59PM +0100, Matthew Wilcox wrote:
> On Thu, Sep 03, 2020 at 01:40:32PM -0300, Jason Gunthorpe wrote:
> > However if the sizeof(*pXX) is 8 on a 32 bit platform then load
> > tearing is a problem. At least the various pXX_*() test functions
> > operate on a single 32 bit word so don't tear, but to convert the
> > *pXX to a lower level page table pointer a coherent, untorn, read
> > is required.
> >
> > So, looking again, I remember now, I could never quite figure out
> > why gup_pmd_range() was safe to do:
> >
> >                 pmd_t pmd = READ_ONCE(*pmdp);
> > [..]
> >                 } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
> > [..]
> >         ptem = ptep = pte_offset_map(&pmd, addr);
> >
> > As I don't see what prevents load tearing a 64 bit pmd.. Eg no
> > pmd_trans_unstable() or equivalent here.
>
> I don't think there are any 32-bit page tables which support a
> PUD-sized page. Pretty sure x86 doesn't until you get to 4- or
> 5-level page tables (which need you to be running in 64-bit mode).
> There's not much utility in having 1GB of your 3GB process address
> space taken up by a single page.

That makes sense for the PUD, but why is the above GUP code OK for the
PMD? pmd_trans_unstable() exists specifically to close these
read-tearing races, so it looks like a real problem?

> I'm OK if there are some oddball architectures which support it, but
> Linux doesn't.

So, based on that observation, I think something approximately like
the patch below is needed in the page walker for the PUD. (This has
been on my backlog to return to these patches..)
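Before the patch, to make the tearing concern concrete, here is a
stand-alone sketch (hypothetical user-space C, not from any kernel
tree; wide_pmd_t and torn_read() are invented names) of how a plain
dereference of an 8-byte entry becomes two 4-byte loads on a 32-bit
machine:

  #include <stdint.h>

  /*
   * On a 32-bit CPU an 8-byte page table entry (eg a PAE pmd) cannot
   * be loaded with a single instruction, so a plain dereference is
   * free to compile into two 4-byte loads.
   */
  typedef struct { uint64_t val; } wide_pmd_t; /* stand-in for a 64-bit pmd_t */

  static uint64_t torn_read(volatile wide_pmd_t *p)
  {
          volatile uint32_t *half = (volatile uint32_t *)&p->val;

          uint32_t lo = half[0];  /* load #1 */
          /* a concurrent set_pmd_at()-style store can land here... */
          uint32_t hi = half[1];  /* load #2 */

          /*
           * ...so lo and hi can come from two different entries,
           * yielding a value that was never actually stored.
           */
          return ((uint64_t)hi << 32) | lo;
  }

Note that READ_ONCE() only stops the compiler from splitting or
re-issuing an access it could do in one instruction; it cannot make an
8-byte load single-copy atomic on a 32-bit CPU. That is exactly the
open question about gup_pmd_range() above, and why the patch also has
locked callbacks re-check the value and bounce through ACTION_AGAIN.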
From 00a361ecb2d9e1226600d9e78e6e1803a886f2d6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
Date: Fri, 13 Mar 2020 13:15:36 -0300
Subject: [RFC] mm/pagewalk: use READ_ONCE when reading the PUD entry unlocked

The pagewalker runs while only holding the mmap_sem for read. The pud
can be set asynchronously, while also holding the mmap_sem for read,
eg from:

  handle_mm_fault()
    __handle_mm_fault()
      create_huge_pmd()
        dev_dax_huge_fault()
          __dev_dax_pud_fault()
            vmf_insert_pfn_pud()
              insert_pfn_pud()
                pud_lock()
                set_pud_at()

At least x86 sets the PUD using WRITE_ONCE(), so an unlocked read of
unstable data should be paired with READ_ONCE().

For the pagewalker to work locklessly the PUD must work similarly to
the PMD: once the PUD entry becomes a pointer to a PMD, it must be
stable, and safe to pass to pmd_offset().

Passing the value from READ_ONCE into the callbacks prevents the
callers from seeing inconsistencies after they re-read, such as seeing
pud_none(). If a callback does obtain the pud_lock then it should
trigger ACTION_AGAIN if a data race caused the original value to
change.

Use the same pattern as gup_pmd_range() and pass in the address of the
local READ_ONCE stack variable to pmd_offset() to avoid reading it
again.

Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx>
---
 include/linux/pagewalk.h   |  2 +-
 mm/hmm.c                   | 16 +++++++---------
 mm/mapping_dirty_helpers.c |  6 ++----
 mm/pagewalk.c              | 28 ++++++++++++++++------------
 mm/ptdump.c                |  3 +--
 5 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb53..6caf28aadafbff 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -39,7 +39,7 @@ struct mm_walk_ops {
                           unsigned long next, struct mm_walk *walk);
         int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
                          unsigned long next, struct mm_walk *walk);
-        int (*pud_entry)(pud_t *pud, unsigned long addr,
+        int (*pud_entry)(pud_t pud, pud_t *pudp, unsigned long addr,
                          unsigned long next, struct mm_walk *walk);
         int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
                          unsigned long next, struct mm_walk *walk);
diff --git a/mm/hmm.c b/mm/hmm.c
index 6d9da4b0f0a9f8..98ced96421b913 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -459,28 +459,26 @@ static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
                                 range->flags[HMM_PFN_VALID];
 }
 
-static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
-                struct mm_walk *walk)
+static int hmm_vma_walk_pud(pud_t pud, pud_t *pudp, unsigned long start,
+                            unsigned long end, struct mm_walk *walk)
 {
         struct hmm_vma_walk *hmm_vma_walk = walk->private;
         struct hmm_range *range = hmm_vma_walk->range;
         unsigned long addr = start;
-        pud_t pud;
         int ret = 0;
         spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
 
         if (!ptl)
                 return 0;
+        if (memcmp(pudp, &pud, sizeof(pud)) != 0) {
+                walk->action = ACTION_AGAIN;
+                spin_unlock(ptl);
+                return 0;
+        }
 
         /* Normally we don't want to split the huge page */
         walk->action = ACTION_CONTINUE;
 
-        pud = READ_ONCE(*pudp);
-        if (pud_none(pud)) {
-                spin_unlock(ptl);
-                return hmm_vma_walk_hole(start, end, -1, walk);
-        }
-
         if (pud_huge(pud) && pud_devmap(pud)) {
                 unsigned long i, npages, pfn;
                 uint64_t *pfns, cpu_flags;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 71070dda9643d4..8943c2509ec0f7 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -125,12 +125,10 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
 }
 
 /* wp_clean_pud_entry - The pagewalk pud callback.
  */
-static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
-                              struct mm_walk *walk)
+static int wp_clean_pud_entry(pud_t pudval, pud_t *pudp, unsigned long addr,
+                              unsigned long end, struct mm_walk *walk)
 {
         /* Dirty-tracking should be handled on the pte level */
-        pud_t pudval = READ_ONCE(*pud);
-
         if (pud_trans_huge(pudval) || pud_devmap(pudval))
                 WARN_ON(pud_write(pudval) || pud_dirty(pudval));
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 928df1638c30d1..cf99536cec23be 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
         return err;
 }
 
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                           struct mm_walk *walk)
 {
         pmd_t *pmd;
@@ -67,7 +67,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
         int err = 0;
         int depth = real_depth(3);
 
-        pmd = pmd_offset(pud, addr);
+        pmd = pmd_offset(&pud, addr);
         do {
 again:
                 next = pmd_addr_end(addr, end);
@@ -119,17 +119,19 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                           struct mm_walk *walk)
 {
-        pud_t *pud;
+        pud_t *pudp;
+        pud_t pud;
         unsigned long next;
         const struct mm_walk_ops *ops = walk->ops;
         int err = 0;
         int depth = real_depth(2);
 
-        pud = pud_offset(p4d, addr);
+        pudp = pud_offset(p4d, addr);
         do {
 again:
+                pud = READ_ONCE(*pudp);
                 next = pud_addr_end(addr, end);
-                if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+                if (pud_none(pud) || (!walk->vma && !walk->no_vma)) {
                         if (ops->pte_hole)
                                 err = ops->pte_hole(addr, next, depth, walk);
                         if (err)
@@ -140,27 +142,29 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                 walk->action = ACTION_SUBTREE;
 
                 if (ops->pud_entry)
-                        err = ops->pud_entry(pud, addr, next, walk);
+                        err = ops->pud_entry(pud, pudp, addr, next, walk);
                 if (err)
                         break;
 
                 if (walk->action == ACTION_AGAIN)
                         goto again;
 
-                if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+                if ((!walk->vma && (pud_leaf(pud) || !pud_present(pud))) ||
                     walk->action == ACTION_CONTINUE ||
                     !(ops->pmd_entry || ops->pte_entry))
                         continue;
 
-                if (walk->vma)
-                        split_huge_pud(walk->vma, pud, addr);
-                if (pud_none(*pud))
-                        goto again;
+                if (walk->vma) {
+                        split_huge_pud(walk->vma, pudp, addr);
+                        pud = READ_ONCE(*pudp);
+                        if (pud_none(pud))
+                                goto again;
+                }
 
                 err = walk_pmd_range(pud, addr, next, walk);
                 if (err)
                         break;
-        } while (pud++, addr = next, addr != end);
+        } while (pudp++, addr = next, addr != end);
 
         return err;
 }
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 26208d0d03b7a9..c5e1717671e36a 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -59,11 +59,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
         return 0;
 }
 
-static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+static int ptdump_pud_entry(pud_t val, pud_t *pudp, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
 {
         struct ptdump_state *st = walk->private;
-        pud_t val = READ_ONCE(*pud);
 
 #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
         if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
-- 
2.28.0
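For reference, since the commit message points at it: the
gup_pmd_range() pattern being copied looks roughly like this (an
abridged paraphrase of mm/gup.c from around this time, with the huge
page and devmap branches elided; details vary by kernel version):

  static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                           unsigned int flags, struct page **pages, int *nr)
  {
          unsigned long next;
          pmd_t *pmdp;

          /*
           * The caller read the pud once and passed it by value;
           * pmd_offset() takes the address of that local copy, so the
           * pud entry is never dereferenced again on this path.
           */
          pmdp = pmd_offset(&pud, addr);
          do {
                  /*
                   * One snapshot per pmd entry; every later test and
                   * the call to gup_pte_range() use only this local
                   * value, never *pmdp.
                   */
                  pmd_t pmd = READ_ONCE(*pmdp);

                  next = pmd_addr_end(addr, end);
                  if (!pmd_present(pmd))
                          return 0;
                  /* ... THP / devmap cases elided ... */
                  if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
                          return 0;
          } while (pmdp++, addr = next, addr != end);

          return 1;
  }

The patched walk_pud_range() above has the same shape: one READ_ONCE
snapshot per iteration, the value passed down by value, and the
pointer retained only so locked callbacks can re-validate against the
snapshot and request ACTION_AGAIN.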