On Fri 19-01-18 15:49:24, Kirill A. Shutemov wrote:
> Tetsuo reported random crashes under memory pressure on a 32-bit x86
> system and tracked them down to the change that introduced
> page_vma_mapped_walk().
>
> The root cause of the issue is the faulty pointer math in check_pte().
> As ->pte may point to an arbitrary page, we would have to check that
> both pages belong to the same section before doing the math. Otherwise
> it may lead to weird results.
>
> It wasn't noticed until now as mem_map[] is virtually contiguous on
> flatmem or vmemmap sparsemem: pointer arithmetic just works against all
> 'struct page' pointers. But with classic sparsemem it doesn't, because
> each section's memmap is allocated separately, so consecutive pfns
> crossing two sections might have struct pages at completely unrelated
> addresses.
>
> Let's restructure the code a bit and replace the pointer arithmetic
> with operations on pfns.
>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
> Reported-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx>
> Fixes: ace71a19cec5 ("mm: introduce page_vma_mapped_walk()")
> Cc: stable@xxxxxxxxxxxxxxx

Much better. Thanks!

Acked-by: Michal Hocko <mhocko@xxxxxxxx>

> ---
> v2:
>  - Do not use uninitialized 'pfn' for the !MIGRATION case (Michal)
>
> ---
>  include/linux/swapops.h | 21 +++++++++++++++++
>  mm/page_vma_mapped.c    | 63 +++++++++++++++++++++++++++++--------------------
>  2 files changed, 59 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 9c5a2628d6ce..1d3877c39a00 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -124,6 +124,11 @@ static inline bool is_write_device_private_entry(swp_entry_t entry)
>  	return unlikely(swp_type(entry) == SWP_DEVICE_WRITE);
>  }
>
> +static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
> +{
> +	return swp_offset(entry);
> +}
> +
>  static inline struct page *device_private_entry_to_page(swp_entry_t entry)
>  {
>  	return pfn_to_page(swp_offset(entry));
> @@ -154,6 +159,11 @@ static inline bool is_write_device_private_entry(swp_entry_t entry)
>  	return false;
>  }
>
> +static inline unsigned long device_private_entry_to_pfn(swp_entry_t entry)
> +{
> +	return 0;
> +}
> +
>  static inline struct page *device_private_entry_to_page(swp_entry_t entry)
>  {
>  	return NULL;
> @@ -189,6 +199,11 @@ static inline int is_write_migration_entry(swp_entry_t entry)
>  	return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
>  }
>
> +static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
> +{
> +	return swp_offset(entry);
> +}
> +
>  static inline struct page *migration_entry_to_page(swp_entry_t entry)
>  {
>  	struct page *p = pfn_to_page(swp_offset(entry));
> @@ -218,6 +233,12 @@ static inline int is_migration_entry(swp_entry_t swp)
>  {
>  	return 0;
>  }
> +
> +static inline unsigned long migration_entry_to_pfn(swp_entry_t entry)
> +{
> +	return 0;
> +}
> +
>  static inline struct page *migration_entry_to_page(swp_entry_t entry)
>  {
>  	return NULL;
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index d22b84310f6d..956015614395 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -30,10 +30,29 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
>  	return true;
>  }
>
> +/**
> + * check_pte - check if @pvmw->page is mapped at the @pvmw->pte
> + *
> + * page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
> + * mapped. check_pte() has to validate this.
> + *
> + * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary
> + * page.
> + *
> + * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
> + * entry that points to @pvmw->page or any subpage in case of THP.
> + *
> + * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to
> + * @pvmw->page or any subpage in case of THP.
> + *
> + * Otherwise, return false.
> + *
> + */
>  static bool check_pte(struct page_vma_mapped_walk *pvmw)
>  {
> +	unsigned long pfn;
> +
>  	if (pvmw->flags & PVMW_MIGRATION) {
> -#ifdef CONFIG_MIGRATION
>  		swp_entry_t entry;
>  		if (!is_swap_pte(*pvmw->pte))
>  			return false;
> @@ -41,37 +60,31 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
>
>  		if (!is_migration_entry(entry))
>  			return false;
> -		if (migration_entry_to_page(entry) - pvmw->page >=
> -				hpage_nr_pages(pvmw->page)) {
> -			return false;
> -		}
> -		if (migration_entry_to_page(entry) < pvmw->page)
> -			return false;
> -#else
> -		WARN_ON_ONCE(1);
> -#endif
> -	} else {
> -		if (is_swap_pte(*pvmw->pte)) {
> -			swp_entry_t entry;
>
> -			entry = pte_to_swp_entry(*pvmw->pte);
> -			if (is_device_private_entry(entry) &&
> -			    device_private_entry_to_page(entry) == pvmw->page)
> -				return true;
> -		}
> +		pfn = migration_entry_to_pfn(entry);
> +	} else if (is_swap_pte(*pvmw->pte)) {
> +		swp_entry_t entry;
>
> -		if (!pte_present(*pvmw->pte))
> +		/* Handle un-addressable ZONE_DEVICE memory */
> +		entry = pte_to_swp_entry(*pvmw->pte);
> +		if (!is_device_private_entry(entry))
>  			return false;
>
> -		/* THP can be referenced by any subpage */
> -		if (pte_page(*pvmw->pte) - pvmw->page >=
> -				hpage_nr_pages(pvmw->page)) {
> -			return false;
> -		}
> -		if (pte_page(*pvmw->pte) < pvmw->page)
> +		pfn = device_private_entry_to_pfn(entry);
> +	} else {
> +		if (!pte_present(*pvmw->pte))
>  			return false;
> +
> +		pfn = pte_pfn(*pvmw->pte);
>  	}
>
> +	if (pfn < page_to_pfn(pvmw->page))
> +		return false;
> +
> +	/* THP can be referenced by any subpage */
> +	if (pfn - page_to_pfn(pvmw->page) >= hpage_nr_pages(pvmw->page))
> +		return false;
> +
>  	return true;
>  }
>
> --
> 2.15.1

--
Michal Hocko
SUSE Labs
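
For readers who want to see the sparsemem pitfall in isolation, below is a
minimal, self-contained userspace C sketch of the corrected comparison. It
is not part of the patch: pfn_in_hpage() and the sample pfn values are
invented for illustration, and hpage_nr_pages() is modelled by a plain
count argument.

    #include <assert.h>
    #include <stdbool.h>

    /*
     * Mirrors the two final checks in the new check_pte(): a pte maps the
     * (possibly huge) page iff its pfn falls in [head_pfn, head_pfn + nr).
     * pfns form one global index space, so this comparison stays valid
     * even when the struct pages of neighbouring sections live at
     * completely unrelated addresses.
     */
    static bool pfn_in_hpage(unsigned long pfn, unsigned long head_pfn,
                             unsigned long nr_pages)
    {
            if (pfn < head_pfn)                     /* before the head page */
                    return false;
            return pfn - head_pfn < nr_pages;       /* within the compound page */
    }

    int main(void)
    {
            unsigned long head = 0x80000, nr = 512; /* a 2MB THP: 512 subpages */

            assert(pfn_in_hpage(head, head, nr));        /* head page */
            assert(pfn_in_hpage(head + 511, head, nr));  /* last subpage */
            assert(!pfn_in_hpage(head + 512, head, nr)); /* one past the end */
            assert(!pfn_in_hpage(head - 1, head, nr));   /* before the head */
            return 0;
    }

The removed code performed the equivalent subtraction on struct page
pointers instead, which is only meaningful when mem_map[] is virtually
contiguous, as on flatmem or vmemmap sparsemem.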