Minchan Kim <minchan@xxxxxxxxxx> writes: > Hi Huang, > > On Wed, Nov 01, 2017 at 01:41:00PM +0800, Huang, Ying wrote: >> Hi, Minchan, >> >> Minchan Kim <minchan@xxxxxxxxxx> writes: >> >> > When I see recent change of swap readahead, I am very unhappy >> > about current code structure which diverges two swap readahead >> > algorithm in do_swap_page. This patch is to clean it up. >> > >> > Main motivation is that fault handler doesn't need to be aware of >> > readahead algorithms but just should call swapin_readahead. >> > >> > As first step, this patch cleans up a little bit but not perfect >> > (I just separate for review easier) so next patch will make the goal >> > complete. >> > >> > Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx> >> > --- >> > include/linux/swap.h | 17 ++-------- >> > mm/memory.c | 17 +++------- >> > mm/swap_state.c | 89 ++++++++++++++++++++++++++++------------------------ >> > 3 files changed, 55 insertions(+), 68 deletions(-) >> > >> > diff --git a/include/linux/swap.h b/include/linux/swap.h >> > index 84255b3da7c1..7c7c8b344bc9 100644 >> > --- a/include/linux/swap.h >> > +++ b/include/linux/swap.h >> > @@ -427,12 +427,8 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, >> > bool *new_page_allocated); >> > extern struct page *swapin_readahead(swp_entry_t, gfp_t, >> > struct vm_area_struct *vma, unsigned long addr); >> > - >> > -extern struct page *swap_readahead_detect(struct vm_fault *vmf, >> > - struct vma_swap_readahead *swap_ra); >> > extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, >> > - struct vm_fault *vmf, >> > - struct vma_swap_readahead *swap_ra); >> > + struct vm_fault *vmf); >> > >> > /* linux/mm/swapfile.c */ >> > extern atomic_long_t nr_swap_pages; >> > @@ -551,15 +547,8 @@ static inline bool swap_use_vma_readahead(void) >> > return false; >> > } >> > >> > -static inline struct page *swap_readahead_detect( >> > - struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) >> > -{ >> > - return NULL; >> > -} >> > - >> > -static inline struct page *do_swap_page_readahead( >> > - swp_entry_t fentry, gfp_t gfp_mask, >> > - struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) >> > +static inline struct page *do_swap_page_readahead(swp_entry_t fentry, >> > + gfp_t gfp_mask, struct vm_fault *vmf) >> > { >> > return NULL; >> > } >> > diff --git a/mm/memory.c b/mm/memory.c >> > index 8a0c410037d2..e955298e4290 100644 >> > --- a/mm/memory.c >> > +++ b/mm/memory.c >> > @@ -2849,21 +2849,14 @@ int do_swap_page(struct vm_fault *vmf) >> > struct vm_area_struct *vma = vmf->vma; >> > struct page *page = NULL, *swapcache = NULL; >> > struct mem_cgroup *memcg; >> > - struct vma_swap_readahead swap_ra; >> > swp_entry_t entry; >> > pte_t pte; >> > int locked; >> > int exclusive = 0; >> > int ret = 0; >> > - bool vma_readahead = swap_use_vma_readahead(); >> > >> > - if (vma_readahead) >> > - page = swap_readahead_detect(vmf, &swap_ra); >> > - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { >> > - if (page) >> > - put_page(page); >> > + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) >> > goto out; >> > - } >> >> The page table holding PTE may be unmapped in pte_unmap_same(), so is it >> safe for us to access page table after this in do_swap_page_readahead()? > > That's why I calls pte_offset_map in swap_ra_info before the access. Oh, I found it! Thanks for explanation! Best Regards, Huang, Ying >> >> Best Regards, >> Huang, Ying >> >> > entry = pte_to_swp_entry(vmf->orig_pte); >> > if (unlikely(non_swap_entry(entry))) { >> > @@ -2889,9 +2882,7 @@ int do_swap_page(struct vm_fault *vmf) >> > >> > >> > delayacct_set_flag(DELAYACCT_PF_SWAPIN); >> > - if (!page) >> > - page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, >> > - vmf->address); >> > + page = lookup_swap_cache(entry, vma, vmf->address); >> > if (!page) { >> > struct swap_info_struct *si = swp_swap_info(entry); >> > >> > @@ -2907,9 +2898,9 @@ int do_swap_page(struct vm_fault *vmf) >> > swap_readpage(page, true); >> > } >> > } else { >> > - if (vma_readahead) >> > + if (swap_use_vma_readahead()) >> > page = do_swap_page_readahead(entry, >> > - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); >> > + GFP_HIGHUSER_MOVABLE, vmf); >> > else >> > page = swapin_readahead(entry, >> > GFP_HIGHUSER_MOVABLE, vma, vmf->address); >> > diff --git a/mm/swap_state.c b/mm/swap_state.c >> > index 6c017ced11e6..e3c535fcd2df 100644 >> > --- a/mm/swap_state.c >> > +++ b/mm/swap_state.c >> > @@ -331,32 +331,38 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, >> > unsigned long addr) >> > { >> > struct page *page; >> > - unsigned long ra_info; >> > - int win, hits, readahead; >> > >> > page = find_get_page(swap_address_space(entry), swp_offset(entry)); >> > >> > INC_CACHE_INFO(find_total); >> > if (page) { >> > + bool vma_ra = swap_use_vma_readahead(); >> > + bool readahead = TestClearPageReadahead(page); >> > + >> > INC_CACHE_INFO(find_success); >> > if (unlikely(PageTransCompound(page))) >> > return page; >> > - readahead = TestClearPageReadahead(page); >> > - if (vma) { >> > - ra_info = GET_SWAP_RA_VAL(vma); >> > - win = SWAP_RA_WIN(ra_info); >> > - hits = SWAP_RA_HITS(ra_info); >> > + >> > + if (vma && vma_ra) { >> > + unsigned long ra_val; >> > + int win, hits; >> > + >> > + ra_val = GET_SWAP_RA_VAL(vma); >> > + win = SWAP_RA_WIN(ra_val); >> > + hits = SWAP_RA_HITS(ra_val); >> > if (readahead) >> > hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); >> > atomic_long_set(&vma->swap_readahead_info, >> > SWAP_RA_VAL(addr, win, hits)); >> > } >> > + >> > if (readahead) { >> > count_vm_event(SWAP_RA_HIT); >> > - if (!vma) >> > + if (!vma || !vma_ra) >> > atomic_inc(&swapin_readahead_hits); >> > } >> > } >> > + >> > return page; >> > } >> > >> > @@ -645,16 +651,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, >> > PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); >> > } >> > >> > -struct page *swap_readahead_detect(struct vm_fault *vmf, >> > - struct vma_swap_readahead *swap_ra) >> > +static void swap_ra_info(struct vm_fault *vmf, >> > + struct vma_swap_readahead *ra_info) >> > { >> > struct vm_area_struct *vma = vmf->vma; >> > - unsigned long swap_ra_info; >> > - struct page *page; >> > + unsigned long ra_val; >> > swp_entry_t entry; >> > unsigned long faddr, pfn, fpfn; >> > unsigned long start, end; >> > - pte_t *pte; >> > + pte_t *pte, *orig_pte; >> > unsigned int max_win, hits, prev_win, win, left; >> > #ifndef CONFIG_64BIT >> > pte_t *tpte; >> > @@ -663,30 +668,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, >> > max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), >> > SWAP_RA_ORDER_CEILING); >> > if (max_win == 1) { >> > - swap_ra->win = 1; >> > - return NULL; >> > + ra_info->win = 1; >> > + return; >> > } >> > >> > faddr = vmf->address; >> > - entry = pte_to_swp_entry(vmf->orig_pte); >> > - if ((unlikely(non_swap_entry(entry)))) >> > - return NULL; >> > - page = lookup_swap_cache(entry, vma, faddr); >> > - if (page) >> > - return page; >> > + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); >> > + entry = pte_to_swp_entry(*pte); >> > + if ((unlikely(non_swap_entry(entry)))) { >> > + pte_unmap(orig_pte); >> > + return; >> > + } >> > >> > fpfn = PFN_DOWN(faddr); >> > - swap_ra_info = GET_SWAP_RA_VAL(vma); >> > - pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); >> > - prev_win = SWAP_RA_WIN(swap_ra_info); >> > - hits = SWAP_RA_HITS(swap_ra_info); >> > - swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, >> > + ra_val = GET_SWAP_RA_VAL(vma); >> > + pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); >> > + prev_win = SWAP_RA_WIN(ra_val); >> > + hits = SWAP_RA_HITS(ra_val); >> > + ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, >> > max_win, prev_win); >> > atomic_long_set(&vma->swap_readahead_info, >> > SWAP_RA_VAL(faddr, win, 0)); >> > >> > - if (win == 1) >> > - return NULL; >> > + if (win == 1) { >> > + pte_unmap(orig_pte); >> > + return; >> > + } >> > >> > /* Copy the PTEs because the page table may be unmapped */ >> > if (fpfn == pfn + 1) >> > @@ -699,23 +706,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, >> > swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, >> > &start, &end); >> > } >> > - swap_ra->nr_pte = end - start; >> > - swap_ra->offset = fpfn - start; >> > - pte = vmf->pte - swap_ra->offset; >> > + ra_info->nr_pte = end - start; >> > + ra_info->offset = fpfn - start; >> > + pte -= ra_info->offset; >> > #ifdef CONFIG_64BIT >> > - swap_ra->ptes = pte; >> > + ra_info->ptes = pte; >> > #else >> > - tpte = swap_ra->ptes; >> > + tpte = ra_info->ptes; >> > for (pfn = start; pfn != end; pfn++) >> > *tpte++ = *pte++; >> > #endif >> > - >> > - return NULL; >> > + pte_unmap(orig_pte); >> > } >> > >> > struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, >> > - struct vm_fault *vmf, >> > - struct vma_swap_readahead *swap_ra) >> > + struct vm_fault *vmf) >> > { >> > struct blk_plug plug; >> > struct vm_area_struct *vma = vmf->vma; >> > @@ -724,12 +729,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, >> > swp_entry_t entry; >> > unsigned int i; >> > bool page_allocated; >> > + struct vma_swap_readahead ra_info = {0,}; >> > >> > - if (swap_ra->win == 1) >> > + swap_ra_info(vmf, &ra_info); >> > + if (ra_info.win == 1) >> > goto skip; >> > >> > blk_start_plug(&plug); >> > - for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; >> > + for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; >> > i++, pte++) { >> > pentry = *pte; >> > if (pte_none(pentry)) >> > @@ -745,7 +752,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, >> > continue; >> > if (page_allocated) { >> > swap_readpage(page, false); >> > - if (i != swap_ra->offset && >> > + if (i != ra_info.offset && >> > likely(!PageTransCompound(page))) { >> > SetPageReadahead(page); >> > count_vm_event(SWAP_RA); >> > @@ -757,7 +764,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, >> > lru_add_drain(); >> > skip: >> > return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, >> > - swap_ra->win == 1); >> > + ra_info.win == 1); >> > } >> > >> > #ifdef CONFIG_SYSFS >> >> -- >> To unsubscribe, send a message with 'unsubscribe linux-mm' in >> the body to majordomo@xxxxxxxxx. For more info on Linux MM, >> see: http://www.linux-mm.org/ . >> Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>