On Mon, Jul 07, 2014 at 12:39:23PM -0700, Andrew Morton wrote:
> On Wed, 2 Jul 2014 00:30:57 -0400 Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> wrote:
> 
> > Subject: [PATCH v2] rmap: fix pgoff calculation to handle hugepage correctly
> > 
> > I triggered VM_BUG_ON() in vma_address() when I tried to migrate an anonymous
> > hugepage with mbind() in kernel v3.16-rc3. This is because pgoff's calculation
> > in rmap_walk_anon() fails to consider compound_order(), so it ends up with an
> > incorrect value.
> > 
> > This patch introduces page_to_pgoff(), which gets the page's offset in units
> > of PAGE_CACHE_SIZE. Kirill pointed out that the page cache tree should
> > natively handle hugepages, and that in order to make hugetlbfs fit it,
> > page->index of a hugetlbfs page should be in PAGE_CACHE_SIZE units. That is
> > beyond the scope of this patch, but page_to_pgoff() confines the point to be
> > fixed to a single function.
> > 
> > ...
> > 
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -399,6 +399,18 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
> >  }
> >  
> >  /*
> > + * Get the offset in PAGE_SIZE.
> > + * (TODO: hugepage should have ->index in PAGE_SIZE)
> > + */
> > +static inline pgoff_t page_to_pgoff(struct page *page)
> > +{
> > +	if (unlikely(PageHeadHuge(page)))
> > +		return page->index << compound_order(page);
> > +	else
> > +		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
> > +}
> > +
> 
> This is all a bit of a mess.
> 
> We have page_offset() which only works for regular pagecache pages and
> not for huge pages.
> 
> We have page_file_offset() which works for regular pagecache as well
> as swapcache but not for huge pages.
> 
> We have page_index() and page_file_index() which differ in undocumented
> ways which I cannot be bothered working out. The latter calls
> __page_file_index() which is grossly misnamed.
> 
> Now we get a new page_to_pgoff() which is inconsistently named but has
> a similarly crappy level of documentation and which works for hugepages
> and regular pagecache pages but not for swapcache pages.
> 
> Sigh.
> 
> I'll merge this patch because it's a bugfix but could someone please
> drive a truck through all this stuff and see if we can come up with
> something tasteful and sane?

I wrote a patch for this; could you take a look?

Thanks,
Naoya Horiguchi
---
From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
Date: Tue, 15 Jul 2014 12:05:53 -0400
Subject: [PATCH] mm: refactor page index/offset getters

There is a complaint about duplication among the fundamental page
index/offset getters: page_(index|offset) and page_file_(index|offset)
provide the same functionality, so we can merge each pair into
page_(index|offset), respectively.

This patch also gives each getter a clear meaning:
- page_index():  get the page cache index (offset in the relevant page size)
- page_pgoff():  get the 4kB page offset
- page_offset(): get the byte offset

All of these functions are aware of regular pages, swapcache pages, and
hugepages.

The definition of PageHuge() is relocated to include/linux/mm.h, because
some users of page_pgoff() do not include include/linux/hugetlb.h.

__page_file_index() is not well named, because it is only for the swap
cache, so rename it to __page_swap_index().
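To make the intended arithmetic concrete, here is a small userspace sketch
(illustration only, not part of the patch; EXAMPLE_PAGE_SHIFT and
EXAMPLE_HUGE_ORDER are made-up stand-ins for PAGE_SHIFT and the
compound_order() of a 2MB hugepage) showing how a hugetlbfs page's cache
index, 4kB pgoff, and byte offset are meant to relate under the new getters:

/* Illustration only: mimics the getter relationships in plain C. */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT	12	/* 4kB base pages (stand-in for PAGE_SHIFT) */
#define EXAMPLE_HUGE_ORDER	9	/* 2MB hugepage = 512 base pages (stand-in for compound_order()) */

int main(void)
{
	unsigned long index = 3;	/* page cache index, in hugepage-sized units */

	/* page_pgoff(): the index scaled to 4kB units */
	unsigned long pgoff = index << EXAMPLE_HUGE_ORDER;

	/* page_offset(): the byte offset, always pgoff shifted by the base page shift */
	unsigned long long offset = (unsigned long long)pgoff << EXAMPLE_PAGE_SHIFT;

	printf("index=%lu pgoff=%lu offset=%llu\n", index, pgoff, offset);
	/* prints: index=3 pgoff=1536 offset=6291456 (= 3 * 2MB) */
	return 0;
}

The point is that page_offset() is always page_pgoff() << PAGE_SHIFT,
regardless of the page size.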
Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>
---
 fs/nfs/internal.h       |  6 +++---
 fs/nfs/pagelist.c       |  2 +-
 fs/nfs/read.c           |  2 +-
 fs/nfs/write.c          | 10 +++++-----
 fs/xfs/xfs_aops.c       |  2 +-
 include/linux/hugetlb.h |  7 -------
 include/linux/mm.h      | 26 ++++++++++----------------
 include/linux/pagemap.h | 22 +++++++++-------------
 mm/memory-failure.c     |  4 ++--
 mm/page_io.c            |  6 +++---
 mm/rmap.c               |  6 +++---
 mm/swapfile.c           |  4 ++--
 12 files changed, 40 insertions(+), 57 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index dd8bfc2e2464..82d942fead90 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -576,11 +576,11 @@ unsigned int nfs_page_length(struct page *page)
 	loff_t i_size = i_size_read(page_file_mapping(page)->host);
 
 	if (i_size > 0) {
-		pgoff_t page_index = page_file_index(page);
+		pgoff_t index = page_index(page);
 		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-		if (page_index < end_index)
+		if (index < end_index)
 			return PAGE_CACHE_SIZE;
-		if (page_index == end_index)
+		if (index == end_index)
 			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
 	}
 	return 0;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 03ed984ab4d8..eace07a9104d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -173,7 +173,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
 	 * long write-back delay. This will be adjusted in
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
-	req->wb_index	= page_file_index(page);
+	req->wb_index	= page_index(page);
 	page_cache_get(page);
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 411aedda14bb..94ff1cf21d2c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -538,7 +538,7 @@ int nfs_readpage(struct file *file, struct page *page)
 	int		error;
 
 	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
-		page, PAGE_CACHE_SIZE, page_file_index(page));
+		page, PAGE_CACHE_SIZE, page_index(page));
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ffb9459f180b..e990dd527764 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -194,9 +194,9 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
 	spin_lock(&inode->i_lock);
 	i_size = i_size_read(inode);
 	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
-	if (i_size > 0 && page_file_index(page) < end_index)
+	if (i_size > 0 && page_index(page) < end_index)
 		goto out;
-	end = page_file_offset(page) + ((loff_t)offset+count);
+	end = page_offset(page) + ((loff_t)offset+count);
 	if (i_size >= end)
 		goto out;
 	i_size_write(inode, end);
@@ -337,7 +337,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
-	nfs_pageio_cond_complete(pgio, page_file_index(page));
+	nfs_pageio_cond_complete(pgio, page_index(page));
 	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
@@ -961,7 +961,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
 
 	dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
-		file, count, (long long)(page_file_offset(page) + offset));
+		file, count, (long long)(page_offset(page) + offset));
 
 	if (nfs_can_extend_write(file, page, inode)) {
 		count = max(count + offset, nfs_page_length(page));
@@ -1817,7 +1817,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
  */
 int nfs_wb_page(struct inode *inode, struct page *page)
 {
-	loff_t range_start = page_file_offset(page);
+	loff_t range_start = page_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0479c32c5eb1..2f94687c2849 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -695,7 +695,7 @@ xfs_convert_page(
 	unsigned int		type;
 	int			len, page_dirty;
 	int			count = 0, done = 0, uptodate = 1;
-	xfs_off_t		offset = page_offset(page);
+	xfs_off_t		offset = page_offset(page);
 
 	if (page->index != tindex)
 		goto fail;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 41272bcf73f8..026d8b147027 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,8 +41,6 @@ extern int hugetlb_max_hstate __read_mostly;
 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
-int PageHuge(struct page *page);
-
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -108,11 +106,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
-static inline int PageHuge(struct page *page)
-{
-	return 0;
-}
-
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 84b2a6cf45f6..5a86ce1b4cd0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -450,8 +450,13 @@ static inline int page_count(struct page *page)
 }
 
 #ifdef CONFIG_HUGETLB_PAGE
+extern int PageHuge(struct page *page);
 extern int PageHeadHuge(struct page *page_head);
 #else /* CONFIG_HUGETLB_PAGE */
+static inline int PageHuge(struct page *page)
+{
+	return 0;
+}
 static inline int PageHeadHuge(struct page *page_head)
 {
 	return 0;
@@ -980,27 +985,16 @@ static inline int PageAnon(struct page *page)
 	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
 }
 
-/*
- * Return the pagecache index of the passed page. Regular pagecache pages
- * use ->index whereas swapcache pages use ->private
- */
-static inline pgoff_t page_index(struct page *page)
-{
-	if (unlikely(PageSwapCache(page)))
-		return page_private(page);
-	return page->index;
-}
-
-extern pgoff_t __page_file_index(struct page *page);
+extern pgoff_t __page_swap_index(struct page *page);
 
 /*
- * Return the file index of the page. Regular pagecache pages use ->index
- * whereas swapcache pages use swp_offset(->private)
+ * Return the pagecache index of the passed page, which is the offset
+ * in the relevant page size.
  */
-static inline pgoff_t page_file_index(struct page *page)
+static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return __page_file_index(page);
+		return __page_swap_index(page);
 	return page->index;
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e1474ae18c88..3b27877ac6d0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -399,28 +399,24 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 }
 
 /*
- * Get the offset in PAGE_SIZE.
+ * Return the 4kB page offset of the given page.
  * (TODO: hugepage should have ->index in PAGE_SIZE)
  */
-static inline pgoff_t page_to_pgoff(struct page *page)
+static inline pgoff_t page_pgoff(struct page *page)
 {
-	if (unlikely(PageHeadHuge(page)))
-		return page->index << compound_order(page);
-	else
-		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	if (unlikely(PageHuge(page))) {
+		VM_BUG_ON_PAGE(PageTail(page), page);
+		return page_index(page) << compound_order(page);
+	} else
+		return page_index(page) << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
 
 /*
- * Return byte-offset into filesystem object for page.
+ * Return the byte offset of the given page.
  */
 static inline loff_t page_offset(struct page *page)
 {
-	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
-}
-
-static inline loff_t page_file_offset(struct page *page)
-{
-	return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
+	return ((loff_t)page_pgoff(page)) << PAGE_SHIFT;
 }
 
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e3e2f007946e..75acb65bd912 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page_to_pgoff(page);
+	pgoff = page_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
@@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	mutex_lock(&mapping->i_mmap_mutex);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page_to_pgoff(page);
+		pgoff_t pgoff = page_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
diff --git a/mm/page_io.c b/mm/page_io.c
index 58b50d2901fe..4ca964f83718 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -250,7 +250,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 
 static sector_t swap_page_sector(struct page *page)
 {
-	return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+	return (sector_t)__page_swap_index(page) << (PAGE_CACHE_SHIFT - 9);
 }
 
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -270,7 +270,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		};
 
 		init_sync_kiocb(&kiocb, swap_file);
-		kiocb.ki_pos = page_file_offset(page);
+		kiocb.ki_pos = page_offset(page);
 		kiocb.ki_nbytes = PAGE_SIZE;
 
 		set_page_writeback(page);
@@ -296,7 +296,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 			set_page_dirty(page);
 			ClearPageReclaim(page);
 			pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
-					   page_file_offset(page));
+					   page_offset(page));
 		}
 		end_page_writeback(page);
 		return ret;
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e8491c504f8..7928ddd91b6e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -517,7 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 static inline unsigned long
 __vma_address(struct page *page, struct vm_area_struct *vma)
 {
-	pgoff_t pgoff = page_to_pgoff(page);
+	pgoff_t pgoff = page_pgoff(page);
 	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 }
 
@@ -1615,7 +1615,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff = page_to_pgoff(page);
+	pgoff_t pgoff = page_pgoff(page);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1656,7 +1656,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page_to_pgoff(page);
+	pgoff_t pgoff = page_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..1c4027c702fb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2715,13 +2715,13 @@ struct address_space *__page_file_mapping(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__page_file_mapping);
 
-pgoff_t __page_file_index(struct page *page)
+pgoff_t __page_swap_index(struct page *page)
 {
 	swp_entry_t swap = { .val = page_private(page) };
 	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 	return swp_offset(swap);
 }
-EXPORT_SYMBOL_GPL(__page_file_index);
+EXPORT_SYMBOL_GPL(__page_swap_index);
 
 /*
  * add_swap_count_continuation - called when a swap count is duplicated
-- 
1.9.3