Hi Kirill, I wrote a patch to switch hugetlbfs to multi-order radix tree entries. Hopefully it can be queued in your series. Thanks, Naoya Horiguchi --- From: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> Date: Wed, 10 Aug 2016 09:49:09 +0900 Subject: [PATCH] mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries Currently, hugetlb pages are linked to the page cache on the basis of hugepage offset (derived from vma_hugecache_offset()) for historical reasons, which doesn't match the generic usage of the page cache and requires some routines to convert page offset <=> hugepage offset in common paths. This patch adjusts the code for multi-order radix-tree entries to avoid the situation. The main change is in the behavior of page->index for hugetlbfs. Before this patch, it represented the hugepage offset, but with this patch it represents the page offset, so index-related code has to be updated. Note that hugetlb_fault_mutex_hash() and reservation region handling still work with hugepage offsets. Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> --- fs/hugetlbfs/inode.c | 22 ++++++++++------------ include/linux/pagemap.h | 10 +--------- mm/filemap.c | 26 +++++++++++++++----------- mm/hugetlb.c | 19 ++++++------------- 4 files changed, 32 insertions(+), 45 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4ea71eba40a5..fc918c0e33e9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -388,8 +388,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> huge_page_shift(h); - const pgoff_t end = lend >> huge_page_shift(h); + const pgoff_t start = lstart >> PAGE_SHIFT; + const pgoff_t end = lend >> PAGE_SHIFT; struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next; @@ -447,8 +447,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); 
hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + next, next + 1); i_mmap_unlock_write(mapping); } @@ -467,7 +466,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + (next) << huge_page_order(h), + (next + 1) << huge_page_order(h), 1))) hugetlb_fix_reserve_counts(inode, rsv_on_error); } @@ -552,8 +552,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; struct mm_struct *mm = current->mm; - loff_t hpage_size = huge_page_size(h); - unsigned long hpage_shift = huge_page_shift(h); pgoff_t start, index, end; int error; u32 hash; @@ -569,8 +567,8 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets. */ - start = offset >> hpage_shift; - end = (offset + len + hpage_size - 1) >> hpage_shift; + start = offset >> PAGE_SHIFT; + end = (offset + len + huge_page_size(h) - 1) >> PAGE_SHIFT; inode_lock(inode); @@ -588,7 +586,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; - for (index = start; index < end; index++) { + for (index = start; index < end; index += pages_per_huge_page(h)) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. 
@@ -609,10 +607,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } /* Set numa allocation policy based on index */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); + hugetlb_set_vma_policy(&pseudo_vma, inode, index >> huge_page_order(h)); /* addr is the offset within the file (zero based) */ - addr = index * hpage_size; + addr = index << PAGE_SHIFT & ~huge_page_mask(h); /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9cf4e0f35dc..e7b79ec9673d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -380,15 +380,11 @@ static inline struct page *read_mapping_page(struct address_space *mapping, /* * Get the offset in PAGE_SIZE. - * (TODO: hugepage should have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_pgoff(struct page *page) { pgoff_t pgoff; - if (unlikely(PageHeadHuge(page))) - return page->index << compound_order(page); - if (likely(!PageTransTail(page))) return page->index; @@ -414,15 +410,11 @@ static inline loff_t page_file_offset(struct page *page) return ((loff_t)page_file_index(page)) << PAGE_SHIFT; } -extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address); - static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; - if (unlikely(is_vm_hugetlb_page(vma))) - return linear_hugepage_index(vma, address); + pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; diff --git a/mm/filemap.c b/mm/filemap.c index 3d46db277e73..f0bcb1329df4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,7 +114,7 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - int nr = PageHuge(page) ? 
1 : hpage_nr_pages(page); + int nr = hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -667,16 +667,20 @@ static int __add_to_page_cache_locked(struct page *page, page->index = offset; spin_lock_irq(&mapping->tree_lock); - if (PageTransHuge(page)) { + if (PageCompound(page)) { /* TODO: shadow handling */ error = __radix_tree_insert(&mapping->page_tree, offset, compound_order(page), page); if (!error) { - count_vm_event(THP_FILE_ALLOC); - mapping->nrpages += HPAGE_PMD_NR; - *shadowp = NULL; - __inc_node_page_state(page, NR_FILE_THPS); + if (hugetlb) { + mapping->nrpages += 1 << compound_order(page); + } else if (PageTransHuge(page)) { + count_vm_event(THP_FILE_ALLOC); + mapping->nrpages += HPAGE_PMD_NR; + *shadowp = NULL; + __inc_node_page_state(page, NR_FILE_THPS); + } } } else { error = page_cache_tree_insert(mapping, page, shadowp); @@ -1118,9 +1122,9 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) } /* For multi-order entries, find relevant subpage */ - if (PageTransHuge(page)) { + if (PageCompound(page)) { VM_BUG_ON(offset - page->index < 0); - VM_BUG_ON(offset - page->index >= HPAGE_PMD_NR); + VM_BUG_ON(offset - page->index >= 1 << compound_order(page)); page += offset - page->index; } } @@ -1475,16 +1479,16 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, } /* For multi-order entries, find relevant subpage */ - if (PageTransHuge(page)) { + if (PageCompound(page)) { VM_BUG_ON(iter.index - page->index < 0); - VM_BUG_ON(iter.index - page->index >= HPAGE_PMD_NR); + VM_BUG_ON(iter.index - page->index >= 1 << compound_order(page)); page += iter.index - page->index; } pages[ret] = page; if (++ret == nr_pages) break; - if (!PageTransCompound(page)) + if (PageHuge(page) || !PageTransCompound(page)) continue; for (refs = 0; ret < nr_pages && (iter.index + 1) % HPAGE_PMD_NR; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51a04e5e9373..e4f1b9e84dda 100644 --- 
a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -622,13 +622,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -3486,7 +3479,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = linear_page_index(vma, address); return find_lock_page(mapping, idx); } @@ -3503,7 +3496,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct page *page; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = linear_page_index(vma, address); page = find_get_page(mapping, idx); if (page) @@ -3558,7 +3551,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> huge_page_shift(h); + size = i_size_read(mapping->host) >> PAGE_SHIFT; if (idx >= size) goto out; page = alloc_huge_page(vma, address, 0); @@ -3620,7 +3613,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ptl = huge_pte_lockptr(h, mm, ptep); spin_lock(ptl); - size = i_size_read(mapping->host) >> huge_page_shift(h); + size = i_size_read(mapping->host) >> PAGE_SHIFT; if (idx >= size) goto backout; @@ -3667,7 +3660,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, if (vma->vm_flags & VM_SHARED) { key[0] = (unsigned long) mapping; - key[1] = idx; + key[1] = idx >> huge_page_order(h); } else { key[0] = (unsigned long) mm; key[1] = address >> huge_page_shift(h); @@ -3723,7 +3716,7 @@ int hugetlb_fault(struct mm_struct *mm, 
struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); + idx = linear_page_index(vma, address); /* * Serialize hugepage allocation and instantiation, so that we don't -- 2.7.4 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html