On 06/09/23 12:52, Sidhartha Kumar wrote:
> On 6/9/23 12:49 PM, Sidhartha Kumar wrote:
> > Signed-off-by: Sidhartha Kumar <sidhartha.kumar@xxxxxxxxxx>
> 
> Sorry, I missed adding the commit message to this. It should be:
> 
> Add filemap_lock_hugetlb_folio() which wraps __filemap_get_folio()
> and passes in a linear page index. hugetlb_add_to_page_cache() is modified
> to also compute a linear page index before calling into page cache code.
> 
> linear_page_index() is modified to perform the computation on hugetlb
> so we can use it in the page cache wrappers.
> 
> > ---
> >  fs/hugetlbfs/inode.c    | 14 +++++++-------
> >  include/linux/hugetlb.h | 21 +++++++++++++++++++--
> >  include/linux/pagemap.h |  2 --
> >  mm/hugetlb.c            | 22 +++++++++++++---------
> >  4 files changed, 39 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index 90361a922cec7..90d27a8af4b6a 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -617,20 +617,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
> >      struct hstate *h = hstate_inode(inode);
> >      struct address_space *mapping = &inode->i_data;
> >      const pgoff_t start = lstart >> huge_page_shift(h);
> > -    const pgoff_t end = lend >> huge_page_shift(h);
> >      struct folio_batch fbatch;
> >      pgoff_t next, index;
> >      int i, freed = 0;
> >      bool truncate_op = (lend == LLONG_MAX);
> > 
> >      folio_batch_init(&fbatch);
> > -    next = start;
> > -    while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
> > +    next = lstart;
> > +    while (filemap_get_folios(mapping, &next, lend - 1, &fbatch)) {

This does not seem correct.  At this point next == lstart, which is a file
offset passed to the routine, as opposed to an index.  I would think next
needs to be set to 'lstart >> PAGE_SHIFT' here.

> >          for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> >              struct folio *folio = fbatch.folios[i];
> >              u32 hash = 0;
> > 
> > -            index = folio->index;
> > +            index = (folio->index) >> huge_page_shift(h);

Here you want to convert index from the PAGE_SIZE index to a hugetlb page
size index.  Correct?  I am terrible at arithmetic, but huge_page_shift
already includes PAGE_SHIFT, so it seems to me you want this:

            index = (folio->index) >> huge_page_order(h);
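Putting those two together, the arithmetic I have in mind for this loop is
roughly the following (untested, and I converted lend the same way as lstart,
so please double check that too):

	next = lstart >> PAGE_SHIFT;
	while (filemap_get_folios(mapping, &next, (lend >> PAGE_SHIFT) - 1,
				  &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *folio = fbatch.folios[i];
			u32 hash = 0;

			/* convert the PAGE_SIZE index to a hugetlb page index */
			index = folio->index >> huge_page_order(h);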
> >              hash = hugetlb_fault_mutex_hash(mapping, index);
> >              mutex_lock(&hugetlb_fault_mutex_table[hash]);
> > 
> > @@ -693,10 +692,11 @@ static void hugetlbfs_zero_partial_page(struct hstate *h,
> >                      loff_t start,
> >                      loff_t end)
> >  {
> > -    pgoff_t idx = start >> huge_page_shift(h);
> > +    struct mm_struct *mm = current->mm;
> > +    struct vm_area_struct *vma = find_vma(mm, start);
> >      struct folio *folio;
> > 
> > -    folio = filemap_lock_folio(mapping, idx);
> > +    folio = filemap_lock_hugetlb_folio(vma, start);

Here you are passing an address/index that may be associated with a tail
page.  I assume that, since the hugetlb folio is multi-order, the returned
folio will be for the 'head page'.  Correct?

> >      if (IS_ERR(folio))
> >          return;
> > 
> > @@ -868,7 +868,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
> >          }
> >          clear_huge_page(&folio->page, addr, pages_per_huge_page(h));
> >          __folio_mark_uptodate(folio);
> > -        error = hugetlb_add_to_page_cache(folio, mapping, index);
> > +        error = hugetlb_add_to_page_cache(folio, &pseudo_vma, mapping, addr);
> >          if (unlikely(error)) {
> >              restore_reserve_on_error(h, &pseudo_vma, addr, folio);
> >              folio_put(folio);
> > 
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 21f942025fecd..55f90e051b7a2 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -727,8 +727,8 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
> >                  nodemask_t *nmask, gfp_t gfp_mask);
> >  struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
> >                  unsigned long address);
> > -int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
> > -                pgoff_t idx);
> > +int hugetlb_add_to_page_cache(struct folio *folio, struct vm_area_struct *vma,
> > +                struct address_space *mapping, unsigned long address);
> >  void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
> >                  unsigned long address, struct folio *folio);
> > 
> > @@ -755,6 +755,16 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
> >      return folio->_hugetlb_subpool;
> >  }
> > 
> > +/* Wrapper function for __filemap_get_folio*/
> > +static inline struct folio *filemap_lock_hugetlb_folio(struct vm_area_struct *vma,
> > +                unsigned long address)
> > +{
> > +    struct address_space *mapping = vma->vm_file->f_mapping;
> > +
> > +    pgoff_t idx = linear_page_index(vma, address);
> > +    return __filemap_get_folio(mapping, idx, FGP_LOCK, 0);
> > +}

I like the wrapper idea.  This is going to replace existing calls to
filemap_lock_folio.  What about something like this for the routine:

static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
				struct address_space *mapping, pgoff_t index)
{
	/* please verify my arithmetic */
	return filemap_lock_folio(mapping, index << huge_page_order(h));
}

In this way, existing callers would only need to be changed to pass in
hstate.
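Callers such as hugetlb_no_page() and hugetlb_fault() would then only need
something like:

	folio = filemap_lock_hugetlb_folio(h, mapping, idx);

with idx staying in huge page units as it is today.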
Perhaps there was a reason for your wrapper that is not obvious to me?

> > +
> >  static inline void hugetlb_set_folio_subpool(struct folio *folio,
> >                  struct hugepage_subpool *subpool)
> >  {
> > 
> > @@ -1021,6 +1031,13 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio
> >      return NULL;
> >  }
> > 
> > +/* Wrapper function for __filemap_get_folio*/
> > +static inline struct folio *filemap_lock_hugetlb_folio(struct vm_area_struct *vma,
> > +                unsigned long address)
> > +{
> > +    return NULL;
> > +}
> > +
> >  static inline int isolate_or_dissolve_huge_page(struct page *page,
> >                  struct list_head *list)
> >  {
> > 
> > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> > index 17c414fc2136e..ae8f36966d7b3 100644
> > --- a/include/linux/pagemap.h
> > +++ b/include/linux/pagemap.h
> > @@ -860,8 +860,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
> >                  unsigned long address)
> >  {
> >      pgoff_t pgoff;
> > -    if (unlikely(is_vm_hugetlb_page(vma)))
> > -        return linear_hugepage_index(vma, address);
> >      pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
> >      pgoff += vma->vm_pgoff;
> >      return pgoff;
> > 
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index dfa412d8cb300..824d6d215a161 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -951,7 +951,7 @@ static long region_count(struct resv_map *resv, long f, long t)
> >  /*
> >   * Convert the address within this vma to the page offset within
> > - * the mapping, in pagecache page units; huge pages here.
> > + * the mapping, in huge page units here.
> >   */
> >  static pgoff_t vma_hugecache_offset(struct hstate *h,
> >              struct vm_area_struct *vma, unsigned long address)
> > 
> > @@ -5730,7 +5730,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
> >              struct vm_area_struct *vma, unsigned long address)
> >  {
> >      struct address_space *mapping = vma->vm_file->f_mapping;
> > -    pgoff_t idx = vma_hugecache_offset(h, vma, address);
> > +    pgoff_t idx = linear_page_index(vma, address);
> >      bool present;
> > 
> >      rcu_read_lock();
> > 
> > @@ -5740,13 +5740,16 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
> >      return present;
> >  }
> > 
> > -int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
> > -            pgoff_t idx)
> > +int hugetlb_add_to_page_cache(struct folio *folio,
> > +            struct vm_area_struct *vma,
> > +            struct address_space *mapping,
> > +            unsigned long address)
> >  {

Like filemap_lock_hugetlb_folio(), can you just add hstate to the existing
hugetlb_add_to_page_cache() arguments and do the arithmetic to convert the
index to a PAGE_SIZE based index?  Again, I could be missing something.
But, IMO such a conversion would minimize changes to the current code.
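Roughly what I am thinking (untested, and keeping the current prototype
since the routine already looks up the hstate from the inode):

int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
			pgoff_t idx)
{
	struct inode *inode = mapping->host;
	struct hstate *h = hstate_inode(inode);
	int err;

	__folio_set_locked(folio);
	/* idx stays in huge page units; convert to a PAGE_SIZE based index */
	err = __filemap_add_folio(mapping, folio, idx << huge_page_order(h),
					GFP_KERNEL, NULL);
	...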
-- 
Mike Kravetz

> >      struct inode *inode = mapping->host;
> >      struct hstate *h = hstate_inode(inode);
> >      int err;
> > +    pgoff_t idx = linear_page_index(vma, address);
> > 
> >      __folio_set_locked(folio);
> >      err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
> > 
> > @@ -5854,7 +5857,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >       * before we get page_table_lock.
> >       */
> >      new_folio = false;
> > -    folio = filemap_lock_folio(mapping, idx);
> > +
> > +    folio = filemap_lock_hugetlb_folio(vma, address);
> >      if (IS_ERR(folio)) {
> >          size = i_size_read(mapping->host) >> huge_page_shift(h);
> >          if (idx >= size)
> > 
> > @@ -5913,7 +5917,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >          new_folio = true;
> > 
> >          if (vma->vm_flags & VM_MAYSHARE) {
> > -            int err = hugetlb_add_to_page_cache(folio, mapping, idx);
> > +            int err = hugetlb_add_to_page_cache(folio, vma, mapping, address);
> >              if (err) {
> >                  /*
> >                   * err can't be -EEXIST which implies someone
> > 
> > @@ -6145,7 +6149,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >          /* Just decrements count, does not deallocate */
> >          vma_end_reservation(h, vma, haddr);
> > 
> > -        pagecache_folio = filemap_lock_folio(mapping, idx);
> > +        pagecache_folio = filemap_lock_hugetlb_folio(vma, address);
> >          if (IS_ERR(pagecache_folio))
> >              pagecache_folio = NULL;
> >      }
> > 
> > @@ -6258,7 +6262,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
> >      if (is_continue) {
> >          ret = -EFAULT;
> > -        folio = filemap_lock_folio(mapping, idx);
> > +        folio = filemap_lock_hugetlb_folio(dst_vma, dst_addr);
> >          if (IS_ERR(folio))
> >              goto out;
> >          folio_in_pagecache = true;
> > 
> > @@ -6350,7 +6354,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
> >       * hugetlb_fault_mutex_table that here must be hold by
> >       * the caller.
> >       */
> > -    ret = hugetlb_add_to_page_cache(folio, mapping, idx);
> > +    ret = hugetlb_add_to_page_cache(folio, dst_vma, mapping, dst_addr);
> >      if (ret)
> >          goto out_release_nounlock;
> >      folio_in_pagecache = true;
> 