On Fri, Feb 15, 2019 at 2:25 PM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote:
>
> Transparent Huge Pages are currently stored in i_pages as pointers to
> consecutive subpages.  This patch changes that to storing consecutive
> pointers to the head page in preparation for storing huge pages more
> efficiently in i_pages.
>
> Large parts of this are "inspired" by Kirill's patch
> https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@xxxxxxxxxxxxxxx/
>
> Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Acked-by: Jan Kara <jack@xxxxxxx>
> Reviewed-by: Kirill Shutemov <kirill@xxxxxxxxxxxxx>
> ---
>  include/linux/pagemap.h |   9 +++
>  mm/filemap.c            | 158 ++++++++++++++++------------------------
>  mm/huge_memory.c        |   3 +
>  mm/khugepaged.c         |   4 +-
>  mm/memfd.c              |   2 +
>  mm/migrate.c            |   2 +-
>  mm/shmem.c              |   2 +-
>  mm/swap_state.c         |   4 +-
>  8 files changed, 81 insertions(+), 103 deletions(-)
>
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index bcf909d0de5f..7d58e4e0b68e 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -333,6 +333,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
>                          mapping_gfp_mask(mapping));
>  }
>
> +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> +{
> +        VM_BUG_ON_PAGE(PageTail(page), page);
> +        VM_BUG_ON_PAGE(page->index > offset, page);
> +        VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
> +                        page);
> +        return page - page->index + offset;
> +}
> +
>  struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
>  struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
>  unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 5673672fd444..d9161cae11b5 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -279,11 +279,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
>   * @pvec: pagevec with pages to delete
>   *
>   * The function walks over mapping->i_pages and removes pages passed in @pvec
> - * from the mapping. The function expects @pvec to be sorted by page index.
> + * from the mapping. The function expects @pvec to be sorted by page index
> + * and is optimised for it to be dense.
>   * It tolerates holes in @pvec (mapping entries at those indices are not
>   * modified). The function expects only THP head pages to be present in the
> - * @pvec and takes care to delete all corresponding tail pages from the
> - * mapping as well.
> + * @pvec.
>   *
>   * The function expects the i_pages lock to be held.
>   */
> @@ -292,40 +292,43 @@ static void page_cache_delete_batch(struct address_space *mapping,
>  {
>          XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
>          int total_pages = 0;
> -        int i = 0, tail_pages = 0;
> +        int i = 0;
>          struct page *page;
>
>          mapping_set_update(&xas, mapping);
>          xas_for_each(&xas, page, ULONG_MAX) {
> -                if (i >= pagevec_count(pvec) && !tail_pages)
> +                if (i >= pagevec_count(pvec))
>                          break;
> +
> +                /* A swap/dax/shadow entry got inserted? Skip it. */
>                  if (xa_is_value(page))
>                          continue;
> -                if (!tail_pages) {
> -                        /*
> -                         * Some page got inserted in our range? Skip it. We
> -                         * have our pages locked so they are protected from
> -                         * being removed.
> -                         */
> -                        if (page != pvec->pages[i]) {
> -                                VM_BUG_ON_PAGE(page->index >
> -                                                pvec->pages[i]->index, page);
> -                                continue;
> -                        }
> -                        WARN_ON_ONCE(!PageLocked(page));
> -                        if (PageTransHuge(page) && !PageHuge(page))
> -                                tail_pages = HPAGE_PMD_NR - 1;
> +                /*
> +                 * A page got inserted in our range? Skip it. We have our
> +                 * pages locked so they are protected from being removed.
> +                 * If we see a page whose index is higher than ours, it
> +                 * means our page has been removed, which shouldn't be
> +                 * possible because we're holding the PageLock.
> +                 */
> +                if (page != pvec->pages[i]) {
> +                        VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
> +                                        page);
> +                        continue;
> +                }
> +
> +                WARN_ON_ONCE(!PageLocked(page));
> +
> +                if (page->index == xas.xa_index)
>                          page->mapping = NULL;
> -                        /*
> -                         * Leave page->index set: truncation lookup relies
> -                         * upon it
> -                         */
> +                /* Leave page->index set: truncation lookup relies on it */
> +
> +                /*
> +                 * Move to the next page in the vector if this is a small page
> +                 * or the index is of the last page in this compound page).
> +                 */
> +                if (page->index + (1UL << compound_order(page)) - 1 ==
> +                                xas.xa_index)
>                          i++;
> -                } else {
> -                        VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
> -                                        != pvec->pages[i]->index, page);
> -                        tail_pages--;
> -                }
>                  xas_store(&xas, NULL);
>                  total_pages++;
>          }
> @@ -1491,7 +1494,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
>  struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
>  {
>          XA_STATE(xas, &mapping->i_pages, offset);
> -        struct page *head, *page;
> +        struct page *page;
>
>          rcu_read_lock();
>  repeat:
> @@ -1506,25 +1509,19 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
>          if (!page || xa_is_value(page))
>                  goto out;
>
> -        head = compound_head(page);
> -        if (!page_cache_get_speculative(head))
> +        if (!page_cache_get_speculative(page))
>                  goto repeat;
>
> -        /* The page was split under us? */
> -        if (compound_head(page) != head) {
> -                put_page(head);
> -                goto repeat;
> -        }
> -
>          /*
> -         * Has the page moved?
> +         * Has the page moved or been split?
>           * This is part of the lockless pagecache protocol. See
>           * include/linux/pagemap.h for details.
>           */
>          if (unlikely(page != xas_reload(&xas))) {
> -                put_page(head);
> +                put_page(page);
>                  goto repeat;
>          }
> +        page = find_subpage(page, offset);
>  out:
>          rcu_read_unlock();
>
> @@ -1706,7 +1703,6 @@ unsigned find_get_entries(struct address_space *mapping,
>
>          rcu_read_lock();
>          xas_for_each(&xas, page, ULONG_MAX) {
> -                struct page *head;
>                  if (xas_retry(&xas, page))
>                          continue;
>                  /*
> @@ -1717,17 +1713,13 @@ unsigned find_get_entries(struct address_space *mapping,
>                  if (xa_is_value(page))
>                          goto export;
>
> -                head = compound_head(page);
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto retry;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto put_page;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto put_page;
> +                page = find_subpage(page, xas.xa_index);
>
>  export:
>                  indices[ret] = xas.xa_index;
> @@ -1736,7 +1728,7 @@ unsigned find_get_entries(struct address_space *mapping,
>                          break;
>                  continue;
>  put_page:
> -                put_page(head);
> +                put_page(page);
>  retry:
>                  xas_reset(&xas);
>          }
> @@ -1778,33 +1770,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
>
>          rcu_read_lock();
>          xas_for_each(&xas, page, end) {
> -                struct page *head;
>                  if (xas_retry(&xas, page))
>                          continue;
>                  /* Skip over shadow, swap and DAX entries */
>                  if (xa_is_value(page))
>                          continue;
>
> -                head = compound_head(page);
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto retry;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto put_page;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto put_page;
>
> -                pages[ret] = page;
> +                pages[ret] = find_subpage(page, xas.xa_index);
>                  if (++ret == nr_pages) {
>                          *start = page->index + 1;
>                          goto out;
>                  }
>                  continue;
>  put_page:
> -                put_page(head);
> +                put_page(page);
>  retry:
>                  xas_reset(&xas);
>          }
> @@ -1849,7 +1835,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
>
>          rcu_read_lock();
>          for (page = xas_load(&xas); page; page = xas_next(&xas)) {
> -                struct page *head;
>                  if (xas_retry(&xas, page))
>                          continue;
>                  /*
> @@ -1859,24 +1844,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
>                  if (xa_is_value(page))
>                          break;
>
> -                head = compound_head(page);
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto retry;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto put_page;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto put_page;
>
> -                pages[ret] = page;
> +                pages[ret] = find_subpage(page, xas.xa_index);
>                  if (++ret == nr_pages)
>                          break;
>                  continue;
>  put_page:
> -                put_page(head);
> +                put_page(page);
>  retry:
>                  xas_reset(&xas);
>          }
> @@ -1912,7 +1892,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
>
>          rcu_read_lock();
>          xas_for_each_marked(&xas, page, end, tag) {
> -                struct page *head;
>                  if (xas_retry(&xas, page))
>                          continue;
>                  /*
> @@ -1923,26 +1902,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
>                  if (xa_is_value(page))
>                          continue;
>
> -                head = compound_head(page);
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto retry;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto put_page;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto put_page;
>
> -                pages[ret] = page;
> +                pages[ret] = find_subpage(page, xas.xa_index);
>                  if (++ret == nr_pages) {
>                          *index = page->index + 1;
>                          goto out;
>                  }
>                  continue;
>  put_page:
> -                put_page(head);
> +                put_page(page);
>  retry:
>                  xas_reset(&xas);
>          }
> @@ -1991,7 +1965,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
>
>          rcu_read_lock();
>          xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
> -                struct page *head;
>                  if (xas_retry(&xas, page))
>                          continue;
>                  /*
> @@ -2002,17 +1975,13 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
>                  if (xa_is_value(page))
>                          goto export;
>
> -                head = compound_head(page);
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto retry;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto put_page;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto put_page;
> +                page = find_subpage(page, xas.xa_index);
>
>  export:
>                  indices[ret] = xas.xa_index;
> @@ -2021,7 +1990,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
>                          break;
>                  continue;
>  put_page:
> -                put_page(head);
> +                put_page(page);
>  retry:
>                  xas_reset(&xas);
>          }
> @@ -2686,7 +2655,7 @@ void filemap_map_pages(struct vm_fault *vmf,
>          pgoff_t last_pgoff = start_pgoff;
>          unsigned long max_idx;
>          XA_STATE(xas, &mapping->i_pages, start_pgoff);
> -        struct page *head, *page;
> +        struct page *page;
>
>          rcu_read_lock();
>          xas_for_each(&xas, page, end_pgoff) {
> @@ -2695,24 +2664,19 @@ void filemap_map_pages(struct vm_fault *vmf,
>                  if (xa_is_value(page))
>                          goto next;
>
> -                head = compound_head(page);
> -
>                  /*
>                   * Check for a locked page first, as a speculative
>                   * reference may adversely influence page migration.
>                   */
> -                if (PageLocked(head))
> +                if (PageLocked(page))
>                          goto next;
> -                if (!page_cache_get_speculative(head))
> +                if (!page_cache_get_speculative(page))
>                          goto next;
>
> -                /* The page was split under us? */
> -                if (compound_head(page) != head)
> -                        goto skip;
> -
> -                /* Has the page moved? */
> +                /* Has the page moved or been split? */
>                  if (unlikely(page != xas_reload(&xas)))
>                          goto skip;
> +                page = find_subpage(page, xas.xa_index);
>
>                  if (!PageUptodate(page) ||
>                                  PageReadahead(page) ||
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index d4847026d4b1..7008174c033b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2458,6 +2458,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>                          if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
>                                  shmem_uncharge(head->mapping->host, 1);
>                          put_page(head + i);
> +                } else if (!PageAnon(page)) {
> +                        __xa_store(&head->mapping->i_pages, head[i].index,
> +                                        head + i, 0);
>                  }
>          }
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 449044378782..7ba7a1e4fa79 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1374,7 +1374,7 @@ static void collapse_shmem(struct mm_struct *mm,
>                                  result = SCAN_FAIL;
>                                  goto xa_locked;
>                          }
> -                        xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
> +                        xas_store(&xas, new_page);
>                          nr_none++;
>                          continue;
>                  }
> @@ -1450,7 +1450,7 @@ static void collapse_shmem(struct mm_struct *mm,
>                  list_add_tail(&page->lru, &pagelist);
>
>                  /* Finally, replace with the new page. */
> -                xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
> +                xas_store(&xas, new_page);
>                  continue;
>  out_unlock:
>                  unlock_page(page);
> diff --git a/mm/memfd.c b/mm/memfd.c
> index 650e65a46b9c..bccbf7dff050 100644
> --- a/mm/memfd.c
> +++ b/mm/memfd.c
> @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
>          xas_for_each(xas, page, ULONG_MAX) {
>                  if (xa_is_value(page))
>                          continue;
> +                page = find_subpage(page, xas.xa_index);

This should be xas->xa_index. I fixed this and am trying to test the patch.

Thanks,
Song

>                  if (page_count(page) - page_mapcount(page) > 1)
>                          xas_set_mark(xas, MEMFD_TAG_PINNED);
>
> @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
>                  bool clear = true;
>                  if (xa_is_value(page))
>                          continue;
> +                page = find_subpage(page, xas.xa_index);
>                  if (page_count(page) - page_mapcount(page) != 1) {
>                          /*
>                           * On the last scan, we clean up all those tags
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 412d5fff78d4..8cb55dd69b9c 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -465,7 +465,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
>
>                  for (i = 1; i < HPAGE_PMD_NR; i++) {
>                          xas_next(&xas);
> -                        xas_store(&xas, newpage + i);
> +                        xas_store(&xas, newpage);
>                  }
>          }
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index c8cdaa012f18..a78d4f05a51f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page,
>                  if (xas_error(&xas))
>                          goto unlock;
>  next:
> -                xas_store(&xas, page + i);
> +                xas_store(&xas, page);
>                  if (++i < nr) {
>                          xas_next(&xas);
>                          goto next;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 85245fdec8d9..eb714165afd2 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
>          for (i = 0; i < nr; i++) {
>                  VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
>                  set_page_private(page + i, entry.val + i);
> -                xas_store(&xas, page + i);
> +                xas_store(&xas, page);
>                  xas_next(&xas);
>          }
>          address_space->nrpages += nr;
> @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
>
>          for (i = 0; i < nr; i++) {
>                  void *entry = xas_store(&xas, NULL);
> -                VM_BUG_ON_PAGE(entry != page + i, entry);
> +                VM_BUG_ON_PAGE(entry != page, entry);
>                  set_page_private(page + i, 0);
>                  xas_next(&xas);
>          }
> --
> 2.20.1
>
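
To make the fix Song describes concrete: memfd_tag_pins() receives a
struct xa_state *xas (see the hunk header above), so the index has to be
read through the pointer. A minimal sketch of how that loop would read with
the correction applied (only the lines quoted above; the locking and the
rest of the real function are omitted here):

        static void memfd_tag_pins(struct xa_state *xas)
        {
                struct page *page;

                xas_for_each(xas, page, ULONG_MAX) {
                        if (xa_is_value(page))
                                continue;
                        /* xas is a pointer here, hence xas->xa_index rather than xas.xa_index */
                        page = find_subpage(page, xas->xa_index);
                        if (page_count(page) - page_mapcount(page) > 1)
                                xas_set_mark(xas, MEMFD_TAG_PINNED);
                }
        }

The second memfd.c hunk, in memfd_wait_for_pins(), presumably keeps
xas.xa_index as posted, since that function declares its own local XA_STATE
rather than taking a pointer to one.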