On Tue, Aug 20, 2019 at 05:30:37PM -0700, Matthew Wilcox wrote: > From: "Matthew Wilcox (Oracle)" <willy@xxxxxxxxxxxxx> > > Change iomap_page from a statically sized uptodate bitmap to a dynamically > allocated uptodate bitmap, allowing an arbitrarily large page. > > The only remaining places where iomap assumes an order-0 page are for > files with inline data, where there's no sense in allocating a larger > page. I wonder, will anything bad happen if that occurs? (XFS doesn't have inline data for files so I have no idea...) > Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx> > --- > fs/iomap/buffered-io.c | 119 ++++++++++++++++++++++++++--------------- > include/linux/iomap.h | 2 +- > include/linux/mm.h | 2 + > 3 files changed, 80 insertions(+), 43 deletions(-) > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c > index 0e76a4b6d98a..15d844a88439 100644 > --- a/fs/iomap/buffered-io.c > +++ b/fs/iomap/buffered-io.c > @@ -23,14 +23,14 @@ static struct iomap_page * > iomap_page_create(struct inode *inode, struct page *page) > { > struct iomap_page *iop = to_iomap_page(page); > + unsigned int n; > > if (iop || i_blocks_per_page(inode, page) <= 1) > return iop; > > - iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); > - atomic_set(&iop->read_count, 0); > - atomic_set(&iop->write_count, 0); > - bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); > + n = BITS_TO_LONGS(i_blocks_per_page(inode, page)); > + iop = kmalloc(struct_size(iop, uptodate, n), > + GFP_NOFS | __GFP_NOFAIL | __GFP_ZERO); > > /* > * migrate_page_move_mapping() assumes that pages with private data have > @@ -61,15 +61,16 @@ iomap_page_release(struct page *page) > * Calculate the range inside the page that we actually need to read. > */ > static void > -iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, > +iomap_adjust_read_range(struct inode *inode, struct page *page, > loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp) > { > + struct iomap_page *iop = to_iomap_page(page); > loff_t orig_pos = *pos; > loff_t isize = i_size_read(inode); > unsigned block_bits = inode->i_blkbits; > unsigned block_size = (1 << block_bits); > - unsigned poff = offset_in_page(*pos); > - unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length); > + unsigned poff = offset_in_this_page(page, *pos); > + unsigned plen = min_t(loff_t, page_size(page) - poff, length); > unsigned first = poff >> block_bits; > unsigned last = (poff + plen - 1) >> block_bits; > > @@ -107,7 +108,8 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, > * page cache for blocks that are entirely outside of i_size. > */ > if (orig_pos <= isize && orig_pos + length > isize) { > - unsigned end = offset_in_page(isize - 1) >> block_bits; > + unsigned end = offset_in_this_page(page, isize - 1) >> > + block_bits; > > if (first <= end && last > end) > plen -= (last - end) * block_size; > @@ -121,19 +123,16 @@ static void > iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) > { > struct iomap_page *iop = to_iomap_page(page); > - struct inode *inode = page->mapping->host; > - unsigned first = off >> inode->i_blkbits; > - unsigned last = (off + len - 1) >> inode->i_blkbits; > - unsigned int i; > bool uptodate = true; > > if (iop) { > - for (i = 0; i < i_blocks_per_page(inode, page); i++) { > - if (i >= first && i <= last) > - set_bit(i, iop->uptodate); > - else if (!test_bit(i, iop->uptodate)) > - uptodate = false; > - } > + struct inode *inode = page->mapping->host; > + unsigned first = off >> inode->i_blkbits; > + unsigned count = len >> inode->i_blkbits; > + > + bitmap_set(iop->uptodate, first, count); > + if (!bitmap_full(iop->uptodate, i_blocks_per_page(inode, page))) > + uptodate = false; > } > > if (uptodate && !PageError(page)) > @@ -194,6 +193,7 @@ iomap_read_inline_data(struct inode *inode, struct page *page, > return; > > BUG_ON(page->index); > + BUG_ON(PageCompound(page)); > BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); > > addr = kmap_atomic(page); > @@ -203,6 +203,16 @@ iomap_read_inline_data(struct inode *inode, struct page *page, > SetPageUptodate(page); > } > > +/* > + * Estimate the number of vectors we need based on the current page size; > + * if we're wrong we'll end up doing an overly large allocation or needing > + * to do a second allocation, neither of which is a big deal. > + */ > +static unsigned int iomap_nr_vecs(struct page *page, loff_t length) > +{ > + return (length + page_size(page) - 1) >> page_shift(page); > +} > + > static loff_t > iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > struct iomap *iomap) > @@ -222,7 +232,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > } > > /* zero post-eof blocks as the page may be mapped */ > - iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); > + iomap_adjust_read_range(inode, page, &pos, length, &poff, &plen); > if (plen == 0) > goto done; > > @@ -258,7 +268,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { > gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); > - int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; > + int nr_vecs = iomap_nr_vecs(page, length); > > if (ctx->bio) > submit_bio(ctx->bio); > @@ -293,9 +303,9 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) > unsigned poff; > loff_t ret; > > - for (poff = 0; poff < PAGE_SIZE; poff += ret) { > - ret = iomap_apply(inode, page_offset(page) + poff, > - PAGE_SIZE - poff, 0, ops, &ctx, > + for (poff = 0; poff < page_size(page); poff += ret) { > + ret = iomap_apply(inode, file_offset_of_page(page) + poff, > + page_size(page) - poff, 0, ops, &ctx, > iomap_readpage_actor); > if (ret <= 0) { > WARN_ON_ONCE(ret == 0); > @@ -328,7 +338,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, > while (!list_empty(pages)) { > struct page *page = lru_to_page(pages); > > - if (page_offset(page) >= (u64)pos + length) > + if (file_offset_of_page(page) >= (u64)pos + length) > break; > > list_del(&page->lru); > @@ -342,7 +352,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, > * readpages call itself as every page gets checked again once > * actually needed. > */ > - *done += PAGE_SIZE; > + *done += page_size(page); > put_page(page); > } > > @@ -355,9 +365,14 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > { > struct iomap_readpage_ctx *ctx = data; > loff_t done, ret; > + size_t left = 0; > + > + if (ctx->cur_page) > + left = page_size(ctx->cur_page) - > + offset_in_this_page(ctx->cur_page, pos); > > for (done = 0; done < length; done += ret) { > - if (ctx->cur_page && offset_in_page(pos + done) == 0) { > + if (ctx->cur_page && left == 0) { > if (!ctx->cur_page_in_bio) > unlock_page(ctx->cur_page); > put_page(ctx->cur_page); > @@ -369,14 +384,27 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > if (!ctx->cur_page) > break; > ctx->cur_page_in_bio = false; > + left = page_size(ctx->cur_page); > } > ret = iomap_readpage_actor(inode, pos + done, length - done, > ctx, iomap); > + left -= ret; > } > > return done; > } > > +/* move to fs.h? */ > +static inline struct page *readahead_first_page(struct list_head *head) > +{ > + return list_entry(head->prev, struct page, lru); > +} > + > +static inline struct page *readahead_last_page(struct list_head *head) > +{ > + return list_entry(head->next, struct page, lru); > +} > + > int > iomap_readpages(struct address_space *mapping, struct list_head *pages, > unsigned nr_pages, const struct iomap_ops *ops) > @@ -385,9 +413,10 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages, > .pages = pages, > .is_readahead = true, > }; > - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); > - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); > - loff_t length = last - pos + PAGE_SIZE, ret = 0; > + loff_t pos = file_offset_of_page(readahead_first_page(pages)); > + loff_t end = file_offset_of_next_page(readahead_last_page(pages)); > + loff_t length = end - pos; > + loff_t ret = 0; > > while (length > 0) { > ret = iomap_apply(mapping->host, pos, length, 0, ops, > @@ -410,7 +439,7 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages, > } > > /* > - * Check that we didn't lose a page due to the arcance calling > + * Check that we didn't lose a page due to the arcane calling > * conventions.. > */ > WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); > @@ -435,7 +464,7 @@ iomap_is_partially_uptodate(struct page *page, unsigned long from, > unsigned i; > > /* Limit range to one page */ > - len = min_t(unsigned, PAGE_SIZE - from, count); > + len = min_t(unsigned, page_size(page) - from, count); > > /* First and last blocks in range within page */ > first = from >> inode->i_blkbits; > @@ -474,7 +503,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) > * If we are invalidating the entire page, clear the dirty state from it > * and release it to avoid unnecessary buildup of the LRU. > */ > - if (offset == 0 && len == PAGE_SIZE) { > + if (offset == 0 && len == page_size(page)) { > WARN_ON_ONCE(PageWriteback(page)); > cancel_dirty_page(page); > iomap_page_release(page); > @@ -550,18 +579,20 @@ static int > __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, > struct page *page, struct iomap *iomap) > { > - struct iomap_page *iop = iomap_page_create(inode, page); > loff_t block_size = i_blocksize(inode); > loff_t block_start = pos & ~(block_size - 1); > loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1); > - unsigned from = offset_in_page(pos), to = from + len, poff, plen; > + unsigned from = offset_in_this_page(page, pos); > + unsigned to = from + len; > + unsigned poff, plen; > int status = 0; > > if (PageUptodate(page)) > return 0; > + iomap_page_create(inode, page); > > do { > - iomap_adjust_read_range(inode, iop, &block_start, > + iomap_adjust_read_range(inode, page, &block_start, > block_end - block_start, &poff, &plen); > if (plen == 0) > break; > @@ -673,7 +704,7 @@ __iomap_write_end(struct inode *inode, loff_t pos, unsigned len, > */ > if (unlikely(copied < len && !PageUptodate(page))) > return 0; > - iomap_set_range_uptodate(page, offset_in_page(pos), len); > + iomap_set_range_uptodate(page, offset_in_this_page(page, pos), len); > iomap_set_page_dirty(page); > return copied; > } > @@ -685,6 +716,7 @@ iomap_write_end_inline(struct inode *inode, struct page *page, > void *addr; > > WARN_ON_ONCE(!PageUptodate(page)); > + BUG_ON(PageCompound(page)); > BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); > > addr = kmap_atomic(page); > @@ -749,6 +781,10 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > unsigned long bytes; /* Bytes to write to page */ > size_t copied; /* Bytes copied from user */ > > + /* > + * XXX: We don't know what size page we'll find in the > + * page cache, so only copy up to a regular page boundary. How might we fix this? > + */ > offset = offset_in_page(pos); > bytes = min_t(unsigned long, PAGE_SIZE - offset, > iov_iter_count(i)); > @@ -1041,19 +1077,18 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) > lock_page(page); > size = i_size_read(inode); > if ((page->mapping != inode->i_mapping) || > - (page_offset(page) > size)) { > + (file_offset_of_page(page) > size)) { > /* We overload EFAULT to mean page got truncated */ > ret = -EFAULT; > goto out_unlock; > } > > - /* page is wholly or partially inside EOF */ > - if (((page->index + 1) << PAGE_SHIFT) > size) > - length = offset_in_page(size); > + offset = file_offset_of_page(page); > + if (size - offset < page_size(page)) > + length = offset_in_this_page(page, size); > else > - length = PAGE_SIZE; > + length = page_size(page); > > - offset = page_offset(page); > while (length > 0) { > ret = iomap_apply(inode, offset, length, > IOMAP_WRITE | IOMAP_FAULT, ops, page, > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index bc499ceae392..86be24a8259b 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -139,7 +139,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, > struct iomap_page { > atomic_t read_count; > atomic_t write_count; > - DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); > + unsigned long uptodate[]; > }; > > static inline struct iomap_page *to_iomap_page(struct page *page) > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 726d7f046b49..6892cd712428 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1414,6 +1414,8 @@ static inline void clear_page_pfmemalloc(struct page *page) > extern void pagefault_out_of_memory(void); > > #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) > +#define offset_in_this_page(page, p) \ > + ((unsigned long)(p) & (page_size(page) - 1)) What's the difference between these two macros? I guess the macro with the longer name works for compound pages? Whereas the first one only works with order-0 pages? --D > > /* > * Flags passed to show_mem() and show_free_areas() to suppress output in > -- > 2.23.0.rc1 >