Hi David. I implemented a version with iov_iter_xarray (below). It appears to be doing "the right thing" when it gets called, but then I get a backtrace in the kernel ring buffer "RIP: 0010:read_pages+0x1a1/0x2c0" which is page dumped because: VM_BUG_ON_PAGE(!PageLocked(page)) ------------[ cut here ]------------ kernel BUG at include/linux/pagemap.h:892! So it seems that in mm/readahead.c/read_pages I end up entering the "Clean up the remaining pages" part, and never make it through even one iteration... it happens whether I use readahead_expand or not. I've been looking at it a long time :-), I'll look more tomorrow... do you see anything obvious? static void orangefs_readahead_cleanup(struct xarray *i_pages, pgoff_t index, unsigned int npages, struct iov_iter *iter) { pgoff_t last; struct page *page; XA_STATE(xas, i_pages, index); last = npages - 1; if (iov_iter_count(iter) > 0) iov_iter_zero(iov_iter_count(iter), iter); rcu_read_lock(); xas_for_each(&xas, page, last) { page_endio(page, false, 0); put_page(page); } rcu_read_unlock(); } static void orangefs_readahead(struct readahead_control *rac) { unsigned int npages; loff_t offset; struct iov_iter iter; struct file *file = rac->file; struct inode *inode = file->f_mapping->host; struct xarray *i_pages; pgoff_t index; int ret; loff_t new_start = readahead_index(rac) * PAGE_SIZE; size_t new_len = 524288; readahead_expand(rac, new_start, new_len); npages = readahead_count(rac); offset = readahead_pos(rac); i_pages = &file->f_mapping->i_pages; iov_iter_xarray(&iter, READ, i_pages, offset, npages * PAGE_SIZE); /* read in the pages. */ ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &offset, &iter, npages * PAGE_SIZE, inode->i_size, NULL, NULL, file); /* clean up. */ index = offset >> PAGE_SHIFT; orangefs_readahead_cleanup(i_pages, index, npages, &iter); } On Wed, Mar 24, 2021 at 7:10 AM David Howells <dhowells@xxxxxxxxxx> wrote: > > Mike Marshall <hubcap@xxxxxxxxxxxx> wrote: > > > /* allocate an array of bio_vecs. */ > > bvs = kzalloc(npages * (sizeof(struct bio_vec)), GFP_KERNEL); > > > > Better to use kcalloc() here as it has overflow checking. > > > /* hook the bio_vecs to the pages. */ > > for (i = 0; i < npages; i++) { > > bvs[i].bv_page = pages[i]; > > bvs[i].bv_len = PAGE_SIZE; > > bvs[i].bv_offset = 0; > > } > > > > iov_iter_bvec(&iter, READ, bvs, npages, npages * PAGE_SIZE); > > > > /* read in the pages. */ > > ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &offset, &iter, > > npages * PAGE_SIZE, inode->i_size, NULL, NULL, file); > > > > /* clean up. */ > > for (i = 0; i < npages; i++) { > > SetPageUptodate(bvs[i].bv_page); > > unlock_page(bvs[i].bv_page); > > put_page(bvs[i].bv_page); > > } > > kfree(pages); > > kfree(bvs); > > } > > Could you try ITER_XARRAY instead of ITER_BVEC: > > https://lore.kernel.org/linux-fsdevel/161653786033.2770958.14154191921867463240.stgit@xxxxxxxxxxxxxxxxxxxxxx/T/#u > > Setting the iterator looks like: > > iov_iter_xarray(&iter, READ, &mapping->i_pages, > offset, npages * PAGE_SIZE); > > The xarray iterator will handle THPs, but I'm not sure if bvecs will. > > Cleanup afterwards would look something like: > > static void afs_file_read_done(struct afs_read *req) > { > struct afs_vnode *vnode = req->vnode; > struct page *page; > pgoff_t index = req->pos >> PAGE_SHIFT; > pgoff_t last = index + req->nr_pages - 1; > > XA_STATE(xas, &vnode->vfs_inode.i_mapping->i_pages, index); > > if (iov_iter_count(req->iter) > 0) { > /* The read was short - clear the excess buffer. */ > _debug("afterclear %zx %zx %llx/%llx", > req->iter->iov_offset, > iov_iter_count(req->iter), > req->actual_len, req->len); > iov_iter_zero(iov_iter_count(req->iter), req->iter); > } > > rcu_read_lock(); > xas_for_each(&xas, page, last) { > page_endio(page, false, 0); > put_page(page); > } > rcu_read_unlock(); > > task_io_account_read(req->len); > req->cleanup = NULL; > } > > David >