Re: [Linux-cachefs] [RFC PATCH v2 09/11] ceph: convert readpages to fscache_read_helper

Jeff Layton <jlayton@xxxxxxxxxx> · Mon, 10 Aug 2020 07:09:21 -0400

On Sun, 2020-08-09 at 11:09 -0400, David Wysochanski wrote:
> On Fri, Jul 31, 2020 at 9:05 AM Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> > Convert ceph_readpages to use the fscache_read_helper. With this we can
> > rip out a lot of the old readpage/readpages infrastructure.
> > 
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >  fs/ceph/addr.c | 209 +++++++------------------------------------------
> >  1 file changed, 28 insertions(+), 181 deletions(-)
> > 
> > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > index cee497c108bb..8905fe4a0930 100644
> > --- a/fs/ceph/addr.c
> > +++ b/fs/ceph/addr.c
> > @@ -377,76 +377,23 @@ static int ceph_readpage(struct file *filp, struct page *page)
> >         return err;
> >  }
> > 
> > -/*
> > - * Finish an async read(ahead) op.
> > - */
> > -static void finish_read(struct ceph_osd_request *req)
> > -{
> > -       struct inode *inode = req->r_inode;
> > -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > -       struct ceph_osd_data *osd_data;
> > -       int rc = req->r_result <= 0 ? req->r_result : 0;
> > -       int bytes = req->r_result >= 0 ? req->r_result : 0;
> > -       int num_pages;
> > -       int i;
> > -
> > -       dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
> > -       if (rc == -EBLACKLISTED)
> > -               ceph_inode_to_client(inode)->blacklisted = true;
> > -
> > -       /* unlock all pages, zeroing any data we didn't read */
> > -       osd_data = osd_req_op_extent_osd_data(req, 0);
> > -       BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
> > -       num_pages = calc_pages_for((u64)osd_data->alignment,
> > -                                       (u64)osd_data->length);
> > -       for (i = 0; i < num_pages; i++) {
> > -               struct page *page = osd_data->pages[i];
> > -
> > -               if (rc < 0 && rc != -ENOENT)
> > -                       goto unlock;
> > -               if (bytes < (int)PAGE_SIZE) {
> > -                       /* zero (remainder of) page */
> > -                       int s = bytes < 0 ? 0 : bytes;
> > -                       zero_user_segment(page, s, PAGE_SIZE);
> > -               }
> > -               dout("finish_read %p uptodate %p idx %lu\n", inode, page,
> > -                    page->index);
> > -               flush_dcache_page(page);
> > -               SetPageUptodate(page);
> > -unlock:
> > -               unlock_page(page);
> > -               put_page(page);
> > -               bytes -= PAGE_SIZE;
> > -       }
> > -
> > -       ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency,
> > -                                req->r_end_latency, rc);
> > -
> > -       kfree(osd_data->pages);
> > -}
> > -
> > -/*
> > - * start an async read(ahead) operation.  return nr_pages we submitted
> > - * a read for on success, or negative error code.
> > - */
> > -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
> > -                     struct list_head *page_list, int max)
> > +static int ceph_readpages(struct file *file, struct address_space *mapping,
> > +                         struct list_head *page_list, unsigned nr_pages)
> >  {
> > -       struct ceph_osd_client *osdc =
> > -               &ceph_inode_to_client(inode)->client->osdc;
> > +       struct inode *inode = file_inode(file);
> >         struct ceph_inode_info *ci = ceph_inode(inode);
> > -       struct page *page = lru_to_page(page_list);
> > -       struct ceph_vino vino;
> > -       struct ceph_osd_request *req;
> > -       u64 off;
> > -       u64 len;
> > -       int i;
> > -       struct page **pages;
> > -       pgoff_t next_index;
> > -       int nr_pages = 0;
> > +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > +       struct ceph_file_info *fi = file->private_data;
> > +       struct ceph_rw_context *rw_ctx;
> > +       struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
> >         int got = 0;
> >         int ret = 0;
> > +       int max = fsc->mount_options->rsize >> PAGE_SHIFT;
> 
> Have you ran tests with different values of rsize?
> Specifically, rsize < readahead_size == size_of_readpages
> 
> I'm seeing a lot of problems with NFS when varying rsize are used wrt
> readahead values.  Specifically I'm seeing panics because fscache
> expects a 1:1 mapping of issue_op() to io_done() calls, and I get
> panics because multiple read completions are trying to unlock the
> same pages inside fscache_read_done().
> 
> My understanding is afs does not have such 'rsize' limitation, so it
> may not be an area that is well tested.  It could be my implementation
> of the NFS conversion though, as I thinkwhat needs to happen is the
> respect the above 1:1 mapping of issue_op() to io_done() calls, and my
> initial implementation did not do that.
> 
> FWIW, specifically this unit test was originally failing for me with a panic.
> Sun 09 Aug 2020 11:03:22 AM EDT: 1. On NFS client, install and enable
> cachefilesd
> Sun 09 Aug 2020 11:03:22 AM EDT: 2. On NFS client, mount -o
> vers=4.1,fsc,rsize=16384 127.0.0.1:/export/dir1 /mnt/dir1
> Sun 09 Aug 2020 11:03:22 AM EDT: 3. On NFS client, dd if=/dev/zero
> of=/mnt/dir1/file1.bin bs=65536 count=1
> Sun 09 Aug 2020 11:03:22 AM EDT: 4. On NFS client, echo 3 >
> /proc/sys/vm/drop_caches
> Sun 09 Aug 2020 11:03:22 AM EDT: 5. On NFS client, ./nfs-readahead.sh
> set /mnt/dir1 65536
> Sun 09 Aug 2020 11:03:23 AM EDT: 8. On NFS client, echo 3 >
> /proc/sys/vm/drop_caches
> Sun 09 Aug 2020 11:03:23 AM EDT: 9. On NFS client, dd
> if=/mnt/dir1/file1.bin of=/dev/null
> 
> 

I haven't tested much with varying rsize and wsize (setting them on
cephfs is pretty rare), but I'll plan to. What's in nfs-readahead.sh?

> 
> > +
> > +       if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
> > +               return -EINVAL;
> > 
> > +       rw_ctx = ceph_find_rw_context(fi);
> >         if (!rw_ctx) {
> >                 /* caller of readpages does not hold buffer and read caps
> >                  * (fadvise, madvise and readahead cases) */
> > @@ -459,133 +406,33 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
> >                         dout("start_read %p, no cache cap\n", inode);
> >                         ret = 0;
> >                 }
> > -               if (ret <= 0) {
> > -                       if (got)
> > -                               ceph_put_cap_refs(ci, got);
> > -                       while (!list_empty(page_list)) {
> > -                               page = lru_to_page(page_list);
> > -                               list_del(&page->lru);
> > -                               put_page(page);
> > -                       }
> > -                       return ret;
> > -               }
> > +               if (ret <= 0)
> > +                       goto out;
> >         }
> > 
> > -       off = (u64) page_offset(page);
> > +       dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
> > +            inode, file, rw_ctx, nr_pages, max);
> > 
> > -       /* count pages */
> > -       next_index = page->index;
> > -       list_for_each_entry_reverse(page, page_list, lru) {
> > -               if (page->index != next_index)
> > -                       break;
> > -               nr_pages++;
> > -               next_index++;
> > -               if (max && nr_pages == max)
> > -                       break;
> > -       }
> > -       len = nr_pages << PAGE_SHIFT;
> > -       dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
> > -            off, len);
> > -       vino = ceph_vino(inode);
> > -       req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
> > -                                   0, 1, CEPH_OSD_OP_READ,
> > -                                   CEPH_OSD_FLAG_READ, NULL,
> > -                                   ci->i_truncate_seq, ci->i_truncate_size,
> > -                                   false);
> > -       if (IS_ERR(req)) {
> > -               ret = PTR_ERR(req);
> > -               goto out;
> > -       }
> > +       while (ret >= 0 && !list_empty(page_list)) {
> > +               struct ceph_fscache_req *req = ceph_fsreq_alloc();
> > 
> > -       /* build page vector */
> > -       nr_pages = calc_pages_for(0, len);
> > -       pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
> > -       if (!pages) {
> > -               ret = -ENOMEM;
> > -               goto out_put;
> > -       }
> > -       for (i = 0; i < nr_pages; ++i) {
> > -               page = list_entry(page_list->prev, struct page, lru);
> > -               BUG_ON(PageLocked(page));
> > -               list_del(&page->lru);
> > -
> > -               dout("start_read %p adding %p idx %lu\n", inode, page,
> > -                    page->index);
> > -               if (add_to_page_cache_lru(page, &inode->i_data, page->index,
> > -                                         GFP_KERNEL)) {
> > -                       put_page(page);
> > -                       dout("start_read %p add_to_page_cache failed %p\n",
> > -                            inode, page);
> > -                       nr_pages = i;
> > -                       if (nr_pages > 0) {
> > -                               len = nr_pages << PAGE_SHIFT;
> > -                               osd_req_op_extent_update(req, 0, len);
> > -                               break;
> > -                       }
> > -                       goto out_pages;
> > +               if (!req) {
> > +                       ret = -ENOMEM;
> > +                       break;
> >                 }
> > -               pages[i] = page;
> > -       }
> > -       osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
> > -       req->r_callback = finish_read;
> > -       req->r_inode = inode;
> > -
> > -       dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
> > -       ret = ceph_osdc_start_request(osdc, req, false);
> > -       if (ret < 0)
> > -               goto out_pages;
> > -       ceph_osdc_put_request(req);
> > -
> > -       /* After adding locked pages to page cache, the inode holds cache cap.
> > -        * So we can drop our cap refs. */
> > -       if (got)
> > -               ceph_put_cap_refs(ci, got);
> > -
> > -       return nr_pages;
> > +               fscache_init_io_request(&req->fscache_req, cookie, &ceph_readpage_fsreq_ops);
> > +               req->fscache_req.mapping = inode->i_mapping;
> > 
> > -out_pages:
> > -       for (i = 0; i < nr_pages; ++i) {
> > -               unlock_page(pages[i]);
> > +               ret = fscache_read_helper_page_list(&req->fscache_req, page_list, max);
> > +               ceph_fsreq_put(&req->fscache_req);
> >         }
> > -       ceph_put_page_vector(pages, nr_pages, false);
> > -out_put:
> > -       ceph_osdc_put_request(req);
> >  out:
> > +       /* After adding locked pages to page cache, the inode holds Fc refs. We can drop ours. */
> >         if (got)
> >                 ceph_put_cap_refs(ci, got);
> > -       return ret;
> > -}
> > 
> > -
> > -/*
> > - * Read multiple pages.  Leave pages we don't read + unlock in page_list;
> > - * the caller (VM) cleans them up.
> > - */
> > -static int ceph_readpages(struct file *file, struct address_space *mapping,
> > -                         struct list_head *page_list, unsigned nr_pages)
> > -{
> > -       struct inode *inode = file_inode(file);
> > -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > -       struct ceph_file_info *fi = file->private_data;
> > -       struct ceph_rw_context *rw_ctx;
> > -       int rc = 0;
> > -       int max = 0;
> > -
> > -       if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
> > -               return -EINVAL;
> > -
> > -       rw_ctx = ceph_find_rw_context(fi);
> > -       max = fsc->mount_options->rsize >> PAGE_SHIFT;
> > -       dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
> > -            inode, file, rw_ctx, nr_pages, max);
> > -       while (!list_empty(page_list)) {
> > -               rc = start_read(inode, rw_ctx, page_list, max);
> > -               if (rc < 0)
> > -                       goto out;
> > -       }
> > -out:
> > -       dout("readpages %p file %p ret %d\n", inode, file, rc);
> > -       return rc;
> > +       dout("readpages %p file %p ret %d\n", inode, file, ret);
> > +       return ret;
> >  }
> > 
> >  struct ceph_writeback_ctl
> > --
> > 2.26.2
> > 
> > 
> > --
> > Linux-cachefs mailing list
> > Linux-cachefs@xxxxxxxxxx
> > https://www.redhat.com/mailman/listinfo/linux-cachefs
> > 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>