Re: [PATCH] ceph: refactor ceph_sync_read()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 9, 2018 at 12:48 AM Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
>
> On Mon, Oct 8, 2018 at 3:16 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote:
> >
> >
> >
> > On Oct 3, 2018, at 16:31, Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
> >
> > On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote:
> >
> >
> > Make ceph_sync_read() do OSD request per object. This avoid large memory
> > allocation during large read.
> >
> > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > ---
> > fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
> > 1 file changed, 101 insertions(+), 112 deletions(-)
> >
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 92ab20433682..79c610cb5f54 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -556,75 +556,6 @@ enum {
> >        READ_INLINE =  3,
> > };
> >
> > -/*
> > - * Read a range of bytes striped over one or more objects.  Iterate over
> > - * objects we stripe over.  (That's not atomic, but good enough for now.)
> > - *
> > - * If we get a short result from the OSD, check against i_size; we need to
> > - * only return a short read to the caller if we hit EOF.
> > - */
> > -static int striped_read(struct inode *inode,
> > -                       u64 pos, u64 len,
> > -                       struct page **pages, int num_pages,
> > -                       int page_align, int *checkeof)
> > -{
> > -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > -       struct ceph_inode_info *ci = ceph_inode(inode);
> > -       u64 this_len;
> > -       loff_t i_size;
> > -       int page_idx;
> > -       int ret, read = 0;
> > -       bool hit_stripe, was_short;
> > -
> > -       /*
> > -        * we may need to do multiple reads.  not atomic, unfortunately.
> > -        */
> > -more:
> > -       this_len = len;
> > -       page_idx = (page_align + read) >> PAGE_SHIFT;
> > -       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
> > -                                 &ci->i_layout, pos, &this_len,
> > -                                 ci->i_truncate_seq, ci->i_truncate_size,
> > -                                 pages + page_idx, num_pages - page_idx,
> > -                                 ((page_align + read) & ~PAGE_MASK));
> > -       if (ret == -ENOENT)
> > -               ret = 0;
> > -       hit_stripe = this_len < len;
> > -       was_short = ret >= 0 && ret < this_len;
> > -       dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
> > -            ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
> > -
> > -       i_size = i_size_read(inode);
> > -       if (ret >= 0) {
> > -               if (was_short && (pos + ret < i_size)) {
> > -                       int zlen = min(this_len - ret, i_size - pos - ret);
> > -                       int zoff = page_align + read + ret;
> > -                       dout(" zero gap %llu to %llu\n",
> > -                            pos + ret, pos + ret + zlen);
> > -                       ceph_zero_page_vector_range(zoff, zlen, pages);
> > -                       ret += zlen;
> > -               }
> > -
> > -               read += ret;
> > -               pos += ret;
> > -               len -= ret;
> > -
> > -               /* hit stripe and need continue*/
> > -               if (len && hit_stripe && pos < i_size)
> > -                       goto more;
> > -       }
> > -
> > -       if (read > 0) {
> > -               ret = read;
> > -               /* did we bounce off eof? */
> > -               if (pos + len > i_size)
> > -                       *checkeof = CHECK_EOF;
> > -       }
> > -
> > -       dout("striped_read returns %d\n", ret);
> > -       return ret;
> > -}
> > -
> > /*
> >  * Completely synchronous read and write methods. Direct from __user
> >  * buffer to osd, or directly to user pages (if O_DIRECT).
> > @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode,
> >  * If the read spans object boundary, just do multiple reads.
> >  */
> > static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> > -                             int *checkeof)
> > +                             int *retry_op)
> > {
> >        struct file *file = iocb->ki_filp;
> >        struct inode *inode = file_inode(file);
> > -       struct page **pages;
> > -       u64 off = iocb->ki_pos;
> > -       int num_pages;
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > +       struct ceph_osd_client *osdc = &fsc->client->osdc;
> >        ssize_t ret;
> > -       size_t len = iov_iter_count(to);
> > +       u64 off = iocb->ki_pos;
> > +       u64 len = iov_iter_count(to);
> >
> >        dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
> >             (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
> > @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> >         * but it will at least behave sensibly when they are
> >         * in sequence.
> >         */
> > -       ret = filemap_write_and_wait_range(inode->i_mapping, off,
> > -                                               off + len);
> > +       ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
> >        if (ret < 0)
> >                return ret;
> >
> > -       if (unlikely(to->type & ITER_PIPE)) {
> > +       ret = 0;
> > +       while ((len = iov_iter_count(to)) > 0) {
> > +               struct ceph_osd_request *req;
> > +               struct page **pages;
> > +               int num_pages;
> >                size_t page_off;
> > -               ret = iov_iter_get_pages_alloc(to, &pages, len,
> > -                                              &page_off);
> > -               if (ret <= 0)
> > -                       return -ENOMEM;
> > -               num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> > +               u64 i_size;
> > +               bool more;
> > +
> > +               req = ceph_osdc_new_request(osdc, &ci->i_layout,
> > +                                       ci->i_vino, off, &len, 0, 1,
> > +                                       CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> > +                                       NULL, ci->i_truncate_seq,
> > +                                       ci->i_truncate_size, false);
> > +               if (IS_ERR(req)) {
> > +                       ret = PTR_ERR(req);
> > +                       break;
> > +               }
> >
> > -               ret = striped_read(inode, off, ret, pages, num_pages,
> > -                                  page_off, checkeof);
> > -               if (ret > 0) {
> > -                       iov_iter_advance(to, ret);
> > -                       off += ret;
> > +               more = len < iov_iter_count(to);
> > +
> > +               if (unlikely(to->type & ITER_PIPE)) {
> > +                       ret = iov_iter_get_pages_alloc(to, &pages, len,
> > +                                                      &page_off);
> > +                       if (ret <= 0) {
> > +                               ceph_osdc_put_request(req);
> > +                               ret = -ENOMEM;
> > +                               break;
> > +                       }
> > +                       num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> > +                       if (ret < len) {
> > +                               len = ret;
> > +                               osd_req_op_extent_update(req, 0, len);
> > +                               more = false;
> > +                       }
> >                } else {
> > -                       iov_iter_advance(to, 0);
> > +                       num_pages = calc_pages_for(off, len);
> > +                       page_off = off & ~PAGE_MASK;
> > +                       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> > +                       if (IS_ERR(pages)) {
> > +                               ceph_osdc_put_request(req);
> > +                               ret = PTR_ERR(pages);
> > +                               break;
> > +                       }
> >                }
> > -               ceph_put_page_vector(pages, num_pages, false);
> > -       } else {
> > -               num_pages = calc_pages_for(off, len);
> > -               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> > -               if (IS_ERR(pages))
> > -                       return PTR_ERR(pages);
> > -
> > -               ret = striped_read(inode, off, len, pages, num_pages,
> > -                                  (off & ~PAGE_MASK), checkeof);
> > -               if (ret > 0) {
> > -                       int l, k = 0;
> > -                       size_t left = ret;
> > -
> > -                       while (left) {
> > -                               size_t page_off = off & ~PAGE_MASK;
> > -                               size_t copy = min_t(size_t, left,
> > -                                                   PAGE_SIZE - page_off);
> > -                               l = copy_page_to_iter(pages[k++], page_off,
> > -                                                     copy, to);
> > -                               off += l;
> > -                               left -= l;
> > -                               if (l < copy)
> > +
> > +               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> > +                                                false, false);
> > +               ret = ceph_osdc_start_request(osdc, req, false);
> > +               if (!ret)
> > +                       ret = ceph_osdc_wait_request(osdc, req);
> > +               ceph_osdc_put_request(req);
> > +
> > +               i_size = i_size_read(inode);
> > +               dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
> > +                    off, len, ret, i_size, (more ? " MORE" : ""));
> > +
> > +               if (ret == -ENOENT)
> > +                       ret = 0;
> > +               if (ret >= 0 && ret < len && (off + ret < i_size)) {
> > +                       int zlen = min(len - ret, i_size - off - ret);
> > +                       int zoff = page_off + ret;
> > +                       dout("sync_read zero gap %llu~%llu\n",
> > +                             off + ret, off + ret + zlen);
> > +                       ceph_zero_page_vector_range(zoff, zlen, pages);
> > +                       ret += zlen;
> > +               }
> > +
> > +               if (unlikely(to->type & ITER_PIPE)) {
> > +                       if (ret > 0) {
> > +                               iov_iter_advance(to, ret);
> > +                               off += ret;
> > +                       } else {
> > +                               iov_iter_advance(to, 0);
> > +                       }
> > +                       ceph_put_page_vector(pages, num_pages, false);
> > +               } else {
> > +                       int idx = 0;
> > +                       size_t left = ret > 0 ? ret : 0;
> > +                       while (left > 0) {
> > +                               size_t len, copied;
> > +                               page_off = off & ~PAGE_MASK;
> > +                               len = min_t(size_t, left, PAGE_SIZE - page_off);
> > +                               copied = copy_page_to_iter(pages[idx++],
> > +                                                          page_off, len, to);
> > +                               off += copied;
> > +                               left -= copied;
> > +                               if (copied < len) {
> > +                                       ret = -EFAULT;
> >                                        break;
> > +                               }
> >                        }
> > +                       ceph_release_page_vector(pages, num_pages);
> >                }
> > -               ceph_release_page_vector(pages, num_pages);
> > +
> > +               if (ret <= 0 || off >= i_size || !more)
> > +                       break;
> >        }
> >
> >        if (off > iocb->ki_pos) {
> > +               if (ret >= 0 &&
> > +                   iov_iter_count(to) > 0 && off >= i_size_read(inode))
> > +                       *retry_op = CHECK_EOF;
> >                ret = off - iocb->ki_pos;
> >                iocb->ki_pos = off;
> >        }
> >
> > -       dout("sync_read result %zd\n", ret);
> > +       dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
> >        return ret;
> > }
> >
> >
> > This is a step forward -- a fixed size allocation vs something
> > arbitrary up to 2G worth of pages (MAX_RW_COUNT).  But upon deeper
> > look, why allocate any temporary pages at all?  Why not just read
> > directly into iov_iter instead, like you do in the ITER_PIPE case?
> >
> > I don't see a reason to keep special casing ITER_PIPE.
> >
> >
> > Sorry for the delay.  This is not direct-IO , I think reading directly into iov_iter is not good.
>
> Ah, I missed that, although it looks like it might get called in the
> O_DIRECT case too, on a retry after EOF?
>

Yes. But we need to use bio vector for direct read (see commit
fc218544fbc80). The problem is there is no helper function that copies
data bio vector into iov_iter.

Regards
Yan, Zheng


> Thanks,
>
>                 Ilya



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux