On Mon, Oct 8, 2018 at 3:16 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote:
>
>
>
> On Oct 3, 2018, at 16:31, Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
>
> On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote:
>
>
> Make ceph_sync_read() do an OSD request per object. This avoids large
> memory allocation during large reads.
>
> Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> ---
>  fs/ceph/file.c | 213 +++++++++++++++++++++++--------------------------
>  1 file changed, 101 insertions(+), 112 deletions(-)
>
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 92ab20433682..79c610cb5f54 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -556,75 +556,6 @@ enum {
>         READ_INLINE = 3,
>  };
>
> -/*
> - * Read a range of bytes striped over one or more objects. Iterate over
> - * objects we stripe over. (That's not atomic, but good enough for now.)
> - *
> - * If we get a short result from the OSD, check against i_size; we need to
> - * only return a short read to the caller if we hit EOF.
> - */
> -static int striped_read(struct inode *inode,
> -                       u64 pos, u64 len,
> -                       struct page **pages, int num_pages,
> -                       int page_align, int *checkeof)
> -{
> -       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> -       struct ceph_inode_info *ci = ceph_inode(inode);
> -       u64 this_len;
> -       loff_t i_size;
> -       int page_idx;
> -       int ret, read = 0;
> -       bool hit_stripe, was_short;
> -
> -       /*
> -        * we may need to do multiple reads. not atomic, unfortunately.
> -        */
> -more:
> -       this_len = len;
> -       page_idx = (page_align + read) >> PAGE_SHIFT;
> -       ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
> -                                 &ci->i_layout, pos, &this_len,
> -                                 ci->i_truncate_seq, ci->i_truncate_size,
> -                                 pages + page_idx, num_pages - page_idx,
> -                                 ((page_align + read) & ~PAGE_MASK));
> -       if (ret == -ENOENT)
> -               ret = 0;
> -       hit_stripe = this_len < len;
> -       was_short = ret >= 0 && ret < this_len;
> -       dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read,
> -            ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
> -
> -       i_size = i_size_read(inode);
> -       if (ret >= 0) {
> -               if (was_short && (pos + ret < i_size)) {
> -                       int zlen = min(this_len - ret, i_size - pos - ret);
> -                       int zoff = page_align + read + ret;
> -                       dout(" zero gap %llu to %llu\n",
> -                            pos + ret, pos + ret + zlen);
> -                       ceph_zero_page_vector_range(zoff, zlen, pages);
> -                       ret += zlen;
> -               }
> -
> -               read += ret;
> -               pos += ret;
> -               len -= ret;
> -
> -               /* hit stripe and need continue*/
> -               if (len && hit_stripe && pos < i_size)
> -                       goto more;
> -       }
> -
> -       if (read > 0) {
> -               ret = read;
> -               /* did we bounce off eof? */
> -               if (pos + len > i_size)
> -                       *checkeof = CHECK_EOF;
> -       }
> -
> -       dout("striped_read returns %d\n", ret);
> -       return ret;
> -}
> -
>  /*
>   * Completely synchronous read and write methods. Direct from __user
>   * buffer to osd, or directly to user pages (if O_DIRECT).
> @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode,
>   * If the read spans object boundary, just do multiple reads.
>   */
>  static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
> -                             int *checkeof)
> +                             int *retry_op)
>  {
>         struct file *file = iocb->ki_filp;
>         struct inode *inode = file_inode(file);
> -       struct page **pages;
> -       u64 off = iocb->ki_pos;
> -       int num_pages;
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +       struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> +       struct ceph_osd_client *osdc = &fsc->client->osdc;
>         ssize_t ret;
> -       size_t len = iov_iter_count(to);
> +       u64 off = iocb->ki_pos;
> +       u64 len = iov_iter_count(to);
>
>         dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
>              (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
>
> @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>          * but it will at least behave sensibly when they are
>          * in sequence.
>          */
> -       ret = filemap_write_and_wait_range(inode->i_mapping, off,
> -                                          off + len);
> +       ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len);
>         if (ret < 0)
>                 return ret;
>
> -       if (unlikely(to->type & ITER_PIPE)) {
> +       ret = 0;
> +       while ((len = iov_iter_count(to)) > 0) {
> +               struct ceph_osd_request *req;
> +               struct page **pages;
> +               int num_pages;
>                 size_t page_off;
> -               ret = iov_iter_get_pages_alloc(to, &pages, len,
> -                                              &page_off);
> -               if (ret <= 0)
> -                       return -ENOMEM;
> -               num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +               u64 i_size;
> +               bool more;
> +
> +               req = ceph_osdc_new_request(osdc, &ci->i_layout,
> +                                       ci->i_vino, off, &len, 0, 1,
> +                                       CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> +                                       NULL, ci->i_truncate_seq,
> +                                       ci->i_truncate_size, false);
> +               if (IS_ERR(req)) {
> +                       ret = PTR_ERR(req);
> +                       break;
> +               }
>
> -               ret = striped_read(inode, off, ret, pages, num_pages,
> -                                  page_off, checkeof);
> -               if (ret > 0) {
> -                       iov_iter_advance(to, ret);
> -                       off += ret;
> +               more = len < iov_iter_count(to);
> +
> +               if (unlikely(to->type & ITER_PIPE)) {
> +                       ret = iov_iter_get_pages_alloc(to, &pages, len,
> +                                                      &page_off);
> +                       if (ret <= 0) {
> +                               ceph_osdc_put_request(req);
> +                               ret = -ENOMEM;
> +                               break;
> +                       }
> +                       num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
> +                       if (ret < len) {
> +                               len = ret;
> +                               osd_req_op_extent_update(req, 0, len);
> +                               more = false;
> +                       }
>                 } else {
> -                       iov_iter_advance(to, 0);
> +                       num_pages = calc_pages_for(off, len);
> +                       page_off = off & ~PAGE_MASK;
> +                       pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> +                       if (IS_ERR(pages)) {
> +                               ceph_osdc_put_request(req);
> +                               ret = PTR_ERR(pages);
> +                               break;
> +                       }
>                 }
> -               ceph_put_page_vector(pages, num_pages, false);
> -       } else {
> -               num_pages = calc_pages_for(off, len);
> -               pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
> -               if (IS_ERR(pages))
> -                       return PTR_ERR(pages);
> -
> -               ret = striped_read(inode, off, len, pages, num_pages,
> -                                  (off & ~PAGE_MASK), checkeof);
> -               if (ret > 0) {
> -                       int l, k = 0;
> -                       size_t left = ret;
> -
> -                       while (left) {
> -                               size_t page_off = off & ~PAGE_MASK;
> -                               size_t copy = min_t(size_t, left,
> -                                                   PAGE_SIZE - page_off);
> -                               l = copy_page_to_iter(pages[k++], page_off,
> -                                                     copy, to);
> -                               off += l;
> -                               left -= l;
> -                               if (l < copy)
> +
> +               osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
> +                                                false, false);
> +               ret = ceph_osdc_start_request(osdc, req, false);
> +               if (!ret)
> +                       ret = ceph_osdc_wait_request(osdc, req);
> +               ceph_osdc_put_request(req);
> +
> +               i_size = i_size_read(inode);
> +               dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
> +                    off, len, ret, i_size, (more ? " MORE" : ""));
" MORE" : "")); > + > + if (ret == -ENOENT) > + ret = 0; > + if (ret >= 0 && ret < len && (off + ret < i_size)) { > + int zlen = min(len - ret, i_size - off - ret); > + int zoff = page_off + ret; > + dout("sync_read zero gap %llu~%llu\n", > + off + ret, off + ret + zlen); > + ceph_zero_page_vector_range(zoff, zlen, pages); > + ret += zlen; > + } > + > + if (unlikely(to->type & ITER_PIPE)) { > + if (ret > 0) { > + iov_iter_advance(to, ret); > + off += ret; > + } else { > + iov_iter_advance(to, 0); > + } > + ceph_put_page_vector(pages, num_pages, false); > + } else { > + int idx = 0; > + size_t left = ret > 0 ? ret : 0; > + while (left > 0) { > + size_t len, copied; > + page_off = off & ~PAGE_MASK; > + len = min_t(size_t, left, PAGE_SIZE - page_off); > + copied = copy_page_to_iter(pages[idx++], > + page_off, len, to); > + off += copied; > + left -= copied; > + if (copied < len) { > + ret = -EFAULT; > break; > + } > } > + ceph_release_page_vector(pages, num_pages); > } > - ceph_release_page_vector(pages, num_pages); > + > + if (ret <= 0 || off >= i_size || !more) > + break; > } > > if (off > iocb->ki_pos) { > + if (ret >= 0 && > + iov_iter_count(to) > 0 && off >= i_size_read(inode)) > + *retry_op = CHECK_EOF; > ret = off - iocb->ki_pos; > iocb->ki_pos = off; > } > > - dout("sync_read result %zd\n", ret); > + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); > return ret; > } > > > This is a step forward -- a fixed size allocation vs something > arbitrary up to 2G worth of pages (MAX_RW_COUNT). But upon deeper > look, why allocate any temporary pages at all? Why not just read > directly into iov_iter instead, like you do in the ITER_PIPE case? > > I don't see a reason to keep special casing ITER_PIPE. > > > Sorry for the delay. This is not direct-IO , I think reading directly into iov_iter is not good. Ah, I missed that, although it looks like it might get called in the O_DIRECT case too, on a retry after EOF? Thanks, Ilya