On Tue, Oct 9, 2018 at 12:48 AM Ilya Dryomov <idryomov@xxxxxxxxx> wrote: > > On Mon, Oct 8, 2018 at 3:16 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote: > > > > > > > > On Oct 3, 2018, at 16:31, Ilya Dryomov <idryomov@xxxxxxxxx> wrote: > > > > On Sun, Sep 30, 2018 at 3:58 AM Yan, Zheng <zyan@xxxxxxxxxx> wrote: > > > > > > Make ceph_sync_read() do OSD request per object. This avoid large memory > > allocation during large read. > > > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx> > > --- > > fs/ceph/file.c | 213 +++++++++++++++++++++++-------------------------- > > 1 file changed, 101 insertions(+), 112 deletions(-) > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > index 92ab20433682..79c610cb5f54 100644 > > --- a/fs/ceph/file.c > > +++ b/fs/ceph/file.c > > @@ -556,75 +556,6 @@ enum { > > READ_INLINE = 3, > > }; > > > > -/* > > - * Read a range of bytes striped over one or more objects. Iterate over > > - * objects we stripe over. (That's not atomic, but good enough for now.) > > - * > > - * If we get a short result from the OSD, check against i_size; we need to > > - * only return a short read to the caller if we hit EOF. > > - */ > > -static int striped_read(struct inode *inode, > > - u64 pos, u64 len, > > - struct page **pages, int num_pages, > > - int page_align, int *checkeof) > > -{ > > - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > - struct ceph_inode_info *ci = ceph_inode(inode); > > - u64 this_len; > > - loff_t i_size; > > - int page_idx; > > - int ret, read = 0; > > - bool hit_stripe, was_short; > > - > > - /* > > - * we may need to do multiple reads. not atomic, unfortunately. 
> > - */ > > -more: > > - this_len = len; > > - page_idx = (page_align + read) >> PAGE_SHIFT; > > - ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), > > - &ci->i_layout, pos, &this_len, > > - ci->i_truncate_seq, ci->i_truncate_size, > > - pages + page_idx, num_pages - page_idx, > > - ((page_align + read) & ~PAGE_MASK)); > > - if (ret == -ENOENT) > > - ret = 0; > > - hit_stripe = this_len < len; > > - was_short = ret >= 0 && ret < this_len; > > - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, > > - ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); > > - > > - i_size = i_size_read(inode); > > - if (ret >= 0) { > > - if (was_short && (pos + ret < i_size)) { > > - int zlen = min(this_len - ret, i_size - pos - ret); > > - int zoff = page_align + read + ret; > > - dout(" zero gap %llu to %llu\n", > > - pos + ret, pos + ret + zlen); > > - ceph_zero_page_vector_range(zoff, zlen, pages); > > - ret += zlen; > > - } > > - > > - read += ret; > > - pos += ret; > > - len -= ret; > > - > > - /* hit stripe and need continue*/ > > - if (len && hit_stripe && pos < i_size) > > - goto more; > > - } > > - > > - if (read > 0) { > > - ret = read; > > - /* did we bounce off eof? */ > > - if (pos + len > i_size) > > - *checkeof = CHECK_EOF; > > - } > > - > > - dout("striped_read returns %d\n", ret); > > - return ret; > > -} > > - > > /* > > * Completely synchronous read and write methods. Direct from __user > > * buffer to osd, or directly to user pages (if O_DIRECT). > > @@ -632,15 +563,16 @@ static int striped_read(struct inode *inode, > > * If the read spans object boundary, just do multiple reads. 
> > */ > > static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > - int *checkeof) > > + int *retry_op) > > { > > struct file *file = iocb->ki_filp; > > struct inode *inode = file_inode(file); > > - struct page **pages; > > - u64 off = iocb->ki_pos; > > - int num_pages; > > + struct ceph_inode_info *ci = ceph_inode(inode); > > + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); > > + struct ceph_osd_client *osdc = &fsc->client->osdc; > > ssize_t ret; > > - size_t len = iov_iter_count(to); > > + u64 off = iocb->ki_pos; > > + u64 len = iov_iter_count(to); > > > > dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, > > (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); > > @@ -653,61 +585,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > * but it will at least behave sensibly when they are > > * in sequence. > > */ > > - ret = filemap_write_and_wait_range(inode->i_mapping, off, > > - off + len); > > + ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); > > if (ret < 0) > > return ret; > > > > - if (unlikely(to->type & ITER_PIPE)) { > > + ret = 0; > > + while ((len = iov_iter_count(to)) > 0) { > > + struct ceph_osd_request *req; > > + struct page **pages; > > + int num_pages; > > size_t page_off; > > - ret = iov_iter_get_pages_alloc(to, &pages, len, > > - &page_off); > > - if (ret <= 0) > > - return -ENOMEM; > > - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); > > + u64 i_size; > > + bool more; > > + > > + req = ceph_osdc_new_request(osdc, &ci->i_layout, > > + ci->i_vino, off, &len, 0, 1, > > + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, > > + NULL, ci->i_truncate_seq, > > + ci->i_truncate_size, false); > > + if (IS_ERR(req)) { > > + ret = PTR_ERR(req); > > + break; > > + } > > > > - ret = striped_read(inode, off, ret, pages, num_pages, > > - page_off, checkeof); > > - if (ret > 0) { > > - iov_iter_advance(to, ret); > > - off += ret; > > + more = len < iov_iter_count(to); > > + > 
> + if (unlikely(to->type & ITER_PIPE)) { > > + ret = iov_iter_get_pages_alloc(to, &pages, len, > > + &page_off); > > + if (ret <= 0) { > > + ceph_osdc_put_request(req); > > + ret = -ENOMEM; > > + break; > > + } > > + num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); > > + if (ret < len) { > > + len = ret; > > + osd_req_op_extent_update(req, 0, len); > > + more = false; > > + } > > } else { > > - iov_iter_advance(to, 0); > > + num_pages = calc_pages_for(off, len); > > + page_off = off & ~PAGE_MASK; > > + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); > > + if (IS_ERR(pages)) { > > + ceph_osdc_put_request(req); > > + ret = PTR_ERR(pages); > > + break; > > + } > > } > > - ceph_put_page_vector(pages, num_pages, false); > > - } else { > > - num_pages = calc_pages_for(off, len); > > - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); > > - if (IS_ERR(pages)) > > - return PTR_ERR(pages); > > - > > - ret = striped_read(inode, off, len, pages, num_pages, > > - (off & ~PAGE_MASK), checkeof); > > - if (ret > 0) { > > - int l, k = 0; > > - size_t left = ret; > > - > > - while (left) { > > - size_t page_off = off & ~PAGE_MASK; > > - size_t copy = min_t(size_t, left, > > - PAGE_SIZE - page_off); > > - l = copy_page_to_iter(pages[k++], page_off, > > - copy, to); > > - off += l; > > - left -= l; > > - if (l < copy) > > + > > + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, > > + false, false); > > + ret = ceph_osdc_start_request(osdc, req, false); > > + if (!ret) > > + ret = ceph_osdc_wait_request(osdc, req); > > + ceph_osdc_put_request(req); > > + > > + i_size = i_size_read(inode); > > + dout("sync_read %llu~%llu got %zd i_size %llu%s\n", > > + off, len, ret, i_size, (more ? 
" MORE" : "")); > > + > > + if (ret == -ENOENT) > > + ret = 0; > > + if (ret >= 0 && ret < len && (off + ret < i_size)) { > > + int zlen = min(len - ret, i_size - off - ret); > > + int zoff = page_off + ret; > > + dout("sync_read zero gap %llu~%llu\n", > > + off + ret, off + ret + zlen); > > + ceph_zero_page_vector_range(zoff, zlen, pages); > > + ret += zlen; > > + } > > + > > + if (unlikely(to->type & ITER_PIPE)) { > > + if (ret > 0) { > > + iov_iter_advance(to, ret); > > + off += ret; > > + } else { > > + iov_iter_advance(to, 0); > > + } > > + ceph_put_page_vector(pages, num_pages, false); > > + } else { > > + int idx = 0; > > + size_t left = ret > 0 ? ret : 0; > > + while (left > 0) { > > + size_t len, copied; > > + page_off = off & ~PAGE_MASK; > > + len = min_t(size_t, left, PAGE_SIZE - page_off); > > + copied = copy_page_to_iter(pages[idx++], > > + page_off, len, to); > > + off += copied; > > + left -= copied; > > + if (copied < len) { > > + ret = -EFAULT; > > break; > > + } > > } > > + ceph_release_page_vector(pages, num_pages); > > } > > - ceph_release_page_vector(pages, num_pages); > > + > > + if (ret <= 0 || off >= i_size || !more) > > + break; > > } > > > > if (off > iocb->ki_pos) { > > + if (ret >= 0 && > > + iov_iter_count(to) > 0 && off >= i_size_read(inode)) > > + *retry_op = CHECK_EOF; > > ret = off - iocb->ki_pos; > > iocb->ki_pos = off; > > } > > > > - dout("sync_read result %zd\n", ret); > > + dout("sync_read result %zd retry_op %d\n", ret, *retry_op); > > return ret; > > } > > > > > > This is a step forward -- a fixed size allocation vs something > > arbitrary up to 2G worth of pages (MAX_RW_COUNT). But upon deeper > > look, why allocate any temporary pages at all? Why not just read > > directly into iov_iter instead, like you do in the ITER_PIPE case? > > > > I don't see a reason to keep special casing ITER_PIPE. > > > > > > Sorry for the delay. This is not direct-IO , I think reading directly into iov_iter is not good. 
> > Ah, I missed that, although it looks like it might get called in the > O_DIRECT case too, on a retry after EOF? > Yes. But we need to use a bio vector for direct reads (see commit fc218544fbc80). The problem is that there is no helper function that copies data from a bio vector into an iov_iter. Regards, Yan, Zheng > Thanks, > > Ilya