On Fri, 2022-03-18 at 09:50 -0400, Jeff Layton wrote: > Have ceph issue sparse reads instead of normal ones. The callers now > preallocate a sparse extent buffer that the libceph receive code can > populate and hand back after the operation completes. > > After a successful read, we can't use the req->r_result value to > determine the amount of data "read", so instead we set the received > length to be from the end of the last extent in the buffer. Any > interstitial holes will have been filled by the receive code. > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> > --- > fs/ceph/addr.c | 13 +++++++++++-- > fs/ceph/file.c | 41 ++++++++++++++++++++++++++++++++++------- > fs/ceph/super.h | 7 +++++++ > 3 files changed, 52 insertions(+), 9 deletions(-) > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c > index 752c421c9922..6d4f9fbf22ce 100644 > --- a/fs/ceph/addr.c > +++ b/fs/ceph/addr.c > @@ -220,6 +220,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) > struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); > struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); > struct netfs_read_subrequest *subreq = req->r_priv; > + struct ceph_osd_req_op *op = &req->r_ops[0]; > int num_pages; > int err = req->r_result; > > @@ -230,7 +231,9 @@ static void finish_netfs_read(struct ceph_osd_request *req) > subreq->len, i_size_read(req->r_inode)); > > /* no object means success but no data */ > - if (err == -ENOENT) > + if (err >= 0) > + err = ceph_sparse_ext_map_end(op); > + else if (err == -ENOENT) > err = 0; > else if (err == -EBLOCKLISTED) > fsc->blocklisted = true; > @@ -317,7 +320,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) > return; > > req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, > - 0, 1, CEPH_OSD_OP_READ, > + 0, 1, CEPH_OSD_OP_SPARSE_READ, > CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, > NULL, ci->i_truncate_seq, 
ci->i_truncate_size, false); > if (IS_ERR(req)) { > @@ -326,6 +329,12 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) > goto out; > } > > + err = ceph_alloc_sparse_ext_map(&req->r_ops[0], CEPH_SPARSE_EXT_ARRAY_INITIAL); > + if (err) { > + ceph_osdc_put_request(req); > + goto out; > + } > + > dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); > iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); > err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index feb75eb1cd82..deba39989a07 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -931,10 +931,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > bool more; > int idx; > size_t left; > + struct ceph_osd_req_op *op; > > req = ceph_osdc_new_request(osdc, &ci->i_layout, > ci->i_vino, off, &len, 0, 1, > - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, > + CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, > NULL, ci->i_truncate_seq, > ci->i_truncate_size, false); > if (IS_ERR(req)) { > @@ -955,6 +956,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > > osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, > false, false); > + > + op = &req->r_ops[0]; > + ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); > + if (ret) { > + ceph_osdc_put_request(req); > + break; > + } > + > ret = ceph_osdc_start_request(osdc, req, false); > if (!ret) > ret = ceph_osdc_wait_request(osdc, req); > @@ -964,23 +973,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, > req->r_end_latency, > len, ret); > > - ceph_osdc_put_request(req); > - > i_size = i_size_read(inode); > dout("sync_read %llu~%llu got %zd i_size %llu%s\n", > off, len, ret, i_size, (more ? 
" MORE" : "")); > > - if (ret == -ENOENT) > + /* Fix it to go to end of extent map */ > + if (ret >= 0) > + ret = ceph_sparse_ext_map_end(op); > + else if (ret == -ENOENT) > ret = 0; > + > if (ret >= 0 && ret < len && (off + ret < i_size)) { > int zlen = min(len - ret, i_size - off - ret); > int zoff = page_off + ret; > + > dout("sync_read zero gap %llu~%llu\n", > - off + ret, off + ret + zlen); > + off + ret, off + ret + zlen); > ceph_zero_page_vector_range(zoff, zlen, pages); > ret += zlen; > } > > + ceph_osdc_put_request(req); > + > idx = 0; > left = ret > 0 ? ret : 0; > while (left > 0) { > @@ -1095,6 +1109,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) > struct inode *inode = req->r_inode; > struct ceph_aio_request *aio_req = req->r_priv; > struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); > + struct ceph_osd_req_op *op = &req->r_ops[0]; > struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; > unsigned int len = osd_data->bvec_pos.iter.bi_size; > > @@ -1117,6 +1132,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) > } > rc = -ENOMEM; > } else if (!aio_req->write) { > + if (rc >= 0) > + rc = ceph_sparse_ext_map_end(op); > if (rc == -ENOENT) > rc = 0; > if (rc >= 0 && len > rc) { > @@ -1280,6 +1297,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > while (iov_iter_count(iter) > 0) { > u64 size = iov_iter_count(iter); > ssize_t len; > + struct ceph_osd_req_op *op; > > if (write) > size = min_t(u64, size, fsc->mount_options->wsize); > @@ -1291,7 +1309,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > vino, pos, &size, 0, > 1, > write ? 
CEPH_OSD_OP_WRITE : > - CEPH_OSD_OP_READ, > + CEPH_OSD_OP_SPARSE_READ, > flags, snapc, > ci->i_truncate_seq, > ci->i_truncate_size, > @@ -1342,6 +1360,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > } > > osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); > + op = &req->r_ops[0]; > + ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); > + if (ret) { > + ceph_osdc_put_request(req); > + break; > + } > > if (aio_req) { > aio_req->total_len += len; > @@ -1370,8 +1394,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, > > size = i_size_read(inode); > if (!write) { > - if (ret == -ENOENT) > + if (ret >= 0) > + ret = ceph_sparse_ext_map_end(op); > + else if (ret == -ENOENT) > ret = 0; > + > if (ret >= 0 && ret < len && pos + ret < size) { > struct iov_iter i; > int zlen = min_t(size_t, len - ret, > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 250aefecd628..ad09c26afac6 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -75,6 +75,13 @@ > #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ > #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ > > +/* > + * How big an extent array should we preallocate for a sparse read? This is > + * just a starting value. If we get more than this back from the OSD, the > + * receiver will reallocate. > + */ > +#define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 > + > struct ceph_mount_options { > unsigned int flags; > For the record, I don't see us merging this patch as-is. This is just what I was using for testing, but in practice, we may want to just use sparse reads when necessary (i.e. only with fscrypt enabled). -- Jeff Layton <jlayton@xxxxxxxxxx>