Re: [PATCH v3 5/5] ceph: convert to sparse reads

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 2022-03-18 at 09:50 -0400, Jeff Layton wrote:
> Have ceph issue sparse reads instead of normal ones. The callers now
> preallocate a sparse extent buffer that the libceph receive code can
> populate and hand back after the operation completes.
> 
> After a successful read, we can't use the req->r_result value to
> determine the amount of data "read", so instead we set the received
> length to the end of the last extent in the buffer. Any
> interstitial holes will have been filled by the receive code.
> 
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  fs/ceph/addr.c  | 13 +++++++++++--
>  fs/ceph/file.c  | 41 ++++++++++++++++++++++++++++++++++-------
>  fs/ceph/super.h |  7 +++++++
>  3 files changed, 52 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 752c421c9922..6d4f9fbf22ce 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -220,6 +220,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
>  	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
>  	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
>  	struct netfs_read_subrequest *subreq = req->r_priv;
> +	struct ceph_osd_req_op *op = &req->r_ops[0];
>  	int num_pages;
>  	int err = req->r_result;
>  
> @@ -230,7 +231,9 @@ static void finish_netfs_read(struct ceph_osd_request *req)
>  	     subreq->len, i_size_read(req->r_inode));
>  
>  	/* no object means success but no data */
> -	if (err == -ENOENT)
> +	if (err >= 0)
> +		err = ceph_sparse_ext_map_end(op);
> +	else if (err == -ENOENT)
>  		err = 0;
>  	else if (err == -EBLOCKLISTED)
>  		fsc->blocklisted = true;
> @@ -317,7 +320,7 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
>  		return;
>  
>  	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
> -			0, 1, CEPH_OSD_OP_READ,
> +			0, 1, CEPH_OSD_OP_SPARSE_READ,
>  			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
>  			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
>  	if (IS_ERR(req)) {
> @@ -326,6 +329,12 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
>  		goto out;
>  	}
>  
> +	err = ceph_alloc_sparse_ext_map(&req->r_ops[0], CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +	if (err) {
> +		ceph_osdc_put_request(req);
> +		goto out;
> +	}
> +
>  	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
>  	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
>  	err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index feb75eb1cd82..deba39989a07 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -931,10 +931,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  		bool more;
>  		int idx;
>  		size_t left;
> +		struct ceph_osd_req_op *op;
>  
>  		req = ceph_osdc_new_request(osdc, &ci->i_layout,
>  					ci->i_vino, off, &len, 0, 1,
> -					CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
> +					CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
>  					NULL, ci->i_truncate_seq,
>  					ci->i_truncate_size, false);
>  		if (IS_ERR(req)) {
> @@ -955,6 +956,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  
>  		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
>  						 false, false);
> +
> +		op = &req->r_ops[0];
> +		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +		if (ret) {
> +			ceph_osdc_put_request(req);
> +			break;
> +		}
> +
>  		ret = ceph_osdc_start_request(osdc, req, false);
>  		if (!ret)
>  			ret = ceph_osdc_wait_request(osdc, req);
> @@ -964,23 +973,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
>  					 req->r_end_latency,
>  					 len, ret);
>  
> -		ceph_osdc_put_request(req);
> -
>  		i_size = i_size_read(inode);
>  		dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
>  		     off, len, ret, i_size, (more ? " MORE" : ""));
>  
> -		if (ret == -ENOENT)
> +		/* Fix it to go to end of extent map */
> +		if (ret >= 0)
> +			ret = ceph_sparse_ext_map_end(op);
> +		else if (ret == -ENOENT)
>  			ret = 0;
> +
>  		if (ret >= 0 && ret < len && (off + ret < i_size)) {
>  			int zlen = min(len - ret, i_size - off - ret);
>  			int zoff = page_off + ret;
> +
>  			dout("sync_read zero gap %llu~%llu\n",
> -                             off + ret, off + ret + zlen);
> +				off + ret, off + ret + zlen);
>  			ceph_zero_page_vector_range(zoff, zlen, pages);
>  			ret += zlen;
>  		}
>  
> +		ceph_osdc_put_request(req);
> +
>  		idx = 0;
>  		left = ret > 0 ? ret : 0;
>  		while (left > 0) {
> @@ -1095,6 +1109,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
>  	struct inode *inode = req->r_inode;
>  	struct ceph_aio_request *aio_req = req->r_priv;
>  	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
> +	struct ceph_osd_req_op *op = &req->r_ops[0];
>  	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
>  	unsigned int len = osd_data->bvec_pos.iter.bi_size;
>  
> @@ -1117,6 +1132,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
>  		}
>  		rc = -ENOMEM;
>  	} else if (!aio_req->write) {
> +		if (rc >= 0)
> +			rc = ceph_sparse_ext_map_end(op);
>  		if (rc == -ENOENT)
>  			rc = 0;
>  		if (rc >= 0 && len > rc) {
> @@ -1280,6 +1297,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  	while (iov_iter_count(iter) > 0) {
>  		u64 size = iov_iter_count(iter);
>  		ssize_t len;
> +		struct ceph_osd_req_op *op;
>  
>  		if (write)
>  			size = min_t(u64, size, fsc->mount_options->wsize);
> @@ -1291,7 +1309,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  					    vino, pos, &size, 0,
>  					    1,
>  					    write ? CEPH_OSD_OP_WRITE :
> -						    CEPH_OSD_OP_READ,
> +						    CEPH_OSD_OP_SPARSE_READ,
>  					    flags, snapc,
>  					    ci->i_truncate_seq,
>  					    ci->i_truncate_size,
> @@ -1342,6 +1360,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  		}
>  
>  		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
> +		op = &req->r_ops[0];
> +		ret = ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL);
> +		if (ret) {
> +			ceph_osdc_put_request(req);
> +			break;
> +		}
>  
>  		if (aio_req) {
>  			aio_req->total_len += len;
> @@ -1370,8 +1394,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>  
>  		size = i_size_read(inode);
>  		if (!write) {
> -			if (ret == -ENOENT)
> +			if (ret >= 0)
> +				ret = ceph_sparse_ext_map_end(op);
> +			else if (ret == -ENOENT)
>  				ret = 0;
> +
>  			if (ret >= 0 && ret < len && pos + ret < size) {
>  				struct iov_iter i;
>  				int zlen = min_t(size_t, len - ret,
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 250aefecd628..ad09c26afac6 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -75,6 +75,13 @@
>  #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
>  #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
>  
> +/*
> + * How big an extent array should we preallocate for a sparse read? This is
> + * just a starting value.  If we get more than this back from the OSD, the
> + * receiver will reallocate.
> + */
> +#define CEPH_SPARSE_EXT_ARRAY_INITIAL	16
> +
>  struct ceph_mount_options {
>  	unsigned int flags;
>  

For the record, I don't see us merging this patch as-is. This is just
what I was using for testing, but in practice, we may want to just use
sparse reads when necessary (i.e. only with fscrypt enabled).

-- 
Jeff Layton <jlayton@xxxxxxxxxx>



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux