Re: [PATCH 1/2] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type

On Thu, 2022-06-09 at 15:34 -0400, Jeff Layton wrote:
> Add an iov_iter to the unions in ceph_msg_data and ceph_msg_data_cursor.
> Instead of requiring a list of pages or bvecs, we can just use an
> iov_iter directly, and avoid extra allocations.
> 
> Note that we do assume that the pages represented by the iter are pinned
> such that they shouldn't incur page faults.
> 
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> ---
>  include/linux/ceph/messenger.h  |  5 +++
>  include/linux/ceph/osd_client.h |  4 +++
>  net/ceph/messenger.c            | 64 +++++++++++++++++++++++++++++++++
>  net/ceph/osd_client.c           | 27 ++++++++++++++
>  4 files changed, 100 insertions(+)
> 
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 9fd7255172ad..c259021ca4a8 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -123,6 +123,7 @@ enum ceph_msg_data_type {
>  	CEPH_MSG_DATA_BIO,	/* data source/destination is a bio list */
>  #endif /* CONFIG_BLOCK */
>  	CEPH_MSG_DATA_BVECS,	/* data source/destination is a bio_vec array */
> +	CEPH_MSG_DATA_ITER,	/* data source/destination is an iov_iter */
>  };
>  
>  #ifdef CONFIG_BLOCK
> @@ -224,6 +225,7 @@ struct ceph_msg_data {
>  			bool		own_pages;
>  		};
>  		struct ceph_pagelist	*pagelist;
> +		struct iov_iter		iter;
>  	};
>  };
>  
> @@ -248,6 +250,7 @@ struct ceph_msg_data_cursor {
>  			struct page	*page;		/* page from list */
>  			size_t		offset;		/* bytes from list */
>  		};
> +		struct iov_iter		iov_iter;
>  	};
>  };
>  
> @@ -605,6 +608,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
>  #endif /* CONFIG_BLOCK */
>  void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
>  			     struct ceph_bvec_iter *bvec_pos);
> +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> +			    struct iov_iter *iter);
>  
>  struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
>  			       gfp_t flags, bool can_fail);
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 6ec3cb2ac457..ef0ad534b6c5 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -108,6 +108,7 @@ enum ceph_osd_data_type {
>  	CEPH_OSD_DATA_TYPE_BIO,
>  #endif /* CONFIG_BLOCK */
>  	CEPH_OSD_DATA_TYPE_BVECS,
> +	CEPH_OSD_DATA_TYPE_ITER,
>  };
>  
>  struct ceph_osd_data {
> @@ -131,6 +132,7 @@ struct ceph_osd_data {
>  			struct ceph_bvec_iter	bvec_pos;
>  			u32			num_bvecs;
>  		};
> +		struct iov_iter		iter;
>  	};
>  };
>  
> @@ -501,6 +503,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
>  void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
>  					 unsigned int which,
>  					 struct ceph_bvec_iter *bvec_pos);
> +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> +				unsigned int which, struct iov_iter *iter);
>  
>  extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
>  					unsigned int which,
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 6056c8f7dd4c..cea0d75dfc49 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -964,6 +964,48 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
>  	return true;
>  }
>  
> +static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
> +					size_t length)
> +{
> +	struct ceph_msg_data *data = cursor->data;
> +
> +	cursor->iov_iter = data->iter;
> +	iov_iter_truncate(&cursor->iov_iter, length);
> +	cursor->resid = iov_iter_count(&cursor->iov_iter);
> +}
> +
> +static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
> +						size_t *page_offset,
> +						size_t *length)
> +{
> +	struct page *page;
> +	ssize_t len = iov_iter_get_pages(&cursor->iov_iter, &page, PAGE_SIZE,
> +					 1, page_offset);
> +
> +	BUG_ON(len < 0);
> +
> +	/*
> +	 * The assumption is that the pages represented by the iov_iter are
> +	 * pinned, with the references held by the upper-level callers, or
> +	 * by virtue of being under writeback. Given that, we should be
> +	 * safe to put the page reference here and still return the pointer.
> +	 */
> +	VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
> +	put_page(page);
> +	*length = min_t(size_t, len, cursor->resid);
> +	return page;
> +}
> +
> +static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
> +					size_t bytes)
> +{
> +	BUG_ON(bytes > cursor->resid);
> +	cursor->resid -= bytes;
> +	iov_iter_advance(&cursor->iov_iter, bytes);
> +
> +	return cursor->resid;
> +}
> +
>  /*
>   * Message data is handled (sent or received) in pieces, where each
>   * piece resides on a single page.  The network layer might not
> @@ -991,6 +1033,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
>  	case CEPH_MSG_DATA_BVECS:
>  		ceph_msg_data_bvecs_cursor_init(cursor, length);
>  		break;
> +	case CEPH_MSG_DATA_ITER:
> +		ceph_msg_data_iter_cursor_init(cursor, length);
> +		break;
>  	case CEPH_MSG_DATA_NONE:
>  	default:
>  		/* BUG(); */
> @@ -1038,6 +1083,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
>  	case CEPH_MSG_DATA_BVECS:
>  		page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
>  		break;
> +	case CEPH_MSG_DATA_ITER:
> +		page = ceph_msg_data_iter_next(cursor, page_offset, length);
> +		break;
>  	case CEPH_MSG_DATA_NONE:
>  	default:
>  		page = NULL;
> @@ -1076,6 +1124,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
>  	case CEPH_MSG_DATA_BVECS:
>  		new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
>  		break;
> +	case CEPH_MSG_DATA_ITER:
> +		new_piece = ceph_msg_data_iter_advance(cursor, bytes);
> +		break;
>  	case CEPH_MSG_DATA_NONE:
>  	default:
>  		BUG();
> @@ -1874,6 +1925,19 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
>  }
>  EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
>  
> +void ceph_msg_data_add_iter(struct ceph_msg *msg,
> +			    struct iov_iter *iter)
> +{
> +	struct ceph_msg_data *data;
> +
> +	data = ceph_msg_data_add(msg);
> +	data->type = CEPH_MSG_DATA_ITER;
> +	data->iter = *iter;
> +
> +	msg->data_length += iov_iter_count(&data->iter);
> +}
> +EXPORT_SYMBOL(ceph_msg_data_add_iter);

I don't think this EXPORT_SYMBOL is actually needed. Nothing outside
libceph calls this. I've fixed this up in my tree, and can send a v2, or
you can just remove that line before merging.
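
FWIW, here's a rough sketch (mine, not part of the patch) of how the
messenger ends up driving a data item through the cursor, which is why
the new type only needs the init/next/advance helpers above. This is
simplified; the real send/receive loops live in messenger_v1.c and
messenger_v2.c:

	struct ceph_msg_data_cursor cursor;
	size_t off, len;
	struct page *page;

	/* walk every piece of the message's data, one page at a time */
	ceph_msg_data_cursor_init(&cursor, msg, data_len);
	while (cursor.total_resid) {
		page = ceph_msg_data_next(&cursor, &off, &len);
		/* ... hand (page, off, len) to the socket layer ... */
		ceph_msg_data_advance(&cursor, len);
	}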

> +
>  /*
>   * construct a new message with given type, size
>   * the new msg has a ref count of 1.
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 75761537c644..2a7e46524e71 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
>  	osd_data->num_bvecs = num_bvecs;
>  }
>  
> +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
> +			       struct iov_iter *iter)
> +{
> +	osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
> +	osd_data->iter = *iter;
> +}
> +
>  static struct ceph_osd_data *
>  osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
>  {
> @@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
>  }
>  EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
>  
> +/**
> + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
> + * @osd_req: The request to set up
> + * @which: Index of the operation in the request
> + * @iter: The buffer iterator
> + */
> +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
> +				unsigned int which, struct iov_iter *iter)
> +{
> +	struct ceph_osd_data *osd_data;
> +
> +	osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
> +	ceph_osd_iter_init(osd_data, iter);
> +}
> +EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
> +
>  static void osd_req_op_cls_request_info_pagelist(
>  			struct ceph_osd_request *osd_req,
>  			unsigned int which, struct ceph_pagelist *pagelist)
> @@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
>  #endif /* CONFIG_BLOCK */
>  	case CEPH_OSD_DATA_TYPE_BVECS:
>  		return osd_data->bvec_pos.iter.bi_size;
> +	case CEPH_OSD_DATA_TYPE_ITER:
> +		return iov_iter_count(&osd_data->iter);
>  	default:
>  		WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
>  		return 0;
> @@ -954,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
>  #endif
>  	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
>  		ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
> +	} else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
> +		ceph_msg_data_add_iter(msg, &osd_data->iter);
>  	} else {
>  		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
>  	}

-- 
Jeff Layton <jlayton@xxxxxxxxxx>
