Re: [RFC 3/3] ublk_drv: add ebpf support

On Wed, Feb 15, 2023 at 08:41:22AM +0800, Xiaoguang Wang wrote:
> Currently only one bpf_ublk_queue_sqe() ebpf helper is added; a ublksrv target
> can use this helper to write an ebpf prog to support ublk kernel & userspace
> zero copy, please see the ublksrv test code for more info.
> 
> Signed-off-by: Xiaoguang Wang <xiaoguang.wang@xxxxxxxxxxxxxxxxx>
> ---
>  drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
>  include/uapi/linux/bpf.h       |   1 +
>  include/uapi/linux/ublk_cmd.h  |  11 ++
>  scripts/bpf_doc.py             |   4 +
>  tools/include/uapi/linux/bpf.h |   8 ++
>  5 files changed, 229 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
> index b628e9eaefa6..44c289b72864 100644
> --- a/drivers/block/ublk_drv.c
> +++ b/drivers/block/ublk_drv.c
> @@ -61,6 +61,7 @@
>  struct ublk_rq_data {
>  	struct llist_node node;
>  	struct callback_head work;
> +	struct io_mapped_kbuf *kbuf;
>  };
>  
>  struct ublk_uring_cmd_pdu {
> @@ -163,6 +164,9 @@ struct ublk_device {
>  	unsigned int		nr_queues_ready;
>  	atomic_t		nr_aborted_queues;
>  
> +	struct bpf_prog		*io_prep_prog;
> +	struct bpf_prog		*io_submit_prog;
> +
>  	/*
>  	 * Our ubq->daemon may be killed without any notification, so
>  	 * monitor each queue's daemon periodically
> @@ -189,10 +193,46 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
>  
>  static struct miscdevice ublk_misc;
>  
> +struct ublk_io_bpf_ctx {
> +	struct ublk_bpf_ctx ctx;
> +	struct ublk_device *ub;
> +	struct callback_head work;
> +};
> +
> +BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
> +	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
> +{
> +	struct request *rq;
> +	struct ublk_rq_data *data;
> +	struct io_mapped_kbuf *kbuf;
> +	u16 q_id = bpf_ctx->ctx.q_id;
> +	u16 tag = bpf_ctx->ctx.tag;
> +
> +	rq = blk_mq_tag_to_rq(bpf_ctx->ub->tag_set.tags[q_id], tag);
> +	data = blk_mq_rq_to_pdu(rq);
> +	kbuf = data->kbuf;
> +	io_uring_submit_sqe(fd, sqe, sqe_len, kbuf);
> +	return 0;
> +}
> +
> +const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
> +	.func = bpf_ublk_queue_sqe,
> +	.gpl_only = false,
> +	.ret_type = RET_INTEGER,
> +	.arg1_type = ARG_ANYTHING,
> +	.arg2_type = ARG_ANYTHING,
> +	.arg3_type = ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  {
> -	return bpf_base_func_proto(func_id);
> +	switch (func_id) {
> +	case BPF_FUNC_ublk_queue_sqe:
> +		return &ublk_bpf_queue_sqe_proto;
> +	default:
> +		return bpf_base_func_proto(func_id);
> +	}
>  }
>  
>  static bool ublk_bpf_is_valid_access(int off, int size,
> @@ -200,6 +240,23 @@ static bool ublk_bpf_is_valid_access(int off, int size,
>  			const struct bpf_prog *prog,
>  			struct bpf_insn_access_aux *info)
>  {
> +	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
> +		return false;
> +	if (off % size != 0)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct ublk_bpf_ctx, q_id):
> +		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
> +	case offsetof(struct ublk_bpf_ctx, tag):
> +		return size == sizeof_field(struct ublk_bpf_ctx, tag);
> +	case offsetof(struct ublk_bpf_ctx, op):
> +		return size == sizeof_field(struct ublk_bpf_ctx, op);
> +	case offsetof(struct ublk_bpf_ctx, nr_sectors):
> +		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
> +	case offsetof(struct ublk_bpf_ctx, start_sector):
> +		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
> +	}
>  	return false;
>  }
>  
> @@ -324,7 +381,7 @@ static void ublk_put_device(struct ublk_device *ub)
>  static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
>  		int qid)
>  {
> -       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
> +	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
>  }
>  
>  static inline bool ublk_rq_has_data(const struct request *rq)
> @@ -492,12 +549,16 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
>  static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
>  		struct ublk_io *io)
>  {
> +	struct ublk_device *ub = ubq->dev;
>  	const unsigned int rq_bytes = blk_rq_bytes(req);
>  	/*
>  	 * no zero copy, we delay copy WRITE request data into ublksrv
>  	 * context and the big benefit is that pinning pages in current
>  	 * context is pretty fast, see ublk_pin_user_pages
>  	 */
> +	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
> +		return rq_bytes;

Can you explain a bit why READ isn't supported? WRITE zero copy is
supposed to be easy to support with a splice-based approach, and I am
actually more interested in READ zc.

> +
>  	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
>  		return rq_bytes;
>  
> @@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
>  	}
>  }
>  
> +static void ublk_bpf_io_submit_fn(struct callback_head *work)
> +{
> +	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
> +			struct ublk_io_bpf_ctx, work);
> +
> +	if (bpf_ctx->ub->io_submit_prog)
> +		bpf_prog_run_pin_on_cpu(bpf_ctx->ub->io_submit_prog, bpf_ctx);
> +	kfree(bpf_ctx);
> +}
> +
> +static int ublk_init_uring_kbuf(struct request *rq)
> +{
> +	struct bio_vec *bvec;
> +	struct req_iterator rq_iter;
> +	struct bio_vec tmp;
> +	int nr_bvec = 0;
> +	struct io_mapped_kbuf *kbuf;
> +	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
> +
> +	/* Drop previous allocation */
> +	if (data->kbuf) {
> +		kfree(data->kbuf->bvec);
> +		kfree(data->kbuf);
> +		data->kbuf = NULL;
> +	}
> +
> +	kbuf = kmalloc(sizeof(struct io_mapped_kbuf), GFP_NOIO);
> +	if (!kbuf)
> +		return -EIO;
> +
> +	rq_for_each_bvec(tmp, rq, rq_iter)
> +		nr_bvec++;
> +
> +	bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO);
> +	if (!bvec) {
> +		kfree(kbuf);
> +		return -EIO;
> +	}
> +	kbuf->bvec = bvec;
> +	rq_for_each_bvec(tmp, rq, rq_iter) {
> +		*bvec = tmp;
> +		bvec++;
> +	}
> +
> +	kbuf->count = blk_rq_bytes(rq);
> +	kbuf->nr_bvecs = nr_bvec;
> +	data->kbuf = kbuf;
> +	return 0;

The bio/req bvec table is immutable, so here you can pass a reference to
it to kbuf directly instead of copying it.
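
A rough sketch of what I mean, for the common single-bio case
(io_mapped_kbuf is the structure from this series; multi-bio requests and
partial-bio offsets would still need handling, so take it only as an
illustration):

static int ublk_init_uring_kbuf(struct request *rq)
{
	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
	struct io_mapped_kbuf *kbuf;

	/* keep the sketch simple: only handle single-bio requests */
	if (rq->bio != rq->biotail)
		return -EINVAL;

	kbuf = kmalloc(sizeof(*kbuf), GFP_NOIO);
	if (!kbuf)
		return -ENOMEM;

	/*
	 * the bvec table is immutable while the request is in flight,
	 * so borrow it instead of kmalloc'ing and copying it
	 */
	kbuf->bvec = rq->bio->bi_io_vec;
	kbuf->nr_bvecs = rq->bio->bi_vcnt;
	kbuf->count = blk_rq_bytes(rq);
	data->kbuf = kbuf;
	return 0;
}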

> +}
> +
> +static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
> +{
> +	int err;
> +	struct ublk_device *ub = ubq->dev;
> +	struct bpf_prog *prog = ub->io_prep_prog;
> +	struct ublk_io_bpf_ctx *bpf_ctx;
> +
> +	if (!prog)
> +		return 0;
> +
> +	bpf_ctx = kmalloc(sizeof(struct ublk_io_bpf_ctx), GFP_NOIO);
> +	if (!bpf_ctx)
> +		return -EIO;
> +
> +	err = ublk_init_uring_kbuf(rq);
> +	if (err < 0) {
> +		kfree(bpf_ctx);
> +		return -EIO;
> +	}
> +	bpf_ctx->ub = ub;
> +	bpf_ctx->ctx.q_id = ubq->q_id;
> +	bpf_ctx->ctx.tag = rq->tag;
> +	bpf_ctx->ctx.op = req_op(rq);
> +	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
> +	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);

The above sets up the target io parameters, which are supposed to come
from userspace, since they are the result of userspace logic. If these
parameters come from the kernel, the whole logic has to be done in
io_prep_prog.

> +	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
> +
> +	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
> +	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
> +		kfree(bpf_ctx);

task_work_add() is only available when ublk_drv is built in.
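
For reference, the driver already gates its own task-work usage on the
builtin case, along the lines of the existing helper (quoted from memory,
so please double check the exact condition):

static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
{
	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
		return true;
	return false;
}

so the new task_work_add() call here would need a similar fallback for
the modular build.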

> +	return 0;
> +}
> +
>  static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  		const struct blk_mq_queue_data *bd)
>  {
> @@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
>  	if (unlikely(res != BLK_STS_OK))
>  		return BLK_STS_IOERR;
>  
> +	/* Currently just for test. */
> +	ublk_run_bpf_prog(ubq, rq);

Can you explain the above comment a bit? When is io_prep_prog called in
the non-test version? Or can you post the non-test version to the list
for review?

This is the key to understanding the whole idea: when is io_prep_prog
finally called, and how are parameters passed to it?

Given that it is an ebpf prog, I don't think any userspace parameter can
be passed to io_prep_prog when submitting IO, which means all user logic
has to be done inside io_prep_prog? If so, I am not sure it is a good
approach, because an ebpf prog is a very limited programming environment,
while the user logic could be as complicated as using a btree to map the
io, or communicating with a remote machine to figure out the mapping.
Loop is just the simplest direct mapping.
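
To make the concern concrete, even the simplest loop-style direct mapping
would have to be written entirely as an ebpf prog along these lines (just
an illustration against this RFC: the section name, the backing_fd
convention and the helper declaration are my assumptions, not anything
upstream):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* helper added by this RFC, declared by hand for the sketch */
static long (*bpf_ublk_queue_sqe)(void *ctx, struct io_uring_sqe *sqe,
				  __u32 sqe_len, __u32 fd) =
				  (void *)BPF_FUNC_ublk_queue_sqe;

/* backing file, assumed to be registered in ublksrv's io_uring */
const volatile int backing_fd = 0;

SEC("ublk.s/io_prep")
int ublk_loop_io_prep(struct ublk_bpf_ctx *ctx)
{
	struct io_uring_sqe sqe = {};

	/* direct 1:1 mapping: device sector N -> backing file byte N << 9 */
	sqe.opcode    = ctx->op == REQ_OP_WRITE ? IORING_OP_WRITE_FIXED :
						  IORING_OP_READ_FIXED;
	sqe.fd        = backing_fd;
	sqe.off       = (__u64)ctx->start_sector << 9;
	sqe.len       = (__u32)ctx->nr_sectors << 9;
	sqe.user_data = ((__u64)ctx->q_id << 16) | ctx->tag;

	bpf_ublk_queue_sqe(ctx, &sqe, sizeof(sqe), backing_fd);
	return 0;
}

Anything smarter than this 1:1 mapping (a btree lookup, a network round
trip, ...) does not fit into such a prog easily.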


Thanks, 
Ming



