Introduce vfio APIs to process events:
- mlx5dv_vfio_get_events_fd()
- mlx5dv_vfio_process_events()

The first API returns an FD that should be polled to detect whether
events exist. The second API should be called to process the pending
events.

As part of this, PAGE_REQUEST event support was added, together with an
async command mode that lets mlx5dv_vfio_process_events() return to the
caller without blocking.

Detailed man pages were added to describe the expected usage of the new
APIs.

Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
---
 debian/ibverbs-providers.symbols                   |   2 +
 providers/mlx5/libmlx5.map                         |   2 +
 providers/mlx5/man/CMakeLists.txt                  |   2 +
 providers/mlx5/man/mlx5dv_vfio_get_events_fd.3.md  |  41 ++++
 providers/mlx5/man/mlx5dv_vfio_process_events.3.md |  43 ++++
 providers/mlx5/mlx5_vfio.c                         | 231 ++++++++++++++++++++-
 providers/mlx5/mlx5_vfio.h                         |  14 ++
 providers/mlx5/mlx5dv.h                            |   8 +
 8 files changed, 332 insertions(+), 11 deletions(-)
 create mode 100644 providers/mlx5/man/mlx5dv_vfio_get_events_fd.3.md
 create mode 100644 providers/mlx5/man/mlx5dv_vfio_process_events.3.md

diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols
index 64e29b1..3e36592 100644
--- a/debian/ibverbs-providers.symbols
+++ b/debian/ibverbs-providers.symbols
@@ -135,6 +135,8 @@ libmlx5.so.1 ibverbs-providers #MINVER#
  mlx5dv_qp_cancel_posted_send_wrs@MLX5_1.20 36
  _mlx5dv_mkey_check@MLX5_1.20 36
  mlx5dv_get_vfio_device_list@MLX5_1.21 37
+ mlx5dv_vfio_get_events_fd@MLX5_1.21 37
+ mlx5dv_vfio_process_events@MLX5_1.21 37
 libefa.so.1 ibverbs-providers #MINVER#
 * Build-Depends-Package: libibverbs-dev
  EFA_1.0@EFA_1.0 24
diff --git a/providers/mlx5/libmlx5.map b/providers/mlx5/libmlx5.map
index 3e8a4d8..d6294d8 100644
--- a/providers/mlx5/libmlx5.map
+++ b/providers/mlx5/libmlx5.map
@@ -193,4 +193,6 @@ MLX5_1.20 {
 MLX5_1.21 {
 	global:
 		mlx5dv_get_vfio_device_list;
+		mlx5dv_vfio_get_events_fd;
+		mlx5dv_vfio_process_events;
 } MLX5_1.20;
diff --git a/providers/mlx5/man/CMakeLists.txt b/providers/mlx5/man/CMakeLists.txt
index 91aebed..cb3525c 100644
--- a/providers/mlx5/man/CMakeLists.txt
+++ b/providers/mlx5/man/CMakeLists.txt
@@ -40,6 +40,8 @@ rdma_man_pages(
   mlx5dv_sched_node_create.3.md
   mlx5dv_ts_to_ns.3
   mlx5dv_wr_mkey_configure.3.md
+  mlx5dv_vfio_get_events_fd.3.md
+  mlx5dv_vfio_process_events.3.md
   mlx5dv_wr_post.3.md
   mlx5dv_wr_set_mkey_sig_block.3.md
   mlx5dv.7
diff --git a/providers/mlx5/man/mlx5dv_vfio_get_events_fd.3.md b/providers/mlx5/man/mlx5dv_vfio_get_events_fd.3.md
new file mode 100644
index 0000000..3023bb2
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_vfio_get_events_fd.3.md
@@ -0,0 +1,41 @@
+---
+layout: page
+title: mlx5dv_vfio_get_events_fd
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_vfio_get_events_fd - Get the file descriptor to manage driver events.
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+int mlx5dv_vfio_get_events_fd(struct ibv_context *ctx);
+```
+
+# DESCRIPTION
+
+Returns the file descriptor to be used for managing driver events.
+
+# ARGUMENTS
+
+*ctx*
+:	device context that was opened for VFIO by calling mlx5dv_get_vfio_device_list().
+
+# RETURN VALUE
+
+Returns the internal file descriptor that matches the given context.
+
+# NOTES
+
+Client code should poll the returned file descriptor and, once there is data
+to be managed, immediately call *mlx5dv_vfio_process_events()*.
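+
+A minimal event-loop sketch (hypothetical application code; it assumes the
+device was opened via *mlx5dv_get_vfio_device_list()* and that blocking in
+*poll()* is acceptable):
+
+```c
+#include <poll.h>
+#include <infiniband/mlx5dv.h>
+
+static void vfio_event_loop(struct ibv_context *ctx)
+{
+	struct pollfd pfd = {
+		.fd = mlx5dv_vfio_get_events_fd(ctx),
+		.events = POLLIN,
+	};
+
+	/* Wait until the FD becomes readable, then let the driver
+	 * process the pending events. */
+	while (poll(&pfd, 1, -1) > 0)
+		mlx5dv_vfio_process_events(ctx);
+}
+```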
+
+# SEE ALSO
+
+*ibv_open_device(3)* *ibv_free_device_list(3)* *mlx5dv_get_vfio_device_list(3)*
+
+# AUTHOR
+
+Yishai Hadas <yishaih@xxxxxxxxxx>
diff --git a/providers/mlx5/man/mlx5dv_vfio_process_events.3.md b/providers/mlx5/man/mlx5dv_vfio_process_events.3.md
new file mode 100644
index 0000000..6c4123b
--- /dev/null
+++ b/providers/mlx5/man/mlx5dv_vfio_process_events.3.md
@@ -0,0 +1,43 @@
+---
+layout: page
+title: mlx5dv_vfio_process_events
+section: 3
+tagline: Verbs
+---
+
+# NAME
+
+mlx5dv_vfio_process_events - process vfio driver events
+
+# SYNOPSIS
+
+```c
+#include <infiniband/mlx5dv.h>
+
+int mlx5dv_vfio_process_events(struct ibv_context *ctx);
+```
+
+# DESCRIPTION
+
+This API should be called from an application thread to process device events.
+The application is responsible for obtaining the events FD by calling
+*mlx5dv_vfio_get_events_fd()* and, once the FD is pollable, for calling this
+API to let the driver process its internal events.
+
+# ARGUMENTS
+
+*ctx*
+:	device context that was opened for VFIO by calling mlx5dv_get_vfio_device_list().
+
+# RETURN VALUE
+
+Returns 0 upon success, or an errno value if a failure has occurred.
+
+# NOTES
+
+The application can also use this API to periodically check the device health
+state, even if no events exist.
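+
+A sketch of such a periodic check (hypothetical application code; the
+timeout value is an arbitrary choice):
+
+```c
+#include <errno.h>
+#include <poll.h>
+#include <infiniband/mlx5dv.h>
+
+static int vfio_poll_once(struct ibv_context *ctx, int timeout_ms)
+{
+	struct pollfd pfd = {
+		.fd = mlx5dv_vfio_get_events_fd(ctx),
+		.events = POLLIN,
+	};
+
+	if (poll(&pfd, 1, timeout_ms) < 0)
+		return errno;
+
+	/* Reached on both wakeup and timeout, so the device health
+	 * state is checked even when no event has fired. */
+	return mlx5dv_vfio_process_events(ctx);
+}
+
+/* e.g. in a monitor thread: while (vfio_poll_once(ctx, 1000) == 0) ; */
+```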
+
+# SEE ALSO
+
+*ibv_open_device(3)* *ibv_free_device_list(3)* *mlx5dv_get_vfio_device_list(3)* *mlx5dv_vfio_get_events_fd(3)*
+
+# AUTHOR
+
+Yishai Hadas <yishaih@xxxxxxxxxx>
diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index dbb9858..85ee25b 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -31,12 +31,21 @@ enum {
 	MLX5_VFIO_CMD_VEC_IDX,
 };
 
+static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, uint16_t func_id,
+				int32_t npages, bool is_event);
+static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id,
+				   int npages);
+
 static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
 				   struct mlx5_cmd_msg *msg);
 
 static int mlx5_vfio_alloc_cmd_msg(struct mlx5_vfio_context *ctx,
 				   uint32_t size, struct mlx5_cmd_msg *msg);
 
+static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in,
+			      int ilen, void *out, int olen,
+			      unsigned int slot, bool async);
+
 static int mlx5_vfio_register_mem(struct mlx5_vfio_context *ctx, void *vaddr,
 				  uint64_t iova, uint64_t size)
 {
@@ -259,6 +268,22 @@ static void eq_update_ci(struct mlx5_eq *eq, uint32_t cc, int arm)
 	udma_to_device_barrier();
 }
 
+static int mlx5_vfio_handle_page_req_event(struct mlx5_vfio_context *ctx,
+					   struct mlx5_eqe *eqe)
+{
+	struct mlx5_eqe_page_req *req = &eqe->data.req_pages;
+	int32_t num_pages;
+	uint16_t func_id;
+
+	func_id = be16toh(req->func_id);
+	num_pages = be32toh(req->num_pages);
+
+	if (num_pages > 0)
+		return mlx5_vfio_give_pages(ctx, func_id, num_pages, true);
+
+	return mlx5_vfio_reclaim_pages(ctx, func_id, -1 * num_pages);
+}
+
 static void mlx5_cmd_mbox_status(void *out, uint8_t *status, uint32_t *syndrome)
 {
 	*status = DEVX_GET(mbox_out, out, status);
@@ -365,6 +390,52 @@ static inline uint32_t mlx5_eq_update_cc(struct mlx5_eq *eq, uint32_t cc)
 	return cc;
 }
 
+static int mlx5_vfio_process_page_request_comp(struct mlx5_vfio_context *ctx,
+					       unsigned long slot)
+{
+	struct mlx5_vfio_cmd_slot *cmd_slot = &ctx->cmd.cmds[slot];
+	struct cmd_async_data *cmd_data = &cmd_slot->curr;
+	int num_claimed;
+	int ret, i;
+
+	ret = mlx5_copy_from_msg(cmd_data->buff_out, &cmd_slot->out,
+				 cmd_data->olen, cmd_slot->lay);
+	if (ret)
+		goto end;
+
+	ret = mlx5_vfio_cmd_check(ctx, cmd_data->buff_in, cmd_data->buff_out);
+	if (ret)
+		goto end;
+
+	if (DEVX_GET(manage_pages_in, cmd_data->buff_in, op_mod) == MLX5_PAGES_GIVE)
+		goto end;
+
+	num_claimed = DEVX_GET(manage_pages_out, cmd_data->buff_out, output_num_entries);
+	if (num_claimed > DEVX_GET(manage_pages_in, cmd_data->buff_in, input_num_entries)) {
+		ret = EINVAL;
+		errno = ret;
+		goto end;
+	}
+
+	for (i = 0; i < num_claimed; i++)
+		mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_out, cmd_data->buff_out, pas[i]));
+
+end:
+	free(cmd_data->buff_in);
+	free(cmd_data->buff_out);
+	cmd_slot->in_use = false;
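+	/* If a newer page request was parked while this command was in
+	 * flight, post it now that the slot is free again. */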
+	if (!ret && cmd_slot->is_pending) {
+		cmd_data = &cmd_slot->pending;
+
+		pthread_mutex_lock(&cmd_slot->lock);
+		cmd_slot->is_pending = false;
+		ret = mlx5_vfio_post_cmd(ctx, cmd_data->buff_in, cmd_data->ilen,
+					 cmd_data->buff_out, cmd_data->olen, slot, true);
+		pthread_mutex_unlock(&cmd_slot->lock);
+	}
+	return ret;
+}
+
 static int mlx5_vfio_cmd_comp(struct mlx5_vfio_context *ctx, unsigned long slot)
 {
 	uint64_t u = 1;
@@ -415,6 +486,9 @@ static int mlx5_vfio_process_async_events(struct mlx5_vfio_context *ctx)
 		case MLX5_EVENT_TYPE_CMD:
 			ret = mlx5_vfio_process_cmd_eqe(ctx, eqe);
 			break;
+		case MLX5_EVENT_TYPE_PAGE_REQUEST:
+			ret = mlx5_vfio_handle_page_req_event(ctx, eqe);
+			break;
 		default:
 			break;
 		}
@@ -563,9 +637,9 @@ static int mlx5_vfio_cmd_prep_out(struct mlx5_vfio_context *ctx,
 	return 0;
 }
 
-static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in,
+static int mlx5_vfio_post_cmd(struct mlx5_vfio_context *ctx, void *in,
 			      int ilen, void *out, int olen,
-			      unsigned int slot)
+			      unsigned int slot, bool async)
 {
 	struct mlx5_init_seg *init_seg = ctx->bar_map;
 	struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
@@ -573,20 +647,62 @@ static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in,
 	struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out;
 	int err;
 
-	pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
+	/* The slot lock was taken by the caller. */
+	if (async && ctx->cmd.cmds[slot].in_use) {
+		struct cmd_async_data *pending = &ctx->cmd.cmds[slot].pending;
+
+		if (ctx->cmd.cmds[slot].is_pending) {
+			assert(false);
+			return EINVAL;
+		}
+
+		/* We might get another page request event before the previous
+		 * command has completed; save the new work and do it once the
+		 * command completion arrives. */
+		pending->buff_in = in;
+		pending->buff_out = out;
+		pending->ilen = ilen;
+		pending->olen = olen;
+
+		ctx->cmd.cmds[slot].is_pending = true;
+		return 0;
+	}
 
 	err = mlx5_vfio_cmd_prep_in(ctx, cmd_in, cmd_lay, in, ilen);
 	if (err)
-		goto end;
+		return err;
 
 	err = mlx5_vfio_cmd_prep_out(ctx, cmd_out, cmd_lay, olen);
 	if (err)
-		goto end;
+		return err;
+
+	if (async) {
+		ctx->cmd.cmds[slot].in_use = true;
+		ctx->cmd.cmds[slot].curr.ilen = ilen;
+		ctx->cmd.cmds[slot].curr.olen = olen;
+		ctx->cmd.cmds[slot].curr.buff_in = in;
+		ctx->cmd.cmds[slot].curr.buff_out = out;
+	}
 
 	cmd_lay->status_own = 0x1;
 	udma_to_device_barrier();
 	mmio_write32_be(&init_seg->cmd_dbell, htobe32(0x1 << slot));
+	return 0;
+}
+
+static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in,
+			      int ilen, void *out, int olen,
+			      unsigned int slot)
+{
+	struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
+	struct mlx5_cmd_msg *cmd_out = &ctx->cmd.cmds[slot].out;
+	int err;
+
+	pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
+	err = mlx5_vfio_post_cmd(ctx, in, ilen, out, olen, slot, false);
+	if (err)
+		goto end;
 
 	if (ctx->have_eq) {
 		err = mlx5_vfio_wait_event(ctx, slot);
@@ -775,6 +891,8 @@ static int mlx5_vfio_setup_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
 
 	if (slot != MLX5_MAX_COMMANDS - 1)
 		cmd_slot->comp_func = mlx5_vfio_cmd_comp;
+	else
+		cmd_slot->comp_func = mlx5_vfio_process_page_request_comp;
 
 	pthread_mutex_init(&cmd_slot->lock, NULL);
 
@@ -1329,7 +1447,8 @@ static int create_async_eqs(struct mlx5_vfio_context *ctx)
 	param = (struct mlx5_eq_param) {
 		.irq_index = MLX5_VFIO_CMD_VEC_IDX,
 		.nent = MLX5_NUM_CMD_EQE,
-		.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD,
+		.mask[0] = 1ull << MLX5_EVENT_TYPE_CMD |
+			   1ull << MLX5_EVENT_TYPE_PAGE_REQUEST,
 	};
 
 	err = setup_async_eq(ctx, &param, &ctx->async_eq);
@@ -1343,6 +1462,49 @@ err:
 	return err;
 }
 
+static int mlx5_vfio_reclaim_pages(struct mlx5_vfio_context *ctx, uint32_t func_id,
+				   int npages)
+{
+	uint32_t inlen = DEVX_ST_SZ_BYTES(manage_pages_in);
+	int outlen;
+	uint32_t *out;
+	void *in;
+	int err;
+	int slot = MLX5_MAX_COMMANDS - 1;
+
+	outlen = DEVX_ST_SZ_BYTES(manage_pages_out);
+	outlen += npages * DEVX_FLD_SZ_BYTES(manage_pages_out, pas[0]);
+	out = calloc(1, outlen);
+	if (!out) {
+		errno = ENOMEM;
+		return errno;
+	}
+
+	in = calloc(1, inlen);
+	if (!in) {
+		err = ENOMEM;
+		errno = err;
+		goto out_free;
+	}
+
+	DEVX_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
+	DEVX_SET(manage_pages_in, in, op_mod, MLX5_PAGES_TAKE);
+	DEVX_SET(manage_pages_in, in, function_id, func_id);
+	DEVX_SET(manage_pages_in, in, input_num_entries, npages);
+
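+	/* Posted asynchronously: the firmware writes the reclaimed page
+	 * addresses into 'out', and the slot's completion handler frees
+	 * the pages and both buffers. */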
+	pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
+	err = mlx5_vfio_post_cmd(ctx, in, inlen, out, outlen, slot, true);
+	pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock);
+	if (!err)
+		return 0;
+
+	free(in);
+out_free:
+	free(out);
+	return err;
+}
+
 static int mlx5_vfio_enable_hca(struct mlx5_vfio_context *ctx)
 {
 	uint32_t in[DEVX_ST_SZ_DW(enable_hca_in)] = {};
@@ -1382,10 +1544,13 @@ static int mlx5_vfio_set_issi(struct mlx5_vfio_context *ctx)
 
 static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx,
 				uint16_t func_id,
-				int32_t npages)
+				int32_t npages,
+				bool is_event)
 {
 	int32_t out[DEVX_ST_SZ_DW(manage_pages_out)] = {};
 	int inlen = DEVX_ST_SZ_BYTES(manage_pages_in);
+	int slot = MLX5_MAX_COMMANDS - 1;
+	void *outp = out;
 	int i, err;
 	int32_t *in;
 	uint64_t iova;
@@ -1397,6 +1562,15 @@ static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx,
 		return errno;
 	}
 
+	if (is_event) {
+		outp = calloc(1, sizeof(out));
+		if (!outp) {
+			errno = ENOMEM;
+			err = errno;
+			goto end;
+		}
+	}
+
 	for (i = 0; i < npages; i++) {
 		err = mlx5_vfio_alloc_page(ctx, &iova);
 		if (err)
@@ -1410,11 +1584,22 @@ static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx,
 	DEVX_SET(manage_pages_in, in, function_id, func_id);
 	DEVX_SET(manage_pages_in, in, input_num_entries, npages);
 
-	err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out),
-				 MLX5_MAX_COMMANDS - 1);
-	if (!err)
+	if (is_event) {
+		pthread_mutex_lock(&ctx->cmd.cmds[slot].lock);
+		err = mlx5_vfio_post_cmd(ctx, in, inlen, outp, sizeof(out), slot, true);
+		pthread_mutex_unlock(&ctx->cmd.cmds[slot].lock);
+	} else {
+		err = mlx5_vfio_cmd_exec(ctx, in, inlen, outp, sizeof(out), slot);
+	}
+
+	if (!err) {
+		if (is_event)
+			return 0;
 		goto end;
+	}
 
 err:
+	if (is_event)
+		free(outp);
 	for (i--; i >= 0; i--)
 		mlx5_vfio_free_page(ctx, DEVX_GET64(manage_pages_in, in, pas[i]));
 end:
@@ -1454,7 +1639,7 @@ static int mlx5_vfio_satisfy_startup_pages(struct mlx5_vfio_context *ctx,
 	if (ret)
 		return ret;
 
-	return mlx5_vfio_give_pages(ctx, function_id, npages);
+	return mlx5_vfio_give_pages(ctx, function_id, npages, false);
 }
 
 static int mlx5_vfio_access_reg(struct mlx5_vfio_context *ctx, void *data_in,
@@ -2034,6 +2219,30 @@ static int mlx5_vfio_get_handle(struct mlx5_vfio_device *vfio_dev,
 	return 0;
 }
 
+int mlx5dv_vfio_get_events_fd(struct ibv_context *ibctx)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
+
+	return ctx->cmd_comp_fd;
+}
+
+int mlx5dv_vfio_process_events(struct ibv_context *ibctx)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
+	uint64_t u;
+	ssize_t s;
+
+	/* read to re-arm the FD and process all existing events */
+	s = read(ctx->cmd_comp_fd, &u, sizeof(uint64_t));
+	if (s < 0 && errno != EAGAIN) {
+		mlx5_err(ctx->dbg_fp, "%s, read failed, errno=%d\n",
+			 __func__, errno);
+		return errno;
+	}
+
+	return mlx5_vfio_process_async_events(ctx);
+}
+
 struct ibv_device **
 mlx5dv_get_vfio_device_list(struct mlx5dv_vfio_context_attr *attr)
 {
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 449a5c5..8e240c8 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -151,9 +151,17 @@ struct mlx5_cmd_msg {
 	struct mlx5_cmd_mailbox *next;
 };
 
+
 typedef int (*vfio_cmd_slot_comp)(struct mlx5_vfio_context *ctx,
 				  unsigned long slot);
 
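+/* Buffers and lengths of one asynchronously posted command. Ownership of
+ * both buffers passes to the slot's completion handler once the command
+ * has been posted. */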
+struct cmd_async_data {
+	void *buff_in;
+	int ilen;
+	void *buff_out;
+	int olen;
+};
+
 struct mlx5_vfio_cmd_slot {
 	struct mlx5_cmd_layout *lay;
 	struct mlx5_cmd_msg in;
@@ -161,6 +169,11 @@ struct mlx5_vfio_cmd_slot {
 	pthread_mutex_t lock;
 	int completion_event_fd;
 	vfio_cmd_slot_comp comp_func;
+	/* async cmd caller data */
+	bool in_use;
+	struct cmd_async_data curr;
+	bool is_pending;
+	struct cmd_async_data pending;
 };
 
 struct mlx5_vfio_cmd {
@@ -245,6 +258,7 @@ struct mlx5_vfio_context {
 		uint32_t hca_cur[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 		uint32_t hca_max[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 	} caps;
+	struct mlx5_eq async_eq;
 	struct mlx5_vfio_eqs_uar eqs_uar;
 	pthread_mutex_t eq_lock;
diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h
index 6aaea37..c9699ec 100644
--- a/providers/mlx5/mlx5dv.h
+++ b/providers/mlx5/mlx5dv.h
@@ -1487,6 +1487,14 @@ struct mlx5dv_vfio_context_attr {
 struct ibv_device **
 mlx5dv_get_vfio_device_list(struct mlx5dv_vfio_context_attr *attr);
 
+int mlx5dv_vfio_get_events_fd(struct ibv_context *ibctx);
+
+/* This API should be called from an application thread to process device
+ * events. The application is responsible for obtaining the events FD by
+ * calling mlx5dv_vfio_get_events_fd() and, once the FD is pollable, for
+ * calling this API to let the driver process the ready events. */
+int mlx5dv_vfio_process_events(struct ibv_context *context);
+
 struct ibv_context *
 mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr);
-- 
1.8.3.1