Support initial DEVX/DV APIs over vfio for UMEM/UAR/EQN usage.

Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
---
Review notes (below the '---' marker, so not part of the commit message):
- vfio_devx_umem_dereg(): size the command mailboxes with the
  destroy_umem_in/out layouts that DEVX_SET() already uses, instead of
  the larger create_umem_in/out layouts.
- vfio_devx_query_eqn(): compare with '>=' so that a zero
  num_comp_vectors cannot wrap the upper bound to UINT_MAX.
- Fix the "singelton" typo in mlx5_vfio_alloc_context().

 providers/mlx5/mlx5_ifc.h  |  70 ++++++++++++++
 providers/mlx5/mlx5_vfio.c | 228 ++++++++++++++++++++++++++++++++++++++++++++-
 providers/mlx5/mlx5_vfio.h |  10 ++
 3 files changed, 307 insertions(+), 1 deletion(-)

diff --git a/providers/mlx5/mlx5_ifc.h b/providers/mlx5/mlx5_ifc.h
index 1cbe846..1bd7466 100644
--- a/providers/mlx5/mlx5_ifc.h
+++ b/providers/mlx5/mlx5_ifc.h
@@ -88,6 +88,8 @@ enum {
 	MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00,
 	MLX5_CMD_OP_MODIFY_GENERAL_OBJECT = 0xa01,
 	MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02,
+	MLX5_CMD_OP_CREATE_UMEM = 0xa08,
+	MLX5_CMD_OP_DESTROY_UMEM = 0xa0a,
 	MLX5_CMD_OP_SYNC_STEERING = 0xb00,
 };
 
@@ -4656,4 +4658,72 @@ struct mlx5_ifc_dealloc_pd_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_mtt_bits {
+	u8         ptag_63_32[0x20];
+
+	u8         ptag_31_8[0x18];
+	u8         reserved_at_38[0x6];
+	u8         wr_en[0x1];
+	u8         rd_en[0x1];
+};
+
+struct mlx5_ifc_umem_bits {
+	u8         reserved_at_0[0x80];
+
+	u8         reserved_at_80[0x1b];
+	u8         log_page_size[0x5];
+
+	u8         page_offset[0x20];
+
+	u8         num_of_mtt[0x40];
+
+	struct mlx5_ifc_mtt_bits mtt[];
+};
+
+struct mlx5_ifc_create_umem_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+
+	struct mlx5_ifc_umem_bits umem;
+};
+
+struct mlx5_ifc_create_umem_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         umem_id[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_in_bits {
+	u8         opcode[0x10];
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x8];
+	u8         umem_id[0x18];
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
 #endif /* MLX5_IFC_H */
diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index 23c6eeb..5e55697 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -37,6 +37,8 @@ enum {
 	MLX5_VFIO_SUPP_MR_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE |
 		IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
 		IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_RELAXED_ORDERING,
+	MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS = IBV_ACCESS_LOCAL_WRITE |
+		IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ,
 };
 
 static int mlx5_vfio_give_pages(struct mlx5_vfio_context *ctx, uint16_t func_id,
@@ -173,7 +175,6 @@ static void mlx5_vfio_free_page(struct mlx5_vfio_context *ctx, uint64_t iova)
 		bitmap_set_bit(page_block->free_pages, pg);
 		if (bitmap_full(page_block->free_pages, MLX5_VFIO_BLOCK_NUM_PAGES))
 			mlx5_vfio_free_block(ctx, page_block);
-
 		goto end;
 	}
 
@@ -2467,6 +2468,220 @@ vfio_devx_obj_create(struct ibv_context *context, const void *in,
 	return NULL;
 }
 
+static int vfio_devx_query_eqn(struct ibv_context *ibctx, uint32_t vector,
+			       uint32_t *eqn)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
+
+	if (vector >= ibctx->num_comp_vectors)
+		return EINVAL;
+
+	/* For now use the singleton EQN created for async events */
+	*eqn = ctx->async_eq.eqn;
+	return 0;
+}
+
+static struct mlx5dv_devx_uar *
+vfio_devx_alloc_uar(struct ibv_context *ibctx, uint32_t flags)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
+	struct mlx5_devx_uar *uar;
+
+	if (flags != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	uar = calloc(1, sizeof(*uar));
+	if (!uar) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	uar->dv_devx_uar.page_id = ctx->eqs_uar.uarn;
+	uar->dv_devx_uar.base_addr = (void *)ctx->eqs_uar.iova;
+	uar->dv_devx_uar.reg_addr = uar->dv_devx_uar.base_addr + MLX5_BF_OFFSET;
+	uar->context = ibctx;
+
+	return &uar->dv_devx_uar;
+}
+
+static void vfio_devx_free_uar(struct mlx5dv_devx_uar *dv_devx_uar)
+{
+	free(dv_devx_uar);
+}
+
+static struct mlx5dv_devx_umem *
+_vfio_devx_umem_reg(struct ibv_context *context,
+		    void *addr, size_t size, uint32_t access,
+		    uint64_t pgsz_bitmap)
+{
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(context);
+	uint32_t out[DEVX_ST_SZ_DW(create_umem_out)] = {};
+	struct mlx5_vfio_devx_umem *vfio_umem;
+	int iova_page_shift;
+	uint64_t iova_size;
+	int ret;
+	void *in;
+	uint32_t inlen;
+	__be64 *mtt;
+	void *umem;
+	bool writeable;
+	void *aligned_va;
+	int num_pas;
+
+	if (!check_comp_mask(access, MLX5_VFIO_SUPP_UMEM_ACCESS_FLAGS)) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	if ((access & IBV_ACCESS_REMOTE_WRITE) &&
+	    !(access & IBV_ACCESS_LOCAL_WRITE)) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	/* Page size that encloses the start and end of the umem range */
+	iova_size = max(roundup_pow_of_two(size + ((uint64_t) addr & (ctx->iova_min_page_size - 1))),
+			ctx->iova_min_page_size);
+
+	if (!(iova_size & pgsz_bitmap)) {
+		/* input should include the iova page size */
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	writeable = access &
+		(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+
+	vfio_umem = calloc(1, sizeof(*vfio_umem));
+	if (!vfio_umem) {
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	vfio_umem->iova_size = iova_size;
+	if (ibv_dontfork_range(addr, size))
+		goto err;
+
+	ret = iset_alloc_range(ctx->iova_alloc, vfio_umem->iova_size, &vfio_umem->iova);
+	if (ret)
+		goto err_alloc;
+
+	/* The registration's arguments have to reflect real VA presently mapped into the process */
+	aligned_va = (void *) ((unsigned long) addr & ~(ctx->iova_min_page_size - 1));
+	vfio_umem->iova_reg_size = align((addr + size) - aligned_va, ctx->iova_min_page_size);
+	ret = mlx5_vfio_register_mem(ctx, aligned_va, vfio_umem->iova, vfio_umem->iova_reg_size);
+	if (ret)
+		goto err_reg;
+
+	iova_page_shift = ilog32(vfio_umem->iova_size - 1);
+	num_pas = 1;
+	if (iova_page_shift > MLX5_MAX_PAGE_SHIFT) {
+		iova_page_shift = MLX5_MAX_PAGE_SHIFT;
+		num_pas = DIV_ROUND_UP(vfio_umem->iova_size, (1ULL << iova_page_shift));
+	}
+
+	inlen = DEVX_ST_SZ_BYTES(create_umem_in) + DEVX_ST_SZ_BYTES(mtt) * num_pas;
+
+	in = calloc(1, inlen);
+	if (!in) {
+		errno = ENOMEM;
+		goto err_in;
+	}
+
+	umem = DEVX_ADDR_OF(create_umem_in, in, umem);
+	mtt = (__be64 *)DEVX_ADDR_OF(umem, umem, mtt);
+
+	DEVX_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
+	DEVX_SET64(umem, umem, num_of_mtt, num_pas);
+	DEVX_SET(umem, umem, log_page_size, iova_page_shift - MLX5_ADAPTER_PAGE_SHIFT);
+	DEVX_SET(umem, umem, page_offset, addr - aligned_va);
+
+	mlx5_vfio_populate_pas(vfio_umem->iova, num_pas, (1ULL << iova_page_shift), mtt,
+			       (writeable ? MLX5_MTT_WRITE : 0) | MLX5_MTT_READ);
+
+	ret = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
+	if (ret)
+		goto err_exec;
+
+	free(in);
+
+	vfio_umem->dv_devx_umem.umem_id = DEVX_GET(create_umem_out, out, umem_id);
+	vfio_umem->context = context;
+	vfio_umem->addr = addr;
+	vfio_umem->size = size;
+	return &vfio_umem->dv_devx_umem;
+
+err_exec:
+	free(in);
+err_in:
+	mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size);
+err_reg:
+	iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size);
+err_alloc:
+	ibv_dofork_range(addr, size);
+err:
+	free(vfio_umem);
+	return NULL;
+}
+
+static struct mlx5dv_devx_umem *
+vfio_devx_umem_reg(struct ibv_context *context,
+		   void *addr, size_t size, uint32_t access)
+{
+	return _vfio_devx_umem_reg(context, addr, size, access, UINT64_MAX);
+}
+
+static struct mlx5dv_devx_umem *
+vfio_devx_umem_reg_ex(struct ibv_context *ctx, struct mlx5dv_devx_umem_in *in)
+{
+	if (!check_comp_mask(in->comp_mask, 0)) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	return _vfio_devx_umem_reg(ctx, in->addr, in->size, in->access, in->pgsz_bitmap);
+}
+
+static int vfio_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem)
+{
+	struct mlx5_vfio_devx_umem *vfio_umem =
+		container_of(dv_devx_umem, struct mlx5_vfio_devx_umem,
+			     dv_devx_umem);
+	struct mlx5_vfio_context *ctx = to_mvfio_ctx(vfio_umem->context);
+	uint32_t in[DEVX_ST_SZ_DW(destroy_umem_in)] = {};
+	uint32_t out[DEVX_ST_SZ_DW(destroy_umem_out)] = {};
+	int ret;
+
+	DEVX_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
+	DEVX_SET(destroy_umem_in, in, umem_id, dv_devx_umem->umem_id);
+
+	ret = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
+	if (ret)
+		return ret;
+
+	mlx5_vfio_unregister_mem(ctx, vfio_umem->iova, vfio_umem->iova_reg_size);
+	iset_insert_range(ctx->iova_alloc, vfio_umem->iova, vfio_umem->iova_size);
+	ibv_dofork_range(vfio_umem->addr, vfio_umem->size);
+	free(vfio_umem);
+	return 0;
+}
+
+static int vfio_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type)
+{
+	struct ibv_pd *pd_in = obj->pd.in;
+	struct mlx5dv_pd *pd_out = obj->pd.out;
+	struct mlx5_pd *mpd = to_mpd(pd_in);
+
+	if (obj_type != MLX5DV_OBJ_PD)
+		return EOPNOTSUPP;
+
+	pd_out->comp_mask = 0;
+	pd_out->pdn = mpd->pdn;
+	return 0;
+}
+
 static int vfio_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in,
 			       size_t inlen, void *out, size_t outlen)
 {
@@ -2476,6 +2691,13 @@ static int vfio_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in,
 static struct mlx5_dv_context_ops mlx5_vfio_dv_ctx_ops = {
 	.devx_obj_create = vfio_devx_obj_create,
 	.devx_obj_query = vfio_devx_obj_query,
+	.devx_query_eqn = vfio_devx_query_eqn,
+	.devx_alloc_uar = vfio_devx_alloc_uar,
+	.devx_free_uar = vfio_devx_free_uar,
+	.devx_umem_reg = vfio_devx_umem_reg,
+	.devx_umem_reg_ex = vfio_devx_umem_reg_ex,
+	.devx_umem_dereg = vfio_devx_umem_dereg,
+	.init_obj = vfio_init_obj,
 };
 
 static void mlx5_vfio_uninit_context(struct mlx5_vfio_context *ctx)
@@ -2544,6 +2766,10 @@ mlx5_vfio_alloc_context(struct ibv_device *ibdev,
 
 	verbs_set_ops(&mctx->vctx, &mlx5_vfio_common_ops);
 	mctx->dv_ctx_ops = &mlx5_vfio_dv_ctx_ops;
+
+	/* For now only a singleton EQ is supported */
+	mctx->vctx.context.num_comp_vectors = 1;
+
 	return &mctx->vctx;
 
 func_teardown:
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 79b8033..766c48c 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -47,6 +47,16 @@ struct mlx5_vfio_mr {
 	uint64_t iova_reg_size;
 };
 
+struct mlx5_vfio_devx_umem {
+	struct mlx5dv_devx_umem dv_devx_umem;
+	struct ibv_context *context;
+	void *addr;
+	size_t size;
+	uint64_t iova;
+	uint64_t iova_size;
+	uint64_t iova_reg_size;
+};
+
 struct mlx5_vfio_device {
 	struct verbs_device vdev;
 	char *pci_name;
-- 
1.8.3.1