Use the block layer (submit_bio) to pass through the IO to the nvme
driver instead of the direct IO submission hooks.

Currently that code supports only read/write/flush, and it still
assumes that we talk to an nvme driver.

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
Note: a standalone sketch of the submit_bio()/blk_poll() pattern used
here follows the diff, for reference.

 drivers/nvme/mdev/Kconfig |   8 ++
 drivers/nvme/mdev/host.c  | 239 +++++++++++++++++++++++++++++++++++++-
 drivers/nvme/mdev/io.c    |   7 ++
 drivers/nvme/mdev/priv.h  |  61 ++++++++++
 4 files changed, 313 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/mdev/Kconfig b/drivers/nvme/mdev/Kconfig
index 7ebc66cdeac0..1ace298a364d 100644
--- a/drivers/nvme/mdev/Kconfig
+++ b/drivers/nvme/mdev/Kconfig
@@ -14,3 +14,11 @@ config NVME_MDEV_VFIO
 	  guest, also as a NVME namespace, attached to a virtual NVME controller
 	  If unsure, say N.
+
+config NVME_MDEV_VFIO_GENERIC_IO
+	bool "Use generic block layer IO"
+	depends on NVME_MDEV_VFIO
+	help
+	  Send the IO through the block layer using polled IO queues,
+	  instead of dedicated mdev queues.
+	  If unsure, say N.
diff --git a/drivers/nvme/mdev/host.c b/drivers/nvme/mdev/host.c
index 6590946b86c2..a2ba69dcf4f2 100644
--- a/drivers/nvme/mdev/host.c
+++ b/drivers/nvme/mdev/host.c
@@ -53,6 +53,7 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
 	if (!hctrl)
 		return NULL;
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
 	nr_host_queues = ctrl->ops->ext_queues_available(ctrl);
 	max_lba_transfer = ctrl->max_hw_sectors >> (PAGE_SHIFT - 9);
 
@@ -63,6 +64,15 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
 		return NULL;
 	}
 
+	hctrl->oncs = ctrl->oncs &
+		(NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES);
+#else
+	/* for now don't deal with bio chaining */
+	max_lba_transfer = BIO_MAX_PAGES;
+	nr_host_queues = MDEV_NVME_NUM_BIO_QUEUES;
+	/* for now no support for write zeroes and discard */
+	hctrl->oncs = 0;
+#endif
 	kref_init(&hctrl->ref);
 	mutex_init(&hctrl->lock);
@@ -70,8 +80,6 @@ static struct nvme_mdev_hctrl *nvme_mdev_hctrl_create(struct nvme_ctrl *ctrl)
 	hctrl->nvme_ctrl = ctrl;
 	nvme_get_ctrl(ctrl);
 
-	hctrl->oncs = ctrl->oncs &
-		(NVME_CTRL_ONCS_DSM | NVME_CTRL_ONCS_WRITE_ZEROES);
 	hctrl->id = ctrl->instance;
 	hctrl->node = dev_to_node(ctrl->dev);
 
@@ -200,6 +208,8 @@ bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode)
 	}
 }
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+
 /* Allocate a host IO queue */
 int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl)
 {
@@ -228,6 +238,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid)
 
 /* Submit a IO passthrough command */
 int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+			      struct nvme_mdev_vns *vns,
 			      u16 qid, u32 tag,
 			      struct nvme_command *cmd,
 			      struct nvme_ext_data_iter *datait)
@@ -248,6 +259,226 @@ int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl,
 	return ctrl->ops->ext_queue_poll(ctrl, qid, results, max_len);
 }
 
+#else
+
+/* Allocate a 'host' queue - here the queues are virtual */
+int nvme_mdev_hctrl_hq_alloc(struct nvme_mdev_hctrl *hctrl)
+{
+	int qid, ret;
+	struct hw_mbio_queue *hwq;
+
+	for (qid = 0; qid < MDEV_NVME_NUM_BIO_QUEUES; qid++)
+		if (!hctrl->hw_queues[qid])
+			break;
+
+	if (qid == MDEV_NVME_NUM_BIO_QUEUES)
+		return -ENOSPC;
+
+	hwq = kzalloc_node(sizeof(*hwq), GFP_KERNEL, hctrl->node);
+	if (!hwq)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&hwq->bios_in_flight);
+
+	ret = bioset_init(&hwq->bioset, MDEV_NVME_BIO_QUEUE_SIZE,
+			  offsetof(struct mbio, bio), BIOSET_NEED_BVECS);
+
+	if (ret < 0) {
+		kfree(hwq);
+		return ret;
+	}
+
+	hctrl->hw_queues[qid] = hwq;
+	return qid + 1;
+}
+
+/* Free a 'host' queue - here the queues are virtual */
+void nvme_mdev_hctrl_hq_free(struct nvme_mdev_hctrl *hctrl, u16 qid)
+{
+	struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+
+	if (WARN_ON(!hwq))
+		return;
+
+	WARN_ON(!list_empty(&hwq->bios_in_flight));
+	WARN_ON(hwq->inflight);
+
+	hctrl->hw_queues[qid - 1] = NULL;
+	bioset_exit(&hwq->bioset);
+	kfree(hwq);
+}
+
+/*
+ * Check if the host queue has space for submission - this is our own
+ * limit, not one imposed by the block layer
+ */
+bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid)
+{
+	struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+
+	if (WARN_ON(!hwq))
+		return false;
+	return hwq->inflight < MDEV_NVME_BIO_QUEUE_SIZE;
+}
+
+/*
+ * Callback we get from the block layer.
+ * Note that despite polling, this can be run from IRQ context
+ */
+static void nvme_mdev_hctrl_bio_done(struct bio *bio)
+{
+	struct mbio *mbio = container_of(bio, struct mbio, bio);
+
+	/* this will mark this bio as done, and allow the polling thread
+	 * to return it to the user
+	 */
+	mbio->status = nvme_mdev_translate_error_block(bio->bi_status);
+}
+
+/* Submit an IO passthrough command */
+int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+			      struct nvme_mdev_vns *vns,
+			      u16 qid, u32 tag,
+			      struct nvme_command *cmd,
+			      struct nvme_ext_data_iter *datait)
+{
+	struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+	struct bio *bio = NULL;
+	struct mbio *mbio;
+	struct page *page;
+	u8 opcode = cmd->common.opcode;
+	int retval, op, op_flags = 0;
+	int offset;
+
+	if (WARN_ON(!hwq))
+		return -EINVAL;
+	if (WARN_ON(hwq->inflight >= MDEV_NVME_BIO_QUEUE_SIZE))
+		return -EBUSY;
+
+	/* read/write buffer processing */
+	if (opcode == nvme_cmd_read || opcode == nvme_cmd_write) {
+		unsigned long datalength =
+			(le16_to_cpu(cmd->rw.length) + 1) << vns->blksize_shift;
+
+		if (opcode == nvme_cmd_read) {
+			op = REQ_OP_READ;
+		} else {
+			op = REQ_OP_WRITE;
+			op_flags = REQ_SYNC | REQ_IDLE;
+			if (cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
+				op_flags |= REQ_FUA;
+		}
+
+		if (WARN_ON(datait->count > BIO_MAX_PAGES))
+			return -EINVAL;
+
+		bio = bio_alloc_bioset(GFP_KERNEL, datait->count, &hwq->bioset);
+		if (WARN_ON(!bio))
+			return -ENOMEM;
+
+		mbio = container_of(bio, struct mbio, bio);
+
+		/* starting sector */
+		bio->bi_iter.bi_sector = le64_to_cpu(cmd->rw.slba) <<
+			(vns->blksize_shift - 9);
+
+		/* Data. Last page might be partial size */
+		while (datait->count) {
+			int chunk = min(PAGE_SIZE, datalength);
+
+			if (WARN_ON(datalength == 0))
+				break;
+
+			page = pfn_to_page(PHYS_PFN(datait->physical));
+			offset = OFFSET_IN_PAGE(datait->physical);
+
+			if (bio_add_page(&mbio->bio, page,
+					 chunk, offset) != chunk) {
+				WARN_ON(1);
+				retval = -ENOMEM;
+				goto error;
+			}
+
+			retval = datait->next(datait);
+			if (WARN_ON(retval))
+				goto error;
+			datalength -= chunk;
+		}
+
+	/* flush request */
+	} else if (opcode == nvme_cmd_flush) {
+		op = REQ_OP_WRITE;
+		op_flags = REQ_PREFLUSH;
+		bio = bio_alloc_bioset(GFP_KERNEL, 0, &hwq->bioset);
+		if (WARN_ON(!bio))
+			return -ENOMEM;
+		mbio = container_of(bio, struct mbio, bio);
+	} else {
+		retval = -EINVAL;
+		goto error;
+	}
+
+	/* set polling */
+	op_flags |= REQ_HIPRI | REQ_NOWAIT;
+
+	/* setup the bio */
+	bio_set_dev(bio, vns->host_part);
+	bio->bi_end_io = nvme_mdev_hctrl_bio_done;
+	bio_set_op_attrs(bio, op, op_flags);
+
+	/* setup our portion of the bio */
+	mbio = container_of(bio, struct mbio, bio);
+	mbio->tag = tag;
+	mbio->status = NVME_STATUS_PENDING;
+	mbio->blk_queue = bdev_get_queue(vns->host_part);
+
+	/* submit the bio */
+	mbio->cookie = submit_bio(bio);
+
+	list_add_tail(&mbio->link, &hwq->bios_in_flight);
+	hwq->inflight++;
+	return 0;
+error:
+	if (bio)
+		bio_put(bio);
+	return retval;
+}
+
+/* Poll for completion of IO passthrough commands */
+int nvme_mdev_hctrl_hq_poll(struct nvme_mdev_hctrl *hctrl,
+			    u32 qid,
+			    struct nvme_ext_cmd_result *results,
+			    unsigned int max_len)
+{
+	struct hw_mbio_queue *hwq = hctrl->hw_queues[qid - 1];
+	struct mbio *mbio, *tmp;
+
+	int i = 0;
+
+	if (!hwq->inflight)
+		return -1;
+
+	list_for_each_entry_safe(mbio, tmp, &hwq->bios_in_flight, link) {
+		if (mbio->status == NVME_STATUS_PENDING)
+			blk_poll(mbio->blk_queue, mbio->cookie, false);
+
+		if (mbio->status == NVME_STATUS_PENDING)
+			continue;
+
+		results[i].tag = mbio->tag;
+		results[i].status = mbio->status;
+
+		hwq->inflight--;
+		list_del(&mbio->link);
+		bio_put(&mbio->bio);
+
+		if (++i == max_len)
+			break;
+	}
+	return i;
+}
+#endif
+
 /* Destroy all host controllers */
 void nvme_mdev_hctrl_destroy_all(void)
 {
@@ -486,6 +717,10 @@ static int __init nvme_mdev_init(void)
 	}
 
 	pr_info("nvme_mdev " NVME_MDEV_FIRMWARE_VERSION " loaded\n");
+
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+	pr_info("nvme_mdev: using block layer polled IO\n");
+#endif
 	return 0;
 }
 
diff --git a/drivers/nvme/mdev/io.c b/drivers/nvme/mdev/io.c
index 39550d0e3649..d3c46de33b01 100644
--- a/drivers/nvme/mdev/io.c
+++ b/drivers/nvme/mdev/io.c
@@ -70,7 +70,11 @@ static int nvme_mdev_io_translate_rw(struct io_ctx *ctx)
 	if (!check_range(slba, length, ctx->ns->ns_size))
 		return DNR(NVME_SC_LBA_RANGE);
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
 	ctx->out.rw.slba = cpu_to_le64(slba + ctx->ns->host_lba_offset);
+#else
+	ctx->out.rw.slba = in->slba;
+#endif
 	ctx->out.rw.length = in->length;
 
 	ret = nvme_mdev_udata_iter_set_dptr(&ctx->udatait, &in->dptr,
@@ -195,7 +199,9 @@ static int nvme_mdev_io_translate_dsm(struct io_ctx *ctx)
 		_DBG(ctx->vctrl, "IOQ: DSM_MANAGEMENT: RANGE 0x%llx-0x%x\n",
 		     slba, nlb);
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
 		data_ptr[i].slba = cpu_to_le64(slba + ctx->ns->host_lba_offset);
+#endif
 	}
 
 	ctx->out.dsm.attributes = in->attributes;
@@ -280,6 +286,7 @@ static bool nvme_mdev_io_process_sq(struct io_ctx *ctx, u16 sqid)
 
 		/*passthrough*/
 		ret = nvme_mdev_hctrl_hq_submit(ctx->hctrl,
+						ctx->ns,
 						vsq->hsq,
 						(((u32)vsq->qid) << 16) | ((u32)ucid),
 						&ctx->out,
diff --git a/drivers/nvme/mdev/priv.h b/drivers/nvme/mdev/priv.h
index a11a1842957d..1dd5fce0bfa6 100644
--- a/drivers/nvme/mdev/priv.h
+++ b/drivers/nvme/mdev/priv.h
@@ -34,7 +34,12 @@
 #define MAX_VIRTUAL_NAMESPACES 16 /* NSID = 1..16*/
 #define MAX_VIRTUAL_IRQS 16
 
+#ifndef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
 #define MAX_HOST_QUEUES 4
+#else
+#define MAX_HOST_QUEUES 1
+#endif
+
 #define MAX_AER_COMMANDS 16
 #define MAX_LOG_PAGES 16
 
@@ -323,6 +328,39 @@ struct nvme_mdev_inst_type {
 	struct attribute_group *attrgroup;
 };
 
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+
+#define MDEV_NVME_BIO_QUEUE_SIZE 128
+#define NVME_STATUS_PENDING 0xFFFF
+#define MDEV_NVME_NUM_BIO_QUEUES 16
+
+struct mbio {
+	/* link in a list of pending bios */
+	struct list_head link;
+
+	struct request_queue *blk_queue;
+
+	/* polling cookie returned by submit_bio() */
+	unsigned int cookie;
+
+	/* tag from the translation (user cid + user qid) */
+	u32 tag;
+
+	/* resulting NVME status */
+	u16 status;
+
+	/* must be last for bioset allocation */
+	struct bio bio;
+};
+
+struct hw_mbio_queue {
+	int inflight;
+	struct list_head bios_in_flight;
+	struct bio_set bioset;
+};
+
+#endif
+
 /*Abstraction of the host controller that we are connected to */
 struct nvme_mdev_hctrl {
 	struct mutex lock;
@@ -344,6 +382,10 @@ struct nvme_mdev_hctrl {
 
 	/* book-keeping for number of host queues we can allocate*/
 	unsigned int nr_host_queues;
+
+#ifdef CONFIG_NVME_MDEV_VFIO_GENERIC_IO
+	struct hw_mbio_queue *hw_queues[MDEV_NVME_NUM_BIO_QUEUES];
+#endif
 };
 
 /* vctrl.c*/
@@ -415,6 +457,7 @@ bool nvme_mdev_hctrl_hq_can_submit(struct nvme_mdev_hctrl *hctrl, u16 qid);
 bool nvme_mdev_hctrl_hq_check_op(struct nvme_mdev_hctrl *hctrl, u8 optcode);
 
 int nvme_mdev_hctrl_hq_submit(struct nvme_mdev_hctrl *hctrl,
+			      struct nvme_mdev_vns *vns,
 			      u16 qid, u32 tag,
 			      struct nvme_command *cmd,
 			      struct nvme_ext_data_iter *datait);
@@ -701,6 +744,24 @@ static inline int nvme_mdev_translate_error(int error)
 	}
 }
 
+static inline int nvme_mdev_translate_error_block(blk_status_t blk_sts)
+{
+	switch (blk_sts) {
+	case BLK_STS_OK:
+		return NVME_SC_SUCCESS;
+	case BLK_STS_NOSPC:
+		return DNR(NVME_SC_CAP_EXCEEDED);
+	case BLK_STS_TARGET:
+		return DNR(NVME_SC_LBA_RANGE);
+	case BLK_STS_NOTSUPP:
+		return DNR(NVME_SC_INVALID_OPCODE);
+	case BLK_STS_MEDIUM:
+		return DNR(NVME_SC_ACCESS_DENIED);
+	default:
+		return DNR(NVME_SC_INTERNAL);
+	}
+}
+
 static inline bool timeout(ktime_t event, ktime_t now, unsigned long timeout_ms)
 {
 	return ktime_ms_delta(now, event) > (long)timeout_ms;
-- 
2.17.2
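
For reference, here is a minimal standalone sketch (not part of the patch) of
the polled-bio pattern the new nvme_mdev_hctrl_hq_submit()/hq_poll() paths are
built on: submit a bio marked REQ_HIPRI, keep the blk_qc_t cookie returned by
submit_bio(), and reap the completion with blk_poll() instead of sleeping. It
assumes the same block layer API generation the patch targets; the helper names
are made up, and unlike the patch, which embeds the bio inside struct mbio via
a bio_set front pad and container_of(), the sketch simply uses bi_private.

/* Illustration only; helper names are hypothetical, not part of the patch. */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

struct polled_io {
	struct completion done;
};

static void polled_io_end(struct bio *bio)
{
	struct polled_io *p = bio->bi_private;

	/* may run from IRQ context if the queue falls back to interrupts */
	complete(&p->done);
}

/* Read one page from 'bdev' at 'sector', reaping the completion by polling */
static int read_one_page_polled(struct block_device *bdev, sector_t sector,
				struct page *page)
{
	struct polled_io p;
	struct bio *bio;
	blk_qc_t cookie;
	int ret;

	init_completion(&p.done);

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio)
		return -ENOMEM;

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_READ | REQ_HIPRI;	/* REQ_HIPRI selects polled completion */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_private = &p;
	bio->bi_end_io = polled_io_end;

	/* submit_bio() returns a cookie identifying where the bio was queued */
	cookie = submit_bio(bio);

	/* instead of waiting for an IRQ-driven completion, poll the queue */
	while (!try_wait_for_completion(&p.done))
		blk_poll(bdev_get_queue(bdev), cookie, false);

	ret = blk_status_to_errno(bio->bi_status);
	bio_put(bio);
	return ret;
}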