Add async_ioctl handler that implements asynchronous handling of ioctl operation. If requested ioctl opcode does not involve submitting a command to device (e.g. NVME_IOCTL_ID), it is made to return instantly. Otherwise, ioctl-completion is decoupled from submission, and -EIOCBQUEUED is returned post submission. When completion arrives from device, nvme calls the ioctl-completion handler supplied by upper-layer. But there is execption to that. An ioctl completion may also require updating certain ioctl-specific user buffers/fields which can be accessed only in context of original submitter-task. For such ioctl, nvme-completion schedules a task-work which first updates ioctl-specific buffers/fields and after that invokes the ioctl-completion handler. Signed-off-by: Kanchan Joshi <joshi.k@xxxxxxxxxxx> Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx> --- drivers/nvme/host/core.c | 347 +++++++++++++++++++++++++++++++-------- 1 file changed, 280 insertions(+), 67 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 200bdd672c28..57f3040bae34 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -21,6 +21,7 @@ #include <linux/nvme_ioctl.h> #include <linux/pm_qos.h> #include <asm/unaligned.h> +#include <linux/task_work.h> #include "nvme.h" #include "fabrics.h" @@ -1092,7 +1093,107 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) } } -void nvme_execute_passthru_rq(struct request *rq) +struct async_pt_desc { + struct bio *bio; + int status; /* command status */ + u64 result; /* nvme cmd result */ + void __user *res_ptr; /* can be null, 32bit addr or 64 bit addr */ + void __user *meta_ptr; + void *meta; /* kernel-space resident buffer */ + unsigned metalen; /* length of meta */ + bool is_res64 : 1; /* res_ptr refers to 64bit of space */ + bool is_write : 1; + bool is_taskwork : 1; +}; + +static int nvme_add_task_work(struct task_struct *tsk, + struct callback_head *twork, + task_work_func_t work_func) +{ + int ret; + + get_task_struct(tsk); + init_task_work(twork, work_func); + ret = task_work_add(tsk, twork, TWA_SIGNAL); + if (!ret) + wake_up_process(tsk); + return ret; +} + +static void async_pt_update_work(struct callback_head *cbh) +{ + struct pt_ioctl_ctx *ptioc; + struct async_pt_desc *ptd; + struct task_struct *tsk; + int ret; + + ptioc = container_of(cbh, struct pt_ioctl_ctx, pt_work); + ptd = ptioc->ioc_data; + tsk = ptioc->task; + + /* handle meta update */ + if (ptd->meta) { + if (!ptd->status && !ptd->is_write) + if (copy_to_user(ptd->meta_ptr, ptd->meta, ptd->metalen)) + ptd->status = -EFAULT; + kfree(ptd->meta); + } + /* handle result update */ + if (ptd->res_ptr) { + if (!ptd->is_res64) + ret = put_user(ptd->result, (u32 __user *)ptd->res_ptr); + else + ret = put_user(ptd->result, (u64 __user *)ptd->res_ptr); + if (ret) + ptd->status = -EFAULT; + } + + ptioc->pt_complete(ptioc, ptd->status); + put_task_struct(tsk); + kfree(ptd); +} + +static void nvme_end_async_pt(struct request *req, blk_status_t err) +{ + struct pt_ioctl_ctx *ptioc; + struct async_pt_desc *ptd; + struct bio *bio; + + ptioc = req->end_io_data; + ptd = ptioc->ioc_data; + + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ptd->status = -EINTR; + else + ptd->status = nvme_req(req)->status; + + ptd->result = le64_to_cpu(nvme_req(req)->result.u64); + bio = ptd->bio; + /* setup task work if needed */ + if (ptd->is_taskwork) { + int ret = nvme_add_task_work(ptioc->task, &ptioc->pt_work, + async_pt_update_work); + /* update failure if task-work could not be setup */ + if (ret < 0) { + put_task_struct(ptioc->task); + ptioc->pt_complete(ptioc, ret); + kfree(ptd->meta); + kfree(ptd); + } + } else { + /* return status via callback, nothing else to update */ + ptioc->pt_complete(ptioc, ptd->status); + kfree(ptd); + } + + /* unmap pages, free bio, nvme command and request */ + blk_rq_unmap_user(bio); + kfree(nvme_req(req)->cmd); + blk_mq_free_request(req); +} + + +void nvme_execute_passthru_rq_common(struct request *rq, int async) { struct nvme_command *cmd = nvme_req(rq)->cmd; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; @@ -1101,15 +1202,52 @@ void nvme_execute_passthru_rq(struct request *rq) u32 effects; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - blk_execute_rq(rq->q, disk, rq, 0); + if (!async) + blk_execute_rq(rq->q, disk, rq, 0); + else + blk_execute_rq_nowait(rq->q, disk, rq, 0, nvme_end_async_pt); nvme_passthru_end(ctrl, effects); } + +void nvme_execute_passthru_rq(struct request *rq) +{ + return nvme_execute_passthru_rq_common(rq, 0); +} EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); +static int setup_async_pt_desc(struct request *rq, struct pt_ioctl_ctx *ptioc, + void __user *resptr, void __user *meta_buffer, void *meta, + unsigned meta_len, bool write, bool is_res64) +{ + struct async_pt_desc *ptd; + + ptd = kzalloc(sizeof(struct async_pt_desc), GFP_KERNEL); + if (!ptd) + return -ENOMEM; + + /* to free bio on completion, as req->bio will be null at that time */ + ptd->bio = rq->bio; + ptd->res_ptr = resptr; + ptd->is_write = write; + ptd->is_res64 = is_res64; + if (meta) { + ptd->meta_ptr = meta_buffer; + ptd->meta = meta; + ptd->metalen = meta_len; + } + if (resptr) + ptd->is_taskwork = 1; + + ptioc->ioc_data = ptd; + rq->end_io_data = ptioc; + return 0; +} + static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout, + struct pt_ioctl_ctx *ptioc, bool is_res64) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -1145,6 +1283,18 @@ static int nvme_submit_user_cmd(struct request_queue *q, } } + if (ptioc) { /* async handling */ + ret = setup_async_pt_desc(req, ptioc, result, meta_buffer, + meta, meta_len, write, is_res64); + if (ret) { + kfree(meta); + goto out_unmap; + } + /* send request for async processing */ + nvme_execute_passthru_rq_common(req, 1); + return ret; + } + /* sync handling */ nvme_execute_passthru_rq(req); if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ret = -EINTR; @@ -1521,10 +1671,11 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval) return (void __user *)ptrval; } -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio, + struct pt_ioctl_ctx *ptioc) { struct nvme_user_io io; - struct nvme_command c; + struct nvme_command c, *cptr; unsigned length, meta_len; void __user *metadata; @@ -1554,31 +1705,42 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return -EINVAL; } - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->head->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return nvme_submit_user_cmd(ns->queue, &c, + if (!ptioc) + cptr = &c; + else { /* for async - allocate cmd dynamically */ + cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!cptr) + return -ENOMEM; + } + + memset(cptr, 0, sizeof(c)); + cptr->rw.opcode = io.opcode; + cptr->rw.flags = io.flags; + cptr->rw.nsid = cpu_to_le32(ns->head->ns_id); + cptr->rw.slba = cpu_to_le64(io.slba); + cptr->rw.length = cpu_to_le16(io.nblocks); + cptr->rw.control = cpu_to_le16(io.control); + cptr->rw.dsmgmt = cpu_to_le32(io.dsmgmt); + cptr->rw.reftag = cpu_to_le32(io.reftag); + cptr->rw.apptag = cpu_to_le16(io.apptag); + cptr->rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, cptr, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + ptioc, 0); } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, + struct pt_ioctl_ctx *ptioc) { struct nvme_passthru_cmd cmd; - struct nvme_command c; + struct nvme_command c, *cptr; unsigned timeout = 0; u64 result; int status; + void *resptr; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1586,43 +1748,61 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EFAULT; if (cmd.flags) return -EINVAL; + if (!ptioc) { + cptr = &c; + resptr = &result; + } else { + /* + * for async - (a) allocate cmd dynamically + * (b) use user-space result addr + */ + cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!cptr) + return -ENOMEM; + resptr = &ucmd->result; + } - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); + memset(cptr, 0, sizeof(c)); + cptr->common.opcode = cmd.opcode; + cptr->common.flags = cmd.flags; + cptr->common.nsid = cpu_to_le32(cmd.nsid); + cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2); + cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3); + cptr->common.cdw10 = cpu_to_le32(cmd.cdw10); + cptr->common.cdw11 = cpu_to_le32(cmd.cdw11); + cptr->common.cdw12 = cpu_to_le32(cmd.cdw12); + cptr->common.cdw13 = cpu_to_le32(cmd.cdw13); + cptr->common.cdw14 = cpu_to_le32(cmd.cdw14); + cptr->common.cdw15 = cpu_to_le32(cmd.cdw15); if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, resptr, timeout, ptioc, 0); - if (status >= 0) { + if (!ptioc && status >= 0) { if (put_user(result, &ucmd->result)) return -EFAULT; } + /* async case, free cmd in case of error */ + if (ptioc && status < 0) + kfree(cptr); return status; } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, + struct pt_ioctl_ctx *ptioc) { struct nvme_passthru_cmd64 cmd; - struct nvme_command c; + struct nvme_command c, *cptr; unsigned timeout = 0; int status; + void *resptr; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1631,31 +1811,43 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.flags) return -EINVAL; - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!ptioc) { + cptr = &c; + resptr = &cmd.result; + } else { + cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!cptr) + return -ENOMEM; + resptr = &ucmd->result; + } + + memset(cptr, 0, sizeof(struct nvme_command)); + cptr->common.opcode = cmd.opcode; + cptr->common.flags = cmd.flags; + cptr->common.nsid = cpu_to_le32(cmd.nsid); + cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2); + cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3); + cptr->common.cdw10 = cpu_to_le32(cmd.cdw10); + cptr->common.cdw11 = cpu_to_le32(cmd.cdw11); + cptr->common.cdw12 = cpu_to_le32(cmd.cdw12); + cptr->common.cdw13 = cpu_to_le32(cmd.cdw13); + cptr->common.cdw14 = cpu_to_le32(cmd.cdw14); + cptr->common.cdw15 = cpu_to_le32(cmd.cdw15); if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, resptr, timeout, ptioc, 1); - if (status >= 0) { + if (!ptioc && status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; } + if (ptioc && status < 0) + kfree(cptr); return status; } @@ -1702,7 +1894,8 @@ static bool is_ctrl_ioctl(unsigned int cmd) static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, struct nvme_ns_head *head, - int srcu_idx) + int srcu_idx, + struct pt_ioctl_ctx *ptioc) { struct nvme_ctrl *ctrl = ns->ctrl; int ret; @@ -1712,21 +1905,24 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ctrl, NULL, argp); + ret = nvme_user_cmd(ctrl, NULL, argp, ptioc); break; case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); + ret = nvme_user_cmd64(ctrl, NULL, argp, ptioc); break; default: - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + if (!ptioc) + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); + else + ret = -EOPNOTSUPP; /* RFP: no support for now */ break; } nvme_put_ctrl(ctrl); return ret; } -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static int nvme_async_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg, struct pt_ioctl_ctx *ptioc) { struct nvme_ns_head *head = NULL; void __user *argp = (void __user *)arg; @@ -1743,33 +1939,49 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, * deadlock when deleting namespaces using the passthrough interface. */ if (is_ctrl_ioctl(cmd)) - return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, ptioc); switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); ret = ns->head->ns_id; + if (ptioc) + goto put_ns; /* return in sync fashion always */ break; case NVME_IOCTL_IO_CMD: - ret = nvme_user_cmd(ns->ctrl, ns, argp); + ret = nvme_user_cmd(ns->ctrl, ns, argp, ptioc); break; case NVME_IOCTL_SUBMIT_IO: - ret = nvme_submit_io(ns, argp); + ret = nvme_submit_io(ns, argp, ptioc); break; case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); + ret = nvme_user_cmd64(ns->ctrl, ns, argp, ptioc); break; default: + if (ptioc) { + /* RFP- don't support this for now */ + ret = -EOPNOTSUPP; + break; + } if (ns->ndev) ret = nvme_nvm_ioctl(ns, cmd, arg); else ret = -ENOTTY; } - + /* if there is no error, return queued for async-ioctl */ + if (ptioc && ret >= 0) + ret = -EIOCBQUEUED; + put_ns: nvme_put_ns_from_disk(head, srcu_idx); return ret; } +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return nvme_async_ioctl(bdev, mode, cmd, arg, NULL); +} + #ifdef CONFIG_COMPAT struct nvme_user_io32 { __u8 opcode; @@ -2324,6 +2536,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, + .async_ioctl = nvme_async_ioctl, .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, @@ -3261,7 +3474,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, NULL); nvme_put_ns(ns); return ret; @@ -3278,9 +3491,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, NULL); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, NULL); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: -- 2.25.1