Hi Keith, On Thu, Oct 6, 2016 at 12:51 AM, Keith Busch <keith.busch@xxxxxxxxx> wrote: > On Wed, Oct 05, 2016 at 11:19:39AM +0800, Ming Lei wrote: >> But .poll() need to check if the specific request is completed or not, >> then blk_poll() can set 'current' as RUNNING if it is completed. >> >> blk_poll(): >> ... >> ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); >> if (ret > 0) { >> hctx->poll_success++; >> set_current_state(TASK_RUNNING); >> return true; >> } > > > Right, but the task could be waiting on a whole lot more than just that > one tag, so setting the task to running before knowing those all complete > doesn't sound right. > >> I am glad to take a look the patch if you post it out. > > Here's what I got for block + nvme. It relies on all the requests to > complete to set the task back to running. Yeah, but your patch doesn't add that change, and it looks like the 'task_struct *' in the submission path needs to be stored in the request or somewhere else. There are some issues with the current polling approach: - in dio, one dio may include lots of bios, but only the last submitted bio is polled - one bio can be split into several bios, but submit_bio() only returns one cookie for polling It looks like your approach of polling the current task state can fix these issues. 
> > --- > diff --git a/block/blk-core.c b/block/blk-core.c > index b993f88..3c1cfbf 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -3342,6 +3342,8 @@ EXPORT_SYMBOL(blk_finish_plug); > bool blk_poll(struct request_queue *q, blk_qc_t cookie) > { > struct blk_plug *plug; > + struct blk_mq_hw_ctx *hctx; > + unsigned int queue_num; > long state; > > if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || > @@ -3353,27 +3355,15 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) > blk_flush_plug_list(plug, false); > > state = current->state; > + queue_num = blk_qc_t_to_queue_num(cookie); > + hctx = q->queue_hw_ctx[queue_num]; > while (!need_resched()) { > - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); > - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; > - int ret; > - > - hctx->poll_invoked++; > - > - ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); > - if (ret > 0) { > - hctx->poll_success++; > - set_current_state(TASK_RUNNING); > - return true; > - } > + q->mq_ops->poll(hctx); > > if (signal_pending_state(state, current)) > set_current_state(TASK_RUNNING); > - > if (current->state == TASK_RUNNING) > return true; > - if (ret < 0) > - break; > cpu_relax(); > } Then it looks like the whole hw queue is polled, and only the queue num in the cookie matters. 
In theory, there might be one race: - one dio needs to submit several bios (suppose there are two bios: A and B) - A is submitted to hardware queue M - B is submitted to hardware queue N because the current task is migrated to another CPU - then only hardware queue N is polled > > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c > index befac5b..2e359e0 100644 > --- a/drivers/nvme/host/pci.c > +++ b/drivers/nvme/host/pci.c > @@ -649,7 +651,7 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, > return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; > } > > -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) > +static void nvme_process_cq(struct nvme_queue *nvmeq) > { > u16 head, phase; > > @@ -665,9 +667,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) > phase = !phase; > } > > - if (tag && *tag == cqe.command_id) > - *tag = -1; > - > if (unlikely(cqe.command_id >= nvmeq->q_depth)) { > dev_warn(nvmeq->dev->ctrl.device, > "invalid id %d completed on queue %d\n", > @@ -711,11 +710,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) > nvmeq->cqe_seen = 1; > } > > -static void nvme_process_cq(struct nvme_queue *nvmeq) > -{ > - __nvme_process_cq(nvmeq, NULL); > -} > - > static irqreturn_t nvme_irq(int irq, void *data) > { > irqreturn_t result; > @@ -736,20 +730,15 @@ static irqreturn_t nvme_irq_check(int irq, void *data) > return IRQ_NONE; > } > > -static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) > +static void nvme_poll(struct blk_mq_hw_ctx *hctx) > { > struct nvme_queue *nvmeq = hctx->driver_data; > > if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { > spin_lock_irq(&nvmeq->q_lock); > - __nvme_process_cq(nvmeq, &tag); > + nvme_process_cq(nvmeq); > spin_unlock_irq(&nvmeq->q_lock); > - > - if (tag == -1) > - return 1; > } > - > - return 0; > } > > static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int 
aer_idx) > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 2498fdf..a723af9 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -100,7 +100,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int, > typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, > bool); > typedef void (busy_tag_iter_fn)(struct request *, void *, bool); > -typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int); > +typedef void (poll_fn)(struct blk_mq_hw_ctx *); > > > struct blk_mq_ops { > -- -- Ming Lei -- To unsubscribe from this list: send the line "unsubscribe linux-block" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html