On 12/16/21 9:06 AM, Max Gurtovoy wrote:
>
> On 12/16/2021 5:59 PM, Jens Axboe wrote:
>> On 12/16/21 6:02 AM, Max Gurtovoy wrote:
>>> On 12/15/2021 6:24 PM, Jens Axboe wrote:
>>>> This enables the block layer to send us a full plug list of requests
>>>> that need submitting. The block layer guarantees that they all belong
>>>> to the same queue, but we do have to check the hardware queue mapping
>>>> for each request.
>>>>
>>>> If errors are encountered, leave them in the passed-in list. Then the
>>>> block layer will handle them individually.
>>>>
>>>> This is good for about a 4% improvement in peak performance, taking us
>>>> from 9.6M to 10M IOPS/core.
>>>>
>>>> Reviewed-by: Hannes Reinecke <hare@xxxxxxx>
>>>> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
>>>> ---
>>>>  drivers/nvme/host/pci.c | 61 +++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 61 insertions(+)
>>>>
>>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>>> index 6be6b1ab4285..197aa45ef7ef 100644
>>>> --- a/drivers/nvme/host/pci.c
>>>> +++ b/drivers/nvme/host/pci.c
>>>> @@ -981,6 +981,66 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>>>  	return BLK_STS_OK;
>>>>  }
>>>>
>>>> +static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
>>>> +{
>>>> +	spin_lock(&nvmeq->sq_lock);
>>>> +	while (!rq_list_empty(*rqlist)) {
>>>> +		struct request *req = rq_list_pop(rqlist);
>>>> +		struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>>>> +
>>>> +		memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
>>>> +				absolute_pointer(&iod->cmd), sizeof(iod->cmd));
>>>> +		if (++nvmeq->sq_tail == nvmeq->q_depth)
>>>> +			nvmeq->sq_tail = 0;
>>>> +	}
>>>> +	nvme_write_sq_db(nvmeq, true);
>>>> +	spin_unlock(&nvmeq->sq_lock);
>>>> +}
>>>> +
>>>> +static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
>>>> +{
>>>> +	/*
>>>> +	 * We should not need to do this, but we're still using this to
>>>> +	 * ensure we can drain requests on a dying queue.
>>>> +	 */
>>>> +	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
>>>> +		return false;
>>>> +	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
>>>> +		return false;
>>>> +
>>>> +	req->mq_hctx->tags->rqs[req->tag] = req;
>>>> +	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
>>>> +}
>>>> +
>>>> +static void nvme_queue_rqs(struct request **rqlist)
>>>> +{
>>>> +	struct request *req = rq_list_peek(rqlist), *prev = NULL;
>>>> +	struct request *requeue_list = NULL;
>>>> +
>>>> +	do {
>>>> +		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
>>>> +
>>>> +		if (!nvme_prep_rq_batch(nvmeq, req)) {
>>>> +			/* detach 'req' and add to remainder list */
>>>> +			if (prev)
>>>> +				prev->rq_next = req->rq_next;
>>>> +			rq_list_add(&requeue_list, req);
>>>> +		} else {
>>>> +			prev = req;
>>>> +		}
>>>> +
>>>> +		req = rq_list_next(req);
>>>> +		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
>>>> +			/* detach rest of list, and submit */
>>>> +			prev->rq_next = NULL;
>>> if req == NULL and prev == NULL we'll get a NULL deref here.
>>>
>>> I think this can happen in the first iteration.
>>>
>>> Correct me if I'm wrong.
>> First iteration we know the list isn't empty, so req can't be NULL
>> there.
>
> But you set "req = rq_list_next(req);"
>
> So can't req be NULL after that line?

I guess if we hit the prep failure path for the first request, that could
be a concern. Probably best to add an if (prev) before that detach,
thanks.

-- 
Jens Axboe
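
[Editor's note: for clarity, the guard Jens suggests would look roughly like the
fragment below against the quoted nvme_queue_rqs() hunk. This is an illustrative
sketch only, not the actual follow-up patch.]

 		req = rq_list_next(req);
 		if (!req || (prev && req->mq_hctx != prev->mq_hctx)) {
 			/* detach rest of list, and submit */
-			prev->rq_next = NULL;
+			/* if prev is NULL, nothing was prepped, so there is no tail to detach */
+			if (prev)
+				prev->rq_next = NULL;

The concern raised in the thread is that if nvme_prep_rq_batch() fails for the
very first request, prev is still NULL and req can also become NULL after
rq_list_next(), so the unconditional prev->rq_next = NULL would dereference a
NULL pointer; guarding the detach with if (prev) avoids that.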