Re: [PATCH V2] nbd: fix partial sending

Kevin Wolf <kwolf@xxxxxxxxxx> · Tue, 22 Oct 2024 20:46:07 +0200

Am 18.10.2024 um 16:08 hat Ming Lei geschrieben:
> nbd driver sends request header and payload with multiple call of
> sock_sendmsg, and partial sending can't be avoided. However, nbd driver
> returns BLK_STS_RESOURCE to block core in this situation. This way causes
> one issue: request->tag may change in the next run of nbd_queue_rq(), but
> the original old tag has been sent as part of header cookie, this way
> confuses nbd driver reply handling, since the real request can't be
> retrieved any more with the obsolete old tag.
> 
> Fix it by retrying sending directly in per-socket work function,
> meantime return BLK_STS_OK to block layer core.
> 
> Cc: vincent.chen@xxxxxxxxxx
> Cc: Leon Schuermann <leon@is.currently.online>
> Cc: Bart Van Assche <bvanassche@xxxxxxx>
> Reported-by: Kevin Wolf <kwolf@xxxxxxxxxx>
> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
> ---
> V2:
> 	- move pending retry to socket work function and return BLK_STS_OK, so that
> 	userspace can get chance to handle the signal(Kevin)

So first of all: This seems to work.

But looking through the code I have a few comments:

>  drivers/block/nbd.c | 89 +++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 77 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index b852050d8a96..855f4a79e37c 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -62,6 +62,7 @@ struct nbd_sock {
>  	bool dead;
>  	int fallback_index;
>  	int cookie;
> +	struct work_struct work;
>  };
>  
>  struct recv_thread_args {
> @@ -141,6 +142,9 @@ struct nbd_device {
>   */
>  #define NBD_CMD_INFLIGHT	2
>  
> +/* Just part of request header or data payload is sent successfully */
> +#define NBD_CMD_PARTIAL_SEND	3
> +
>  struct nbd_cmd {
>  	struct nbd_device *nbd;
>  	struct mutex lock;
> @@ -466,6 +470,12 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
>  	if (!mutex_trylock(&cmd->lock))
>  		return BLK_EH_RESET_TIMER;
>  
> +	/* partial send is handled in nbd_sock's work function */
> +	if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) {
> +		mutex_unlock(&cmd->lock);
> +		return BLK_EH_RESET_TIMER;
> +	}
> +
>  	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
>  		mutex_unlock(&cmd->lock);
>  		return BLK_EH_DONE;
> @@ -614,6 +624,27 @@ static inline int was_interrupted(int result)
>  	return result == -ERESTARTSYS || result == -EINTR;
>  }
>  
> +/*
> + * We've already sent header or part of data payload, have no choice but
> + * to set pending and schedule it in work.
> + *
> + * And we have to return BLK_STS_OK to block core, otherwise this same
> + * request may be re-dispatched with different tag, but our header has
> + * been sent out with old tag, and this way does confuse reply handling.
> + */
> +static void nbd_run_pending_work(struct nbd_device *nbd,
> +				 struct nbd_sock *nsock,
> +				 struct nbd_cmd *cmd, int sent)

The name of this function is a bit confusing, we don't actually run
anything here. Maybe nbd_schedule_pending_work()? Or something else with
"schedule" and "send".

> +{
> +	struct request *req = blk_mq_rq_from_pdu(cmd);
> +
> +	nsock->pending = req;
> +	nsock->sent = sent;
> +	set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
> +	refcount_inc(&nbd->config_refs);
> +	schedule_work(&nsock->work);
> +}

Important point about this function: It doesn't check if we already have
scheduled the work. You seem to have some code in nbd_pending_cmd_work()
that can cope with being scheduled multiple times, but allowing the
situation makes the control flow hard to follow. It would probably be
better to avoid this and call refcount_inc() and schedule_work() only if
NBD_CMD_PARTIAL_SEND isn't already set.

> +
>  /*
>   * Returns BLK_STS_RESOURCE if the caller should retry after a delay.
>   * Returns BLK_STS_IOERR if sending failed.
> @@ -699,11 +730,12 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
>  			 * completely done.
>  			 */
>  			if (sent) {
> -				nsock->pending = req;
> -				nsock->sent = sent;
> +				nbd_run_pending_work(nbd, nsock, cmd, sent);
> +				return BLK_STS_OK;

Now the question is if requeuing is already implicitly avoided, because
returning EINTR to a worker doesn't seem to make a lot of sense to me,
and so we might actually never hit this code path from a worker. But I'm
not completely sure. Maybe better to detect the situation in
nbd_run_pending_work() anyway.

> +			} else {
> +				set_bit(NBD_CMD_REQUEUED, &cmd->flags);
> +				return BLK_STS_RESOURCE;
>  			}
> -			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
> -			return BLK_STS_RESOURCE;

This is an unrelated style change.

>  		}
>  		dev_err_ratelimited(disk_to_dev(nbd->disk),
>  			"Send control failed (result %d)\n", result);
> @@ -737,14 +769,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
>  			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
>  			if (result < 0) {
>  				if (was_interrupted(result)) {
> -					/* We've already sent the header, we
> -					 * have no choice but to set pending and
> -					 * return BUSY.
> -					 */
> -					nsock->pending = req;
> -					nsock->sent = sent;
> -					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
> -					return BLK_STS_RESOURCE;
> +					nbd_run_pending_work(nbd, nsock, cmd, sent);
> +					return BLK_STS_OK;
>  				}
>  				dev_err(disk_to_dev(nbd->disk),
>  					"Send data failed (result %d)\n",
> @@ -778,6 +804,44 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
>  	return BLK_STS_OK;
>  }
>  
> +/* handle partial sending */
> +static void nbd_pending_cmd_work(struct work_struct *work)
> +{
> +	struct nbd_sock *nsock = container_of(work, struct nbd_sock, work);
> +	struct request *req = nsock->pending;
> +	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
> +	struct nbd_device *nbd = cmd->nbd;
> +	unsigned long deadline = READ_ONCE(req->deadline);
> +	unsigned int wait_ms = 2;
> +
> +	mutex_lock(&cmd->lock);
> +
> +	WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags));
> +	if (!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))
> +		goto out;

I believe this check is only for detecting if the work was scheduled
multiple times? If we avoid that above, we probably can make this a
WARN_ON_ONCE(), too.

> +
> +	mutex_lock(&nsock->tx_lock);
> +	while (true) {
> +		nbd_send_cmd(nbd, cmd, cmd->index);
> +		if (!nsock->pending)
> +			break;

If it's true that we can never get EINTR or ERESTARTSYS in a worker,
then this condition would always be true and the loop and sleeping logic
is unnecessary.

If it actually is necessary, I wonder if it wouldn't be better to just
let nbd_send_cmd() reschedule the work without a loop and sleeping here.
I'm not entirely sure what EINTR/ERESTARTSYS should even mean in this
context, but like in the .queue_rq() path, the thing that clears these
error conditions would probably be returning, not sleeping?

> +
> +		/* don't bother timeout handler for partial sending */
> +		if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) {
> +			cmd->status = BLK_STS_IOERR;
> +			blk_mq_complete_request(req);
> +			break;
> +		}
> +		msleep(wait_ms);
> +		wait_ms *= 2;
> +	}
> +	mutex_unlock(&nsock->tx_lock);
> +	clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags);
> +out:
> +	mutex_unlock(&cmd->lock);
> +	nbd_config_put(nbd);
> +}

Kevin