Hi Trond,

On 4 Sep 2023, at 12:34, trondmy@xxxxxxxxxx wrote:

> From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
>
> If we fail to schedule a request for transmission, there are 2
> possibilities:
> 1) Either we hit a fatal error, and we just want to drop the remaining
> requests on the floor.
> 2) We were asked to try again, in which case we should allow the
> outstanding RPC calls to complete, so that we can recoalesce requests
> and try again.
>
> Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to application")
> Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> ---
>  fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 46 insertions(+), 16 deletions(-)
>
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 47d892a1d363..ee88f0a6e7b8 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
>  static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
>  {
>  	struct nfs_pageio_descriptor desc;
> -	struct nfs_page *req, *tmp;
> +	struct nfs_page *req;
>  	LIST_HEAD(reqs);
>  	struct nfs_commit_info cinfo;
> -	LIST_HEAD(failed);
>
>  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
>  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
> @@ -549,27 +548,36 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
>  			      &nfs_direct_write_completion_ops);
>  	desc.pg_dreq = dreq;
>
> -	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
> +	while (!list_empty(&reqs)) {
> +		req = nfs_list_entry(reqs.next);
>  		/* Bump the transmission count */
>  		req->wb_nio++;
>  		if (!nfs_pageio_add_request(&desc, req)) {
> -			nfs_list_move_request(req, &failed);
>  			spin_lock(&cinfo.inode->i_lock);
> -			dreq->flags = 0;
> -			if (desc.pg_error < 0)
> +			if (dreq->error < 0) {
> +				desc.pg_error = dreq->error;
> +			} else if (desc.pg_error != -EAGAIN) {
> +				dreq->flags = 0;
> +				if (!desc.pg_error)
> +					desc.pg_error = -EIO;
>  				dreq->error = desc.pg_error;
> -			else
> -				dreq->error = -EIO;
> +			} else
> +				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
>  			spin_unlock(&cinfo.inode->i_lock);
> +			break;
>  		}
>  		nfs_release_request(req);
>  	}
>  	nfs_pageio_complete(&desc);
>
> -	while (!list_empty(&failed)) {
> -		req = nfs_list_entry(failed.next);
> +	while (!list_empty(&reqs)) {
> +		req = nfs_list_entry(reqs.next);
>  		nfs_list_remove_request(req);
>  		nfs_unlock_and_release_request(req);
> +		if (desc.pg_error == -EAGAIN)
> +			nfs_mark_request_commit(req, NULL, &cinfo, 0);
> +		else
> +			nfs_release_request(req);
>  	}
>
>  	if (put_dreq(dreq))
> @@ -794,9 +802,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  {
>  	struct nfs_pageio_descriptor desc;
>  	struct inode *inode = dreq->inode;
> +	struct nfs_commit_info cinfo;
>  	ssize_t result = 0;
>  	size_t requested_bytes = 0;
>  	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
> +	bool defer = false;
>
>  	trace_nfs_direct_write_schedule_iovec(dreq);
>
> @@ -837,17 +847,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
>  			break;
>  		}
>
> -		nfs_lock_request(req);
> -		if (!nfs_pageio_add_request(&desc, req)) {
> -			result = desc.pg_error;
> -			nfs_unlock_and_release_request(req);
> -			break;
> -		}
>  		pgbase = 0;
>  		bytes -= req_len;
>  		requested_bytes += req_len;
>  		pos += req_len;
>  		dreq->bytes_left -= req_len;
> +
> +		if (defer) {
> +			nfs_mark_request_commit(req, NULL, &cinfo, 0);
> +			continue;
> +		}
> +
> +		nfs_lock_request(req);
> +		if (nfs_pageio_add_request(&desc, req))
> +			continue;

Just a note: we've hit some trouble with this one on pNFS SCSI. When the
update of dreq->bytes_left is reordered ahead of nfs_pageio_add_request(),
the blocklayout driver machinery ends up with a bad length calculation for
the LAYOUTGET covering the first req. What I see is a short LAYOUTGET,
followed by a second LAYOUTGET for the last req with loga_length 0, which
causes some havoc. Eventually there's a corruption somewhere - I think
because nfs_pageio_add_request() ends up doing nfs_pageio_doio() in the
middle of all this and then we race with something in the completion path -
I haven't quite been able to figure that part out yet.

Ben
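
Below is a minimal userspace sketch of the ordering effect described above.
It assumes, as bl_pg_init_write() does for O_DIRECT writes, that the
blocklayout driver sizes its LAYOUTGET from nfs_dreq_bytes_left(), i.e.
from dreq->bytes_left; the request count, lengths, and helper names are
made up for illustration, and this is not kernel code.

/*
 * Minimal model only: layoutget() stands in for the LAYOUTGET issued by
 * the blocklayout pg_init; bytes_left stands in for dreq->bytes_left.
 */
#include <stdio.h>

#define NREQS   4
#define REQ_LEN 4096L

static long bytes_left;

/* Models pg_init sizing a LAYOUTGET from the remaining O_DIRECT bytes. */
static void layoutget(int req)
{
	printf("LAYOUTGET at req %d: loga_length = %ld\n", req, bytes_left);
}

int main(void)
{
	int i;

	/* Old order: the layout is sized (add_request -> pg_init) before
	 * the current request is subtracted, so the first LAYOUTGET spans
	 * the whole remaining I/O and no further LAYOUTGET is needed. */
	bytes_left = NREQS * REQ_LEN;
	printf("add_request before bytes_left update:\n");
	layoutget(0);					/* 16384 */

	/* New order: bytes_left is decremented before the request is added.
	 * The first LAYOUTGET is short by one request, so the last request
	 * falls outside the segment and triggers a second LAYOUTGET, by
	 * which time bytes_left has already reached 0. */
	bytes_left = NREQS * REQ_LEN;
	printf("bytes_left update before add_request:\n");
	for (i = 0; i < NREQS; i++) {
		bytes_left -= REQ_LEN;
		if (i == 0 || i == NREQS - 1)
			layoutget(i);			/* 12288, then 0 */
	}
	return 0;
}

Run as written, the first mode prints a single 16384-byte range, while the
second prints 12288 followed by 0, matching the short LAYOUTGET plus
loga_length 0 pattern reported above.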