Re: [PATCH v2 1/5] NFS: Fix error handling for O_DIRECT write scheduling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 2023-11-09 at 10:05 -0500, Benjamin Coddington wrote:
> Hi Trond,
> 
> On 4 Sep 2023, at 12:34, trondmy@xxxxxxxxxx wrote:
> 
> > From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> > 
> > If we fail to schedule a request for transmission, there are 2
> > possibilities:
> > 1) Either we hit a fatal error, and we just want to drop the
> > remaining
> >    requests on the floor.
> > 2) We were asked to try again, in which case we should allow the
> >    outstanding RPC calls to complete, so that we can recoalesce
> > requests
> >    and try again.
> > 
> > Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to
> > application")
> > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
> > ---
> >  fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++---------
> > ----
> >  1 file changed, 46 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> > index 47d892a1d363..ee88f0a6e7b8 100644
> > --- a/fs/nfs/direct.c
> > +++ b/fs/nfs/direct.c
> > @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode
> > *inode,
> >  static void nfs_direct_write_reschedule(struct nfs_direct_req
> > *dreq)
> >  {
> >  	struct nfs_pageio_descriptor desc;
> > -	struct nfs_page *req, *tmp;
> > +	struct nfs_page *req;
> >  	LIST_HEAD(reqs);
> >  	struct nfs_commit_info cinfo;
> > -	LIST_HEAD(failed);
> > 
> >  	nfs_init_cinfo_from_dreq(&cinfo, dreq);
> >  	nfs_direct_write_scan_commit_list(dreq->inode, &reqs,
> > &cinfo);
> > @@ -549,27 +548,36 @@ static void
> > nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
> >  			      &nfs_direct_write_completion_ops);
> >  	desc.pg_dreq = dreq;
> > 
> > -	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
> > +	while (!list_empty(&reqs)) {
> > +		req = nfs_list_entry(reqs.next);
> >  		/* Bump the transmission count */
> >  		req->wb_nio++;
> >  		if (!nfs_pageio_add_request(&desc, req)) {
> > -			nfs_list_move_request(req, &failed);
> >  			spin_lock(&cinfo.inode->i_lock);
> > -			dreq->flags = 0;
> > -			if (desc.pg_error < 0)
> > +			if (dreq->error < 0) {
> > +				desc.pg_error = dreq->error;
> > +			} else if (desc.pg_error != -EAGAIN) {
> > +				dreq->flags = 0;
> > +				if (!desc.pg_error)
> > +					desc.pg_error = -EIO;
> >  				dreq->error = desc.pg_error;
> > -			else
> > -				dreq->error = -EIO;
> > +			} else
> > +				dreq->flags =
> > NFS_ODIRECT_RESCHED_WRITES;
> >  			spin_unlock(&cinfo.inode->i_lock);
> > +			break;
> >  		}
> >  		nfs_release_request(req);
> >  	}
> >  	nfs_pageio_complete(&desc);
> > 
> > -	while (!list_empty(&failed)) {
> > -		req = nfs_list_entry(failed.next);
> > +	while (!list_empty(&reqs)) {
> > +		req = nfs_list_entry(reqs.next);
> >  		nfs_list_remove_request(req);
> >  		nfs_unlock_and_release_request(req);
> > +		if (desc.pg_error == -EAGAIN)
> > +			nfs_mark_request_commit(req, NULL, &cinfo,
> > 0);
> > +		else
> > +			nfs_release_request(req);
> >  	}
> > 
> >  	if (put_dreq(dreq))
> > @@ -794,9 +802,11 @@ static ssize_t
> > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
> >  {
> >  	struct nfs_pageio_descriptor desc;
> >  	struct inode *inode = dreq->inode;
> > +	struct nfs_commit_info cinfo;
> >  	ssize_t result = 0;
> >  	size_t requested_bytes = 0;
> >  	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize,
> > PAGE_SIZE);
> > +	bool defer = false;
> > 
> >  	trace_nfs_direct_write_schedule_iovec(dreq);
> > 
> > @@ -837,17 +847,37 @@ static ssize_t
> > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
> >  				break;
> >  			}
> > 
> > -			nfs_lock_request(req);
> > -			if (!nfs_pageio_add_request(&desc, req)) {
> > -				result = desc.pg_error;
> > -
> > 				nfs_unlock_and_release_request(req);
> > -				break;
> > -			}
> >  			pgbase = 0;
> >  			bytes -= req_len;
> >  			requested_bytes += req_len;
> >  			pos += req_len;
> >  			dreq->bytes_left -= req_len;
> > +
> > +			if (defer) {
> > +				nfs_mark_request_commit(req, NULL,
> > &cinfo, 0);
> > +				continue;
> > +			}
> > +
> > +			nfs_lock_request(req);
> > +			if (nfs_pageio_add_request(&desc, req))
> > +				continue;
> 
> Just a note, we've hit some trouble with this one on pNFS SCSI.
> 
> When we re-order the update of dreq->bytes_left and
> nfs_pageio_add_request(), the blocklayout driver machinery ends up
> with bad
> calculations for LAYOUTGET on the first req.   What I see is a short
> LAYOUTGET, and then a 2nd LAYOUGET for the last req with loga_length
> 0,
> causing some havoc.
> 
> Eventually, there's a corruption somewhere - I think because
> nfs_pageio_add_request() ends up doing nfs_pageio_do() in the middle
> of this
> thing and then we race to something in completion - I haven't quite
> been
> able to figure that part out yet.
> 

Hi Ben,

Relying on the value of dreq->bytes_left is just not a good idea, given
that the layoutget request could end up returning NFS4ERR_DELAY.

How about something like the following patch?

8<-----------------------------------------------------
>From e68e9c928c9d843c482ec08e1d26350b7999cafa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
Date: Thu, 9 Nov 2023 11:46:28 -0500
Subject: [PATCH] pNFS: Fix the pnfs block driver's calculation of layoutget
 size

Instead of relying on the value of the 'bytes_left' field, we should
calculate the layout size based on the offset of the request that is
being written out.

Reported-by: Benjamin Coddington <bcodding@xxxxxxxxxx>
Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx>
---
 fs/nfs/blocklayout/blocklayout.c | 5 ++---
 fs/nfs/direct.c                  | 5 +++--
 fs/nfs/internal.h                | 2 +-
 fs/nfs/pnfs.c                    | 3 ++-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 943aeea1eb16..c1cc9fe93dd4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -893,10 +893,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
 	}
 
 	if (pgio->pg_dreq == NULL)
-		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
-					      req->wb_index);
+		wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
 	else
-		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req));
 
 	pnfs_generic_pg_init_write(pgio, req, wb_size);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f6c74f424691..5918c67dae0d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
 	kref_put(&dreq->kref, nfs_direct_req_free);
 }
 
-ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
 {
-	return dreq->bytes_left;
+	loff_t start = offset - dreq->io_start;
+	return dreq->max_count - start;
 }
 EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9c9cf764f600..b1fa81c9dff6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
 /* direct.c */
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq);
-extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset);
 
 /* nfs4proc.c */
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 21a365357629..0c0fed1ecd0b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 		if (pgio->pg_dreq == NULL)
 			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
 		else
-			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq,
+						      req_offset(req));
 
 		pgio->pg_lseg =
 			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
-- 
2.41.0


-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@xxxxxxxxxxxxxxx






[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux