On Thu, 2023-11-09 at 10:05 -0500, Benjamin Coddington wrote: > Hi Trond, > > On 4 Sep 2023, at 12:34, trondmy@xxxxxxxxxx wrote: > > > From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > > > > If we fail to schedule a request for transmission, there are 2 > > possibilities: > > 1) Either we hit a fatal error, and we just want to drop the > > remaining > > requests on the floor. > > 2) We were asked to try again, in which case we should allow the > > outstanding RPC calls to complete, so that we can recoalesce > > requests > > and try again. > > > > Fixes: d600ad1f2bdb ("NFS41: pop some layoutget errors to > > application") > > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > > --- > > fs/nfs/direct.c | 62 ++++++++++++++++++++++++++++++++++++--------- > > ---- > > 1 file changed, 46 insertions(+), 16 deletions(-) > > > > diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c > > index 47d892a1d363..ee88f0a6e7b8 100644 > > --- a/fs/nfs/direct.c > > +++ b/fs/nfs/direct.c > > @@ -528,10 +528,9 @@ nfs_direct_write_scan_commit_list(struct inode > > *inode, > > static void nfs_direct_write_reschedule(struct nfs_direct_req > > *dreq) > > { > > struct nfs_pageio_descriptor desc; > > - struct nfs_page *req, *tmp; > > + struct nfs_page *req; > > LIST_HEAD(reqs); > > struct nfs_commit_info cinfo; > > - LIST_HEAD(failed); > > > > nfs_init_cinfo_from_dreq(&cinfo, dreq); > > nfs_direct_write_scan_commit_list(dreq->inode, &reqs, > > &cinfo); > > @@ -549,27 +548,36 @@ static void > > nfs_direct_write_reschedule(struct nfs_direct_req *dreq) > > &nfs_direct_write_completion_ops); > > desc.pg_dreq = dreq; > > > > - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { > > + while (!list_empty(&reqs)) { > > + req = nfs_list_entry(reqs.next); > > /* Bump the transmission count */ > > req->wb_nio++; > > if (!nfs_pageio_add_request(&desc, req)) { > > - nfs_list_move_request(req, &failed); > > spin_lock(&cinfo.inode->i_lock); > > - dreq->flags = 0; > > - if (desc.pg_error < 0) > > + if (dreq->error < 0) { > > + desc.pg_error = dreq->error; > > + } else if (desc.pg_error != -EAGAIN) { > > + dreq->flags = 0; > > + if (!desc.pg_error) > > + desc.pg_error = -EIO; > > dreq->error = desc.pg_error; > > - else > > - dreq->error = -EIO; > > + } else > > + dreq->flags = > > NFS_ODIRECT_RESCHED_WRITES; > > spin_unlock(&cinfo.inode->i_lock); > > + break; > > } > > nfs_release_request(req); > > } > > nfs_pageio_complete(&desc); > > > > - while (!list_empty(&failed)) { > > - req = nfs_list_entry(failed.next); > > + while (!list_empty(&reqs)) { > > + req = nfs_list_entry(reqs.next); > > nfs_list_remove_request(req); > > nfs_unlock_and_release_request(req); > > + if (desc.pg_error == -EAGAIN) > > + nfs_mark_request_commit(req, NULL, &cinfo, > > 0); > > + else > > + nfs_release_request(req); > > } > > > > if (put_dreq(dreq)) > > @@ -794,9 +802,11 @@ static ssize_t > > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > > { > > struct nfs_pageio_descriptor desc; > > struct inode *inode = dreq->inode; > > + struct nfs_commit_info cinfo; > > ssize_t result = 0; > > size_t requested_bytes = 0; > > size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, > > PAGE_SIZE); > > + bool defer = false; > > > > trace_nfs_direct_write_schedule_iovec(dreq); > > > > @@ -837,17 +847,37 @@ static ssize_t > > nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, > > break; > > } > > > > - nfs_lock_request(req); > > - if (!nfs_pageio_add_request(&desc, req)) { > > - result = desc.pg_error; > > - > > nfs_unlock_and_release_request(req); > > - break; > > - } > > pgbase = 0; > > bytes -= req_len; > > requested_bytes += req_len; > > pos += req_len; > > dreq->bytes_left -= req_len; > > + > > + if (defer) { > > + nfs_mark_request_commit(req, NULL, > > &cinfo, 0); > > + continue; > > + } > > + > > + nfs_lock_request(req); > > + if (nfs_pageio_add_request(&desc, req)) > > + continue; > > Just a note, we've hit some trouble with this one on pNFS SCSI. > > When we re-order the update of dreq->bytes_left and > nfs_pageio_add_request(), the blocklayout driver machinery ends up > with bad > calculations for LAYOUTGET on the first req. What I see is a short > LAYOUTGET, and then a 2nd LAYOUGET for the last req with loga_length > 0, > causing some havoc. > > Eventually, there's a corruption somewhere - I think because > nfs_pageio_add_request() ends up doing nfs_pageio_do() in the middle > of this > thing and then we race to something in completion - I haven't quite > been > able to figure that part out yet. > Hi Ben, Relying on the value of dreq->bytes_left is just not a good idea, given that the layoutget request could end up returning NFS4ERR_DELAY. How about something like the following patch? 8<----------------------------------------------------- >From e68e9c928c9d843c482ec08e1d26350b7999cafa Mon Sep 17 00:00:00 2001 From: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> Date: Thu, 9 Nov 2023 11:46:28 -0500 Subject: [PATCH] pNFS: Fix the pnfs block driver's calculation of layoutget size Instead of relying on the value of the 'bytes_left' field, we should calculate the layout size based on the offset of the request that is being written out. Reported-by: Benjamin Coddington <bcodding@xxxxxxxxxx> Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> --- fs/nfs/blocklayout/blocklayout.c | 5 ++--- fs/nfs/direct.c | 5 +++-- fs/nfs/internal.h | 2 +- fs/nfs/pnfs.c | 3 ++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 943aeea1eb16..c1cc9fe93dd4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -893,10 +893,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) } if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index); else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req)); pnfs_generic_pg_init_write(pgio, req, wb_size); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f6c74f424691..5918c67dae0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq) kref_put(&dreq->kref, nfs_direct_req_free); } -ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq) +ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset) { - return dreq->bytes_left; + loff_t start = offset - dreq->io_start; + return dreq->max_count - start; } EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9c9cf764f600..b1fa81c9dff6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); -extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 21a365357629..0c0fed1ecd0b 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq, + req_offset(req)); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), -- 2.41.0 -- Trond Myklebust Linux NFS client maintainer, Hammerspace trond.myklebust@xxxxxxxxxxxxxxx