On Wed, 09 Oct 2024, cel@xxxxxxxxxx wrote: > From: Chuck Lever <chuck.lever@xxxxxxxxxx> > > We've found that there are cases where a transport disconnection > results in the loss of callback RPCs. NFS servers typically do not > retransmit callback operations after a disconnect. > > This can be a problem for the Linux NFS client's current > implementation of asynchronous COPY, which waits indefinitely for a > CB_OFFLOAD callback. If a transport disconnect occurs while an async > COPY is running, there's a good chance the client will never get the > completing CB_OFFLOAD. > > Fix this by implementing the OFFLOAD_STATUS operation so that the > Linux NFS client can probe the NFS server if it doesn't see a > CB_OFFLOAD in a reasonable amount of time. > > This patch implements a simplistic check. As future work, the client > might also be able to detect whether there is no forward progress on > the request asynchronous COPY operation, and CANCEL it. > > Suggested-by: Olga Kornievskaia <kolga@xxxxxxxxxx> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=218735 > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > --- > fs/nfs/nfs42proc.c | 56 ++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 49 insertions(+), 7 deletions(-) > > diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c > index 175330843558..fc4f64750dc5 100644 > --- a/fs/nfs/nfs42proc.c > +++ b/fs/nfs/nfs42proc.c > @@ -175,6 +175,11 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) > return err; > } > > +/* Wait this long before checking progress on a COPY operation */ > +enum { > + NFS42_COPY_TIMEOUT = 3 * HZ, > +}; > + > static int handle_async_copy(struct nfs42_copy_res *res, > struct nfs_server *dst_server, > struct nfs_server *src_server, > @@ -184,9 +189,10 @@ static int handle_async_copy(struct nfs42_copy_res *res, > bool *restart) > { > struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter; > - int status = NFS4_OK; > struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); > struct nfs_open_context *src_ctx = nfs_file_open_context(src); > + int status = NFS4_OK; > + u64 copied; > > copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL); > if (!copy) > @@ -224,7 +230,9 @@ static int handle_async_copy(struct nfs42_copy_res *res, > spin_unlock(&src_server->nfs_client->cl_lock); > } > > - status = wait_for_completion_interruptible(©->completion); > +wait: > + status = wait_for_completion_interruptible_timeout(©->completion, > + NFS42_COPY_TIMEOUT); > spin_lock(&dst_server->nfs_client->cl_lock); > list_del_init(©->copies); > spin_unlock(&dst_server->nfs_client->cl_lock); > @@ -233,15 +241,21 @@ static int handle_async_copy(struct nfs42_copy_res *res, > list_del_init(©->src_copies); > spin_unlock(&src_server->nfs_client->cl_lock); > } > - if (status == -ERESTARTSYS) { > - goto out_cancel; > - } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { > - status = -EAGAIN; > - *restart = true; > + switch (status) { > + case 0: > + goto timeout; > + case -ERESTARTSYS: > goto out_cancel; > + default: > + if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { > + status = -EAGAIN; > + *restart = true; > + goto out_cancel; > + } > } > out: > res->write_res.count = copy->count; > + /* Copy out the updated write verifier provided by CB_OFFLOAD. */ > memcpy(&res->write_res.verifier, ©->verf, sizeof(copy->verf)); > status = -copy->error; > > @@ -253,6 +267,34 @@ static int handle_async_copy(struct nfs42_copy_res *res, > if (!nfs42_files_from_same_server(src, dst)) > nfs42_do_offload_cancel_async(src, src_stateid); > goto out_free; > +timeout: > + status = nfs42_proc_offload_status(src, ©->stateid, &copied); > + switch (status) { > + case 0: > + case -EREMOTEIO: > + /* The server recognized the copy stateid, so it hasn't > + * rebooted. Don't overwrite the verifier returned in the > + * COPY result. */ > + res->write_res.count = copied; > + goto out_free; > + case -EINPROGRESS: > + goto wait; > + case -EBADF: > + /* Server did not recognize the copy stateid. It has > + * probably restarted and lost the plot. State recovery > + * might redrive the COPY from the beginning, in this > + * case? */ > + res->write_res.count = 0; > + status = -EREMOTEIO; > + break; > + case -EOPNOTSUPP: > + /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when > + * it has signed up for an async COPY, so server is not > + * spec-compliant. */ > + res->write_res.count = 0; > + status = -EREMOTEIO; Should -ERESTARTSYS be handled here? NeilBrown > + } > + goto out; > } > > static int process_copy_commit(struct file *dst, loff_t pos_dst, > -- > 2.46.2 > > >