There have been confirmed reports where a container with an NFS mount inside it dies abruptly, along with all of its processes, but the NFS client sticks around and keeps trying to send RPCs after the networking is gone. We have a reproducer where if we SIGKILL a container with an NFS mount, the RPC clients will stick around indefinitely. The orchestrator does a MNT_DETACH unmount on the NFS mount, and then tears down the networking while there are still RPCs in flight. Recently new controls were added[1] that allow shutting down an NFS mount. That doesn't help here since the mount namespace is detached from any tasks at this point. Transplant shutdown_client() to the sunrpc module, and give it a more distinct name. Add a new debugfs sunrpc/rpc_clnt/*/shutdown knob that allows the same functionality as the one in /sys/fs/nfs, but at the rpc_clnt level. [1]: commit d9615d166c7e ("NFS: add sysfs shutdown knob"). Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> --- fs/nfs/sysfs.c | 19 ++++--------------- include/linux/sunrpc/sched.h | 1 + net/sunrpc/clnt.c | 12 ++++++++++++ net/sunrpc/debugfs.c | 15 +++++++++++++++ 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7b59a40d40c061a41b0fbde91aa006314f02c1fb..c29c5fd639554461bdcd9ff612726194910d85b5 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -217,17 +217,6 @@ void nfs_netns_sysfs_destroy(struct nfs_net *netns) } } -static bool shutdown_match_client(const struct rpc_task *task, const void *data) -{ - return true; -} - -static void shutdown_client(struct rpc_clnt *clnt) -{ - clnt->cl_shutdown = 1; - rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); -} - static ssize_t shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -258,14 +247,14 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, goto out; server->flags |= NFS_MOUNT_SHUTDOWN; - shutdown_client(server->client); - shutdown_client(server->nfs_client->cl_rpcclient); + rpc_clnt_shutdown(server->client); + rpc_clnt_shutdown(server->nfs_client->cl_rpcclient); if (!IS_ERR(server->client_acl)) - shutdown_client(server->client_acl); + rpc_clnt_shutdown(server->client_acl); if (server->nlm_host) - shutdown_client(server->nlm_host->h_rpcclnt); + rpc_clnt_shutdown(server->nlm_host->h_rpcclnt); out: return count; } diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index eac57914dcf3200c1a6ed39ab030e3fe8b4da3e1..fe7c39a17ce44ec68c0cf057133d0f8e7f0ae797 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -232,6 +232,7 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, bool (*fnmatch)(const struct rpc_task *, const void *), const void *data); +void rpc_clnt_shutdown(struct rpc_clnt *clnt); void rpc_execute(struct rpc_task *); void rpc_init_priority_wait_queue(struct rpc_wait_queue *, const char *); void rpc_init_wait_queue(struct rpc_wait_queue *, const char *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2fe88ea79a70c134e58abfb03fca230883eddf1f..0028858b12d97e7b45f4c24cfbd761ba2a734b32 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -934,6 +934,18 @@ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, } EXPORT_SYMBOL_GPL(rpc_cancel_tasks); +static bool shutdown_match_client(const struct rpc_task *task, const void *data) +{ + return true; +} + +void rpc_clnt_shutdown(struct rpc_clnt *clnt) +{ + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_shutdown); + static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *dummy) { diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 32417db340de3775c533d0ad683b5b37800d7fe5..4df31dcca2d747db6767c12ddfa29963ed7be204 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -145,6 +145,17 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; } +static int +clnt_shutdown(void *data, u64 value) +{ + struct rpc_clnt *clnt = data; + + rpc_clnt_shutdown(clnt); + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(shutdown_fops, NULL, clnt_shutdown, "%llu\n"); + void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { @@ -163,6 +174,10 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt) debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt, &tasks_fops); + /* make shutdown file */ + debugfs_create_file("shutdown", S_IFREG | 0200, clnt->cl_debugfs, clnt, + &shutdown_fops); + rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum); } --- base-commit: 80e54e84911a923c40d7bee33a34c1b4be148d7a change-id: 20250312-rpc-shutdown-ce9b6d3599da Best regards, -- Jeff Layton <jlayton@xxxxxxxxxx>