> On Mar 31, 2022, at 4:38 PM, Chuck Lever <chuck.lever@xxxxxxxxxx> wrote:
>
> Cache timeout injection stress-tests the cache timeout logic as well
> as upper layer protocol deferred request handlers.
>
> A file called /sys/kernel/debug/fail_sunrpc/ignore-cache-timeout
> enables administrators to turn off cache timeout injection while
> allowing other types of sunrpc errors to be injected. The default
> setting is that cache timeout injection is enabled (ignore=false).
>
> To enable cache timeout injection, CONFIG_FAULT_INJECTION,
> CONFIG_FAULT_INJECTION_DEBUG_FS, and CONFIG_SUNRPC_DEBUG must all be
> set to "Y".
>
> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
> ---
> net/sunrpc/cache.c | 16 ++++++++++++++++
> net/sunrpc/debugfs.c | 3 +++
> net/sunrpc/fail.h | 2 +-
> 3 files changed, 20 insertions(+), 1 deletion(-)
>
>
> Proof of concept: compile-tested only. The idea is to inject timeout
> failures in the cache code so we can see what happens when a rqst
> actually has to be deferred.

Using v2 of this RFC patch, I am able to reproduce Trond's crash
exactly on the same nfsd thread that's handling a deferred request.
I'll work on addressing it.

> diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
> index bb1177395b99..e5ec125afec9 100644
> --- a/net/sunrpc/cache.c
> +++ b/net/sunrpc/cache.c
> @@ -33,7 +33,9 @@
>  #include <linux/sunrpc/stats.h>
>  #include <linux/sunrpc/rpc_pipe_fs.h>
>  #include <trace/events/sunrpc.h>
> +
>  #include "netns.h"
> +#include "fail.h"
>
>  #define RPCDBG_FACILITY RPCDBG_CACHE
>
> @@ -629,6 +631,19 @@ static void cache_restart_thread(struct cache_deferred_req *dreq, int too_many)
>  	complete(&dr->completion);
>  }
>
> +#if IS_ENABLED(CONFIG_FAIL_SUNRPC)
> +static inline bool cache_timeout_should_fail(void)
> +{
> +	return !fail_sunrpc.ignore_cache_timeout &&
> +		should_fail(&fail_sunrpc.attr, 1);
> +}
> +#else
> +static inline bool cache_timeout_should_fail(void)
> +{
> +	return false;
> +}
> +#endif
> +
>  static void cache_wait_req(struct cache_req *req, struct cache_head *item)
>  {
>  	struct thread_deferred_req sleeper;
> @@ -640,6 +655,7 @@ static void cache_wait_req(struct cache_req *req, struct cache_head *item)
>  	setup_deferral(dreq, item, 0);
>
>  	if (!test_bit(CACHE_PENDING, &item->flags) ||
> +	    cache_timeout_should_fail() ||
>  	    wait_for_completion_interruptible_timeout(
>  		    &sleeper.completion, req->thread_wait) <= 0) {
>  		/* The completion wasn't completed, so we need
> diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
> index 7dc9cc929bfd..68272885873a 100644
> --- a/net/sunrpc/debugfs.c
> +++ b/net/sunrpc/debugfs.c
> @@ -262,6 +262,9 @@ static void fail_sunrpc_init(void)
>
>  	debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir,
>  			    &fail_sunrpc.ignore_server_disconnect);
> +
> +	debugfs_create_bool("ignore-cache-timeout", S_IFREG | 0600, dir,
> +			    &fail_sunrpc.ignore_cache_timeout);
>  }
>  #else
>  static void fail_sunrpc_init(void)
> diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h
> index 69dc30cc44b8..13b8436b5f15 100644
> --- a/net/sunrpc/fail.h
> +++ b/net/sunrpc/fail.h
> @@ -14,8 +14,8 @@ struct fail_sunrpc_attr {
>  	struct fault_attr	attr;
>
>  	bool			ignore_client_disconnect;
> -
>  	bool			ignore_server_disconnect;
> +	bool			ignore_cache_timeout;
>  };
>
>  extern struct fail_sunrpc_attr fail_sunrpc;
>
>

--
Chuck Lever