I ran Dave's test in a loop overnight with these 3 patches on top of v4.20-rc7 and didn't see any more of the XPRT_WRITE_SPACE hangs. -Scott On Mon, 17 Dec 2018, Trond Myklebust wrote: > When the socket is closed, we need to call xprt_disconnect_done() in order > to clean up the XPRT_WRITE_SPACE flag, and wake up the sleeping tasks. > > However, we also want to ensure that we don't wake them up before the socket > is closed, since that would cause thundering herd issues with everyone > piling up to retransmit before the TCP shutdown dance has completed. > Only the task that holds XPRT_LOCKED needs to wake up early in order to > allow the close to complete. > > Reported-by: Dave Wysochanski <dwysocha@xxxxxxxxxx> > Reported-by: Scott Mayhew <smayhew@xxxxxxxxxx> > Cc: Chuck Lever <chuck.lever@xxxxxxxxxx> > Signed-off-by: Trond Myklebust <trond.myklebust@xxxxxxxxxxxxxxx> > --- > net/sunrpc/clnt.c | 1 + > net/sunrpc/xprt.c | 5 ++++- > net/sunrpc/xprtsock.c | 6 ++---- > 3 files changed, 7 insertions(+), 5 deletions(-) > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c > index c6782aa47525..24cbddc44c88 100644 > --- a/net/sunrpc/clnt.c > +++ b/net/sunrpc/clnt.c > @@ -1952,6 +1952,7 @@ call_connect_status(struct rpc_task *task) > /* retry with existing socket, after a delay */ > rpc_delay(task, 3*HZ); > /* fall through */ > + case -ENOTCONN: > case -EAGAIN: > /* Check for timeouts before looping back to call_bind */ > case -ETIMEDOUT: > diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c > index ce927002862a..3fb001dff670 100644 > --- a/net/sunrpc/xprt.c > +++ b/net/sunrpc/xprt.c > @@ -680,7 +680,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) > /* Try to schedule an autoclose RPC call */ > if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) > queue_work(xprtiod_workqueue, &xprt->task_cleanup); > - xprt_wake_pending_tasks(xprt, -EAGAIN); > + else if (xprt->snd_task) > + rpc_wake_up_queued_task_set_status(&xprt->pending, > + xprt->snd_task, -ENOTCONN); > 
spin_unlock_bh(&xprt->transport_lock); > } > EXPORT_SYMBOL_GPL(xprt_force_disconnect); > @@ -852,6 +854,7 @@ static void xprt_connect_status(struct rpc_task *task) > case -ENETUNREACH: > case -EHOSTUNREACH: > case -EPIPE: > + case -ENOTCONN: > case -EAGAIN: > dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); > break; > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c > index 8a5e823e0b33..4c471b4235ba 100644 > --- a/net/sunrpc/xprtsock.c > +++ b/net/sunrpc/xprtsock.c > @@ -1217,6 +1217,8 @@ static void xs_reset_transport(struct sock_xprt *transport) > > trace_rpc_socket_close(xprt, sock); > sock_release(sock); > + > + xprt_disconnect_done(xprt); > } > > /** > @@ -1237,8 +1239,6 @@ static void xs_close(struct rpc_xprt *xprt) > > xs_reset_transport(transport); > xprt->reestablish_timeout = 0; > - > - xprt_disconnect_done(xprt); > } > > static void xs_inject_disconnect(struct rpc_xprt *xprt) > @@ -1489,8 +1489,6 @@ static void xs_tcp_state_change(struct sock *sk) > &transport->sock_state)) > xprt_clear_connecting(xprt); > clear_bit(XPRT_CLOSING, &xprt->state); > - if (sk->sk_err) > - xprt_wake_pending_tasks(xprt, -sk->sk_err); > /* Trigger the socket release */ > xs_tcp_force_close(xprt); > } > -- > 2.19.2 >