Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

Chuck Lever <chuck.lever@xxxxxxxxxx> · Sun, 10 Nov 2013 15:33:22 -0500

On Nov 9, 2013, at 1:20 PM, Trond Myklebust <Trond.Myklebust@xxxxxxxxxx> wrote:

> The following scenario can cause silent data corruption when doing
> NFS writes. It has mainly been observed when doing database writes
> using O_DIRECT.
> 
> 1) The RPC client uses sendpage() to do zero-copy of the page data.
> 2) Due to networking issues, the reply from the server is delayed,
>   and so the RPC client times out.
> 
> 3) The client issues a second sendpage of the page data as part of
>   an RPC call retransmission.
> 
> 4) The reply to the first transmission arrives from the server
>   _before_ the client hardware has emptied the TCP socket send
>   buffer.
> 5) After processing the reply, the RPC state machine rules that
>   the call is done, and triggers the completion callbacks.
> 6) The application notices the RPC call is done, and reuses the
>   pages to store something else (e.g. a new write).
> 
> 7) The client NIC drains the TCP socket send buffer. Since the
>   page data has now changed, it reads a corrupted version of the
>   initial RPC call, and puts it on the wire.
> 
> This patch fixes the problem in the following manner:
> 
> The ordering guarantees of TCP ensure that when the server sends a
> reply, then we know that the _first_ transmission has completed. Using
> zero-copy in that situation is therefore safe.
> If a timeout occurs, we then send the retransmission using sendmsg()
> (i.e. no zero-copy). We then know that the socket contains a full copy of
> the data, and so it will retransmit a faithful reproduction even if the
> RPC call completes, and the application reuses the O_DIRECT buffer in
> the meantime.

Clever!

But if wsize is large, the retransmission will require a potentially large contiguous piece of memory to complete a WRITE RPC.  In low-memory scenarios, that might be hard to come by.  That should be safe for direct I/O, but if this I/O is driven by memory reclaim, it could deadlock.

We see this all the time when using NFS on network devices that do not support scatter/gather (and thus have no sock_sendpage() method to begin with).

> Signed-off-by: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> ---
> net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
> 1 file changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 17c88928b7db..dd9d295813cf 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
> 	return kernel_sendmsg(sock, &msg, NULL, 0, 0);
> }
> 
> -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
> +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
> {
> +	ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
> +			int offset, size_t size, int flags);
> 	struct page **ppage;
> 	unsigned int remainder;
> 	int err, sent = 0;
> @@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> 	base += xdr->page_base;
> 	ppage = xdr->pages + (base >> PAGE_SHIFT);
> 	base &= ~PAGE_MASK;
> +	do_sendpage = sock->ops->sendpage;
> +	if (!zerocopy)
> +		do_sendpage = sock_no_sendpage;
> 	for(;;) {
> 		unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
> 		int flags = XS_SENDMSG_FLAGS;
> @@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
> 		remainder -= len;
> 		if (remainder != 0 || more)
> 			flags |= MSG_MORE;
> -		err = sock->ops->sendpage(sock, *ppage, base, len, flags);
> +		err = do_sendpage(sock, *ppage, base, len, flags);
> 		if (remainder == 0 || err != len)
> 			break;
> 		sent += err;
> @@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>  * @addrlen: UDP only -- length of destination address
>  * @xdr: buffer containing this request
>  * @base: starting position in the buffer
> + * @zerocopy: true if it is safe to use sendpage()
>  *
>  */
> -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
> +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
> {
> 	unsigned int remainder = xdr->len - base;
> 	int err, sent = 0;
> @@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
> 	if (base < xdr->page_len) {
> 		unsigned int len = xdr->page_len - base;
> 		remainder -= len;
> -		err = xs_send_pagedata(sock, xdr, base, remainder != 0);
> +		err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
> 		if (remainder == 0 || err != len)
> 			goto out;
> 		sent += err;
> @@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
> 			req->rq_svec->iov_base, req->rq_svec->iov_len);
> 
> 	status = xs_sendpages(transport->sock, NULL, 0,
> -						xdr, req->rq_bytes_sent);
> +						xdr, req->rq_bytes_sent, true);
> 	dprintk("RPC:       %s(%u) = %d\n",
> 			__func__, xdr->len - req->rq_bytes_sent, status);
> 	if (likely(status >= 0)) {
> @@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
> 	status = xs_sendpages(transport->sock,
> 			      xs_addr(xprt),
> 			      xprt->addrlen, xdr,
> -			      req->rq_bytes_sent);
> +			      req->rq_bytes_sent, true);
> 
> 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
> 			xdr->len - req->rq_bytes_sent, status);
> @@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
> 	struct rpc_xprt *xprt = req->rq_xprt;
> 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
> 	struct xdr_buf *xdr = &req->rq_snd_buf;
> +	bool zerocopy = true;
> 	int status;
> 
> 	xs_encode_stream_record_marker(&req->rq_snd_buf);
> @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
> 	xs_pktdump("packet data:",
> 				req->rq_svec->iov_base,
> 				req->rq_svec->iov_len);
> +	/* Don't use zero copy if this is a resend. If the RPC call
> +	 * completes while the socket holds a reference to the pages,
> +	 * then we may end up resending corrupted data.
> +	 */
> +	if (task->tk_flags & RPC_TASK_SENT)
> +		zerocopy = false;
> 
> 	/* Continue transmitting the packet/record. We must be careful
> 	 * to cope with writespace callbacks arriving _after_ we have
> 	 * called sendmsg(). */
> 	while (1) {
> 		status = xs_sendpages(transport->sock,
> -					NULL, 0, xdr, req->rq_bytes_sent);
> +					NULL, 0, xdr, req->rq_bytes_sent,
> +					zerocopy);
> 
> 		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
> 				xdr->len - req->rq_bytes_sent, status);
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Chuck Lever
chuck[dot]lever[at]oracle[dot]com

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html