Re: [PATCH] SUNRPC: Fix a data corruption issue when retransmitting RPC calls

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Trond Myklebust [Trond.Myklebust@xxxxxxxxxx] wrote:
> The following scenario can cause silent data corruption when doing
> NFS writes. It has mainly been observed when doing database writes
> using O_DIRECT.
> 
> 1) The RPC client uses sendpage() to do zero-copy of the page data.
> 2) Due to networking issues, the reply from the server is delayed,
>    and so the RPC client times out.
> 
> 3) The client issues a second sendpage of the page data as part of
>    an RPC call retransmission.
> 
> 4) The reply to the first transmission arrives from the server
>    _before_ the client hardware has emptied the TCP socket send
>    buffer.
> 5) After processing the reply, the RPC state machine rules that
>    the call to be done, and triggers the completion callbacks.
> 6) The application notices the RPC call is done, and reuses the
>    pages to store something else (e.g. a new write).
> 
> 7) The client NIC drains the TCP socket send buffer. Since the
>    page data has now changed, it reads a corrupted version of the
>    initial RPC call, and puts it on the wire.
> 
> This patch fixes the problem in the following manner:
> 
> The ordering guarantees of TCP ensure that when the server sends a
> reply, then we know that the _first_ transmission has completed. Using
> zero-copy in that situation is therefore safe.
> If a time out occurs, we then send the retransmission using sendmsg()
> (i.e. no zero-copy), We then know that the socket contains a full copy of
> the data, and so it will retransmit a faithful reproduction even if the
> RPC call completes, and the application reuses the O_DIRECT buffer in
> the meantime.
> 
> Signed-off-by: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> ---
>  net/sunrpc/xprtsock.c | 28 +++++++++++++++++++++-------
>  1 file changed, 21 insertions(+), 7 deletions(-)
> 
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index 17c88928b7db..dd9d295813cf 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
>  	return kernel_sendmsg(sock, &msg, NULL, 0, 0);
>  }
> 
> -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
> +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
>  {
> +	ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
> +			int offset, size_t size, int flags);
>  	struct page **ppage;
>  	unsigned int remainder;
>  	int err, sent = 0;
> @@ -403,6 +405,9 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>  	base += xdr->page_base;
>  	ppage = xdr->pages + (base >> PAGE_SHIFT);
>  	base &= ~PAGE_MASK;
> +	do_sendpage = sock->ops->sendpage;
> +	if (!zerocopy)
> +		do_sendpage = sock_no_sendpage;
>  	for(;;) {
>  		unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
>  		int flags = XS_SENDMSG_FLAGS;
> @@ -410,7 +415,7 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>  		remainder -= len;
>  		if (remainder != 0 || more)
>  			flags |= MSG_MORE;
> -		err = sock->ops->sendpage(sock, *ppage, base, len, flags);
> +		err = do_sendpage(sock, *ppage, base, len, flags);
>  		if (remainder == 0 || err != len)
>  			break;
>  		sent += err;
> @@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
>   * @addrlen: UDP only -- length of destination address
>   * @xdr: buffer containing this request
>   * @base: starting position in the buffer
> + * @zerocopy: true if it is safe to use sendpage()
>   *
>   */
> -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
> +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
>  {
>  	unsigned int remainder = xdr->len - base;
>  	int err, sent = 0;
> @@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
>  	if (base < xdr->page_len) {
>  		unsigned int len = xdr->page_len - base;
>  		remainder -= len;
> -		err = xs_send_pagedata(sock, xdr, base, remainder != 0);
> +		err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
>  		if (remainder == 0 || err != len)
>  			goto out;
>  		sent += err;
> @@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
>  			req->rq_svec->iov_base, req->rq_svec->iov_len);
> 
>  	status = xs_sendpages(transport->sock, NULL, 0,
> -						xdr, req->rq_bytes_sent);
> +						xdr, req->rq_bytes_sent, true);
>  	dprintk("RPC:       %s(%u) = %d\n",
>  			__func__, xdr->len - req->rq_bytes_sent, status);
>  	if (likely(status >= 0)) {
> @@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
>  	status = xs_sendpages(transport->sock,
>  			      xs_addr(xprt),
>  			      xprt->addrlen, xdr,
> -			      req->rq_bytes_sent);
> +			      req->rq_bytes_sent, true);
> 
>  	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
>  			xdr->len - req->rq_bytes_sent, status);
> @@ -693,6 +699,7 @@ static int xs_tcp_send_request(struct rpc_task *task)
>  	struct rpc_xprt *xprt = req->rq_xprt;
>  	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
>  	struct xdr_buf *xdr = &req->rq_snd_buf;
> +	bool zerocopy = true;
>  	int status;
> 
>  	xs_encode_stream_record_marker(&req->rq_snd_buf);
> @@ -700,13 +707,20 @@ static int xs_tcp_send_request(struct rpc_task *task)
>  	xs_pktdump("packet data:",
>  				req->rq_svec->iov_base,
>  				req->rq_svec->iov_len);
> +	/* Don't use zero copy if this is a resend. If the RPC call
> +	 * completes while the socket holds a reference to the pages,
> +	 * then we may end up resending corrupted data.
> +	 */
> +	if (task->tk_flags & RPC_TASK_SENT)

How about RPC_WAS_SENT() ?

> +		zerocopy = false;
> 
>  	/* Continue transmitting the packet/record. We must be careful
>  	 * to cope with writespace callbacks arriving _after_ we have
>  	 * called sendmsg(). */
>  	while (1) {
>  		status = xs_sendpages(transport->sock,
> -					NULL, 0, xdr, req->rq_bytes_sent);
> +					NULL, 0, xdr, req->rq_bytes_sent,
> +					zerocopy);
> 
>  		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
>  				xdr->len - req->rq_bytes_sent, status);
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux