Backport of dad3a9314ac95dedc007bc7dacacb396ea10e376:

tcp_fragment() might be called for skbs in the write queue.

Memory limits might have been exceeded because tcp_sendmsg()
only checks limits at full skb (64KB) boundaries.

Therefore, we need to make sure tcp_fragment() won't punish
applications that might have set up very low SO_SNDBUF values.

Backport notes:
The initial upstream version used the tcp_queue type, which is not
present in older kernels, so a new arg was added to tcp_fragment() to
determine whether or not this is a retransmit.

Fixes: 9daf226ff926 ("tcp: tcp_fragment() should apply sane memory limits")
Signed-off-by: Josh Hunt <johunt@xxxxxxxxxx>
Reviewed-by: Jason Baron <jbaron@xxxxxxxxxx>
---
Eric/Greg - This applies on top of v4.14.130. I did not see anything
come through for the older (<4.19) stable kernels yet. Without this
change Christoph Paasch's packetdrill script
(https://lore.kernel.org/netdev/CALMXkpYVRxgeqarp4gnmX7GqYh1sWOAt6UaRFqYBOaaNFfZ5sw@xxxxxxxxxxxxxx/)
fails on 4.14 stable kernels, but passes with this change.

 include/net/tcp.h     |  3 ++-
 net/ipv4/tcp_input.c  |  4 ++--
 net/ipv4/tcp_output.c | 16 ++++++++--------
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1179ef4f0768..9d69fefa365c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -554,7 +554,8 @@ void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
+int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t,
+		 bool retrans);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8e080f3b75bd..0fd629587104 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1202,7 +1202,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
 		if (pkt_len >= skb->len && !in_sack)
 			return 0;
 
-		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+		err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC, true);
 		if (err < 0)
 			return err;
 	}
@@ -2266,7 +2266,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 			/* If needed, chop off the prefix to mark as lost. */
 			lost = (packets - oldcnt) * mss;
 			if (lost < skb->len &&
-			    tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+			    tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC, true) < 0)
 				break;
 			cnt = packets;
 		}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a8772e11dc1c..ca14770dd7ba 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1259,7 +1259,7 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
  * Remember, these are still headerless SKBs at this point.
  */
 int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
-		 unsigned int mss_now, gfp_t gfp)
+		 unsigned int mss_now, gfp_t gfp, bool retrans)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
@@ -1274,7 +1274,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	if (nsize < 0)
 		nsize = 0;
 
-	if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) {
+	if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf && retrans)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
 		return -ENOMEM;
 	}
@@ -1834,7 +1834,7 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now, gfp_t gfp)
+			unsigned int mss_now, gfp_t gfp, bool retrans)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1842,7 +1842,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data.  */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now, gfp);
+		return tcp_fragment(sk, skb, len, mss_now, gfp, retrans);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
 	if (unlikely(!buff))
@@ -2361,7 +2361,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    nonagle);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp, false)))
 			break;
 
 		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2514,7 +2514,7 @@ void tcp_send_loss_probe(struct sock *sk)
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
 		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
-					  GFP_ATOMIC)))
+					  GFP_ATOMIC, true)))
 			goto rearm_timer;
 		skb = tcp_write_queue_next(sk, skb);
 	}
@@ -2874,7 +2874,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	len = cur_mss * segs;
 	if (skb->len > len) {
-		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC, true))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		if (skb_unclone(skb, GFP_ATOMIC))
@@ -3696,7 +3696,7 @@ int tcp_write_wakeup(struct sock *sk, int mib)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC, false))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(skb, mss);
-- 
2.7.4
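
A note for anyone reading along: the case the relaxed check is meant to
tolerate is an ordinary application that shrinks its send buffer far below
the default. The user-space sketch below is only an illustration of that
setup; it is not Christoph Paasch's packetdrill reproducer, and the
destination address, port, and buffer sizes are arbitrary examples. Because
tcp_sendmsg() re-checks memory limits only at full-skb (64KB) boundaries,
sk_wmem_queued can legitimately exceed twice such a tiny SO_SNDBUF, and
without the retrans argument tcp_fragment() could then return -ENOMEM when
splitting an skb that is still in the write queue.

/* Illustrative only: the kind of low-SO_SNDBUF socket the commit message
 * refers to.  Destination address, port, and sizes are made-up examples.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Ask for a very small send buffer; the kernel doubles and clamps it,
	 * but it still ends up far below the 64KB skb size used by TSO.
	 */
	int sndbuf = 4096;
	if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) < 0)
		perror("setsockopt(SO_SNDBUF)");

	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(8080),		/* example port */
	};
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		perror("connect");
		close(fd);
		return 1;
	}

	/* tcp_sendmsg() only re-checks memory limits at full-skb (64KB)
	 * boundaries, so sk_wmem_queued can overshoot this tiny SO_SNDBUF by
	 * more than 2x without the application doing anything abusive.
	 */
	char buf[64 * 1024];
	memset(buf, 'x', sizeof(buf));
	if (send(fd, buf, sizeof(buf), 0) < 0)
		perror("send");

	close(fd);
	return 0;
}

With the new argument, the sk_wmem_queued limit in tcp_fragment() is only
enforced for the retransmit/SACK callers (retrans == true), while the
write-queue callers in tcp_write_xmit() and tcp_write_wakeup() pass false,
matching the intent of the upstream tcp_queue check.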