From: Eric Dumazet <edumazet@xxxxxxxxxx> [ Upstream commit 355a901e6cf1b2b763ec85caa2a9f04fbcc4ab4a ] While working on sk_forward_alloc problems reported by Denys Fedoryshchenko, we found that tcp connect() (and fastopen) do not call sk_wmem_schedule() for SYN packet (and/or SYN/DATA packet), so sk_forward_alloc is negative while connect is in progress. We can fix this by calling regular sk_stream_alloc_skb() both for the SYN packet (in tcp_connect()) and the syn_data packet in tcp_send_syn_data() Then, tcp_send_syn_data() can avoid copying syn_data as we simply can manipulate syn_data->cb[] to remove SYN flag (and increment seq) Instead of open coding memcpy_fromiovecend(), simply use this helper. This leaves in socket write queue clean fast clone skbs. This was tested against our fastopen packetdrill tests. Reported-by: Denys Fedoryshchenko <nuclearcat@xxxxxxxxxxxxxx> Signed-off-by: Eric Dumazet <edumazet@xxxxxxxxxx> Acked-by: Yuchung Cheng <ycheng@xxxxxxxxxx> Signed-off-by: David S. Miller <davem@xxxxxxxxxxxxx> Signed-off-by: Sasha Levin <sasha.levin@xxxxxxxxxx> --- net/ipv4/tcp_output.c | 68 +++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 40 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1174736..022ecbc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2994,9 +2994,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; - int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen; - struct sk_buff *syn_data = NULL, *data; + int syn_loss = 0, space, err = 0; unsigned long last_syn_loss = 0; + struct sk_buff *syn_data; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie, @@ -3027,48 +3027,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space, - sk->sk_allocation); - if (syn_data == NULL) + syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation); + if (!syn_data) goto fallback; + syn_data->ip_summed = CHECKSUM_PARTIAL; + memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); + if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space), + fo->data->msg_iov, 0, space))) { + kfree_skb(syn_data); + goto fallback; + } - for (i = 0; i < iovlen && syn_data->len < space; ++i) { - struct iovec *iov = &fo->data->msg_iov[i]; - unsigned char __user *from = iov->iov_base; - int len = iov->iov_len; + /* No more data pending in inet_wait_for_connect() */ + if (space == fo->size) + fo->data = NULL; + fo->copied = space; - if (syn_data->len + len > space) - len = space - syn_data->len; - else if (i + 1 == iovlen) - /* No more data pending in inet_wait_for_connect() */ - fo->data = NULL; + tcp_connect_queue_skb(sk, syn_data); - if (skb_add_data(syn_data, from, len)) - goto fallback; - } + err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - /* Queue a data-only packet after the regular SYN for retransmission */ - data = pskb_copy(syn_data, sk->sk_allocation); - if (data == NULL) - goto fallback; - TCP_SKB_CB(data)->seq++; - TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN; - TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH); - tcp_connect_queue_skb(sk, data); - fo->copied = data->len; - - /* syn_data is about to be sent, we need to take current time stamps - * for the packets that are in write queue : SYN packet and DATA - */ - skb_mstamp_get(&syn->skb_mstamp); - data->skb_mstamp = syn->skb_mstamp; + syn->skb_mstamp = syn_data->skb_mstamp; - if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) { + /* Now full SYN+DATA was cloned and sent (or not), + * remove the SYN from the original skb (syn_data) + * we keep in write queue in case of a retransmit, as we + * also have the SYN packet (with no data) in the same queue. + */ + TCP_SKB_CB(syn_data)->seq++; + TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; + if (!err) { tp->syn_data = (fo->copied > 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - syn_data = NULL; fallback: /* Send a regular SYN with Fast Open cookie request option */ @@ -3077,7 +3069,6 @@ fallback: err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation); if (err) tp->syn_fastopen = 0; - kfree_skb(syn_data); done: fo->cookie.len = -1; /* Exclude Fast Open option for SYN retries */ return err; @@ -3097,13 +3088,10 @@ int tcp_connect(struct sock *sk) return 0; } - buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); - if (unlikely(buff == NULL)) + buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (unlikely(!buff)) return -ENOBUFS; - /* Reserve space for headers. */ - skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); tp->retrans_stamp = tcp_time_stamp; tcp_connect_queue_skb(sk, buff); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html