From: Amery Hung <amery.hung@xxxxxxxxxxxxx> This series prevents sockops users from accidentally causing packet drops. This can happen when a BPF_SOCK_OPS_HDR_OPT_LEN_CB program reserves different option lengths in tcp_sendmsg(). Initially, the sockops BPF_SOCK_OPS_HDR_OPT_LEN_CB program is called to reserve space in tcp_send_mss(), which returns the MSS for TSO. Then, BPF_SOCK_OPS_HDR_OPT_LEN_CB is called again in __tcp_transmit_skb() to calculate the actual tcp_option_size and skb_push() the total header size. skb->gso_size is restored from TCP_SKB_CB(skb)->tcp_gso_size, which is derived from tcp_send_mss() where we first call HDR_OPT_LEN. If the option size reserved by the first call is smaller than the actual option size computed by the second call, the length of the skb can exceed the MTU. As a result, ip(6)_fragment will drop the packet if skb->ignore_df is not set. To prevent this accidental packet drop, we need to make sure the second call to the BPF_SOCK_OPS_HDR_OPT_LEN_CB program does not reserve more space than the first call did. Since this cannot be enforced at verification time, we add a runtime sanity check that makes bpf_reserve_hdr_opt() return an error instead of causing packet drops later. Signed-off-by: Amery Hung <amery.hung@xxxxxxxxxxxxx> Co-developed-by: Zijian Zhang <zijianzhang@xxxxxxxxxxxxx> Signed-off-by: Zijian Zhang <zijianzhang@xxxxxxxxxxxxx> --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp_input.c | 8 -------- net/ipv4/tcp_output.c | 13 +++++++++++-- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2aac11e7e1cc..e202eeb19be4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1058,6 +1058,14 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) return TCP_SKB_CB(skb)->tcp_gso_size; } +/* I wish gso_size would have a bit more sane initialization than + * something-or-zero which complicates things + */ +static inline int tcp_skb_seglen(const struct sk_buff *skb) +{ + return tcp_skb_pcount(skb) == 1 ? 
skb->len : tcp_skb_mss(skb); +} + static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e37488d3453f..c1ffe19b0717 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1550,14 +1550,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, return true; } -/* I wish gso_size would have a bit more sane initialization than - * something-or-zero which complicates things - */ -static int tcp_skb_seglen(const struct sk_buff *skb) -{ - return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); -} - /* Shifting pages past head area doesn't work */ static int skb_can_shift(const struct sk_buff *skb) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 16c48df8df4c..f5996cdbb2ba 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1033,10 +1033,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int old_remaining; - bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + if (skb) { + unsigned int reserved_opt_spc; + + reserved_opt_spc = tp->mss_cache - tcp_skb_seglen(skb); + if (reserved_opt_spc < remaining) + remaining = reserved_opt_spc; + } - size = MAX_TCP_OPTION_SPACE - remaining; + old_remaining = remaining; + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + size += old_remaining - remaining; } return size; -- 2.20.1