> On Mon, Jan 9, 2023 at 4:20 AM Ziyang Xuan (William) > <william.xuanziyang@xxxxxxxxxx> wrote: >> >>> On Fri, Jan 6, 2023 at 2:55 PM <sdf@xxxxxxxxxx> wrote: >>>> >>>> On 01/06, Ziyang Xuan wrote: >>>>> Add ipip6 and ip6ip decap support for bpf_skb_adjust_room(). >>>>> Main use case is for using cls_bpf on ingress hook to decapsulate >>>>> IPv4 over IPv6 and IPv6 over IPv4 tunnel packets. >>>> >>>> CC'd Willem since he has done bpf_skb_adjust_room changes in the past. >>>> There might be a lot of GRO/GSO context I'm missing. >>>> >>>>> Signed-off-by: Ziyang Xuan <william.xuanziyang@xxxxxxxxxx> >>>>> --- >>>>> net/core/filter.c | 34 ++++++++++++++++++++++++++++++++-- >>>>> 1 file changed, 32 insertions(+), 2 deletions(-) >>>> >>>>> diff --git a/net/core/filter.c b/net/core/filter.c >>>>> index 929358677183..73982fb4fe2e 100644 >>>>> --- a/net/core/filter.c >>>>> +++ b/net/core/filter.c >>>>> @@ -3495,6 +3495,12 @@ static int bpf_skb_net_grow(struct sk_buff *skb, >>>>> u32 off, u32 len_diff, >>>>> static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, >>>>> u64 flags) >>>>> { >>>>> + union { >>>>> + struct iphdr *v4; >>>>> + struct ipv6hdr *v6; >>>>> + unsigned char *hdr; >>>>> + } ip; >>>>> + __be16 proto; >>>>> int ret; >>>> >>>>> if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | >>>>> @@ -3512,10 +3518,19 @@ static int bpf_skb_net_shrink(struct sk_buff >>>>> *skb, u32 off, u32 len_diff, >>>>> if (unlikely(ret < 0)) >>>>> return ret; >>>> >>>>> + ip.hdr = skb_inner_network_header(skb); >>>>> + if (ip.v4->version == 4) >>>>> + proto = htons(ETH_P_IP); >>>>> + else >>>>> + proto = htons(ETH_P_IPV6); >>>>> + >>>>> ret = bpf_skb_net_hdr_pop(skb, off, len_diff); >>>>> if (unlikely(ret < 0)) >>>>> return ret; >>>> >>>>> + /* Match skb->protocol to new outer l3 protocol */ >>>>> + skb->protocol = proto; >>>>> + >>>>> if (skb_is_gso(skb)) { >>>>> struct skb_shared_info *shinfo = skb_shinfo(skb); >>>> >>>>> @@ -3578,10 +3593,14 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, >>>>> skb, s32, len_diff, >>>>> u32, mode, u64, flags) >>>>> { >>>>> u32 len_cur, len_diff_abs = abs(len_diff); >>>>> - u32 len_min = bpf_skb_net_base_len(skb); >>>>> - u32 len_max = BPF_SKB_MAX_LEN; >>>>> + u32 len_min, len_max = BPF_SKB_MAX_LEN; >>>>> __be16 proto = skb->protocol; >>>>> bool shrink = len_diff < 0; >>>>> + union { >>>>> + struct iphdr *v4; >>>>> + struct ipv6hdr *v6; >>>>> + unsigned char *hdr; >>>>> + } ip; >>>>> u32 off; >>>>> int ret; >>>> >>>>> @@ -3594,6 +3613,9 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, >>>>> skb, s32, len_diff, >>>>> proto != htons(ETH_P_IPV6))) >>>>> return -ENOTSUPP; >>>> >>>>> + if (unlikely(shrink && !skb->encapsulation)) >>>>> + return -ENOTSUPP; >>>>> + >>> >>> This new restriction might break existing users. >>> >>> There is no pre-existing requirement that shrink is used solely with >>> packets encapsulated by the protocol stack. >>> >>> Indeed, skb->encapsulation is likely not set on packets arriving from >>> the wire, even if encapsulated. Referring to your comment "Main use >>> case is for using cls_bpf on ingress hook to decapsulate" >>> >>> Can a combination of the existing bpf_skb_adjust_room and >>> bpf_skb_change_proto address your problem? >> >> Hello Willem, >> >> I think combination bpf_skb_adjust_room and bpf_skb_change_proto can not >> address my problem. >> >> Now, bpf_skb_adjust_room() would fail for "len_cur - len_diff_abs < len_min" >> when decap ipip6 packet, because "len_min" should be sizeof(struct iphdr) >> but not sizeof(struct ipv6hdr). >> >> We can remove skb->encapsulation restriction and parse outer and inner IP >> header to determine ipip6 and ip6ip packets. As following: > > Adding logic for network layer protocol conversion like this looks > good to me. bpf_skb_adjust_room already has a few other metadata > quirks. > > But like those, let's make this intent explicit: define a new flag > that requests this behavior. Hello Willem, I think you prefer like this: --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2644,6 +2644,11 @@ union bpf_attr { * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicates the new IP header version after decapsulate the + * outer IP header. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -5803,6 +5808,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index 43cc1fe58a2c..0bbe5e67337c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \ + BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ - BPF_ADJ_ROOM_ENCAP_L2_MASK)) + BPF_ADJ_ROOM_ENCAP_L2_MASK) | \ + BPF_F_ADJ_ROOM_DECAP_L3_MASK) static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u64 flags) @@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, int ret; if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_DECAP_L3_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; @@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, if (unlikely(ret < 0)) return ret; + /* Match skb->protocol to new outer l3 protocol */ + if (skb->protocol == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) + skb->protocol = htons(ETH_P_IPV6); + else if (skb->protocol == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) + skb->protocol = htons(ETH_P_IP); + if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); @@ -3597,6 +3610,10 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, proto != htons(ETH_P_IPV6))) return -ENOTSUPP; + if (unlikely(shrink && flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4 && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)) + return -EINVAL; + off = skb_mac_header_len(skb); switch (mode) { case BPF_ADJ_ROOM_NET: @@ -3608,6 +3625,16 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOTSUPP; } + if (shrink) { + if (proto == htons(ETH_P_IP) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6) { + len_min = sizeof(struct ipv6hdr); + } else if (proto == htons(ETH_P_IPV6) && + flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4) { + len_min = sizeof(struct iphdr); + } + } + len_cur = skb->len - skb_network_offset(skb); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 464ca3f01fe7..041361bc6ccf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2644,6 +2644,11 @@ union bpf_attr { * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicates the new IP header version after decapsulate the + * outer IP header. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -5803,6 +5808,8 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), }; enum { Thank you! > > Let's avoid introducing a new union. Just use check (ip_hdr(skb)->version == 4). > >> >> --- a/net/core/filter.c >> +++ b/net/core/filter.c >> @@ -3498,6 +3498,12 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, >> static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, >> u64 flags) >> { >> + union { >> + struct iphdr *v4; >> + struct ipv6hdr *v6; >> + unsigned char *hdr; >> + } ip; >> + __be16 proto; >> int ret; >> >> if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | >> @@ -3515,10 +3521,23 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, >> if (unlikely(ret < 0)) >> return ret; >> >> + ip.hdr = skb_network_header(skb); >> + if (ip.v4->version == 4) { >> + if (ip.v4->protocol == IPPROTO_IPV6) >> + proto = htons(ETH_P_IPV6); >> + } else { >> + struct ipv6_opt_hdr *opt_hdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + sizeof(struct ipv6hdr)); >> + if (ip.v6->nexthdr == NEXTHDR_DEST && opt_hdr->nexthdr == NEXTHDR_IPV4) >> + proto = htons(ETH_P_IP); >> + } >> + >> ret = bpf_skb_net_hdr_pop(skb, off, len_diff); >> if (unlikely(ret < 0)) >> return ret; >> >> + /* Match skb->protocol to new outer l3 protocol */ >> + skb->protocol = proto; >> + >> if (skb_is_gso(skb)) { >> struct skb_shared_info *shinfo = skb_shinfo(skb); >> >> @@ -3585,6 +3604,11 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, >> u32 len_max = BPF_SKB_MAX_LEN; >> __be16 proto = skb->protocol; >> bool shrink = len_diff < 0; >> + union { >> + struct iphdr *v4; >> + struct ipv6hdr *v6; >> + unsigned char *hdr; >> + } ip; >> u32 off; >> int ret; >> >> @@ -3608,6 +3632,19 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, >> return -ENOTSUPP; >> } >> >> + if (shrink) { >> + ip.hdr = skb_network_header(skb); >> + if (ip.v4->version == 4) { >> + if (ip.v4->protocol == IPPROTO_IPV6) >> + len_min = sizeof(struct ipv6hdr); >> + } else { >> + struct ipv6_opt_hdr *opt_hdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + sizeof(struct ipv6hdr)); >> + if (ip.v6->nexthdr == NEXTHDR_DEST && opt_hdr->nexthdr == NEXTHDR_IPV4) { >> + len_min = sizeof(struct iphdr); >> + } >> + } >> + } >> + >> len_cur = skb->len - skb_network_offset(skb); >> >> >> Look forward to your comments and suggestions. >> >> Thank you! >> >>> >>>>> off = skb_mac_header_len(skb); >>>>> switch (mode) { >>>>> case BPF_ADJ_ROOM_NET: >>>>> @@ -3605,6 +3627,14 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, >>>>> skb, s32, len_diff, >>>>> return -ENOTSUPP; >>>>> } >>>> >>>>> + if (shrink) { >>>>> + ip.hdr = skb_inner_network_header(skb); >>>>> + if (ip.v4->version == 4) >>>>> + len_min = sizeof(struct iphdr); >>>>> + else >>>>> + len_min = sizeof(struct ipv6hdr); >>>>> + } >>>>> + >>>>> len_cur = skb->len - skb_network_offset(skb); >>>>> if ((shrink && (len_diff_abs >= len_cur || >>>>> len_cur - len_diff_abs < len_min)) || >>>>> -- >>>>> 2.25.1 >>>> >>> . >>> > . >