Thanks Cong! Following your suggestion, I tried adding a simple test case to test_tc_tunnel, and it works for me :) Thanks for your review. > 2021年3月3日 下午8:33,Xuesen Huang <hxseverything@xxxxxxxxx> 写道: > > From: Xuesen Huang <huangxuesen@xxxxxxxxxxxx> > > bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets > encapsulation. But that is not appropriate when pushing Ethernet header. > > Add an option to further specify encap L2 type and set the inner_protocol > as ETH_P_TEB. > > Update test_tc_tunnel to verify adding vxlan encapsulation works with > this flag. > > Suggested-by: Willem de Bruijn <willemb@xxxxxxxxxx> > Signed-off-by: Xuesen Huang <huangxuesen@xxxxxxxxxxxx> > Signed-off-by: Zhiyong Cheng <chengzhiyong@xxxxxxxxxxxx> > Signed-off-by: Li Wang <wangli09@xxxxxxxxxxxx> > --- > include/uapi/linux/bpf.h | 5 + > net/core/filter.c | 11 ++- > tools/include/uapi/linux/bpf.h | 5 + > tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 107 ++++++++++++++++++--- > tools/testing/selftests/bpf/test_tc_tunnel.sh | 15 ++- > 5 files changed, 124 insertions(+), 19 deletions(-) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 77d7c1b..d791596 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id { > * Use with ENCAP_L3/L4 flags to further specify the tunnel > * type; *len* is the length of the inner MAC header. > * > + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: > + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the > + * L2 type as Ethernet. > + * > * A call to this helper is susceptible to change the underlying > * packet buffer.
Therefore, at load time, all checks on pointers > * previously done by the verifier are invalidated and must be > @@ -4088,6 +4092,7 @@ enum { > BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), > BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), > BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), > + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), > }; > > enum { > diff --git a/net/core/filter.c b/net/core/filter.c > index 255aeee..8d1fb61 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -3412,6 +3412,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) > BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ > BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ > BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ > + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ > BPF_F_ADJ_ROOM_ENCAP_L2( \ > BPF_ADJ_ROOM_ENCAP_L2_MASK)) > > @@ -3448,6 +3449,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, > flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) > return -EINVAL; > > + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && > + inner_mac_len < ETH_HLEN) > + return -EINVAL; > + > if (skb->encapsulation) > return -EALREADY; > > @@ -3466,7 +3471,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, > skb->inner_mac_header = inner_net - inner_mac_len; > skb->inner_network_header = inner_net; > skb->inner_transport_header = inner_trans; > - skb_set_inner_protocol(skb, skb->protocol); > + > + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) > + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); > + else > + skb_set_inner_protocol(skb, skb->protocol); > > skb->encapsulation = 1; > skb_set_network_header(skb, mac_len); > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h > index 77d7c1b..d791596 100644 > --- a/tools/include/uapi/linux/bpf.h > +++ b/tools/include/uapi/linux/bpf.h > @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id { > * Use with ENCAP_L3/L4 flags to further specify the tunnel > * type; *len* is the length of the inner MAC header. 
> * > + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: > + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the > + * L2 type as Ethernet. > + * > * A call to this helper is susceptible to change the underlying > * packet buffer. Therefore, at load time, all checks on pointers > * previously done by the verifier are invalidated and must be > @@ -4088,6 +4092,7 @@ enum { > BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), > BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), > BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), > + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), > }; > > enum { > diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > index 37bce7a..6e144db 100644 > --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > @@ -20,6 +20,14 @@ > #include <bpf/bpf_endian.h> > #include <bpf/bpf_helpers.h> > > +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0) > + > +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__) > + > +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0) > + > +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__) > + > static const int cfg_port = 8000; > > static const int cfg_udp_src = 20000; > @@ -27,11 +35,24 @@ > #define UDP_PORT 5555 > #define MPLS_OVER_UDP_PORT 6635 > #define ETH_OVER_UDP_PORT 7777 > +#define VXLAN_UDP_PORT 8472 > + > +#define EXTPROTO_VXLAN 0x1 > + > +#define VXLAN_N_VID (1u << 24) > +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) > +#define VXLAN_FLAGS 0x8 > +#define VXLAN_VNI 1 > > /* MPLS label 1000 with S bit (last label) set and ttl of 255. 
*/ > static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | > MPLS_LS_S_MASK | 0xff); > > +struct vxlanhdr { > + __be32 vx_flags; > + __be32 vx_vni; > +} __attribute__((packed)); > + > struct gre_hdr { > __be16 flags; > __be16 protocol; > @@ -45,13 +66,13 @@ struct gre_hdr { > struct v4hdr { > struct iphdr ip; > union l4hdr l4hdr; > - __u8 pad[16]; /* enough space for L2 header */ > + __u8 pad[24]; /* space for L2 header / vxlan header ... */ > } __attribute__((packed)); > > struct v6hdr { > struct ipv6hdr ip; > union l4hdr l4hdr; > - __u8 pad[16]; /* enough space for L2 header */ > + __u8 pad[24]; /* space for L2 header / vxlan header ... */ > } __attribute__((packed)); > > static __always_inline void set_ipv4_csum(struct iphdr *iph) > @@ -69,14 +90,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph) > iph->check = ~((csum & 0xffff) + (csum >> 16)); > } > > -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > - __u16 l2_proto) > +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > + __u16 l2_proto, __u16 ext_proto) > { > __u16 udp_dst = UDP_PORT; > struct iphdr iph_inner; > struct v4hdr h_outer; > struct tcphdr tcph; > int olen, l2_len; > + __u8 *l2_hdr = NULL; > int tcp_off; > __u64 flags; > > @@ -141,7 +163,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > break; > case ETH_P_TEB: > l2_len = ETH_HLEN; > - udp_dst = ETH_OVER_UDP_PORT; > + if (ext_proto & EXTPROTO_VXLAN) { > + udp_dst = VXLAN_UDP_PORT; > + l2_len += sizeof(struct vxlanhdr); > + } else > + udp_dst = ETH_OVER_UDP_PORT; > break; > } > flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); > @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > } > > /* add L2 encap (if specified) */ > + l2_hdr = (__u8 *)&h_outer + olen; > switch (l2_proto) { > case ETH_P_MPLS_UC: > - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; > + *(__u32 *)l2_hdr = 
mpls_label; > break; > case ETH_P_TEB: > - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, > - ETH_HLEN)) > + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; > + > + if (ext_proto & EXTPROTO_VXLAN) { > + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; > + > + vxlan_hdr->vx_flags = VXLAN_FLAGS; > + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); > + > + l2_hdr += sizeof(struct vxlanhdr); > + } > + > + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) > return TC_ACT_SHOT; > + > break; > } > olen += l2_len; > @@ -214,14 +252,15 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > return TC_ACT_OK; > } > > -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > - __u16 l2_proto) > +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > + __u16 l2_proto, __u16 ext_proto) > { > __u16 udp_dst = UDP_PORT; > struct ipv6hdr iph_inner; > struct v6hdr h_outer; > struct tcphdr tcph; > int olen, l2_len; > + __u8 *l2_hdr = NULL; > __u16 tot_len; > __u64 flags; > > @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > break; > case ETH_P_TEB: > l2_len = ETH_HLEN; > - udp_dst = ETH_OVER_UDP_PORT; > + if (ext_proto & EXTPROTO_VXLAN) { > + udp_dst = VXLAN_UDP_PORT; > + l2_len += sizeof(struct vxlanhdr); > + } else > + udp_dst = ETH_OVER_UDP_PORT; > break; > } > flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); > @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); > h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); > tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + > - sizeof(h_outer.l4hdr.udp); > + sizeof(h_outer.l4hdr.udp) + l2_len; > h_outer.l4hdr.udp.check = 0; > h_outer.l4hdr.udp.len = bpf_htons(tot_len); > break; > @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 
encap_proto, > } > > /* add L2 encap (if specified) */ > + l2_hdr = (__u8 *)&h_outer + olen; > switch (l2_proto) { > case ETH_P_MPLS_UC: > - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; > + *(__u32 *)l2_hdr = mpls_label; > break; > case ETH_P_TEB: > - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, > - ETH_HLEN)) > + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; > + > + if (ext_proto & EXTPROTO_VXLAN) { > + struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr; > + > + vxlan_hdr->vx_flags = VXLAN_FLAGS; > + vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8); > + > + l2_hdr += sizeof(struct vxlanhdr); > + } > + > + if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN)) > return TC_ACT_SHOT; > break; > } > @@ -372,6 +426,16 @@ int __encap_udp_eth(struct __sk_buff *skb) > return TC_ACT_OK; > } > > +SEC("encap_vxlan_eth") > +int __encap_vxlan_eth(struct __sk_buff *skb) > +{ > + if (skb->protocol == __bpf_constant_htons(ETH_P_IP)) > + return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP, > + ETH_P_TEB, EXTPROTO_VXLAN); > + else > + return TC_ACT_OK; > +} > + > SEC("encap_sit_none") > int __encap_sit_none(struct __sk_buff *skb) > { > @@ -444,6 +508,16 @@ int __encap_ip6udp_eth(struct __sk_buff *skb) > return TC_ACT_OK; > } > > +SEC("encap_ip6vxlan_eth") > +int __encap_ip6vxlan_eth(struct __sk_buff *skb) > +{ > + if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6)) > + return encap_ipv6_with_ext_proto(skb, IPPROTO_UDP, > + ETH_P_TEB, EXTPROTO_VXLAN); > + else > + return TC_ACT_OK; > +} > + > static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) > { > char buf[sizeof(struct v6hdr)]; > @@ -479,6 +553,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) > case ETH_OVER_UDP_PORT: > olen += ETH_HLEN; > break; > + case VXLAN_UDP_PORT: > + olen += ETH_HLEN + sizeof(struct vxlanhdr); > + break; > } > break; > default: > diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh 
b/tools/testing/selftests/bpf/test_tc_tunnel.sh > index 7c76b84..c9dde9b 100755 > --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh > +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh > @@ -44,8 +44,8 @@ setup() { > # clamp route to reserve room for tunnel headers > ip -netns "${ns1}" -4 route flush table main > ip -netns "${ns1}" -6 route flush table main > - ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1 > - ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1 > + ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1 > + ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1 > > sleep 1 > > @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then > echo "sit" > $0 ipv6 sit none 100 > > + echo "ip4 vxlan" > + $0 ipv4 vxlan eth 2000 > + > + echo "ip6 vxlan" > + $0 ipv6 ip6vxlan eth 2000 > + > for mac in none mpls eth ; do > echo "ip gre $mac" > $0 ipv4 gre $mac 100 > @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then > targs="encap fou encap-sport auto encap-dport $dport" > elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then > ttype=$gretaptype > +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then > + ttype="vxlan" > + targs="id 1 dstport 8472 udp6zerocsumrx" > else > ttype=$tuntype > targs="" > @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then > elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then > # No support for TEB fou tunnel; expect failure. > expect_tun_fail=1 > -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then > +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then > # Share ethernet address between tunnel/veth2 so L2 decap works. > ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \ > awk '/ether/ { print $2 }') > -- > 1.8.3.1 >