On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@xxxxxxxxx> wrote: > > From: Xuesen Huang <huangxuesen@xxxxxxxxxxxx> > > bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets > encapsulation. But that is not appropriate when pushing Ethernet header. > > Add an option to further specify encap L2 type and set the inner_protocol > as ETH_P_TEB. > > Update test_tc_tunnel to verify adding vxlan encapsulation works with > this flag. > > Suggested-by: Willem de Bruijn <willemb@xxxxxxxxxx> > Signed-off-by: Xuesen Huang <huangxuesen@xxxxxxxxxxxx> > Signed-off-by: Zhiyong Cheng <chengzhiyong@xxxxxxxxxxxx> > Signed-off-by: Li Wang <wangli09@xxxxxxxxxxxx> Thanks for adding the test. Perhaps that is better in a separate patch? Overall looks great to me. The patch has not (yet?) arrived on patchwork. > enum { > diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > index 37bce7a..6e144db 100644 > --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c > @@ -20,6 +20,14 @@ > #include <bpf/bpf_endian.h> > #include <bpf/bpf_helpers.h> > > +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0) > + > +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__) > + > +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0) > + > +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__) > + Instead of untyped macros, I'd define encap_ipv4 as a function that calls __encap_ipv4. And there is no need for encap_ipv4_with_ext_proto, which is equivalent to calling __encap_ipv4 directly. 
> static const int cfg_port = 8000; > > static const int cfg_udp_src = 20000; > @@ -27,11 +35,24 @@ > #define UDP_PORT 5555 > #define MPLS_OVER_UDP_PORT 6635 > #define ETH_OVER_UDP_PORT 7777 > +#define VXLAN_UDP_PORT 8472 > + > +#define EXTPROTO_VXLAN 0x1 > + > +#define VXLAN_N_VID (1u << 24) > +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8) > +#define VXLAN_FLAGS 0x8 > +#define VXLAN_VNI 1 > > /* MPLS label 1000 with S bit (last label) set and ttl of 255. */ > static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | > MPLS_LS_S_MASK | 0xff); > > +struct vxlanhdr { > + __be32 vx_flags; > + __be32 vx_vni; > +} __attribute__((packed)); > + > struct gre_hdr { > __be16 flags; > __be16 protocol; > @@ -45,13 +66,13 @@ struct gre_hdr { > struct v4hdr { > struct iphdr ip; > union l4hdr l4hdr; > - __u8 pad[16]; /* enough space for L2 header */ > + __u8 pad[24]; /* space for L2 header / vxlan header ... */ Could we use something like sizeof(...) instead of a hard-coded constant? > @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto, > } > > /* add L2 encap (if specified) */ > + l2_hdr = (__u8 *)&h_outer + olen; > switch (l2_proto) { > case ETH_P_MPLS_UC: > - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; > + *(__u32 *)l2_hdr = mpls_label; > break; > case ETH_P_TEB: > - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, > - ETH_HLEN)) Is this non-standard indentation? Here and elsewhere. 
> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > break; > case ETH_P_TEB: > l2_len = ETH_HLEN; > - udp_dst = ETH_OVER_UDP_PORT; > + if (ext_proto & EXTPROTO_VXLAN) { > + udp_dst = VXLAN_UDP_PORT; > + l2_len += sizeof(struct vxlanhdr); > + } else > + udp_dst = ETH_OVER_UDP_PORT; > break; > } > flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len); > @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src); > h_outer.l4hdr.udp.dest = bpf_htons(udp_dst); > tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) + > - sizeof(h_outer.l4hdr.udp); > + sizeof(h_outer.l4hdr.udp) + l2_len; Was this a bug previously? > h_outer.l4hdr.udp.check = 0; > h_outer.l4hdr.udp.len = bpf_htons(tot_len); > break; > @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, > } > > /* add L2 encap (if specified) */ > + l2_hdr = (__u8 *)&h_outer + olen; > switch (l2_proto) { > case ETH_P_MPLS_UC: > - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label; > + *(__u32 *)l2_hdr = mpls_label; > break; > case ETH_P_TEB: > - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen, > - ETH_HLEN)) > + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH; This also changes behavior for the existing case. Correctly so, I imagine. But did the test previously pass with the wrong protocol?