Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Thanks Cong!

Thanks to your suggestion, I try to add a simple test case to test_tc_tunnel. It works 
for me :)

Thanks for your review.

> 2021年3月3日 下午8:33,Xuesen Huang <hxseverything@xxxxxxxxx> 写道:
> 
> From: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
> 
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
> 
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
> 
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
> 
> Suggested-by: Willem de Bruijn <willemb@xxxxxxxxxx>
> Signed-off-by: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
> Signed-off-by: Zhiyong Cheng <chengzhiyong@xxxxxxxxxxxx>
> Signed-off-by: Li Wang <wangli09@xxxxxxxxxxxx>
> ---
> include/uapi/linux/bpf.h                           |   5 +
> net/core/filter.c                                  |  11 ++-
> tools/include/uapi/linux/bpf.h                     |   5 +
> tools/testing/selftests/bpf/progs/test_tc_tunnel.c | 107 ++++++++++++++++++---
> tools/testing/selftests/bpf/test_tc_tunnel.sh      |  15 ++-
> 5 files changed, 124 insertions(+), 19 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 77d7c1b..d791596 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
>  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
>  *		  type; *len* is the length of the inner MAC header.
>  *
> + *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
> + *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
> + *		  L2 type as Ethernet.
> + *
>  * 		A call to this helper is susceptible to change the underlying
>  * 		packet buffer. Therefore, at load time, all checks on pointers
>  * 		previously done by the verifier are invalidated and must be
> @@ -4088,6 +4092,7 @@ enum {
> 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
> 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
> 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
> +	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
> };
> 
> enum {
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 255aeee..8d1fb61 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3412,6 +3412,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
> 					 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
> +					 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
> 					 BPF_F_ADJ_ROOM_ENCAP_L2( \
> 					  BPF_ADJ_ROOM_ENCAP_L2_MASK))
> 
> @@ -3448,6 +3449,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
> 		    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
> 			return -EINVAL;
> 
> +		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
> +		    inner_mac_len < ETH_HLEN)
> +			return -EINVAL;
> +
> 		if (skb->encapsulation)
> 			return -EALREADY;
> 
> @@ -3466,7 +3471,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
> 		skb->inner_mac_header = inner_net - inner_mac_len;
> 		skb->inner_network_header = inner_net;
> 		skb->inner_transport_header = inner_trans;
> -		skb_set_inner_protocol(skb, skb->protocol);
> +
> +		if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
> +			skb_set_inner_protocol(skb, htons(ETH_P_TEB));
> +		else
> +			skb_set_inner_protocol(skb, skb->protocol);
> 
> 		skb->encapsulation = 1;
> 		skb_set_network_header(skb, mac_len);
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 77d7c1b..d791596 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -1751,6 +1751,10 @@ struct bpf_stack_build_id {
>  *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
>  *		  type; *len* is the length of the inner MAC header.
>  *
> + *		* **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
> + *		  Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
> + *		  L2 type as Ethernet.
> + *
>  * 		A call to this helper is susceptible to change the underlying
>  * 		packet buffer. Therefore, at load time, all checks on pointers
>  * 		previously done by the verifier are invalidated and must be
> @@ -4088,6 +4092,7 @@ enum {
> 	BPF_F_ADJ_ROOM_ENCAP_L4_GRE	= (1ULL << 3),
> 	BPF_F_ADJ_ROOM_ENCAP_L4_UDP	= (1ULL << 4),
> 	BPF_F_ADJ_ROOM_NO_CSUM_RESET	= (1ULL << 5),
> +	BPF_F_ADJ_ROOM_ENCAP_L2_ETH	= (1ULL << 6),
> };
> 
> enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
> #include <bpf/bpf_endian.h>
> #include <bpf/bpf_helpers.h>
> 
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +
> static const int cfg_port = 8000;
> 
> static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
> #define	UDP_PORT		5555
> #define	MPLS_OVER_UDP_PORT	6635
> #define	ETH_OVER_UDP_PORT	7777
> +#define	VXLAN_UDP_PORT		8472
> +
> +#define	EXTPROTO_VXLAN	0x1
> +
> +#define	VXLAN_N_VID     (1u << 24)
> +#define	VXLAN_VNI_MASK	bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define	VXLAN_FLAGS     0x8
> +#define	VXLAN_VNI       1
> 
> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
> 						     MPLS_LS_S_MASK | 0xff);
> 
> +struct vxlanhdr {
> +	__be32 vx_flags;
> +	__be32 vx_vni;
> +} __attribute__((packed));
> +
> struct gre_hdr {
> 	__be16 flags;
> 	__be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
> struct v4hdr {
> 	struct iphdr ip;
> 	union l4hdr l4hdr;
> -	__u8 pad[16];			/* enough space for L2 header */
> +	__u8 pad[24];			/* space for L2 header / vxlan header ... */
> } __attribute__((packed));
> 
> struct v6hdr {
> 	struct ipv6hdr ip;
> 	union l4hdr l4hdr;
> -	__u8 pad[16];			/* enough space for L2 header */
> +	__u8 pad[24];			/* space for L2 header / vxlan header ... */
> } __attribute__((packed));
> 
> static __always_inline void set_ipv4_csum(struct iphdr *iph)
> @@ -69,14 +90,15 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
> 	iph->check = ~((csum & 0xffff) + (csum >> 16));
> }
> 
> -static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> -				      __u16 l2_proto)
> +static __always_inline int __encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> +				      __u16 l2_proto, __u16 ext_proto)
> {
> 	__u16 udp_dst = UDP_PORT;
> 	struct iphdr iph_inner;
> 	struct v4hdr h_outer;
> 	struct tcphdr tcph;
> 	int olen, l2_len;
> +	__u8 *l2_hdr = NULL;
> 	int tcp_off;
> 	__u64 flags;
> 
> @@ -141,7 +163,11 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 		break;
> 	case ETH_P_TEB:
> 		l2_len = ETH_HLEN;
> -		udp_dst = ETH_OVER_UDP_PORT;
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			udp_dst = VXLAN_UDP_PORT;
> +			l2_len += sizeof(struct vxlanhdr);
> +		} else
> +			udp_dst = ETH_OVER_UDP_PORT;
> 		break;
> 	}
> 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 	}
> 
> 	/* add L2 encap (if specified) */
> +	l2_hdr = (__u8 *)&h_outer + olen;
> 	switch (l2_proto) {
> 	case ETH_P_MPLS_UC:
> -		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +		*(__u32 *)l2_hdr = mpls_label;
> 		break;
> 	case ETH_P_TEB:
> -		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -				       ETH_HLEN))
> +		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> +
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
> +
> +			vxlan_hdr->vx_flags = VXLAN_FLAGS;
> +			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
> +
> +			l2_hdr += sizeof(struct vxlanhdr);
> +		}
> +
> +		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
> 			return TC_ACT_SHOT;
> +
> 		break;
> 	}
> 	olen += l2_len;
> @@ -214,14 +252,15 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> 	return TC_ACT_OK;
> }
> 
> -static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> -				      __u16 l2_proto)
> +static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> +				      __u16 l2_proto, __u16 ext_proto)
> {
> 	__u16 udp_dst = UDP_PORT;
> 	struct ipv6hdr iph_inner;
> 	struct v6hdr h_outer;
> 	struct tcphdr tcph;
> 	int olen, l2_len;
> +	__u8 *l2_hdr = NULL;
> 	__u16 tot_len;
> 	__u64 flags;
> 
> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 		break;
> 	case ETH_P_TEB:
> 		l2_len = ETH_HLEN;
> -		udp_dst = ETH_OVER_UDP_PORT;
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			udp_dst = VXLAN_UDP_PORT;
> +			l2_len += sizeof(struct vxlanhdr);
> +		} else
> +			udp_dst = ETH_OVER_UDP_PORT;
> 		break;
> 	}
> 	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
> 		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
> 		tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> -			  sizeof(h_outer.l4hdr.udp);
> +			  sizeof(h_outer.l4hdr.udp) + l2_len;
> 		h_outer.l4hdr.udp.check = 0;
> 		h_outer.l4hdr.udp.len = bpf_htons(tot_len);
> 		break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> 	}
> 
> 	/* add L2 encap (if specified) */
> +	l2_hdr = (__u8 *)&h_outer + olen;
> 	switch (l2_proto) {
> 	case ETH_P_MPLS_UC:
> -		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> +		*(__u32 *)l2_hdr = mpls_label;
> 		break;
> 	case ETH_P_TEB:
> -		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> -				       ETH_HLEN))
> +		flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;
> +
> +		if (ext_proto & EXTPROTO_VXLAN) {
> +			struct vxlanhdr *vxlan_hdr = (struct vxlanhdr *)l2_hdr;
> +
> +			vxlan_hdr->vx_flags = VXLAN_FLAGS;
> +			vxlan_hdr->vx_vni = bpf_htonl((VXLAN_VNI & VXLAN_VNI_MASK) << 8);
> +
> +			l2_hdr += sizeof(struct vxlanhdr);
> +		}
> +
> +		if (bpf_skb_load_bytes(skb, 0, l2_hdr, ETH_HLEN))
> 			return TC_ACT_SHOT;
> 		break;
> 	}
> @@ -372,6 +426,16 @@ int __encap_udp_eth(struct __sk_buff *skb)
> 		return TC_ACT_OK;
> }
> 
> +SEC("encap_vxlan_eth")
> +int __encap_vxlan_eth(struct __sk_buff *skb)
> +{
> +	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
> +		return encap_ipv4_with_ext_proto(skb, IPPROTO_UDP,
> +				ETH_P_TEB, EXTPROTO_VXLAN);
> +	else
> +		return TC_ACT_OK;
> +}
> +
> SEC("encap_sit_none")
> int __encap_sit_none(struct __sk_buff *skb)
> {
> @@ -444,6 +508,16 @@ int __encap_ip6udp_eth(struct __sk_buff *skb)
> 		return TC_ACT_OK;
> }
> 
> +SEC("encap_ip6vxlan_eth")
> +int __encap_ip6vxlan_eth(struct __sk_buff *skb)
> +{
> +	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
> +		return encap_ipv6_with_ext_proto(skb, IPPROTO_UDP,
> +				ETH_P_TEB, EXTPROTO_VXLAN);
> +	else
> +		return TC_ACT_OK;
> +}
> +
> static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
> {
> 	char buf[sizeof(struct v6hdr)];
> @@ -479,6 +553,9 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
> 		case ETH_OVER_UDP_PORT:
> 			olen += ETH_HLEN;
> 			break;
> +		case VXLAN_UDP_PORT:
> +			olen += ETH_HLEN + sizeof(struct vxlanhdr);
> +			break;
> 		}
> 		break;
> 	default:
> diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
> index 7c76b84..c9dde9b 100755
> --- a/tools/testing/selftests/bpf/test_tc_tunnel.sh
> +++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
> @@ -44,8 +44,8 @@ setup() {
> 	# clamp route to reserve room for tunnel headers
> 	ip -netns "${ns1}" -4 route flush table main
> 	ip -netns "${ns1}" -6 route flush table main
> -	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
> -	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
> +	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1450 dev veth1
> +	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1430 dev veth1
> 
> 	sleep 1
> 
> @@ -105,6 +105,12 @@ if [[ "$#" -eq "0" ]]; then
> 	echo "sit"
> 	$0 ipv6 sit none 100
> 
> +	echo "ip4 vxlan"
> +	$0 ipv4 vxlan eth 2000
> +
> +	echo "ip6 vxlan"
> +	$0 ipv6 ip6vxlan eth 2000
> +
> 	for mac in none mpls eth ; do
> 		echo "ip gre $mac"
> 		$0 ipv4 gre $mac 100
> @@ -214,6 +220,9 @@ if [[ "$tuntype" =~ "udp" ]]; then
> 	targs="encap fou encap-sport auto encap-dport $dport"
> elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
> 	ttype=$gretaptype
> +elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
> +	ttype="vxlan"
> +	targs="id 1 dstport 8472 udp6zerocsumrx"
> else
> 	ttype=$tuntype
> 	targs=""
> @@ -242,7 +251,7 @@ if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
> elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
> 	# No support for TEB fou tunnel; expect failure.
> 	expect_tun_fail=1
> -elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
> +elif [[ "$tuntype" =~ (gre|vxlan) && "$mac" == "eth" ]]; then
> 	# Share ethernet address between tunnel/veth2 so L2 decap works.
> 	ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
> 		  awk '/ether/ { print $2 }')
> -- 
> 1.8.3.1
> 





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux