Re: [PATCH bpf-next v3 3/6] bpf: add redirect_neigh helper as redirect drop-in

Martin KaFai Lau <kafai@xxxxxx> · Tue, 29 Sep 2020 23:48:11 -0700

On Tue, Sep 29, 2020 at 11:23:03PM +0200, Daniel Borkmann wrote:

[ ... ]

> ---
>  include/linux/skbuff.h         |   5 +
>  include/uapi/linux/bpf.h       |  14 ++
>  net/core/filter.c              | 273 +++++++++++++++++++++++++++++++--
>  tools/include/uapi/linux/bpf.h |  14 ++
>  4 files changed, 293 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 04a18e01b362..3d0cf3722bb4 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb)
>  	return skb->mac_header != (typeof(skb->mac_header))~0U;
>  }
>  
> +static inline void skb_unset_mac_header(struct sk_buff *skb)
> +{
> +	skb->mac_header = (typeof(skb->mac_header))~0U;
> +}
> +
>  static inline void skb_reset_mac_header(struct sk_buff *skb)
>  {
>  	skb->mac_header = skb->data - skb->head;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 6116a7f54c8f..1f17c6752deb 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3652,6 +3652,19 @@ union bpf_attr {
>   * 		associated socket instead of the current process.
>   * 	Return
>   * 		The id is returned or 0 in case the id could not be retrieved.
> + *
> + * long bpf_redirect_neigh(u32 ifindex, u64 flags)
> + * 	Description
> + * 		Redirect the packet to another net device of index *ifindex*
> + * 		and fill in L2 addresses from neighboring subsystem. This helper
> + * 		is somewhat similar to **bpf_redirect**\ (), except that it
> + * 		fills in e.g. MAC addresses based on the L3 information from
> + * 		the packet. This helper is supported for IPv4 and IPv6 protocols.
> + * 		The *flags* argument is reserved and must be 0. The helper is
> + * 		currently only supported for tc BPF program types.
> + * 	Return
> + * 		The helper returns **TC_ACT_REDIRECT** on success or
> + * 		**TC_ACT_SHOT** on error.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -3806,6 +3819,7 @@ union bpf_attr {
>  	FN(snprintf_btf),		\
>  	FN(seq_printf_btf),		\
>  	FN(skb_cgroup_classid),		\
> +	FN(redirect_neigh),		\
>  	/* */
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/net/core/filter.c b/net/core/filter.c
> index a0776e48dcc9..14b1534f6b46 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2163,6 +2163,222 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
>  		return __bpf_redirect_no_mac(skb, dev, flags);
>  }
>  
> +#if IS_ENABLED(CONFIG_IPV6)
> +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
> +{
> +	struct dst_entry *dst = skb_dst(skb);
> +	struct net_device *dev = dst->dev;
> +	u32 hh_len = LL_RESERVED_SPACE(dev);
> +	const struct in6_addr *nexthop;
> +	struct neighbour *neigh;
> +
> +	if (dev_xmit_recursion())
> +		goto out_rec;
> +
> +	skb->dev = dev;
> +	skb->tstamp = 0;
> +
> +	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
> +		struct sk_buff *skb2;
> +
> +		skb2 = skb_realloc_headroom(skb, hh_len);
> +		if (!skb2) {
> +			kfree_skb(skb);
> +			return -ENOMEM;
> +		}
> +		if (skb->sk)
> +			skb_set_owner_w(skb2, skb->sk);
> +		consume_skb(skb);
> +		skb = skb2;
> +	}
> +
> +	rcu_read_lock_bh();
> +	nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
> +			      &ipv6_hdr(skb)->daddr);
> +	neigh = ip_neigh_gw6(dev, nexthop);
> +	if (likely(!IS_ERR(neigh))) {
> +		int ret;
> +
> +		sock_confirm_neigh(skb, neigh);
> +		dev_xmit_recursion_inc();
> +		ret = neigh_output(neigh, skb, false);
> +		dev_xmit_recursion_dec();
> +		rcu_read_unlock_bh();
> +		return ret;
> +	}
> +	rcu_read_unlock_bh();
> +	IP6_INC_STATS(dev_net(dst->dev),
> +		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
> +out_drop:
> +	kfree_skb(skb);
> +	return -EINVAL;
> +out_rec:
> +	net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
> +	goto out_drop;
nit. may be log this at the earlier "if (dev_xmit_recursion)" and
then directly goto out_drop.

> +}
> +

[ ... ]

> +/* Internal, non-exposed redirect flags. */
> +enum {
> +	BPF_F_NEIGH = (1ULL << 1),
> +};
It will be useful to ensure the future "flags" of BPF_FUNC_redirect
will not overlap with this.  May be a BUILD_BUG_ON?

Others LGTM.

Acked-by: Martin KaFai Lau <kafai@xxxxxx>

>  
>  int skb_do_redirect(struct sk_buff *skb)
>  {
>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>  	struct net_device *dev;
> +	u32 flags = ri->flags;
>  
>  	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
>  	ri->tgt_index = 0;
> @@ -2231,7 +2440,22 @@ int skb_do_redirect(struct sk_buff *skb)
>  		return -EINVAL;
>  	}
>  
> -	return __bpf_redirect(skb, dev, ri->flags);
> +	return flags & BPF_F_NEIGH ?
> +	       __bpf_redirect_neigh(skb, dev) :
> +	       __bpf_redirect(skb, dev, flags);
> +}