On Tue, Sep 29, 2020 at 11:23:03PM +0200, Daniel Borkmann wrote: [ ... ] > --- > include/linux/skbuff.h | 5 + > include/uapi/linux/bpf.h | 14 ++ > net/core/filter.c | 273 +++++++++++++++++++++++++++++++-- > tools/include/uapi/linux/bpf.h | 14 ++ > 4 files changed, 293 insertions(+), 13 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 04a18e01b362..3d0cf3722bb4 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) > return skb->mac_header != (typeof(skb->mac_header))~0U; > } > > +static inline void skb_unset_mac_header(struct sk_buff *skb) > +{ > + skb->mac_header = (typeof(skb->mac_header))~0U; > +} > + > static inline void skb_reset_mac_header(struct sk_buff *skb) > { > skb->mac_header = skb->data - skb->head; > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 6116a7f54c8f..1f17c6752deb 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3652,6 +3652,19 @@ union bpf_attr { > * associated socket instead of the current process. > * Return > * The id is returned or 0 in case the id could not be retrieved. > + * > + * long bpf_redirect_neigh(u32 ifindex, u64 flags) > + * Description > + * Redirect the packet to another net device of index *ifindex* > + * and fill in L2 addresses from neighboring subsystem. This helper > + * is somewhat similar to **bpf_redirect**\ (), except that it > + * fills in e.g. MAC addresses based on the L3 information from > + * the packet. This helper is supported for IPv4 and IPv6 protocols. > + * The *flags* argument is reserved and must be 0. The helper is > + * currently only supported for tc BPF program types. > + * Return > + * The helper returns **TC_ACT_REDIRECT** on success or > + * **TC_ACT_SHOT** on error. > */ > #define __BPF_FUNC_MAPPER(FN) \ > FN(unspec), \ > @@ -3806,6 +3819,7 @@ union bpf_attr { > FN(snprintf_btf), \ > FN(seq_printf_btf), \ > FN(skb_cgroup_classid), \ > + FN(redirect_neigh), \ > /* */ > > /* integer value in 'imm' field of BPF_CALL instruction selects which helper > diff --git a/net/core/filter.c b/net/core/filter.c > index a0776e48dcc9..14b1534f6b46 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -2163,6 +2163,222 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, > return __bpf_redirect_no_mac(skb, dev, flags); > } > > +#if IS_ENABLED(CONFIG_IPV6) > +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) > +{ > + struct dst_entry *dst = skb_dst(skb); > + struct net_device *dev = dst->dev; > + u32 hh_len = LL_RESERVED_SPACE(dev); > + const struct in6_addr *nexthop; > + struct neighbour *neigh; > + > + if (dev_xmit_recursion()) > + goto out_rec; > + > + skb->dev = dev; > + skb->tstamp = 0; > + > + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { > + struct sk_buff *skb2; > + > + skb2 = skb_realloc_headroom(skb, hh_len); > + if (!skb2) { > + kfree_skb(skb); > + return -ENOMEM; > + } > + if (skb->sk) > + skb_set_owner_w(skb2, skb->sk); > + consume_skb(skb); > + skb = skb2; > + } > + > + rcu_read_lock_bh(); > + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), > + &ipv6_hdr(skb)->daddr); > + neigh = ip_neigh_gw6(dev, nexthop); > + if (likely(!IS_ERR(neigh))) { > + int ret; > + > + sock_confirm_neigh(skb, neigh); > + dev_xmit_recursion_inc(); > + ret = neigh_output(neigh, skb, false); > + dev_xmit_recursion_dec(); > + rcu_read_unlock_bh(); > + return ret; > + } > + rcu_read_unlock_bh(); > + IP6_INC_STATS(dev_net(dst->dev), > + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); > +out_drop: > + kfree_skb(skb); > + return -EINVAL; > +out_rec: > + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); > + goto out_drop; nit. may be log this at the earlier "if (dev_xmit_recursion)" and then directly goto out_drop. > +} > + [ ... ] > +/* Internal, non-exposed redirect flags. */ > +enum { > + BPF_F_NEIGH = (1ULL << 1), > +}; It will be useful to ensure the future "flags" of BPF_FUNC_redirect will not overlap with this. May be a BUILD_BUG_ON? Others LGTM. Acked-by: Martin KaFai Lau <kafai@xxxxxx> > > int skb_do_redirect(struct sk_buff *skb) > { > struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); > struct net_device *dev; > + u32 flags = ri->flags; > > dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); > ri->tgt_index = 0; > @@ -2231,7 +2440,22 @@ int skb_do_redirect(struct sk_buff *skb) > return -EINVAL; > } > > - return __bpf_redirect(skb, dev, ri->flags); > + return flags & BPF_F_NEIGH ? > + __bpf_redirect_neigh(skb, dev) : > + __bpf_redirect(skb, dev, flags); > +}