On 9/28/20 7:38 AM, Daniel Borkmann wrote:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index a0776e48dcc9..64c6e5ec97d7 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
> +{
> +        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> +        struct net *net = dev_net(dev);
> +        int err, ret = NET_XMIT_DROP;
> +        struct flowi6 fl6 = {
> +                .flowi6_flags = FLOWI_FLAG_ANYSRC,
> +                .flowi6_mark = skb->mark,
> +                .flowlabel = ip6_flowinfo(ip6h),
> +                .flowi6_proto = ip6h->nexthdr,
> +                .flowi6_oif = dev->ifindex,
> +                .daddr = ip6h->daddr,
> +                .saddr = ip6h->saddr,
> +        };
> +        struct dst_entry *dst;
> +
> +        skb->dev = dev;

this is not needed here. You set dev in bpf_out_neigh_v6 to the dst dev.
Everything else is an error path where the skb is dropped.

> +        skb->tstamp = 0;
> +
> +        dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
> +        if (IS_ERR(dst))
> +                goto out_drop;
> +
> +        skb_dst_set(skb, dst);
> +
> +        err = bpf_out_neigh_v6(net, skb);
> +        if (unlikely(net_xmit_eval(err)))
> +                dev->stats.tx_errors++;
> +        else
> +                ret = NET_XMIT_SUCCESS;
> +        goto out_xmit;
> +out_drop:
> +        dev->stats.tx_errors++;
> +        kfree_skb(skb);
> +out_xmit:
> +        return ret;
> +}
> +#else
> +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
> +{
> +        kfree_skb(skb);
> +        return NET_XMIT_DROP;
> +}
> +#endif /* CONFIG_IPV6 */
> +
> +#if IS_ENABLED(CONFIG_INET)
> +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
> +{
> +        struct dst_entry *dst = skb_dst(skb);
> +        struct rtable *rt = container_of(dst, struct rtable, dst);
> +        struct net_device *dev = dst->dev;
> +        u32 hh_len = LL_RESERVED_SPACE(dev);
> +        struct neighbour *neigh;
> +        bool is_v6gw = false;
> +
> +        if (dev_xmit_recursion())
> +                goto out_rec;
> +        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {

Why is this check needed for v4 but not v6? (see the sketch at the end of
this mail)

> +                struct sk_buff *skb2;
> +
> +                skb2 = skb_realloc_headroom(skb, hh_len);
> +                if (!skb2) {
> +                        kfree_skb(skb);
> +                        return -ENOMEM;
> +                }
> +                if (skb->sk)
> +                        skb_set_owner_w(skb2, skb->sk);
> +                consume_skb(skb);
> +                skb = skb2;
> +        }
> +        rcu_read_lock_bh();
> +        neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
> +        if (likely(!IS_ERR(neigh))) {
> +                int ret;
> +
> +                sock_confirm_neigh(skb, neigh);
> +                dev_xmit_recursion_inc();
> +                ret = neigh_output(neigh, skb, is_v6gw);
> +                dev_xmit_recursion_dec();
> +                rcu_read_unlock_bh();
> +                return ret;
> +        }
> +        rcu_read_unlock_bh();
> +out_drop:
> +        kfree_skb(skb);
> +        return -EINVAL;
> +out_rec:
> +        net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
> +        goto out_drop;
> +}
> +
> +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
> +{
> +        const struct iphdr *ip4h = ip_hdr(skb);
> +        struct net *net = dev_net(dev);
> +        int err, ret = NET_XMIT_DROP;
> +        struct flowi4 fl4 = {
> +                .flowi4_flags = FLOWI_FLAG_ANYSRC,
> +                .flowi4_mark = skb->mark,
> +                .flowi4_tos = RT_TOS(ip4h->tos),

set flowi4_proto here.
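Presumably just mirroring the flowi6_proto assignment on the v6 side, i.e.
something like (untested sketch):

                .flowi4_proto = ip4h->protocol, /* L4 protocol, as v6 uses ip6h->nexthdr */
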
> +                .flowi4_oif = dev->ifindex,
> +                .daddr = ip4h->daddr,
> +                .saddr = ip4h->saddr,
> +        };
> +        struct rtable *rt;
> +
> +        skb->dev = dev;
> +        skb->tstamp = 0;
> +
> +        rt = ip_route_output_flow(net, &fl4, NULL);
> +        if (IS_ERR(rt))
> +                goto out_drop;
> +        if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
> +                ip_rt_put(rt);
> +                goto out_drop;
> +        }
> +
> +        skb_dst_set(skb, &rt->dst);
> +
> +        err = bpf_out_neigh_v4(net, skb);
> +        if (unlikely(net_xmit_eval(err)))
> +                dev->stats.tx_errors++;
> +        else
> +                ret = NET_XMIT_SUCCESS;
> +        goto out_xmit;
> +out_drop:
> +        dev->stats.tx_errors++;
> +        kfree_skb(skb);
> +out_xmit:
> +        return ret;
> +}
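
And on the headroom question above: if the expansion does turn out to be
needed on the v6 path as well, I'd expect bpf_out_neigh_v6 to grow the same
block as the v4 version, i.e. roughly this (untested, just the v4 hunk
transplanted):

        u32 hh_len = LL_RESERVED_SPACE(dev);

        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                /* expand headroom for the L2 header, preserving skb ownership */
                skb2 = skb_realloc_headroom(skb, hh_len);
                if (!skb2) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }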