Re: [PATCH bpf-next 2/2] xdp: Use bulking for non-map XDP_REDIRECT

Björn Töpel <bjorn.topel@xxxxxxxxx> · Fri, 10 Jan 2020 16:15:00 +0100

On Fri, 10 Jan 2020 at 15:22, Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
>
> From: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>
>
> Since the bulk queue used by XDP_REDIRECT now lives in struct net_device,
> we can re-use the bulking for the non-map version of the bpf_redirect()
> helper. This is a simple matter of having xdp_do_redirect_slow() queue the
> frame on the bulk queue instead of sending it out with __bpf_tx_xdp().
>
> Unfortunately we can't make the bpf_redirect() helper return an error if
> the ifindex doesn't exist (as bpf_redirect_map() does), because we don't
> have a reference to the network namespace of the ingress device at the time
> the helper is called. So we have to leave it as-is and keep the device
> lookup in xdp_do_redirect_slow().
>
> With this change, the performance of the xdp_redirect sample program goes
> from 5Mpps to 8.4Mpps (a 68% increase).
>

After these changes, does the noinline on xdp_do_redirect_slow() (commit
47b123ed9e99 ("xdp: split code for map vs non-map redirect")) still make
sense?

> Signed-off-by: Toke Høiland-Jørgensen <toke@xxxxxxxxxx>
> ---
>  include/linux/bpf.h |   13 +++++++++++--
>  kernel/bpf/devmap.c |   31 ++++++++++++++++++++++---------
>  net/core/filter.c   |   30 ++----------------------------
>  3 files changed, 35 insertions(+), 39 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index b14e51d56a82..25c050202536 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -962,7 +962,9 @@ struct sk_buff;
>
>  struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
>  struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
> -void __dev_map_flush(void);
> +void __dev_flush(void);
> +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
> +                   struct net_device *dev_rx);
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>                     struct net_device *dev_rx);
>  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
> @@ -1071,13 +1073,20 @@ static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map
>         return NULL;
>  }
>
> -static inline void __dev_map_flush(void)
> +static inline void __dev_flush(void)
>  {
>  }
>
>  struct xdp_buff;
>  struct bpf_dtab_netdev;
>
> +static inline
> +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
> +                   struct net_device *dev_rx)
> +{
> +       return 0;
> +}
> +
>  static inline
>  int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>                     struct net_device *dev_rx)
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index bcb05cb6b728..adbb82770d02 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -81,7 +81,7 @@ struct bpf_dtab {
>         u32 n_buckets;
>  };
>
> -static DEFINE_PER_CPU(struct list_head, dev_map_flush_list);
> +static DEFINE_PER_CPU(struct list_head, dev_flush_list);
>  static DEFINE_SPINLOCK(dev_map_lock);
>  static LIST_HEAD(dev_map_list);
>
> @@ -357,16 +357,16 @@ static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
>         goto out;
>  }
>
> -/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
> +/* __dev_flush is called from xdp_do_flush_map() which _must_ be signaled
>   * from the driver before returning from its napi->poll() routine. The poll()
>   * routine is called either from busy_poll context or net_rx_action signaled
>   * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
>   * net device can be torn down. On devmap tear down we ensure the flush list
>   * is empty before completing to ensure all flush operations have completed.
>   */
> -void __dev_map_flush(void)
> +void __dev_flush(void)
>  {
> -       struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list);
> +       struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
>         struct xdp_dev_bulk_queue *bq, *tmp;
>
>         rcu_read_lock();
> @@ -398,7 +398,7 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>                       struct net_device *dev_rx)
>
>  {
> -       struct list_head *flush_list = this_cpu_ptr(&dev_map_flush_list);
> +       struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
>         struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
>
>         if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> @@ -419,10 +419,9 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
>         return 0;
>  }
>
> -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> -                   struct net_device *dev_rx)
> +static inline int _xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
> +                              struct net_device *dev_rx)
>  {
> -       struct net_device *dev = dst->dev;
>         struct xdp_frame *xdpf;
>         int err;
>
> @@ -440,6 +439,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
>         return bq_enqueue(dev, xdpf, dev_rx);
>  }
>
> +int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
> +                   struct net_device *dev_rx)
> +{
> +       return _xdp_enqueue(dev, xdp, dev_rx);
> +}
> +

dev_xdp_enqueue() and dev_map_enqueue() are *very* similar. Can these be
combined, and maybe fold xdp_do_redirect_slow() into
xdp_do_redirect_map()? OTOH the tracepoints (TP) are different, so maybe
combining the two functions will be messy... It's only that with your
changes the map/ifindex redirects are very similar. Just an idea, might
be messy. :-P

> +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
> +                   struct net_device *dev_rx)
> +{
> +       struct net_device *dev = dst->dev;
> +
> +       return _xdp_enqueue(dev, xdp, dev_rx);
> +}
> +
>  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
>                              struct bpf_prog *xdp_prog)
>  {
> @@ -760,7 +773,7 @@ static int __init dev_map_init(void)
>         register_netdevice_notifier(&dev_map_notifier);
>
>         for_each_possible_cpu(cpu)
> -               INIT_LIST_HEAD(&per_cpu(dev_map_flush_list, cpu));
> +               INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
>         return 0;
>  }
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 42fd17c48c5f..550488162fe1 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3458,32 +3458,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
>         .arg2_type      = ARG_ANYTHING,
>  };
>
> -static int __bpf_tx_xdp(struct net_device *dev,
> -                       struct bpf_map *map,
> -                       struct xdp_buff *xdp,
> -                       u32 index)
> -{
> -       struct xdp_frame *xdpf;
> -       int err, sent;
> -
> -       if (!dev->netdev_ops->ndo_xdp_xmit) {
> -               return -EOPNOTSUPP;
> -       }
> -
> -       err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
> -       if (unlikely(err))
> -               return err;
> -
> -       xdpf = convert_to_xdp_frame(xdp);
> -       if (unlikely(!xdpf))
> -               return -EOVERFLOW;
> -
> -       sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
> -       if (sent <= 0)
> -               return sent;
> -       return 0;
> -}
> -
>  static noinline int
>  xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
>                      struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
> @@ -3499,7 +3473,7 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
>                 goto err;
>         }
>
> -       err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
> +       err = dev_xdp_enqueue(fwd, xdp, dev);
>         if (unlikely(err))
>                 goto err;
>
> @@ -3529,7 +3503,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>
>  void xdp_do_flush_map(void)
>  {
> -       __dev_map_flush();
> +       __dev_flush();
>         __cpu_map_flush();
>         __xsk_map_flush();
>  }
>