On 2019-10-08 08:16, Sridhar Samudrala wrote:
Introduce a flag that can be specified during the bind() call
of an AF_XDP socket to receive packets directly from a queue when there is
no XDP program attached to the device.
This is enabled by introducing a special BPF prog pointer called
BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified
when binding an AF_XDP socket to a queue. At the time of bind(), an AF_XDP
socket in XDP_DIRECT mode, will attach BPF_PROG_DIRECT_XSK as a bpf program
if there is no other XDP program attached to the device. The device receive
queue is also associated with the AF_XDP socket.
In the XDP receive path, if the bpf program is a DIRECT_XSK program, the
XDP buffer is passed to the AF_XDP socket that is associated with the
receive queue on which the packet is received.
To avoid any overhead for nomal XDP programs, a static key is used to keep
a count of AF_XDP direct xsk sockets and static_branch_unlikely() is used
to handle receives for direct XDP sockets.
Any attach of a normal XDP program will take precedence and the direct xsk
program will be removed. The direct XSK program will be attached
automatically when the normal XDP program is removed when there are any
AF_XDP direct sockets associated with that device.
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@xxxxxxxxx>
---
include/linux/filter.h | 18 ++++++++++++
include/linux/netdevice.h | 10 +++++++
include/net/xdp_sock.h | 5 ++++
include/uapi/linux/if_xdp.h | 5 ++++
kernel/bpf/syscall.c | 7 +++--
net/core/dev.c | 50 +++++++++++++++++++++++++++++++++
net/core/filter.c | 58 +++++++++++++++++++++++++++++++++++++++
net/xdp/xsk.c | 51 ++++++++++++++++++++++++++++++++--
tools/include/uapi/linux/if_xdp.h | 5 ++++
9 files changed, 204 insertions(+), 5 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2ce57645f3cd..db4ad85d8321 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -585,6 +585,9 @@ struct bpf_redirect_info {
struct bpf_map *map;
struct bpf_map *map_to_flush;
u32 kern_flags;
+#ifdef CONFIG_XDP_SOCKETS
+ struct xdp_sock *xsk;
+#endif
};
DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
@@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
return res;
}
+#ifdef CONFIG_XDP_SOCKETS
+#define BPF_PROG_DIRECT_XSK 0x1
+#define bpf_is_direct_xsk_prog(prog) \
+ ((unsigned long)prog == BPF_PROG_DIRECT_XSK)
+DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
+u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp);
+#else
+#define bpf_is_direct_xsk_prog(prog) (false)
+#endif
+
static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
struct xdp_buff *xdp)
{
@@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
* already takes rcu_read_lock() when fetching the program, so
* it's not necessary here anymore.
*/
+#ifdef CONFIG_XDP_SOCKETS
+ if (static_branch_unlikely(&xdp_direct_xsk_needed) &&
+ bpf_is_direct_xsk_prog(prog))
+ return bpf_direct_xsk(prog, xdp);
+#endif
return BPF_PROG_RUN(prog, xdp);
}
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48cc71aae466..f4d0f70aa718 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -743,6 +743,7 @@ struct netdev_rx_queue {
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_XDP_SOCKETS
struct xdp_umem *umem;
+ struct xdp_sock *xsk;
#endif
} ____cacheline_aligned_in_smp;
@@ -1836,6 +1837,10 @@ struct net_device {
atomic_t carrier_up_count;
atomic_t carrier_down_count;
+#ifdef CONFIG_XDP_SOCKETS
+ u16 direct_xsk_count;
Why u16? num_rx/tx_queues are unsigned ints.
+#endif
+
#ifdef CONFIG_WIRELESS_EXT
const struct iw_handler_def *wireless_handlers;
struct iw_public_data *wireless_data;
@@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
const struct sk_buff *skb);
+#ifdef CONFIG_XDP_SOCKETS
+int dev_set_direct_xsk_prog(struct net_device *dev);
+int dev_clear_direct_xsk_prog(struct net_device *dev);
+#endif
+
static __always_inline int ____dev_forward_skb(struct net_device *dev,
struct sk_buff *skb)
{
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index c9398ce7960f..9158233d34e1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -76,6 +76,9 @@ struct xsk_map_node {
struct xdp_sock **map_entry;
};
+/* Flags for the xdp_sock flags field. */
+#define XDP_SOCK_DIRECT (1 << 0)
+
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@@ -104,6 +107,7 @@ struct xdp_sock {
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
+ u16 flags;
Right now only the XDP_DIRECT is tracked here. Maybe track all flags,
and show them in xsk_diag.
};
struct xdp_buff;
@@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id);
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index be328c59389d..d202b5d40859 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -25,6 +25,11 @@
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
+/* This option allows an AF_XDP socket bound to a queue to receive all
+ * the packets directly from that queue when there is no XDP program
+ * attached to the device.
+ */
+#define XDP_DIRECT (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 205f95af67d2..871d738a78d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
void bpf_prog_put(struct bpf_prog *prog)
{
- __bpf_prog_put(prog, true);
+ if (!bpf_is_direct_xsk_prog(prog))
+ __bpf_prog_put(prog, true);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
u32 bpf_get_prog_id(const struct bpf_prog *prog)
{
- if (prog)
+ if (prog && !bpf_is_direct_xsk_prog(prog))
return prog->aux->id;
return 0;
@@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id);
void bpf_set_prog_id(struct bpf_prog *prog, u32 id)
{
- if (prog)
+ if (prog && !bpf_is_direct_xsk_prog(prog))
prog->aux->id = id;
}
EXPORT_SYMBOL(bpf_set_prog_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 866d0ad936a5..eb3cd718e580 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
} else {
if (!__dev_xdp_query(dev, bpf_op, query))
return 0;
+#ifdef CONFIG_XDP_SOCKETS
+ if (dev->direct_xsk_count)
+ prog = (void *)BPF_PROG_DIRECT_XSK;
+#endif
Nit, but maybe hide this weirdness in a function?
}
err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return err;
}
+#ifdef CONFIG_XDP_SOCKETS
+int dev_set_direct_xsk_prog(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct bpf_prog *prog;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ dev->direct_xsk_count++;
+
+ bpf_op = ops->ndo_bpf;
+ if (!bpf_op)
+ return -EOPNOTSUPP;
+
+ if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
+ return 0;
+
+ prog = (void *)BPF_PROG_DIRECT_XSK;
+
+ return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog);
+}
+
+int dev_clear_direct_xsk_prog(struct net_device *dev)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ dev->direct_xsk_count--;
+
+ if (dev->direct_xsk_count)
+ return 0;
+
+ bpf_op = ops->ndo_bpf;
+ if (!bpf_op)
+ return -EOPNOTSUPP;
+
+ if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
+ return 0;
+
+ return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL);
+}
+#endif
+
/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..391d7d600284 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
+#include <linux/static_key.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
return 0;
}
+#ifdef CONFIG_XDP_SOCKETS
+static void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
+{
+ struct xdp_sock *xsk = READ_ONCE(ri->xsk);
Why READ_ONCE here?
+
+ if (xsk) {
+ ri->xsk = NULL;
+ xsk_flush(xsk);
+ }
+}
+#else
+static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
+{
+}
+#endif
+
Move CONFIG_XDP_SOCKETS into the function, and remove the empty/bottom one.
void xdp_do_flush_map(void)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void)
break;
}
}
+
+ xdp_do_flush_xsk(ri);
}
EXPORT_SYMBOL_GPL(xdp_do_flush_map);
@@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
return err;
}
+#ifdef CONFIG_XDP_SOCKETS
+static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
+{
+ return READ_ONCE(ri->xsk);
Again, why READ_ONCE? Please leave the inlining to the compiler in .c files.
+}
+#else
+static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
+{
+ return NULL;
+}
+#endif
+
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
+ struct xdp_sock *xsk;
+
+ xsk = xdp_get_direct_xsk(ri);
+ if (xsk)
+ return xsk_rcv(xsk, xdp);
Hmm, maybe you need a xsk_to_flush as well. Say that a user swaps in a
regular XDP program, then xsk_rcv() will be called until the flush
occurs, right? IOW, all packets processed (napi budget) in the napi_poll
will end up in the socket.
if (likely(map))
return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
@@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
+
+#ifdef CONFIG_XDP_SOCKETS
+DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
+EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed);
+
+u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
+{
+ struct xdp_sock *xsk;
+
+ xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
+ if (xsk) {
+ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+ ri->xsk = xsk;
+ return XDP_REDIRECT;
From the comment above. I *think* you need to ri->xsk_to_flush. Can the
direct socket (swap socket) change before flush?
+ }
+
+ return XDP_PASS;
+}
+EXPORT_SYMBOL(bpf_direct_xsk);
+#endif
+
#endif /* CONFIG_INET */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index fa8fbb8fa3c8..8a29939bac7e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -24,6 +24,7 @@
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>
+#include <linux/if_link.h>
#include "xsk_queue.h"
#include "xdp_umem.h"
@@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
return err;
}
+static void xdp_clear_direct_xsk(struct xdp_sock *xsk)
Use xs, and not xsk for consistency.
+{
+ struct net_device *dev = xsk->dev;
+ u32 qid = xsk->queue_id;
+
+ if (!dev->_rx[qid].xsk)
+ return;
+
+ dev_clear_direct_xsk_prog(dev);
+ dev->_rx[qid].xsk = NULL;
+ static_branch_dec(&xdp_direct_xsk_needed);
+ xsk->flags &= ~XDP_SOCK_DIRECT;
+}
+
+static int xdp_set_direct_xsk(struct xdp_sock *xsk)
Same here.
+{
+ struct net_device *dev = xsk->dev;
+ u32 qid = xsk->queue_id;
+ int err = 0;
+
+ if (dev->_rx[qid].xsk)
+ return -EEXIST;
+
+ xsk->flags |= XDP_SOCK_DIRECT;
+ static_branch_inc(&xdp_direct_xsk_needed);
+ dev->_rx[qid].xsk = xsk;
+ err = dev_set_direct_xsk_prog(dev);
+ if (err)
+ xdp_clear_direct_xsk(xsk);
+
+ return err;
+}
+
+struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id)
+{
+ return dev->_rx[queue_id].xsk;
+}
+EXPORT_SYMBOL(xdp_get_xsk_from_qid);
+
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
xskq_produce_flush_addr_n(umem->cq, nb_entries);
@@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
return;
WRITE_ONCE(xs->state, XSK_UNBOUND);
+ if (xs->flags & XDP_SOCK_DIRECT) {
+ rtnl_lock();
+ xdp_clear_direct_xsk(xs);
+ rtnl_unlock();
+ }
/* Wait for driver to stop using the xdp socket. */
xdp_del_sk_umem(xs->umem, xs);
xs->dev = NULL;
@@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
flags = sxdp->sxdp_flags;
if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
- XDP_USE_NEED_WAKEUP))
+ XDP_USE_NEED_WAKEUP | XDP_DIRECT))
return -EINVAL;
rtnl_lock();
@@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
struct socket *sock;
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
- (flags & XDP_USE_NEED_WAKEUP)) {
+ (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
@@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
+ if (flags & XDP_DIRECT)
+ err = xdp_set_direct_xsk(xs);
out_unlock:
if (err) {
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index be328c59389d..d202b5d40859 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -25,6 +25,11 @@
* application.
*/
#define XDP_USE_NEED_WAKEUP (1 << 3)
+/* This option allows an AF_XDP socket bound to a queue to receive all
+ * the packets directly from that queue when there is no XDP program
+ * attached to the device.
+ */
+#define XDP_DIRECT (1 << 4)
/* Flags for xsk_umem_config flags */
#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)