This needs to be reviewed by AF_XDP maintainers Magnus and Bjørn (Cc)
On 21/06/2023 19.02, Stanislav Fomichev wrote:
For zerocopy mode, tx_desc->addr can point to the arbitrary offset
and carry some TX metadata in the headroom. For copy mode, there
is no way currently to populate skb metadata.
Introduce new XDP_TX_METADATA_LEN that indicates how many bytes
to treat as metadata. Metadata bytes come prior to tx_desc address
(same as in RX case).
From looking at the code, this introduces a socket option for this TX
metadata length (tx_metadata_len).
This implies the same fixed TX metadata size is used for all packets.
Maybe describe this in patch desc.
What is the plan for dealing with cases that doesn't populate same/full
TX metadata size ?
Signed-off-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
---
include/net/xdp_sock.h | 1 +
include/net/xsk_buff_pool.h | 1 +
include/uapi/linux/if_xdp.h | 1 +
net/xdp/xsk.c | 31 ++++++++++++++++++++++++++++++-
net/xdp/xsk_buff_pool.c | 1 +
net/xdp/xsk_queue.h | 7 ++++---
6 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index e96a1151ec75..30018b3b862d 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -51,6 +51,7 @@ struct xdp_sock {
struct list_head flush_node;
struct xsk_buff_pool *pool;
u16 queue_id;
+ u8 tx_metadata_len;
bool zc;
enum {
XSK_READY = 0,
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index a8d7b8a3688a..751fea51a6af 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -75,6 +75,7 @@ struct xsk_buff_pool {
u32 chunk_size;
u32 chunk_shift;
u32 frame_len;
+ u8 tx_metadata_len; /* inherited from xsk_sock */
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index a78a8096f4ce..2374eafff7db 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -63,6 +63,7 @@ struct xdp_mmap_offsets {
#define XDP_UMEM_COMPLETION_RING 6
#define XDP_STATISTICS 7
#define XDP_OPTIONS 8
+#define XDP_TX_METADATA_LEN 9
struct xdp_umem_reg {
__u64 addr; /* Start of packet data area */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cc1e7f15fa73..c9b2daba7b6d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -485,6 +485,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
int err;
hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ hr = max(hr, L1_CACHE_ALIGN((u32)xs->tx_metadata_len));
tr = dev->needed_tailroom;
len = desc->len;
@@ -493,14 +494,21 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
- skb_put(skb, len);
+ skb_put(skb, len + xs->tx_metadata_len);
buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+ buffer -= xs->tx_metadata_len;
+
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err)) {
kfree_skb(skb);
return ERR_PTR(err);
}
+
+ if (xs->tx_metadata_len) {
+ skb_metadata_set(skb, xs->tx_metadata_len);
+ __skb_pull(skb, xs->tx_metadata_len);
+ }
}
skb->dev = dev;
@@ -1137,6 +1145,27 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
mutex_unlock(&xs->mutex);
return err;
}
+ case XDP_TX_METADATA_LEN:
+ {
+ int val;
+
+ if (optlen < sizeof(val))
+ return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
+ return -EFAULT;
+
+ if (val >= 256)
+ return -EINVAL;
+
+ mutex_lock(&xs->mutex);
+ if (xs->state != XSK_READY) {
+ mutex_unlock(&xs->mutex);
+ return -EBUSY;
+ }
+ xs->tx_metadata_len = val;
+ mutex_unlock(&xs->mutex);
+ return err;
+ }
default:
break;
}
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 26f6d304451e..66ff9c345a67 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -85,6 +85,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
XDP_PACKET_HEADROOM;
pool->umem = umem;
pool->addrs = umem->addrs;
+ pool->tx_metadata_len = xs->tx_metadata_len;
INIT_LIST_HEAD(&pool->free_list);
INIT_LIST_HEAD(&pool->xsk_tx_list);
spin_lock_init(&pool->xsk_tx_list_lock);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 6d40a77fccbe..c8d287c18d64 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -133,12 +133,13 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 offset = desc->addr & (pool->chunk_size - 1);
+ u64 addr = desc->addr - pool->tx_metadata_len;
+ u64 offset = addr & (pool->chunk_size - 1);
if (offset + desc->len > pool->chunk_size)
return false;
- if (desc->addr >= pool->addrs_cnt)
+ if (addr >= pool->addrs_cnt)
return false;
if (desc->options)
@@ -149,7 +150,7 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
- u64 addr = xp_unaligned_add_offset_to_addr(desc->addr);
+ u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len;
if (desc->len > pool->chunk_size)
return false;