Different cpumap tests showed that a couple of careful prefetches
help performance. The key is not to overdo it: prefetch only one
cacheline to the right of the frame data start and one to the left
-- if there is metadata in front.

Signed-off-by: Alexander Lobakin <alexandr.lobakin@xxxxxxxxx>
---
 net/bpf/core.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/net/bpf/core.c b/net/bpf/core.c
index a8685bcc6e00..775f9648e8cf 100644
--- a/net/bpf/core.c
+++ b/net/bpf/core.c
@@ -620,10 +620,26 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					   struct net_device *dev)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+	u32 dist, metasize = xdpf->metasize;
 	unsigned int headroom, frame_size;
+	void *data = xdpf->data;
 	void *hard_start;
 	u8 nr_frags;
 
+	/* Bring the headers to the current CPU, as well as the
+	 * metadata if present. This helps eth_type_trans() and
+	 * xdp_populate_skb_meta_generic().
+	 * The idea here is to prefetch no more than 2 cachelines:
+	 * one to the left from the data start and one to the right.
+	 */
+#define to_cl(ptr)	PTR_ALIGN_DOWN(ptr, L1_CACHE_BYTES)
+	dist = min_t(typeof(dist), metasize, L1_CACHE_BYTES);
+	if (dist && to_cl(data - dist) != to_cl(data))
+		prefetch(data - dist);
+#undef to_cl
+
+	prefetch(data);
+
 	/* xdp frags frame */
 	if (unlikely(xdp_frame_has_frags(xdpf)))
 		nr_frags = sinfo->nr_frags;
@@ -636,15 +652,15 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	 */
 	frame_size = xdpf->frame_sz;
 
-	hard_start = xdpf->data - headroom;
+	hard_start = data - headroom;
 	skb = build_skb_around(skb, hard_start, frame_size);
 	if (unlikely(!skb))
 		return NULL;
 
 	skb_reserve(skb, headroom);
 	__skb_put(skb, xdpf->len);
-	if (xdpf->metasize)
-		skb_metadata_set(skb, xdpf->metasize);
+	if (metasize)
+		skb_metadata_set(skb, metasize);
 
 	if (unlikely(xdp_frame_has_frags(xdpf)))
 		xdp_update_skb_shared_info(skb, nr_frags,
-- 
2.36.1
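
For readers who want to see the cacheline math from the hunk above in isolation, here is a minimal userspace sketch of the same "prefetch one line to the left only if the metadata actually crosses a cacheline boundary" check. The names cl_start() and the fixed 64-byte L1_CACHE_BYTES value are assumptions for illustration only; the kernel code uses PTR_ALIGN_DOWN(), min_t() and the arch-defined cacheline size, and issues real prefetch() calls instead of printing.

	#include <stdint.h>
	#include <stdio.h>

	#define L1_CACHE_BYTES	64	/* assumed cacheline size for illustration */

	/* Round a pointer down to the start of its cacheline, mirroring what
	 * the patch does with PTR_ALIGN_DOWN(ptr, L1_CACHE_BYTES).
	 */
	static inline uintptr_t cl_start(const void *ptr)
	{
		return (uintptr_t)ptr & ~(uintptr_t)(L1_CACHE_BYTES - 1);
	}

	int main(void)
	{
		_Alignas(L1_CACHE_BYTES) unsigned char buf[256];
		/* Pretend the frame data starts 8 bytes into the third cacheline
		 * of the buffer and is preceded by 16 bytes of XDP metadata.
		 */
		unsigned char *data = buf + 2 * L1_CACHE_BYTES + 8;
		uint32_t metasize = 16;
		/* Never look back further than one cacheline, as in the patch */
		uint32_t dist = metasize < L1_CACHE_BYTES ? metasize : L1_CACHE_BYTES;

		/* The extra prefetch is only worth issuing when the metadata
		 * spills into the cacheline to the left of the data start.
		 */
		if (dist && cl_start(data - dist) != cl_start(data))
			printf("metadata crosses a cacheline: also touch %p\n",
			       (void *)(data - dist));
		else
			printf("metadata shares the data cacheline: one prefetch is enough\n");

		return 0;
	}

With data at offset 136 and 16 bytes of metadata, data - dist falls at offset 120, i.e. in the previous cacheline, so the left prefetch fires; with, say, 4 bytes of metadata both pointers land in the same line and only the prefetch(data) at the data start is needed.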