Dear RT folks! I'm pleased to announce the v6.10-rc3-rt5 patch set. Changes since v6.10-rc3-rt4: - Update the BH-locking series to the upcoming v6. This reworks locking within the series which avoids a case where pahole fails to process the BTF information. Reported by Clark Williams, Luis Claudio R. Goncalves and Kurt Kanzenbach. Known issues None. The delta patch against v6.10-rc3-rt4 is appended below and can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.10/incr/patch-6.10-rc3-rt4-rt5.patch.xz You can get this release via the git tree at: https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v6.10-rc3-rt5 The RT patch against v6.10-rc3 can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.10/older/patch-6.10-rc3-rt5.patch.xz The split quilt queue is available at: https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.10/older/patches-6.10-rc3-rt5.tar.xz Sebastian diff --git a/include/linux/filter.h b/include/linux/filter.h index 2fc16ce4a404b..62662ab99d1f4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -733,15 +733,22 @@ struct bpf_nh_params { }; }; +/* flags for bpf_redirect_info kern_flags */ +#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +#define BPF_RI_F_RI_INIT BIT(1) +#define BPF_RI_F_CPU_MAP_INIT BIT(2) +#define BPF_RI_F_DEV_MAP_INIT BIT(3) +#define BPF_RI_F_XSK_MAP_INIT BIT(4) + struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; - u32 kern_flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; + u32 kern_flags; }; struct bpf_net_context { @@ -757,14 +764,7 @@ static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bp if (tsk->bpf_net_context != NULL) return NULL; - memset(&bpf_net_ctx->ri, 0, sizeof(bpf_net_ctx->ri)); - - if (IS_ENABLED(CONFIG_BPF_SYSCALL)) { - INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); - INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); - } - if (IS_ENABLED(CONFIG_XDP_SOCKETS)) - INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); + bpf_net_ctx->ri.kern_flags = 0; tsk->bpf_net_context = bpf_net_ctx; return bpf_net_ctx; @@ -785,6 +785,11 @@ static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { + memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; + } + return &bpf_net_ctx->ri; } @@ -792,6 +797,11 @@ static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; + } + return &bpf_net_ctx->cpu_map_flush_list; } @@ -799,6 +809,11 @@ static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; + } + return &bpf_net_ctx->dev_map_flush_list; } @@ -806,14 +821,14 @@ static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); + if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { + INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); + bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; + } + return &bpf_net_ctx->xskmap_map_flush_list; } -DEFINE_FREE(bpf_net_ctx_clear, struct bpf_net_context *, bpf_net_ctx_clear(_T)); - -/* flags for bpf_redirect_info kern_flags */ -#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ - /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) diff --git a/localversion-rt b/localversion-rt index ad3da1bcab7e8..0efe7ba1930e1 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt4 +-rt5 diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9596ad19224ad..3c9f6538990ea 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -853,6 +853,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); unsigned int mtu, mtu_reserved; + int ret; mtu_reserved = nf_bridge_mtu_reduction(skb); mtu = skb->dev->mtu; @@ -885,7 +886,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IPCB(skb)->frag_max_size = nf_bridge->frag_max_size; - guard(local_lock_nested_bh)(&brnf_frag_data_storage.bh_lock); + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); if (skb_vlan_tag_present(skb)) { @@ -901,7 +902,9 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; } if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { @@ -913,7 +916,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size; - guard(local_lock_nested_bh)(&brnf_frag_data_storage.bh_lock); + local_lock_nested_bh(&brnf_frag_data_storage.bh_lock); data = this_cpu_ptr(&brnf_frag_data_storage); data->encap_size = nf_bridge_encap_header_len(skb); data->size = ETH_HLEN + data->encap_size; @@ -921,8 +924,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff skb_copy_from_linear_data_offset(skb, -data->size, data->mac, data->size); - if (v6ops) - return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + if (v6ops) { + ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit); + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); + return ret; + } + local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/core/dev.c b/net/core/dev.c index 0ff8b11c8ab6f..85702022c5cd4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4030,9 +4030,8 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev, bool *another) { struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); - struct bpf_net_context *bpf_net_ctx __free(bpf_net_ctx_clear) = NULL; enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS; - struct bpf_net_context __bpf_net_ctx; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) @@ -4067,10 +4066,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; } *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_RX_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4080,8 +4081,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, fallthrough; case TC_ACT_CONSUMED: *ret = NET_RX_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -4089,10 +4092,9 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, static __always_inline struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - struct bpf_net_context *bpf_net_ctx __free(bpf_net_ctx_clear) = NULL; struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; - struct bpf_net_context __bpf_net_ctx; + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) @@ -4115,10 +4117,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN: @@ -4128,8 +4132,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; + bpf_net_ctx_clear(bpf_net_ctx); return NULL; } + bpf_net_ctx_clear(bpf_net_ctx); return skb; } @@ -6372,6 +6378,7 @@ static void __napi_busy_loop(unsigned int napi_id, struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; @@ -6893,12 +6900,11 @@ static int napi_threaded_poll(void *data) static __latent_entropy void net_rx_action(struct softirq_action *h) { - struct bpf_net_context *bpf_net_ctx __free(bpf_net_ctx_clear) = NULL; struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs)); + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int budget = READ_ONCE(net_hotdata.netdev_budget); - struct bpf_net_context __bpf_net_ctx; LIST_HEAD(list); LIST_HEAD(repoll); @@ -6955,7 +6961,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) sd->in_net_rx_action = false; net_rps_action_and_irq_enable(sd); -end:; +end: + bpf_net_ctx_clear(bpf_net_ctx); } struct netdev_adjacent { diff --git a/net/core/filter.c b/net/core/filter.c index 64a1696a52920..f00cc56123013 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2019,6 +2019,7 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); u32 diff_size = from_size + to_size; int i, j = 0; + __wsum ret; /* This is quite flexible, some examples: * @@ -2032,13 +2033,15 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, diff_size > sizeof(sp->diff))) return -EINVAL; - guard(local_lock_nested_bh)(&bpf_sp.bh_lock); + local_lock_nested_bh(&bpf_sp.bh_lock); for (i = 0; i < from_size / sizeof(__be32); i++, j++) sp->diff[j] = ~from[i]; for (i = 0; i < to_size / sizeof(__be32); i++, j++) sp->diff[j] = to[i]; - return csum_partial(sp->diff, diff_size, seed); + ret = csum_partial(sp->diff, diff_size, seed); + local_unlock_nested_bh(&bpf_sp.bh_lock); + return ret; } static const struct bpf_func_proto bpf_csum_diff_proto = { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b33bae4ba78b6..5000394a4f51a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -309,12 +309,16 @@ void napi_get_frags_check(struct napi_struct *napi) void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + void *data; fragsz = SKB_DATA_ALIGN(fragsz); - guard(local_lock_nested_bh)(&napi_alloc_cache.bh_lock); - return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + return data; + } EXPORT_SYMBOL(__napi_alloc_frag_align); @@ -342,17 +346,20 @@ static struct sk_buff *napi_skb_cache_get(void) struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; - guard(local_lock_nested_bh)(&napi_alloc_cache.bh_lock); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, GFP_ATOMIC, NAPI_SKB_CACHE_BULK, nc->skb_cache); - if (unlikely(!nc->skb_count)) + if (unlikely(!nc->skb_count)) { + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; + } } skb = nc->skb_cache[--nc->skb_count]; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); return skb; @@ -1439,7 +1446,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) if (!kasan_mempool_poison_object(skb)) return; - guard(local_lock_nested_bh)(&napi_alloc_cache.bh_lock); + local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { @@ -1451,6 +1458,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache + NAPI_SKB_CACHE_HALF); nc->skb_count = NAPI_SKB_CACHE_HALF; } + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)