From: Hou Tao <houtao1@xxxxxxxxxx>

Under PREEMPT_RT, it is not safe to use GFP_ATOMIC kmalloc while
preemption or irqs are disabled. The following warning is reported when
running test_progs under PREEMPT_RT:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 675, name: test_progs
  preempt_count: 1, expected: 0
  RCU nest depth: 0, expected: 0
  2 locks held by test_progs/675:
   #0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
   #1: ffff8881f4ec40c8 ((&c->lock)){....}-{2:2}, at: ___slab_alloc+0xbc/0x1280
  Preemption disabled at:
  [<ffffffff8175ae2b>] __bpf_async_init+0xbb/0xb10
  CPU: 1 UID: 0 PID: 675 Comm: test_progs Tainted: G O 6.12.0+ #11
  Tainted: [O]=OOT_MODULE
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
  ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x70
   dump_stack+0x10/0x20
   __might_resched+0x337/0x4d0
   rt_spin_lock+0xd4/0x230
   ___slab_alloc+0xbc/0x1280
   __slab_alloc.isra.0+0x5d/0xa0
   __kmalloc_node_noprof+0xf7/0x4f0
   bpf_map_kmalloc_node+0xf5/0x6b0
   __bpf_async_init+0x20e/0xb10
   bpf_timer_init+0x30/0x40
   bpf_prog_c7e2dc9ff3d5ba62_start_cb+0x55/0x85
   bpf_prog_4eb421be69ae82fa_start_timer+0x5d/0x7e
   bpf_prog_test_run_syscall+0x322/0x830
   __sys_bpf+0x135d/0x3ca0
   __x64_sys_bpf+0x75/0xb0
   x64_sys_call+0x1b5/0xa10
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Fix the problem by using bpf_global_ma to allocate bpf_async_cb when
PREEMPT_RT is enabled. kmalloc is still used for the non-PREEMPT_RT
case because bpf_global_ma doesn't support accounting the allocated
memory to a specific memcg. Also do the memory allocation before
invoking __bpf_spin_lock_irqsave() to reduce the possibility of
getting -ENOMEM from bpf_global_ma.

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
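Reviewer note (not part of the patch): a minimal sketch of the
pre-patch sequence that produces the splat above, reconstructed from
the call trace; the unlock counterpart __bpf_spin_unlock_irqrestore()
is assumed from the existing helpers in kernel/bpf/helpers.c:

	__bpf_spin_lock_irqsave(&async->lock);
	/* Preemption and irqs are disabled here. Under PREEMPT_RT the
	 * SLUB slow path (___slab_alloc) takes an rt_spin_lock, which
	 * is a sleeping lock, so even a GFP_ATOMIC allocation trips
	 * __might_resched().
	 */
	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
	...
	__bpf_spin_unlock_irqrestore(&async->lock);
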
 kernel/bpf/helpers.c | 48 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd9..5041f22812936 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1109,12 +1109,14 @@ struct bpf_async_cb {
  * freeing the timers when inner map is replaced or deleted by user space.
  */
 struct bpf_hrtimer {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
 	atomic_t cancelling;
 };
 
 struct bpf_work {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct work_struct work;
 	struct work_struct delete_work;
@@ -1141,6 +1143,34 @@ enum bpf_async_type {
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
+static void bpf_async_free(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free(&bpf_global_ma, cb);
+	else
+		kfree(cb);
+}
+
+static void bpf_async_free_rcu(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free_rcu(&bpf_global_ma, cb);
+	else
+		kfree_rcu(cb, rcu);
+}
+
+static struct bpf_async_cb *bpf_async_alloc(struct bpf_map *map, size_t size)
+{
+	struct bpf_async_cb *cb;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		cb = bpf_mem_alloc(&bpf_global_ma, size);
+	else
+		/* allocate hrtimer via map_kmalloc to use memcg accounting */
+		cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	return cb;
+}
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1221,7 +1251,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_async_free_rcu(&w->cb);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1236,7 +1266,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_async_free_rcu(&t->cb);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1263,20 +1293,18 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
+	cb = bpf_async_alloc(map, size);
+	if (!cb)
+		return -ENOMEM;
+
 	__bpf_spin_lock_irqsave(&async->lock);
 	t = async->timer;
 	if (t) {
+		bpf_async_free(cb);
 		ret = -EBUSY;
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
 		clockid = flags & (MAX_CLOCKS - 1);
@@ -1313,7 +1341,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 	 * or pinned in bpffs.
 	 */
 	WRITE_ONCE(async->cb, NULL);
-	kfree(cb);
+	bpf_async_free(cb);
 	ret = -EPERM;
 	}
 out:
-- 
2.29.2