[PATCH bpf-next] bpf: Alloc bpf_async_cb by using bpf_global_ma under PREEMPT_RT

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Hou Tao <houtao1@xxxxxxxxxx>

Under PREEMPT_RT, it is not safe to use GFP_ATOMIC kmalloc when
preemption or IRQs are disabled. The following warning is reported when
running test_progs under PREEMPT_RT:

  BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
  in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 675, name: test_progs
  preempt_count: 1, expected: 0
  RCU nest depth: 0, expected: 0
  2 locks held by test_progs/675:
   #0: ffffffff864b0240 (rcu_read_lock_trace){....}-{0:0}, at: bpf_prog_test_run_syscall+0x2c0/0x830
   #1: ffff8881f4ec40c8 ((&c->lock)){....}-{2:2}, at: ___slab_alloc+0xbc/0x1280
  Preemption disabled at:
  [<ffffffff8175ae2b>] __bpf_async_init+0xbb/0xb10
  CPU: 1 UID: 0 PID: 675 Comm: test_progs Tainted: G           O       6.12.0+ #11
  Tainted: [O]=OOT_MODULE
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...
  Call Trace:
   <TASK>
   dump_stack_lvl+0x57/0x70
   dump_stack+0x10/0x20
   __might_resched+0x337/0x4d0
   rt_spin_lock+0xd4/0x230
   ___slab_alloc+0xbc/0x1280
   __slab_alloc.isra.0+0x5d/0xa0
   __kmalloc_node_noprof+0xf7/0x4f0
   bpf_map_kmalloc_node+0xf5/0x6b0
   __bpf_async_init+0x20e/0xb10
   bpf_timer_init+0x30/0x40
   bpf_prog_c7e2dc9ff3d5ba62_start_cb+0x55/0x85
   bpf_prog_4eb421be69ae82fa_start_timer+0x5d/0x7e
   bpf_prog_test_run_syscall+0x322/0x830
   __sys_bpf+0x135d/0x3ca0
   __x64_sys_bpf+0x75/0xb0
   x64_sys_call+0x1b5/0xa10
   do_syscall_64+0x3b/0xc0
   entry_SYSCALL_64_after_hwframe+0x4b/0x53

Fix the problem by using bpf_global_ma to allocate bpf_async_cb when
PREEMPT_RT is enabled. The reason for still using kmalloc in the
non-PREEMPT_RT case is that bpf_global_ma doesn't support accounting the
allocated memory to a specific memcg. Also do the memory allocation
before invoking __bpf_spin_lock_irqsave() to reduce the possibility of
-ENOMEM for bpf_global_ma.

Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
---
 kernel/bpf/helpers.c | 48 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bcda671feafd9..5041f22812936 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1109,12 +1109,14 @@ struct bpf_async_cb {
  * freeing the timers when inner map is replaced or deleted by user space.
  */
 struct bpf_hrtimer {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct hrtimer timer;
 	atomic_t cancelling;
 };
 
 struct bpf_work {
+	/* cb must be the first member */
 	struct bpf_async_cb cb;
 	struct work_struct work;
 	struct work_struct delete_work;
@@ -1141,6 +1143,34 @@ enum bpf_async_type {
 
 static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
 
+static void bpf_async_free(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free(&bpf_global_ma, cb);
+	else
+		kfree(cb);
+}
+
+static void bpf_async_free_rcu(struct bpf_async_cb *cb)
+{
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		bpf_mem_free_rcu(&bpf_global_ma, cb);
+	else
+		kfree_rcu(cb, rcu);
+}
+
+static struct bpf_async_cb *bpf_async_alloc(struct bpf_map *map, size_t size)
+{
+	struct bpf_async_cb *cb;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		cb = bpf_mem_alloc(&bpf_global_ma, size);
+	else
+		/* allocate hrtimer via map_kmalloc to use memcg accounting */
+		cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
+	return cb;
+}
+
 static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
 {
 	struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
@@ -1221,7 +1251,7 @@ static void bpf_wq_delete_work(struct work_struct *work)
 
 	cancel_work_sync(&w->work);
 
-	kfree_rcu(w, cb.rcu);
+	bpf_async_free_rcu(&w->cb);
 }
 
 static void bpf_timer_delete_work(struct work_struct *work)
@@ -1236,7 +1266,7 @@ static void bpf_timer_delete_work(struct work_struct *work)
 	 * bpf_timer_cancel_and_free will have been cancelled.
 	 */
 	hrtimer_cancel(&t->timer);
-	kfree_rcu(t, cb.rcu);
+	bpf_async_free_rcu(&t->cb);
 }
 
 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
@@ -1263,20 +1293,18 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
+	cb = bpf_async_alloc(map, size);
+	if (!cb)
+		return -ENOMEM;
+
 	__bpf_spin_lock_irqsave(&async->lock);
 	t = async->timer;
 	if (t) {
+		bpf_async_free(cb);
 		ret = -EBUSY;
 		goto out;
 	}
 
-	/* allocate hrtimer via map_kmalloc to use memcg accounting */
-	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
 		clockid = flags & (MAX_CLOCKS - 1);
@@ -1313,7 +1341,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		 * or pinned in bpffs.
 		 */
 		WRITE_ONCE(async->cb, NULL);
-		kfree(cb);
+		bpf_async_free(cb);
 		ret = -EPERM;
 	}
 out:
-- 
2.29.2





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux