bpf_free_inode() is invoked as a RCU callback. Usually RCU callbacks are invoked within softirq context. By setting rcutree.use_softirq=0 boot option the RCU callbacks will be invoked in a per-CPU kthread with bottom halves disabled which implies a RCU read section. On PREEMPT_RT the context remains fully preemptible. The RCU read section however does not allow schedule() invocation. The latter happens in mutex_lock() performed by bpf_trampoline_unlink_prog() originated from bpf_link_put(). It was pointed out that the bpf_link_put() invocation should not be delayed if originated from close(). It was also pointed out that other invocations from within a syscall should also avoid the workqueue. After an audit of all bpf_link_put() callers it looks that the only atomic caller is the RCU callback. Everything else is called from preemptible context because it is a syscall, a mutex_t is acquired near by or due a GFP_KERNEL memory allocation. Let bpf_link_put() free the resources directly. Add bpf_link_put_from_atomic() which uses the kworker to free the resources. Let bpf_any_put() invoke one or the other depending on the context it is called from (RCU or not). Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> --- On 2023-05-25 10:51:23 [-0700], Andrii Nakryiko wrote: v2…v3: - Drop bpf_link_put_direct(). Let bpf_link_put() do the direct free and add bpf_link_put_from_atomic() to do the delayed free via the worker. v1…v2: - Add bpf_link_put_direct() to be used from bpf_link_release() as suggested. > Looks good to me, but it's not sufficient. See kernel/bpf/inode.c, we > need to do bpf_link_put_direct() from bpf_put_any(), which should be > safe as well because it all is either triggered from bpf() syscall or > by unlink()'ing BPF FS file. For file deletion we have the same > requirement to have deterministic release of bpf_link. Okay. I checked all callers and it seems that the only atomic caller is the RCU callback. include/linux/bpf.h | 5 +++++ kernel/bpf/inode.c | 13 ++++++++----- kernel/bpf/syscall.c | 21 ++++++++++++--------- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e53ceee1df370..dced1f880cfa6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2073,6 +2073,7 @@ int bpf_link_settle(struct bpf_link_primer *primer); void bpf_link_cleanup(struct bpf_link_primer *primer); void bpf_link_inc(struct bpf_link *link); void bpf_link_put(struct bpf_link *link); +void bpf_link_put_from_atomic(struct bpf_link *link); int bpf_link_new_fd(struct bpf_link *link); struct file *bpf_link_new_file(struct bpf_link *link, int *reserved_fd); struct bpf_link *bpf_link_get_from_fd(u32 ufd); @@ -2432,6 +2433,10 @@ static inline void bpf_link_put(struct bpf_link *link) { } +static inline void bpf_link_put_from_atomic(struct bpf_link *link) +{ +} + static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9948b542a470e..2e1e9f3c7f701 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -49,7 +49,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) return raw; } -static void bpf_any_put(void *raw, enum bpf_type type) +static void bpf_any_put(void *raw, enum bpf_type type, bool may_sleep) { switch (type) { case BPF_TYPE_PROG: @@ -59,7 +59,10 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_map_put_with_uref(raw); break; case BPF_TYPE_LINK: - bpf_link_put(raw); + if (may_sleep) + bpf_link_put(raw); + else + bpf_link_put_from_atomic(raw); break; default: WARN_ON_ONCE(1); @@ -490,7 +493,7 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pathname, raw, type); if (ret != 0) - bpf_any_put(raw, type); + bpf_any_put(raw, type, true); return ret; } @@ -552,7 +555,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) return -ENOENT; if (ret < 0) - bpf_any_put(raw, type); + bpf_any_put(raw, type, true); return ret; } @@ -617,7 +620,7 @@ static void bpf_free_inode(struct inode *inode) if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); + bpf_any_put(inode->i_private, type, false); free_inode_nonrcu(inode); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 14f39c1e573ee..87b07ebd6d146 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2777,20 +2777,23 @@ static void bpf_link_put_deferred(struct work_struct *work) bpf_link_free(link); } -/* bpf_link_put can be called from atomic context, but ensures that resources - * are freed from process context +/* bpf_link_put_from_atomic is called from atomic context. It needs to be called + * from sleepable context in order to acquire sleeping locks during the process. */ -void bpf_link_put(struct bpf_link *link) +void bpf_link_put_from_atomic(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; - if (in_atomic()) { - INIT_WORK(&link->work, bpf_link_put_deferred); - schedule_work(&link->work); - } else { - bpf_link_free(link); - } + INIT_WORK(&link->work, bpf_link_put_deferred); + schedule_work(&link->work); +} + +void bpf_link_put(struct bpf_link *link) +{ + if (!atomic64_dec_and_test(&link->refcnt)) + return; + bpf_link_free(link); } EXPORT_SYMBOL(bpf_link_put); -- 2.40.1