On Mon, 14 Aug 2023 at 22:59, Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Add two new kfunc's, bpf_percpu_obj_new_impl() and
> bpf_percpu_obj_drop_impl(), to allocate a percpu obj.
> Two functions are very similar to bpf_obj_new_impl()
> and bpf_obj_drop_impl(). The major difference is related
> to percpu handling.
>
> bpf_rcu_read_lock()
> struct val_t __percpu *v = map_val->percpu_data;
> ...
> bpf_rcu_read_unlock()
>
> For a percpu data map_val like above 'v', the reg->type
> is set as
> PTR_TO_BTF_ID | MEM_PERCPU | MEM_RCU
> if inside rcu critical section.
>
> MEM_RCU marking here is similar to NON_OWN_REF as 'v'
> is not a owning referenace. But NON_OWN_REF is

typo: reference

> trusted and typically inside the spinlock while
> MEM_RCU is under rcu read lock. RCU is preferred here
> since percpu data structures mean potential concurrent
> access into its contents.
>
> Also, bpf_percpu_obj_new_impl() is restricted to only accept
> scalar struct which means nested kptr's are not allowed
> but some other special field, e.g., bpf_list_head, bpf_spin_lock, etc.
> could be nested (nested 'struct'). Later patch will improve verifier to
> handle such nested special fields.
>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/linux/bpf.h   |  3 +-
>  kernel/bpf/helpers.c  | 49 +++++++++++++++++++++++
>  kernel/bpf/syscall.c  | 21 +++++++---
>  kernel/bpf/verifier.c | 90 ++++++++++++++++++++++++++++++++++---------
>  4 files changed, 137 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e6348fd0a785..a2cb380c43c7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -197,7 +197,8 @@ struct btf_field_kptr {
>  	struct btf *btf;
>  	struct module *module;
>  	/* dtor used if btf_is_kernel(btf), otherwise the type is
> -	 * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used
> +	 * program-allocated, dtor is NULL, and __bpf_obj_drop_impl
> +	 * or __bpf_percpu_drop_impl is used
>  	 */
>  	btf_dtor_kfunc_t dtor;
>  	u32 btf_id;
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index eb91cae0612a..dd14cb7da4af 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1900,6 +1900,29 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
>  	return p;
>  }
>
> +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign)
> +{
> +	struct btf_struct_meta *meta = meta__ign;
> +	const struct btf_record *rec;
> +	u64 size = local_type_id__k;
> +	void __percpu *pptr;
> +	void *p;
> +	int cpu;
> +
> +	p = bpf_mem_alloc(&bpf_global_percpu_ma, size);
> +	if (!p)
> +		return NULL;
> +	if (meta) {
> +		pptr = *((void __percpu **)p);
> +		rec = meta->record;
> +		for_each_possible_cpu(cpu) {
> +			bpf_obj_init(rec, per_cpu_ptr(pptr, cpu));
> +		}
> +	}
> +
> +	return p;
> +}
> +
>  /* Must be called under migrate_disable(), as required by bpf_mem_free */
>  void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
>  {
> @@ -1924,6 +1947,30 @@ __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
>  	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
>  }
>
> +/* Must be called under migrate_disable(), as required by bpf_mem_free_rcu */
> +void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec)
> +{
> +	void __percpu *pptr;
> +	int cpu;
> +
> +	if (rec) {
> +		pptr = *((void __percpu **)p);
> +		for_each_possible_cpu(cpu) {
> +			bpf_obj_free_fields(rec, per_cpu_ptr(pptr, cpu));

Should this loop be done after we have waited for the RCU grace period?
Otherwise any other CPU can reinitialize a field after this is done, move
objects into lists/rbtree, and leak memory. Please correct me if I'm
mistaken.
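Concretely, the interleaving I have in mind is something like this (it
assumes the later patches that allow special fields like bpf_list_head
inside the percpu struct, plus bpf_kptr_xchg()/bpf_percpu_obj_drop() style
usage; all the names below are made up for illustration):

  CPU A (releasing the object)             CPU B (inside bpf_rcu_read_lock())
  ----------------------------             ----------------------------------
                                           p = map_val->pc;  /* old percpu obj,
                                                              * still MEM_RCU */
  v = bpf_kptr_xchg(&map_val->pc, NULL);
  bpf_percpu_obj_drop(v);
    -> bpf_obj_free_fields() drains the
       bpf_list_head on every CPU
                                           bpf_list_push_back(&p->head, &n->node);
                                           /* pushed after free_fields(); nothing
                                            * will ever free this node */
    -> bpf_mem_free_rcu() reclaims the
       object once the grace period ends;
       n is leaked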
> +		}
> +	}
> +
> +	bpf_mem_free_rcu(&bpf_global_percpu_ma, p);
> +}
> +
> +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign)
> +{
> +	struct btf_struct_meta *meta = meta__ign;
> +	void *p = p__alloc;
> +
> +	__bpf_percpu_obj_drop_impl(p, meta ? meta->record : NULL);
> +}
> +
>  __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
>  {
>  	struct btf_struct_meta *meta = meta__ign;
> @@ -2436,7 +2483,9 @@ BTF_SET8_START(generic_btf_ids)
>  BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
>  #endif
>  BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
> +BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
>  BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_list_push_front_impl)
>  BTF_ID_FLAGS(func, bpf_list_push_back_impl)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 1c30b6ee84d4..9ceb6fd9a0e2 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -627,6 +627,7 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
>  }
>
>  extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
> +extern void __bpf_percpu_obj_drop_impl(void *p, const struct btf_record *rec);
>
>  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>  {
> @@ -660,13 +661,21 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
>  			if (!btf_is_kernel(field->kptr.btf)) {
>  				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
>  									   field->kptr.btf_id);
> -				if (field->type != BPF_KPTR_PERCPU_REF)
> +
> +				if (field->type == BPF_KPTR_PERCPU_REF) {
> +					migrate_disable();
> +					__bpf_percpu_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> +								   pointee_struct_meta->record :
> +								   NULL);
> +					migrate_enable();
> +				} else {
>  					WARN_ON_ONCE(!pointee_struct_meta);
> -				migrate_disable();
> -				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> -						    pointee_struct_meta->record :
> -						    NULL);
> -				migrate_enable();
> +					migrate_disable();
> +					__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
> +							    pointee_struct_meta->record :
> +							    NULL);
> +					migrate_enable();
> +				}
>  			} else {
>  				field->kptr.dtor(xchgd_field);
>  			}
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 4ccca1f6c998..a985fbf18a11 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -304,7 +304,7 @@ struct bpf_kfunc_call_arg_meta {
>  	/* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling,
>  	 * generally to pass info about user-defined local kptr types to later
>  	 * verification logic
> -	 *   bpf_obj_drop
> +	 *   bpf_obj_drop/bpf_percpu_obj_drop
>  	 *     Record the local kptr type to be drop'd
>  	 *   bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type)
>  	 *     Record the local kptr type to be refcount_incr'd and use
> @@ -4997,13 +4997,20 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
>  	if (kptr_field->type == BPF_KPTR_UNREF)
>  		perm_flags |= PTR_UNTRUSTED;
>
> +	if (kptr_field->type == BPF_KPTR_PERCPU_REF)
> +		perm_flags |= MEM_PERCPU | MEM_ALLOC;
> +

I think this alone would also permit a plain PTR_TO_BTF_ID | MEM_ALLOC
(i.e. a non-percpu allocation) to be exchanged into a percpu kptr field?
It would probably be good to include negative selftests for kptr_xchg type
matching with percpu kptrs to catch things like this.

Alexei already said map_kptr_match_type is not being invoked for MEM_ALLOC
kptr_xchg, so that is also an existing bug.
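The kind of negative selftest I have in mind is roughly the following
(only a sketch: the __percpu_kptr tag, the map layout and the names are
assumptions about how the BPF-program-side declarations would eventually
look, not something defined by this patch). The verifier should reject
exchanging a plain bpf_obj_new() allocation into a percpu kptr field:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"
#include "bpf_misc.h"

struct val_t {
	long data;
};

struct map_val {
	struct val_t __percpu_kptr *pc;	/* percpu kptr field (assumed tag) */
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct map_val);
} array_map SEC(".maps");

SEC("tc")
__failure	/* expect the verifier to reject this program */
int percpu_kptr_xchg_wrong_type(struct __sk_buff *ctx)
{
	struct map_val *v;
	struct val_t *p;
	int key = 0;

	v = bpf_map_lookup_elem(&array_map, &key);
	if (!v)
		return 0;

	/* plain (non-percpu) allocation: PTR_TO_BTF_ID | MEM_ALLOC */
	p = bpf_obj_new(typeof(*p));
	if (!p)
		return 0;

	/* must be rejected: non-percpu pointer stored into a percpu kptr field */
	p = bpf_kptr_xchg(&v->pc, p);
	if (p)
		bpf_obj_drop(p);
	return 0;
}

char _license[] SEC("license") = "GPL";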
>  	if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
>  		goto bad_type;
>
> [...]
>  	/* We need to verify reg->type and reg->btf, before accessing reg->btf */
>  	reg_name = btf_type_name(reg->btf, reg->btf_id);
>
> @@ -5084,7 +5091,17 @@ static bool rcu_safe_kptr(const struct btf_field *field)
>  {
>  	const struct btf_field_kptr *kptr = &field->kptr;
>
> -	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
> +	return field->type == BPF_KPTR_PERCPU_REF ||
> +	       (field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id));
> +}
> +
> +static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
> +{
> +	if (!rcu_safe_kptr(kptr_field) || !in_rcu_cs(env))
> +		return PTR_MAYBE_NULL | PTR_UNTRUSTED;
> +	if (kptr_field->type != BPF_KPTR_PERCPU_REF)
> +		return PTR_MAYBE_NULL | MEM_RCU;
> +	return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;

The inverted conditions are a bit hard to follow. Maybe better to
explicitly check for both RCU cases, and default to untrusted otherwise?

> }
>
> [...]
>
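For the btf_ld_kptr_type() comment above, something along these lines is
what I meant (untested, same behavior as the patch, just with the two RCU
cases spelled out and untrusted as the fallback):

static u32 btf_ld_kptr_type(struct bpf_verifier_env *env, struct btf_field *kptr_field)
{
	if (rcu_safe_kptr(kptr_field) && in_rcu_cs(env)) {
		/* RCU-protected load; percpu kptrs additionally carry MEM_PERCPU */
		if (kptr_field->type == BPF_KPTR_PERCPU_REF)
			return PTR_MAYBE_NULL | MEM_RCU | MEM_PERCPU;
		return PTR_MAYBE_NULL | MEM_RCU;
	}
	/* not RCU-safe, or not inside an RCU critical section: untrusted */
	return PTR_MAYBE_NULL | PTR_UNTRUSTED;
}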