Hi,

On 6/21/2023 10:32 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
>
> The life cycle of objects:
> alloc: dequeue free_llist
> free: enqeueu free_llist
> free_rcu: enqueue free_by_rcu -> waiting_for_gp
> free_llist above high watermark -> free_by_rcu_ttrace
> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
>  include/linux/bpf_mem_alloc.h |   2 +
>  kernel/bpf/memalloc.c         | 118 ++++++++++++++++++++++++++++++++--
>  2 files changed, 116 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> index 3929be5743f4..d644bbb298af 100644
> --- a/include/linux/bpf_mem_alloc.h
> +++ b/include/linux/bpf_mem_alloc.h
> @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
>  /* kmalloc/kfree equivalent: */
>  void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
>  void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>
>  /* kmem_cache_alloc/free equivalent: */
>  void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
>  void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>  void bpf_mem_cache_raw_free(void *ptr);
>  void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 10d027674743..4d1002e7b4b5 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -100,6 +100,15 @@ struct bpf_mem_cache {
>          int percpu_size;
>          struct bpf_mem_cache *tgt;
>
> +        /* list of objects to be freed after RCU GP */
> +        struct llist_head free_by_rcu;
> +        struct llist_node *free_by_rcu_tail;
> +        struct llist_head waiting_for_gp;
> +        struct llist_node *waiting_for_gp_tail;
> +        struct rcu_head rcu;
> +        atomic_t call_rcu_in_progress;
> +        struct llist_head free_llist_extra_rcu;
> +
>          /* list of objects to be freed after RCU tasks trace GP */
>          struct llist_head free_by_rcu_ttrace;
>          struct llist_node *free_by_rcu_ttrace_tail;
> @@ -340,6 +349,56 @@ static void free_bulk(struct bpf_mem_cache *c)
>          do_call_rcu_ttrace(tgt);
>  }
>
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +        struct bpf_mem_cache *tgt = c->tgt;
> +        struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
> +
> +        if (!llnode)
> +                goto out;
> +
> +        if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +                tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> +
> +        /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +        do_call_rcu_ttrace(tgt);
> +out:
> +        atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
> +static void check_free_by_rcu(struct bpf_mem_cache *c)
> +{
> +        struct llist_node *llnode, *t;
> +
> +        if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
> +                return;
> +
> +        /* drain free_llist_extra_rcu */
> +        llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> +                if (__llist_add(llnode, &c->free_by_rcu))
> +                        c->free_by_rcu_tail = llnode;
> +
> +        if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> +                /*
> +                 * Instead of kmalloc-ing new rcu_head and triggering 10k
> +                 * call_rcu() to hit rcutree.qhimark and force RCU to notice
> +                 * the overload just ask RCU to hurry up. There could be many
> +                 * objects in free_by_rcu list.
> +                 * This hint reduces memory consumption for an artifical
> +                 * benchmark from 2 Gbyte to 150 Mbyte.
> +                 */
> +                rcu_request_urgent_qs_task(current);
> +                return;
> +        }
> +
> +        WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +
> +        WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
> +        c->waiting_for_gp_tail = c->free_by_rcu_tail;
> +        call_rcu_hurry(&c->rcu, __free_by_rcu);
> +}
> +
>  static void bpf_mem_refill(struct irq_work *work)
>  {
>          struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
> @@ -354,6 +413,8 @@ static void bpf_mem_refill(struct irq_work *work)
>                  alloc_bulk(c, c->batch, NUMA_NO_NODE);
>          else if (cnt > c->high_watermark)
>                  free_bulk(c);
> +
> +        check_free_by_rcu(c);
>  }
>
>  static void notrace irq_work_raise(struct bpf_mem_cache *c)
> @@ -482,6 +543,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
>          free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
>          free_all(__llist_del_all(&c->free_llist), percpu);
>          free_all(__llist_del_all(&c->free_llist_extra), percpu);
> +        free_all(__llist_del_all(&c->free_by_rcu), percpu);
> +        free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
> +        free_all(llist_del_all(&c->waiting_for_gp), percpu);
>  }
>
>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> @@ -494,8 +558,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>
>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  {
> -        /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> -         * still execute. Wait for it now before we freeing percpu caches.
> +        /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> +         * might still execute. Wait for them.
>           *
>           * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>           * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
> @@ -504,9 +568,10 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
>           * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
>           * using rcu_trace_implies_rcu_gp() as well.
>           */
> -        rcu_barrier_tasks_trace();
> +        rcu_barrier(); /* wait for __free_by_rcu() */
> +        rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */

Using rcu_barrier_tasks_trace() and rcu_barrier() is not enough: the objects in
c->free_by_rcu_ttrace may be leaked, as shown below. We may need to add an extra
variable to notify __free_by_rcu() to free these elements directly instead of
trying to move them into waiting_for_gp_ttrace as I did before. Or we could just
drain free_by_rcu_ttrace twice.

destroy process                              __free_by_rcu
llist_del_all(&c->free_by_rcu_ttrace)
                                             // add to free_by_rcu_ttrace again
                                             llist_add_batch(..., &tgt->free_by_rcu_ttrace)
                                             do_call_rcu_ttrace()
                                                 // call_rcu_ttrace_in_progress is 1,
                                                 // so xchg returns 1 and the list will
                                                 // not be moved to waiting_for_gp_ttrace
                                                 atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
// got 1
atomic_read(&c->call_rcu_ttrace_in_progress)
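To make the "extra variable" idea a bit more concrete, it could look roughly
like the sketch below (completely untested; the "draining" field name is made
up here, and bpf_mem_alloc_destroy() would have to set it before draining the
lists):

static void __free_by_rcu(struct rcu_head *head)
{
        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
        struct bpf_mem_cache *tgt = c->tgt;
        struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);

        if (!llnode)
                goto out;

        /* Hypothetical flag: destroy has already started, so don't feed
         * tgt->free_by_rcu_ttrace again; free the objects right away and
         * let the destroy path provide the remaining grace periods.
         */
        if (READ_ONCE(tgt->draining)) {
                free_all(llnode, !!tgt->percpu_size);
                goto out;
        }

        if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
                tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;

        /* Objects went through regular RCU GP. Send them to RCU tasks trace */
        do_call_rcu_ttrace(tgt);
out:
        atomic_set(&c->call_rcu_in_progress, 0);
}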
>          if (!rcu_trace_implies_rcu_gp())
> -                rcu_barrier();
> +                rcu_barrier(); /* wait for __free_rcu() via call_rcu */
>          free_mem_alloc_no_barrier(ma);
>  }
>
> @@ -565,6 +630,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>                          irq_work_sync(&c->refill_work);
>                          drain_mem_cache(c);
>                          rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +                        rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>                  }
>                  /* objcg is the same across cpus */
>                  if (c->objcg)
> @@ -580,6 +646,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>                                  irq_work_sync(&c->refill_work);
>                                  drain_mem_cache(c);
>                                  rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +                                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);

I got an oops in rcu_tasks_invoke_cbs() during stress testing, and it seems we
should do atomic_read(&call_rcu_in_progress) first and then
atomic_read(&call_rcu_ttrace_in_progress) to fix the problem. And to prevent
memory reordering on non-x86 hosts, a memory barrier (e.g.,
smp_mb__before_atomic()) is also needed between these two reads. Otherwise it
is possible that there are in-flight RCU callbacks but we don't wait for them,
as shown in the scenario below:

destroy process                              __free_by_rcu
// got 0
atomic_read(&c->call_rcu_ttrace_in_progress)
                                             do_call_rcu_ttrace()
                                                 atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
                                             atomic_set(&c->call_rcu_in_progress, 0)
// also got 0
atomic_read(&c->call_rcu_in_progress)

The introduction of c->tgt makes the destroy procedure more complicated. Even
with the proposed fix above, there is still an oops in rcu_tasks_invoke_cbs(),
and I think it happens as follows:

bpf_mem_alloc_destroy                        __free_by_rcu for c1

// both in_progress counters are 0
read c0->call_rcu_in_progress
read c0->call_rcu_ttrace_in_progress
                                             // c1->tgt = c0
                                             // c1->call_rcu_in_progress == 1
                                             add to c0->free_by_rcu_ttrace
                                             xchg(c0->call_rcu_ttrace_in_progress, 1)
                                             call_rcu_tasks_trace(c0)
                                             c1->call_rcu_in_progress = 0
// both in_progress counters are 0
read c1->call_rcu_in_progress
read c1->call_rcu_ttrace_in_progress
// BAD! There is still an in-flight tasks trace RCU callback
free_mem_alloc_no_barrier()

My original thought was to do "rcu_in_progress +=
atomic_read(&c->tgt->call_rcu_ttrace_in_progress)" after "rcu_in_progress +=
atomic_read(&c->call_rcu_ttrace_in_progress)" in bpf_mem_alloc_destroy(), but
c->tgt is changed in unit_free(), so c->tgt may already have been changed to B
after the setting of A->call_rcu_ttrace_in_progress. I think we could read
c->call_rcu_ttrace_in_progress for all bpf_mem_cache instances again once the
sum of c->call_rcu_in_progress and c->call_rcu_ttrace_in_progress over all
bpf_mem_cache instances is 0, because when rcu_in_progress is zero, the setting
of c->tgt->call_rcu_ttrace_in_progress must have been completed.
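For the ma->cache branch of bpf_mem_alloc_destroy(), the idea would look
roughly like this (untested sketch; the exact barrier and the shape of the
second pass are open for discussion, and the ma->caches branch would need the
same treatment for each of its bpf_mem_cache instances):

        rcu_in_progress = 0;
        for_each_possible_cpu(cpu) {
                c = per_cpu_ptr(ma->cache, cpu);
                irq_work_sync(&c->refill_work);
                drain_mem_cache(c);
                /* read call_rcu_in_progress before the ttrace counter */
                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                smp_mb(); /* full barrier; a weaker one may do, see above */
                rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
        }
        /* Once no __free_by_rcu() is pending, c->tgt->call_rcu_ttrace_in_progress
         * can no longer be set behind our back, so re-check it to catch the
         * c1->tgt == c0 case above.
         */
        if (!rcu_in_progress)
                for_each_possible_cpu(cpu)
                        rcu_in_progress +=
                                atomic_read(&per_cpu_ptr(ma->cache, cpu)->call_rcu_ttrace_in_progress);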
>                          }
>                  }
>                  if (c->objcg)
> @@ -664,6 +731,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
>          irq_work_raise(c);
>  }
>
> +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
> +{
> +        struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +        unsigned long flags;
> +
> +        c->tgt = *(struct bpf_mem_cache **)llnode;
> +
> +        local_irq_save(flags);
> +        if (local_inc_return(&c->active) == 1) {
> +                if (__llist_add(llnode, &c->free_by_rcu))
> +                        c->free_by_rcu_tail = llnode;
> +        } else {
> +                llist_add(llnode, &c->free_llist_extra_rcu);
> +        }
> +        local_dec(&c->active);
> +        local_irq_restore(flags);
> +
> +        if (!atomic_read(&c->call_rcu_in_progress))
> +                irq_work_raise(c);
> +}
> +
>  /* Called from BPF program or from sys_bpf syscall.
>   * In both cases migration is disabled.
>   */
> @@ -697,6 +785,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
>          unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
>  }
>
> +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        int idx;
> +
> +        if (!ptr)
> +                return;
> +
> +        idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
> +        if (idx < 0)
> +                return;
> +
> +        unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> +}
> +
>  void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
>  {
>          void *ret;
> @@ -713,6 +815,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
>          unit_free(this_cpu_ptr(ma->cache), ptr);
>  }
>
> +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        if (!ptr)
> +                return;
> +
> +        unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
> +}
> +
>  /* Directly does a kfree() without putting 'ptr' back to the free_llist
>   * for reuse and without waiting for a rcu_tasks_trace gp.
>   * The caller must first go through the rcu_tasks_trace gp for 'ptr'