Hi,

On 6/21/2023 10:32 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
>
> The life cycle of objects:
> alloc: dequeue free_llist
> free: enqeueu free_llist
> free_rcu: enqueue free_by_rcu -> waiting_for_gp
> free_llist above high watermark -> free_by_rcu_ttrace
> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
>  include/linux/bpf_mem_alloc.h |   2 +
>  kernel/bpf/memalloc.c         | 118 ++++++++++++++++++++++++++++++++--
>  2 files changed, 116 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> index 3929be5743f4..d644bbb298af 100644
> --- a/include/linux/bpf_mem_alloc.h
> +++ b/include/linux/bpf_mem_alloc.h
> @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
>  /* kmalloc/kfree equivalent: */
>  void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
>  void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>
>  /* kmem_cache_alloc/free equivalent: */
>  void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
>  void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
>  void bpf_mem_cache_raw_free(void *ptr);
>  void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 10d027674743..4d1002e7b4b5 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -100,6 +100,15 @@ struct bpf_mem_cache {
>          int percpu_size;
>          struct bpf_mem_cache *tgt;
>
> +        /* list of objects to be freed after RCU GP */
> +        struct llist_head free_by_rcu;
> +        struct llist_node *free_by_rcu_tail;
> +        struct llist_head waiting_for_gp;
> +        struct llist_node *waiting_for_gp_tail;
> +        struct rcu_head rcu;
> +        atomic_t call_rcu_in_progress;
> +        struct llist_head free_llist_extra_rcu;
> +
>          /* list of objects to be freed after RCU tasks trace GP */
>          struct llist_head free_by_rcu_ttrace;
>          struct llist_node *free_by_rcu_ttrace_tail;
> @@ -340,6 +349,56 @@ static void free_bulk(struct bpf_mem_cache *c)
>          do_call_rcu_ttrace(tgt);
>  }
>
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +        struct bpf_mem_cache *tgt = c->tgt;
> +        struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
> +
> +        if (!llnode)
> +                goto out;
> +
> +        if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +                tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> +
> +        /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +        do_call_rcu_ttrace(tgt);
> +out:
> +        atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
> +static void check_free_by_rcu(struct bpf_mem_cache *c)
> +{
> +        struct llist_node *llnode, *t;
> +
> +        if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
> +                return;
> +
> +        /* drain free_llist_extra_rcu */
> +        llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> +                if (__llist_add(llnode, &c->free_by_rcu))
> +                        c->free_by_rcu_tail = llnode;
> +
> +        if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> +                /*
> +                 * Instead of kmalloc-ing new rcu_head and triggering 10k
> +                 * call_rcu() to hit rcutree.qhimark and force RCU to notice
> +                 * the overload just ask RCU to hurry up. There could be many
> +                 * objects in free_by_rcu list.
> +                 * This hint reduces memory consumption for an artifical
> +                 * benchmark from 2 Gbyte to 150 Mbyte.
> +                 */
> +                rcu_request_urgent_qs_task(current);
> +                return;
> +        }
> +
> +        WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +
> +        WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
> +        c->waiting_for_gp_tail = c->free_by_rcu_tail;
> +        call_rcu_hurry(&c->rcu, __free_by_rcu);
> +}
> +
>  static void bpf_mem_refill(struct irq_work *work)
>  {
>          struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
> @@ -354,6 +413,8 @@ static void bpf_mem_refill(struct irq_work *work)
>                  alloc_bulk(c, c->batch, NUMA_NO_NODE);
>          else if (cnt > c->high_watermark)
>                  free_bulk(c);
> +
> +        check_free_by_rcu(c);
>  }
>
>  static void notrace irq_work_raise(struct bpf_mem_cache *c)
> @@ -482,6 +543,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
>          free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
>          free_all(__llist_del_all(&c->free_llist), percpu);
>          free_all(__llist_del_all(&c->free_llist_extra), percpu);
> +        free_all(__llist_del_all(&c->free_by_rcu), percpu);
> +        free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
> +        free_all(llist_del_all(&c->waiting_for_gp), percpu);
>  }
>
>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> @@ -494,8 +558,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>
>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  {
> -        /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> -         * still execute. Wait for it now before we freeing percpu caches.
> +        /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> +         * might still execute. Wait for them.
>           *
>           * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>           * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
> @@ -504,9 +568,10 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
>           * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
>           * using rcu_trace_implies_rcu_gp() as well.
>           */
> -        rcu_barrier_tasks_trace();
> +        rcu_barrier(); /* wait for __free_by_rcu() */
> +        rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */

Using rcu_barrier_tasks_trace() and rcu_barrier() is not enough: the objects in
c->free_by_rcu_ttrace may be leaked, as shown below. We may need to add an extra
variable to notify __free_by_rcu() to free these elements directly instead of
trying to move them into waiting_for_gp_ttrace as I did before. Or we could just
drain free_by_rcu_ttrace twice.

destroy process                              __free_by_rcu
llist_del_all(&c->free_by_rcu_ttrace)
                                             // add to free_by_rcu_ttrace again
                                             llist_add_batch(..., &tgt->free_by_rcu_ttrace)
                                             do_call_rcu_ttrace()
                                                 // call_rcu_ttrace_in_progress is 1,
                                                 // so xchg returns 1 and the list will
                                                 // not be moved to waiting_for_gp_ttrace
                                                 atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
// got 1
atomic_read(&c->call_rcu_ttrace_in_progress)
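To make the "extra variable" idea a bit more concrete, it could look roughly
like the sketch below (completely untested; the "draining" field name is made
up here, and bpf_mem_alloc_destroy() would have to set it before draining the
lists):

static void __free_by_rcu(struct rcu_head *head)
{
        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
        struct bpf_mem_cache *tgt = c->tgt;
        struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);

        if (!llnode)
                goto out;

        /* Hypothetical flag: destroy has already started, so don't feed
         * tgt->free_by_rcu_ttrace again; free the objects right away and
         * let the destroy path provide the remaining grace periods.
         */
        if (READ_ONCE(tgt->draining)) {
                free_all(llnode, !!tgt->percpu_size);
                goto out;
        }

        if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
                tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;

        /* Objects went through regular RCU GP. Send them to RCU tasks trace */
        do_call_rcu_ttrace(tgt);
out:
        atomic_set(&c->call_rcu_in_progress, 0);
}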
>          if (!rcu_trace_implies_rcu_gp())
> -                rcu_barrier();
> +                rcu_barrier(); /* wait for __free_rcu() via call_rcu */
>          free_mem_alloc_no_barrier(ma);
>  }
>
> @@ -565,6 +630,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>                          irq_work_sync(&c->refill_work);
>                          drain_mem_cache(c);
>                          rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +                        rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
>                  }
>                  /* objcg is the same across cpus */
>                  if (c->objcg)
> @@ -580,6 +646,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
>                                  irq_work_sync(&c->refill_work);
>                                  drain_mem_cache(c);
>                                  rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> +                                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);

I got an oops in rcu_tasks_invoke_cbs() during stress testing, and it seems we
should do atomic_read(&call_rcu_in_progress) first and then
atomic_read(&call_rcu_ttrace_in_progress) to fix the problem. And to prevent
memory reordering on non-x86 hosts, a memory barrier (e.g.,
smp_mb__before_atomic()) is also needed between these two reads. Otherwise it
is possible that there are in-flight RCU callbacks but we don't wait for them,
as shown in the scenario below:

destroy process                              __free_by_rcu
// got 0
atomic_read(&c->call_rcu_ttrace_in_progress)
                                             do_call_rcu_ttrace()
                                                 atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
                                             atomic_set(&c->call_rcu_in_progress, 0)
// also got 0
atomic_read(&c->call_rcu_in_progress)

The introduction of c->tgt makes the destroy procedure more complicated. Even
with the proposed fix above, there is still an oops in rcu_tasks_invoke_cbs(),
and I think it happens as follows:

bpf_mem_alloc_destroy                        __free_by_rcu for c1

// both in_progress counters are 0
read c0->call_rcu_in_progress
read c0->call_rcu_ttrace_in_progress
                                             // c1->tgt = c0
                                             // c1->call_rcu_in_progress == 1
                                             add to c0->free_by_rcu_ttrace
                                             xchg(c0->call_rcu_ttrace_in_progress, 1)
                                             call_rcu_tasks_trace(c0)
                                             c1->call_rcu_in_progress = 0
// both in_progress counters are 0
read c1->call_rcu_in_progress
read c1->call_rcu_ttrace_in_progress
// BAD! There is still an in-flight tasks trace RCU callback
free_mem_alloc_no_barrier()

My original thought was to do "rcu_in_progress +=
atomic_read(&c->tgt->call_rcu_ttrace_in_progress)" after "rcu_in_progress +=
atomic_read(&c->call_rcu_ttrace_in_progress)" in bpf_mem_alloc_destroy(), but
c->tgt is changed in unit_free(), so c->tgt may already have been changed to B
after the setting of A->call_rcu_ttrace_in_progress. I think we could read
c->call_rcu_ttrace_in_progress for all bpf_mem_cache instances again once the
sum of c->call_rcu_in_progress and c->call_rcu_ttrace_in_progress over all
bpf_mem_cache instances is 0, because when rcu_in_progress is zero, the setting
of c->tgt->call_rcu_ttrace_in_progress must have been completed.
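For the ma->cache branch of bpf_mem_alloc_destroy(), the idea would look
roughly like this (untested sketch; the exact barrier and the shape of the
second pass are open for discussion, and the ma->caches branch would need the
same treatment for each of its bpf_mem_cache instances):

        rcu_in_progress = 0;
        for_each_possible_cpu(cpu) {
                c = per_cpu_ptr(ma->cache, cpu);
                irq_work_sync(&c->refill_work);
                drain_mem_cache(c);
                /* read call_rcu_in_progress before the ttrace counter */
                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                smp_mb(); /* full barrier; a weaker one may do, see above */
                rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
        }
        /* Once no __free_by_rcu() is pending, c->tgt->call_rcu_ttrace_in_progress
         * can no longer be set behind our back, so re-check it to catch the
         * c1->tgt == c0 case above.
         */
        if (!rcu_in_progress)
                for_each_possible_cpu(cpu)
                        rcu_in_progress +=
                                atomic_read(&per_cpu_ptr(ma->cache, cpu)->call_rcu_ttrace_in_progress);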
>                          }
>                  }
>                  if (c->objcg)
> @@ -664,6 +731,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
>          irq_work_raise(c);
>  }
>
> +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
> +{
> +        struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +        unsigned long flags;
> +
> +        c->tgt = *(struct bpf_mem_cache **)llnode;
> +
> +        local_irq_save(flags);
> +        if (local_inc_return(&c->active) == 1) {
> +                if (__llist_add(llnode, &c->free_by_rcu))
> +                        c->free_by_rcu_tail = llnode;
> +        } else {
> +                llist_add(llnode, &c->free_llist_extra_rcu);
> +        }
> +        local_dec(&c->active);
> +        local_irq_restore(flags);
> +
> +        if (!atomic_read(&c->call_rcu_in_progress))
> +                irq_work_raise(c);
> +}
> +
>  /* Called from BPF program or from sys_bpf syscall.
>   * In both cases migration is disabled.
>   */
> @@ -697,6 +785,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
>          unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
>  }
>
> +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        int idx;
> +
> +        if (!ptr)
> +                return;
> +
> +        idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
> +        if (idx < 0)
> +                return;
> +
> +        unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> +}
> +
>  void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
>  {
>          void *ret;
> @@ -713,6 +815,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
>          unit_free(this_cpu_ptr(ma->cache), ptr);
>  }
>
> +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +        if (!ptr)
> +                return;
> +
> +        unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
> +}
> +
>  /* Directly does a kfree() without putting 'ptr' back to the free_llist
>   * for reuse and without waiting for a rcu_tasks_trace gp.
>   * The caller must first go through the rcu_tasks_trace gp for 'ptr'