Re: [PATCH v3 bpf-next 12/13] bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().

Hi,

On 6/29/2023 11:42 AM, Alexei Starovoitov wrote:
> On Wed, Jun 28, 2023 at 7:24 PM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>> Hi,
>>
>> On 6/28/2023 9:56 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>>>
>>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>>> task trace grace period to be freed into slab.
>>>
>>> The life cycle of objects:
>>> alloc: dequeue free_llist
>>> free: enqueue free_llist
>>> free_rcu: enqueue free_by_rcu -> waiting_for_gp
>>> free_llist above high watermark -> free_by_rcu_ttrace
>>> after RCU GP waiting_for_gp -> free_by_rcu_ttrace
>>> free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab
>>>
>>> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
>> SNIP
>>> +static void __free_by_rcu(struct rcu_head *head)
>>> +{
>>> +     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>>> +     struct bpf_mem_cache *tgt = c->tgt;
>>> +     struct llist_node *llnode;
>>> +
>>> +     llnode = llist_del_all(&c->waiting_for_gp);
>>> +     if (!llnode)
>>> +             goto out;
>>> +
>>> +     llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);
>>> +
>>> +     /* Objects went through regular RCU GP. Send them to RCU tasks trace */
>>> +     do_call_rcu_ttrace(tgt);
>> I still get reports of leaked free_by_rcu_ttrace objects without adding any
>> extra hack except using bpf_mem_cache_free_rcu() in htab.
> Please share the steps to repro.
First, I applied the attached patch to check whether any elements are
leaked when free_mem_alloc_no_barrier() is called, then I ran the
following script as a stress test in a KVM VM with 72 CPUs and 64 GB of
memory:


#!/bin/bash

BASE=/all/code/linux_working/

{
        cd $BASE/tools/testing/selftests/bpf
        for x in $(seq 2)
        do
                while true; do ./test_maps &>/dev/null; done &
        done
} &

{
        cd $BASE/samples/bpf
        for y in $(seq 8)
        do
                while true; do ./map_perf_test &>/dev/null; done &
        done
} &

{
        cd $BASE/tools/testing/selftests/bpf
        for z in $(seq 8)
        do
                while true
                do
                        for name in overwrite batch_add_batch_del add_del_on_diff_cpu
                        do
                                ./bench htab-mem -w1 -d5 -a -p32 --use-case $name
                        done
                done &>/dev/null &
                sleep 0.3
        done
} &


>
>> When bpf ma is freed through free_mem_alloc(), the following sequence
>> may lead to leak of free_by_rcu_ttrace:
>>
>> P1: bpf_mem_alloc_destroy()
>>     P2: __free_by_rcu()
>>
>>     // got false
>>     P2: read c->draining
>>
>> P1: c->draining = true
>> P1: llist_del_all(&c->free_by_rcu_ttrace)
>>
>>     // add to free_by_rcu_ttrace again
>>     P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
>>         P2: do_call_rcu_ttrace()
>>             // call_rcu_ttrace_in_progress is 1, so xchg returns 1
>>             // and free_by_rcu_ttrace is not moved to waiting_for_gp_ttrace
>>             P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
>>
>> // got 1
>> P1: atomic_read(&c->call_rcu_ttrace_in_progress)
>> // objects in free_by_rcu_ttrace are leaked
>>
>> I think the race could be fixed by checking c->draining in
>> do_call_rcu_ttrace() when atomic_xchg() returns 1 as shown below:
> If the theory of the bug holds true then the fix makes sense,
> but did you repro it without the fix and fail to repro it with the fix?
> We should not add extra code based on a hunch.
Yes. With only the fix applied, the leak did not occur during the last 13 hours of testing.
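
For reference, a minimal sketch of such a draining check, assuming the
fields and helpers already present in kernel/bpf/memalloc.c (c->draining,
free_by_rcu_ttrace, free_all()), could look like the following; the exact
diff was in my earlier reply and may differ:

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
        struct llist_node *llnode;

        if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
                /* Sketch only: another RCU tasks trace callback is still in
                 * flight. If the allocator is draining, the objects that were
                 * just re-added to free_by_rcu_ttrace would be missed by
                 * bpf_mem_alloc_destroy(), so free them here instead.
                 */
                if (unlikely(READ_ONCE(c->draining))) {
                        llnode = llist_del_all(&c->free_by_rcu_ttrace);
                        free_all(llnode, !!c->percpu_size);
                }
                return;
        }

        /* ... unchanged: move free_by_rcu_ttrace to waiting_for_gp_ttrace
         * and kick off call_rcu_tasks_trace() ...
         */
}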

From 5e44ec29f5aa037ba8d1188ccee456ab3996d03e Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@xxxxxxxxxx>
Date: Wed, 21 Jun 2023 17:17:52 +0800
Subject: [PATCH] bpf: Check leaked objects

---
 kernel/bpf/memalloc.c | 48 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 3081d06a434c..2bdb894392c5 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -565,8 +565,50 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
 	free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
-static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
+static void check_mem_cache(struct bpf_mem_cache *c, bool direct)
+{
+	if (!llist_empty(&c->free_by_rcu_ttrace))
+		pr_warn("leak: free_by_rcu_ttrace %d\n", direct);
+	if (!llist_empty(&c->waiting_for_gp_ttrace))
+		pr_warn("leak: waiting_for_gp_ttrace %d\n", direct);
+	if (!llist_empty(&c->free_llist))
+		pr_warn("leak: free_llist %d\n", direct);
+	if (!llist_empty(&c->free_llist_extra))
+		pr_warn("leak: free_llist_extra %d\n", direct);
+	if (!llist_empty(&c->free_by_rcu))
+		pr_warn("leak: free_by_rcu %d\n", direct);
+	if (!llist_empty(&c->free_llist_extra_rcu))
+		pr_warn("leak: free_llist_extra_rcu %d\n", direct);
+	if (!llist_empty(&c->waiting_for_gp))
+		pr_warn("leak: waiting_for_gp %d\n", direct);
+}
+
+static void check_leaked_objs(struct bpf_mem_alloc *ma, bool direct)
 {
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			check_mem_cache(c, direct);
+		}
+	}
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				check_mem_cache(c, direct);
+			}
+		}
+	}
+}
+
+static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma, bool direct)
+{
+	check_leaked_objs(ma, direct);
 	free_percpu(ma->cache);
 	free_percpu(ma->caches);
 	ma->cache = NULL;
@@ -589,7 +631,7 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
 	if (!rcu_trace_implies_rcu_gp())
 		rcu_barrier();
-	free_mem_alloc_no_barrier(ma);
+	free_mem_alloc_no_barrier(ma, false);
 }
 
 static void free_mem_alloc_deferred(struct work_struct *work)
@@ -608,7 +650,7 @@ static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
 		/* Fast path. No callbacks are pending, hence no need to do
 		 * rcu_barrier-s.
 		 */
-		free_mem_alloc_no_barrier(ma);
+		free_mem_alloc_no_barrier(ma, true);
 		return;
 	}
 
-- 
2.39.2

