Re: [PATCH bpf-next 7/7] bpf: Wait for sleepable BPF program in maybe_wait_bpf_programs()

Hou Tao <houtao@xxxxxxxxxxxxxxx> · Mon, 11 Dec 2023 09:17:09 +0800



Hi,

On 12/9/2023 6:30 AM, Andrii Nakryiko wrote:
> On Fri, Dec 8, 2023 at 2:23 AM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>> From: Hou Tao <houtao1@xxxxxxxxxx>
>>
>> Since commit 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in
>> sleepable programs"), sleepable BPF program can use map-in-map, but
>> maybe_wait_bpf_programs() doesn't consider it accordingly.
>>
>> So checking the value of sleepable_refcnt in maybe_wait_bpf_programs(),
>> if the value is not zero, use synchronize_rcu_mult() to wait for both
>> sleepable and non-sleepable BPF programs. But bpf syscall from syscall
>> program is special, because the bpf syscall is called with
>> rcu_read_lock_trace() being held, and there will be dead-lock if
>> synchronize_rcu_mult() is used to wait for the exit of sleepable BPF
>> program, so just skip the waiting of sleepable BPF program for bpf
>> syscall which comes from syscall program.
>>
>> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
>> ---
>>  kernel/bpf/syscall.c | 28 +++++++++++++++++++---------
>>  1 file changed, 19 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index d2641e51a1a7..6b9d7990d95f 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -35,6 +35,7 @@
>>  #include <linux/rcupdate_trace.h>
>>  #include <linux/memcontrol.h>
>>  #include <linux/trace_events.h>
>> +#include <linux/rcupdate_wait.h>
>>
>>  #include <net/netfilter/nf_bpf_link.h>
>>  #include <net/netkit.h>
>> @@ -140,15 +141,24 @@ static u32 bpf_map_value_size(const struct bpf_map *map)
>>                 return  map->value_size;
>>  }
>>
>> -static void maybe_wait_bpf_programs(struct bpf_map *map)
>> +static void maybe_wait_bpf_programs(struct bpf_map *map, bool rcu_trace_lock_held)
>>  {
>> -       /* Wait for any running BPF programs to complete so that
>> -        * userspace, when we return to it, knows that all programs
>> -        * that could be running use the new map value.
>> +       /* Wait for any running non-sleepable and sleepable BPF programs to
>> +        * complete, so that userspace, when we return to it, knows that all
>> +        * programs that could be running use the new map value. However
>> +        * syscall program can also use bpf syscall to update or delete inner
>> +        * map in outer map, and it holds rcu_read_lock_trace() before doing
>> +        * the bpf syscall. If use synchronize_rcu_mult(call_rcu_tasks_trace)
>> +        * to wait for the exit of running sleepable BPF programs, there will
>> +        * be dead-lock, so skip the waiting for syscall program.
>>          */
>>         if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS ||
>> -           map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
>> -               synchronize_rcu();
>> +           map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) {
>> +               if (atomic64_read(&map->sleepable_refcnt) && !rcu_trace_lock_held)
> why is this correct and non-racy without holding used_maps_mutex under
> which this sleepable_refcnt is incremented?

Do you mean bpf_prog_bind_map(), right ? The program which binds with
the map doesn't access it, so if the atomic64_inc() is missed, there
will still be OK. But if the implementation is changed afterwards, there
will be a problem.
>
>> +                       synchronize_rcu_mult(call_rcu, call_rcu_tasks_trace);
>> +               else
>> +                       synchronize_rcu();
>> +       }
>>  }
>>
>>  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
>> @@ -1561,7 +1571,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
>>
>>         err = bpf_map_update_value(map, f.file, key, value, attr->flags);
>>         if (!err)
>> -               maybe_wait_bpf_programs(map);
>> +               maybe_wait_bpf_programs(map, bpfptr_is_kernel(uattr));
>>
>>         kvfree(value);
>>  free_key:
>> @@ -1618,7 +1628,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
>>         rcu_read_unlock();
>>         bpf_enable_instrumentation();
>>         if (!err)
>> -               maybe_wait_bpf_programs(map);
>> +               maybe_wait_bpf_programs(map, bpfptr_is_kernel(uattr));
>>  out:
>>         kvfree(key);
>>  err_put:
>> @@ -4973,7 +4983,7 @@ static int bpf_map_do_batch(union bpf_attr *attr,
>>  err_put:
>>         if (has_write) {
>>                 if (attr->batch.count)
>> -                       maybe_wait_bpf_programs(map);
>> +                       maybe_wait_bpf_programs(map, false);
>>                 bpf_map_write_active_dec(map);
>>         }
>>         fdput(f);
>> --
>> 2.29.2
>>