Re: [PATCH bpf v3 4/6] bpf: Optimize the free of inner map

Hi Yonghong,

On 11/26/2023 3:13 PM, Yonghong Song wrote:
>
> On 11/24/23 6:30 AM, Hou Tao wrote:
>> From: Hou Tao <houtao1@xxxxxxxxxx>
>>
>> When an inner map is removed from the outer map, the inner map is
>> freed after one RCU grace period and one RCU tasks trace grace
>> period, so it is certain that any bpf program which may access the
>> inner map has exited before the inner map is freed.
>>
>> However, not all of that waiting is necessary: no RCU grace period
>> is needed if the outer map is only accessed by userspace, only one
>> RCU grace period is needed if it is only accessed by non-sleepable
>> programs, and only one RCU tasks trace grace period is needed if it
>> is only accessed by sleepable programs. So record the sleepable
>> attributes of the owning bpf programs when adding the outer map into
>> env->used_maps, copy the recorded attributes to the inner map
>> atomically when removing the inner map from the outer map, and use
>> the recorded attributes in the inner map to decide which, and how
>> many, RCU grace periods are needed when freeing the inner map.
>>
>> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
>> ---
>>   include/linux/bpf.h     |  8 +++++++-
>>   kernel/bpf/map_in_map.c | 19 ++++++++++++++-----
>>   kernel/bpf/syscall.c    | 15 +++++++++++++--
>>   kernel/bpf/verifier.c   |  4 ++++
>>   4 files changed, 38 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 15a6bb951b70..c5b549f352d7 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -245,6 +245,11 @@ struct bpf_list_node_kern {
>>       void *owner;
>>   } __attribute__((aligned(8)));
>>
>> +enum {
>> +    BPF_MAP_RCU_GP = BIT(0),
>> +    BPF_MAP_RCU_TT_GP = BIT(1),
>> +};
>> +
>>   struct bpf_map {
>>       /* The first two cachelines with read-mostly members of which some
>>        * are also accessed in fast-path (e.g. ops, max_entries).
>> @@ -296,7 +301,8 @@ struct bpf_map {
>>       } owner;
>>       bool bypass_spec_v1;
>>       bool frozen; /* write-once; write-protected by freeze_mutex */
>> -    bool free_after_mult_rcu_gp;
>> +    atomic_t used_in_rcu_gp;
>> +    atomic_t free_by_rcu_gp;
>>       s64 __percpu *elem_count;
>>   };
>>   diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
>> index cf3363065566..d044ee677107 100644
>> --- a/kernel/bpf/map_in_map.c
>> +++ b/kernel/bpf/map_in_map.c
>> @@ -131,12 +131,21 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool deferred)
>>   {
>>       struct bpf_map *inner_map = ptr;
>>
>> -    /* The inner map may still be used by both non-sleepable and sleepable
>> -     * bpf program, so free it after one RCU grace period and one tasks
>> -     * trace RCU grace period.
>> +    /* Defer the freeing of inner map according to the attribute of bpf
>> +     * program which owns the outer map, so unnecessary multiple RCU GP
>> +     * waitings can be avoided.
>>        */
>> -    if (deferred)
>> -        WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true);
>> +    if (deferred) {
>> +        /* used_in_rcu_gp may be updated concurrently by new bpf
>> +         * program, so add smp_mb() to guarantee the order between
>> +         * used_in_rcu_gp and lookup/deletion operation of inner map.
>> +         * If a new bpf program finds the inner map before it is
>> +         * removed from outer map, reading used_in_rcu_gp below will
>> +         * return the newly-set bit set by the new bpf program.
>> +         */
>> +        smp_mb();
>
> smp_mb__before_atomic()?

The memory barrier is needed for the atomic_read(), not for the
atomic_or(), so I think smp_mb() is appropriate.
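
To make the intended pairing concrete, here is a condensed sketch of
the deferred path (abridged from the patch; the comments are my
reasoning, not something I have verified on every architecture):

/* bpf_map_fd_put_ptr(), deferred case */

/* The caller has already removed the inner map from the outer map.
 * The ordering that matters here is:
 *
 *   store: removal of the inner map from the outer map
 *   load:  atomic_read(&map->used_in_rcu_gp)
 *
 * smp_mb__before_atomic() only orders against the following RMW
 * atomic_or(), not against the plain atomic_read() that supplies
 * its first argument, so a full barrier is used to get the
 * store-load ordering.
 */
smp_mb();
atomic_or(atomic_read(&map->used_in_rcu_gp), &inner_map->free_by_rcu_gp);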

>> +        atomic_or(atomic_read(&map->used_in_rcu_gp), &inner_map->free_by_rcu_gp);
>> +    }
>>       bpf_map_put(inner_map);
>>   }
>>   diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 88882cb58121..014a8cd55a41 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -734,7 +734,10 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
>>     static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
>>   {
>> -    if (rcu_trace_implies_rcu_gp())
>> +    struct bpf_map *map = container_of(rcu, struct bpf_map, rcu);
>> +
>> +    if (!(atomic_read(&map->free_by_rcu_gp) & BPF_MAP_RCU_GP) ||
>> +        rcu_trace_implies_rcu_gp())
>>           bpf_map_free_rcu_gp(rcu);
>>       else
>>           call_rcu(rcu, bpf_map_free_rcu_gp);
>> @@ -746,11 +749,16 @@ static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
>>   void bpf_map_put(struct bpf_map *map)
>>   {
>>       if (atomic64_dec_and_test(&map->refcnt)) {
>> +        int free_by_rcu_gp;
>> +
>>           /* bpf_map_free_id() must be called first */
>>           bpf_map_free_id(map);
>>           btf_put(map->btf);
>>   -        if (READ_ONCE(map->free_after_mult_rcu_gp))
>> +        free_by_rcu_gp = atomic_read(&map->free_by_rcu_gp);
>> +        if (free_by_rcu_gp == BPF_MAP_RCU_GP)
>> +            call_rcu(&map->rcu, bpf_map_free_rcu_gp);
>> +        else if (free_by_rcu_gp)
>>               call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
>>           else
>>               bpf_map_free_in_work(map);
>> @@ -5343,6 +5351,9 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
>>           goto out_unlock;
>>       }
>>
>> +    /* No need to update used_in_rcu_gp, because the bpf program doesn't
>> +     * access the map.
>> +     */
>>       memcpy(used_maps_new, used_maps_old,
>>              sizeof(used_maps_old[0]) * prog->aux->used_map_cnt);
>>       used_maps_new[prog->aux->used_map_cnt] = map;
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 6da370a047fe..3b86c02077f1 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -18051,6 +18051,10 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
>>                   return -E2BIG;
>>               }
>>
>> +            atomic_or(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP : BPF_MAP_RCU_GP,
>> +                  &map->used_in_rcu_gp);
>> +            /* Pairs with smp_mb() in bpf_map_fd_put_ptr() */
>> +            smp_mb__before_atomic();
>
> smp_mb__after_atomic()?

smp_mb__after_atomic() is indeed better, because it doesn't depend on
the implementation of bpf_map_inc() below. I will use it in the next
version.
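
In other words, the v4 hunk would look something like this (untested
sketch):

	atomic_or(env->prog->aux->sleepable ? BPF_MAP_RCU_TT_GP : BPF_MAP_RCU_GP,
		  &map->used_in_rcu_gp);
	/* Pairs with smp_mb() in bpf_map_fd_put_ptr() */
	smp_mb__after_atomic();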
>
> Just curious, are the two smp_mb*() memory barriers in this patch
> truly necessary, or are they just out of caution?

Martin asked me the same question in [1]. The two memory barriers are
there just to be cautious.

[1]:
https://lore.kernel.org/bpf/467cd7b0-9b41-4db5-9646-9b044db14bf0@xxxxxxxxx/
>
>>               /* hold the map. If the program is rejected by verifier,
>>                * the map will be released by release_maps() or it
>>                * will be used by the valid program until it's unloaded




