Hi,

On 5/14/2024 8:40 PM, Siddharth Chintamaneni wrote:
> This patch is a revised version which addresses a possible deadlock issue in
> queue and stack map types.
>
> Deadlock could happen when a nested BPF program acquires the same lock
> as the parent BPF program to perform a write operation on the same map
> as the first one. This bug is also reported by syzbot.
>
> Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@xxxxxxxxxx/
> Reported-by: syzbot+8bdfc2c53fb2b63e1871@xxxxxxxxxxxxxxxxxxxxxxxxx
> Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@xxxxxxxxx>
> ---
>  kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index d869f51ea93a..b5ed76c9ddd7 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -13,11 +13,13 @@
>  #define QUEUE_STACK_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
>
> +
>  struct bpf_queue_stack {
>  	struct bpf_map map;
>  	raw_spinlock_t lock;
>  	u32 head, tail;
>  	u32 size; /* max_entries + 1 */
> +	int __percpu *map_locked;
>
>  	char elements[] __aligned(8);
>  };
> @@ -78,6 +80,15 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>
>  	qs->size = size;
>
> +	qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> +					      sizeof(int),
> +					      sizeof(int),
> +					      GFP_USER | __GFP_NOWARN);
> +	if (!qs->map_locked) {
> +		bpf_map_area_free(qs);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	raw_spin_lock_init(&qs->lock);
>
>  	return &qs->map;
> @@ -88,19 +99,57 @@ static void queue_stack_map_free(struct bpf_map *map)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>
> +	free_percpu(qs->map_locked);
>  	bpf_map_area_free(qs);
>  }
>
> +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> +		__this_cpu_dec(*(qs->map_locked));
> +		local_irq_restore(flags);
> +		preempt_enable();
> +		return -EBUSY;
> +	}
> +
> +	local_irq_restore(flags);
> +	preempt_enable();
> +
> +	return 0;
> +}
> +
> +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	__this_cpu_dec(*(qs->map_locked));
> +	local_irq_restore(flags);
> +	preempt_enable();
> +}
> +
>  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>  	unsigned long flags;
>  	int err = 0;
>  	void *ptr;
> +	int ret;
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
>
>  	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> +			map_unlock_dec(qs);
>  			return -EBUSY;
> +		}

With the per-CPU map_locked in place, I think the in_nmi() check could also be removed. When a BPF program X that has already acquired the lock is interrupted by an NMI, and the BPF program Y running in that NMI tries to acquire the same lock, Y will find that map_locked is already elevated and return -EBUSY early.
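
Something like the sketch below, for both the NMI and non-NMI paths (untested, only to illustrate the idea; it assumes the rest of your patch calls map_unlock_dec() on the unlock path, which is not visible in the hunk quoted above):

static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;
	int ret;

	ret = map_lock_inc(qs);
	if (ret)
		return ret;

	/* No in_nmi() special case: if the lock is held by a program
	 * interrupted on this CPU, map_lock_inc() has already returned
	 * -EBUSY above; a holder on another CPU will release the lock,
	 * so spinning here is safe.
	 */
	raw_spin_lock_irqsave(&qs->lock, flags);

	if (queue_stack_map_is_empty(qs)) {
		memset(value, 0, qs->map.value_size);
		err = -ENOENT;
		goto out;
	}

	ptr = &qs->elements[qs->tail * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete) {
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

out:
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	map_unlock_dec(qs);
	return err;
}

The -EBUSY semantics for the re-entrant case are preserved, since map_lock_inc() already reports it before the spinlock is touched.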