Re: [PATCH v3 bpf-next 1/2] bpf: Fix deadlocks in queue and stack maps

On Thu, 16 May 2024 at 21:53, Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote:
>
> Hi,
>
> On 5/14/2024 8:40 PM, Siddharth Chintamaneni wrote:
> > This patch is a revised version that addresses a possible deadlock in
> > the queue and stack map types.
> >
> > A deadlock can occur when a nested BPF program tries to acquire the
> > same lock that the parent BPF program already holds, in order to
> > perform a write operation on the same map. This bug was also reported
> > by syzbot.
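
(Inline illustration for reviewers: a hedged sketch of the nesting
described above; the attach points are hypothetical and this is not the
syzbot reproducer.)

/*
 * prog_a: bpf_map_push_elem(&q, ...)
 *           -> queue_stack_map_push_elem()
 *                -> raw_spin_lock_irqsave(&qs->lock)    <-- lock taken
 *                     ... prog_b fires here, nested on the same CPU
 *                         through some tracing attach point reached
 *                         inside the critical section ...
 *
 * prog_b: bpf_map_push_elem(&q, ...)
 *           -> queue_stack_map_push_elem()
 *                -> raw_spin_lock_irqsave(&qs->lock)    <-- spins forever
 */
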
> >
> > Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@xxxxxxxxxx/
> > Reported-by: syzbot+8bdfc2c53fb2b63e1871@xxxxxxxxxxxxxxxxxxxxxxxxx
> > Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@xxxxxxxxx>
> > ---
> >  kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
> >  1 file changed, 73 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> > index d869f51ea93a..b5ed76c9ddd7 100644
> > --- a/kernel/bpf/queue_stack_maps.c
> > +++ b/kernel/bpf/queue_stack_maps.c
> > @@ -13,11 +13,13 @@
> >  #define QUEUE_STACK_CREATE_FLAG_MASK \
> >       (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
> >
> > +
> >  struct bpf_queue_stack {
> >       struct bpf_map map;
> >       raw_spinlock_t lock;
> >       u32 head, tail;
> >       u32 size; /* max_entries + 1 */
> > +     int __percpu *map_locked;
> >
> >       char elements[] __aligned(8);
> >  };
> > @@ -78,6 +80,15 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
> >
> >       qs->size = size;
> >
> > +     qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> > +                                             sizeof(int),
> > +                                             sizeof(int),
> > +                                             GFP_USER | __GFP_NOWARN);
> > +     if (!qs->map_locked) {
> > +             bpf_map_area_free(qs);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> > +
> >       raw_spin_lock_init(&qs->lock);
> >
> >       return &qs->map;
> > @@ -88,19 +99,57 @@ static void queue_stack_map_free(struct bpf_map *map)
> >  {
> >       struct bpf_queue_stack *qs = bpf_queue_stack(map);
> >
> > +     free_percpu(qs->map_locked);
> >       bpf_map_area_free(qs);
> >  }
> >
> > +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> > +{
> > +     unsigned long flags;
> > +
> > +     preempt_disable();
> > +     local_irq_save(flags);
> > +     if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> > +             __this_cpu_dec(*(qs->map_locked));
> > +             local_irq_restore(flags);
> > +             preempt_enable();
> > +             return -EBUSY;
> > +     }
> > +
> > +     local_irq_restore(flags);
> > +     preempt_enable();
> > +
> > +     return 0;
> > +}
> > +
> > +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> > +{
> > +     unsigned long flags;
> > +
> > +     preempt_disable();
> > +     local_irq_save(flags);
> > +     __this_cpu_dec(*(qs->map_locked));
> > +     local_irq_restore(flags);
> > +     preempt_enable();
> > +}
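
(Inline note: this mirrors the per-CPU map_locked counter that the
hashtab map already uses against the same kind of re-entrancy. The
intended call pattern around each map operation, with error handling
elided, would be roughly:)

	if (map_lock_inc(qs))	/* re-entered on this CPU: bail out */
		return -EBUSY;
	raw_spin_lock_irqsave(&qs->lock, flags);
	/* ... read/update head, tail and elements ... */
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	map_unlock_dec(qs);
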
> > +
> >  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> >  {
> >       struct bpf_queue_stack *qs = bpf_queue_stack(map);
> >       unsigned long flags;
> >       int err = 0;
> >       void *ptr;
> > +     int ret;
> > +
> > +     ret = map_lock_inc(qs);
> > +     if (ret)
> > +             return ret;
> >
> >       if (in_nmi()) {
> > -             if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> > +             if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> > +                     map_unlock_dec(qs);
> >                       return -EBUSY;
> > +             }
>
> With the percpu map_locked in place, I think the in_nmi() check could
> also be removed. When the BPF program X, which has already acquired
> the lock, is interrupted by an NMI, and the BPF program Y running in
> that NMI tries to acquire the same lock, Y will find map_locked is 1
> and return early.

Agreed. The same thing could be done for the ringbuf as well. I will
fix this in the next revision of both patches.
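
For reference, a minimal sketch of what __queue_map_get could look like
with the in_nmi() special case dropped, assuming the rest of the body
stays as in bpf-next (the other map operations would be simplified the
same way):

static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;
	int ret;

	/* Rejects same-CPU re-entry, including from NMI, so the
	 * raw_spin_trylock_irqsave() dance is no longer needed.
	 */
	ret = map_lock_inc(qs);
	if (ret)
		return ret;

	raw_spin_lock_irqsave(&qs->lock, flags);

	if (queue_stack_map_is_empty(qs)) {
		memset(value, 0, qs->map.value_size);
		err = -ENOENT;
		goto out;
	}

	ptr = &qs->elements[qs->tail * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete) {
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

out:
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	map_unlock_dec(qs);
	return err;
}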
