Hi,

On 5/14/2024 8:40 PM, Siddharth Chintamaneni wrote:
> This patch is a revised version which addresses a possible deadlock issue in
> queue and stack map types.
>
> Deadlock could happen when a nested BPF program acquires the same lock
> as the parent BPF program to perform a write operation on the same map
> as the first one. This bug is also reported by syzbot.
>
> Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@xxxxxxxxxx/
> Reported-by: syzbot+8bdfc2c53fb2b63e1871@xxxxxxxxxxxxxxxxxxxxxxxxx
> Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@xxxxxxxxx>
> ---
>  kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index d869f51ea93a..b5ed76c9ddd7 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -13,11 +13,13 @@
>  #define QUEUE_STACK_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
>
> +
>  struct bpf_queue_stack {
>  	struct bpf_map map;
>  	raw_spinlock_t lock;
>  	u32 head, tail;
>  	u32 size; /* max_entries + 1 */
> +	int __percpu *map_locked;
>
>  	char elements[] __aligned(8);
>  };
> @@ -78,6 +80,15 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>
>  	qs->size = size;
>
> +	qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> +					      sizeof(int),
> +					      sizeof(int),
> +					      GFP_USER | __GFP_NOWARN);
> +	if (!qs->map_locked) {
> +		bpf_map_area_free(qs);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	raw_spin_lock_init(&qs->lock);
>
>  	return &qs->map;
> @@ -88,19 +99,57 @@ static void queue_stack_map_free(struct bpf_map *map)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>
> +	free_percpu(qs->map_locked);
>  	bpf_map_area_free(qs);
>  }
>
> +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> +		__this_cpu_dec(*(qs->map_locked));
> +		local_irq_restore(flags);
> +		preempt_enable();
> +		return -EBUSY;
> +	}
> +
> +	local_irq_restore(flags);
> +	preempt_enable();
> +
> +	return 0;
> +}
> +
> +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	__this_cpu_dec(*(qs->map_locked));
> +	local_irq_restore(flags);
> +	preempt_enable();
> +}
> +
>  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>  	unsigned long flags;
>  	int err = 0;
>  	void *ptr;
> +	int ret;
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
>
>  	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> +			map_unlock_dec(qs);
>  			return -EBUSY;
> +		}

With the per-CPU map_locked in place, I think the in_nmi() check could also be removed. When a BPF program X that has already acquired the lock is interrupted by an NMI, and the BPF program Y running in that NMI tries to acquire the same lock, Y will find that map_locked is already elevated and return -EBUSY early.
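
Something like the sketch below, for both the NMI and non-NMI paths (untested, only to illustrate the idea; it assumes the rest of your patch calls map_unlock_dec() on the unlock path, which is not visible in the hunk quoted above):

static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;
	int ret;

	ret = map_lock_inc(qs);
	if (ret)
		return ret;

	/* No in_nmi() special case: if the lock is held by a program
	 * interrupted on this CPU, map_lock_inc() has already returned
	 * -EBUSY above; a holder on another CPU will release the lock,
	 * so spinning here is safe.
	 */
	raw_spin_lock_irqsave(&qs->lock, flags);

	if (queue_stack_map_is_empty(qs)) {
		memset(value, 0, qs->map.value_size);
		err = -ENOENT;
		goto out;
	}

	ptr = &qs->elements[qs->tail * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete) {
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

out:
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	map_unlock_dec(qs);
	return err;
}

The -EBUSY semantics for the re-entrant case are preserved, since map_lock_inc() already reports it before the spinlock is touched.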