On Thu, Sep 7, 2023 at 9:08 AM Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> wrote:
>
> On Thu, Sep 7, 2023 at 9:05 AM Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
> >
> > Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> writes:
> >
> > > On Thu, Sep 7, 2023 at 6:04 AM Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
> > >>
> > >> Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> writes:
> > >>
> > >> > On Thu, 7 Sept 2023 at 12:26, Toke Høiland-Jørgensen <toke@xxxxxxxxxx> wrote:
> > >> >>
> > >> >> +Arnaldo
> > >> >>
> > >> >> > Hi,
> > >> >> >
> > >> >> > Our bpf fuzzer, a customized Syzkaller, triggered a lockdep warning in
> > >> >> > the bpf queue map in v5.15. Since queue_stack_maps.c has no major changes
> > >> >> > since v5.15, we think this should still exist in the latest kernel.
> > >> >> > The bug can be occasionally triggered, and we suspect one of the
> > >> >> > eBPF programs involved to be the following one. We also attached the lockdep
> > >> >> > warning at the end.
> > >> >> >
> > >> >> > #define DEFINE_BPF_MAP_NO_KEY(the_map, TypeOfMap, MapFlags, TypeOfValue, MaxEntries) \
> > >> >> >     struct {                               \
> > >> >> >         __uint(type, TypeOfMap);           \
> > >> >> >         __uint(map_flags, (MapFlags));     \
> > >> >> >         __uint(max_entries, (MaxEntries)); \
> > >> >> >         __type(value, TypeOfValue);        \
> > >> >> >     } the_map SEC(".maps");
> > >> >> >
> > >> >> > DEFINE_BPF_MAP_NO_KEY(map_0, BPF_MAP_TYPE_QUEUE, 0 | BPF_F_WRONLY, struct_0, 162);
> > >> >> > SEC("perf_event")
> > >> >> > int func(struct bpf_perf_event_data *ctx) {
> > >> >> >     char v0[96] = {};
> > >> >> >     uint64_t v1 = 0;
> > >> >> >     v1 = bpf_map_pop_elem(&map_0, v0);
> > >> >> >     return 163819661;
> > >> >> > }
> > >> >> >
> > >> >> >
> > >> >> > The program is attached to the following perf event.
> > >> >> >
> > >> >> > struct perf_event_attr attr_type_hw = {
> > >> >> >     .type = PERF_TYPE_HARDWARE,
> > >> >> >     .config = PERF_COUNT_HW_CPU_CYCLES,
> > >> >> >     .sample_freq = 50,
> > >> >> >     .inherit = 1,
> > >> >> >     .freq = 1,
> > >> >> > };
> > >> >> >
> > >> >> > ================================
> > >> >> > WARNING: inconsistent lock state
> > >> >> > 5.15.26+ #2 Not tainted
> > >> >> > --------------------------------
> > >> >> > inconsistent {INITIAL USE} -> {IN-NMI} usage.
> > >> >> > syz-executor.5/19749 [HC1[1]:SC0[0]:HE0:SE1] takes:
> > >> >> > ffff88804c9fc198 (&qs->lock){..-.}-{2:2}, at: __queue_map_get+0x31/0x250
> > >> >> > {INITIAL USE} state was registered at:
> > >> >> >   lock_acquire+0x1a3/0x4b0
> > >> >> >   _raw_spin_lock_irqsave+0x48/0x60
> > >> >> >   __queue_map_get+0x31/0x250
> > >> >> >   bpf_prog_577904e86c81dead_func+0x12/0x4b4
> > >> >> >   trace_call_bpf+0x262/0x5d0
> > >> >> >   perf_trace_run_bpf_submit+0x91/0x1c0
> > >> >> >   perf_trace_sched_switch+0x46c/0x700
> > >> >> >   __schedule+0x11b5/0x24a0
> > >> >> >   schedule+0xd4/0x270
> > >> >> >   futex_wait_queue_me+0x25f/0x520
> > >> >> >   futex_wait+0x1e0/0x5f0
> > >> >> >   do_futex+0x395/0x1890
> > >> >> >   __x64_sys_futex+0x1cb/0x480
> > >> >> >   do_syscall_64+0x3b/0xc0
> > >> >> >   entry_SYSCALL_64_after_hwframe+0x44/0xae
> > >> >> > irq event stamp: 13640
> > >> >> > hardirqs last enabled at (13639): [<ffffffff95eb2bf4>] _raw_spin_unlock_irq+0x24/0x40
> > >> >> > hardirqs last disabled at (13640): [<ffffffff95eb2d4d>] _raw_spin_lock_irqsave+0x5d/0x60
> > >> >> > softirqs last enabled at (13464): [<ffffffff93e26de5>] __sys_bpf+0x3e15/0x4e80
> > >> >> > softirqs last disabled at (13462): [<ffffffff93e26da3>] __sys_bpf+0x3dd3/0x4e80
> > >> >> >
> > >> >> > other info that might help us debug this:
> > >> >> >  Possible unsafe locking scenario:
> > >> >> >
> > >> >> >        CPU0
> > >> >> >        ----
> > >> >> >   lock(&qs->lock);
> > >> >> >   <Interrupt>
> > >> >> >     lock(&qs->lock);
> > >> >>
> > >> >> Hmm, so that lock() uses raw_spin_lock_irqsave(), which *should* be
> > >> >> disabling interrupts entirely for the critical section. But I guess a
> > >> >> Perf hardware event can still trigger? Which seems like it would
> > >> >> potentially wreak havoc with lots of things, not just this queue map
> > >> >> function?
> > >> >>
> > >> >> No idea how to protect against this, though. Hoping Arnaldo knows? :)
> > >> >>
> > >> >
> > >> > The locking should probably be protected by a percpu integer counter,
> > >> > incremented and decremented before and after the lock is taken,
> > >> > respectively. If it is already non-zero, then -EBUSY should be
> > >> > returned. It is similar to what htab_lock_bucket protects against in
> > >> > hashtab.c.
> > >>
> > >> Ah, neat! Okay, seems straight-forward enough to replicate. Hsin, could
> > >> you please check if the patch below gets rid of the splat?
> > >
> > > Instead of adding all this complexity for a map that is so rarely used,
> > > it's easier to disallow it for perf_event prog types.
> > > Or add a single if (in_nmi()) return -EBUSY.
> >
> > Heh, I was actually thinking the "nmi-safe lock" thing might be
> > something that could be neatly encapsulated into the lock library
> > helpers. But OK, so you mean just something like the below, then?
>
> Yep.
> In bpf timers we do:
> if (in_nmi())
>         return -EOPNOTSUPP;
> while in ringbuf we have:
> if (in_nmi()) {
>         if (!spin_trylock_irqsave(&rb->spinlock, flags))
>                 return NULL;
> } else {
>         spin_lock_irqsave(&rb->spinlock, flags);
> }
>
> I think both options are fine to use with the queue/stack map.
> The trylock approach is probably less drastic.
>

I tried the trylock approach as shown below. The fuzzer no longer
triggers the lockdep warning.
-Hsin-Wei

diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f9c734aaa990..ef95b796a0fa 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -103,7 +103,12 @@ static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
 	int err = 0;
 	void *ptr;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, flags);
+	}
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -133,7 +138,12 @@ static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
 	void *ptr;
 	u32 index;
 
-	raw_spin_lock_irqsave(&qs->lock, flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, flags);
+	}
 
 	if (queue_stack_map_is_empty(qs)) {
 		memset(value, 0, qs->map.value_size);
@@ -198,7 +208,12 @@ static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&qs->lock, irq_flags);
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+			return -EBUSY;
+	} else {
+		raw_spin_lock_irqsave(&qs->lock, irq_flags);
+	}
 
 	if (queue_stack_map_is_full(qs)) {
 		if (!replace) {

> > I'll send a proper patch for that later if no one objects (or beats me
> > to it) :)
> >
> > -Toke
> >
> > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> > index 8d2ddcb7566b..138525424745 100644
> > --- a/kernel/bpf/queue_stack_maps.c
> > +++ b/kernel/bpf/queue_stack_maps.c
> > @@ -98,6 +98,9 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> >  	int err = 0;
> >  	void *ptr;
> >  
> > +	if (in_nmi())
> > +		return -EBUSY;
> > +
> >  	raw_spin_lock_irqsave(&qs->lock, flags);
> >  
> >  	if (queue_stack_map_is_empty(qs)) {
> > @@ -128,6 +131,9 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
> >  	void *ptr;
> >  	u32 index;
> >  
> > +	if (in_nmi())
> > +		return -EBUSY;
> > +
> >  	raw_spin_lock_irqsave(&qs->lock, flags);
> >  
> >  	if (queue_stack_map_is_empty(qs)) {
> > @@ -193,6 +199,9 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
> >  	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
> >  		return -EINVAL;
> >  
> > +	if (in_nmi())
> > +		return -EBUSY;
> > +
> >  	raw_spin_lock_irqsave(&qs->lock, irq_flags);
> >  
> >  	if (queue_stack_map_is_full(qs)) {
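For reference, a rough sketch of the percpu-counter guard Kumar describes
earlier in the thread (modelled loosely on htab_lock_bucket() in
kernel/bpf/hashtab.c) might look like the following. This is not the patch
Toke actually posted; the qs->map_locked field (an int __percpu * that
would have to be allocated at map creation time) and the helper names are
assumptions for illustration only.

/*
 * Sketch of a percpu-counter guard for the queue/stack map, in the
 * style of htab_lock_bucket(). qs->map_locked is a hypothetical
 * per-CPU counter; it is not a field of struct bpf_queue_stack today.
 */
static inline int queue_stack_map_lock(struct bpf_queue_stack *qs,
				       unsigned long *pflags)
{
	unsigned long flags;

	preempt_disable();
	/* Refuse re-entry on this CPU (e.g. from NMI) instead of deadlocking. */
	if (unlikely(__this_cpu_inc_return(*qs->map_locked) != 1)) {
		__this_cpu_dec(*qs->map_locked);
		preempt_enable();
		return -EBUSY;
	}

	raw_spin_lock_irqsave(&qs->lock, flags);
	*pflags = flags;
	return 0;
}

static inline void queue_stack_map_unlock(struct bpf_queue_stack *qs,
					  unsigned long flags)
{
	raw_spin_unlock_irqrestore(&qs->lock, flags);
	__this_cpu_dec(*qs->map_locked);
	preempt_enable();
}

Each map operation would then call queue_stack_map_lock() in place of
raw_spin_lock_irqsave() and bail out with -EBUSY on re-entry, mirroring
what the hashtab code does.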