Re: possible deadlock in bpf_lru_push_free

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Feb 18, 2020 at 3:56 PM Yonghong Song <yhs@xxxxxx> wrote:
>
>
>
> On 2/18/20 9:44 AM, Yonghong Song wrote:
> >
> >
> > On 2/16/20 9:23 PM, Hillf Danton wrote:
> >>
> >> On Sun, 16 Feb 2020 04:17:09 -0800
> >>> syzbot has found a reproducer for the following crash on:
> >>>
> >>> HEAD commit:    2019fc96 Merge
> >>> git://git.kernel.org/pub/scm/linux/kernel/g..
> >>> git tree:       net
> >>> console output:
> >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_log.txt-3Fx-3D1358bb11e00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=zrgWcBnddWkMWG2zm-9nC8EwvHMsuqw_-EEXwl23XLg&e=
> >>>
> >>> kernel config:
> >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_.config-3Fx-3D735296e4dd620b10&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=kbT6Yw89JDoIWSQtlLJ7sjyNoP2Ulud27GNorna1zQk&e=
> >>>
> >>> dashboard link:
> >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D122b5421d14e68f29cd1&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=U3pdUmrcroaeNsJ9DgFbTlvftQUCUcJ1CW_0NxS8yGA&e=
> >>>
> >>> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
> >>> syz repro:
> >>> https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.syz-3Fx-3D14b67d6ee00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=TuSfjosRFQW3ArpQwikTtx-dgLLBSMgJfVKtUltqQBM&e=
> >>>
> >>>
> >>> IMPORTANT: if you fix the bug, please add the following tag to the
> >>> commit:
> >>> Reported-by: syzbot+122b5421d14e68f29cd1@xxxxxxxxxxxxxxxxxxxxxxxxx
> >>>
> >>> ======================================================
> >>> WARNING: possible circular locking dependency detected
> >>> 5.6.0-rc1-syzkaller #0 Not tainted
> >>> ------------------------------------------------------
> >>> syz-executor.4/13544 is trying to acquire lock:
> >>> ffffe8ffffcba0b8 (&loc_l->lock){....}, at: bpf_common_lru_push_free
> >>> kernel/bpf/bpf_lru_list.c:516 [inline]
> >>> ffffe8ffffcba0b8 (&loc_l->lock){....}, at:
> >>> bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
> >>>
> >>> but task is already holding lock:
> >>> ffff888094985960 (&htab->buckets[i].lock){....}, at:
> >>> __htab_map_lookup_and_delete_batch+0x617/0x1540
> >>> kernel/bpf/hashtab.c:1322
> >>>
> >>> which lock already depends on the new lock.
> >>>
> >>>
> >>> the existing dependency chain (in reverse order) is:
> >>>
> >>> -> #2 (&htab->buckets[i].lock){....}:
> >>>         __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
> >>> [inline]
> >>>         _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
> >>>         htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593
> >>>         __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220
> >>> [inline]
> >>>         __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266
> >>>         bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340
> >>> [inline]
> >>>         bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
> >>>         bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499
> >>>         prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
> >>>         __htab_lru_percpu_map_update_elem+0x67e/0xa90
> >>> kernel/bpf/hashtab.c:1069
> >>>         bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
> >>>         bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
> >>>         generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
> >>>         bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
> >>>         __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
> >>>         __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
> >>>         __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
> >>>         do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
> >>>         entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >>>
> >>> -> #1 (&l->lock){....}:
> >>>         __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
> >>>         _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
> >>>         bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:325
> >>> [inline]
> >>>         bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
> >>>         bpf_lru_pop_free+0x67f/0x1670 kernel/bpf/bpf_lru_list.c:499
> >>>         prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
> >>>         __htab_lru_percpu_map_update_elem+0x67e/0xa90
> >>> kernel/bpf/hashtab.c:1069
> >>>         bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
> >>>         bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
> >>>         generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
> >>>         bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
> >>>         __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
> >>>         __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
> >>>         __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
> >>>         do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
> >>>         entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >>>
> >>> -> #0 (&loc_l->lock){....}:
> >>>         check_prev_add kernel/locking/lockdep.c:2475 [inline]
> >>>         check_prevs_add kernel/locking/lockdep.c:2580 [inline]
> >>>         validate_chain kernel/locking/lockdep.c:2970 [inline]
> >>>         __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
> >>>         lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
> >>>         __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
> >>> [inline]
> >>>         _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
> >>>         bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
> >>>         bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
> >>>         __htab_map_lookup_and_delete_batch+0x8d4/0x1540
> >>> kernel/bpf/hashtab.c:1374
> >>>         htab_lru_map_lookup_and_delete_batch+0x34/0x40
> >>> kernel/bpf/hashtab.c:1491
> >>>         bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
> >>>         __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
> >>>         __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
> >>>         __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
> >>>         do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
> >>>         entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >>>
> >>> other info that might help us debug this:
> >>>
> >>> Chain exists of:
> >>>    &loc_l->lock --> &l->lock --> &htab->buckets[i].lock
> >>>
> >>>   Possible unsafe locking scenario:
> >>>
> >>>         CPU0                    CPU1
> >>>         ----                    ----
> >>>    lock(&htab->buckets[i].lock);
> >>>                                 lock(&l->lock);
> >>>                                 lock(&htab->buckets[i].lock);
> >>>    lock(&loc_l->lock);
> >>>
> >>>   *** DEADLOCK ***
> >>>
> >>> 2 locks held by syz-executor.4/13544:
> >>>   #0: ffffffff89bac240 (rcu_read_lock){....}, at:
> >>> __htab_map_lookup_and_delete_batch+0x54b/0x1540
> >>> kernel/bpf/hashtab.c:1308
> >>>   #1: ffff888094985960 (&htab->buckets[i].lock){....}, at:
> >>> __htab_map_lookup_and_delete_batch+0x617/0x1540
> >>> kernel/bpf/hashtab.c:1322
> >>>
> >>> stack backtrace:
> >>> CPU: 0 PID: 13544 Comm: syz-executor.4 Not tainted
> >>> 5.6.0-rc1-syzkaller #0
> >>> Hardware name: Google Google Compute Engine/Google Compute Engine,
> >>> BIOS Google 01/01/2011
> >>> Call Trace:
> >>>   __dump_stack lib/dump_stack.c:77 [inline]
> >>>   dump_stack+0x197/0x210 lib/dump_stack.c:118
> >>>   print_circular_bug.isra.0.cold+0x163/0x172
> >>> kernel/locking/lockdep.c:1684
> >>>   check_noncircular+0x32e/0x3e0 kernel/locking/lockdep.c:1808
> >>>   check_prev_add kernel/locking/lockdep.c:2475 [inline]
> >>>   check_prevs_add kernel/locking/lockdep.c:2580 [inline]
> >>>   validate_chain kernel/locking/lockdep.c:2970 [inline]
> >>>   __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
> >>>   lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
> >>>   __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
> >>>   _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
> >>>   bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
> >>>   bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
> >>>   __htab_map_lookup_and_delete_batch+0x8d4/0x1540
> >>> kernel/bpf/hashtab.c:1374
> >>>   htab_lru_map_lookup_and_delete_batch+0x34/0x40
> >>> kernel/bpf/hashtab.c:1491
> >>>   bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
> >>>   __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
> >>>   __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
> >>>   __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
> >>>   do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
> >>>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> >>
> >> Reclaim hash table elememt outside bucket lock.
> >
> > Thanks for the following patch. Yes, we do have an potential issue
> > with the above deadlock if LRU hash map is not preallocated.
> >
> > I am not a RCU expert, but maybe you could you help clarify
> > one thing below?
> >
> >>
> >> --- a/kernel/bpf/hashtab.c
> >> +++ b/kernel/bpf/hashtab.c
> >> @@ -1259,6 +1259,7 @@ __htab_map_lookup_and_delete_batch(struc
> >>       u64 elem_map_flags, map_flags;
> >>       struct hlist_nulls_head *head;
> >>       struct hlist_nulls_node *n;
> >> +    struct hlist_nulls_node *node_to_free = NULL;
> >>       unsigned long flags;
> >>       struct htab_elem *l;
> >>       struct bucket *b;
> >> @@ -1370,9 +1371,10 @@ again_nocopy:
> >>           }
> >>           if (do_delete) {
> >>               hlist_nulls_del_rcu(&l->hash_node);
> >> -            if (is_lru_map)
> >> -                bpf_lru_push_free(&htab->lru, &l->lru_node);
> >> -            else
> >> +            if (is_lru_map) {
> >> +                l->hash_node.next = node_to_free;
> >> +                node_to_free = &l->hash_node;
> >
> > Here, we change "next" pointer. How does this may impact the existing
> > parallel map lookup which does not need to take bucket pointer?
>
> Thanks for Martin for explanation! I think changing l->hash_node.next is
> unsafe here as another thread may execute on a different cpu and
> traverse the same list. It will see hash_node.next = NULL and it is
> unexpected.
>
> How about the following patch?

I think I'm missing some emails here, but overall the patch looks good to me.
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 2d182c4ee9d9..246ef0f2e985 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -56,6 +56,7 @@ struct htab_elem {
>                          union {
>                                  struct bpf_htab *htab;
>                                  struct pcpu_freelist_node fnode;
> +                               struct htab_elem *link;
>                          };
>                  };
>          };
> @@ -1256,6 +1257,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
> *map,
>          void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
>          void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
>          u32 batch, max_count, size, bucket_size;
> +       struct htab_elem *node_to_free = NULL;
>          u64 elem_map_flags, map_flags;
>          struct hlist_nulls_head *head;
>          struct hlist_nulls_node *n;
> @@ -1370,9 +1372,14 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
> *map,
>                  }
>                  if (do_delete) {
>                          hlist_nulls_del_rcu(&l->hash_node);
> -                       if (is_lru_map)
> -                               bpf_lru_push_free(&htab->lru, &l->lru_node);
> -                       else
> +                       if (is_lru_map) {
> +                               /* l->hnode overlaps with *
> l->hash_node.pprev
> +                                * in memory. l->hash_node.pprev has been
> +                                * poisoned and nobody should access it.
> +                                */
> +                               l->link = node_to_free;
> +                               node_to_free = l;
> +                       } else
>                                  free_htab_elem(htab, l);
>                  }
>                  dst_key += key_size;
> @@ -1380,6 +1387,13 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
> *map,
>          }
>
>          raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> +       while (node_to_free) {
> +               l = node_to_free;
> +               node_to_free = node_to_free->link;
> +               bpf_lru_push_free(&htab->lru, &l->lru_node);
> +       }
> +
>          /* If we are not copying data, we can go to next bucket and avoid
>           * unlocking the rcu.
>           */
>
>




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux