On 2/18/20 9:44 AM, Yonghong Song wrote:
On 2/16/20 9:23 PM, Hillf Danton wrote:
On Sun, 16 Feb 2020 04:17:09 -0800
syzbot has found a reproducer for the following crash on:
HEAD commit: 2019fc96 Merge
git://git.kernel.org/pub/scm/linux/kernel/g..
git tree: net
console output:
https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_log.txt-3Fx-3D1358bb11e00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=zrgWcBnddWkMWG2zm-9nC8EwvHMsuqw_-EEXwl23XLg&e=
kernel config:
https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_.config-3Fx-3D735296e4dd620b10&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=kbT6Yw89JDoIWSQtlLJ7sjyNoP2Ulud27GNorna1zQk&e=
dashboard link:
https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_bug-3Fextid-3D122b5421d14e68f29cd1&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=U3pdUmrcroaeNsJ9DgFbTlvftQUCUcJ1CW_0NxS8yGA&e=
compiler: gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:
https://urldefense.proofpoint.com/v2/url?u=https-3A__syzkaller.appspot.com_x_repro.syz-3Fx-3D14b67d6ee00000&d=DwIDAg&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=npe_gMkFnfxt6F5dGLs6zsNHWkYM30LkMFOk1_ZR1w8&s=TuSfjosRFQW3ArpQwikTtx-dgLLBSMgJfVKtUltqQBM&e=
IMPORTANT: if you fix the bug, please add the following tag to the
commit:
Reported-by: syzbot+122b5421d14e68f29cd1@xxxxxxxxxxxxxxxxxxxxxxxxx
======================================================
WARNING: possible circular locking dependency detected
5.6.0-rc1-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor.4/13544 is trying to acquire lock:
ffffe8ffffcba0b8 (&loc_l->lock){....}, at: bpf_common_lru_push_free
kernel/bpf/bpf_lru_list.c:516 [inline]
ffffe8ffffcba0b8 (&loc_l->lock){....}, at:
bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
but task is already holding lock:
ffff888094985960 (&htab->buckets[i].lock){....}, at:
__htab_map_lookup_and_delete_batch+0x617/0x1540
kernel/bpf/hashtab.c:1322
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&htab->buckets[i].lock){....}:
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
[inline]
_raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593
__bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220
[inline]
__bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266
bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340
[inline]
bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499
prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
__htab_lru_percpu_map_update_elem+0x67e/0xa90
kernel/bpf/hashtab.c:1069
bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
__do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
__se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
__x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&l->lock){....}:
__raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
_raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151
bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:325
[inline]
bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
bpf_lru_pop_free+0x67f/0x1670 kernel/bpf/bpf_lru_list.c:499
prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
__htab_lru_percpu_map_update_elem+0x67e/0xa90
kernel/bpf/hashtab.c:1069
bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
__do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
__se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
__x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&loc_l->lock){....}:
check_prev_add kernel/locking/lockdep.c:2475 [inline]
check_prevs_add kernel/locking/lockdep.c:2580 [inline]
validate_chain kernel/locking/lockdep.c:2970 [inline]
__lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110
[inline]
_raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
__htab_map_lookup_and_delete_batch+0x8d4/0x1540
kernel/bpf/hashtab.c:1374
htab_lru_map_lookup_and_delete_batch+0x34/0x40
kernel/bpf/hashtab.c:1491
bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
__do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
__se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
__x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&loc_l->lock --> &l->lock --> &htab->buckets[i].lock
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&htab->buckets[i].lock);
lock(&l->lock);
lock(&htab->buckets[i].lock);
lock(&loc_l->lock);
*** DEADLOCK ***
2 locks held by syz-executor.4/13544:
#0: ffffffff89bac240 (rcu_read_lock){....}, at:
__htab_map_lookup_and_delete_batch+0x54b/0x1540
kernel/bpf/hashtab.c:1308
#1: ffff888094985960 (&htab->buckets[i].lock){....}, at:
__htab_map_lookup_and_delete_batch+0x617/0x1540
kernel/bpf/hashtab.c:1322
stack backtrace:
CPU: 0 PID: 13544 Comm: syz-executor.4 Not tainted
5.6.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x197/0x210 lib/dump_stack.c:118
print_circular_bug.isra.0.cold+0x163/0x172
kernel/locking/lockdep.c:1684
check_noncircular+0x32e/0x3e0 kernel/locking/lockdep.c:1808
check_prev_add kernel/locking/lockdep.c:2475 [inline]
check_prevs_add kernel/locking/lockdep.c:2580 [inline]
validate_chain kernel/locking/lockdep.c:2970 [inline]
__lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
__htab_map_lookup_and_delete_batch+0x8d4/0x1540
kernel/bpf/hashtab.c:1374
htab_lru_map_lookup_and_delete_batch+0x34/0x40
kernel/bpf/hashtab.c:1491
bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
__do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
__se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
__x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
entry_SYSCALL_64_after_hwframe+0x49/0xbe
Reclaim hash table elememt outside bucket lock.
Thanks for the following patch. Yes, we do have an potential issue
with the above deadlock if LRU hash map is not preallocated.
I am not a RCU expert, but maybe you could you help clarify
one thing below?
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1259,6 +1259,7 @@ __htab_map_lookup_and_delete_batch(struc
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
+ struct hlist_nulls_node *node_to_free = NULL;
unsigned long flags;
struct htab_elem *l;
struct bucket *b;
@@ -1370,9 +1371,10 @@ again_nocopy:
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
- if (is_lru_map)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
- else
+ if (is_lru_map) {
+ l->hash_node.next = node_to_free;
+ node_to_free = &l->hash_node;
Here, we change "next" pointer. How does this may impact the existing
parallel map lookup which does not need to take bucket pointer?
Thanks for Martin for explanation! I think changing l->hash_node.next is
unsafe here as another thread may execute on a different cpu and
traverse the same list. It will see hash_node.next = NULL and it is
unexpected.
How about the following patch?
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2d182c4ee9d9..246ef0f2e985 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -56,6 +56,7 @@ struct htab_elem {
union {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
+ struct htab_elem *link;
};
};
};
@@ -1256,6 +1257,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size;
+ struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
@@ -1370,9 +1372,14 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
- if (is_lru_map)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
- else
+ if (is_lru_map) {
+ /* l->hnode overlaps with *
l->hash_node.pprev
+ * in memory. l->hash_node.pprev has been
+ * poisoned and nobody should access it.
+ */
+ l->link = node_to_free;
+ node_to_free = l;
+ } else
free_htab_elem(htab, l);
}
dst_key += key_size;
@@ -1380,6 +1387,13 @@ __htab_map_lookup_and_delete_batch(struct bpf_map
*map,
}
raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ while (node_to_free) {
+ l = node_to_free;
+ node_to_free = node_to_free->link;
+ bpf_lru_push_free(&htab->lru, &l->lru_node);
+ }
+
/* If we are not copying data, we can go to next bucket and avoid
* unlocking the rcu.
*/