Patch "bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI" has been added to the 5.8-stable tree

This is a note to let you know that I've just added the patch titled

    bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI

to the 5.8-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     bpf-use-raw_spin_trylock-for-pcpu_freelist_push-pop-.patch
and it can be found in the queue-5.8 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 62238afd8acdf1cb7618260b4f736cffcb8322e9
Author: Song Liu <songliubraving@xxxxxx>
Date:   Mon Oct 5 09:58:38 2020 -0700

    bpf: Use raw_spin_trylock() for pcpu_freelist_push/pop in NMI
    
    [ Upstream commit 39d8f0d1026a990604770a658708f5845f7dbec0 ]
    
    Recent improvements in LOCKDEP highlighted a potential A-A deadlock with
    pcpu_freelist in NMI:
    
    ./tools/testing/selftests/bpf/test_progs -t stacktrace_build_id_nmi
    
    [   18.984807] ================================
    [   18.984807] WARNING: inconsistent lock state
    [   18.984808] 5.9.0-rc6-01771-g1466de1330e1 #2967 Not tainted
    [   18.984809] --------------------------------
    [   18.984809] inconsistent {INITIAL USE} -> {IN-NMI} usage.
    [   18.984810] test_progs/1990 [HC2[2]:SC0[0]:HE0:SE1] takes:
    [   18.984810] ffffe8ffffc219c0 (&head->lock){....}-{2:2}, at: __pcpu_freelist_pop+0xe3/0x180
    [   18.984813] {INITIAL USE} state was registered at:
    [   18.984814]   lock_acquire+0x175/0x7c0
    [   18.984814]   _raw_spin_lock+0x2c/0x40
    [   18.984815]   __pcpu_freelist_pop+0xe3/0x180
    [   18.984815]   pcpu_freelist_pop+0x31/0x40
    [   18.984816]   htab_map_alloc+0xbbf/0xf40
    [   18.984816]   __do_sys_bpf+0x5aa/0x3ed0
    [   18.984817]   do_syscall_64+0x2d/0x40
    [   18.984818]   entry_SYSCALL_64_after_hwframe+0x44/0xa9
    [   18.984818] irq event stamp: 12
    [...]
    [   18.984822] other info that might help us debug this:
    [   18.984823]  Possible unsafe locking scenario:
    [   18.984823]
    [   18.984824]        CPU0
    [   18.984824]        ----
    [   18.984824]   lock(&head->lock);
    [   18.984826]   <Interrupt>
    [   18.984826]     lock(&head->lock);
    [   18.984827]
    [   18.984828]  *** DEADLOCK ***
    [   18.984828]
    [   18.984829] 2 locks held by test_progs/1990:
    [...]
    [   18.984838]  <NMI>
    [   18.984838]  dump_stack+0x9a/0xd0
    [   18.984839]  lock_acquire+0x5c9/0x7c0
    [   18.984839]  ? lock_release+0x6f0/0x6f0
    [   18.984840]  ? __pcpu_freelist_pop+0xe3/0x180
    [   18.984840]  _raw_spin_lock+0x2c/0x40
    [   18.984841]  ? __pcpu_freelist_pop+0xe3/0x180
    [   18.984841]  __pcpu_freelist_pop+0xe3/0x180
    [   18.984842]  pcpu_freelist_pop+0x17/0x40
    [   18.984842]  ? lock_release+0x6f0/0x6f0
    [   18.984843]  __bpf_get_stackid+0x534/0xaf0
    [   18.984843]  bpf_prog_1fd9e30e1438d3c5_oncpu+0x73/0x350
    [   18.984844]  bpf_overflow_handler+0x12f/0x3f0
    
    This is because pcpu_freelist_head.lock is accessed in both NMI and
    non-NMI context. Fix this issue by using raw_spin_trylock() in NMI.
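
    The pattern, as a minimal sketch (illustrative only; the call chain is
    taken from the splat above):

        /* task context: htab_map_alloc() -> pcpu_freelist_pop() */
        raw_spin_lock(&head->lock);

        /* NMI on the same CPU: perf -> BPF -> __bpf_get_stackid() ->
         * pcpu_freelist_pop() takes the same lock and spins on itself:
         * the A-A deadlock.
         */
        raw_spin_lock(&head->lock);

        /* with the fix, NMI context only ever tries the lock: */
        if (raw_spin_trylock(&head->lock)) {
                /* ... pop/push under the lock ... */
                raw_spin_unlock(&head->lock);
        }
        /* on failure, move on to the next per CPU list */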
    
    Since NMI interrupts non-NMI context, when NMI context tries to take a
    raw_spinlock, non-NMI context on the same CPU may already hold that
    lock and cannot run to release it until the NMI returns. On a system
    with N CPUs, there can be N NMIs at the same time, which together may
    find all N per CPU locks held. This is tricky for pcpu_freelist_push():
    unlike _pop(), a failing _push() means leaking memory. The issue is
    more likely to trigger on a non-SMP system, where a single NMI can
    block the only per CPU list.
    
    Fix this issue with an extra list, pcpu_freelist.extralist. The
    extralist primarily takes the node from _push() when raw_spin_trylock()
    has failed on all the per CPU lists. It should be empty most of the
    time. The following table summarizes the behavior of pcpu_freelist in
    NMI and non-NMI context:
    
    non-NMI pop():  use _lock(); check per CPU lists first;
                    if all per CPU lists are empty, check extralist;
                    if extralist is empty, return NULL.
    
    non-NMI push(): use _lock(); only push to per CPU lists.
    
    NMI pop():    use _trylock(); check per CPU lists first;
                  if all per CPU lists are locked or empty, check extralist;
                  if extralist is locked or empty, return NULL.
    
    NMI push():   use _trylock(); check per CPU lists first;
                  if all per CPU lists are locked, try push to extralist;
                  if extralist is also locked, keep trying on per CPU lists.
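
    Note one consequence that is visible to callers: in NMI, pop can now
    return NULL not only when the freelist is empty, but also when every
    list is momentarily locked by another context. Callers already treat a
    failed pop as running out of elements, as in this simplified sketch of
    __bpf_get_stackid() from the splat above:

        node = pcpu_freelist_pop(&smap->freelist);
        if (!node)
                return -ENOMEM; /* empty, or in NMI: everything contended */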
    
    Reported-by: Alexei Starovoitov <ast@xxxxxxxxxx>
    Signed-off-by: Song Liu <songliubraving@xxxxxx>
    Signed-off-by: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
    Acked-by: Martin KaFai Lau <kafai@xxxxxx>
    Link: https://lore.kernel.org/bpf/20201005165838.3735218-1-songliubraving@xxxxxx
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b367430e611c7..3d897de890612 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
 		raw_spin_lock_init(&head->lock);
 		head->first = NULL;
 	}
+	raw_spin_lock_init(&s->extralist.lock);
+	s->extralist.first = NULL;
 	return 0;
 }
 
@@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
 	raw_spin_unlock(&head->lock);
 }
 
+static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
+						struct pcpu_freelist_node *node)
+{
+	if (!raw_spin_trylock(&s->extralist.lock))
+		return false;
+
+	pcpu_freelist_push_node(&s->extralist, node);
+	raw_spin_unlock(&s->extralist.lock);
+	return true;
+}
+
+static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
+					     struct pcpu_freelist_node *node)
+{
+	int cpu, orig_cpu;
+
+	orig_cpu = cpu = raw_smp_processor_id();
+	while (1) {
+		struct pcpu_freelist_head *head;
+
+		head = per_cpu_ptr(s->freelist, cpu);
+		if (raw_spin_trylock(&head->lock)) {
+			pcpu_freelist_push_node(head, node);
+			raw_spin_unlock(&head->lock);
+			return;
+		}
+		cpu = cpumask_next(cpu, cpu_possible_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = 0;
+
+		/* cannot lock any per cpu lock, try extralist */
+		if (cpu == orig_cpu &&
+		    pcpu_freelist_try_push_extra(s, node))
+			return;
+	}
+}
+
 void __pcpu_freelist_push(struct pcpu_freelist *s,
 			struct pcpu_freelist_node *node)
 {
-	struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
-
-	___pcpu_freelist_push(head, node);
+	if (in_nmi())
+		___pcpu_freelist_push_nmi(s, node);
+	else
+		___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
 }
 
 void pcpu_freelist_push(struct pcpu_freelist *s,
@@ -81,7 +121,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
 	}
 }
 
-struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
@@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
 		if (cpu == orig_cpu)
-			return NULL;
+			break;
+	}
+
+	/* per cpu lists are all empty, try extralist */
+	raw_spin_lock(&s->extralist.lock);
+	node = s->extralist.first;
+	if (node)
+		s->extralist.first = node->next;
+	raw_spin_unlock(&s->extralist.lock);
+	return node;
+}
+
+static struct pcpu_freelist_node *
+___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
+{
+	struct pcpu_freelist_head *head;
+	struct pcpu_freelist_node *node;
+	int orig_cpu, cpu;
+
+	orig_cpu = cpu = raw_smp_processor_id();
+	while (1) {
+		head = per_cpu_ptr(s->freelist, cpu);
+		if (raw_spin_trylock(&head->lock)) {
+			node = head->first;
+			if (node) {
+				head->first = node->next;
+				raw_spin_unlock(&head->lock);
+				return node;
+			}
+			raw_spin_unlock(&head->lock);
+		}
+		cpu = cpumask_next(cpu, cpu_possible_mask);
+		if (cpu >= nr_cpu_ids)
+			cpu = 0;
+		if (cpu == orig_cpu)
+			break;
 	}
+
+	/* cannot pop from per cpu lists, try extralist */
+	if (!raw_spin_trylock(&s->extralist.lock))
+		return NULL;
+	node = s->extralist.first;
+	if (node)
+		s->extralist.first = node->next;
+	raw_spin_unlock(&s->extralist.lock);
+	return node;
+}
+
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+	if (in_nmi())
+		return ___pcpu_freelist_pop_nmi(s);
+	return ___pcpu_freelist_pop(s);
 }
 
 struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index fbf8a8a289791..3c76553cfe571 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -13,6 +13,7 @@ struct pcpu_freelist_head {
 
 struct pcpu_freelist {
 	struct pcpu_freelist_head __percpu *freelist;
+	struct pcpu_freelist_head extralist;
 };
 
 struct pcpu_freelist_node {

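For context, the typical life cycle of a pcpu_freelist, modeled loosely on
the BPF hashtab preallocation path (a simplified sketch; error handling and
the embedding of pcpu_freelist_node inside the map element are elided):

	struct pcpu_freelist freelist;
	struct pcpu_freelist_node *node;

	if (pcpu_freelist_init(&freelist))	/* allocates the per cpu heads */
		return -ENOMEM;

	/* carve nr_elems nodes of elem_size bytes out of preallocated buf */
	pcpu_freelist_populate(&freelist, buf, elem_size, nr_elems);

	/* hot path; may run in any context, including NMI after this patch */
	node = pcpu_freelist_pop(&freelist);	/* NULL if empty or contended */
	if (node)
		pcpu_freelist_push(&freelist, node);

	pcpu_freelist_destroy(&freelist);	/* frees the per cpu heads */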

