[PATCH -rt] rt-slab: fix cpu inconsistency case

Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx> · Thu, 20 Mar 2008 11:00:27 -0700

Hi Ingo,

I encountered BUG at mm/slab.c.
It tells me there is an inconsistency case.

One of the panics, it shows obviously exclusive control in slab is not right.

The console log;
Unable to handle kernel paging request at ffff8108cf81d3c8 RIP:
 [<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
PGD 8063 PUD 0
Oops: 0000 [1] PREEMPT SMP
CPU 1
Modules linked in:
Pid: 32268, comm: dcachebench Not tainted 2.6.24.3-rt3 #3
RIP: 0010:[<ffffffff8029e92f>]  [<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
RSP: 0018:ffff810048895e58  EFLAGS: 00010097
RAX: 00000000ffffffff RBX: 0000000000000246 RCX: 0000000000000000
RDX: ffff8100cf81d3b8 RSI: 0000000000000000 RDI: ffff8100cf80ee40
RBP: ffff8100cf80ee40 R08: 0000000000000c62 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000000000d0
R13: ffffffff802aa83d R14: 0000000000000006 R15: 0000000000000000
FS:  00002adc7b6686f0(0000) GS:ffff8100cf81f340(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffff8108cf81d3c8 CR3: 00000000b36e0000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process dcachebench (pid: 32268, threadinfo ffff810048894000, task ffff8100cd248080)
Stack:  0000000000000000 ffff810097d1c000 0000000100000282 0000000000000006
 00000000ffffff9c 0000000000000000 00000000005038b0 ffffffff802aa83d
 ffff8100cf811080 0000000000000006 00000000ffffff9c 0000000000000000
Call Trace:
 [<ffffffff802aa83d>] getname+0x1e/0x1f0
 [<ffffffff802ac38c>] do_rmdir+0x17/0xfd
 [<ffffffff8026fb5f>] audit_syscall_entry+0x163/0x189
 [<ffffffff8020c2f3>] tracesys+0x71/0xe1
 [<ffffffff8020c35e>] tracesys+0xdc/0xe1
---------------------------
| preempt count: 00000001 ]
| 1-level deep critical section nesting:
----------------------------------------
.. [<ffffffff80640737>] .... __spin_trylock+0xe/0x49
.....[<00000000>] ..   ( <= 0x0)
Code: 48 8b 54 c2 18 53 9d 4c 89 e9 44 89 e6 48 89 ef e8 f1 e1 ff
RIP  [<ffffffff8029e92f>] kmem_cache_alloc+0x5e/0xdb
 RSP <ffff810048895e58>


The kernel dump;
(gdb) info thr
  4 process 32262  flush_tlb_others (cpumask={bits = {4}}, mm=0xffff8100cd16d450,
    va=46966178668544) at arch/x86/kernel/smp_64.c:195
  3 process 32325  0xffffffff8020cad0 in invalidate_interrupt3 ()
  2 process 32268  0xffffffff8029e92f in kmem_cache_alloc (cachep=0xffff8100cf80ee40,
    flags=208) at mm/slab.c:3320
* 1 process 32326  0xffffffff806408c4 in __spin_lock (lock=0xffff8100cdc9a4d0)
    at kernel/spinlock.c:333
(gdb) thr 2
[Switching to thread 2 (process 32268)]#0  0xffffffff8029e92f in kmem_cache_alloc (
    cachep=0xffff8100cf80ee40, flags=208) at mm/slab.c:3320
3320    mm/slab.c: No such file or directory.
        in mm/slab.c

0xffffffff8029e903 <kmem_cache_alloc+50>:       callq  0xffffffff8029c292 <check_irq_off>
0xffffffff8029e908 <kmem_cache_alloc+55>:       movslq 0x14(%rsp),%rax
0xffffffff8029e90d <kmem_cache_alloc+60>:       mov    0x0(%rbp,%rax,8),%rdx
0xffffffff8029e912 <kmem_cache_alloc+65>:       mov    (%rdx),%eax
0xffffffff8029e914 <kmem_cache_alloc+67>:       test   %eax,%eax
0xffffffff8029e916 <kmem_cache_alloc+69>:       je     0xffffffff8029e990 <kmem_cache_alloc+191>

1st. %eax, ac->avail is not 0.

0xffffffff8029e918 <kmem_cache_alloc+71>:       lock incl 0xf8(%rbp)
0xffffffff8029e91f <kmem_cache_alloc+78>:       movl   $0x1,0xc(%rdx)
0xffffffff8029e926 <kmem_cache_alloc+85>:       mov    (%rdx),%eax
0xffffffff8029e928 <kmem_cache_alloc+87>:       sub    $0x1,%eax
0xffffffff8029e92b <kmem_cache_alloc+90>:       mov    %eax,(%rdx)
0xffffffff8029e92d <kmem_cache_alloc+92>:       mov    %eax,%eax
0xffffffff8029e92f <kmem_cache_alloc+94>:       mov    0x18(%rdx,%rax,8),%rdx

2nd. %eax, ac->avail = 0xffffffff, it causes this panic.

0xffffffff8029e934 <kmem_cache_alloc+99>:       push   %rbx
0xffffffff8029e935 <kmem_cache_alloc+100>:      popfq

static inline void *
____cache_alloc(struct kmem_cache *cachep, gfp_t flags, int *this_cpu)
{
	void *objp;
	struct array_cache *ac;

	check_irq_off();

	ac = cpu_cache_get(cachep, *this_cpu);
	if (likely(ac->avail)) { ---------------------------> 1st. check
		STATS_INC_ALLOCHIT(cachep);
		ac->touched = 1;
		objp = ac->entry[--ac->avail]; -------------> PANIC HERE
	} else {
		STATS_INC_ALLOCMISS(cachep);
		objp = cache_alloc_refill(cachep, flags, this_cpu);
	}
	return objp;
}

It means that ac->avail is changed by the other cpu.
So I looked for who touch another cpu's data and found the root cause.

I think, calling cache_grow() in cache_alloc_refill() can change cpu,
but cache_grow() doesn't update this_cpu variable in spite of changing cpu.
It makes the kernel access other cpu's data in cache_alloc_refill().

I'm testing this patch now.
---
From: Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx>

On !PREEMPT_RT, an inconsistency case may be occurred.
After releasing cpu by slab_irq_enable_nort() the cpu can be changed.
In slab_irq_disable_nort() new cpu id should be gotten.
If not, it makes data of other cpu corrupted.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@xxxxxxxxxxxxx>
---
 mm/slab.c |   18 +++++++++---------
 1 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 4bf6ca8..7beb378 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -138,8 +138,8 @@
  *
  * (On PREEMPT_RT, these are NOPs, but we have to drop/get the irq locks.)
  */
-# define slab_irq_disable_nort()	local_irq_disable()
-# define slab_irq_enable_nort()		local_irq_enable()
+# define slab_irq_disable_nort(cpu)	slab_irq_disable(cpu)
+# define slab_irq_enable_nort(cpu)	slab_irq_enable(cpu)
 # define slab_irq_disable_rt(flags)	do { (void)(flags); } while (0)
 # define slab_irq_enable_rt(flags)	do { (void)(flags); } while (0)
 # define slab_spin_lock_irq(lock, cpu) \
@@ -160,8 +160,8 @@ DEFINE_PER_CPU_LOCKED(int, slab_irq_locks) = { 0, };
 	do { slab_irq_enable(cpu); (void) (flags); } while (0)
 # define slab_irq_disable_rt(cpu)	slab_irq_disable(cpu)
 # define slab_irq_enable_rt(cpu)	slab_irq_enable(cpu)
-# define slab_irq_disable_nort()	do { } while (0)
-# define slab_irq_enable_nort()		do { } while (0)
+# define slab_irq_disable_nort(cpu)	do { } while (0)
+# define slab_irq_enable_nort(cpu)	do { } while (0)
 # define slab_spin_lock_irq(lock, cpu) \
 		do { slab_irq_disable(cpu); spin_lock(lock); } while (0)
 # define slab_spin_unlock_irq(lock, cpu) \
@@ -2899,7 +2899,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	offset *= cachep->colour_off;
 
 	if (local_flags & __GFP_WAIT)
-		slab_irq_enable_nort();
+		slab_irq_enable_nort(*this_cpu);
 	slab_irq_enable_rt(*this_cpu);
 
 	/*
@@ -2932,7 +2932,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 
 	slab_irq_disable_rt(*this_cpu);
 	if (local_flags & __GFP_WAIT)
-		slab_irq_disable_nort();
+		slab_irq_disable_nort(*this_cpu);
 
 	check_irq_off();
 	spin_lock(&l3->list_lock);
@@ -2948,7 +2948,7 @@ opps1:
 failed:
 	slab_irq_disable_rt(*this_cpu);
 	if (local_flags & __GFP_WAIT)
-		slab_irq_disable_nort();
+		slab_irq_disable_nort(*this_cpu);
 	return 0;
 }
 
@@ -3397,7 +3397,7 @@ retry:
 		 * set and go into memory reserves if necessary.
 		 */
 		if (local_flags & __GFP_WAIT)
-			slab_irq_enable_nort();
+			slab_irq_enable_nort(*this_cpu);
 		slab_irq_enable_rt(*this_cpu);
 
 		kmem_flagcheck(cache, flags);
@@ -3405,7 +3405,7 @@ retry:
 
 		slab_irq_disable_rt(*this_cpu);
 		if (local_flags & __GFP_WAIT)
-			slab_irq_disable_nort();
+			slab_irq_disable_nort(*this_cpu);
 
 		if (obj) {
 			/*
-- 
1.5.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html