Re: [PATCH] mm: Fix boot crash in mm_alloc()

Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> · Sun, 29 May 2011 09:22:49 -0700

On Sun, May 29, 2011 at 12:22 AM, Ingo Molnar <mingo@xxxxxxx> wrote:
>
> Would be nice to get the fix below into -rc1 as well, it triggers
> rather easily on bootup when CONFIG_CPUMASK_OFFSTACK is turned on.

Looking at that commit de03c72cfce5, it looks odd in other ways too.

For example, it looks like mm_cpumask is always initialized to zero.
That's a bit odd, isn't it, since it *used* to be initialized
statically with this:

-       .cpu_vm_mask    = CPU_MASK_ALL,

which is rather different from zero.

Now, I'm sure the init mm_cpumask doesn't really matter, but I'd have
expected a commentary about it.

I also wonder if that whole conversion to cpumask_var_t was worth it,
since clearly it wasn't very well tested. It results in an extra
allocation at fork() time for the many-cpu case, and I do get the
feeling that we would have been better off keeping the cpumask inside
the mm_struct. Moving it to the end of mm_struct makes sense for the
many-cpu case, but at the same time I end up wondering what it does to
the switch_mm() cache behavior. (And perhaps the TLB flush IPI cache
activity).

Ho humm. I have this suspicion that that whole patch wasn't fully
thought out, and that I should revert it rather than fix the oops.

Or, in fact, we could just do something like the attached (UNTESTED!)
which does the whole "move big allocation to end, but keep the
cpumask_var_t at the beginning, and don't do any extra allocations"
thing.

NOTE NOTE NOTE! Not only is the attached patch untested, but please
see the added FIXME comment about the whole mm_struct
kmem_cache_create(). Right now we always allocate the whole
maximum-sized bitmap.

Comments?

                          Linus
 include/linux/mm_types.h |   14 +++++++++++---
 include/linux/sched.h    |    1 -
 init/main.c              |    2 +-
 kernel/fork.c            |   39 ++++++++++-----------------------------
 4 files changed, 22 insertions(+), 34 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2a78aae78c69..f4e9bb17bdf2 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -243,7 +243,7 @@ struct mm_struct {
 						 * together off init_mm.mmlist, and are protected
 						 * by mmlist_lock
 						 */
-
+	cpumask_var_t cpu_vm_mask_var;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
@@ -311,10 +311,18 @@ struct mm_struct {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
-
-	cpumask_var_t cpu_vm_mask_var;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	struct cpumask cpumask_allocation;
+#endif
 };
 
+static inline void mm_init_cpumask(struct mm_struct *mm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	mm->cpu_vm_mask_var = &mm->cpumask_allocation;
+#endif
+}
+
 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
 static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bcddd0138105..2a8621c4be1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2194,7 +2194,6 @@ static inline void mmdrop(struct mm_struct * mm)
 	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
 		__mmdrop(mm);
 }
-extern int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm);
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
diff --git a/init/main.c b/init/main.c
index d2f1e086bf33..cafba67c13bf 100644
--- a/init/main.c
+++ b/init/main.c
@@ -487,6 +487,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE "%s", linux_banner);
 	setup_arch(&command_line);
 	mm_init_owner(&init_mm, &init_task);
+	mm_init_cpumask(&init_mm);
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
@@ -510,7 +511,6 @@ asmlinkage void __init start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	mm_init();
-	BUG_ON(mm_init_cpumask(&init_mm, 0));
 
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
diff --git a/kernel/fork.c b/kernel/fork.c
index ca406d916713..d30c792a83a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -484,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -538,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -753,6 +730,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -765,9 +743,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
-
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -796,7 +771,6 @@ fail_nomem:
 fail_nocontext:
 	free_cpumask_var(mm->cpu_vm_mask_var);
 
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -1591,6 +1565,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have.  The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);