On Thu, 2012-05-03 at 11:41 +0200, Thomas Gleixner wrote:
> On Fri, 20 Apr 2012, Suresh Siddha wrote:
> > Also, do we really need the workqueue/kthreadd based allocation? Just
> > like the percpu areas getting allocated for each possible cpu during
> > boot, shouldn't we extend this to the per-cpu idle threads too? So
> > something like the appended should be ok too?
>
> The idea is correct, there are just a few problems :)
>
> > Signed-off-by: Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
> > ---
> > diff --git a/kernel/cpu.c b/kernel/cpu.c
> > index 05c46ba..a5144ab 100644
> > --- a/kernel/cpu.c
> > +++ b/kernel/cpu.c
> > @@ -303,10 +303,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
> >
> >          cpu_hotplug_begin();
> >
> > -        ret = smpboot_prepare(cpu);
> > -        if (ret)
> > -                goto out;
> > -
>
> If we failed to allocate an idle_thread for this cpu in smp_init()
> then we unconditionally call __cpu_up() with a NULL pointer. That
> might surprise the arch code :)
>
> Aside of that, we now miss to reinitialize the idle thread. We call
> init_idle() once when we allocate the thread, but not after a cpu
> offline operation. That might leave stuff in weird state.

The second one slipped through and wasn't intentional. Anyway, your
modifications look good.

While I am here, I noticed that we could make the idle task struct
allocations node-aware. The patch for this is appended below.

Thanks.
---
From: Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
Subject: idle: allocate percpu idle tasks from the local node

Use the arch-specific early_cpu_to_node() to find the local node for a
given cpu and use that information when allocating memory for that
cpu's idle task.

Signed-off-by: Suresh Siddha <suresh.b.siddha@xxxxxxxxx>
---
 arch/x86/include/asm/topology.h |    8 +++++---
 arch/x86/mm/numa.c              |    2 +-
 include/asm-generic/topology.h  |    4 ++++
 include/linux/kthread.h         |    9 ++++++++-
 kernel/fork.c                   |    9 ++++++---
 kernel/kthread.c                |    8 +++-----
 6 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index b9676ae..bdbcee2 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -57,12 +57,12 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 extern int __cpu_to_node(int cpu);
 #define cpu_to_node __cpu_to_node
 
-extern int early_cpu_to_node(int cpu);
+extern int __early_cpu_to_node(int cpu);
 
 #else  /* !CONFIG_DEBUG_PER_CPU_MAPS */
 
 /* Same function but used if called before per_cpu areas are setup */
-static inline int early_cpu_to_node(int cpu)
+static inline int __early_cpu_to_node(int cpu)
 {
         return early_per_cpu(x86_cpu_to_node_map, cpu);
 }
@@ -144,7 +144,7 @@ static inline int numa_node_id(void)
  */
 #define numa_node_id numa_node_id
 
-static inline int early_cpu_to_node(int cpu)
+static inline int __early_cpu_to_node(int cpu)
 {
         return 0;
 }
@@ -153,6 +153,8 @@ static inline void setup_node_to_cpumask_map(void) { }
 
 #endif
 
+#define early_cpu_to_node(cpu)  __early_cpu_to_node(cpu)
+
 #include <asm-generic/topology.h>
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa0..142738e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -734,7 +734,7 @@ EXPORT_SYMBOL(__cpu_to_node);
  * Same function as cpu_to_node() but used if called before the
  * per_cpu areas are setup.
  */
-int early_cpu_to_node(int cpu)
+int __early_cpu_to_node(int cpu)
 {
         if (early_per_cpu_ptr(x86_cpu_to_node_map))
                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h
index fc824e2..9ace6da 100644
--- a/include/asm-generic/topology.h
+++ b/include/asm-generic/topology.h
@@ -73,4 +73,8 @@
 
 #endif /* !CONFIG_NUMA || !CONFIG_HAVE_MEMORYLESS_NODES */
 
+#ifndef early_cpu_to_node
+#define early_cpu_to_node(cpu)  ((void)(cpu), -1)
+#endif
+
 #endif /* _ASM_GENERIC_TOPOLOGY_H */
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0714b24..c1f05e3 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -40,7 +40,7 @@ void *kthread_data(struct task_struct *k);
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
 
-extern int tsk_fork_get_node(struct task_struct *tsk);
+extern int tsk_fork_get_node(struct task_struct *tsk, int idle_tsk);
 
 /*
  * Simple work processor based on kthread.
@@ -131,4 +131,11 @@ bool queue_kthread_work(struct kthread_worker *worker,
 void flush_kthread_work(struct kthread_work *work);
 void flush_kthread_worker(struct kthread_worker *worker);
 
+static inline void set_fork_pref_node(int node)
+{
+#ifdef CONFIG_NUMA
+        current->pref_node_fork = node;
+#endif
+}
+
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index ca9a384..108d566 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -253,12 +253,13 @@ int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
         return 0;
 }
 
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig,
+                                           int idle_tsk)
 {
         struct task_struct *tsk;
         struct thread_info *ti;
         unsigned long *stackend;
-        int node = tsk_fork_get_node(orig);
+        int node = tsk_fork_get_node(orig, idle_tsk);
         int err;
 
         prepare_to_copy(orig);
@@ -1165,7 +1166,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 goto fork_out;
 
         retval = -ENOMEM;
-        p = dup_task_struct(current);
+        p = dup_task_struct(current, pid == &init_struct_pid);
         if (!p)
                 goto fork_out;
 
@@ -1531,6 +1532,8 @@ struct task_struct * __cpuinit fork_idle(int cpu)
         struct task_struct *task;
         struct pt_regs regs;
 
+        set_fork_pref_node(early_cpu_to_node(cpu));
+
         task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
                             &init_struct_pid, 0);
         if (!IS_ERR(task)) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de63..3d74ab1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -125,10 +125,10 @@ static int kthread(void *_create)
 }
 
 /* called from do_fork() to get node information for about to be created task */
-int tsk_fork_get_node(struct task_struct *tsk)
+int tsk_fork_get_node(struct task_struct *tsk, int idle_tsk)
 {
 #ifdef CONFIG_NUMA
-        if (tsk == kthreadd_task)
+        if (tsk == kthreadd_task || idle_tsk)
                 return tsk->pref_node_fork;
 #endif
         return numa_node_id();
@@ -138,9 +138,7 @@ static void create_kthread(struct kthread_create_info *create)
 {
         int pid;
 
-#ifdef CONFIG_NUMA
-        current->pref_node_fork = create->node;
-#endif
+        set_fork_pref_node(create->node);
         /* We want our own signal handler (we take no signals by default). */
         pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
         if (pid < 0) {
--
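
As a side note, other architectures could opt in to the new hook the same
way x86 does above: provide an early-boot cpu-to-node lookup and #define
early_cpu_to_node() before including <asm-generic/topology.h>, so the
generic ((void)(cpu), -1) fallback is not used. A rough, untested sketch of
what such an <asm/topology.h> fragment might look like is below; the names
early_node_map_for_cpu[] and arch_early_cpu_to_node() are made up for
illustration and are not part of the patch or any real architecture.

#ifdef CONFIG_NUMA

/*
 * Hypothetical early cpu-to-node table, filled from firmware/ACPI/DT
 * information during early boot, before the per_cpu areas are set up.
 */
extern int early_node_map_for_cpu[NR_CPUS];

static inline int arch_early_cpu_to_node(int cpu)
{
        /* Usable before the per_cpu areas exist. */
        return early_node_map_for_cpu[cpu];
}

/* Override the ((void)(cpu), -1) default from <asm-generic/topology.h>. */
#define early_cpu_to_node(cpu)  arch_early_cpu_to_node(cpu)

#endif /* CONFIG_NUMA */

#include <asm-generic/topology.h>

With the hook defined, fork_idle() above feeds the right node into
set_fork_pref_node(), and dup_task_struct() then allocates the idle task's
task_struct and thread_info on that node. An architecture that leaves the
hook undefined keeps the current behaviour, since the -1 from the generic
fallback means "no node preference" to the node-aware allocators.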