On 03/16/2012 08:10 PM, Peter Zijlstra wrote: > start/stop numa balance threads on-demand using cpu-hotplug. > > Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> > --- > kernel/sched/numa.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 55 insertions(+), 7 deletions(-) > --- a/kernel/sched/numa.c > +++ b/kernel/sched/numa.c > @@ -596,31 +596,79 @@ static int numad_thread(void *data) > return 0; > } > > +static int __cpuinit > +numa_hotplug(struct notifier_block *nb, unsigned long action, void *hcpu) > +{ > + int cpu = (long)hcpu; > + int node = cpu_to_node(cpu); > + struct node_queue *nq = nq_of(node); > + struct task_struct *numad; > + int err = 0; > + > + switch (action & ~CPU_TASKS_FROZEN) { > + case CPU_UP_PREPARE: > + if (nq->numad) > + break; > + > + numad = kthread_create_on_node(numad_thread, > + nq, node, "numad/%d", node); > + if (IS_ERR(numad)) { > + err = PTR_ERR(numad); > + break; > + } > + > + nq->numad = numad; > + nq->next_schedule = jiffies + HZ; // XXX sync-up? 
> + break; > + > + case CPU_ONLINE: > + wake_up_process(nq->numad); > + break; > + > + case CPU_DEAD: > + case CPU_UP_CANCELED: > + if (!nq->numad) > + break; > + > + if (cpumask_any_and(cpu_online_mask, > + cpumask_of_node(node)) >= nr_cpu_ids) { > + kthread_stop(nq->numad); > + nq->numad = NULL; > + } > + break; > + } > + > + return notifier_from_errno(err); > +} > + > static __init int numa_init(void) > { > - int node; > + int node, cpu, err; > > nqs = kzalloc(sizeof(struct node_queue*) * nr_node_ids, GFP_KERNEL); > BUG_ON(!nqs); > > - for_each_node(node) { // XXX hotplug > + for_each_node(node) { > struct node_queue *nq = kmalloc_node(sizeof(*nq), > GFP_KERNEL | __GFP_ZERO, node); > BUG_ON(!nq); > > - nq->numad = kthread_create_on_node(numad_thread, > - nq, node, "numad/%d", node); > - BUG_ON(IS_ERR(nq->numad)); > - > spin_lock_init(&nq->lock); > INIT_LIST_HEAD(&nq->entity_list); > > nq->next_schedule = jiffies + HZ; > nq->node = node; > nqs[node] = nq; > + } > > - wake_up_process(nq->numad); > + get_online_cpus(); > + cpu_notifier(numa_hotplug, 0); ABBA deadlock! CPU 0 CPU1 echo 0/1 > /sys/devices/.../cpu*/online acquire cpu_add_remove_lock get_online_cpus() acquire cpu_hotplug lock Blocked on cpu hotplug lock cpu_notifier() acquire cpu_add_remove_lock ABBA DEADLOCK! [cpu_maps_update_begin/done() deal with cpu_add_remove_lock]. So, basically, at the moment there is no way to register a CPU Hotplug notifier and do setup for all currently online cpus in a totally race-free manner. One approach to fix this is to audit whether register_cpu_notifier() really needs to take cpu_add_remove_lock and if no, then acquire cpu hotplug lock instead. 
The other approach is to keep the existing lock ordering as it is and yet provide a race-free way to register, as I had posted some time ago (incomplete/untested): http://thread.gmane.org/gmane.linux.kernel/1258880/focus=15826 > + for_each_online_cpu(cpu) { > + err = numa_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)cpu); > + BUG_ON(notifier_to_errno(err)); > + numa_hotplug(NULL, CPU_ONLINE, (void *)(long)cpu); > } > + put_online_cpus(); > > return 0; > } > > Regards, Srivatsa S. Bhat -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>