Hi Rusty, I'm testing this now on x86_64 and one question comes up. The initialization of the woc_wq thread happens quite late. Might it be better to initialize it earlier? (I haven't tested all work_on_cpu callers yet, plus there are a couple that are waiting in the queue that I'm also testing.) Oops, one problem just popped up. After running a couple of offline/online tests, this came up: [ 1080.185022] INFO: task kwork_on_cpu:491 blocked for more than 480 seconds. [ 1080.205849] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1080.229745] kwork_on_cpu D 0000000000000000 3432 491 2 [ 1080.229750] ffff88012e043ec0 0000000000000046 000000002e043e60 ffffffff81d5ef00 [ 1080.229754] ffffffff81d5ef00 ffffffff81d5ef00 ffffffff81d5ef00 ffffffff81d5ef00 [ 1080.229757] ffffffff81d5ef00 ffffffff81d58580 ffffffff81d5ef00 ffff88012e03ab40 [ 1080.229760] Call Trace: [ 1080.229770] [<ffffffff8106e241>] ? trace_hardirqs_on+0xd/0xf [ 1080.229774] [<ffffffff810a6bc5>] do_work_on_cpu+0x72/0x13d [ 1080.229778] [<ffffffff8105d994>] ? autoremove_wake_function+0x0/0x3d [ 1080.229781] [<ffffffff810a6b53>] ? do_work_on_cpu+0x0/0x13d [ 1080.229783] [<ffffffff810a6b53>] ? do_work_on_cpu+0x0/0x13d [ 1080.229786] [<ffffffff8105d809>] kthread+0x4e/0x7d [ 1080.229790] [<ffffffff8100d7ba>] child_rip+0xa/0x20 [ 1080.229793] [<ffffffff8100d1bc>] ? restore_args+0x0/0x30 [ 1080.229795] [<ffffffff8105d796>] ? kthreadd+0x167/0x18c [ 1080.229798] [<ffffffff8105d7bb>] ? kthread+0x0/0x7d [ 1080.229801] [<ffffffff8100d7b0>] ? child_rip+0x0/0x20 [ 1080.229802] INFO: lockdep is turned off. Thanks, Mike Rusty Russell wrote: > On Tuesday 27 January 2009 17:55:19 Andrew Morton wrote: > [ lots of good stuff ] > >> Making work_on_cu() truly generic is quite hard! > > Yes. What do you think of this workqueue-less approach? > > Subject: work_on_cpu: non-workqueue solution. > > work_on_cpu was designed to replace the meme of: > > cpumask_t saved = current->cpus_allowed; > set_cpus_allowed(cpumask_of(dst)); > ... do something on cpu dst ... > set_cpus_allowed(saved); > > It should have been a trivial routine, and a trivial conversion. > > The original version did a get_online_cpus(), then put the work on the > kevent queue, called flush_work(), then put_online_cpus(). See > 2d3854a37e8b767a51aba38ed6d22817b0631e33. > > Several locking warnings resulted from different users being converted > over the months this patch was used leading up to 2.6.29. One was > fixed to use smp_call_function_single > (b2bb85549134c005e997e5a7ed303bda6a1ae738), but it's always risky to > convert code which was running in user context into interrupt context. > When another one was reported, we dropped the get_online_cpus() and > relied on the callers to do that (as they should have been doing > before) as seen in 31ad9081200c06ccc350625d41d1f8b2d1cef29f. > > But there was still a locking issue with > arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c, so that conversion was > reverted in Ingo's tree. Turns out that it the conversion caused > work_on_cpu to be called from inside keventd in some paths. > > The obvious solution was to change to using our own workqueue, so we > could use it in ACPI and so it would actually be a generically useful > function as intended. > > Andrew Morton complained about NR_CPUS new threads. Which I happen to > agree with. (He also bitched about the comment being too short, so > this one is a fucking novel). > > While I was at linux.conf.au and avoiding reading my mail, a long > discussion ensued (See lkml "[PATCH 2/3] work_on_cpu: Use our own > workqueue."), including useful advice such as not ever grabbing a lock > inside a work function. Or something; it gets a bit confusing. > > To make the pain stop, this attempts to create work_on_cpu without > using workqueues at all. It does create one new thread. But as long > as you don't call work_on_cpu inside work_on_cpu, or create normal > locking inversions, it should work just peachy. > > FIXME: It includes work_on_cpu.h from workqueue.h to avoid chasing all > the users, but they should include it directly. > > I tested it lightly (as seen in this patch), but my test machine > doesn't hit any work_on_cpu paths. > > Signed-off-by: Rusty Russell <rusty@xxxxxxxxxxxxxxx> > > diff --git a/include/linux/work_on_cpu.h b/include/linux/work_on_cpu.h > new file mode 100644 > --- /dev/null > +++ b/include/linux/work_on_cpu.h > @@ -0,0 +1,13 @@ > +#ifndef _LINUX_WORK_ON_CPU_H > +#define _LINUX_WORK_ON_CPU_H > + > +#ifndef CONFIG_SMP > +static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) > +{ > + return fn(arg); > +} > +#else > +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); > +#endif /* CONFIG_SMP */ > + > +#endif /* _LINUX_WORK_ON_CPU_H */ > diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h > --- a/include/linux/workqueue.h > +++ b/include/linux/workqueue.h > @@ -9,6 +9,7 @@ > #include <linux/linkage.h> > #include <linux/bitops.h> > #include <linux/lockdep.h> > +#include <linux/work_on_cpu.h> > #include <asm/atomic.h> > > struct workqueue_struct; > @@ -251,13 +252,4 @@ void cancel_rearming_delayed_work(struct > { > cancel_delayed_work_sync(work); > } > - > -#ifndef CONFIG_SMP > -static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) > -{ > - return fn(arg); > -} > -#else > -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); > -#endif /* CONFIG_SMP */ > #endif > diff --git a/kernel/Makefile b/kernel/Makefile > --- a/kernel/Makefile > +++ b/kernel/Makefile > @@ -92,6 +92,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ > obj-$(CONFIG_FUNCTION_TRACER) += trace/ > obj-$(CONFIG_TRACING) += trace/ > obj-$(CONFIG_SMP) += sched_cpupri.o > +obj-$(CONFIG_SMP) += work_on_cpu.o > > ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) > # According to Alan Modra <alan@xxxxxxxxxxxxxxxx>, the -fno-omit-frame-pointer is > diff --git a/kernel/work_on_cpu.c b/kernel/work_on_cpu.c > new file mode 100644 > --- /dev/null > +++ b/kernel/work_on_cpu.c > @@ -0,0 +1,108 @@ > +#include <linux/work_on_cpu.h> > +#include <linux/kthread.h> > +#include <linux/mutex.h> > +#include <linux/completion.h> > +#include <linux/cpumask.h> > +#include <linux/module.h> > + > +/* The thread waits for new work on this waitqueue. */ > +static DECLARE_WAIT_QUEUE_HEAD(woc_wq); > +/* The lock ensures only one job is done at a time. */ > +static DEFINE_MUTEX(woc_mutex); > + > +/* The details of the current job. */ > +struct work_for_cpu { > + unsigned int cpu; > + long (*fn)(void *); > + void *arg; > + long ret; > + struct completion done; > +}; > + > +/* The pointer to the current job. NULL if nothing pending. */ > +static struct work_for_cpu *current_work; > + > +/* We leave our thread on whatever cpu it was on last. We can get > + * batted onto another CPU by move_task_off_dead_cpu if that cpu goes > + * down, but the caller of work_on_cpu() was supposed to ensure that > + * doesn't happen, so it should only happen when idle. */ > +static int do_work_on_cpu(void *unused) > +{ > + for (;;) { > + struct completion *done; > + > + wait_event(woc_wq, current_work != NULL); > + > + set_cpus_allowed_ptr(current, cpumask_of(current_work->cpu)); > + WARN_ON(smp_processor_id() != current_work->cpu); > + > + current_work->ret = current_work->fn(current_work->arg); > + /* Make sure ret is set before we complete(). Paranoia. */ > + wmb(); > + > + /* Reset current_work so we don't spin. */ > + done = ¤t_work->done; > + current_work = NULL; > + > + /* Reset current_work for next work_on_cpu(). */ > + complete(done); > + } > +} > + > +/** > + * work_on_cpu - run a function in user context on a particular cpu > + * @cpu: the cpu to run on > + * @fn: the function to run > + * @arg: the function arg > + * > + * This will return the value @fn returns. > + * It is up to the caller to ensure that the cpu doesn't go offline. > + */ > +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) > +{ > + struct work_for_cpu work; > + > + work.cpu = cpu; > + work.fn = fn; > + work.arg = arg; > + init_completion(&work.done); > + > + mutex_lock(&woc_mutex); > + /* Make sure all is in place before it sees fn set. */ > + wmb(); > + current_work = &work; > + wake_up(&woc_wq); > + > + wait_for_completion(&work.done); > + BUG_ON(current_work); > + mutex_unlock(&woc_mutex); > + > + return work.ret; > +} > +EXPORT_SYMBOL_GPL(work_on_cpu); > + > +#if 1 > +static long test_fn(void *arg) > +{ > + printk("%u: %lu\n", smp_processor_id(), (long)arg); > + return (long)arg + 100; > +} > +#endif > + > +static int __init init(void) > +{ > + unsigned int i; > + > + kthread_run(do_work_on_cpu, NULL, "kwork_on_cpu"); > + > +#if 1 > + for_each_online_cpu(i) { > + long ret = work_on_cpu(i, test_fn, (void *)i); > + printk("CPU %i returned %li\n", i, ret); > + BUG_ON(ret != i + 100); > + } > +#endif > + > + return 0; > +} > +module_init(init); > diff --git a/kernel/workqueue.c b/kernel/workqueue.c > --- a/kernel/workqueue.c > +++ b/kernel/workqueue.c > @@ -970,47 +970,6 @@ undo: > return ret; > } > > -#ifdef CONFIG_SMP > -static struct workqueue_struct *work_on_cpu_wq __read_mostly; > - > -struct work_for_cpu { > - struct work_struct work; > - long (*fn)(void *); > - void *arg; > - long ret; > -}; > - > -static void do_work_for_cpu(struct work_struct *w) > -{ > - struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); > - > - wfc->ret = wfc->fn(wfc->arg); > -} > - > -/** > - * work_on_cpu - run a function in user context on a particular cpu > - * @cpu: the cpu to run on > - * @fn: the function to run > - * @arg: the function arg > - * > - * This will return the value @fn returns. > - * It is up to the caller to ensure that the cpu doesn't go offline. > - */ > -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) > -{ > - struct work_for_cpu wfc; > - > - INIT_WORK(&wfc.work, do_work_for_cpu); > - wfc.fn = fn; > - wfc.arg = arg; > - queue_work_on(cpu, work_on_cpu_wq, &wfc.work); > - flush_work(&wfc.work); > - > - return wfc.ret; > -} > -EXPORT_SYMBOL_GPL(work_on_cpu); > -#endif /* CONFIG_SMP */ > - > void __init init_workqueues(void) > { > alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); > @@ -1021,8 +980,4 @@ void __init init_workqueues(void) > hotcpu_notifier(workqueue_cpu_callback, 0); > keventd_wq = create_workqueue("events"); > BUG_ON(!keventd_wq); > -#ifdef CONFIG_SMP > - work_on_cpu_wq = create_workqueue("work_on_cpu"); > - BUG_ON(!work_on_cpu_wq); > -#endif > } > -- To unsubscribe from this list: send the line "unsubscribe cpufreq" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html