On Thu, Jan 14, 2010 at 01:11:33AM +0100, Jan Kiszka wrote:
> Paul E. McKenney wrote:
> > On Tue, Jan 12, 2010 at 09:28:15AM +0100, Jan Kiszka wrote:
> >> If so, I will try to write something like this the next days. Will
> >> surely appreciate your review afterwards!
> >
> > Sounds good!
> >
>
> Here we go, find the commit inlined below. You may be primarily
> interested in kvm_synchronize_sched_expedited() and the helper thread
> function kvm_rcu_sync_thread. As I use a dedicated thread I simplified
> some parts compared to upstream. Light testing indicates that it works.
>
> The full set of changes (specifically interesting for anyone who wants
> to test it) is available on kernel.org. There is now also a
> commit [1] included for CPU hotplug support, but it's untested as I don't
> have the required hardware at hand.

This patch is only for backporting, right? In mainline, you should be
able to simply call synchronize_srcu_expedited().

> Thanks in advance,
> Jan
>
> [1] http://git.kernel.org/?p=virt/kvm/kvm-kmod.git;a=commitdiff;h=81a3ddcd8ea343a9570f3164bb31160ff4be0ab7
>
> --------->
>
> From fe13d0785586623176879a87949ad626dace192c Mon Sep 17 00:00:00 2001
> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> Date: Wed, 13 Jan 2010 09:32:50 +0100
> Subject: [PATCH] Compat support for synchronize_srcu_sync
>
> This service was first introduced in 2.6.32 and further optimized in
> 2.6.33. KVM makes use of it for replacing the rw-sem based slot lock
> with SRCU.
>
> TODO: Add CPU-hotplug support to the sync kthreads.

Oh. Never mind my complaint about lack of CPU-hotplug support below. ;-)

The function migration_call() in kernel/sched.c has the code you would
want to harvest for this. And re-reading your text above, perhaps you
already have done so.

Anyway, looks good. There are some questions interspersed below to make
sure I understand the situation. There is one cut-and-paste spelling
error and another cut-and-paste C-standard violation, the latter being
my fault, and as far as I know, not a problem in practice.

> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> ---
>  external-module-compat-comm.h |   17 ++
>  srcu.c                        |  382 ++++++++++++++++++++++++++++++-----------
>  sync                          |   27 ++-
>  3 files changed, 316 insertions(+), 110 deletions(-)
>
> diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
> index ca57567..eabf8a1 100644
> --- a/external-module-compat-comm.h
> +++ b/external-module-compat-comm.h
> @@ -1053,3 +1053,20 @@ static inline void kvm_fire_urn(void)
>  static inline void kvm_fire_urn(void) {}
>
>  #endif /* CONFIG_USER_RETURN_NOTIFIER */
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33)
> +
> +#ifdef CONFIG_SMP
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp);
> +#else
> +static inline void kvm_synchronize_srcu_expedited(struct srcu_struct *sp) { }
> +#endif
> +
> +#else
> +
> +#define kvm_synchronize_srcu_expedited synchronize_srcu_expedited
> +
> +#endif
> +
> +int kvm_init_srcu(void);
> +void kvm_exit_srcu(void);
> diff --git a/srcu.c b/srcu.c

Are we in kernel/srcu.c or a kvm-specific file? My guess is that we are
in kernel/srcu.c for a backport-only patch, but I have to ask!

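As an aside on the CPU-hotplug TODO above: a notifier patterned on the
2.6.32-era migration_call() might look roughly like the sketch below.
This is untested and purely illustrative -- it is not taken from Jan's
commit [1], the kvm_rcu_sync_cpu_notify()/kvm_rcu_sync_nb names are made
up here, and it reuses the per-CPU sync_thread pointer and
kvm_rcu_sync_thread() from the srcu.c patch further down.

#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static int kvm_rcu_sync_cpu_notify(struct notifier_block *nb,
                                   unsigned long action, void *hcpu)
{
        struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
        long cpu = (long)hcpu;
        struct task_struct *p;

        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                /* Create the per-CPU sync thread, bound to the incoming
                 * CPU but still asleep. */
                p = kthread_create(kvm_rcu_sync_thread, (void *)cpu,
                                   "kvmsrcusync/%ld", cpu);
                if (IS_ERR(p))
                        return NOTIFY_BAD;
                kthread_bind(p, cpu);
                sched_setscheduler(p, SCHED_FIFO, &param);
                per_cpu(sync_thread, cpu) = p;
                break;
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                /* The CPU is up, let its sync thread start running. */
                wake_up_process(per_cpu(sync_thread, cpu));
                break;
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                /* Reap the thread of a CPU that failed to come up or
                 * that just went away. */
                if (per_cpu(sync_thread, cpu)) {
                        kthread_stop(per_cpu(sync_thread, cpu));
                        per_cpu(sync_thread, cpu) = NULL;
                }
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block kvm_rcu_sync_nb = {
        .notifier_call = kvm_rcu_sync_cpu_notify,
};

kvm_init_srcu() would then register this with register_cpu_notifier()
and create threads only for the CPUs that are already online, and
kvm_exit_srcu() would unregister it before stopping the threads. A real
implementation would also want to follow migration_call() more closely,
e.g. rebinding a never-started thread to an online CPU in the
CPU_UP_CANCELED case before stopping it.
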
> index e9734bc..985d627 100644
> --- a/srcu.c
> +++ b/srcu.c
> @@ -24,8 +24,6 @@
>   *
>   */
>
> -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
> -
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/percpu.h>
> @@ -35,6 +33,118 @@
>  #include <linux/slab.h>
>  #include <linux/smp.h>
>  #include <linux/srcu.h>
> +#include <linux/kthread.h>
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) || \
> +    (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP))
> +
> +/*
> + * srcu_readers_active_idx -- returns approximate number of readers
> + * active on the specified rank of per-CPU counters.
> + */
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +{
> +        int cpu;
> +        int sum;
> +
> +        sum = 0;
> +        for_each_possible_cpu(cpu)
> +                sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> +        return sum;
> +}
> +
> +/*
> + * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
> + */
> +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
> +{
> +        int idx;
> +
> +        idx = sp->completed;
> +        mutex_lock(&sp->mutex);
> +
> +        /*
> +         * Check to see if someone else did the work for us while we were
> +         * waiting to acquire the lock. We need -two- advances of
> +         * the counter, not just one. If there was but one, we might have
> +         * shown up -after- our helper's first synchronize_sched(), thus
> +         * having failed to prevent CPU-reordering races with concurrent
> +         * srcu_read_unlock()s on other CPUs (see comment below). So we
> +         * either (1) wait for two or (2) supply the second ourselves.
> +         */
> +
> +        if ((sp->completed - idx) >= 2) {
> +                mutex_unlock(&sp->mutex);
> +                return;
> +        }
> +
> +        sync_func(); /* Force memory barrier on all CPUs. */
> +
> +        /*
> +         * The preceding synchronize_sched() ensures that any CPU that
> +         * sees the new value of sp->completed will also see any preceding
> +         * changes to data structures made by this CPU. This prevents
> +         * some other CPU from reordering the accesses in its SRCU
> +         * read-side critical section to precede the corresponding
> +         * srcu_read_lock() -- ensuring that such references will in
> +         * fact be protected.
> +         *
> +         * So it is now safe to do the flip.
> +         */
> +
> +        idx = sp->completed & 0x1;
> +        sp->completed++;
> +
> +        sync_func(); /* Force memory barrier on all CPUs. */
> +
> +        /*
> +         * At this point, because of the preceding synchronize_sched(),
> +         * all srcu_read_lock() calls using the old counters have completed.
> +         * Their corresponding critical sections might well be still
> +         * executing, but the srcu_read_lock() primitives themselves
> +         * will have finished executing.
> +         */
> +
> +        while (srcu_readers_active_idx(sp, idx))
> +                schedule_timeout_interruptible(1);
> +
> +        sync_func(); /* Force memory barrier on all CPUs. */
> +
> +        /*
> +         * The preceding synchronize_sched() forces all srcu_read_unlock()
> +         * primitives that were executing concurrently with the preceding
> +         * for_each_possible_cpu() loop to have completed by this point.
> +         * More importantly, it also forces the corresponding SRCU read-side
> +         * critical sections to have also completed, and the corresponding
> +         * references to SRCU-protected data items to be dropped.
> +         *
> +         * Note:
> +         *
> +         * Despite what you might think at first glance, the
> +         * preceding synchronize_sched() -must- be within the
> +         * critical section ended by the following mutex_unlock().
> +         * Otherwise, a task taking the early exit can race
> +         * with a srcu_read_unlock(), which might have executed
> +         * just before the preceding srcu_readers_active() check,
> +         * and whose CPU might have reordered the srcu_read_unlock()
> +         * with the preceding critical section. In this case, there
> +         * is nothing preventing the synchronize_sched() task that is
> +         * taking the early exit from freeing a data structure that
> +         * is still being referenced (out of order) by the task
> +         * doing the srcu_read_unlock().
> +         *
> +         * Alternatively, the comparison with "2" on the early exit
> +         * could be changed to "3", but this increases synchronize_srcu()
> +         * latency for bulk loads. So the current code is preferred.
> +         */
> +
> +        mutex_unlock(&sp->mutex);
> +}
> +
> +#endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
>
>  #undef kvm_init_srcu_struct
>  #undef kvm_cleanup_srcu_struct
> @@ -42,6 +152,11 @@
>  #undef kvm_srcu_read_unlock
>  #undef kvm_synchronize_srcu
>  #undef kvm_srcu_batches_completed
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx);
> +static void
> +__synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void));
> +
>  /**
>   * init_srcu_struct - initialize a sleep-RCU structure
>   * @sp: structure to initialize.
> @@ -58,22 +173,6 @@ int kvm_init_srcu_struct(struct srcu_struct *sp)
>          return (sp->per_cpu_ref ? 0 : -ENOMEM);
>  }
>
> -/*
> - * srcu_readers_active_idx -- returns approximate number of readers
> - * active on the specified rank of per-CPU counters.
> - */
> -
> -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> -{
> -        int cpu;
> -        int sum;
> -
> -        sum = 0;
> -        for_each_possible_cpu(cpu)
> -                sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> -        return sum;
> -}
> -
>  /**
>   * srcu_readers_active - returns approximate number of readers.
>   * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
> @@ -154,94 +253,14 @@ void kvm_srcu_read_unlock(struct srcu_struct *sp, int idx)
>   * synchronizing concurrent updates. Can block; must be called from
>   * process context.
>   *
> - * Note that it is illegal to call synchornize_srcu() from the corresponding
> + * Note that it is illegal to call synchronize_srcu() from the corresponding
>   * SRCU read-side critical section; doing so will result in deadlock.
>   * However, it is perfectly legal to call synchronize_srcu() on one
>   * srcu_struct from some other srcu_struct's read-side critical section.
>   */
>  void kvm_synchronize_srcu(struct srcu_struct *sp)
>  {
> -        int idx;
> -
> -        idx = sp->completed;
> -        mutex_lock(&sp->mutex);
> -
> -        /*
> -         * Check to see if someone else did the work for us while we were
> -         * waiting to acquire the lock. We need -two- advances of
> -         * the counter, not just one. If there was but one, we might have
> -         * shown up -after- our helper's first synchronize_sched(), thus
> -         * having failed to prevent CPU-reordering races with concurrent
> -         * srcu_read_unlock()s on other CPUs (see comment below). So we
> -         * either (1) wait for two or (2) supply the second ourselves.
> -         */
> -
> -        if ((sp->completed - idx) >= 2) {
> -                mutex_unlock(&sp->mutex);
> -                return;
> -        }
> -
> -        synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> -        /*
> -         * The preceding synchronize_sched() ensures that any CPU that
> -         * sees the new value of sp->completed will also see any preceding
> -         * changes to data structures made by this CPU. This prevents
> -         * some other CPU from reordering the accesses in its SRCU
> -         * read-side critical section to precede the corresponding
> -         * srcu_read_lock() -- ensuring that such references will in
> -         * fact be protected.
> -         *
> -         * So it is now safe to do the flip.
> -         */
> -
> -        idx = sp->completed & 0x1;
> -        sp->completed++;
> -
> -        synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> -        /*
> -         * At this point, because of the preceding synchronize_sched(),
> -         * all srcu_read_lock() calls using the old counters have completed.
> -         * Their corresponding critical sections might well be still
> -         * executing, but the srcu_read_lock() primitives themselves
> -         * will have finished executing.
> -         */
> -
> -        while (srcu_readers_active_idx(sp, idx))
> -                schedule_timeout_interruptible(1);
> -
> -        synchronize_sched(); /* Force memory barrier on all CPUs. */
> -
> -        /*
> -         * The preceding synchronize_sched() forces all srcu_read_unlock()
> -         * primitives that were executing concurrently with the preceding
> -         * for_each_possible_cpu() loop to have completed by this point.
> -         * More importantly, it also forces the corresponding SRCU read-side
> -         * critical sections to have also completed, and the corresponding
> -         * references to SRCU-protected data items to be dropped.
> -         *
> -         * Note:
> -         *
> -         * Despite what you might think at first glance, the
> -         * preceding synchronize_sched() -must- be within the
> -         * critical section ended by the following mutex_unlock().
> -         * Otherwise, a task taking the early exit can race
> -         * with a srcu_read_unlock(), which might have executed
> -         * just before the preceding srcu_readers_active() check,
> -         * and whose CPU might have reordered the srcu_read_unlock()
> -         * with the preceding critical section. In this case, there
> -         * is nothing preventing the synchronize_sched() task that is
> -         * taking the early exit from freeing a data structure that
> -         * is still being referenced (out of order) by the task
> -         * doing the srcu_read_unlock().
> -         *
> -         * Alternatively, the comparison with "2" on the early exit
> -         * could be changed to "3", but this increases synchronize_srcu()
> -         * latency for bulk loads. So the current code is preferred.
> -         */
> -
> -        mutex_unlock(&sp->mutex);
> +        __synchronize_srcu(sp, synchronize_sched);
>  }

Hmmm... Why not simply "synchronize_srcu()"? Is this to allow a smaller
piece of code to cover all the combinations of Linux kernel versions?

>  /**
> @@ -265,3 +284,166 @@ EXPORT_SYMBOL_GPL(kvm_synchronize_srcu);
>  EXPORT_SYMBOL_GPL(kvm_srcu_batches_completed);
>
>  #endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP)
> +
> +struct sync_req {
> +        struct list_head list;
> +        bool pending;
> +        bool success;
> +        struct completion done;
> +};
> +
> +static DEFINE_PER_CPU(struct sync_req, sync_req);
> +static DEFINE_PER_CPU(struct task_struct *, sync_thread);
> +static DEFINE_MUTEX(rcu_sched_expedited_mutex);
> +
> +static long synchronize_sched_expedited_count;

Sigh!!! You copied one of my theoretical bugs. According to the C
standard, overflowing a signed integer is undefined (though it has been
a -long- time since I have seen hardware that did anything other than
wrap like you would expect). My todo list includes converting these to
"unsigned long". I can certainly add this instance to my list, but I
would of course be quite happy if you made the appropriate adjustments.
(This effort got preempted by applying lockdep to RCU, given that Thomas
Gleixner found some non-theoretical RCU-usage bugs.)

> +static int kvm_rcu_sync_thread(void *data)
> +{
> +        int badcpu;
> +        int cpu = (long)data;
> +        struct sync_req *req = &per_cpu(sync_req, cpu);
> +
> +        set_current_state(TASK_INTERRUPTIBLE);
> +        while (!kthread_should_stop()) {
> +                if (!req->pending) {
> +                        schedule();
> +                        set_current_state(TASK_INTERRUPTIBLE);
> +                        continue;
> +                }
> +                req->pending = false;
> +
> +                preempt_disable();
> +                badcpu = smp_processor_id();
> +                if (likely(cpu == badcpu)) {
> +                        req->success = true;
> +                } else {
> +                        req->success = false;
> +                        WARN_ONCE(1, "kvm_rcu_sync_thread() on CPU %d, "
> +                                  "expected %d\n", badcpu, cpu);
> +                }
> +                preempt_enable();
> +
> +                complete(&req->done);
> +        }
> +        __set_current_state(TASK_RUNNING);
> +
> +        return 0;
> +}
> +
> +static void kvm_synchronize_sched_expedited(void)
> +{
> +        int cpu;
> +        bool need_full_sync = 0;
> +        struct sync_req *req;
> +        long snap;
> +        int trycount = 0;
> +
> +        smp_mb(); /* ensure prior mod happens before capturing snap. */
> +        snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
> +        get_online_cpus();
> +        while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
> +                put_online_cpus();
> +                if (trycount++ < 10)
> +                        udelay(trycount * num_online_cpus());
> +                else {
> +                        synchronize_sched();
> +                        return;
> +                }
> +                if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
> +                        smp_mb(); /* ensure test happens before caller kfree */
> +                        return;
> +                }
> +                get_online_cpus();
> +        }
> +        for_each_online_cpu(cpu) {
> +                req = &per_cpu(sync_req, cpu);
> +                init_completion(&req->done);
> +                smp_wmb();
> +                req->pending = true;
> +                wake_up_process(per_cpu(sync_thread, cpu));
> +        }
> +        for_each_online_cpu(cpu) {
> +                req = &per_cpu(sync_req, cpu);
> +                wait_for_completion(&req->done);
> +                if (unlikely(!req->success))
> +                        need_full_sync = 1;
> +        }
> +        synchronize_sched_expedited_count++;
> +        mutex_unlock(&rcu_sched_expedited_mutex);
> +        put_online_cpus();
> +        if (need_full_sync)
> +                synchronize_sched();
> +}
> +
> +/**
> + * synchronize_srcu_expedited - like synchronize_srcu, but less patient

"kvm_synchronize_srcu_expedited", right? ;-)

> + * @sp: srcu_struct with which to synchronize.
> + *
> + * Flip the completed counter, and wait for the old count to drain to zero.
> + * As with classic RCU, the updater must use some separate means of
> + * synchronizing concurrent updates. Can block; must be called from
> + * process context.
> + *
> + * Note that it is illegal to call synchronize_srcu_expedited()
> + * from the corresponding SRCU read-side critical section; doing so
> + * will result in deadlock. However, it is perfectly legal to call
> + * synchronize_srcu_expedited() on one srcu_struct from some other
> + * srcu_struct's read-side critical section.
> + */
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp)
> +{
> +        __synchronize_srcu(sp, kvm_synchronize_sched_expedited);
> +}
> +EXPORT_SYMBOL_GPL(kvm_synchronize_srcu_expedited);
> +
> +int kvm_init_srcu(void)
> +{
> +        struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
> +        struct task_struct *p;
> +        int cpu;
> +        int err;
> +
> +        for_each_online_cpu(cpu) {
> +                p = kthread_create(kvm_rcu_sync_thread, (void *)(long)cpu,
> +                                   "kvmsrcusync/%d", cpu);
> +                if (IS_ERR(p))
> +                        goto error_out;
> +
> +                kthread_bind(p, cpu);
> +                sched_setscheduler(p, SCHED_FIFO, &param);
> +                per_cpu(sync_thread, cpu) = p;
> +                wake_up_process(p);
> +        }
> +        return 0;
> +
> +error_out:
> +        printk(KERN_ERR "kvm: kvmsrcsync for %d failed\n", cpu);
> +        err = PTR_ERR(p);
> +        kvm_exit_srcu();
> +        return err;
> +}

CPU hotplug?

> +void kvm_exit_srcu(void)
> +{
> +        int cpu;
> +
> +        for_each_online_cpu(cpu)
> +                if (per_cpu(sync_thread, cpu))
> +                        kthread_stop(per_cpu(sync_thread, cpu));
> +}
> +
> +#else
> +
> +int kvm_init_srcu(void)
> +{
> +        return 0;
> +}
> +
> +void kvm_exit_srcu(void)
> +{
> +}
> +
> +#endif

I do not understand what follows -- a KVM-specific test suite or some such?

> diff --git a/sync b/sync
> index 0eaff31..d67469e 100755
> --- a/sync
> +++ b/sync
> @@ -40,8 +40,9 @@ def __hack(data):
>          'do_machine_check eventfd_signal get_desc_base get_desc_limit '
>          'vma_kernel_pagesize native_read_tsc user_return_notifier '
>          'user_return_notifier_register user_return_notifier_unregister '
> +        'synchronize_srcu_expedited '
>      )
> -    anon_inodes = anon_inodes_exit = False
> +    kvm_init = kvm_exit = False
>      mce = False
>      result = []
>      pr_fmt = ''
> @@ -58,24 +59,30 @@ def __hack(data):
>              pr_fmt = sub(r'#define pr_fmt\([^)]*\) ("[^"]*").*', r'\1', line) + ' '
>              line = ''
>          line = sub(r'pr_debug\(([^),]*)', r'pr_debug(' + pr_fmt + r'\1', line)
> -        if match(r'^int kvm_init\('): anon_inodes = True
> -        if match(r'r = kvm_arch_init\(opaque\);') and anon_inodes:
> +        if match(r'^int kvm_init\('): kvm_init = True
> +        if match(r'r = kvm_arch_init\(opaque\);') and kvm_init:
>              w('\tr = kvm_init_anon_inodes();')
>              w('\tif (r)')
>              w('\t\treturn r;\n')
> +            w('\tr = kvm_init_srcu();')
> +            w('\tif (r)')
> +            w('\t\tgoto cleanup_anon_inodes;\n')
>              w('\tpreempt_notifier_sys_init();')
>              w('\thrtimer_kallsyms_resolve();\n')
> -        if match(r'return 0;') and anon_inodes:
> +        if match(r'return 0;') and kvm_init:
>              w('\tprintk("loaded kvm module (%s)\\n");\n' % (version,))
> -        if match(r'return r;') and anon_inodes:
> -            w('\tkvm_exit_anon_inodes();')
> +        if match(r'return r;') and kvm_init:
>              w('\tpreempt_notifier_sys_exit();')
> -            anon_inodes = False
> -        if match(r'^void kvm_exit'): anon_inodes_exit = True
> -        if match(r'\}') and anon_inodes_exit:
> +            w('\tkvm_exit_srcu();')
> +            w('cleanup_anon_inodes:')
>              w('\tkvm_exit_anon_inodes();')
> +            kvm_init = False
> +        if match(r'^void kvm_exit'): kvm_exit = True
> +        if match(r'\}') and kvm_exit:
>              w('\tpreempt_notifier_sys_exit();')
> -            anon_inodes_exit = False
> +            w('\tkvm_exit_srcu();')
> +            w('\tkvm_exit_anon_inodes();')
> +            kvm_exit = False
>          if match(r'^int kvm_arch_init'): kvm_arch_init = True
>          if match(r'\btsc_khz\b') and kvm_arch_init:
>              line = sub(r'\btsc_khz\b', 'kvm_tsc_khz', line)
> --
> 1.6.0.2
>

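For anyone else puzzling over that last hunk: as far as I can tell,
"sync" is the kvm-kmod script that copies the upstream KVM sources into
the external-module tree and applies compat rewrites, and the w() calls
splice extra lines into the generated kvm_init()/kvm_exit(). The net
effect should be roughly the shape sketched below -- a reconstruction
from the w() strings, not the actual generated code; the kvm_init()
signature, the out_fail label, the placement relative to kvm_arch_init()
and the version placeholder are guesses from memory.

int kvm_init(void *opaque, unsigned vcpu_size, struct module *module)
{
        int r;

        r = kvm_init_anon_inodes();
        if (r)
                return r;

        r = kvm_init_srcu();
        if (r)
                goto cleanup_anon_inodes;

        preempt_notifier_sys_init();
        hrtimer_kallsyms_resolve();

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;
        /* ... rest of the original kvm_init() body ... */
        printk("loaded kvm module (kvm-kmod-<version>)\n");
        return 0;

out_fail:
        /* ... original cleanup ... */
        preempt_notifier_sys_exit();
        kvm_exit_srcu();
cleanup_anon_inodes:
        kvm_exit_anon_inodes();
        return r;
}

void kvm_exit(void)
{
        /* ... original kvm_exit() body ... */
        preempt_notifier_sys_exit();
        kvm_exit_srcu();
        kvm_exit_anon_inodes();
}

In other words, a kvm_init_srcu() failure unwinds through the new
cleanup_anon_inodes label, and both the error path and module unload now
call kvm_exit_srcu().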