Re: qemu-kvm.git build problem

"Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx> · Thu, 14 Jan 2010 15:48:27 -0800

On Thu, Jan 14, 2010 at 01:11:33AM +0100, Jan Kiszka wrote:
> Paul E. McKenney wrote:
> > On Tue, Jan 12, 2010 at 09:28:15AM +0100, Jan Kiszka wrote:
> >> If so, I will try to write something like this the next days. Will
> >> surely appreciate your review afterwards!
> > 
> > Sounds good!
> > 
> 
> Here we go, find the commit inlined below. You may be primarily
> interested in kvm_synchronize_sched_expedited() and the helper thread
> function kvm_rcu_sync_thread. As I use a dedicated thread I simplified
> some parts compared to upstream. Light testing indicates that it works.
> 
> The full set of changes (specifically interesting for anyone who wants
> to test it) is available on kernel.org. There is now also a
> commit [1] included for CPU hotplug support, but it's untested as I don't
> have the required hardware at hand.

This patch is only for backporting, right?  In mainline, you should be
able to simply call synchronize_srcu_expedited().

> Thanks in advance,
> Jan
> 
> [1] http://git.kernel.org/?p=virt/kvm/kvm-kmod.git;a=commitdiff;h=81a3ddcd8ea343a9570f3164bb31160ff4be0ab7
> 
> --------->
> 
> From fe13d0785586623176879a87949ad626dace192c Mon Sep 17 00:00:00 2001
> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> Date: Wed, 13 Jan 2010 09:32:50 +0100
> Subject: [PATCH] Compat support for synchronize_srcu_sync
> 
> This services was first introduced to 2.6.32 and further optimized in
> 2.6.33. KVM makes use of it for replacing the rw-sem based slot lock
> with SRCU.
> 
> TODO: Add CPU-hotplug support to the sync kthreads.

Oh.  Never mind my complaint about lack of CPU-hotplug support below.  ;-)

The function migration_call() in kernel/sched.c has the code you would
want to harvest for this.  And re-reading your text above, perhaps you
already have done so.

Anyway, looks good.  There are some questions interspersed below to make
sure I understand the situation.  There is one cut-and-paste spelling
error and another cut-and-paste C-standard violation, the latter being
my fault, and as far as I know, not a problem in practice.

> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> ---
>  external-module-compat-comm.h |   17 ++
>  srcu.c                        |  382 ++++++++++++++++++++++++++++++-----------
>  sync                          |   27 ++-
>  3 files changed, 316 insertions(+), 110 deletions(-)
> 
> diff --git a/external-module-compat-comm.h b/external-module-compat-comm.h
> index ca57567..eabf8a1 100644
> --- a/external-module-compat-comm.h
> +++ b/external-module-compat-comm.h
> @@ -1053,3 +1053,20 @@ static inline void kvm_fire_urn(void)
>  static inline void kvm_fire_urn(void) {}
>  
>  #endif /* CONFIG_USER_RETURN_NOTIFIER */
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33)
> +
> +#ifdef CONFIG_SMP
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp);
> +#else
> +static inline void kvm_synchronize_srcu_expedited(struct srcu_struct *sp) { }
> +#endif
> +
> +#else
> +
> +#define kvm_synchronize_srcu_expedited synchronize_srcu_expedited
> +
> +#endif
> +
> +int kvm_init_srcu(void);
> +void kvm_exit_srcu(void);
> diff --git a/srcu.c b/srcu.c

Are we in kernel/srcu.c or a kvm-specific file?

My guess is that we are in kernel/srcu.c for a backport-only patch, but
have to ask!

> index e9734bc..985d627 100644
> --- a/srcu.c
> +++ b/srcu.c
> @@ -24,8 +24,6 @@
>   *
>   */
>  
> -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
> -
>  #include <linux/module.h>
>  #include <linux/mutex.h>
>  #include <linux/percpu.h>
> @@ -35,6 +33,118 @@
>  #include <linux/slab.h>
>  #include <linux/smp.h>
>  #include <linux/srcu.h>
> +#include <linux/kthread.h>
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) || \
> +    (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP))
> +
> +/*
> + * srcu_readers_active_idx -- returns approximate number of readers
> + *	active on the specified rank of per-CPU counters.
> + */
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> +{
> +	int cpu;
> +	int sum;
> +
> +	sum = 0;
> +	for_each_possible_cpu(cpu)
> +		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> +	return sum;
> +}
> +
> +/*
> + * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
> + */
> +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
> +{
> +	int idx;
> +
> +	idx = sp->completed;
> +	mutex_lock(&sp->mutex);
> +
> +	/*
> +	 * Check to see if someone else did the work for us while we were
> +	 * waiting to acquire the lock.  We need -two- advances of
> +	 * the counter, not just one.  If there was but one, we might have
> +	 * shown up -after- our helper's first synchronize_sched(), thus
> +	 * having failed to prevent CPU-reordering races with concurrent
> +	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
> +	 * either (1) wait for two or (2) supply the second ourselves.
> +	 */
> +
> +	if ((sp->completed - idx) >= 2) {
> +		mutex_unlock(&sp->mutex);
> +		return;
> +	}
> +
> +	sync_func();  /* Force memory barrier on all CPUs. */
> +
> +	/*
> +	 * The preceding synchronize_sched() ensures that any CPU that
> +	 * sees the new value of sp->completed will also see any preceding
> +	 * changes to data structures made by this CPU.  This prevents
> +	 * some other CPU from reordering the accesses in its SRCU
> +	 * read-side critical section to precede the corresponding
> +	 * srcu_read_lock() -- ensuring that such references will in
> +	 * fact be protected.
> +	 *
> +	 * So it is now safe to do the flip.
> +	 */
> +
> +	idx = sp->completed & 0x1;
> +	sp->completed++;
> +
> +	sync_func();  /* Force memory barrier on all CPUs. */
> +
> +	/*
> +	 * At this point, because of the preceding synchronize_sched(),
> +	 * all srcu_read_lock() calls using the old counters have completed.
> +	 * Their corresponding critical sections might well be still
> +	 * executing, but the srcu_read_lock() primitives themselves
> +	 * will have finished executing.
> +	 */
> +
> +	while (srcu_readers_active_idx(sp, idx))
> +		schedule_timeout_interruptible(1);
> +
> +	sync_func();  /* Force memory barrier on all CPUs. */
> +
> +	/*
> +	 * The preceding synchronize_sched() forces all srcu_read_unlock()
> +	 * primitives that were executing concurrently with the preceding
> +	 * for_each_possible_cpu() loop to have completed by this point.
> +	 * More importantly, it also forces the corresponding SRCU read-side
> +	 * critical sections to have also completed, and the corresponding
> +	 * references to SRCU-protected data items to be dropped.
> +	 *
> +	 * Note:
> +	 *
> +	 *	Despite what you might think at first glance, the
> +	 *	preceding synchronize_sched() -must- be within the
> +	 *	critical section ended by the following mutex_unlock().
> +	 *	Otherwise, a task taking the early exit can race
> +	 *	with a srcu_read_unlock(), which might have executed
> +	 *	just before the preceding srcu_readers_active() check,
> +	 *	and whose CPU might have reordered the srcu_read_unlock()
> +	 *	with the preceding critical section.  In this case, there
> +	 *	is nothing preventing the synchronize_sched() task that is
> +	 *	taking the early exit from freeing a data structure that
> +	 *	is still being referenced (out of order) by the task
> +	 *	doing the srcu_read_unlock().
> +	 *
> +	 *	Alternatively, the comparison with "2" on the early exit
> +	 *	could be changed to "3", but this increases synchronize_srcu()
> +	 *	latency for bulk loads.  So the current code is preferred.
> +	 */
> +
> +	mutex_unlock(&sp->mutex);
> +}
> +
> +#endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
>  
>  #undef kvm_init_srcu_struct
>  #undef kvm_cleanup_srcu_struct
> @@ -42,6 +152,11 @@
>  #undef kvm_srcu_read_unlock
>  #undef kvm_synchronize_srcu
>  #undef kvm_srcu_batches_completed
> +
> +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx);
> +static void
> +__synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void));
> +
>  /**
>   * init_srcu_struct - initialize a sleep-RCU structure
>   * @sp: structure to initialize.
> @@ -58,22 +173,6 @@ int kvm_init_srcu_struct(struct srcu_struct *sp)
>  	return (sp->per_cpu_ref ? 0 : -ENOMEM);
>  }
>  
> -/*
> - * srcu_readers_active_idx -- returns approximate number of readers
> - *	active on the specified rank of per-CPU counters.
> - */
> -
> -static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
> -{
> -	int cpu;
> -	int sum;
> -
> -	sum = 0;
> -	for_each_possible_cpu(cpu)
> -		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
> -	return sum;
> -}
> -
>  /**
>   * srcu_readers_active - returns approximate number of readers.
>   * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
> @@ -154,94 +253,14 @@ void kvm_srcu_read_unlock(struct srcu_struct *sp, int idx)
>   * synchronizing concurrent updates.  Can block; must be called from
>   * process context.
>   *
> - * Note that it is illegal to call synchornize_srcu() from the corresponding
> + * Note that it is illegal to call synchronize_srcu() from the corresponding
>   * SRCU read-side critical section; doing so will result in deadlock.
>   * However, it is perfectly legal to call synchronize_srcu() on one
>   * srcu_struct from some other srcu_struct's read-side critical section.
>   */
>  void kvm_synchronize_srcu(struct srcu_struct *sp)
>  {
> -	int idx;
> -
> -	idx = sp->completed;
> -	mutex_lock(&sp->mutex);
> -
> -	/*
> -	 * Check to see if someone else did the work for us while we were
> -	 * waiting to acquire the lock.  We need -two- advances of
> -	 * the counter, not just one.  If there was but one, we might have
> -	 * shown up -after- our helper's first synchronize_sched(), thus
> -	 * having failed to prevent CPU-reordering races with concurrent
> -	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
> -	 * either (1) wait for two or (2) supply the second ourselves.
> -	 */
> -
> -	if ((sp->completed - idx) >= 2) {
> -		mutex_unlock(&sp->mutex);
> -		return;
> -	}
> -
> -	synchronize_sched();  /* Force memory barrier on all CPUs. */
> -
> -	/*
> -	 * The preceding synchronize_sched() ensures that any CPU that
> -	 * sees the new value of sp->completed will also see any preceding
> -	 * changes to data structures made by this CPU.  This prevents
> -	 * some other CPU from reordering the accesses in its SRCU
> -	 * read-side critical section to precede the corresponding
> -	 * srcu_read_lock() -- ensuring that such references will in
> -	 * fact be protected.
> -	 *
> -	 * So it is now safe to do the flip.
> -	 */
> -
> -	idx = sp->completed & 0x1;
> -	sp->completed++;
> -
> -	synchronize_sched();  /* Force memory barrier on all CPUs. */
> -
> -	/*
> -	 * At this point, because of the preceding synchronize_sched(),
> -	 * all srcu_read_lock() calls using the old counters have completed.
> -	 * Their corresponding critical sections might well be still
> -	 * executing, but the srcu_read_lock() primitives themselves
> -	 * will have finished executing.
> -	 */
> -
> -	while (srcu_readers_active_idx(sp, idx))
> -		schedule_timeout_interruptible(1);
> -
> -	synchronize_sched();  /* Force memory barrier on all CPUs. */
> -
> -	/*
> -	 * The preceding synchronize_sched() forces all srcu_read_unlock()
> -	 * primitives that were executing concurrently with the preceding
> -	 * for_each_possible_cpu() loop to have completed by this point.
> -	 * More importantly, it also forces the corresponding SRCU read-side
> -	 * critical sections to have also completed, and the corresponding
> -	 * references to SRCU-protected data items to be dropped.
> -	 *
> -	 * Note:
> -	 *
> -	 *	Despite what you might think at first glance, the
> -	 *	preceding synchronize_sched() -must- be within the
> -	 *	critical section ended by the following mutex_unlock().
> -	 *	Otherwise, a task taking the early exit can race
> -	 *	with a srcu_read_unlock(), which might have executed
> -	 *	just before the preceding srcu_readers_active() check,
> -	 *	and whose CPU might have reordered the srcu_read_unlock()
> -	 *	with the preceding critical section.  In this case, there
> -	 *	is nothing preventing the synchronize_sched() task that is
> -	 *	taking the early exit from freeing a data structure that
> -	 *	is still being referenced (out of order) by the task
> -	 *	doing the srcu_read_unlock().
> -	 *
> -	 *	Alternatively, the comparison with "2" on the early exit
> -	 *	could be changed to "3", but this increases synchronize_srcu()
> -	 *	latency for bulk loads.  So the current code is preferred.
> -	 */
> -
> -	mutex_unlock(&sp->mutex);
> +	__synchronize_srcu(sp, synchronize_sched);
>  }

Hmmm...  Why not simply "synchronize_srcu()"?  Is this to allow a smaller
piece of code to cover all the combinations of Linux kernel versions?

>  /**
> @@ -265,3 +284,166 @@ EXPORT_SYMBOL_GPL(kvm_synchronize_srcu);
>  EXPORT_SYMBOL_GPL(kvm_srcu_batches_completed);
>  
>  #endif
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) && defined(CONFIG_SMP)
> +
> +struct sync_req {
> +	struct list_head list;
> +	bool pending;
> +	bool success;
> +	struct completion done;
> +};
> +
> +static DEFINE_PER_CPU(struct sync_req, sync_req);
> +static DEFINE_PER_CPU(struct task_struct *, sync_thread);
> +static DEFINE_MUTEX(rcu_sched_expedited_mutex);
> +
> +static long synchronize_sched_expedited_count;

Sigh!!!  You copied one of my theoretical bugs.

According to the C standard, overflowing a signed integer is undefined
(though it has been a -long- time since I have seen hardware that
did anything other than wrap like you would expect).  My todo list
includes converting these to "unsigned long".  I can do certainly add
this instance to my list, but I would of course be quite happy if you
made the appropriate adjustments.

(This effort got preempted by applying lockdep to RCU, given that Thomas
Gleixner found some non-theoretical RCU-usage bugs.)

> +static int kvm_rcu_sync_thread(void *data)
> +{
> +	int badcpu;
> +	int cpu = (long)data;
> +	struct sync_req *req = &per_cpu(sync_req, cpu);
> +
> +	set_current_state(TASK_INTERRUPTIBLE);
> +	while (!kthread_should_stop()) {
> +		if (!req->pending) {
> +			schedule();
> +			set_current_state(TASK_INTERRUPTIBLE);
> +			continue;
> +		}
> +		req->pending = false;
> +
> +		preempt_disable();
> +		badcpu = smp_processor_id();
> +		if (likely(cpu == badcpu)) {
> +			req->success = true;
> +		} else {
> +			req->success = false;
> +			WARN_ONCE(1, "kvm_rcu_sync_thread() on CPU %d, "
> +				  "expected %d\n", badcpu, cpu);
> +		}
> +		preempt_enable();
> +
> +		complete(&req->done);
> +	}
> +	__set_current_state(TASK_RUNNING);
> +
> +	return 0;
> +}
> +
> +static void kvm_synchronize_sched_expedited(void)
> +{
> +	int cpu;
> +	bool need_full_sync = 0;
> +	struct sync_req *req;
> +	long snap;
> +	int trycount = 0;
> +
> +	smp_mb();  /* ensure prior mod happens before capturing snap. */
> +	snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
> +	get_online_cpus();
> +	while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
> +		put_online_cpus();
> +		if (trycount++ < 10)
> +			udelay(trycount * num_online_cpus());
> +		else {
> +			synchronize_sched();
> +			return;
> +		}
> +		if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
> +			smp_mb(); /* ensure test happens before caller kfree */
> +			return;
> +		}
> +		get_online_cpus();
> +	}
> +	for_each_online_cpu(cpu) {
> +		req = &per_cpu(sync_req, cpu);
> +		init_completion(&req->done);
> +		smp_wmb();
> +		req->pending = true;
> +		wake_up_process(per_cpu(sync_thread, cpu));
> +	}
> +	for_each_online_cpu(cpu) {
> +		req = &per_cpu(sync_req, cpu);
> +		wait_for_completion(&req->done);
> +		if (unlikely(!req->success))
> +			need_full_sync = 1;
> +	}
> +	synchronize_sched_expedited_count++;
> +	mutex_unlock(&rcu_sched_expedited_mutex);
> +	put_online_cpus();
> +	if (need_full_sync)
> +		synchronize_sched();
> +}
> +
> +/**
> + * synchronize_srcu_expedited - like synchronize_srcu, but less patient

"kvm_synchronize_srcu_expedited", right?  ;-)

> + * @sp: srcu_struct with which to synchronize.
> + *
> + * Flip the completed counter, and wait for the old count to drain to zero.
> + * As with classic RCU, the updater must use some separate means of
> + * synchronizing concurrent updates.  Can block; must be called from
> + * process context.
> + *
> + * Note that it is illegal to call synchronize_srcu_expedited()
> + * from the corresponding SRCU read-side critical section; doing so
> + * will result in deadlock.  However, it is perfectly legal to call
> + * synchronize_srcu_expedited() on one srcu_struct from some other
> + * srcu_struct's read-side critical section.
> + */
> +void kvm_synchronize_srcu_expedited(struct srcu_struct *sp)
> +{
> +	__synchronize_srcu(sp, kvm_synchronize_sched_expedited);
> +}
> +EXPORT_SYMBOL_GPL(kvm_synchronize_srcu_expedited);
> +
> +int kvm_init_srcu(void)
> +{
> +	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
> +	struct task_struct *p;
> +	int cpu;
> +	int err;
> +
> +	for_each_online_cpu(cpu) {
> +		p = kthread_create(kvm_rcu_sync_thread, (void *)(long)cpu,
> +				   "kvmsrcusync/%d", cpu);
> +		if (IS_ERR(p))
> +			goto error_out;
> +
> +		kthread_bind(p, cpu);
> +		sched_setscheduler(p, SCHED_FIFO, &param);
> +		per_cpu(sync_thread, cpu) = p;
> +		wake_up_process(p);
> +	}
> +	return 0;
> +
> +error_out:
> +	printk(KERN_ERR "kvm: kvmsrcsync for %d failed\n", cpu);
> +	err = PTR_ERR(p);
> +	kvm_exit_srcu();
> +	return err;
> +}

CPU hotplug?

> +void kvm_exit_srcu(void)
> +{
> +	int cpu;
> +
> +	for_each_online_cpu(cpu)
> +		if (per_cpu(sync_thread, cpu))
> +			kthread_stop(per_cpu(sync_thread, cpu));
> +}
> +
> +#else
> +
> +int kvm_init_srcu(void)
> +{
> +	return 0;
> +}
> +
> +void kvm_exit_srcu(void)
> +{
> +}
> +
> +#endif

I do not understand what follows -- a KVM-specific test suite or some
such?

> diff --git a/sync b/sync
> index 0eaff31..d67469e 100755
> --- a/sync
> +++ b/sync
> @@ -40,8 +40,9 @@ def __hack(data):
>          'do_machine_check eventfd_signal get_desc_base get_desc_limit '
>          'vma_kernel_pagesize native_read_tsc user_return_notifier '
>          'user_return_notifier_register user_return_notifier_unregister '
> +        'synchronize_srcu_expedited '
>          )
> -    anon_inodes = anon_inodes_exit = False
> +    kvm_init = kvm_exit = False
>      mce = False
>      result = []
>      pr_fmt = ''
> @@ -58,24 +59,30 @@ def __hack(data):
>              pr_fmt = sub(r'#define pr_fmt\([^)]*\) ("[^"]*").*', r'\1', line) + ' '
>              line = ''
>          line = sub(r'pr_debug\(([^),]*)', r'pr_debug(' + pr_fmt + r'\1', line)
> -        if match(r'^int kvm_init\('): anon_inodes = True
> -        if match(r'r = kvm_arch_init\(opaque\);') and anon_inodes:
> +        if match(r'^int kvm_init\('): kvm_init = True
> +        if match(r'r = kvm_arch_init\(opaque\);') and kvm_init:
>              w('\tr = kvm_init_anon_inodes();')
>              w('\tif (r)')
>              w('\t\treturn r;\n')
> +            w('\tr = kvm_init_srcu();')
> +            w('\tif (r)')
> +            w('\t\tgoto cleanup_anon_inodes;\n')
>              w('\tpreempt_notifier_sys_init();')
>              w('\thrtimer_kallsyms_resolve();\n')
> -        if match(r'return 0;') and anon_inodes:
> +        if match(r'return 0;') and kvm_init:
>              w('\tprintk("loaded kvm module (%s)\\n");\n' % (version,))
> -        if match(r'return r;') and anon_inodes:
> -            w('\tkvm_exit_anon_inodes();')
> +        if match(r'return r;') and kvm_init:
>              w('\tpreempt_notifier_sys_exit();')
> -            anon_inodes = False
> -        if match(r'^void kvm_exit'): anon_inodes_exit = True
> -        if match(r'\}') and anon_inodes_exit:
> +            w('\tkvm_exit_srcu();')
> +            w('cleanup_anon_inodes:')
>              w('\tkvm_exit_anon_inodes();')
> +            kvm_init = False
> +        if match(r'^void kvm_exit'): kvm_exit = True
> +        if match(r'\}') and kvm_exit:
>              w('\tpreempt_notifier_sys_exit();')
> -            anon_inodes_exit = False
> +            w('\tkvm_exit_srcu();')
> +            w('\tkvm_exit_anon_inodes();')
> +            kvm_exit = False
>          if match(r'^int kvm_arch_init'): kvm_arch_init = True
>          if match(r'\btsc_khz\b') and kvm_arch_init:
>              line = sub(r'\btsc_khz\b', 'kvm_tsc_khz', line)
> -- 
> 1.6.0.2
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html