Re: [PATCH] [RFC] kernel/cpu: Use lockref for online CPU reference counting

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

On ma, 2016-02-15 at 18:06 +0100, Peter Zijlstra wrote:
> On Mon, Feb 15, 2016 at 03:17:55PM +0100, Peter Zijlstra wrote:
> > On Mon, Feb 15, 2016 at 02:36:43PM +0200, Joonas Lahtinen wrote:
> > > Instead of implementing a custom locked reference counting, use lockref.
> > > 
> > > Current implementation leads to a deadlock splat on Intel SKL platforms
> > > when lockdep debugging is enabled.
> > > 
> > > This is due to few of CPUfreq drivers (including Intel P-state) having this;
> > > policy->rwsem is locked during driver initialization and the functions called
> > > during init that actually apply CPU limits use get_online_cpus (because they
> > > have other calling paths too), which will briefly lock cpu_hotplug.lock to
> > > increase cpu_hotplug.refcount.
> > > 
> > > On later calling path, when doing a suspend, when cpu_hotplug_begin() is called
> > > in disable_nonboot_cpus(), callbacks to CPUfreq functions get called after,
> > > which will lock policy->rwsem and cpu_hotplug.lock is already held by
> > > cpu_hotplug_begin() and we do have a potential deadlock scenario reported by
> > > our CI system (though it is a very unlikely one). See the Bugzilla link for more
> > > details.
> > 
> > I've been meaning to change the thing into a percpu-rwsem, I just
> > haven't had time to look into the lockdep splat that generated.
> 
> 
> The below has plenty lockdep issues because percpu-rwsem is
> reader-writer fair (like the regular rwsem), so it does throw up a fair
> number of very icky issues.
> 
> If at all possible, I'd really rather fix those and have a 'saner'
> hotplug lock, rather than muddle on with open-coded horror lock we have
> now.
> 

I do still agree the below would be a worthy change to proceed with.

CC'd Oleg here too to give a comment.

Regards, Joonas

> 
> --- a/include/linux/cpu.h
> +++ b/include/linux/cpu.h
> @@ -225,6 +225,8 @@ extern struct bus_type cpu_subsys;
>  #ifdef CONFIG_HOTPLUG_CPU
>  /* Stop CPUs going up and down. */
>  
> +extern void cpu_hotplug_init_task(struct task_struct *p);
> +
>  extern void cpu_hotplug_begin(void);
>  extern void cpu_hotplug_done(void);
>  extern void get_online_cpus(void);
> @@ -242,6 +244,8 @@ int cpu_down(unsigned int cpu);
>  
>  #else		/* CONFIG_HOTPLUG_CPU */
>  
> +static inline void cpu_hotplug_init_task(struct task_struct *p) {}
> +
>  static inline void cpu_hotplug_begin(void) {}
>  static inline void cpu_hotplug_done(void) {}
>  #define get_online_cpus()	do { } while (0)
> --- a/include/linux/percpu-rwsem.h
> +++ b/include/linux/percpu-rwsem.h
> @@ -16,6 +16,15 @@ struct percpu_rw_semaphore {
>  	wait_queue_head_t	write_waitq;
>  };
>  
> +#define DEFINE_STATIC_PERCPU_RWSEM(name)				\
> +static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_frc_##name);	\
> +static struct percpu_rw_semaphore name = {				\
> +	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
> +	.fast_read_ctr = &__percpu_rwsem_frc_##name,			\
> +	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
> +	.write_waitq = __WAIT_QUEUE_HEAD_INITIALIZER(name.write_waitq),	\
> +}
> +
>  extern void percpu_down_read(struct percpu_rw_semaphore *);
>  extern int  percpu_down_read_trylock(struct percpu_rw_semaphore *);
>  extern void percpu_up_read(struct percpu_rw_semaphore *);
> @@ -33,9 +42,11 @@ extern void percpu_free_rwsem(struct per
>  	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
>  })
>  
> -
>  #define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
>  
> +#define percpu_rwsem_assert_held(sem)                          \
> +	lockdep_assert_held(&(sem)->rw_sem)
> +
>  static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
>  					bool read, unsigned long ip)
>  {
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1403,6 +1403,9 @@ struct task_struct {
>  	struct task_struct *last_wakee;
>  
>  	int wake_cpu;
> +#ifdef CONFIG_HOTPLUG_CPU
> +	int cpuhp_ref;
> +#endif
>  #endif
>  	int on_rq;
>  
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -23,6 +23,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "smpboot.h"
>  
> @@ -51,121 +52,52 @@ EXPORT_SYMBOL(cpu_notifier_register_done
>  
>  static RAW_NOTIFIER_HEAD(cpu_chain);
>  
> -/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
> +/*
> + * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
>   * Should always be manipulated under cpu_add_remove_lock
>   */
>  static int cpu_hotplug_disabled;
>  
>  #ifdef CONFIG_HOTPLUG_CPU
>  
> -static struct {
> -	struct task_struct *active_writer;
> -	/* wait queue to wake up the active_writer */
> -	wait_queue_head_t wq;
> -	/* verifies that no writer will get active while readers are active */
> -	struct mutex lock;
> -	/*
> -	 * Also blocks the new readers during
> -	 * an ongoing cpu hotplug operation.
> -	 */
> -	atomic_t refcount;
> -
> -#ifdef CONFIG_DEBUG_LOCK_ALLOC
> -	struct lockdep_map dep_map;
> -#endif
> -} cpu_hotplug = {
> -	.active_writer = NULL,
> -	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
> -	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
> -#ifdef CONFIG_DEBUG_LOCK_ALLOC
> -	.dep_map = {.name = "cpu_hotplug.lock" },
> -#endif
> -};
> -
> -/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
> -#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
> -#define cpuhp_lock_acquire_tryread() \
> -				  lock_map_acquire_tryread(&cpu_hotplug.dep_map)
> -#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
> -#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
> +DEFINE_STATIC_PERCPU_RWSEM(hotplug);
>  
> +void cpu_hotplug_init_task(struct task_struct *p)
> +{
> +	if (WARN_ON_ONCE(p->cpuhp_ref))
> +		p->cpuhp_ref = 0;
> +}
>  
>  void get_online_cpus(void)
>  {
>  	might_sleep();
> -	if (cpu_hotplug.active_writer == current)
> +
> +	if (current->cpuhp_ref++) /* read recursion */
>  		return;
> -	cpuhp_lock_acquire_read();
> -	mutex_lock(&cpu_hotplug.lock);
> -	atomic_inc(&cpu_hotplug.refcount);
> -	mutex_unlock(&cpu_hotplug.lock);
> +
> +	percpu_down_read(&hotplug);
>  }
>  EXPORT_SYMBOL_GPL(get_online_cpus);
>  
>  void put_online_cpus(void)
>  {
> -	int refcount;
> -
> -	if (cpu_hotplug.active_writer == current)
> +	if (--current->cpuhp_ref)
>  		return;
>  
> -	refcount = atomic_dec_return(&cpu_hotplug.refcount);
> -	if (WARN_ON(refcount < 0)) /* try to fix things up */
> -		atomic_inc(&cpu_hotplug.refcount);
> -
> -	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
> -		wake_up(&cpu_hotplug.wq);
> -
> -	cpuhp_lock_release();
> -
> +	percpu_up_read(&hotplug);
>  }
>  EXPORT_SYMBOL_GPL(put_online_cpus);
>  
> -/*
> - * This ensures that the hotplug operation can begin only when the
> - * refcount goes to zero.
> - *
> - * Note that during a cpu-hotplug operation, the new readers, if any,
> - * will be blocked by the cpu_hotplug.lock
> - *
> - * Since cpu_hotplug_begin() is always called after invoking
> - * cpu_maps_update_begin(), we can be sure that only one writer is active.
> - *
> - * Note that theoretically, there is a possibility of a livelock:
> - * - Refcount goes to zero, last reader wakes up the sleeping
> - *   writer.
> - * - Last reader unlocks the cpu_hotplug.lock.
> - * - A new reader arrives at this moment, bumps up the refcount.
> - * - The writer acquires the cpu_hotplug.lock finds the refcount
> - *   non zero and goes to sleep again.
> - *
> - * However, this is very difficult to achieve in practice since
> - * get_online_cpus() not an api which is called all that often.
> - *
> - */
>  void cpu_hotplug_begin(void)
>  {
> -	DEFINE_WAIT(wait);
> -
> -	cpu_hotplug.active_writer = current;
> -	cpuhp_lock_acquire();
> -
> -	for (;;) {
> -		mutex_lock(&cpu_hotplug.lock);
> -		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
> -		if (likely(!atomic_read(&cpu_hotplug.refcount)))
> -				break;
> -		mutex_unlock(&cpu_hotplug.lock);
> -		schedule();
> -	}
> -	finish_wait(&cpu_hotplug.wq, &wait);
> +	percpu_down_write(&hotplug);
> +	current->cpuhp_ref++; /* allow read-in-write recursion */
>  }
>  
>  void cpu_hotplug_done(void)
>  {
> -	cpu_hotplug.active_writer = NULL;
> -	mutex_unlock(&cpu_hotplug.lock);
> -	cpuhp_lock_release();
> +	current->cpuhp_ref--;
> +	percpu_up_write(&hotplug);
>  }
>  
>  /*
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1414,6 +1414,8 @@ static struct task_struct *copy_process(
>  	p->sequential_io_avg	= 0;
>  #endif
>  
> +	cpu_hotplug_init_task(p);
> +
>  	/* Perform scheduler related setup. Assign this task to a CPU. */
>  	retval = sched_fork(clone_flags, p);
>  	if (retval)
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -53,6 +53,11 @@ config GENERIC_IO
>  config STMP_DEVICE
>  	bool
>  
> +config PERCPU_RWSEM_HOTPLUG
> +	def_bool y
> +	depends on HOTPLUG_CPU
> +	select PERCPU_RWSEM
> +
>  config ARCH_USE_CMPXCHG_LOCKREF
>  	bool
>  
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux