On 12/3/18 6:02 AM, Roman Penyaev wrote:
> Hi all,
>
> The goal of this patch is to reduce contention of ep_poll_callback() which
> can be called concurrently from different CPUs in case of high events
> rates and many fds per epoll. Problem can be very well reproduced by
> generating events (write to pipe or eventfd) from many threads, while
> consumer thread does polling. In other words this patch increases the
> bandwidth of events which can be delivered from sources to the poller by
> adding poll items in a lockless way to the list.
>
> The main change is in replacement of the spinlock with a rwlock, which is
> taken on read in ep_poll_callback(), and then by adding poll items to the
> tail of the list using xchg atomic instruction. Write lock is taken
> everywhere else in order to stop list modifications and guarantee that list
> updates are fully completed (I assume that write side of a rwlock does not
> starve, it seems qrwlock implementation has these guarantees).
>
> The following are some microbenchmark results based on the test [1] which
> starts threads which generate N events each. The test ends when all
> events are successfully fetched by the poller thread:
>
> spinlock
> ========
>
> threads  run time   events per ms
> -------  ---------  -------------
>       8    13191ms        6064/ms
>      16    30758ms        5201/ms
>      32    44315ms        7220/ms
>
> rwlock + xchg
> =============
>
> threads  run time   events per ms
> -------  ---------  -------------
>       8     8581ms        9323/ms
>      16    13800ms       11594/ms
>      32    24167ms       13240/ms
>
> According to the results bandwidth of delivered events is significantly
> increased, thus execution time is reduced.
>
> This is RFC because I did not run any benchmarks comparing current
> qrwlock and spinlock implementations (4.19 kernel), although I did
> not notice any epoll performance degradations in other benchmarks.
>
> Also I'm not quite sure where to put very special lockless variant
> of adding element to the list (list_add_tail_lockless() in this
> patch). Seems keeping it locally is safer.
>
> [1] https://github.com/rouming/test-tools/blob/master/stress-epoll.c
>
> Signed-off-by: Roman Penyaev <rpenyaev@xxxxxxx>
> Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Cc: linux-fsdevel@xxxxxxxxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx
> ---
> fs/eventpoll.c | 107 +++++++++++++++++++++++++++++++------------------
> 1 file changed, 69 insertions(+), 38 deletions(-)
>
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 42bbe6824b4b..89debda47aca 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -50,10 +50,10 @@
> *
> * 1) epmutex (mutex)
> * 2) ep->mtx (mutex)
> - * 3) ep->wq.lock (spinlock)
> + * 3) ep->lock (rwlock)
> *
> * The acquire order is the one listed above, from 1 to 3.
> - * We need a spinlock (ep->wq.lock) because we manipulate objects
> + * We need a rwlock (ep->lock) because we manipulate objects
> * from inside the poll callback, that might be triggered from
> * a wake_up() that in turn might be called from IRQ context.
> * So we can't sleep inside the poll callback and hence we need
> @@ -85,7 +85,7 @@
> * of epoll file descriptors, we use the current recursion depth as
> * the lockdep subkey.
> * It is possible to drop the "ep->mtx" and to use the global
> - * mutex "epmutex" (together with "ep->wq.lock") to have it working,
> + * mutex "epmutex" (together with "ep->lock") to have it working,
> * but having "ep->mtx" will make the interface more scalable.
> * Events that require holding "epmutex" are very rare, while for
> * normal operations the epoll private "ep->mtx" will guarantee
> @@ -182,8 +182,6 @@ struct epitem {
> * This structure is stored inside the "private_data" member of the file
> * structure and represents the main data structure for the eventpoll
> * interface.
> - *
> - * Access to it is protected by the lock inside wq.
> */
> struct eventpoll {
> /*
> @@ -203,13 +201,16 @@ struct eventpoll {
> /* List of ready file descriptors */
> struct list_head rdllist;
>
> + /* Lock which protects rdllist and ovflist */
> + rwlock_t lock;
> +
> /* RB tree root used to store monitored fd structs */
> struct rb_root_cached rbr;
>
> /*
> * This is a single linked list that chains all the "struct epitem" that
> * happened while transferring ready events to userspace w/out
> - * holding ->wq.lock.
> + * holding ->lock.
> */
> struct epitem *ovflist;
>
> @@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
> * because we want the "sproc" callback to be able to do it
> * in a lockless way.
> */
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> list_splice_init(&ep->rdllist, &txlist);
> ep->ovflist = NULL;
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> /*
> * Now call the callback function.
> */
> res = (*sproc)(ep, &txlist, priv);
>
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> /*
> * During the time we spent inside the "sproc" callback, some
> * other events might have been queued by the poll callback.
> @@ -722,7 +723,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
> * contain them, and the list_splice() below takes care of them.
> */
> if (!ep_is_linked(epi)) {
> - list_add_tail(&epi->rdllink, &ep->rdllist);
> + /* Reverse ->ovflist, events should be in FIFO */
> + list_add(&epi->rdllink, &ep->rdllist);
> ep_pm_stay_awake(epi);
> }
> }
> @@ -745,11 +747,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
> * the ->poll() wait list (delayed after we release the lock).
> */
> if (waitqueue_active(&ep->wq))
> - wake_up_locked(&ep->wq);
> + wake_up(&ep->wq);
> if (waitqueue_active(&ep->poll_wait))
> pwake++;
> }
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> if (!ep_locked)
> mutex_unlock(&ep->mtx);
>
> @@ -789,10 +791,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
>
> rb_erase_cached(&epi->rbn, &ep->rbr);
>
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> if (ep_is_linked(epi))
> list_del_init(&epi->rdllink);
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> wakeup_source_unregister(ep_wakeup_source(epi));
> /*
> @@ -842,7 +844,7 @@ static void ep_free(struct eventpoll *ep)
> * Walks through the whole tree by freeing each "struct epitem". At this
> * point we are sure no poll callbacks will be lingering around, and also by
> * holding "epmutex" we can be sure that no file cleanup code will hit
> - * us during this operation. So we can avoid the lock on "ep->wq.lock".
> + * us during this operation. So we can avoid the lock on "ep->lock".
> * We do not need to lock ep->mtx, either, we only do it to prevent
> * a lockdep warning.
> */
> @@ -1023,6 +1025,7 @@ static int ep_alloc(struct eventpoll **pep)
> goto free_uid;
>
> mutex_init(&ep->mtx);
> + rwlock_init(&ep->lock);
> init_waitqueue_head(&ep->wq);
> init_waitqueue_head(&ep->poll_wait);
> INIT_LIST_HEAD(&ep->rdllist);
> @@ -1112,10 +1115,38 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
> }
> #endif /* CONFIG_CHECKPOINT_RESTORE */
>
> +/*
> + * Adds a new entry to the tail of the list in a lockless way, i.e.
> + * multiple CPUs are allowed to call this function concurrently.
> + *
> + * Beware: it is necessary to prevent any other modifications of the
> + * existing list until all changes are completed, in other words
> + * concurrent list_add_tail_lockless() calls should be protected
> + * with a read lock, where write lock acts as a barrier which
> + * makes sure all list_add_tail_lockless() calls are fully
> + * completed.
> + */
> +static inline void list_add_tail_lockless(struct list_head *new,
> + struct list_head *head)
> +{
> + struct list_head *prev;
> +
> + new->next = head;
> + prev = xchg(&head->prev, new);
> + prev->next = new;
> + new->prev = prev;
> +}
> +
> /*
> * This is the callback that is passed to the wait queue wakeup
> * mechanism. It is called by the stored file descriptors when they
> * have events to report.
> + *
> + * This callback takes a read lock in order not to content with concurrent
> + * events from another file descriptors, thus all modifications to ->rdllist
> + * or ->ovflist are lockless. Read lock is paired with the write lock from
> + * ep_scan_ready_list(), which stops all list modifications and guarantees
> + * that lists won't be corrupted.
> */
> static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
> {
> @@ -1126,7 +1157,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
> __poll_t pollflags = key_to_poll(key);
> int ewake = 0;
>
> - spin_lock_irqsave(&ep->wq.lock, flags);
> + read_lock_irqsave(&ep->lock, flags);
>
> ep_set_busy_poll_napi_id(epi);
>
> @@ -1156,8 +1187,8 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
> */
> if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
> if (epi->next == EP_UNACTIVE_PTR) {
> - epi->next = ep->ovflist;
> - ep->ovflist = epi;
> + /* Atomically exchange tail */
> + epi->next = xchg(&ep->ovflist, epi);

This also relies on the fact that the same epi can't be added to the list
in parallel, because the wait queue doing the wakeup will have the
wait_queue_head lock. That was a little confusing for me because we only
have the read_lock() above.

> if (epi->ws) {
> /*
> * Activate ep->ws since epi->ws may get
> @@ -1172,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
>
> /* If this file is already in the ready list we exit soon */
> if (!ep_is_linked(epi)) {
> - list_add_tail(&epi->rdllink, &ep->rdllist);
> + list_add_tail_lockless(&epi->rdllink, &ep->rdllist);
> ep_pm_stay_awake_rcu(epi);
> }

Same for this.

>
> @@ -1197,13 +1228,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
> break;
> }
> }
> - wake_up_locked(&ep->wq);
> + wake_up(&ep->wq);

Why the switch here from wake_up_locked() to wake_up()? Shouldn't the
'reader' side (which in this case takes the rwlock for write) see all
updates in a coherent state at this point?
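Setting the wake_up question aside: to convince myself that the lockless
add itself is sound, I hacked up a small userspace model of the scheme.
All of the naming below is mine, and it uses a pthread rwlock plus GCC
__atomic builtins as stand-ins for the kernel's rwlock_t and xchg(); it
is only a sketch of the idea, not code from the patch. Concurrent adders
take the lock for read and only touch their own node plus head.prev via
an atomic exchange, and taking the lock for write then guarantees that no
add is still half-finished:

/* Userspace model (mine, not from the patch) of list_add_tail_lockless()
 * paired with an rwlock.  Build with: gcc -O2 -pthread model.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	struct node *prev;
};

#define ADDERS		8
#define PER_THREAD	100000

static struct node head = { &head, &head };
static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;

/* Same shape as the patch's helper: concurrent adders are serialized
 * only by the atomic exchange on head.prev. */
static void add_tail_lockless(struct node *new)
{
	struct node *prev;

	new->next = &head;
	prev = __atomic_exchange_n(&head.prev, new, __ATOMIC_ACQ_REL);
	prev->next = new;
	new->prev = prev;
}

static void *adder(void *arg)
{
	(void)arg;
	for (int i = 0; i < PER_THREAD; i++) {
		struct node *n = malloc(sizeof(*n));

		pthread_rwlock_rdlock(&lock);
		add_tail_lockless(n);
		pthread_rwlock_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[ADDERS];
	long count = 0;

	for (int i = 0; i < ADDERS; i++)
		pthread_create(&t[i], NULL, adder, NULL);
	for (int i = 0; i < ADDERS; i++)
		pthread_join(t[i], NULL);

	/* Holding the write lock means no adder is in flight, so the list
	 * must be fully linked (nodes are leaked, this is just a checker). */
	pthread_rwlock_wrlock(&lock);
	for (struct node *n = head.next; n != &head; n = n->next)
		count++;
	pthread_rwlock_unlock(&lock);

	printf("linked %ld nodes, expected %d\n", count, ADDERS * PER_THREAD);
	return 0;
}

That matches my reading of the comment above list_add_tail_lockless():
the read lock makes the adders safe against each other, and the write
lock is the barrier that makes them safe against the splice in
ep_scan_ready_list().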
> }
> if (waitqueue_active(&ep->poll_wait))
> pwake++;
>
> out_unlock:
> - spin_unlock_irqrestore(&ep->wq.lock, flags);
> + read_unlock_irqrestore(&ep->lock, flags);
>
> /* We have to call this outside the lock */
> if (pwake)
> @@ -1489,7 +1520,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
> goto error_remove_epi;
>
> /* We have to drop the new item inside our item list to keep track of it */
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
>
> /* record NAPI ID of new item if present */
> ep_set_busy_poll_napi_id(epi);
> @@ -1501,12 +1532,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
>
> /* Notify waiting tasks that events are available */
> if (waitqueue_active(&ep->wq))
> - wake_up_locked(&ep->wq);
> + wake_up(&ep->wq);

Is this switch necessary as well? Is it just to make lockdep happy? Looks
like there are a few more conversions like this below...

Thanks,

-Jason

> if (waitqueue_active(&ep->poll_wait))
> pwake++;
> }
>
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> atomic_long_inc(&ep->user->epoll_watches);
>
> @@ -1532,10 +1563,10 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
> * list, since that is used/cleaned only inside a section bound by "mtx".
> * And ep_insert() is called with "mtx" held.
> */
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> if (ep_is_linked(epi))
> list_del_init(&epi->rdllink);
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> wakeup_source_unregister(ep_wakeup_source(epi));
>
> @@ -1579,9 +1610,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
> * 1) Flush epi changes above to other CPUs. This ensures
> * we do not miss events from ep_poll_callback if an
> * event occurs immediately after we call f_op->poll().
> - * We need this because we did not take ep->wq.lock while
> + * We need this because we did not take ep->lock while
> * changing epi above (but ep_poll_callback does take
> - * ep->wq.lock).
> + * ep->lock).
> *
> * 2) We also need to ensure we do not miss _past_ events
> * when calling f_op->poll(). This barrier also
> @@ -1600,18 +1631,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
> * list, push it inside.
> */
> if (ep_item_poll(epi, &pt, 1)) {
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> if (!ep_is_linked(epi)) {
> list_add_tail(&epi->rdllink, &ep->rdllist);
> ep_pm_stay_awake(epi);
>
> /* Notify waiting tasks that events are available */
> if (waitqueue_active(&ep->wq))
> - wake_up_locked(&ep->wq);
> + wake_up(&ep->wq);
> if (waitqueue_active(&ep->poll_wait))
> pwake++;
> }
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
> }
>
> /* We have to call this outside the lock */
> @@ -1764,7 +1795,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> * caller specified a non blocking operation.
> */
> timed_out = 1;
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> goto check_events;
> }
>
> @@ -1773,7 +1804,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> if (!ep_events_available(ep))
> ep_busy_loop(ep, timed_out);
>
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
>
> if (!ep_events_available(ep)) {
> /*
> @@ -1789,7 +1820,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> * ep_poll_callback() when events will become available.
> */
> init_waitqueue_entry(&wait, current);
> - __add_wait_queue_exclusive(&ep->wq, &wait);
> + add_wait_queue_exclusive(&ep->wq, &wait);
>
> for (;;) {
> /*
> @@ -1815,21 +1846,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
> break;
> }
> }
>
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
> if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
> timed_out = 1;
>
> - spin_lock_irq(&ep->wq.lock);
> + write_lock_irq(&ep->lock);
> }
>
> - __remove_wait_queue(&ep->wq, &wait);
> + remove_wait_queue(&ep->wq, &wait);
> __set_current_state(TASK_RUNNING);
> }
> check_events:
> /* Is it worth to try to dig for events ? */
> eavail = ep_events_available(ep);
>
> - spin_unlock_irq(&ep->wq.lock);
> + write_unlock_irq(&ep->lock);
>
> /*
> * Try to transfer events to user space. In case we get 0 events and
>
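P.S. For anyone who wants to get a feel for the numbers in the cover
letter without grabbing stress-epoll.c from [1], the benchmark shape is
roughly the one below. This is my own compressed sketch of the idea
(T producer threads each hammering their own eventfd while one consumer
drains them through a single epoll instance), not Roman's test, and the
constants are arbitrary:

/* Rough sketch of the producer/consumer benchmark shape (not the tool
 * from [1]).  Build with: gcc -O2 -pthread bench.c */
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define THREADS 8
#define EVENTS_PER_THREAD 1000000ULL

static int efd[THREADS];

static void *producer(void *arg)
{
	int fd = *(int *)arg;
	uint64_t one = 1;

	for (uint64_t i = 0; i < EVENTS_PER_THREAD; i++)
		if (write(fd, &one, sizeof(one)) != sizeof(one))
			abort();
	return NULL;
}

int main(void)
{
	struct epoll_event ev, evs[THREADS];
	pthread_t t[THREADS];
	uint64_t total = 0, cnt;
	struct timespec a, b;
	int ep = epoll_create1(0);

	for (int i = 0; i < THREADS; i++) {
		efd[i] = eventfd(0, EFD_NONBLOCK);
		ev.events = EPOLLIN;
		ev.data.fd = efd[i];
		epoll_ctl(ep, EPOLL_CTL_ADD, efd[i], &ev);
	}

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (int i = 0; i < THREADS; i++)
		pthread_create(&t[i], NULL, producer, &efd[i]);

	/* An eventfd read returns the accumulated counter, so summing the
	 * values read tells us how many writes have been consumed so far. */
	while (total < (uint64_t)THREADS * EVENTS_PER_THREAD) {
		int n = epoll_wait(ep, evs, THREADS, -1);

		for (int i = 0; i < n; i++)
			if (read(evs[i].data.fd, &cnt, sizeof(cnt)) == sizeof(cnt))
				total += cnt;
	}
	clock_gettime(CLOCK_MONOTONIC, &b);

	for (int i = 0; i < THREADS; i++)
		pthread_join(t[i], NULL);

	long ms = (b.tv_sec - a.tv_sec) * 1000 + (b.tv_nsec - a.tv_nsec) / 1000000;
	printf("%d threads: %ld ms, ~%llu events/ms\n", THREADS, ms,
	       (unsigned long long)(total / (ms ? ms : 1)));
	return 0;
}

Running something like this once on a stock kernel and once with the
patch applied and comparing the events/ms figure is essentially what the
tables in the cover letter report.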