Re: [PATCH] kyber: fix another domain token wait queue hang

Omar Sandoval <osandov@xxxxxxxxxxx> · Tue, 5 Dec 2017 13:49:31 -0800

On Mon, Dec 04, 2017 at 03:12:15PM -0800, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@xxxxxx>
> 
> Commit 8cf466602028 ("kyber: fix hang on domain token wait queue") fixed
> a hang caused by leaving wait entries on the domain token wait queue
> after the __sbitmap_queue_get() retry succeeded, making that wait entry
> a "dud" which won't in turn wake more entries up. However, we can also
> get a dud entry if kyber_get_domain_token() fails once but is then
> called again and succeeds. This can happen if the hardware queue is
> rerun for some other reason, or, more likely, kyber_dispatch_request()
> tries the same domain twice.
> 
> The fix is to remove our entry from the wait queue whenever we
> successfully get a token. The only complication is that we might be on
> one of many wait queues in the struct sbitmap_queue, but that's easily
> fixed by remembering which wait queue we were put on.
> 
> While we're here, only initialize the wait queue entry once instead on
> every wait, and use spin_lock_irq() instead of spin_lock_irqsave(),
> since this is always called from process context with irqs enabled.
> 
> Signed-off-by: Omar Sandoval <osandov@xxxxxx>
> ---
> I have another, rarer hang I'm still working out, but I can get this one
> out of the way.
> 
>  block/kyber-iosched.c | 39 +++++++++++++++++++++++++--------------
>  1 file changed, 25 insertions(+), 14 deletions(-)
> 
> diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
> index b4df317c2916..00cf624ce3ed 100644
> --- a/block/kyber-iosched.c
> +++ b/block/kyber-iosched.c
> @@ -100,9 +100,13 @@ struct kyber_hctx_data {
>  	unsigned int cur_domain;
>  	unsigned int batching;
>  	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
> +	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
>  	atomic_t wait_index[KYBER_NUM_DOMAINS];
>  };
>  
> +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
> +			     void *key);
> +
>  static int rq_sched_domain(const struct request *rq)
>  {
>  	unsigned int op = rq->cmd_flags;
> @@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
>  
>  	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
>  		INIT_LIST_HEAD(&khd->rqs[i]);
> +		init_waitqueue_func_entry(&khd->domain_wait[i],
> +					  kyber_domain_wake);
> +		khd->domain_wait[i].private = hctx;
>  		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
>  		atomic_set(&khd->wait_index[i], 0);
>  	}
> @@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
>  	int nr;
>  
>  	nr = __sbitmap_queue_get(domain_tokens);
> -	if (nr >= 0)
> -		return nr;
>  
>  	/*
>  	 * If we failed to get a domain token, make sure the hardware queue is
>  	 * run when one becomes available. Note that this is serialized on
>  	 * khd->lock, but we still need to be careful about the waker.
>  	 */
> -	if (list_empty_careful(&wait->entry)) {
> -		init_waitqueue_func_entry(wait, kyber_domain_wake);
> -		wait->private = hctx;
> +	if (nr < 0 && list_empty_careful(&wait->entry)) {
>  		ws = sbq_wait_ptr(domain_tokens,
>  				  &khd->wait_index[sched_domain]);
> -		add_wait_queue(&ws->wait, wait);
> +		khd->domain_ws[sched_domain] = ws;
> +		add_wait_queue_exclusive(&ws->wait, wait);

Hm, just realized that I changed this to add_wait_queue_exclusive()...
That wasn't intentional. I'll send a v2.

>  		/*
>  		 * Try again in case a token was freed before we got on the wait
> -		 * queue. The waker may have already removed the entry from the
> -		 * wait queue, but list_del_init() is okay with that.
> +		 * queue.
>  		 */
>  		nr = __sbitmap_queue_get(domain_tokens);
> -		if (nr >= 0) {
> -			unsigned long flags;
> +	}
>  
> -			spin_lock_irqsave(&ws->wait.lock, flags);
> -			list_del_init(&wait->entry);
> -			spin_unlock_irqrestore(&ws->wait.lock, flags);
> -		}
> +	/*
> +	 * If we got a token while we were on the wait queue, remove ourselves
> +	 * from the wait queue to ensure that all wake ups make forward
> +	 * progress. It's possible that the waker already deleted the entry
> +	 * between the !list_empty_careful() check and us grabbing the lock, but
> +	 * list_del_init() is okay with that.
> +	 */
> +	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
> +		ws = khd->domain_ws[sched_domain];
> +		spin_lock_irq(&ws->wait.lock);
> +		list_del_init(&wait->entry);
> +		spin_unlock_irq(&ws->wait.lock);
>  	}
> +
>  	return nr;
>  }
>  
> -- 
> 2.15.1
>