Re: [PATCH RFC -next] sbitmap: fix possible io hung due to lost wakeups

Jan Kara <jack@xxxxxxx> · Mon, 20 Jun 2022 14:24:13 +0200

On Fri 17-06-22 22:11:25, Yu Kuai wrote:
> Currently, same waitqueue might be woken up continuously:
> 
> __sbq_wake_up		__sbq_wake_up
>  sbq_wake_ptr -> assume	0
> 			 sbq_wake_ptr -> 0
>  atomic_dec_return
> 			atomic_dec_return
>  atomic_cmpxchg -> succeed
> 			 atomic_cmpxchg -> failed
> 			  return true
> 
> 			__sbq_wake_up
> 			 sbq_wake_ptr
> 			  atomic_read(&sbq->wake_index) -> still 0
>  sbq_index_atomic_inc -> inc to 1
> 			  if (waitqueue_active(&ws->wait))
> 			   if (wake_index != atomic_read(&sbq->wake_index))
> 			    atomic_set -> reset from 1 to 0
>  wake_up_nr -> wake up first waitqueue
> 			    // continue to wake up in first waitqueue
> 
> What's worse, io hung is possible in theory because wakeups might be
> missed. For example, 2 * wake_batch tags are put, while only wake_batch
> threads are worken:
> 
> __sbq_wake_up
>  atomic_cmpxchg -> reset wait_cnt
> 			__sbq_wake_up -> decrease wait_cnt
> 			...
> 			__sbq_wake_up -> wait_cnt is decreased to 0 again
> 			 atomic_cmpxchg
> 			 sbq_index_atomic_inc -> increase wake_index
> 			 wake_up_nr -> wake up and waitqueue might be empty
>  sbq_index_atomic_inc -> increase again, one waitqueue is skipped
>  wake_up_nr -> invalid wake up because old wakequeue might be empty
> 
> To fix the problem, refactor to make sure waitqueues will be woken up
> one by one,
> 
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>

So as far as I can tell your patch does not completely fix this race. See
below:

> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index ae4fd4de9ebe..dc2959cb188c 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -574,66 +574,69 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
>  }
>  EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
>  
> -static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
> +static void sbq_update_wake_index(struct sbitmap_queue *sbq,
> +				  int old_wake_index)
>  {
>  	int i, wake_index;
> -
> -	if (!atomic_read(&sbq->ws_active))
> -		return NULL;
> +	struct sbq_wait_state *ws;
>  
>  	wake_index = atomic_read(&sbq->wake_index);
> -	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
> -		struct sbq_wait_state *ws = &sbq->ws[wake_index];
> +	if (old_wake_index != wake_index)
> +		return;
>  
> +	for (i = 1; i < SBQ_WAIT_QUEUES; i++) {
> +		wake_index = sbq_index_inc(wake_index);
> +		ws = &sbq->ws[wake_index];
> +		/* Find the next active waitqueue in round robin manner */
>  		if (waitqueue_active(&ws->wait)) {
> -			if (wake_index != atomic_read(&sbq->wake_index))
> -				atomic_set(&sbq->wake_index, wake_index);
> -			return ws;
> +			atomic_cmpxchg(&sbq->wake_index, old_wake_index,
> +				       wake_index);
> +			return;
>  		}
> -
> -		wake_index = sbq_index_inc(wake_index);
>  	}
> -
> -	return NULL;
>  }
>  
>  static bool __sbq_wake_up(struct sbitmap_queue *sbq)
>  {
>  	struct sbq_wait_state *ws;
>  	unsigned int wake_batch;
> -	int wait_cnt;
> +	int wait_cnt, wake_index;
>  
> -	ws = sbq_wake_ptr(sbq);
> -	if (!ws)
> +	if (!atomic_read(&sbq->ws_active))
>  		return false;
>  
> -	wait_cnt = atomic_dec_return(&ws->wait_cnt);
> -	if (wait_cnt <= 0) {
> -		int ret;
> -
> -		wake_batch = READ_ONCE(sbq->wake_batch);
> -
> -		/*
> -		 * Pairs with the memory barrier in sbitmap_queue_resize() to
> -		 * ensure that we see the batch size update before the wait
> -		 * count is reset.
> -		 */
> -		smp_mb__before_atomic();
> +	wake_index = atomic_read(&sbq->wake_index);
> +	ws = &sbq->ws[wake_index];
> +	/*
> +	 * This can only happen in the first wakeup when sbitmap waitqueues
> +	 * are no longer idle.
> +	 */
> +	if (!waitqueue_active(&ws->wait)) {
> +		sbq_update_wake_index(sbq, wake_index);
> +		return true;
> +	}
>  
> -		/*
> -		 * For concurrent callers of this, the one that failed the
> -		 * atomic_cmpxhcg() race should call this function again
> -		 * to wakeup a new batch on a different 'ws'.
> -		 */
> -		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
> -		if (ret == wait_cnt) {
> -			sbq_index_atomic_inc(&sbq->wake_index);
> -			wake_up_nr(&ws->wait, wake_batch);
> -			return false;
> -		}
> +	wait_cnt = atomic_dec_return(&ws->wait_cnt);
> +	if (wait_cnt > 0)
> +		return false;

The following race is still possible:

CPU1					CPU2
__sbq_wake_up				__sbq_wake_up
  wake_index = atomic_read(&sbq->wake_index);
					  wake_index = atomic_read(&sbq->wake_index);

  if (!waitqueue_active(&ws->wait)) -> not taken
					  if (!waitqueue_active(&ws->wait)) -> not taken
  wait_cnt = atomic_dec_return(&ws->wait_cnt);
  /* decremented to 0 now */
  if (wait_cnt > 0) -> not taken
  sbq_update_wake_index(sbq, wake_index);
  if (wait_cnt < 0) -> not taken
  ...
  atomic_set(&ws->wait_cnt, wake_batch);
  wake_up_nr(&ws->wait, wake_batch);
					  wait_cnt = atomic_dec_return(&ws->wait_cnt);
					  /*
					   * decremented to wake_batch - 1 but
					   * there are no tasks waiting anymore
					   * so the wakeup should have gone
					   * to a different waitqueue.
					   */

I have an idea how to fix all these lost wakeups, I'll try to code it
whether it would look usable...

								Honza

> +	sbq_update_wake_index(sbq, wake_index);
> +	/*
> +	 * Concurrent callers should call this function again
> +	 * to wakeup a new batch on a different 'ws'.
> +	 */
> +	if (wait_cnt < 0)
>  		return true;
> -	}
> +
> +	wake_batch = READ_ONCE(sbq->wake_batch);
> +	/*
> +	 * Pairs with the memory barrier in sbitmap_queue_resize() to
> +	 * ensure that we see the batch size update before the wait
> +	 * count is reset.
> +	 */
> +	smp_mb__before_atomic();
> +	atomic_set(&ws->wait_cnt, wake_batch);
> +	wake_up_nr(&ws->wait, wake_batch);
>  
>  	return false;
>  }
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR