Patch "sbitmap: fix io hung due to race on sbitmap_word::cleared" has been added to the 6.1-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    sbitmap: fix io hung due to race on sbitmap_word::cleared

to the 6.1-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     sbitmap-fix-io-hung-due-to-race-on-sbitmap_word-clea.patch
and it can be found in the queue-6.1 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit aa7e09ef9cf16e2ae3fb470979fda5b041cbb2e7
Author: Yang Yang <yang.yang@xxxxxxxx>
Date:   Tue Jul 16 16:26:27 2024 +0800

    sbitmap: fix io hung due to race on sbitmap_word::cleared
    
    [ Upstream commit 72d04bdcf3f7d7e07d82f9757946f68802a7270a ]
    
    Configuration for sbq:
      depth=64, wake_batch=6, shift=6, map_nr=1
    
    1. There are 64 requests in progress:
      map->word = 0xFFFFFFFFFFFFFFFF
    2. After all the 64 requests complete, and no more requests come:
      map->word = 0xFFFFFFFFFFFFFFFF, map->cleared = 0xFFFFFFFFFFFFFFFF
    3. Now two tasks try to allocate requests:
      T1:                                       T2:
      __blk_mq_get_tag                          .
      __sbitmap_queue_get                       .
      sbitmap_get                               .
      sbitmap_find_bit                          .
      sbitmap_find_bit_in_word                  .
      __sbitmap_get_word  -> nr=-1              __blk_mq_get_tag
      sbitmap_deferred_clear                    __sbitmap_queue_get
      /* map->cleared=0xFFFFFFFFFFFFFFFF */     sbitmap_find_bit
        if (!READ_ONCE(map->cleared))           sbitmap_find_bit_in_word
          return false;                         __sbitmap_get_word -> nr=-1
        mask = xchg(&map->cleared, 0)           sbitmap_deferred_clear
        atomic_long_andnot()                    /* map->cleared=0 */
                                                  if (!(map->cleared))
                                                    return false;
                                         /*
                                          * map->cleared is cleared by T1
                                          * T2 fail to acquire the tag
                                          */
    
    4. T2 is the sole tag waiter. When T1 puts the tag, T2 cannot be woken
    up due to the wake_batch being set at 6. If no more requests come, T1
    will wait here indefinitely.
    
    This patch achieves two purposes:
    1. Check on ->cleared and update on both ->cleared and ->word need to
    be done atomically, and using spinlock could be the simplest solution.
    2. Add extra check in sbitmap_deferred_clear(), to identify whether
    ->word has free bits.
    
    Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits")
    Signed-off-by: Yang Yang <yang.yang@xxxxxxxx>
    Reviewed-by: Ming Lei <ming.lei@xxxxxxxxxx>
    Reviewed-by: Bart Van Assche <bvanassche@xxxxxxx>
    Link: https://lore.kernel.org/r/20240716082644.659566-1-yang.yang@xxxxxxxx
    Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d662cf136021d..c09cdcc99471e 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -36,6 +36,11 @@ struct sbitmap_word {
 	 * @cleared: word holding cleared bits
 	 */
 	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: serializes simultaneous updates of ->word and ->cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index a727d0b12763a..61075535a8073 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -60,12 +60,30 @@ static inline void update_alloc_hint_after_get(struct sbitmap *sb,
 /*
  * See if we have deferred clears that we can batch move
  */
-static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
+static inline bool sbitmap_deferred_clear(struct sbitmap_word *map,
+		unsigned int depth, unsigned int alloc_hint, bool wrap)
 {
-	unsigned long mask;
+	unsigned long mask, word_mask;
 
-	if (!READ_ONCE(map->cleared))
-		return false;
+	guard(spinlock_irqsave)(&map->swap_lock);
+
+	if (!map->cleared) {
+		if (depth == 0)
+			return false;
+
+		word_mask = (~0UL) >> (BITS_PER_LONG - depth);
+		/*
+		 * The current behavior is to always retry after moving
+		 * ->cleared to word, and we change it to retry in case
+		 * of any free bits. To avoid an infinite loop, we need
+		 * to take wrap & alloc_hint into account, otherwise a
+		 * soft lockup may occur.
+		 */
+		if (!wrap && alloc_hint)
+			word_mask &= ~((1UL << alloc_hint) - 1);
+
+		return (READ_ONCE(map->word) & word_mask) != word_mask;
+	}
 
 	/*
 	 * First get a stable cleared mask, setting the old mask to 0.
@@ -85,6 +103,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      bool alloc_hint)
 {
 	unsigned int bits_per_word;
+	int i;
 
 	if (shift < 0)
 		shift = sbitmap_calculate_shift(depth);
@@ -116,6 +135,9 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		return -ENOMEM;
 	}
 
+	for (i = 0; i < sb->map_nr; i++)
+		spin_lock_init(&sb->map[i].swap_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(sbitmap_init_node);
@@ -126,7 +148,7 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 	unsigned int i;
 
 	for (i = 0; i < sb->map_nr; i++)
-		sbitmap_deferred_clear(&sb->map[i]);
+		sbitmap_deferred_clear(&sb->map[i], 0, 0, 0);
 
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
@@ -179,7 +201,7 @@ static int sbitmap_find_bit_in_word(struct sbitmap_word *map,
 					alloc_hint, wrap);
 		if (nr != -1)
 			break;
-		if (!sbitmap_deferred_clear(map))
+		if (!sbitmap_deferred_clear(map, depth, alloc_hint, wrap))
 			break;
 	} while (1);
 
@@ -505,7 +527,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
 		unsigned int map_depth = __map_depth(sb, index);
 		unsigned long val;
 
-		sbitmap_deferred_clear(map);
+		sbitmap_deferred_clear(map, 0, 0, 0);
 		val = READ_ONCE(map->word);
 		if (val == (1UL << (map_depth - 1)) - 1)
 			goto next;




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux