Patch "btrfs: update stripe_sectors::uptodate in steal_rbio" has been added to the 5.19-stable tree

This is a note to let you know that I've just added the patch titled

    btrfs: update stripe_sectors::uptodate in steal_rbio

to the 5.19-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     btrfs-update-stripe_sectors-uptodate-in-steal_rbio.patch
and it can be found in the queue-5.19 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 9ee3a884f1a152d3f3831e9ea216c906e1a5b4fd
Author: Qu Wenruo <wqu@xxxxxxxx>
Date:   Wed Jun 1 13:54:28 2022 +0800

    btrfs: update stripe_sectors::uptodate in steal_rbio
    
    [ Upstream commit 4d10046613333508d31fe926c545c8c0b620508a ]
    
    [BUG]
    With added debugging, it turns out the following write sequence would
    cause an extra, unnecessary read:
    
      # xfs_io -f -s -c "pwrite -b 32k 0 32k" -c "pwrite -b 32k 32k 32k" \
                     -c "pwrite -b 32k 64k 32k" -c "pwrite -b 32k 96k 32k" \
                     $mnt/file
    
    The debug message looks like this (btrfs header skipped):
    
      partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768
      partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536
      full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768
      full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536
      full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768
      full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 len=32768
      partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=0 physical=323026944 len=32768
      partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536
      ^^^^
       Still a partial read, even though 389152768 is already cached by the
       first write.
    
      full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768
      full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=0 physical=22020096 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536
      ^^^^
       Still a partial read for 298844160.
    
      full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768
      full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768
    
    This means that every 32K write, even when it lands in the same full
    stripe, still triggers a read of previously cached data.
    
    This would cause extra RAID56 IO, making the btrfs raid56 cache useless.
    
    [CAUSE]
    Commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage
    compatible") tries to make steal_rbio() subpage compatible, but during
    that conversion, there is one thing missing.
    
    We no longer rely on PageUptodate(rbio->stripe_pages[i]), but on
    rbio->stripe_sectors[i].uptodate, to determine if a sector is uptodate.
    
    This means that previously, switching the page pointer was all that was
    needed, as the PageUptodate flag was still bound to that page.
    
    But now we have to manually mark the involved sectors uptodate, or
    raid56_rmw_stripe() will later find that the stolen sector is not
    uptodate, assemble a read bio for it, and waste IO.
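    
    The following is a minimal, standalone sketch of that behaviour (it is
    not the btrfs code; the names sector_info, steal_page() and needs_read()
    are made up for illustration, and the page/sector sizes are assumed):
    an RMW-style check trusts only the per-sector uptodate bits, so a stolen
    page whose sectors were never marked uptodate still forces a read.
    
      #include <stdbool.h>
      #include <stdio.h>
      
      #define SKETCH_PAGE_SIZE   65536                 /* assumed 64K page  */
      #define SKETCH_SECTORSIZE  4096                  /* assumed 4K sector */
      #define SECTORS_PER_PAGE   (SKETCH_PAGE_SIZE / SKETCH_SECTORSIZE)
      
      struct sector_info {
              bool uptodate;
      };
      
      /*
       * "Steal" one page worth of sectors.  Without the loop below (the
       * pre-fix behaviour), only the page pointer would move and the
       * destination sectors would stay marked stale.
       */
      static void steal_page(struct sector_info *dest, int page_nr, bool fixed)
      {
              int i;
      
              if (!fixed)
                      return;
              for (i = SECTORS_PER_PAGE * page_nr;
                   i < SECTORS_PER_PAGE * (page_nr + 1); i++)
                      dest[i].uptodate = true;
      }
      
      /* RMW decision: any sector that is not uptodate must be read first. */
      static bool needs_read(const struct sector_info *sectors, int nr)
      {
              int i;
      
              for (i = 0; i < nr; i++)
                      if (!sectors[i].uptodate)
                              return true;
              return false;
      }
      
      int main(void)
      {
              struct sector_info dest[SECTORS_PER_PAGE] = { { false } };
      
              steal_page(dest, 0, false);
              printf("without fix: needs read = %d\n",
                     needs_read(dest, SECTORS_PER_PAGE));
              steal_page(dest, 0, true);
              printf("with fix:    needs read = %d\n",
                     needs_read(dest, SECTORS_PER_PAGE));
              return 0;
      }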
    
    [FIX]
    We can easily fix the bug by also updating
    rbio->stripe_sectors[].uptodate in steal_rbio().
    
    With this fixed, the same write pattern no longer leads to the
    unnecessary reads:
    
      partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768
      partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536
      full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768
      full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768
      partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536
      full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768
      full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 len=32768
      ^^^ No more partial reads, we go directly into the write path.
      full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768
      full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768
      full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768
      full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768
    
    Fixes: d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible")
    Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
    Reviewed-by: David Sterba <dsterba@xxxxxxxx>
    Signed-off-by: David Sterba <dsterba@xxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index a5b623ee6fac..13e0bb0479e6 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -347,6 +347,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
 	}
 }
 
+static void steal_rbio_page(struct btrfs_raid_bio *src,
+			    struct btrfs_raid_bio *dest, int page_nr)
+{
+	const u32 sectorsize = src->bioc->fs_info->sectorsize;
+	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
+	int i;
+
+	if (dest->stripe_pages[page_nr])
+		__free_page(dest->stripe_pages[page_nr]);
+	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
+	src->stripe_pages[page_nr] = NULL;
+
+	/* Also update the sector->uptodate bits. */
+	for (i = sectors_per_page * page_nr;
+	     i < sectors_per_page * page_nr + sectors_per_page; i++)
+		dest->stripe_sectors[i].uptodate = true;
+}
+
 /*
  * Stealing an rbio means taking all the uptodate pages from the stripe array
  * in the source rbio and putting them into the destination rbio.
@@ -358,7 +376,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 {
 	int i;
 	struct page *s;
-	struct page *d;
 
 	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 		return;
@@ -368,12 +385,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 		if (!s || !full_page_sectors_uptodate(src, i))
 			continue;
 
-		d = dest->stripe_pages[i];
-		if (d)
-			__free_page(d);
-
-		dest->stripe_pages[i] = s;
-		src->stripe_pages[i] = NULL;
+		steal_rbio_page(src, dest, i);
 	}
 	index_stripe_sectors(dest);
 	index_stripe_sectors(src);
