Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page

Hi Jonathan

I ran a test in my environment, but I didn't see such a big
performance difference.

The first environment:
All nvme devices report a 512-byte logical size, a 512-byte physical size,
and an optimal size of 0, so I used your way of rebuilding the kernel to
force an optimal size. The resulting queue limits:
/sys/block/nvme0n1/queue/logical_block_size   512
/sys/block/nvme0n1/queue/physical_block_size  512
/sys/block/nvme0n1/queue/optimal_io_size      4096
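
All three limits can be dumped for every member device with something
like this (the device names are just from my setup):

for q in /sys/block/nvme{0,1,2,3}n1/queue; do
    grep -H . $q/logical_block_size $q/physical_block_size $q/optimal_io_size
done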

without the patch set
write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets

The second environment:
The nvme devices' optimal size is already 4096, so I didn't need to
rebuild the kernel. The queue limits I checked:
/sys/block/nvme0n1/queue/logical_block_size
/sys/block/nvme0n1/queue/physical_block_size
/sys/block/nvme0n1/queue/optimal_io_size

without the patch set
write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
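
By the way, the odd 3584-byte writes in your biosnoop trace below work
out exactly if one assumes the standard 256-byte bitmap superblock at
the start of page 0: 57228 bits -> 7154 bytes of bitmap, plus 256 bytes
of superblock = 7410 bytes, which is two 4 KiB pages with a 3314-byte
last page. roundup(3314, 512) = 3584 gives the sub-4k write without the
patch, and roundup(3314, 4096) = 4096 gives the aligned write with it.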

Best Regards
Xiao

On Sat, Feb 25, 2023 at 2:34 AM Jonathan Derrick
<jonathan.derrick@xxxxxxxxx> wrote:
>
> From: Jon Derrick <jonathan.derrick@xxxxxxxxx>
>
> If the bitmap space has enough room, size the I/O for the last bitmap
> page write to the optimal I/O size of the storage device. The expanded
> write is checked to ensure it won't overrun the data or metadata.
>
> The drive this was tested against has higher latency for sub-4k writes
> because they trigger device-side read-modify-writes of its atomic 4k
> write unit. This change helps increase performance by sizing the last
> bitmap page I/O to the device's preferred write unit, if one is given.
>
> Example Intel/Solidigm P5520
> Raid10, Chunk-size 64M, bitmap-size 57228 bits
>
> $ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1 \
>         --assume-clean --bitmap=internal --bitmap-chunk=64M
> $ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60
>
> Without patch:
>   write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets
>
> With patch:
>   write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets
>
> Biosnoop:
> Without patch:
> Time        Process        PID     Device      LBA        Size      Lat
> 1.410377    md0_raid10     6900    nvme0n1   W 16         4096      0.02
> 1.410387    md0_raid10     6900    nvme2n1   W 16         4096      0.02
> 1.410374    md0_raid10     6900    nvme3n1   W 16         4096      0.01
> 1.410381    md0_raid10     6900    nvme1n1   W 16         4096      0.02
> 1.410411    md0_raid10     6900    nvme1n1   W 115346512  4096      0.01
> 1.410418    md0_raid10     6900    nvme0n1   W 115346512  4096      0.02
> 1.410915    md0_raid10     6900    nvme2n1   W 24         3584      0.43 <--
> 1.410935    md0_raid10     6900    nvme3n1   W 24         3584      0.45 <--
> 1.411124    md0_raid10     6900    nvme1n1   W 24         3584      0.64 <--
> 1.411147    md0_raid10     6900    nvme0n1   W 24         3584      0.66 <--
> 1.411176    md0_raid10     6900    nvme3n1   W 2019022184 4096      0.01
> 1.411189    md0_raid10     6900    nvme2n1   W 2019022184 4096      0.02
>
> With patch:
> Time        Process        PID     Device      LBA        Size      Lat
> 5.747193    md0_raid10     727     nvme0n1   W 16         4096      0.01
> 5.747192    md0_raid10     727     nvme1n1   W 16         4096      0.02
> 5.747195    md0_raid10     727     nvme3n1   W 16         4096      0.01
> 5.747202    md0_raid10     727     nvme2n1   W 16         4096      0.02
> 5.747229    md0_raid10     727     nvme3n1   W 1196223704 4096      0.02
> 5.747224    md0_raid10     727     nvme0n1   W 1196223704 4096      0.01
> 5.747279    md0_raid10     727     nvme0n1   W 24         4096      0.01 <--
> 5.747279    md0_raid10     727     nvme1n1   W 24         4096      0.02 <--
> 5.747284    md0_raid10     727     nvme3n1   W 24         4096      0.02 <--
> 5.747291    md0_raid10     727     nvme2n1   W 24         4096      0.02 <--
> 5.747314    md0_raid10     727     nvme2n1   W 2234636712 4096      0.01
> 5.747317    md0_raid10     727     nvme1n1   W 2234636712 4096      0.02
>
> Reviewed-by: Christoph Hellwig <hch@xxxxxx>
> Signed-off-by: Jon Derrick <jonathan.derrick@xxxxxxxxx>
> ---
>  drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index bf250a5e3a86..920bb68156d2 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
>         return NULL;
>  }
>
> +static unsigned int optimal_io_size(struct block_device *bdev,
> +                                   unsigned int last_page_size,
> +                                   unsigned int io_size)
> +{
> +       if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
> +               return roundup(last_page_size, bdev_io_opt(bdev));
> +       return io_size;
> +}
> +
> +static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
> +                                  sector_t start, sector_t boundary)
> +{
> +       if (io_size != opt_size &&
> +           start + opt_size / SECTOR_SIZE <= boundary)
> +               return opt_size;
> +       if (start + io_size / SECTOR_SIZE <= boundary)
> +               return io_size;
> +
> +       /* Overflows boundary */
> +       return 0;
> +}
> +
>  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                            struct page *page)
>  {
> @@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>         sector_t offset = mddev->bitmap_info.offset;
>         sector_t ps, sboff, doff;
>         unsigned int size = PAGE_SIZE;
> +       unsigned int opt_size = PAGE_SIZE;
>
>         bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>         if (page->index == store->file_pages - 1) {
> @@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>
>                 if (last_page_size == 0)
>                         last_page_size = PAGE_SIZE;
> -               size = roundup(last_page_size,
> -                              bdev_logical_block_size(bdev));
> +               size = roundup(last_page_size, bdev_logical_block_size(bdev));
> +               opt_size = optimal_io_size(bdev, last_page_size, size);
>         }
>
>         ps = page->index * PAGE_SIZE / SECTOR_SIZE;
> @@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                         return -EINVAL;
>         } else if (offset < 0) {
>                 /* DATA  BITMAP METADATA  */
> -               if (offset + ps + size / SECTOR_SIZE > 0)
> +               size = bitmap_io_size(size, opt_size, offset + ps, 0);
> +               if (size == 0)
>                         /* bitmap runs in to metadata */
>                         return -EINVAL;
>
> @@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                         return -EINVAL;
>         } else if (rdev->sb_start < rdev->data_offset) {
>                 /* METADATA BITMAP DATA */
> -               if (sboff + ps + size / SECTOR_SIZE > doff)
> +               size = bitmap_io_size(size, opt_size, sboff + ps, doff);
> +               if (size == 0)
>                         /* bitmap runs in to data */
>                         return -EINVAL;
>         } else {
> --
> 2.27.0
>
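
P.S. For anyone who wants to poke at the rounding outside the kernel,
here is a quick userspace sketch mirroring bitmap_io_size() from the
patch above. SECTOR_SIZE, PAGE_SIZE, and the layout values in main()
are assumptions for illustration, not taken from a real array:

#include <stdio.h>

#define SECTOR_SIZE 512u
#define PAGE_SIZE   4096u

/* roundup() as in the kernel's include/linux/math.h */
static unsigned int roundup_u(unsigned int x, unsigned int y)
{
	return ((x + y - 1) / y) * y;
}

/* mirrors bitmap_io_size() from the patch */
static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
				   long long start, long long boundary)
{
	if (io_size != opt_size &&
	    start + opt_size / SECTOR_SIZE <= boundary)
		return opt_size;
	if (start + io_size / SECTOR_SIZE <= boundary)
		return io_size;
	return 0;	/* overflows the boundary */
}

int main(void)
{
	/* last page of the example bitmap: 7410 % 4096 = 3314 bytes */
	unsigned int last_page_size = 3314;
	unsigned int size = roundup_u(last_page_size, SECTOR_SIZE); /* 3584 */
	unsigned int opt = roundup_u(last_page_size, PAGE_SIZE);    /* 4096 */

	/* METADATA BITMAP DATA layout: bitmap page at sector 24 (as in
	 * the biosnoop traces), data assumed to start at sector 2048 */
	printf("write size: %u\n", bitmap_io_size(size, opt, 24, 2048));
	return 0;
}

With the 3314-byte last page from the numbers above, this prints 4096,
matching the post-patch trace.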




