Re: [RFC V6] md/raid5: optimize Scattered Address Space and Thread-Level Parallelism to improve RAID5 performance

Hi Shushu,

Thanks for your persistent work on this effort!

On Tue, Jun 11, 2024 at 12:35 PM Shushu Yi <teddyxym@xxxxxxxxxxx> wrote:

This subject and commit log need some work.

For the subject, I think we can use something like:
md/bitmap: Optimize lock contention.

> Optimize scattered address space. Achieves significant improvements in
> both throughput and latency.
>
> Maximize thread-level parallelism and reduce CPU suspension time caused
> by lock contention. Achieve increased overall storage throughput by
> 89.4% and decrease the 99.99th percentile I/O latency by 85.4% on a
> system with four PCIe 4.0 SSDs. (Set the iodepth to 32 and employ
> libaio. Configure the I/O size as 4 KB with sequential write and 16
> threads. On a RAID5 consisting of 2+1 1TB Samsung 980Pro SSDs, throughput
> went from 5218MB/s to 9884MB/s.)

For the commit log, please provide more details. There is no word
limit here. Let's make it easy to understand.

> Note: This work has been published as a paper; the URL is
> (https://www.hotstorage.org/2022/camera-ready/hotstorage22-5/pdf/
> hotstorage22-5.pdf).
>
> Co-developed-by: Yiming Xu <teddyxym@xxxxxxxxxxx>
> Signed-off-by: Yiming Xu <teddyxym@xxxxxxxxxxx>
> Signed-off-by: Shushu Yi <firnyee@xxxxxxxxx>
> Tested-by: Paul Luse <paul.e.luse@xxxxxxxxx>
[...]
> -static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
> -                              unsigned long page, int create, int no_hijack)
> -__releases(bitmap->lock)
> -__acquires(bitmap->lock)
> +static int md_bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page,
> +                              int create, int no_hijack, spinlock_t *bmclock, int locktype)
> +__releases(bmclock)
> +__acquires(bmclock)
> +__releases(bitmap->mlock)
> +__acquires(bitmap->mlock)
>  {
>         unsigned char *mappage;
>
> @@ -65,8 +67,10 @@ __acquires(bitmap->lock)
>                 return -ENOENT;
>
>         /* this page has not been allocated yet */
> -
> -       spin_unlock_irq(&bitmap->lock);
> +       if (locktype)
> +               spin_unlock_irq(bmclock);
> +       else
> +               write_unlock_irq(&bitmap->mlock);

locktype seems unnecessary here. We can do something like

if (bmclock)
    spin_unlock_irq(bmclock);
else
    write_unlock_irq(...);

Also, please add some comment here to explain the logic.
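Putting the two together, an untested sketch (assuming callers pass NULL
for bmclock when they hold mlock, e.g. during resize):

        /*
         * Drop the lock around the allocation.  Callers pass a
         * per-counter lock in bmclock when they hold one; callers
         * that hold the metadata lock (mlock) pass NULL.
         */
        if (bmclock)
                spin_unlock_irq(bmclock);
        else
                write_unlock_irq(&bitmap->mlock);

        sched_annotate_sleep();
        mappage = kzalloc(PAGE_SIZE, GFP_NOIO);

        if (bmclock)
                spin_lock_irq(bmclock);
        else
                write_lock_irq(&bitmap->mlock);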

>         /* It is possible that this is being called inside a
>          * prepare_to_wait/finish_wait loop from raid5c:make_request().
>          * In general it is not permitted to sleep in that context as it
> @@ -81,7 +85,11 @@ __acquires(bitmap->lock)
>          */
>         sched_annotate_sleep();
>         mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
> -       spin_lock_irq(&bitmap->lock);
> +
> +       if (locktype)
> +               spin_lock_irq(bmclock);
> +       else
> +               write_lock_irq(&bitmap->mlock);
>
>         if (mappage == NULL) {
>                 pr_debug("md/bitmap: map page allocation failed, hijacking\n");
> @@ -399,7 +407,7 @@ static int read_file_page(struct file *file, unsigned long index,
>         }
>
>         wait_event(bitmap->write_wait,
> -                  atomic_read(&bitmap->pending_writes)==0);
> +                  atomic_read(&bitmap->pending_writes) == 0);

I would prefer not to include this fix here.

>         if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
>                 ret = -EIO;
>  out:
> @@ -458,7 +466,7 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap)
>  {
>         if (bitmap->storage.file)
>                 wait_event(bitmap->write_wait,
> -                          atomic_read(&bitmap->pending_writes)==0);
> +                          atomic_read(&bitmap->pending_writes) == 0);

and here (and in a few more places later).

>         else
>                 /* Note that we ignore the return value.  The writes
>                  * might have failed, but that would just mean that
> @@ -1248,16 +1256,28 @@ void md_bitmap_write_all(struct bitmap *bitmap)
>  static void md_bitmap_count_page(struct bitmap_counts *bitmap,
>                                  sector_t offset, int inc)
>  {
> -       sector_t chunk = offset >> bitmap->chunkshift;
> -       unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
> +       sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
> +       sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
> +       unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
> +       unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
> +                                       (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
> +       unsigned long cntid = blockno & mask;
> +       unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
> +

We have this logic repeated 4 times in the code. Let's add a helper
for it, as sketched below.
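Untested sketch of what I have in mind (md_bitmap_cntid is just a
suggested name; the body is lifted from the four call sites):

        /*
         * Map a sector offset to its counter id in the scattered
         * counter layout.
         */
        static unsigned long md_bitmap_cntid(struct bitmap_counts *bitmap,
                                             sector_t offset)
        {
                sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
                sector_t totblocks = bitmap->chunks <<
                        (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
                unsigned long bits = totblocks ? fls(totblocks - 1) : 0;
                unsigned long mask = ULONG_MAX << bits |
                        ~(ULONG_MAX << (bits - (bitmap->chunkshift +
                                                SECTOR_SHIFT - PAGE_SHIFT)));

                return blockno & mask;
        }

Then each of the four call sites reduces to:

        unsigned long page = md_bitmap_cntid(bitmap, offset) >>
                                        PAGE_COUNTER_SHIFT;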

>         bitmap->bp[page].count += inc;
>         md_bitmap_checkfree(bitmap, page);
>  }
>
>  static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
>  {
> -       sector_t chunk = offset >> bitmap->chunkshift;
> -       unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
> +       sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
> +       sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
> +       unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
> +       unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
> +                                       (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
> +       unsigned long cntid = blockno & mask;
> +       unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
> +
>         struct bitmap_page *bp = &bitmap->bp[page];
>
>         if (!bp->pending)
> @@ -1266,7 +1286,7 @@ static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
>
>  static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
>                                                sector_t offset, sector_t *blocks,
> -                                              int create);
> +                                              int create, spinlock_t *bmclock, int locktype);

We don't need bmclock and locktype here, right?

>
>  static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
>                               bool force)
> @@ -1349,7 +1369,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
>          * decrement and handle accordingly.
>          */
>         counts = &bitmap->counts;
[...]
> @@ -2227,18 +2309,29 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
>         blocks = min(old_counts.chunks << old_counts.chunkshift,
>                      chunks << chunkshift);
>
> +       /* initialize bmc locks */
> +       num_bmclocks = DIV_ROUND_UP(chunks, BITMAP_COUNTER_LOCK_RATIO);
> +       num_bmclocks = min(num_bmclocks, BITMAP_COUNTER_LOCK_MAX);
> +
> +       new_bmclocks = kvcalloc(num_bmclocks, sizeof(*new_bmclocks), GFP_KERNEL);
> +       bitmap->counts.bmclocks = new_bmclocks;
> +       for (i = 0; i < num_bmclocks; ++i) {
> +               spinlock_t *bmclock = &(bitmap->counts.bmclocks)[i];
> +
> +               spin_lock_init(bmclock);
> +       }
> +
>         /* For cluster raid, need to pre-allocate bitmap */
>         if (mddev_is_clustered(bitmap->mddev)) {
>                 unsigned long page;
>                 for (page = 0; page < pages; page++) {
> -                       ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
> +                       ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, 0, 0);

For the pointer argument (bmclock), please pass NULL instead of 0.
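i.e. (keeping the current signature):

        ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, NULL, 0);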

>                         if (ret) {
>                                 unsigned long k;
>
>                                 /* deallocate the page memory */
> -                               for (k = 0; k < page; k++) {
> +                               for (k = 0; k < page; k++)
>                                         kfree(new_bp[k].map);
> -                               }
>                                 kfree(new_bp);
[...]
>
>  /* the superblock at the front of the bitmap file -- little endian */
> @@ -180,7 +186,8 @@ struct bitmap_page {
>  struct bitmap {
>
>         struct bitmap_counts {
> -               spinlock_t lock;
> +               rwlock_t mlock;                         /* lock for metadata */
> +               spinlock_t *bmclocks;           /* locks for bmc */

A summary of the locking design here would be really helpful.
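Something along these lines, as a starting point (please adjust the
wording to the actual rules):

        /*
         * Locking:
         *  - mlock (rwlock) protects the structure of the counts: bp[],
         *    pages, chunks, ...  Take it for write when the layout
         *    changes (e.g. resize), for read during normal counter
         *    updates.
         *  - bmclocks[] is an array of spinlocks; each one serializes
         *    updates to a disjoint range of counters, so updates to
         *    independent chunks can run in parallel.
         */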

>                 struct bitmap_page *bp;
>                 unsigned long pages;            /* total number of pages
>                                                  * in the bitmap */
> --
> 2.43.0
>
