Hi Shushu,

Thanks for your persistent work on this effort!

On Tue, Jun 11, 2024 at 12:35 PM Shushu Yi <teddyxym@xxxxxxxxxxx> wrote:

This subject and commit log need some work. For the subject, I think we
can use something like:

  md/bitmap: Optimize lock contention.

> Optimize scattered address space. Achieves significant improvements in
> both throughput and latency.
>
> Maximize thread-level parallelism and reduce CPU suspension time caused
> by lock contention. Achieve increased overall storage throughput by
> 89.4% and decrease the 99.99th percentile I/O latency by 85.4% on a
> system with four PCIe 4.0 SSDs. (Set the iodepth to 32 and employ
> libaio. Configure the I/O size as 4 KB with sequential write and 16
> threads. In RAID5 consist of 2+1 1TB Samsung 980Pro SSDs, throughput
> went from 5218MB/s to 9884MB/s.)

For the commit log, please provide more details. There is no word limit
here. Let's make it easy to understand.

> Note: Publish this work as a paper and provide the URL
> (https://www.hotstorage.org/2022/camera-ready/hotstorage22-5/pdf/
> hotstorage22-5.pdf).
>
> Co-developed-by: Yiming Xu <teddyxym@xxxxxxxxxxx>
> Signed-off-by: Yiming Xu <teddyxym@xxxxxxxxxxx>
> Signed-off-by: Shushu Yi <firnyee@xxxxxxxxx>
> Tested-by: Paul Luse <paul.e.luse@xxxxxxxxx>

[...]

> -static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
> -			       unsigned long page, int create, int no_hijack)
> -__releases(bitmap->lock)
> -__acquires(bitmap->lock)
> +static int md_bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page,
> +			       int create, int no_hijack, spinlock_t *bmclock, int locktype)
> +__releases(bmclock)
> +__acquires(bmclock)
> +__releases(bitmap->mlock)
> +__acquires(bitmap->mlock)
>  {
>  	unsigned char *mappage;
>
> @@ -65,8 +67,10 @@ __acquires(bitmap->lock)
>  		return -ENOENT;
>
>  	/* this page has not been allocated yet */
> -
> -	spin_unlock_irq(&bitmap->lock);
> +	if (locktype)
> +		spin_unlock_irq(bmclock);
> +	else
> +		write_unlock_irq(&bitmap->mlock);

locktype seems unnecessary here. We can do something like

	if (bmclock)
		spin_unlock_irq(bmclock);
	else
		write_unlock_irq(...);

Also, please add some comment here to explain the logic.
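We could even go one step further and wrap the two flavors in small
helpers, so md_bitmap_checkpage() stays readable. A rough sketch,
completely untested, and the helper names are just placeholders:

static void md_bitmap_counts_unlock(struct bitmap_counts *bitmap,
				    spinlock_t *bmclock)
{
	/* a NULL bmclock means the caller holds the metadata rwlock */
	if (bmclock)
		spin_unlock_irq(bmclock);
	else
		write_unlock_irq(&bitmap->mlock);
}

static void md_bitmap_counts_lock(struct bitmap_counts *bitmap,
				  spinlock_t *bmclock)
{
	if (bmclock)
		spin_lock_irq(bmclock);
	else
		write_lock_irq(&bitmap->mlock);
}

Then the kzalloc() path here becomes a simple unlock/lock pair around
the allocation.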
> 	/* It is possible that this is being called inside a
> 	 * prepare_to_wait/finish_wait loop from raid5c:make_request().
> 	 * In general it is not permitted to sleep in that context as it
> @@ -81,7 +85,11 @@ __acquires(bitmap->lock)
> 	 */
> 	sched_annotate_sleep();
> 	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
> -	spin_lock_irq(&bitmap->lock);
> +
> +	if (locktype)
> +		spin_lock_irq(bmclock);
> +	else
> +		write_lock_irq(&bitmap->mlock);
>
> 	if (mappage == NULL) {
> 		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
> @@ -399,7 +407,7 @@ static int read_file_page(struct file *file, unsigned long index,
> 	}
>
> 	wait_event(bitmap->write_wait,
> -		   atomic_read(&bitmap->pending_writes)==0);
> +		   atomic_read(&bitmap->pending_writes) == 0);

I would prefer not to include this fix here.

> 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
> 		ret = -EIO;
>  out:
> @@ -458,7 +466,7 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap)
>  {
> 	if (bitmap->storage.file)
> 		wait_event(bitmap->write_wait,
> -			   atomic_read(&bitmap->pending_writes)==0);
> +			   atomic_read(&bitmap->pending_writes) == 0);

and here. (and a few more later).

> 	else
> 		/* Note that we ignore the return value.  The writes
> 		 * might have failed, but that would just mean that
> @@ -1248,16 +1256,28 @@ void md_bitmap_write_all(struct bitmap *bitmap)
>  static void md_bitmap_count_page(struct bitmap_counts *bitmap,
> 				 sector_t offset, int inc)
>  {
> -	sector_t chunk = offset >> bitmap->chunkshift;
> -	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
> +	sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
> +	sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
> +	unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
> +	unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
> +				(bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
> +	unsigned long cntid = blockno & mask;
> +	unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
> +

We have similar logic 4 times in the code. Let's add some helper for it.
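Something like the below (untested, and the name is only a suggestion;
the body is the same computation as the open-coded version above) would
keep the mapping from a sector offset to a counter id in one place:

static unsigned long md_bitmap_cntid(struct bitmap_counts *bitmap,
				     sector_t offset)
{
	sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
	sector_t totblocks = bitmap->chunks << (bitmap->chunkshift -
						(PAGE_SHIFT - SECTOR_SHIFT));
	unsigned long bits = totblocks ? fls(totblocks - 1) : 0;
	unsigned long mask = ULONG_MAX << bits |
			     ~(ULONG_MAX << (bits - (bitmap->chunkshift +
						     SECTOR_SHIFT - PAGE_SHIFT)));

	return blockno & mask;
}

and the callers shrink to

	unsigned long page = md_bitmap_cntid(bitmap, offset) >> PAGE_COUNTER_SHIFT;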
> 	bitmap->bp[page].count += inc;
> 	md_bitmap_checkfree(bitmap, page);
>  }
>
>  static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
>  {
> -	sector_t chunk = offset >> bitmap->chunkshift;
> -	unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
> +	sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
> +	sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
> +	unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
> +	unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
> +				(bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
> +	unsigned long cntid = blockno & mask;
> +	unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
> +
>  	struct bitmap_page *bp = &bitmap->bp[page];
>
> 	if (!bp->pending)
> @@ -1266,7 +1286,7 @@ static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
>
>  static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
> 					       sector_t offset, sector_t *blocks,
> -					       int create);
> +					       int create, spinlock_t *bmclock, int locktype);

We don't need bmclock and locktype here, right?

>
>  static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
> 			      bool force)
> @@ -1349,7 +1369,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
> 	 * decrement and handle accordingly.
> 	 */
> 	counts = &bitmap->counts;

[...]

> @@ -2227,18 +2309,29 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
> 		blocks = min(old_counts.chunks << old_counts.chunkshift,
> 			     chunks << chunkshift);
>
> +	/* initialize bmc locks */
> +	num_bmclocks = DIV_ROUND_UP(chunks, BITMAP_COUNTER_LOCK_RATIO);
> +	num_bmclocks = min(num_bmclocks, BITMAP_COUNTER_LOCK_MAX);
> +
> +	new_bmclocks = kvcalloc(num_bmclocks, sizeof(*new_bmclocks), GFP_KERNEL);
> +	bitmap->counts.bmclocks = new_bmclocks;
> +	for (i = 0; i < num_bmclocks; ++i) {
> +		spinlock_t *bmclock = &(bitmap->counts.bmclocks)[i];
> +
> +		spin_lock_init(bmclock);
> +	}
> +
> 	/* For cluster raid, need to pre-allocate bitmap */
> 	if (mddev_is_clustered(bitmap->mddev)) {
> 		unsigned long page;
> 		for (page = 0; page < pages; page++) {
> -			ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
> +			ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, 0, 0);

For the 0 pointer (bmclock), please use NULL.

> 			if (ret) {
> 				unsigned long k;
>
> 				/* deallocate the page memory */
> -				for (k = 0; k < page; k++) {
> +				for (k = 0; k < page; k++)
> 					kfree(new_bp[k].map);
> -				}
> 				kfree(new_bp);

[...]

>
>  /* the superblock at the front of the bitmap file -- little endian */
> @@ -180,7 +186,8 @@ struct bitmap_page {
>  struct bitmap {
>
>  	struct bitmap_counts {
> -		spinlock_t lock;
> +		rwlock_t mlock;			/* lock for metadata */
> +		spinlock_t *bmclocks;		/* locks for bmc */

A summary of the locking design here can be really helpful. Maybe
something like the sketch at the end of this mail.

>  		struct bitmap_page *bp;
>  		unsigned long pages;		/* total number of pages
>  						 * in the bitmap */
> --
> 2.43.0
>
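To be concrete about that locking summary: I am thinking of a block
comment above struct bitmap_counts. Below is just a sketch based on my
reading of the patch, please correct it where it does not match the
actual rules:

/*
 * Locking: mlock protects the bitmap metadata, e.g. allocating and
 * freeing the counter pages in bp[]. The bmc counters are split into
 * ranges of BITMAP_COUNTER_LOCK_RATIO counters, each range protected
 * by one spinlock in bmclocks[], so that updates to counters in
 * different ranges can run in parallel. Counter updates take mlock
 * for read plus the matching bmclock; metadata changes take mlock
 * for write.
 */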