__send_empty_flush() sends empty flush bios to every target in the dm_table. However, if the num_targets exceeds the number of block devices in the dm_table's device list, it could lead to multiple invocations of __send_duplicate_bios() for the same block device. Typically, a single thread sending numerous empty flush bios to one block device is redundant, as these bios are likely to be merged by the flush state machine. In scenarios where num_targets significantly outweighs the number of block devices, such behavior may result in a noteworthy decrease in performance. This issue can be reproduced using this command line: for i in {0..1023}; do echo $((8000*$i)) 8000 linear /dev/sda2 $((16384*$i)) done | dmsetup create example With this fix, a random write with fsync workload executed with the following fio command: fio --group_reporting --name=benchmark --filename=/dev/mapper/example \ --ioengine=sync --invalidate=1 --numjobs=16 --rw=randwrite \ --blocksize=4k --size=2G --time_based --runtime=30 --fdatasync=1 results in an increase from 857 KB/s to 30.8 MB/s of the write throughput (3580% increase). Signed-off-by: Yang Yang <yang.yang@xxxxxxxx> --- drivers/md/dm-core.h | 1 + drivers/md/dm-table.c | 7 +++++ drivers/md/dm.c | 59 +++++++++++++++++++++++++++++++++++ include/linux/device-mapper.h | 6 ++++ 4 files changed, 73 insertions(+) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index e6757a30dcca..7e3f2168289f 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -217,6 +217,7 @@ struct dm_table { /* a list of devices used by this table */ struct list_head devices; struct rw_semaphore devices_lock; + unsigned short num_devices; /* events get handed up using this callback */ void (*event_fn)(void *data); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..ddc60e498afb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2133,6 +2133,8 @@ void dm_table_postsuspend_targets(struct dm_table *t) int dm_table_resume_targets(struct dm_table *t) { + struct list_head *devices = dm_table_get_devices(t); + struct dm_dev_internal *dd; unsigned int i; int r = 0; @@ -2159,6 +2161,11 @@ int dm_table_resume_targets(struct dm_table *t) ti->type->resume(ti); } + t->num_devices = 0; + + list_for_each_entry(dd, devices, list) + dd->dm_dev->index = ++(t->num_devices); + return 0; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 56aa2a8b9d71..7297235291f6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -48,6 +48,8 @@ */ #define REQ_DM_POLL_LIST REQ_DRV +#define DM_MAX_TABLE_DEVICES 1024 + static const char *_name = DM_NAME; static unsigned int major; @@ -1543,10 +1545,38 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe return ret; } +static inline bool has_redundant_flush(struct dm_table *t, + unsigned long **bitmap) +{ + if (t->num_devices < t->num_targets) { + /* Add a limit here to prevent excessive memory usage for bitmaps */ + if (t->num_devices >= DM_MAX_TABLE_DEVICES) + return false; + + /* dm_dev's index starts from 1, so need plus 1 here */ + *bitmap = bitmap_zalloc(t->num_devices + 1, GFP_KERNEL); + if (*bitmap) + return true; + } + + return false; +} + +static int dm_get_dev_index(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + unsigned short *index = data; + *index = dev->index; + return 0; +} + static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + unsigned long *handled_map; + unsigned int nr_handled = 0; + bool check = has_redundant_flush(t, &handled_map); /* * Use an on-stack bio for this, it's safe since we don't @@ -1562,17 +1592,46 @@ static void __send_empty_flush(struct clone_info *ci) for (unsigned int i = 0; i < t->num_targets; i++) { unsigned int bios; + unsigned short index = 0; struct dm_target *ti = dm_table_get_target(t, i); if (unlikely(ti->num_flush_bios == 0)) continue; + /* + * If the num_targets is greater than the number of block devices + * in the dm_table's devices list, __send_empty_flush() might + * invoke __send_duplicate_bios() multiple times for the same + * block device. This could lead to a substantial decrease in + * performance when num_targets significantly exceeds the number + * of block devices. + * Ensure that __send_duplicate_bios() is only called once for + * each block device. + */ + if (check) { + if (nr_handled == t->num_devices) + break; + + if (ti->type->iterate_devices) + ti->type->iterate_devices(ti, dm_get_dev_index, &index); + + if (index > 0) { + if (__test_and_set_bit(index, handled_map)) + continue; + else + nr_handled++; + } + } + atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL, GFP_NOWAIT); atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); } + if (check) + bitmap_free(handled_map); + /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following subtraction diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 82b2195efaca..4a54b4f0a609 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -169,6 +169,12 @@ struct dm_dev { struct dax_device *dax_dev; blk_mode_t mode; char name[16]; + + /* + * sequential number for each dm_dev in dm_table's devices list, + * start from 1 + */ + unsigned short index; }; /* -- 2.34.1