Enlarging the hash table can significantly improve transaction manager
hash operations. This commit adds a tm_hash_table_size module parameter
that sets the hash table size; the value must be a power of 2.

For demonstration, I created a thin volume, filled it with 4.8 TiB of
sequential data and took a snapshot. Then I overwrote the volume in a
way that caused the maximum number of btree nodes to be allocated. I
repeated the process for 5 snapshots and measured the deletion time of
the first snapshot. With a 128k hash table size (instead of the default
256), snapshot deletion time dropped from 42 min to 4 min, a roughly
10x improvement.

Signed-off-by: Meir Elisha <meir.elisha@xxxxxxxxxxx>
---
Script used for demonstration attached below.
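As a usage sketch (not part of the patch, and assuming the code builds
into the dm-persistent-data module; adjust the module name if your
configuration differs), the size could be set at module load time or,
since the parameter is 0644, at runtime. A new value only applies to
transaction managers created afterwards:

# at module load time
modprobe dm-persistent-data tm_hash_table_size=131072

# or at runtime, before the pool is created (131072 = 128k, a power of 2)
echo 131072 > /sys/module/dm_persistent_data/parameters/tm_hash_table_size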
 .../persistent-data/dm-transaction-manager.c | 53 +++++++++++++++++---
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index c7ba4e6cbbc7..8d486b1e6693 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -10,6 +10,7 @@
 #include "dm-space-map-metadata.h"
 #include "dm-persistent-data-internal.h"
 
+#include <linux/moduleparam.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/hash.h>
@@ -84,8 +85,35 @@ struct shadow_info {
 /*
  * It would be nice if we scaled with the size of transaction.
  */
-#define DM_HASH_SIZE 256
-#define DM_HASH_MASK (DM_HASH_SIZE - 1)
+static uint tm_hash_table_size = 256;
+
+static int param_set_hash_size(const char *val, const struct kernel_param *kp)
+{
+	unsigned int num;
+	int ret;
+
+	if (!val)
+		return -EINVAL;
+
+	ret = kstrtouint(val, 0, &num);
+	if (ret)
+		return ret;
+
+	/* Hash size must be a power of 2 */
+	if (!(num && !(num & (num - 1))))
+		return -EINVAL;
+
+	*((unsigned int *)kp->arg) = num;
+	return 0;
+}
+
+static const struct kernel_param_ops tm_hash_table_size_ops = {
+	.set = param_set_hash_size,
+	.get = param_get_uint
+};
+
+module_param_cb(tm_hash_table_size, &tm_hash_table_size_ops, &tm_hash_table_size, 0644);
+MODULE_PARM_DESC(tm_hash_table_size, "transaction manager hash size");
 
 struct dm_transaction_manager {
 	int is_clone;
@@ -95,8 +123,9 @@ struct dm_transaction_manager {
 	struct dm_space_map *sm;
 
 	spinlock_t lock;
-	struct hlist_head buckets[DM_HASH_SIZE];
-
+	struct hlist_head *buckets;
+	uint hash_size;
+	uint hash_mask;
 	struct prefetch_set prefetches;
 };
 
@@ -105,7 +134,7 @@ struct dm_transaction_manager {
 static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 {
 	int r = 0;
-	unsigned int bucket = dm_hash_block(b, DM_HASH_MASK);
+	unsigned int bucket = dm_hash_block(b, tm->hash_mask);
 	struct shadow_info *si;
 
 	spin_lock(&tm->lock);
@@ -131,7 +160,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
 	si = kmalloc(sizeof(*si), GFP_NOIO);
 	if (si) {
 		si->where = b;
-		bucket = dm_hash_block(b, DM_HASH_MASK);
+		bucket = dm_hash_block(b, tm->hash_mask);
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -146,7 +175,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
 	int i;
 
 	spin_lock(&tm->lock);
-	for (i = 0; i < DM_HASH_SIZE; i++) {
+	for (i = 0; i < tm->hash_size; i++) {
 		bucket = tm->buckets + i;
 		hlist_for_each_entry_safe(si, tmp, bucket, hlist)
 			kfree(si);
@@ -169,13 +198,21 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
 	if (!tm)
 		return ERR_PTR(-ENOMEM);
 
+	tm->hash_size = tm_hash_table_size;
+	tm->buckets = kmalloc_array(tm->hash_size, sizeof(*tm->buckets), GFP_KERNEL);
+	if (!tm->buckets) {
+		kfree(tm);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	tm->is_clone = 0;
 	tm->real = NULL;
 	tm->bm = bm;
 	tm->sm = sm;
+	tm->hash_mask = tm->hash_size - 1;
 	spin_lock_init(&tm->lock);
 
-	for (i = 0; i < DM_HASH_SIZE; i++)
+	for (i = 0; i < tm->hash_size; i++)
 		INIT_HLIST_HEAD(tm->buckets + i);
 
 	prefetch_init(&tm->prefetches);
-- 
2.34.1

Script used for testing (assumes PAGE_SIZE=4096):

# Create volume group
vgcreate --physicalextentsize 16384k vg1 /dev/nvme1n1

# Create thin pool
lvcreate --yes --extents 320002 --poolmetadatasize 104860412 --chunksize 128k --thinpool tp vg1

# Create thin volume
lvcreate --yes --virtualsize 5868826560512b --thin --name lv1 vg1/tp

# Fill volume with initial data
fio --name=write_data --ioengine=libaio \
	--direct=1 \
	--randrepeat=0 \
	--numjobs=1 \
	--bs=128k \
	--filename=/dev/vg1/lv1 \
	--size=4800G \
	--iodepth=8 \
	--rw=randwrite

for i in {1..5}; do
	# Create a snapshot
	lvcreate --name snap$i --snapshot vg1/lv1

	# Fill the snap tree with the worst-case scenario:
	# write 1 chunk at every btree leaf to force a new tree for each snapshot
	fio --name=write --filename="/dev/vg1/lv1" --rw=write:32128k --rw_sequencer=sequential \
		--bs=128k --size=4800G --direct=1 --ioengine=libaio

	sleep 1
done

# Remove a snapshot
time lvremove vg1/snap1
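A note on the fio stride above (my own back-of-the-envelope arithmetic,
not part of the patch): assuming 4 KiB metadata blocks, a mapping-tree
leaf holds roughly 252 entries. Each fio pass writes one 128k chunk and
then skips 32128k, so the write period is 128k + 32128k = 32256k = 252
chunks, i.e. one overwritten chunk per btree leaf. This forces every
leaf to be shadowed and puts the worst-case load on the shadow hash
table.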