zram maps each page (struct page) to a zsmalloc object that contains a compressed buffer of that page's data. This fact generates data redundancy: for example, two identical pages will be store in compressed form in zsmalloc allocator twice. This patch adds a mechanism to scan zram_table_entry array and frees all identical objects in zsmalloc allocator, leaving only one. All zram_table_entry elements which reference this freed objects now refer to the same, not freed, object. To implement this mechanism, we sequentially scan the zram_table_entry array, counting the hash from the contents of the compressed pages (zsmalloc objects) and enter the index of the object into the hash table (hlist_head). If the hash matches, we remove the identical zsmalloc (zs_free) objects and update the link rbtree. Also, we can't just call zs_free() function during zram_free_page(). Before calling this function we have to make sure that no one else refers to this zsmalloc object. To implement the mechanism for merging identical compressed pages (zsmalloc objects), a rbtree is needed. The tree node key is a reference to the zsmalloc object (handle), and the value is the number of references to this object (atomic counter). This is necessary for data consistency so that we do not zs_free the object referenced by any element of the zram_table_entry array. Signed-off-by: Alexey Romanov <avromanov@xxxxxxxxxxxxxx> --- drivers/block/zram/zram_drv.c | 278 +++++++++++++++++++++++++++++++++- drivers/block/zram/zram_drv.h | 6 + 2 files changed, 283 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e290d6d97047..716c2f72805e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -33,12 +33,15 @@ #include <linux/debugfs.h> #include <linux/cpuhotplug.h> #include <linux/part_stat.h> +#include <linux/hashtable.h> +#include <linux/xxhash.h> #include "zram_drv.h" static DEFINE_IDR(zram_index_idr); /* idr index must be protected */ static DEFINE_MUTEX(zram_index_mutex); +static DEFINE_MUTEX(zram_rbtree_mutex); static int zram_major; static const char *default_compressor = CONFIG_ZRAM_DEF_COMP; @@ -57,6 +60,16 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +struct zram_rbtree_node { + struct rb_node node; + unsigned long key; + unsigned long cnt; +}; + +struct zram_hash_node { + unsigned long index; + struct hlist_node next; +}; static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -1283,6 +1296,247 @@ static DEVICE_ATTR_RO(bd_stat); #endif static DEVICE_ATTR_RO(debug_stat); +static bool zram_rbtree_insert(struct rb_root *root, struct zram_rbtree_node *data) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct zram_rbtree_node *this; + + while (*new) { + this = rb_entry(*new, struct zram_rbtree_node, node); + parent = *new; + if (data->key < this->key) + new = &((*new)->rb_left); + else if (data->key > this->key) + new = &((*new)->rb_right); + else + return false; + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + return true; +} + +static struct zram_rbtree_node *zram_rbtree_search(struct rb_root *root, + unsigned long key) +{ + struct rb_node *node = root->rb_node; + struct zram_rbtree_node *data; + + while (node) { + data = rb_entry(node, struct zram_rbtree_node, node); + if (key < data->key) + node = node->rb_left; + else if (key > data->key) + node = node->rb_right; + else + return data; + } + + return NULL; +} + +static unsigned long zram_calc_hash(void *src, size_t len) +{ + return xxhash(src, len, 0); +} + +static int zram_cmp_obj_and_merge(struct zram *zram, struct hlist_head *htable, + size_t htable_size, size_t index) +{ + struct zram_rbtree_node *rb_node; + struct zram_hash_node *node; + unsigned long handle, cur_handle; + size_t obj_size; + char *src, *buf; + unsigned long hash; + int ret = 0; + + handle = zram_get_handle(zram, index); + if (!handle) + return ret; + + obj_size = zram_get_obj_size(zram, index); + buf = kmalloc(obj_size, GFP_KERNEL); + if (!buf) { + pr_err("Failed to allocate zs_map_object buffer\n"); + return -ENOMEM; + } + + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + memcpy(buf, src, obj_size); + zs_unmap_object(zram->mem_pool, handle); + hash = zram_calc_hash(buf, obj_size); + + mutex_lock(&zram_rbtree_mutex); + hlist_for_each_entry(node, &htable[hash % htable_size], next) { + int cmp; + + zram_slot_lock(zram, node->index); + + /* + * Page may change as the hash table is being formed, + * so the checks below are necessary. + */ + cur_handle = zram_get_handle(zram, node->index); + if (handle == cur_handle || + obj_size != zram_get_obj_size(zram, node->index)) { + zram_slot_unlock(zram, node->index); + continue; + } + + src = zs_map_object(zram->mem_pool, cur_handle, ZS_MM_RO); + cmp = memcmp(buf, src, obj_size); + zs_unmap_object(zram->mem_pool, cur_handle); + + if (!cmp) { + rb_node = zram_rbtree_search(&zram->sph_rbtree, handle); + + /* + * This check is necessary in order not to zs_free an object + * that someone already refers to. This situation is possible + * when with repeated calls to zram_do_scan(). For example: + * + * [slot0] [slot1] [slot2] [slot3] [slot4] + * [obj0] [obj1] [obj2] [obj3] [obj4] + * + * Let's imagine that obj2 and obj3 are equal, and we called + * zram_do_scan() function: + * + * [slot0] [slot1] [slot2] [slot3] [slot4] + * [obj0] [obj1] [obj2] [obj2] [obj4] + * + * Now, slot2 and slot3 refers to obj2 zsmalloc object. + * Time passed, now slot0 refres to obj0_n, which is equal + * to obj2: + * + * [slot0] [slot1] [slot2] [slot3] [slot4] + * [obj0_n] [obj1] [obj2] [obj2] [obj4] + * + * Now we call zram_do_scan() function again. We get to slot2, + * and we understand that obj2 and obj0_n hashes are the same. We + * try to zs_free(obj2), but slot3 also already refers to it. + * + * This is not correct! + */ + if (unlikely(rb_node)) + if (rb_node->cnt > 1) { + zram_slot_unlock(zram, node->index); + continue; + } + + zram_set_handle(zram, index, cur_handle); + zs_free(zram->mem_pool, handle); + + rb_node = zram_rbtree_search(&zram->sph_rbtree, cur_handle); + + if (!rb_node) { + rb_node = kzalloc(sizeof(struct zram_rbtree_node), + GFP_KERNEL); + if (!rb_node) { + pr_err("Failed to allocate rb_node\n"); + ret = -ENOMEM; + zram_slot_unlock(zram, node->index); + mutex_unlock(&zram_rbtree_mutex); + goto merged_or_err; + } + + rb_node->key = cur_handle; + /* Two slots refers to an zsmalloc object with cur_handle key */ + rb_node->cnt = 2; + zram_rbtree_insert(&zram->sph_rbtree, rb_node); + } else { + rb_node->cnt++; + } + + atomic64_sub(obj_size, &zram->stats.compr_data_size); + zram_set_flag(zram, index, ZRAM_MERGED); + zram_set_flag(zram, node->index, ZRAM_MERGED); + + zram_slot_unlock(zram, node->index); + mutex_unlock(&zram_rbtree_mutex); + goto merged_or_err; + } + + zram_slot_unlock(zram, node->index); + } + + mutex_unlock(&zram_rbtree_mutex); + + node = kmalloc(sizeof(struct zram_hash_node), GFP_KERNEL); + if (!node) { + ret = -ENOMEM; + goto merged_or_err; + } + + node->index = index; + hlist_add_head(&node->next, &htable[hash % htable_size]); + +merged_or_err: + kfree(buf); + return ret; +} + +static void zram_free_htable_entries(struct hlist_head *htable, + size_t htable_size) +{ + struct hlist_node *n; + struct zram_hash_node *node; + + hlist_for_each_entry_safe(node, n, htable, next) { + hlist_del(&node->next); + kfree(node); + } +} + +static int zram_do_scan(struct zram *zram) +{ + size_t num_pages = zram->disksize >> PAGE_SHIFT; + size_t htable_size = num_pages; + size_t index; + struct hlist_head *htable; + int i, ret = 0; + + htable = vzalloc(htable_size * sizeof(struct hlist_head)); + if (!htable) { + pr_err("Failed to allocate hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < htable_size; i++) + INIT_HLIST_HEAD(&htable[i]); + + for (index = 0; index < num_pages; index++) { + zram_slot_lock(zram, index); + + if (!zram_allocated(zram, index)) { + zram_slot_unlock(zram, index); + continue; + } + + if (zram_test_flag(zram, index, ZRAM_UNDER_WB) || + zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) { + zram_slot_unlock(zram, index); + continue; + } + + /* Ignore pages that have been recompressed */ + if (zram_get_priority(zram, index) != 0) + continue; + + ret = zram_cmp_obj_and_merge(zram, htable, htable_size, index); + zram_slot_unlock(zram, index); + if (ret != 0) + goto out; + } + +out: + zram_free_htable_entries(htable, htable_size); + vfree(htable); + return ret; +} + static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -1324,6 +1578,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + struct zram_rbtree_node *node; #ifdef CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; @@ -1361,7 +1616,26 @@ static void zram_free_page(struct zram *zram, size_t index) if (!handle) return; - zs_free(zram->mem_pool, handle); + if (zram_test_flag(zram, index, ZRAM_MERGED)) { + zram_clear_flag(zram, index, ZRAM_MERGED); + mutex_lock(&zram_rbtree_mutex); + + node = zram_rbtree_search(&zram->sph_rbtree, handle); + BUG_ON(!node); + + node->cnt--; + if (node->cnt == 0) { + rb_erase(&node->node, &zram->sph_rbtree); + mutex_unlock(&zram_rbtree_mutex); + + zs_free(zram->mem_pool, handle); + kfree(node); + } else { + mutex_unlock(&zram_rbtree_mutex); + } + } else { + zs_free(zram->mem_pool, handle); + } atomic64_sub(zram_get_obj_size(zram, index), &zram->stats.compr_data_size); @@ -2421,6 +2695,8 @@ static int zram_add(void) comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); + zram->sph_rbtree = RB_ROOT; + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c5254626f051..4a7151c94523 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -56,6 +56,7 @@ enum zram_pageflags { ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ + ZRAM_MERGED, /* page was merged */ __NR_ZRAM_PAGEFLAGS, }; @@ -140,5 +141,10 @@ struct zram { #ifdef CONFIG_ZRAM_MEMORY_TRACKING struct dentry *debugfs_dir; #endif + /* + * This is same pages handle's rb tree, where the key is a handle + * to same pages and the value is a link counter + */ + struct rb_root sph_rbtree; }; #endif -- 2.25.1