[PATCH 1/4] zswap: Share zpool memory of duplicate pages

Srividya Desireddy <srividya.dr@xxxxxxxxxxx> · Wed, 17 Aug 2016 10:07:36 +0000

From: Srividya Desireddy <srividya.dr@xxxxxxxxxxx>
Date: Wed, 17 Aug 2016 14:31:01 +0530
Subject: [PATCH 1/4] zswap: Share zpool memory of duplicate pages

This patch shares the compressed pool memory of duplicate pages and reduces
compressed pool memory utilized by zswap.

For each page requested for swap-out to zswap, calculate 32-bit checksum of
the page. Search for duplicate pages by comparing the checksum of the new
page with existing pages. Compare the contents of the pages if checksum
matches. If the contents also match, then share the compressed data of the
existing page with the new page. Increment the reference count to check
the number of pages sharing the compressed page in zpool.

If a duplicate page is not found then treat the new page as a 'unique' page
in zswap. Compress the new page and store the compressed data in the zpool.
Insert the unique page in the Red-Black Tree which is balanced based on
32-bit checksum value of the page.

Signed-off-by: Srividya Desireddy <srividya.dr@xxxxxxxxxxx>
---
 mm/zswap.c |  265 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 248 insertions(+), 17 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 275b22c..f7efede 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -41,6 +41,7 @@
 #include <linux/swapops.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
+#include <linux/jhash.h>
 
 /*********************************
 * statistics
@@ -51,6 +52,13 @@ static u64 zswap_pool_total_size;
 static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
 
 /*
+ * The number of swapped out pages which are identified as duplicate
+ * to the existing zswap pages. Compression and storing of these pages
+ * is avoided.
+ */
+static atomic_t zswap_duplicate_pages = ATOMIC_INIT(0);
+
+/*
  * The statistics below are not protected from concurrent access for
  * performance reasons so they may not be a 100% accurate.  However,
  * they do provide useful information on roughly how many times a
@@ -123,6 +131,28 @@ struct zswap_pool {
 };
 
 /*
+ * struct zswap_handle
+ * This structure contains the metadata for tracking single zpool handle
+ * allocation.
+ *
+ * rbnode - links the zswap_handle into red-black tree
+ * checksum - 32-bit checksum value of the page swapped to zswap
+ * ref_count - number of pages sharing this handle
+ * length - the length in bytes of the compressed page data.
+ *          Needed during decompression.
+ * handle - zpool allocation handle that stores the compressed page data.
+ * pool - the zswap_pool the entry's data is in.
+ */
+struct zswap_handle {
+	struct rb_node rbnode;
+	u32 checksum;
+	u16 ref_count;
+	unsigned int length;
+	unsigned long handle;
+	struct zswap_pool *pool;
+};
+
+/*
  * struct zswap_entry
  *
  * This structure contains the metadata for tracking a single compressed
@@ -136,18 +166,15 @@ struct zswap_pool {
  *            for the zswap_tree structure that contains the entry must
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
- * length - the length in bytes of the compressed page data.  Needed during
- *          decompression
  * pool - the zswap_pool the entry's data is in
- * handle - zpool allocation handle that stores the compressed page data
+ * zhandle - pointer to struct zswap_handle
  */
 struct zswap_entry {
 	struct rb_node rbnode;
 	pgoff_t offset;
 	int refcount;
-	unsigned int length;
 	struct zswap_pool *pool;
-	unsigned long handle;
+	struct zswap_handle *zhandle;
 };
 
 struct zswap_header {
@@ -161,6 +188,8 @@ struct zswap_header {
  */
 struct zswap_tree {
 	struct rb_root rbroot;
+	struct rb_root zhandleroot;
+	void  *buffer;
 	spinlock_t lock;
 };
 
@@ -236,6 +265,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
+	entry->zhandle = NULL;
 	RB_CLEAR_NODE(&entry->rbnode);
 	return entry;
 }
@@ -246,6 +276,39 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 }
 
 /*********************************
+* zswap handle functions
+**********************************/
+static struct kmem_cache *zswap_handle_cache;
+
+static int __init zswap_handle_cache_create(void)
+{
+	zswap_handle_cache = KMEM_CACHE(zswap_handle, 0);
+	return zswap_handle_cache == NULL;
+}
+
+static void __init zswap_handle_cache_destroy(void)
+{
+	kmem_cache_destroy(zswap_handle_cache);
+}
+
+static struct zswap_handle *zswap_handle_cache_alloc(gfp_t gfp)
+{
+	struct zswap_handle *zhandle;
+
+	zhandle = kmem_cache_alloc(zswap_handle_cache, gfp);
+	if (!zhandle)
+		return NULL;
+	zhandle->ref_count = 1;
+	RB_CLEAR_NODE(&zhandle->rbnode);
+	return zhandle;
+}
+
+static void zswap_handle_cache_free(struct zswap_handle *zhandle)
+{
+	kmem_cache_free(zswap_handle_cache, zhandle);
+}
+
+/*********************************
 * rbtree functions
 **********************************/
 static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
@@ -300,14 +363,124 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 	}
 }
 
+static struct zswap_handle *zswap_handle_rb_search(struct rb_root *root,
+						u32 checksum)
+{
+	struct rb_node *node = root->rb_node;
+	struct zswap_handle *zhandle;
+
+	while (node) {
+		zhandle = rb_entry(node, struct zswap_handle, rbnode);
+		if (zhandle->checksum > checksum)
+			node = node->rb_left;
+		else if (zhandle->checksum < checksum)
+			node = node->rb_right;
+		else
+			return zhandle;
+	}
+	return NULL;
+}
+
+/*
+ * In the case that zhandle with the same checksum is found, a pointer to
+ * the existing zhandle is stored in duphandle and the function returns
+ * -EEXIST
+ */
+static int zswap_handle_rb_insert(struct rb_root *root,
+				struct zswap_handle *zhandle,
+				struct zswap_handle **duphandle)
+{
+	struct rb_node **link = &root->rb_node, *parent = NULL;
+	struct zswap_handle *myhandle;
+
+	while (*link) {
+		parent = *link;
+		myhandle = rb_entry(parent, struct zswap_handle, rbnode);
+		if (myhandle->checksum > zhandle->checksum)
+			link = &parent->rb_left;
+		else if (myhandle->checksum < zhandle->checksum)
+			link = &parent->rb_right;
+		else {
+			*duphandle = myhandle;
+			return -EEXIST;
+		}
+	}
+	rb_link_node(&zhandle->rbnode, parent, link);
+	rb_insert_color(&zhandle->rbnode, root);
+	return 0;
+}
+
+static void zswap_handle_erase(struct rb_root *root,
+				struct zswap_handle *zhandle)
+{
+	if (!RB_EMPTY_NODE(&zhandle->rbnode)) {
+		rb_erase(&zhandle->rbnode, root);
+		RB_CLEAR_NODE(&zhandle->rbnode);
+	}
+}
+
+/*
+ * This function searches for the same page in the zhandle RB-Tree based
+ * on the checksum value of the new page. If the same page is found the
+ * zhandle of that page is returned.
+ */
+static struct zswap_handle *zswap_same_page_search(struct zswap_tree *tree,
+						u8 *uncmem, u32 checksum)
+{
+	int ret = 0;
+	unsigned int dlen = PAGE_SIZE;
+	u8 *src = "" *dst = NULL;
+	struct zswap_handle *zhandle = NULL;
+	struct crypto_comp *tfm;
+
+	zhandle = zswap_handle_rb_search(&tree->zhandleroot, checksum);
+	if (!zhandle)
+		return NULL;
+	if (!zhandle->pool)
+		return NULL;
+
+	/* Compare memory contents */
+	dst = (u8 *)tree->buffer;
+	src = "" *)zpool_map_handle(zhandle->pool->zpool, zhandle->handle,
+			ZPOOL_MM_RO) + sizeof(struct zswap_header);
+	tfm = *get_cpu_ptr(zhandle->pool->tfm);
+	ret = crypto_comp_decompress(tfm, src, zhandle->length, dst, &dlen);
+	put_cpu_ptr(zhandle->pool->tfm);
+	zpool_unmap_handle(zhandle->pool->zpool, zhandle->handle);
+
+	if (ret) /* Consider the page as unique if decompression failed;*/
+		return NULL;
+
+	if (memcmp(dst, uncmem, PAGE_SIZE))
+		return NULL;
+
+	return zhandle;
+}
+
+/*
+ * This function returns true if the zswap_handle is referenced by only
+ * one page entry.
+ */
+static bool zswap_handle_is_unique(struct zswap_handle *zhandle)
+{
+	WARN_ON(zhandle->ref_count < 1);
+	return zhandle->ref_count == 1;
+}
+
 /*
  * Carries out the common pattern of freeing and entry's zpool allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
  */
 static void zswap_free_entry(struct zswap_entry *entry)
 {
-	zpool_free(entry->pool->zpool, entry->handle);
-	zswap_pool_put(entry->pool);
+	if (zswap_handle_is_unique(entry->zhandle)) {
+		zpool_free(entry->pool->zpool, entry->zhandle->handle);
+		zswap_handle_cache_free(entry->zhandle);
+		zswap_pool_put(entry->pool);
+	} else {
+		entry->zhandle->ref_count--;
+		atomic_dec(&zswap_duplicate_pages);
+	}
 	zswap_entry_cache_free(entry);
 	atomic_dec(&zswap_stored_pages);
 	zswap_update_total_size();
@@ -329,6 +502,9 @@ static void zswap_entry_put(struct zswap_tree *tree,
 
 	BUG_ON(refcount < 0);
 	if (refcount == 0) {
+		if (entry->zhandle && zswap_handle_is_unique(entry->zhandle))
+			zswap_handle_erase(&tree->zhandleroot,
+					entry->zhandle);
 		zswap_rb_erase(&tree->rbroot, entry);
 		zswap_free_entry(entry);
 	}
@@ -886,15 +1062,16 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
 		dlen = PAGE_SIZE;
-		src = "" *)zpool_map_handle(entry->pool->zpool, entry->handle,
-				ZPOOL_MM_RO) + sizeof(struct zswap_header);
+		src = "" *)zpool_map_handle(entry->pool->zpool,
+				entry->zhandle->handle, ZPOOL_MM_RO)
+				+ sizeof(struct zswap_header);
 		dst = kmap_atomic(page);
 		tfm = *get_cpu_ptr(entry->pool->tfm);
-		ret = crypto_comp_decompress(tfm, src, entry->length,
+		ret = crypto_comp_decompress(tfm, src, entry->zhandle->length,
 					     dst, &dlen);
 		put_cpu_ptr(entry->pool->tfm);
 		kunmap_atomic(dst);
-		zpool_unmap_handle(entry->pool->zpool, entry->handle);
+		zpool_unmap_handle(entry->pool->zpool, entry->zhandle->handle);
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
 
@@ -975,6 +1152,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	char *buf;
 	u8 *src, *dst;
 	struct zswap_header *zhdr;
+	struct zswap_handle *zhandle = NULL, *duphandle = NULL;
+	u32 checksum = 0;
 
 	if (!zswap_enabled || !tree) {
 		ret = -ENODEV;
@@ -999,6 +1178,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		goto reject;
 	}
 
+	src = ""
+
+	checksum = jhash2((const u32 *)src, PAGE_SIZE / 4, 17);
+	spin_lock(&tree->lock);
+	zhandle = zswap_same_page_search(tree, src, checksum);
+	if (zhandle) {
+		entry->offset = offset;
+		entry->zhandle = zhandle;
+		entry->pool = zhandle->pool;
+		entry->zhandle->ref_count++;
+		spin_unlock(&tree->lock);
+		kunmap_atomic(src);
+		atomic_inc(&zswap_duplicate_pages);
+		goto insert_entry;
+	}
+	spin_unlock(&tree->lock);
+
 	/* if entry is successfully added, it keeps the reference */
 	entry->pool = zswap_pool_current_get();
 	if (!entry->pool) {
@@ -1009,7 +1205,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	/* compress */
 	dst = get_cpu_var(zswap_dstmem);
 	tfm = *get_cpu_ptr(entry->pool->tfm);
-	src = ""
 	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
 	kunmap_atomic(src);
 	put_cpu_ptr(entry->pool->tfm);
@@ -1040,9 +1235,24 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
 	/* populate entry */
 	entry->offset = offset;
-	entry->handle = handle;
-	entry->length = dlen;
+	zhandle = zswap_handle_cache_alloc(GFP_KERNEL);
+	if (!zhandle) {
+		zswap_reject_kmemcache_fail++;
+		ret = -ENOMEM;
+		goto freeentry;
+	}
+
+	entry->zhandle = zhandle;
+	entry->zhandle->handle = handle;
+	entry->zhandle->length = dlen;
+	entry->zhandle->checksum = checksum;
+	entry->zhandle->pool = entry->pool;
+	spin_lock(&tree->lock);
+	ret = zswap_handle_rb_insert(&tree->zhandleroot, entry->zhandle,
+								&duphandle);
+	spin_unlock(&tree->lock);
 
+insert_entry:
 	/* map */
 	spin_lock(&tree->lock);
 	do {
@@ -1064,6 +1274,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 
 put_dstmem:
 	put_cpu_var(zswap_dstmem);
+freeentry:
 	zswap_pool_put(entry->pool);
 freepage:
 	zswap_entry_cache_free(entry);
@@ -1097,14 +1308,15 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = "" *)zpool_map_handle(entry->pool->zpool, entry->handle,
+	src = "" *)zpool_map_handle(entry->pool->zpool, entry->zhandle->handle,
 			ZPOOL_MM_RO) + sizeof(struct zswap_header);
 	dst = kmap_atomic(page);
 	tfm = *get_cpu_ptr(entry->pool->tfm);
-	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
+	ret = crypto_comp_decompress(tfm, src, entry->zhandle->length,
+					dst, &dlen);
 	put_cpu_ptr(entry->pool->tfm);
 	kunmap_atomic(dst);
-	zpool_unmap_handle(entry->pool->zpool, entry->handle);
+	zpool_unmap_handle(entry->pool->zpool, entry->zhandle->handle);
 	BUG_ON(ret);
 
 	spin_lock(&tree->lock);
@@ -1152,7 +1364,9 @@ static void zswap_frontswap_invalidate_area(unsigned type)
 	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
 		zswap_free_entry(entry);
 	tree->rbroot = RB_ROOT;
+	tree->zhandleroot = RB_ROOT;
 	spin_unlock(&tree->lock);
+	free_page((unsigned long)tree->buffer);
 	kfree(tree);
 	zswap_trees[type] = NULL;
 }
@@ -1167,6 +1381,13 @@ static void zswap_frontswap_init(unsigned type)
 		return;
 	}
 
+	tree->buffer = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!tree->buffer) {
+		pr_err("zswap: Error allocating buffer for decompression\n");
+		kfree(tree);
+		return;
+	}
+	tree->zhandleroot = RB_ROOT;
 	tree->rbroot = RB_ROOT;
 	spin_lock_init(&tree->lock);
 	zswap_trees[type] = tree;
@@ -1215,6 +1436,9 @@ static int __init zswap_debugfs_init(void)
 			zswap_debugfs_root, &zswap_pool_total_size);
 	debugfs_create_atomic_t("stored_pages", S_IRUGO,
 			zswap_debugfs_root, &zswap_stored_pages);
+	debugfs_create_atomic_t("duplicate_pages", S_IRUGO,
+			zswap_debugfs_root, &zswap_duplicate_pages);
+
 
 	return 0;
 }
@@ -1246,6 +1470,11 @@ static int __init init_zswap(void)
 		goto cache_fail;
 	}
 
+	if (zswap_handle_cache_create()) {
+		pr_err("handle cache creation failed\n");
+		goto handlecachefail;
+	}
+
 	if (zswap_cpu_dstmem_init()) {
 		pr_err("dstmem alloc failed\n");
 		goto dstmem_fail;
@@ -1269,6 +1498,8 @@ static int __init init_zswap(void)
 pool_fail:
 	zswap_cpu_dstmem_destroy();
 dstmem_fail:
+	zswap_handle_cache_destroy();
+handlecachefail:
 	zswap_entry_cache_destroy();
 cache_fail:
 	return -ENOMEM;
-- 
1.7.9.5