[RFC] zswap: use B-tree for search

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The current zswap implementation uses red-black trees to store entries
and to perform lookups. Although this algorithm obviously has complexity
of O(log N) it still takes a while to complete lookup (or, even more for
replacement) of an entry, when the amount of entries is huge (100K+).

B-trees are known to handle such cases more efficiently (i. e. also with
O(log N) complexity but with way lower coefficient) so trying zswap with
B-trees sounded good to me from the start.

The implementation of B-trees that is currently present in Linux kernel
isn't really doing things in the best possible way (i. e. it has
recursion) but I thought it was worth trying anyway. So I modified zswap
(the patch follows below) to use lib/btree and I can see substantial
improvement in my tests when thie patch is applied. I do not post test
results here since we're not quite there yet; I'd like to first hear
opinions on the idea as such and on the way I use B-trees here. Thanks
in advance :)

Signed-off-by: Vitaly Wool <vitaly.wool@xxxxxxxxxxxx>

 mm/zswap.c |  147 +++++++++++++++++++++++++++-------------------------------------
 1 file changed, 64 insertions(+), 83 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 46a322316e52..c2cc07be1322 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -21,7 +21,7 @@
 #include <linux/types.h>
 #include <linux/atomic.h>
 #include <linux/frontswap.h>
-#include <linux/rbtree.h>
+#include <linux/btree.h>
 #include <linux/swap.h>
 #include <linux/crypto.h>
 #include <linux/mempool.h>
@@ -134,7 +134,6 @@ struct zswap_pool {
  * This structure contains the metadata for tracking a single compressed
  * page within zswap.
  *
- * rbnode - links the entry into red-black tree for the appropriate swap type
  * offset - the swap offset for the entry.  Index into the red-black tree.
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
@@ -149,7 +148,6 @@ struct zswap_pool {
  * value - value of the same-value filled pages which have same content
  */
 struct zswap_entry {
-	struct rb_node rbnode;
 	pgoff_t offset;
 	int refcount;
 	unsigned int length;
@@ -166,11 +164,11 @@ struct zswap_header {
 
 /*
  * The tree lock in the zswap_tree struct protects a few things:
- * - the rbtree
+ * - the tree
  * - the refcount field of each entry in the tree
  */
 struct zswap_tree {
-	struct rb_root rbroot;
+	struct btree_head head;
 	spinlock_t lock;
 };
 
@@ -252,7 +250,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
-	RB_CLEAR_NODE(&entry->rbnode);
 	return entry;
 }
 
@@ -262,58 +259,18 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 }
 
 /*********************************
-* rbtree functions
+* btree functions
 **********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
-	struct rb_node *node = root->rb_node;
-	struct zswap_entry *entry;
+static struct btree_geo *btree_pgofft_geo;
 
-	while (node) {
-		entry = rb_entry(node, struct zswap_entry, rbnode);
-		if (entry->offset > offset)
-			node = node->rb_left;
-		else if (entry->offset < offset)
-			node = node->rb_right;
-		else
-			return entry;
-	}
-	return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
-			struct zswap_entry **dupentry)
+static struct zswap_entry *zswap_search(struct btree_head *head, pgoff_t offset)
 {
-	struct rb_node **link = &root->rb_node, *parent = NULL;
-	struct zswap_entry *myentry;
-
-	while (*link) {
-		parent = *link;
-		myentry = rb_entry(parent, struct zswap_entry, rbnode);
-		if (myentry->offset > entry->offset)
-			link = &(*link)->rb_left;
-		else if (myentry->offset < entry->offset)
-			link = &(*link)->rb_right;
-		else {
-			*dupentry = myentry;
-			return -EEXIST;
-		}
-	}
-	rb_link_node(&entry->rbnode, parent, link);
-	rb_insert_color(&entry->rbnode, root);
-	return 0;
+	return btree_lookup(head, btree_pgofft_geo, &offset);
 }
 
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+static void zswap_erase(struct btree_head *head, struct zswap_entry *entry)
 {
-	if (!RB_EMPTY_NODE(&entry->rbnode)) {
-		rb_erase(&entry->rbnode, root);
-		RB_CLEAR_NODE(&entry->rbnode);
-	}
+	btree_remove(head, btree_pgofft_geo, &entry->offset);
 }
 
 /*
@@ -342,25 +299,40 @@ static void zswap_entry_get(struct zswap_entry *entry)
 /* caller must hold the tree lock
 * remove from the tree and free it, if nobody reference the entry
 */
-static void zswap_entry_put(struct zswap_tree *tree,
+static void zswap_entry_put(struct btree_head *head,
 			struct zswap_entry *entry)
 {
 	int refcount = --entry->refcount;
 
 	BUG_ON(refcount < 0);
 	if (refcount == 0) {
-		zswap_rb_erase(&tree->rbroot, entry);
+		zswap_erase(head, entry);
 		zswap_free_entry(entry);
 	}
 }
 
+static int zswap_insert_or_replace(struct btree_head *head,
+				struct zswap_entry *entry)
+{
+	struct zswap_entry *old;
+
+	do {
+		old = btree_remove(head, btree_pgofft_geo, &entry->offset);
+		if (old) {
+			zswap_duplicate_entry++;
+			zswap_entry_put(head, old);
+		}
+	} while (old);
+	return btree_insert(head, btree_pgofft_geo, &entry->offset, entry,
+			GFP_ATOMIC);
+}
 /* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+static struct zswap_entry *zswap_entry_find_get(struct btree_head *head,
 				pgoff_t offset)
 {
 	struct zswap_entry *entry;
 
-	entry = zswap_rb_search(root, offset);
+	entry = zswap_search(head, offset);
 	if (entry)
 		zswap_entry_get(entry);
 
@@ -861,7 +833,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 
 	/* find and ref zswap entry */
 	spin_lock(&tree->lock);
-	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	entry = zswap_entry_find_get(&tree->head, offset);
 	if (!entry) {
 		/* entry was invalidated */
 		spin_unlock(&tree->lock);
@@ -910,7 +882,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 
 	spin_lock(&tree->lock);
 	/* drop local reference */
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(&tree->head, entry);
 
 	/*
 	* There are two possible situations for entry here:
@@ -919,8 +891,8 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	*     because invalidate happened during writeback
 	*  search the tree and free the entry if find entry
 	*/
-	if (entry == zswap_rb_search(&tree->rbroot, offset))
-		zswap_entry_put(tree, entry);
+	if (entry == zswap_search(&tree->head, offset))
+		zswap_entry_put(&tree->head, entry);
 	spin_unlock(&tree->lock);
 
 	goto end;
@@ -934,7 +906,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 	*/
 fail:
 	spin_lock(&tree->lock);
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(&tree->head, entry);
 	spin_unlock(&tree->lock);
 
 end:
@@ -988,7 +960,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 				struct page *page)
 {
 	struct zswap_tree *tree = zswap_trees[type];
-	struct zswap_entry *entry, *dupentry;
+	struct zswap_entry *entry;
 	struct crypto_comp *tfm;
 	int ret;
 	unsigned int hlen, dlen = PAGE_SIZE;
@@ -1096,16 +1068,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 insert_entry:
 	/* map */
 	spin_lock(&tree->lock);
-	do {
-		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
-		if (ret == -EEXIST) {
-			zswap_duplicate_entry++;
-			/* remove from rbtree */
-			zswap_rb_erase(&tree->rbroot, dupentry);
-			zswap_entry_put(tree, dupentry);
-		}
-	} while (ret == -EEXIST);
+	ret = zswap_insert_or_replace(&tree->head, entry);
 	spin_unlock(&tree->lock);
+	if (ret < 0)  {
+		zswap_reject_alloc_fail++;
+		goto put_dstmem;
+	}
 
 	/* update stats */
 	atomic_inc(&zswap_stored_pages);
@@ -1138,7 +1106,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 	/* find */
 	spin_lock(&tree->lock);
-	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	entry = zswap_entry_find_get(&tree->head, offset);
 	if (!entry) {
 		/* entry was written back */
 		spin_unlock(&tree->lock);
@@ -1168,7 +1136,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 
 freeentry:
 	spin_lock(&tree->lock);
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(&tree->head, entry);
 	spin_unlock(&tree->lock);
 
 	return 0;
@@ -1182,36 +1150,41 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 
 	/* find */
 	spin_lock(&tree->lock);
-	entry = zswap_rb_search(&tree->rbroot, offset);
+	entry = zswap_search(&tree->head, offset);
 	if (!entry) {
 		/* entry was written back */
 		spin_unlock(&tree->lock);
 		return;
 	}
 
-	/* remove from rbtree */
-	zswap_rb_erase(&tree->rbroot, entry);
+	/* remove from tree */
+	zswap_erase(&tree->head, entry);
 
 	/* drop the initial reference from entry creation */
-	zswap_entry_put(tree, entry);
+	zswap_entry_put(&tree->head, entry);
 
 	spin_unlock(&tree->lock);
 }
 
+void do_free_entry(void *elem, unsigned long opaque, unsigned long *key,
+		size_t index, void *func2)
+{
+	struct zswap_entry *entry = elem;
+	zswap_free_entry(entry);
+}
+
 /* frees all zswap entries for the given swap type */
 static void zswap_frontswap_invalidate_area(unsigned type)
 {
 	struct zswap_tree *tree = zswap_trees[type];
-	struct zswap_entry *entry, *n;
 
 	if (!tree)
 		return;
 
 	/* walk the tree and free everything */
 	spin_lock(&tree->lock);
-	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
-		zswap_free_entry(entry);
-	tree->rbroot = RB_ROOT;
+	btree_visitor(&tree->head, btree_pgofft_geo, 0, do_free_entry, NULL);
+	btree_destroy(&tree->head);
 	spin_unlock(&tree->lock);
 	kfree(tree);
 	zswap_trees[type] = NULL;
@@ -1226,8 +1199,11 @@ static void zswap_frontswap_init(unsigned type)
 		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 		return;
 	}
-
-	tree->rbroot = RB_ROOT;
+	if (btree_init(&tree->head) < 0) {
+		pr_err("couldn't init the tree head\n");
+		kfree(tree);
+		return;
+	}
 	spin_lock_init(&tree->lock);
 	zswap_trees[type] = tree;
 }
@@ -1302,6 +1278,11 @@ static int __init init_zswap(void)
 
 	zswap_init_started = true;
 
+	if (sizeof(pgoff_t) == 8)
+		btree_pgofft_geo = &btree_geo64;
+	else
+		btree_pgofft_geo = &btree_geo32;
+
 	if (zswap_entry_cache_create()) {
 		pr_err("entry cache creation failed\n");
 		goto cache_fail;




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux