[PATCH 6/8] md: switch from list to rb tree for IO serialization

From: Guoqing Jiang <guoqing.jiang@xxxxxxxxxxxxxxx>

IO serialization inevitably degrades write performance. To limit
that degradation, replace the linked list of serialized ranges with
an rb tree.

Inspired by drbd_interval.c, a simple interval rb tree is
introduced. It supports inserting and removing an rb node, checking
whether a bio overlaps a recorded range, and looking up the rb node
that matches a given bio. Whenever a node is inserted or deleted,
the highest interval end of its subtree is recomputed so that a bio
range can be searched for efficiently.

Also, rename serial_list_lock to serial_rb_lock since it now
protects the rb tree.

Signed-off-by: Guoqing Jiang <guoqing.jiang@xxxxxxxxxxxxxxx>
---
 drivers/md/md.c    |   4 +-
 drivers/md/md.h    |  11 ++--
 drivers/md/raid1.c | 152 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 135 insertions(+), 32 deletions(-)
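
The collision test itself is unchanged by this conversion: a new write
range [lo, hi) collides with a recorded range iff the two half-open
sector ranges intersect. Below is a minimal userspace sketch of that
condition (ranges_overlap and the sector_t typedef here are
illustrative, not part of the patch):

#include <stdbool.h>

typedef unsigned long long sector_t;	/* stand-in for the kernel type */

/*
 * Two half-open sector ranges [a_lo, a_hi) and [b_lo, b_hi) collide
 * iff each one starts before the other ends.  This is the same test
 * as the old list walk (hi > wi->lo && lo < wi->hi) and the per-node
 * check inside find_overlap() below.
 */
static bool ranges_overlap(sector_t a_lo, sector_t a_hi,
			   sector_t b_lo, sector_t b_hi)
{
	return a_lo < b_hi && b_lo < a_hi;
}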

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8df94a58512b..6d0e076f8099 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -127,8 +127,8 @@ static inline int speed_max(struct mddev *mddev)
 
 static int rdev_init_serial(struct md_rdev *rdev)
 {
-	spin_lock_init(&rdev->serial_list_lock);
-	INIT_LIST_HEAD(&rdev->serial_list);
+	spin_lock_init(&rdev->serial_rb_lock);
+	rdev->serial_rb = RB_ROOT;
 	init_waitqueue_head(&rdev->serial_io_wait);
 	set_bit(CollisionCheck, &rdev->flags);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 161772066dab..eeceb79ae226 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -113,8 +113,8 @@ struct md_rdev {
 	/*
 	 * The members for check collision of write behind IOs.
 	 */
-	struct list_head serial_list;
-	spinlock_t serial_list_lock;
+	struct rb_root serial_rb;
+	spinlock_t serial_rb_lock;
 	wait_queue_head_t serial_io_wait;
 
 	struct work_struct del_work;	/* used for delayed sysfs removal */
@@ -266,9 +266,10 @@ enum mddev_sb_flags {
 #define NR_SERIAL_INFOS		8
 /* record current range of serialize IOs */
 struct serial_info {
-	sector_t lo;
-	sector_t hi;
-	struct list_head list;
+	sector_t sector;	/* start sector of IO range */
+	unsigned long size;	/* size in bytes of IO */
+	struct rb_node rb;
+	sector_t end;		/* highest interval end in subtree of rb node */
 };
 
 struct mddev {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4f683a3d44c0..48e7cd8f26a8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/rbtree_augmented.h>
 
 #include <trace/events/block.h>
 
@@ -50,31 +51,135 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #include "raid1-10.c"
 
+/*
+ * interval_end - return the highest interval end in the subtree of @node
+ */
+static inline sector_t interval_end(struct rb_node *node)
+{
+	struct serial_info *this = rb_entry(node, struct serial_info, rb);
+
+	return this->end;
+}
+
+#define NODE_END(node) ((node)->sector + ((node)->size >> 9))
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+			 struct serial_info, rb, sector_t, end, NODE_END);
+
+/*
+ * insert_interval - insert a new interval into a tree
+ */
+static void insert_interval(struct rb_root *root, struct serial_info *this)
+{
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+	sector_t this_end = this->sector + (this->size >> 9);
+
+	while (*new) {
+		struct serial_info *here =
+			rb_entry(*new, struct serial_info, rb);
+
+		parent = *new;
+		if (here->end < this_end)
+			here->end = this_end;
+
+		if (this->sector < here->sector)
+			new = &(*new)->rb_left;
+		else
+			new = &(*new)->rb_right;
+	}
+
+	this->end = this_end;
+	rb_link_node(&this->rb, parent, new);
+	rb_insert_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/*
+ * contains_interval - look up the interval that starts at @sector with @size
+ *
+ * Returns the serial_info whose start sector and size match @sector and
+ * @size exactly, or NULL if no such interval is in the tree.
+ */
+static struct serial_info *
+contains_interval(struct rb_root *root, sector_t sector, unsigned long size)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct serial_info *here =
+			rb_entry(node, struct serial_info, rb);
+
+		if (sector == here->sector && size == here->size)
+			return here;
+		if (sector < here->sector)
+			node = node->rb_left;
+		else if (sector > here->sector)
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * remove_interval - remove an interval from an rb tree
+ */
+static void remove_interval(struct rb_root *root, struct serial_info *this)
+{
+	rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/*
+ * find_overlap - find an interval overlapping [sector, sector + (size >> 9))
+ *
+ * Returns an overlapping interval, or NULL if there is none.
+ */
+static struct serial_info *
+find_overlap(struct rb_root *root, sector_t sector, unsigned long size)
+{
+	struct rb_node *node = root->rb_node;
+	struct serial_info *overlap = NULL;
+	sector_t end = sector + (size >> 9);
+
+	while (node) {
+		struct serial_info *here =
+			rb_entry(node, struct serial_info, rb);
+
+		if (node->rb_left &&
+		    sector < interval_end(node->rb_left)) {
+			/* Overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (here->sector < end &&
+			   sector < here->sector + (here->size >> 9)) {
+			overlap = here;
+			break;
+		} else if (sector >= here->sector) {
+			/* Overlap if any must be on right side */
+			node = node->rb_right;
+		} else
+			break;
+	}
+	return overlap;
+}
+
 static int check_and_add_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
 {
-	struct serial_info *wi, *temp_wi;
+	struct serial_info *wi;
 	unsigned long flags;
 	int ret = 0;
+	unsigned long size = (hi - lo) << 9;
 	struct mddev *mddev = rdev->mddev;
 
 	wi = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
 
-	spin_lock_irqsave(&rdev->serial_list_lock, flags);
-	list_for_each_entry(temp_wi, &rdev->serial_list, list) {
-		/* collision happened */
-		if (hi > temp_wi->lo && lo < temp_wi->hi) {
-			ret = -EBUSY;
-			break;
-		}
-	}
+	spin_lock_irqsave(&rdev->serial_rb_lock, flags);
+	/* collision happened */
+	if (find_overlap(&rdev->serial_rb, lo, size))
+		ret = -EBUSY;
 
 	if (!ret) {
-		wi->lo = lo;
-		wi->hi = hi;
-		list_add(&wi->list, &rdev->serial_list);
+		wi->sector = lo;
+		wi->size = size;
+		insert_interval(&rdev->serial_rb, wi);
 	} else
 		mempool_free(wi, mddev->serial_info_pool);
-	spin_unlock_irqrestore(&rdev->serial_list_lock, flags);
+	spin_unlock_irqrestore(&rdev->serial_rb_lock, flags);
 
 	return ret;
 }
@@ -83,21 +188,18 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
 {
 	struct serial_info *wi;
 	unsigned long flags;
-	int found = 0;
 	struct mddev *mddev = rdev->mddev;
+	unsigned long size = (hi - lo) << 9;
 
-	spin_lock_irqsave(&rdev->serial_list_lock, flags);
-	list_for_each_entry(wi, &rdev->serial_list, list)
-		if (hi == wi->hi && lo == wi->lo) {
-			list_del(&wi->list);
-			mempool_free(wi, mddev->serial_info_pool);
-			found = 1;
-			break;
-		}
-
-	if (!found)
+	spin_lock_irqsave(&rdev->serial_rb_lock, flags);
+	wi = contains_interval(&rdev->serial_rb, lo, size);
+	if (wi) {
+		remove_interval(&rdev->serial_rb, wi);
+		RB_CLEAR_NODE(&wi->rb);
+		mempool_free(wi, mddev->serial_info_pool);
+	} else
 		WARN(1, "The write IO is not recorded for serialization\n");
-	spin_unlock_irqrestore(&rdev->serial_list_lock, flags);
+	spin_unlock_irqrestore(&rdev->serial_rb_lock, flags);
 	wake_up(&rdev->serial_io_wait);
 }
 
-- 
2.17.1
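
A note on the augmented search: find_overlap() can discard whole
subtrees because every node caches the highest interval end of its
subtree (maintained through the 'end' field by RB_DECLARE_CALLBACKS_MAX).
Below is a self-contained userspace sketch of the same pruning idea with
a tree built by hand (struct node, max_end and overlap_search are
illustrative names, not taken from the patch):

#include <stdio.h>

struct node {
	unsigned long lo, hi;	/* interval [lo, hi), in sectors  */
	unsigned long max_end;	/* highest hi within this subtree */
	struct node *left, *right;
};

/*
 * Mirrors the shape of find_overlap(): descend left while the left
 * subtree's cached max_end still reaches past 'lo' (an overlap, if
 * any, must then be on that side); otherwise test the current node
 * and fall back to the right subtree.
 */
static struct node *overlap_search(struct node *n,
				   unsigned long lo, unsigned long hi)
{
	while (n) {
		if (n->left && lo < n->left->max_end)
			n = n->left;
		else if (n->lo < hi && lo < n->hi)
			return n;		/* colliding range */
		else if (lo >= n->lo)
			n = n->right;
		else
			break;
	}
	return NULL;
}

int main(void)
{
	/* Keyed by lo; max_end kept correct by hand for this sketch. */
	struct node a = { 0, 16, 16, NULL, NULL };
	struct node c = { 80, 96, 96, NULL, NULL };
	struct node b = { 40, 64, 96, &a, &c };	/* root */

	printf("%s\n", overlap_search(&b,  8, 24) ? "collision" : "free"); /* overlaps [0, 16)  */
	printf("%s\n", overlap_search(&b, 20, 40) ? "collision" : "free"); /* no recorded range */
	return 0;
}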



