[RFC PATCH 2/2] dm-mpath: add service-time oriented dynamic load balancer

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds a service time oriented dynamic load balancer,
dm-service-time, which selects a path to complete the incoming I/O
with the shortest time.
To calculate the service time, it uses the size of I/Os and
recent throughput of each path.


Signed-off-by: Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx>
Signed-off-by: Jun'ichi Nomura <j-nomura@xxxxxxxxxxxxx>
---
 drivers/md/Kconfig           |    9 +
 drivers/md/Makefile          |    1 
 drivers/md/dm-service-time.c |  312 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 322 insertions(+)

Index: 2.6.29-rc2/drivers/md/dm-service-time.c
===================================================================
--- /dev/null
+++ 2.6.29-rc2/drivers/md/dm-service-time.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX	"multipath service-time"
+#define ST_MIN_IO	2
+#define ST_VERSION	"0.1.0"
+
+struct selector {
+	struct list_head valid_paths;
+	struct list_head failed_paths;
+};
+
+struct path_info {
+	struct list_head list;
+	struct dm_path *path;
+	unsigned int repeat_count;
+
+	atomic_t in_flight;	/* Total size of in-flight I/Os */
+	size_t perf;		/* Recent performance of the path */
+	sector_t last_sectors;	/* Total sectors of the last part_stat_read */
+	size_t last_io_ticks;	/* io_ticks of the last part_stat_read */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kzalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		pi->path->pscontext = NULL;
+		kfree(pi);
+	}
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+	struct selector *s = (struct selector *) ps->context;
+
+	free_paths(&s->valid_paths);
+	free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	struct path_info *pi;
+
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("if:%08lu pf:%06lu ",
+			       (unsigned long) atomic_read(&pi->in_flight),
+			       pi->perf);
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u ", pi->repeat_count);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi;
+	unsigned int repeat_count = ST_MIN_IO;
+	struct gendisk *disk = path->dev->bdev->bd_disk;
+
+	if (argc > 1) {
+		*error = "service-time ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	/* First path argument is number of I/Os before switching path. */
+	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "service-time ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/* allocate the path */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "service-time ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+
+	pi->perf = 0;
+	pi->last_sectors = part_stat_read(&disk->part0, sectors[READ])
+			   + part_stat_read(&disk->part0, sectors[WRITE]);
+	pi->last_io_ticks = part_stat_read(&disk->part0, io_ticks);
+	atomic_set(&pi->in_flight, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void stats_update(struct path_info *pi)
+{
+	sector_t sectors;
+	size_t io_ticks, tmp;
+	struct gendisk *disk = pi->path->dev->bdev->bd_disk;
+
+	sectors = part_stat_read(&disk->part0, sectors[READ])
+		  + part_stat_read(&disk->part0, sectors[WRITE]);
+	io_ticks = part_stat_read(&disk->part0, io_ticks);
+
+	if ((sectors != pi->last_sectors) && (io_ticks != pi->last_io_ticks)) {
+		tmp = (sectors - pi->last_sectors) << 9;
+		do_div(tmp, jiffies_to_msecs((io_ticks - pi->last_io_ticks)));
+		pi->perf = tmp;
+
+		pi->last_sectors = sectors;
+		pi->last_io_ticks = io_ticks;
+	}
+}
+
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+			   size_t new_io)
+{
+	size_t if1, if2;
+
+	if1 = atomic_read(&pi1->in_flight);
+	if2 = atomic_read(&pi2->in_flight);
+
+	/*
+	 * Case 1: No performace data available. Choose less loaded path.
+	 */
+	if (!pi1->perf || !pi2->perf)
+		return if1 - if2;
+
+	/*
+	 * Case 2: Calculate service time. Choose faster path.
+	 *           if ((if1+new_io)/pi1->perf < (if2+new_io)/pi2->perf) pi1.
+	 *           if ((if1+new_io)/pi1->perf > (if2+new_io)/pi2->perf) pi2.
+	 *         To avoid do_div(), use
+	 *           if ((if1+new_io)*pi2->perf < (if2+new_io)*pi1->perf) pi1.
+	 *           if ((if1+new_io)*pi2->perf > (if2+new_io)*pi1->perf) pi2.
+	 */
+	if1 = (if1 + new_io) << 10;
+	if2 = (if2 + new_io) << 10;
+	do_div(if1, pi1->perf);
+	do_div(if2, pi2->perf);
+
+	if (if1 != if2)
+		return if1 - if2;
+
+	/*
+	 * Case 3: Service time is equal. Choose faster path.
+	 */
+	return pi2->perf - pi1->perf;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	/* Update performance information before best path selection */
+	list_for_each_entry(pi, &s->valid_paths, list)
+		stats_update(pi);
+
+	list_for_each_entry(pi, &s->valid_paths, list) {
+		if (!best)
+			best = pi;
+		else if (st_compare_load(pi, best, nr_bytes) < 0)
+			best = pi;
+	}
+
+	if (best) {
+		*repeat_count = best->repeat_count;
+		return best->path;
+	}
+
+	return NULL;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_add(nr_bytes, &pi->in_flight);
+
+	return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_sub(nr_bytes, &pi->in_flight);
+
+	return 0;
+}
+
+static struct path_selector_type st_ps = {
+	.name		= "service-time",
+	.module		= THIS_MODULE,
+	.table_args	= 1,
+	.info_args	= 2,
+	.create		= st_create,
+	.destroy	= st_destroy,
+	.status		= st_status,
+	.add_path	= st_add_path,
+	.fail_path	= st_fail_path,
+	.reinstate_path	= st_reinstate_path,
+	.select_path	= st_select_path,
+	.start_io	= st_start_io,
+	.end_io		= st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+	int r = dm_register_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " ST_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+	int r = dm_unregister_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
Index: 2.6.29-rc2/drivers/md/Makefile
===================================================================
--- 2.6.29-rc2.orig/drivers/md/Makefile
+++ 2.6.29-rc2/drivers/md/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: 2.6.29-rc2/drivers/md/Kconfig
===================================================================
--- 2.6.29-rc2.orig/drivers/md/Kconfig
+++ 2.6.29-rc2/drivers/md/Kconfig
@@ -283,6 +283,15 @@ config DM_MULTIPATH_QL
 
 	  If unsure, say N.
 
+config DM_MULTIPATH_ST
+	tristate "I/O Path Selector based on the service time"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  a path to complete the incoming I/O with the shortest time.
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel

[Index of Archives]     [DM Crypt]     [Fedora Desktop]     [ATA RAID]     [Fedora Marketing]     [Fedora Packaging]     [Fedora SELinux]     [Yosemite Discussion]     [KDE Users]     [Fedora Docs]

  Powered by Linux