This patch adds a service time oriented dynamic load balancer, dm-service-time, which selects a path to complete the incoming I/O with the shortest time. To calculate the service time, it uses the size of I/Os and recent throughput of each path. Signed-off-by: Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx> Signed-off-by: Jun'ichi Nomura <j-nomura@xxxxxxxxxxxxx> --- drivers/md/Kconfig | 9 + drivers/md/Makefile | 1 drivers/md/dm-service-time.c | 312 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+) Index: 2.6.29-rc2/drivers/md/dm-service-time.c =================================================================== --- /dev/null +++ 2.6.29-rc2/drivers/md/dm-service-time.c @@ -0,0 +1,312 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 2 +#define ST_VERSION "0.1.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + + atomic_t in_flight; /* Total size of in-flight I/Os */ + size_t perf; /* Recent performance of the path */ + sector_t last_sectors; /* Total sectors of the last part_stat_read */ + size_t last_io_ticks; /* io_ticks of the last part_stat_read */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + pi->path->pscontext = NULL; + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = (struct selector *) ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + int sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("if:%08lu pf:%06lu ", + (unsigned long) atomic_read(&pi->in_flight), + pi->perf); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi; + unsigned int repeat_count = ST_MIN_IO; + struct gendisk *disk = path->dev->bdev->bd_disk; + + if (argc > 1) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path. */ + if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + pi->perf = 0; + pi->last_sectors = part_stat_read(&disk->part0, sectors[READ]) + + part_stat_read(&disk->part0, sectors[WRITE]); + pi->last_io_ticks = part_stat_read(&disk->part0, io_ticks); + atomic_set(&pi->in_flight, 0); + + path->pscontext = pi; + + list_add_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = path->pscontext; + + list_move(&pi->list, &s->failed_paths); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = path->pscontext; + + list_move_tail(&pi->list, &s->valid_paths); + + return 0; +} + +static void stats_update(struct path_info *pi) +{ + sector_t sectors; + size_t io_ticks, tmp; + struct gendisk *disk = pi->path->dev->bdev->bd_disk; + + sectors = part_stat_read(&disk->part0, sectors[READ]) + + part_stat_read(&disk->part0, sectors[WRITE]); + io_ticks = part_stat_read(&disk->part0, io_ticks); + + if ((sectors != pi->last_sectors) && (io_ticks != pi->last_io_ticks)) { + tmp = (sectors - pi->last_sectors) << 9; + do_div(tmp, jiffies_to_msecs((io_ticks - pi->last_io_ticks))); + pi->perf = tmp; + + pi->last_sectors = sectors; + pi->last_io_ticks = io_ticks; + } +} + +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t new_io) +{ + size_t if1, if2; + + if1 = atomic_read(&pi1->in_flight); + if2 = atomic_read(&pi2->in_flight); + + /* + * Case 1: No performace data available. Choose less loaded path. + */ + if (!pi1->perf || !pi2->perf) + return if1 - if2; + + /* + * Case 2: Calculate service time. Choose faster path. + * if ((if1+new_io)/pi1->perf < (if2+new_io)/pi2->perf) pi1. + * if ((if1+new_io)/pi1->perf > (if2+new_io)/pi2->perf) pi2. + * To avoid do_div(), use + * if ((if1+new_io)*pi2->perf < (if2+new_io)*pi1->perf) pi1. + * if ((if1+new_io)*pi2->perf > (if2+new_io)*pi1->perf) pi2. + */ + if1 = (if1 + new_io) << 10; + if2 = (if2 + new_io) << 10; + do_div(if1, pi1->perf); + do_div(if2, pi2->perf); + + if (if1 != if2) + return if1 - if2; + + /* + * Case 3: Service time is equal. Choose faster path. + */ + return pi2->perf - pi1->perf; +} + +static struct dm_path *st_select_path(struct path_selector *ps, + unsigned *repeat_count, size_t nr_bytes) +{ + struct selector *s = (struct selector *) ps->context; + struct path_info *pi = NULL, *best = NULL; + + if (list_empty(&s->valid_paths)) + return NULL; + + /* Change preferred (first in list) path to evenly balance. */ + list_move_tail(s->valid_paths.next, &s->valid_paths); + + /* Update performance information before best path selection */ + list_for_each_entry(pi, &s->valid_paths, list) + stats_update(pi); + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best) + best = pi; + else if (st_compare_load(pi, best, nr_bytes) < 0) + best = pi; + } + + if (best) { + *repeat_count = best->repeat_count; + return best->path; + } + + return NULL; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx>"); +MODULE_LICENSE("GPL"); Index: 2.6.29-rc2/drivers/md/Makefile =================================================================== --- 2.6.29-rc2.orig/drivers/md/Makefile +++ 2.6.29-rc2/drivers/md/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o +obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_ZERO) += dm-zero.o Index: 2.6.29-rc2/drivers/md/Kconfig =================================================================== --- 2.6.29-rc2.orig/drivers/md/Kconfig +++ 2.6.29-rc2/drivers/md/Kconfig @@ -283,6 +283,15 @@ config DM_MULTIPATH_QL If unsure, say N. +config DM_MULTIPATH_ST + tristate "I/O Path Selector based on the service time" + depends on DM_MULTIPATH + ---help--- + This path selector is a dynamic load balancer which selects + a path to complete the incoming I/O with the shortest time. + + If unsure, say N. + config DM_DELAY tristate "I/O delaying target (EXPERIMENTAL)" depends on BLK_DEV_DM && EXPERIMENTAL -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel