The DM multipath service-time path-selector has historically tracked the amount of outstanding IO per path and used that to approximate the service-time of each path. In practice this has shown itself to work fairly well but we can do better by measuring the actual service-time during IO completion and using it as the basis for path selection. Measuring the actual service-time is still prone to inaccuracies given that service-times vary with IO size. But to counter any potential for drawing incorrect conclusions about the service-times of a given path the measured service-times are reset periodically. This approach has provided a 10% increase in the selection of a path that was forcibly made to be less loaded than the alternative path. Reported-by: Todd Gill <tgill@xxxxxxxxxx> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> --- drivers/md/dm-mpath.c | 6 ++- drivers/md/dm-path-selector.h | 5 ++- drivers/md/dm-queue-length.c | 4 +- drivers/md/dm-service-time.c | 101 +++++++++++------------------------------- 4 files changed, 35 insertions(+), 81 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 52baf8a..fb37dff 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -105,6 +105,7 @@ struct multipath { struct dm_mpath_io { struct pgpath *pgpath; size_t nr_bytes; + ktime_t io_start_time; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -507,7 +508,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, - nr_bytes); + nr_bytes, &mpio->io_start_time); return DM_MAPIO_REMAPPED; } @@ -1374,7 +1375,8 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) - ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); + ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, + &mpio->io_start_time); } clear_request_fn_mpio(m, map_context); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index b6eb536..c09d2bb 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -13,6 +13,7 @@ #define DM_PATH_SELECTOR_H #include <linux/device-mapper.h> +#include <linux/ktime.h> #include "dm-mpath.h" @@ -72,9 +73,9 @@ struct path_selector_type { status_type_t type, char *result, unsigned int maxlen); int (*start_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); + size_t nr_bytes, ktime_t *io_start_time); int (*end_io) (struct path_selector *ps, struct dm_path *path, - size_t nr_bytes); + size_t nr_bytes, ktime_t *io_start_time); }; /* Register a path selector */ diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 23f1786..4affb39 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -217,7 +217,7 @@ out: } static int ql_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, ktime_t *io_start_time) { struct path_info *pi = path->pscontext; @@ -227,7 +227,7 @@ static int ql_start_io(struct path_selector *ps, struct dm_path *path, } static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, ktime_t *io_start_time) { struct path_info *pi = path->pscontext; diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 7b86420..1b3e45a 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -19,7 +19,7 @@ #define ST_MAX_RELATIVE_THROUGHPUT 100 #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.3.0" +#define ST_VERSION "0.4.0" struct selector { struct list_head valid_paths; @@ -32,7 +32,8 @@ struct path_info { struct dm_path *path; unsigned repeat_count; unsigned relative_throughput; - atomic_t in_flight_size; /* Total size of in-flight I/Os */ + s64 service_time_usecs; + unsigned select_count; }; static struct selector *alloc_selector(void) @@ -92,7 +93,7 @@ static int st_status(struct path_selector *ps, struct dm_path *path, switch (type) { case STATUSTYPE_INFO: - DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + DMEMIT("%u %u ", (unsigned)pi->service_time_usecs, pi->relative_throughput); break; case STATUSTYPE_TABLE: @@ -159,7 +160,8 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, pi->path = path; pi->repeat_count = repeat_count; pi->relative_throughput = relative_throughput; - atomic_set(&pi->in_flight_size, 0); + pi->service_time_usecs = 0; + pi->select_count = 0; path->pscontext = pi; @@ -195,80 +197,21 @@ static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) } /* - * Compare the estimated service time of 2 paths, pi1 and pi2, + * Compare the most recent service time of 2 paths, pi1 and pi2, * for the incoming I/O. * * Returns: * < 0 : pi1 is better * 0 : no difference between pi1 and pi2 * > 0 : pi2 is better - * - * Description: - * Basically, the service time is estimated by: - * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' - * To reduce the calculation, some optimizations are made. - * (See comments inline) */ -static int st_compare_load(struct path_info *pi1, struct path_info *pi2, - size_t incoming) +static s64 st_compare(struct path_info *pi1, struct path_info *pi2) { - size_t sz1, sz2, st1, st2; - - sz1 = atomic_read(&pi1->in_flight_size); - sz2 = atomic_read(&pi2->in_flight_size); - - /* - * Case 1: Both have same throughput value. Choose less loaded path. - */ - if (pi1->relative_throughput == pi2->relative_throughput) - return sz1 - sz2; - - /* - * Case 2a: Both have same load. Choose higher throughput path. - * Case 2b: One path has no throughput value. Choose the other one. - */ - if (sz1 == sz2 || - !pi1->relative_throughput || !pi2->relative_throughput) + if (pi1->relative_throughput != pi2->relative_throughput) return pi2->relative_throughput - pi1->relative_throughput; - /* - * Case 3: Calculate service time. Choose faster path. - * Service time using pi1: - * st1 = (sz1 + incoming) / pi1->relative_throughput - * Service time using pi2: - * st2 = (sz2 + incoming) / pi2->relative_throughput - * - * To avoid the division, transform the expression to use - * multiplication. - * Because ->relative_throughput > 0 here, if st1 < st2, - * the expressions below are the same meaning: - * (sz1 + incoming) / pi1->relative_throughput < - * (sz2 + incoming) / pi2->relative_throughput - * (sz1 + incoming) * pi2->relative_throughput < - * (sz2 + incoming) * pi1->relative_throughput - * So use the later one. - */ - sz1 += incoming; - sz2 += incoming; - if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || - sz2 >= ST_MAX_INFLIGHT_SIZE)) { - /* - * Size may be too big for multiplying pi->relative_throughput - * and overflow. - * To avoid the overflow and mis-selection, shift down both. - */ - sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - } - st1 = sz1 * pi2->relative_throughput; - st2 = sz2 * pi1->relative_throughput; - if (st1 != st2) - return st1 - st2; - - /* - * Case 4: Service time is equal. Choose higher throughput path. - */ - return pi2->relative_throughput - pi1->relative_throughput; + /* select path with faster service time */ + return pi1->service_time_usecs - pi2->service_time_usecs; } static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) @@ -286,34 +229,42 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) list_move_tail(s->valid_paths.next, &s->valid_paths); list_for_each_entry(pi, &s->valid_paths, list) - if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + if (!best || (st_compare(pi, best) < 0)) best = pi; if (!best) goto out; ret = best->path; + + /* + * Reset the observed service-times periodically (sooner rather than later); + * to avoid a momentary observed low service-time from starving the selection of + * other paths that might have lower service times now. The bursty nature of IO + * submission forces the need for this. + */ + if ((best->select_count++ & 15) == 0) + list_for_each_entry(pi, &s->valid_paths, list) + pi->service_time_usecs = 0; out: spin_unlock_irqrestore(&s->lock, flags); return ret; } static int st_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, ktime_t *io_start_time) { - struct path_info *pi = path->pscontext; - - atomic_add(nr_bytes, &pi->in_flight_size); + *io_start_time = ktime_get(); return 0; } static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) + size_t nr_bytes, ktime_t *io_start_time) { struct path_info *pi = path->pscontext; - atomic_sub(nr_bytes, &pi->in_flight_size); + pi->service_time_usecs = ktime_us_delta(ktime_get(), *io_start_time); return 0; } -- 2.6.4 (Apple Git-63) -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel