Hi

This is the dm-switch target to be included in the next kernel. It is
equivalent to the last code sent by Jim Ramsay with the exception that
REQ_FLUSH processing was removed (because hardware has no write-back cache).

Mikulas

---

dm-switch target

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

[Review changes folded into this posting:
 - "Device is too small" now tests start + len against the device size
 - a table line with a stray trailing argument is rejected
 - iterate_devices reports each device's own start offset, not ti->begin
 - ioctls are vetted with scsi_verify_blk_ioctl() unless the target ends
   exactly at the end of the underlying device
 - set-table rejects hex numbers that do not fit into sector_t
 - module init/exit are static, the disabled #else parser is removed and
   the Kconfig help text is written]

Signed-off-by: Mikulas Patocka <mpatocka@xxxxxxxxxx>
Signed-off-by: Jim Ramsay <jim_ramsay@xxxxxxxx>

---
 drivers/md/Kconfig     |   14 +
 drivers/md/Makefile    |    1 
 drivers/md/dm-switch.c |  592 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 607 insertions(+)

Index: linux-3.5.4-fast/drivers/md/Kconfig
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Kconfig	2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Kconfig	2012-09-25 22:21:56.000000000 +0200
@@ -417,4 +417,18 @@ config DM_VERITY2
 
 source "drivers/md/enhanceio/Kconfig"
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  This device-mapper target maps fixed-size regions of a device
+	  onto a fixed set of underlying devices through a lookup table.
+	  The device used for any region can be changed at run time by
+	  sending the target a "set-table" message.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD
Index: linux-3.5.4-fast/drivers/md/Makefile
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Makefile	2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Makefile	2012-09-25 22:21:56.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_ZEROED)		+= dm-zeroed.o
 obj-$(CONFIG_DM_ENHANCEIO)	+= enhanceio/
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
Index: linux-3.5.4-fast/drivers/md/dm-switch.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.4-fast/drivers/md/dm-switch.c	2012-09-25 22:21:53.000000000 +0200
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2010-2012 by Dell Inc.  All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ *     file:    dm-switch.c
+ *     authors: Kevin_OKelley@xxxxxxxx
+ *              Jim_Ramsay@xxxxxxxx
+ *              Narendran_Ganapathy@xxxxxxxx
+ *              mpatocka@xxxxxxxxxx
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ *   (1) a large number of address regions
+ *   (2) a fixed size equal across all address regions
+ *   (3) no pattern that allows for a compact description with something like
+ *       the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One underlying device: the dm_dev reference taken by the constructor and
+ * the sector on that device at which this target's data starts.
+ */
+struct switch_dev {
+	struct dm_dev *dmdev;
+	sector_t start;
+};
+
+/* One word of the packed page table; it holds pte_fields device indices. */
+typedef unsigned long pt_entry;
+
+/*
+ * Switch context: a new one is created for each switch target.  It ends
+ * with the array of devices from which we have taken references.
+ */
+struct switch_ctx {
+	unsigned dev_count;		/* Number of devices */
+	unsigned page_size;		/* Page size in 512B sectors */
+	unsigned long n_pages;		/* Number of pages */
+	signed char page_size_bits;	/* log2 of page_size or -1 */
+
+	unsigned char pte_size;		/* Page table entry size in bits */
+	unsigned char pte_fields;	/* Number of entries per pt_entry */
+	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
+	pt_entry *page_table;		/* Page table */
+
+	/* Array of dm devices to switch between */
+	struct switch_dev dev_list[];
+};
+
+/*
+ * Locate the page table slot of @page: *index receives the pt_entry word
+ * number and *bit the bit offset of the entry inside that word.
+ */
+static inline void switch_get_position(struct switch_ctx *pctx,
+				       unsigned long page,
+				       unsigned long *index,
+				       unsigned *bit)
+{
+	if (pctx->pte_fields_bits >= 0) {
+		/* Power-of-two number of entries per word: shift and mask */
+		*index = page >> pctx->pte_fields_bits;
+		*bit = page & (pctx->pte_fields - 1);
+	} else {
+		*index = page / pctx->pte_fields;
+		*bit = page % pctx->pte_fields;
+	}
+	*bit *= pctx->pte_size;
+}
+
+/*
+ * Return the index in dev_list[] of the device that currently serves
+ * @sector, a sector offset relative to the start of the target.
+ */
+static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
+					    sector_t sector)
+{
+	unsigned long index;
+	unsigned bit, idev;
+	sector_t p;
+
+	p = sector;
+	if (pctx->page_size_bits >= 0)
+		p >>= pctx->page_size_bits;
+	else
+		sector_div(p, pctx->page_size);
+
+	switch_get_position(pctx, p, &index, &bit);
+	/* The mask is built as a pt_entry, as in switch_page_table_write() */
+	idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) &
+		(((pt_entry)1 << pctx->pte_size) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(idev >= pctx->dev_count))
+		idev = 0;
+
+	return idev;
+}
+
+/*
+ * Store device index @value in the page table entry of @page.  The word
+ * holding the entry is read, modified and written back with plain accesses,
+ * so two concurrent writers to the same word could lose an update.
+ */
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+				    unsigned value)
+{
+	unsigned long index;
+	unsigned bit;
+	pt_entry pte;
+
+	switch_get_position(pctx, page, &index, &bit);
+
+	pte = pctx->page_table[index];
+	pte &= ~((((pt_entry)1 << pctx->pte_size) - 1) << bit);
+	pte |= (pt_entry)value << bit;
+	pctx->page_table[index] = pte;
+}
+
+/*
+ * Constructor: called each time a table line for this target is loaded.
+ * The target parameter will already have the table, type, begin and len
+ * fields filled in.
+ *
+ * Arguments:
+ *   <dev_count> <page_size> followed by dev_count pairs <dev_path> <offset>
+ *
+ * <page_size> is in 512B sectors and <offset> is the sector on <dev_path>
+ * at which the target's data starts.  The page table initially assigns the
+ * pages to the devices in round-robin order.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	unsigned a;
+	unsigned n;
+	int r;
+	unsigned dev_count;
+	struct switch_ctx *pctx;
+	sector_t dev_size;
+	unsigned long e;
+
+	if (argc < 4) {
+		ti->error = "Insufficient arguments";
+		r = -EINVAL;
+		goto error;
+	}
+	if (kstrtouint(argv[0], 10, &dev_count) ||
+	    !dev_count ||
+	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) /
+			sizeof(struct switch_dev)) {
+		ti->error = "Invalid device count";
+		r = -EINVAL;
+		goto error;
+	}
+	/* Exactly one <dev_path> <offset> pair per device, nothing extra */
+	if (argc - 2 != dev_count * 2) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto error;
+	}
+	pctx = kmalloc(sizeof(struct switch_ctx) +
+		       dev_count * sizeof(struct switch_dev), GFP_KERNEL);
+	if (!pctx) {
+		ti->error = "Cannot allocate redirect context";
+		r = -ENOMEM;
+		goto error;
+	}
+	pctx->dev_count = dev_count;
+	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
+	    !pctx->page_size) {
+		ti->error = "Invalid page size";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	if (!(pctx->page_size & (pctx->page_size - 1)))
+		pctx->page_size_bits = __ffs(pctx->page_size);
+	else
+		pctx->page_size_bits = -1;
+
+	/* Smallest entry width in bits that can hold every device index */
+	pctx->pte_size = 1;
+	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
+	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
+		pctx->pte_size++;
+
+	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
+	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
+		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
+	else
+		pctx->pte_fields_bits = -1;
+
+	/* Number of pages: ti->len / page_size, rounded up */
+	dev_size = ti->len;
+	if (sector_div(dev_size, pctx->page_size))
+		dev_size++;
+
+	pctx->n_pages = dev_size;
+	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* Number of pt_entry words: n_pages / pte_fields, rounded up */
+	if (sector_div(dev_size, pctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, pctx->page_size);
+	if (r)
+		goto error_kfree;
+
+	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
+	if (!pctx->page_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	/* Default mapping: hand the pages out round-robin to the devices */
+	a = 0;
+	for (e = 0; e < pctx->n_pages; e++) {
+		switch_page_table_write(pctx, e, a);
+		a++;
+		if (a >= pctx->dev_count)
+			a = 0;
+	}
+
+	/*
+	 * Take a reference on each device beneath the target and check that
+	 * the device is large enough.
+	 */
+	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
+		struct dm_dev *dm;
+		sector_t bdev_sectors;
+		unsigned long long start;
+
+		if (kstrtoull(argv[a + 1], 10, &start) ||
+		    start != (sector_t)start) {
+			ti->error = "Invalid device starting offset";
+			r = -EINVAL;
+			goto error_release_n;
+		}
+		r = dm_get_device(ti, argv[a], dm_table_get_mode(ti->table),
+				  &dm);
+		if (r) {
+			ti->error = "Device lookup failed";
+			goto error_release_n;
+		}
+		pctx->dev_list[n].dmdev = dm;
+		pctx->dev_list[n].start = start;
+
+		bdev_sectors = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;
+
+		/*
+		 * switch_map() adds the offset within the target to start, so
+		 * sectors start .. start + ti->len - 1 must exist.  The test
+		 * is written without "start + ti->len" so it cannot overflow.
+		 */
+		if (start > bdev_sectors || ti->len > bdev_sectors - start) {
+			ti->error = "Device is too small";
+			r = -EINVAL;
+			n++;	/* also release the device just acquired */
+			goto error_release_n;
+		}
+	}
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	ti->private = pctx;
+
+	return 0;
+
+error_release_n:
+	/* Drop the references taken on devices 0 .. n - 1 */
+	while (n--)
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+	vfree(pctx->page_table);
+error_kfree:
+	kfree(pctx);
+
+error:
+	return r;
+}
+
+/*
+ * Destructor: release the device references, the page table and the context
+ * stored in ti->private.  The dm_target itself is not freed here.
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	unsigned n;
+	struct switch_ctx *pctx = ti->private;
+
+	for (n = 0; n < pctx->dev_count; n++)
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+	vfree(pctx->page_table);
+	kfree(pctx);
+}
+
+/*
+ * Map: look up the device that serves the first sector of the bio and
+ * redirect the bio to the same offset on that device.
+ */
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *pctx = ti->private;
+	sector_t offset = bio->bi_sector - ti->begin;
+	unsigned idev;
+
+	idev = switch_get_deviceidx(pctx, offset);
+
+	bio->bi_bdev = pctx->dev_list[idev].dmdev->bdev;
+	bio->bi_sector = pctx->dev_list[idev].start + offset;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers as fast as possible.
+ * Message is used to load the whole table.
+ *
+ * This table-based hex parser improves performance.
+ * It reduces the time to load 1000000 entries compared to a condition-based
+ * parser.
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+
+static const unsigned char hex_table[256] = {
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+};
+
+/*
+ * Parse the run of hex digits at @string.  *result receives the value and
+ * *end points to the first character that was not consumed.  Parsing also
+ * stops in front of a digit that would not fit into sector_t, so callers
+ * that check the terminating character reject over-long numbers instead of
+ * using a silently truncated value.
+ */
+static inline void parse_hex(const char *string, sector_t *result,
+			     const char **end)
+{
+	unsigned char d;
+	sector_t r = 0;
+
+	while ((d = hex_table[(unsigned char)*string]) < 16) {
+		/* The top nibble is in use: another shift would lose it */
+		if (unlikely(r >> (sizeof(r) * 8 - 4)))
+			break;
+		r = (r << 4) | d;
+		string++;
+	}
+
+	*end = string;
+	*result = r;
+}
+
+/*
+ * Apply the arguments of a "set-table" message.  Every argument after the
+ * message name has the form <page>:<device>, both numbers in hex.  When
+ * <page> is left out, the page of the previous argument plus one is used
+ * (the count starts at 0, so a leading ":<device>" addresses page 1).
+ *
+ * Entries are written as they are parsed: when an invalid argument is
+ * found, the entries set by the arguments in front of it stay in effect.
+ *
+ * Returns 0 on success or -EINVAL.
+ */
+static int switch_set_table(struct switch_ctx *pctx, unsigned argc,
+			    char **argv)
+{
+	unsigned i;
+	sector_t table_index = 0;
+
+	for (i = 1; i < argc; i++) {
+		sector_t device;
+		const char *string = argv[i];
+
+		if (*string == ':') {
+			table_index++;
+		} else {
+			parse_hex(string, &table_index, &string);
+			if (unlikely(*string != ':'))
+				goto invalid_table;
+		}
+		string++;
+		if (unlikely(!*string))
+			goto invalid_table;
+		parse_hex(string, &device, &string);
+		if (unlikely(*string))
+			goto invalid_table;
+		if (unlikely(table_index >= pctx->n_pages)) {
+			DMWARN("invalid set-table page");
+			return -EINVAL;
+		}
+		if (unlikely(device >= pctx->dev_count)) {
+			DMWARN("invalid set-table device");
+			return -EINVAL;
+		}
+		switch_page_table_write(pctx, table_index, device);
+	}
+
+	return 0;
+
+invalid_table:
+	DMWARN("invalid set-table argument");
+	return -EINVAL;
+}
+
+/*
+ * Message handler.  The only message understood is "set-table" (compared
+ * without regard to case).  A function-local mutex serialises the handling
+ * of all messages.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *pctx = ti->private;
+	int r;
+
+	mutex_lock(&message_mutex);
+
+	if (argc && !strcasecmp(argv[0], "set-table")) {
+		r = switch_set_table(pctx, argc, argv);
+	} else {
+		DMWARN("unrecognised message received.");
+		r = -EINVAL;
+	}
+
+	mutex_unlock(&message_mutex);
+	return r;
+}
+
+/*
+ * Status: STATUSTYPE_INFO yields an empty string, STATUSTYPE_TABLE yields
+ * the device count, the page size and each device with its offset.
+ */
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned sz = 0;
+	unsigned n;
+
+	result[0] = '\0';
+	switch (type) {
+	case STATUSTYPE_INFO:
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+		for (n = 0; n < pctx->dev_count; n++) {
+			DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name,
+			       (unsigned long long)pctx->dev_list[n].start);
+		}
+		break;
+
+	default:
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *pctx = ti->private;
+	struct block_device *bdev;
+	fmode_t mode;
+	unsigned idev;
+	int r = 0;
+
+	idev = switch_get_deviceidx(pctx, 0);
+
+	bdev = pctx->dev_list[idev].dmdev->bdev;
+	mode = pctx->dev_list[idev].dmdev->mode;
+
+	/*
+	 * Unless the target ends exactly at the end of the underlying device,
+	 * let scsi_verify_blk_ioctl() vet the command first, so that commands
+	 * are not passed unchecked to a device that is only partly mapped.
+	 */
+	if (ti->len + pctx->dev_list[idev].start !=
+	    i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+/*
+ * Call @fn for every underlying device with the device's own start sector
+ * and the target length.  Stops at the first non-zero result and returns
+ * it; returns 0 otherwise.
+ */
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned n;
+	int ret;
+
+	for (n = 0; n < pctx->dev_count; n++) {
+		ret = fn(ti, pctx->dev_list[n].dmdev,
+			 pctx->dev_list[n].start, ti->len, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@xxxxxxxx>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@xxxxxxxx>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@xxxxxxxxxx>");
+MODULE_LICENSE("GPL");

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel