From: Mikulas Patocka <mpatocka@redhat.com>

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jim Ramsay <Jim_Ramsay@dell.com>

FIXME Patch header, help text, documentation file, code review, add merge fn,
FIXME pte fields renaming for improved clarity?

---
 Documentation/device-mapper/switch.txt |   31 +
 drivers/md/Kconfig                     |   11 
 drivers/md/Makefile                    |    1 
 drivers/md/dm-switch.c                 |  516 +++++++++++++++++++++++++++++++++
 4 files changed, 559 insertions(+)

Index: linux/Documentation/device-mapper/switch.txt
===================================================================
--- /dev/null
+++ linux/Documentation/device-mapper/switch.txt
@@ -0,0 +1,31 @@
+
+dm-switch target is suitable for Dell EqualLogic storage system.
+
+The EqualLogic storage consists of several nodes. Each host is connected
+to each node. The host may send I/O requests to any node; the node that
+receives the request forwards it to the node where the data is stored.
+
+However, there is a performance advantage of sending I/O requests to the
+node where the data is stored to avoid forwarding. The dm-switch target
+is created to use this performance advantage.
+
+The dm-switch target splits the device into fixed-size pages. It maintains
+a page table that maps pages to storage nodes. Every request is
+forwarded to the corresponding storage node specified in the page table.
+The table may be changed with messages while the dm-switch target is
+running.
+
+DM table arguments:
+- number of storage nodes
+- page size
+for every storage node:
+- the underlying block device
+- offset to the start of data in 512-byte sectors
+
+DM message:
+set-table index1:node1 index2:node2 index3:node3 ...
+- modify page table, set values at index to point to the specific node.
+  Index and node numbers are hexadecimal. You can omit the index number,
+  in this case previous index plus 1 is used.
+
+
Index: linux/drivers/md/Kconfig
===================================================================
--- linux.orig/drivers/md/Kconfig
+++ linux/drivers/md/Kconfig
@@ -389,4 +389,15 @@ config DM_VERITY
 
 	  If unsure, say N.
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  Help text needs writing
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD

Index: linux/drivers/md/Makefile
===================================================================
--- linux.orig/drivers/md/Makefile
+++ linux/drivers/md/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)		+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o

Index: linux/drivers/md/dm-switch.c
===================================================================
--- /dev/null
+++ linux/drivers/md/dm-switch.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc. All rights reserved.
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file: dm-switch.c
+ * authors: Kevin_OKelley@dell.com
+ *          Jim_Ramsay@dell.com
+ *          Narendran_Ganapathy@dell.com
+ *          mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ * (1) a large number of address regions
+ * (2) a fixed size equal across all address regions
+ * (3) no pattern that allows for a compact description with something like
+ *     the dm-stripe target.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * Switch device context block: A new one is created for each dm device.
+ * Contains an array of devices from which we have taken references. + */ +struct switch_dev { + struct dm_dev *dmdev; + sector_t start; +}; + +/* + * Holds a variable number (pte_fields) of page table entries, + * each a fixed number of bits in size. + */ +typedef unsigned long pt_entries; + +/* + * Switch context header + */ +struct switch_ctx { + unsigned dev_count; /* Number of devices */ + unsigned page_size; /* Page size in 512B sectors */ + unsigned long n_pages; /* Number of pages */ + signed char page_size_bits; /* log2 of page_size or -1 */ + + unsigned char pte_size; /* Page table entry size in bits */ + unsigned char pte_fields; /* Number of entries per pt_entry */ + signed char pte_fields_bits; /* log2 of pte_fields or -1 */ + pt_entries *page_table; /* Page table */ + + /* Array of dm devices to switch between */ + struct switch_dev dev_list[0]; +}; + +static void switch_get_position(struct switch_ctx *pctx, unsigned long page, + unsigned long *index, unsigned *bit) +{ + if (pctx->pte_fields_bits >= 0) { + *index = page >> pctx->pte_fields_bits; + *bit = page & (pctx->pte_fields - 1); + } else { + *index = page / pctx->pte_fields; + *bit = page % pctx->pte_fields; + } + + *bit *= pctx->pte_size; +} + +static unsigned switch_get_deviceidx(struct switch_ctx *pctx, sector_t sector) +{ + unsigned long index; + unsigned bit, idev; + sector_t p; + + p = sector; + if (pctx->page_size_bits >= 0) + p >>= pctx->page_size_bits; + else + sector_div(p, pctx->page_size); + + switch_get_position(pctx, p, &index, &bit); + idev = (ACCESS_ONCE(pctx->page_table[index]) >> bit) & + ((1 << pctx->pte_size) - 1); + + /* This can only happen if the processor uses non-atomic stores. 
*/ + if (unlikely(idev >= pctx->dev_count)) + idev = 0; + + return idev; +} + +static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page, + unsigned value) +{ + unsigned long index; + unsigned bit; + pt_entries pte; + + switch_get_position(pctx, page, &index, &bit); + + pte = pctx->page_table[index]; + pte &= ~((((pt_entries)1 << pctx->pte_size) - 1) << bit); + pte |= (pt_entries)value << bit; + pctx->page_table[index] = pte; +} + +/* + * Constructor: Called each time a dmsetup command creates a dm device. The + * target parameter will already have the table, type, begin and len fields + * filled in. Arguments are in pairs: . Therefore, we get + * + * FIXME: NACK - I hope this comment is bogus! + * + * multiple constructor calls, but we will need to build a list of switch_ctx + * blocks so that the page table information gets matched to the correct + * device. + */ +static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + unsigned a; + int n; + int r; + unsigned dev_count; + struct switch_ctx *pctx; + sector_t dev_size; + unsigned long e; + + if (argc < 4) { + ti->error = "Insufficient arguments"; + r = -EINVAL; + goto error; + } + if (kstrtouint(argv[0], 10, &dev_count) || + !dev_count || + dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) { + ti->error = "Invalid device count"; + r = -EINVAL; + goto error; + } + if (dev_count != (argc - 2) / 2) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto error; + } + pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)), + GFP_KERNEL); + if (!pctx) { + ti->error = "Cannot allocate redirect context"; + r = -ENOMEM; + goto error; + } + pctx->dev_count = dev_count; + if (kstrtouint(argv[1], 10, &pctx->page_size) || + !pctx->page_size) { + ti->error = "Invalid page size"; + r = -EINVAL; + goto error_kfree; + } + + if (!(pctx->page_size & (pctx->page_size - 1))) + pctx->page_size_bits = 
__ffs(pctx->page_size); + else + pctx->page_size_bits = -1; + + pctx->pte_size = 1; + while (pctx->pte_size < sizeof(pt_entries) * 8 && + (pt_entries)1 << pctx->pte_size < pctx->dev_count) + pctx->pte_size++; + + pctx->pte_fields = (sizeof(pt_entries) * 8) / pctx->pte_size; + if (!(pctx->pte_fields & (pctx->pte_fields - 1))) + pctx->pte_fields_bits = __ffs(pctx->pte_fields); + else + pctx->pte_fields_bits = -1; + + dev_size = ti->len; + if (sector_div(dev_size, pctx->page_size)) + dev_size++; + + pctx->n_pages = dev_size; + if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) { + ti->error = "Too long page table"; + r = -EINVAL; + goto error_kfree; + } + + if (sector_div(dev_size, pctx->pte_fields)) + dev_size++; + + if (dev_size > ULONG_MAX / sizeof(pt_entries)) { + ti->error = "Too long page table"; + r = -EINVAL; + goto error_kfree; + } + + r = dm_set_target_max_io_len(ti, pctx->page_size); + if (r) + goto error_kfree; + + pctx->page_table = vmalloc(dev_size * sizeof(pt_entries)); + if (!pctx->page_table) { + ti->error = "Cannot allocate page table"; + r = -ENOMEM; + goto error_kfree; + } + + a = 0; + for (e = 0; e < pctx->n_pages; e++) { + switch_page_table_write(pctx, e, a); + a++; + if (a >= pctx->dev_count) + a = 0; + } + +// FIXME NACK. Do this like dm-mpath. + /* + * Check each device beneath the target to ensure that the limits are + * consistent. 
+ */ + for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) { + struct dm_dev *dm; + sector_t dev_size; + unsigned long long start; + + if (kstrtoull(argv[a + 1], 10, &start) || + start != (sector_t)start) { + ti->error = "Invalid device starting offset"; + r = -EINVAL; + n--; + goto error_release_n; + } + r = dm_get_device(ti, argv[a], dm_table_get_mode(ti->table), &dm); + if (r) { + ti->error = "Device lookup failed"; + n--; + goto error_release_n; + } + pctx->dev_list[n].dmdev = dm; + pctx->dev_list[n].start = start; + + dev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT; + + if (ti->len > start + dev_size) { + ti->error = "Device is too small"; + r = -EINVAL; + goto error_release_n; + } + } + + /* For UNMAP, sending the request down any path is sufficient */ + ti->num_discard_requests = 1; + + ti->private = pctx; + + return 0; + +error_release_n: /* De-reference all devices */ + for (; n >= 0; n--) + dm_put_device(ti, pctx->dev_list[n].dmdev); + + vfree(pctx->page_table); +error_kfree: + kfree(pctx); + +error: + return r; +} + +/* + * Destructor: Don't free the dm_target, just the ti->private data (if any). + */ +static void switch_dtr(struct dm_target *ti) +{ + int n; + struct switch_ctx *pctx = ti->private; + + for (n = 0; n < pctx->dev_count; n++) + dm_put_device(ti, pctx->dev_list[n].dmdev); + + vfree(pctx->page_table); + kfree(pctx); +} + +static int switch_map(struct dm_target *ti, struct bio *bio, + union map_info *map_context) +{ + struct switch_ctx *pctx = ti->private; + + sector_t offset = bio->bi_sector - ti->begin; + unsigned idev; + + idev = switch_get_deviceidx(pctx, offset); + + bio->bi_bdev = pctx->dev_list[idev].dmdev->bdev; + bio->bi_sector = pctx->dev_list[idev].start + offset; + + return DM_MAPIO_REMAPPED; +} + +/* + * We need to parse hex numbers as quickly as possible. + * Message is used to load the whole table. + * + * This table-based hex parser improves performance. 
+ * It improves a time to load 1000000 entries compared to the condition-based + * parser. + * table-based parser condition-based parser + * PA-RISC 0.29s 0.31s + * Opteron 0.0495s 0.0498s + */ + +static const unsigned char hex_table[256] = { +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 +}; + +static void parse_hex(const char *string, sector_t *result, const char **end) +{ + unsigned char d; + sector_t r = 0; + + while ((d = hex_table[(unsigned char)*string]) < 16) { + r = (r << 4) | d; + string++; + } + + *end = string; + *result = r; +} + +// FIXME Sort out DMWARNs +static int process_set_table(struct switch_ctx *pctx, unsigned argc, char **argv) +{ + unsigned i; + sector_t table_index = 0; + + for (i = 1; i < argc; i++) { + sector_t device; + const char *string = 
argv[i]; + + if (*string == ':') + table_index++; + else { + parse_hex(string, &table_index, &string); + if (unlikely(*string != ':')) { + DMWARN("invalid set-table argument"); + return -EINVAL; + } + } + + string++; + if (unlikely(!*string)) { + DMWARN("invalid set-table argument"); + return -EINVAL; + } + + parse_hex(string, &device, &string); + if (unlikely(*string)) { + DMWARN("invalid set-table argument"); + return -EINVAL; + } + if (unlikely(table_index >= pctx->n_pages)) { + DMWARN("invalid set-table page"); + return -EINVAL; + } + if (unlikely(device >= pctx->dev_count)) { + DMWARN("invalid set-table device"); + return -EINVAL; + } + + switch_page_table_write(pctx, table_index, device); + } + + return 0; +} + +static int switch_message(struct dm_target *ti, unsigned argc, char **argv) +{ + static DEFINE_MUTEX(message_mutex); + + struct switch_ctx *pctx = ti->private; + int r = -EINVAL; + + mutex_lock(&message_mutex); + + if (!strcasecmp(argv[0], "set-table")) + r = process_set_table(pctx, argc, argv); + else + DMWARN("Unrecognised switch message received."); + + mutex_unlock(&message_mutex); + + return r; +} + +static int switch_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct switch_ctx *pctx = ti->private; + unsigned sz = 0; + int n; + + result[0] = '\0'; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = 0; + break; + + case STATUSTYPE_TABLE: + DMEMIT("%u %u", pctx->dev_count, pctx->page_size); + for (n = 0; n < pctx->dev_count; n++) { + DMEMIT(" %s %llu", pctx->dev_list[n].dmdev->name, + (unsigned long long)pctx->dev_list[n].start); + } + break; + } + + return 0; +} + +/* + * Switch ioctl: + * + * Passthrough all ioctls to the path for sector 0 + */ +static int switch_ioctl(struct dm_target *ti, unsigned cmd, + unsigned long arg) +{ + struct switch_ctx *pctx = ti->private; + struct block_device *bdev; + fmode_t mode; + unsigned idev; + + idev = switch_get_deviceidx(pctx, 0); + + 
bdev = pctx->dev_list[idev].dmdev->bdev;
+	mode = pctx->dev_list[idev].dmdev->mode;
+
+	return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *pctx = ti->private;
+	int n, ret = 0;
+
+	for (n = 0; n < pctx->dev_count; n++) {
+		ret = fn(ti, pctx->dev_list[n].dmdev, ti->begin, ti->len, data);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r < 0)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");