From: Mikulas Patocka <mpatocka@redhat.com>

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jim Ramsay <Jim_Ramsay@dell.com>

FIXME Needs patch header, reformatted new-style documentation file, full
code review, rename page to region / improve related var names, pull out
inlined fns

---
 Documentation/device-mapper/switch.txt |   37 ++
 drivers/md/Kconfig                     |   11 
 drivers/md/Makefile                    |    1 
 drivers/md/dm-switch.c                 |  559 +++++++++++++++++++++++++++++++++
 4 files changed, 608 insertions(+)

Index: linux/Documentation/device-mapper/switch.txt
===================================================================
--- /dev/null
+++ linux/Documentation/device-mapper/switch.txt
@@ -0,0 +1,37 @@
+The dm-switch target is suitable for the Dell EqualLogic storage system.
+
+EqualLogic storage consists of several nodes, and each host is connected
+to each node.  A host may send an I/O request to any node; a node that
+receives a request for data stored elsewhere forwards it to the node
+where the data is stored.
+
+However, there is a performance advantage to sending I/O requests
+directly to the node where the data is stored, avoiding the forwarding
+step.  The dm-switch target was created to exploit this performance
+advantage.
+
+The dm-switch target splits the device into fixed-size pages and
+maintains a page table that maps pages to storage nodes.  Every request
+is forwarded to the storage node specified in the page table entry for
+the page it falls in.  The table may be changed with messages while the
+dm-switch target is running.
+
+
+DM table arguments:
+- number of paths
+- region size (in 512-byte sectors)
+- number of optional arguments (must be 0 currently)
+  - optional arguments (none accepted)
+then, for every path:
+- the underlying device
+- offset to the start of data in 512-byte sectors
+
+
+DM message:
+
+set_region_mappings index1:node1 index2:node2 index3:node3 ...
+- modify the page table, setting the value at each index to point to the
+  specified node.  Index and node numbers are hexadecimal.  An index
+  number may be omitted, in which case the previous index plus 1 is used.
+
+There is no status line.

Index: linux/drivers/md/Kconfig
===================================================================
--- linux.orig/drivers/md/Kconfig
+++ linux/drivers/md/Kconfig
@@ -389,4 +389,15 @@ config DM_VERITY
 	  If unsure, say N.
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  This device-mapper target creates a device that supports an
+	  arbitrary mapping of fixed-size regions of I/O across a fixed
+	  set of paths.  The path used for any specific region can be
+	  switched dynamically by sending the target a message.
+
+	  To compile this code as a module, choose M here: the module
+	  will be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD

Index: linux/drivers/md/Makefile
===================================================================
--- linux.orig/drivers/md/Makefile
+++ linux/drivers/md/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)		+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o

Index: linux/drivers/md/dm-switch.c
===================================================================
--- /dev/null
+++ linux/drivers/md/dm-switch.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file:	dm-switch.c
+ * authors:	Kevin_OKelley@dell.com
+ *		Jim_Ramsay@dell.com
+ *		Narendran_Ganapathy@dell.com
+ *		mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ *   (1) a large number of address regions
+ *   (2) a fixed size equal across all address regions
+ *   (3) no pattern that allows for a compact description with something like
+ *       the dm-stripe target.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * Switch device context block: A new one is created for each dm device.
+ * Contains an array of devices from which we have taken references.
+ */
+struct switch_dev {
+	struct dm_dev *dmdev;
+	sector_t start;
+};
+
+/*
+ * Holds a variable number (pte_fields) of page table entries,
+ * each a fixed number of bits in size.
+ */
+typedef unsigned long pt_entries;
+
+/*
+ * Switch context header
+ */
+struct switch_ctx {
+	struct dm_target *ti;
+
+	unsigned nr_paths;		/* Number of paths */
+	unsigned region_size;		/* Page size in 512B sectors */
+	unsigned long nr_regions;	/* Number of regions */
+	signed char region_size_bits;	/* log2 of region_size or -1 */
+
+// FIXME: continue renaming to region not page
+	unsigned char pte_bits;		/* Number of bits in one page table entry */
+	unsigned char pte_fields;	/* Number of entries that fit into one pt_entries word */
+	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
+	pt_entries *region_table;	/* Region table */
+
+	/*
+	 * Array of dm devices to switch between.
+	 */
+	struct switch_dev dev_list[0];
+};
+
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+					   unsigned region_size)
+{
+	struct switch_ctx *sctx;
+
+	/* Zeroed allocation so nr_paths and region_table start out 0/NULL. */
+	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_dev),
+		       GFP_KERNEL);
+	if (!sctx)
+		return NULL;
+
+	sctx->ti = ti;
+	sctx->region_size = region_size;
+
+	ti->private = sctx;
+
+	return sctx;
+}
+
+static void switch_get_position(struct switch_ctx *sctx, unsigned long page,
+				unsigned long *region_index, unsigned *bit)
+{
+	if (sctx->pte_fields_bits >= 0) {
+		*region_index = page >> sctx->pte_fields_bits;
+		*bit = page & (sctx->pte_fields - 1);
+	} else {
+		*region_index = page / sctx->pte_fields;
+		*bit = page % sctx->pte_fields;
+	}
+
+	*bit *= sctx->pte_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+	unsigned long region_index;
+	unsigned bit, path_nr;
+	sector_t p;
+
+	p = offset;
+	if (sctx->region_size_bits >= 0)
+		p >>= sctx->region_size_bits;
+	else
+		sector_div(p, sctx->region_size);
+
+	switch_get_position(sctx, p, &region_index, &bit);
+	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+		  ((1 << sctx->pte_bits) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(path_nr >= sctx->nr_paths))
+		path_nr = 0;
+
+	return path_nr;
+}
+
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long page,
+				      unsigned value)
+{
+	unsigned long region_index;
+	unsigned bit;
+	pt_entries pte;
+
+	switch_get_position(sctx, page, &region_index, &bit);
+
+	pte = sctx->region_table[region_index];
+	pte &= ~((((pt_entries)1 << sctx->pte_bits) - 1) << bit);
+	pte |= (pt_entries)value << bit;
+	sctx->region_table[region_index] = pte;
+}
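+
+/*
+ * Worked example for the two functions above (values invented for
+ * illustration): with pte_bits = 3 and pte_fields = 21 (six paths on a
+ * 64-bit machine; see switch_ctr below), page 25 lives in word
+ * region_table[25 / 21] = region_table[1] at bit offset
+ * (25 % 21) * 3 = 12, and its path number is read back as
+ * (region_table[1] >> 12) & ((1 << 3) - 1).
+ */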
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ */
+static void initialise_switch_region_table(struct switch_ctx *sctx)
+{
+	unsigned path_nr = 0;
+	unsigned long region_nr;
+
+	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+		switch_region_table_write(sctx, region_nr, path_nr);
+		path_nr++;
+		if (path_nr >= sctx->nr_paths)
+			path_nr = 0;
+	}
+}
+
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned long long start;
+	int r;
+
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+			  &sctx->dev_list[sctx->nr_paths].dmdev);
+	if (r) {
+		ti->error = "Device lookup failed";
+		return r;
+	}
+
+	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+		ti->error = "Invalid device starting offset";
+		dm_put_device(ti, sctx->dev_list[sctx->nr_paths].dmdev);
+		return -EINVAL;
+	}
+
+	sctx->dev_list[sctx->nr_paths].start = start;
+
+	sctx->nr_paths++;
+
+	return 0;
+}
+
+/*
+ * Destructor: Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+
+	while (sctx->nr_paths--)
+		dm_put_device(ti, sctx->dev_list[sctx->nr_paths].dmdev);
+
+	vfree(sctx->region_table);
+	kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ *	<num_paths> <region_size> <num_optional_args> [<optional_args>...]
+ *	[<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static struct dm_arg _args[] = {
+		{0, UINT_MAX, "invalid number of paths"},
+		{1, UINT_MAX, "invalid page size"},
+		{0, 0, "invalid number of optional args"},
+	};
+
+	int r;
+	struct switch_ctx *sctx;
+	struct dm_arg_set as;
+	unsigned nr_paths, region_size, nr_optional_args;
+
+	sector_t dev_size;
+
+	if (argc < 5) {
+		ti->error = "Insufficient arguments";
+		return -EINVAL;
+	}
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	if (nr_paths > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+		ti->error = "Too many devices for system";
+		return -EINVAL;
+	}
+
+	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+	if (r)
+		return -EINVAL;
+	/* Parse optional arguments here, if we add any. */
+
+	if (as.argc != nr_paths * 2) {
+		ti->error = "Incorrect number of path arguments";
+		return -EINVAL;
+	}
+
+	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+	if (!sctx) {
+		ti->error = "Cannot allocate redirection context";
+		return -ENOMEM;
+	}
+
+// Move these into alloc fn?
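+	/*
+	 * Worked example for the computations below (values invented
+	 * for illustration): with nr_paths = 6 the loop picks
+	 * pte_bits = 3 (2^3 = 8 >= 6), so pte_fields = 64 / 3 = 21
+	 * entries fit in one 64-bit pt_entries word; 21 is not a power
+	 * of two, so pte_fields_bits = -1 and division is used instead
+	 * of shifts.
+	 */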
+	if (!(sctx->region_size & (sctx->region_size - 1)))
+		sctx->region_size_bits = __ffs(sctx->region_size);
+	else
+		sctx->region_size_bits = -1;
+
+	sctx->pte_bits = 1;
+	while (sctx->pte_bits < sizeof(pt_entries) * 8 &&
+	       (pt_entries)1 << sctx->pte_bits < nr_paths)
+		sctx->pte_bits++;
+
+	sctx->pte_fields = (sizeof(pt_entries) * 8) / sctx->pte_bits;
+	if (!(sctx->pte_fields & (sctx->pte_fields - 1)))
+		sctx->pte_fields_bits = __ffs(sctx->pte_fields);
+	else
+		sctx->pte_fields_bits = -1;
+
+	dev_size = ti->len;
+	if (sector_div(dev_size, sctx->region_size))
+		dev_size++;
+
+	sctx->nr_regions = dev_size;
+	if (sctx->nr_regions != dev_size || sctx->nr_regions >= ULONG_MAX) {
+		ti->error = "Page table too large";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* dev_size now becomes the number of pt_entries words needed. */
+	if (sector_div(dev_size, sctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entries)) {
+		ti->error = "Page table too large";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, sctx->region_size);
+	if (r)
+		goto error_kfree;
+
+	sctx->region_table = vmalloc(dev_size * sizeof(pt_entries));
+	if (!sctx->region_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	initialise_switch_region_table(sctx);
+
+	while (as.argc) {
+		r = parse_path(&as, ti);
+		if (r) {
+			switch_dtr(ti);
+			return r;
+		}
+	}
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	return 0;
+
+error_kfree:
+	kfree(sctx);
+
+	return r;
+}
+
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t offset = dm_target_offset(ti, bio->bi_sector);
+	unsigned path_nr = switch_get_path_nr(sctx, offset);
+
+	bio->bi_bdev = sctx->dev_list[path_nr].dmdev->bdev;
+	bio->bi_sector = sctx->dev_list[path_nr].start + offset;
+
+	return DM_MAPIO_REMAPPED;
+}
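+
+/*
+ * Worked example for switch_map() above (numbers invented): with
+ * region_size = 1600 sectors, a bio for target sector 5000 falls in
+ * region 5000 / 1600 = 3.  If the page table maps region 3 to path 1,
+ * the bio is remapped to dev_list[1].dmdev->bdev at sector
+ * dev_list[1].start + 5000.
+ */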
+
+/*
+ * The set_region_mappings message is used to load the whole page table,
+ * so we need to parse hex numbers as quickly as possible.
+ *
+ * This table-based hex parser improves the time needed to load
+ * 1000000 entries compared to a condition-based parser:
+ *
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+static void parse_hex(const char *string, sector_t *result, const char **end)
+{
+	unsigned char d;
+	sector_t r = 0;
+
+	while ((d = hex_table[(unsigned char)*string]) < 16) {
+		r = (r << 4) | d;
+		string++;
+	}
+
+	*end = string;
+	*result = r;
+}
+
+// FIXME Sort out DMWARNs
+static int process_set_region_mappings(struct switch_ctx *sctx,
+				       unsigned argc, char **argv)
+{
+	unsigned i;
+	sector_t region_index = 0;
+
+	for (i = 1; i < argc; i++) {
+		sector_t device;
+		const char *string = argv[i];
+
+		if (*string == ':')
+			region_index++;
+		else {
+			parse_hex(string, &region_index, &string);
+			if (unlikely(*string != ':')) {
+				DMWARN("invalid set_region_mappings argument");
+				return -EINVAL;
+			}
+		}
+
+		string++;
+		if (unlikely(!*string)) {
+			DMWARN("invalid set_region_mappings argument");
+			return -EINVAL;
+		}
+
+		parse_hex(string, &device, &string);
+		if (unlikely(*string)) {
+			DMWARN("invalid set_region_mappings argument");
+			return -EINVAL;
+		}
+		if (unlikely(region_index >= sctx->nr_regions)) {
+			DMWARN("invalid set_region_mappings page");
+			return -EINVAL;
+		}
+		if (unlikely(device >= sctx->nr_paths)) {
+			DMWARN("invalid set_region_mappings device");
+			return -EINVAL;
+		}
+
+		switch_region_table_write(sctx, region_index, device);
+	}
+
+	return 0;
+}
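+
+/*
+ * Example (values invented): the message
+ *	set_region_mappings 0:1 :1 3:2
+ * sets region 0 to path 1, region 1 to path 1 (an omitted index means
+ * previous index plus 1) and region 3 to path 2; all numbers are
+ * hexadecimal.
+ */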
+
+/*
+ * Messages are processed one-at-a-time.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *sctx = ti->private;
+	int r = -EINVAL;
+
+	mutex_lock(&message_mutex);
+
+	if (!strcasecmp(argv[0], "set_region_mappings"))
+		r = process_set_region_mappings(sctx, argc, argv);
+	else
+		DMWARN("Unrecognised message received.");
+
+	mutex_unlock(&message_mutex);
+
+	return r;
+}
+
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned sz = 0;
+	int path_nr;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
+			DMEMIT(" %s %llu", sctx->dev_list[path_nr].dmdev->name,
+			       (unsigned long long)sctx->dev_list[path_nr].start);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *sctx = ti->private;
+	struct block_device *bdev;
+	fmode_t mode;
+	unsigned path_nr;
+	int r = 0;
+
+	path_nr = switch_get_path_nr(sctx, 0);
+
+	bdev = sctx->dev_list[path_nr].dmdev->bdev;
+	mode = sctx->dev_list[path_nr].dmdev->mode;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len + sctx->dev_list[path_nr].start !=
+	    i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *sctx = ti->private;
+	int path_nr;
+	int r;
+
+	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+		r = fn(ti, sctx->dev_list[path_nr].dmdev,
+		       sctx->dev_list[path_nr].start, ti->len, data);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r < 0)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
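
Example usage, for reviewers (the device names, device size and region
size below are invented for illustration; any similar values work):

	# Create a two-path switch device with a region size of 1600
	# 512-byte sectors:
	echo "0 10240000 switch 2 1600 0 /dev/sdb 0 /dev/sdc 0" | \
		dmsetup create switchdev

	# Remap regions 0 and 1 to path 1 and region 3 to path 0
	# (region indexes and path numbers are hexadecimal):
	dmsetup message switchdev 0 set_region_mappings 0:1 :1 3:0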