From: Mikulas Patocka

The dm-zeroed target provides a device that returns zeroes in areas
that have not yet been written. It maintains a log containing a bitmap
of written areas.

Usage: the target accepts four arguments:
	sectors_per_data_block sectors_per_metadata_block data_device metadata_device

On first use, zero the first 512 bytes of the metadata device. The
target will then auto-initialize the metadata device.

The data or metadata device may be resized; while resizing, the target
must be suspended and then resumed. It detects the new sizes on resume.

Signed-off-by: Mikulas Patocka

---
 Documentation/device-mapper/zeroed.txt |   47 +
 drivers/md/Kconfig                     |    7 
 drivers/md/Makefile                    |    1 
 drivers/md/dm-zeroed.c                 | 1153 +++++++++++++++++++++++++++++++++
 4 files changed, 1208 insertions(+)

Index: linux/Documentation/device-mapper/zeroed.txt
===================================================================
--- /dev/null
+++ linux/Documentation/device-mapper/zeroed.txt
@@ -0,0 +1,47 @@
+dm-zeroed
+=========
+
+The dm-zeroed target provides transparent initialization of a logical
+volume. When a logical volume is created, it is not initialized and it
+contains data that were previously stored in that location. In an
+environment with virtual machines belonging to different customers this
+can cause a security breach. Overwriting the whole logical volume to
+erase previous information can be very slow.
+
+The dm-zeroed target keeps a bitmap alongside the logical volume. Each
+bit in the bitmap corresponds to one chunk - the bit determines if the
+chunk was written to or not. When reading the logical volume, the
+dm-zeroed target returns zeroes for the chunks that were not written
+to. Consequently, there is no security breach from reading uninitialized
+blocks.
+
+Parameters:
+
+<sectors_per_data_block> - the size of a data chunk in 512-byte sectors.
+For optimum performance it is recommended to set this to the block size
+of the filesystem that will be used, typically 4k (thus the value 8
+should be used).
+
+<sectors_per_metadata_block> - the block size of metadata, in 512-byte
+sectors. The metadata device is read and written in these units.
+Increasing this value causes more metadata to be read at once, but read
+requests will be submitted less often, so it may or may not improve
+performance, depending on the workload.
+
+<data_device> - the underlying data device
+
+<metadata_device> - the metadata device. The metadata device should
+either have the first 512 bytes cleared (in this case, new metadata
+is created with all blocks marked as not-written), or it should contain
+data from a previous dm-zeroed invocation (in this case, the bitmap is
+used as it was left by the previous invocation, and the data and
+metadata chunk sizes must match the previous values).
+
+The required size of the metadata device can be calculated in the
+following way (all sizes in bytes):
+	data_chunks := roundup(data_device_size / data_chunk_size)
+	metadata_chunks := roundup(data_chunks / (metadata_chunk_size * 8))
+	metadata_size := metadata_chunk_size * (1 + metadata_chunks)
+
+The first chunk in the metadata device contains the superblock; the
+remaining chunks contain the bitmap, one bit per data chunk.
Index: linux/drivers/md/Kconfig
===================================================================
--- linux.orig/drivers/md/Kconfig
+++ linux/drivers/md/Kconfig
@@ -389,6 +389,13 @@ config DM_DELAY
 
 	  If unsure, say N.
 
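As a quick sanity check of the sizing formula in the documentation above, the following small userspace sketch (illustrative only, not part of the patch; the 1 TiB device and the 8-sector chunk sizes are example values) evaluates it with both chunk sizes given in 512-byte sectors:

/* metadata sizing sketch -- illustrative only, not part of the patch */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t data_device_sectors = 1ULL << 31;	/* 1 TiB of 512-byte sectors (example) */
	uint64_t data_chunk_sectors = 8;		/* 4 KiB data chunks */
	uint64_t metadata_chunk_sectors = 8;		/* 4 KiB metadata blocks */

	/* one bit per data chunk, so one metadata block covers this many chunks */
	uint64_t bits_per_metadata_chunk = metadata_chunk_sectors * 512 * 8;

	uint64_t data_chunks = (data_device_sectors + data_chunk_sectors - 1) /
			       data_chunk_sectors;
	uint64_t metadata_chunks = (data_chunks + bits_per_metadata_chunk - 1) /
				   bits_per_metadata_chunk;
	/* "+ 1" is the superblock occupying the first metadata block */
	uint64_t metadata_sectors = metadata_chunk_sectors * (1 + metadata_chunks);

	printf("data chunks:      %llu\n", (unsigned long long)data_chunks);
	printf("metadata chunks:  %llu\n", (unsigned long long)metadata_chunks);
	printf("metadata sectors: %llu (%llu bytes)\n",
	       (unsigned long long)metadata_sectors,
	       (unsigned long long)(metadata_sectors * 512));
	return 0;
}

For these example values it prints 268435456 data chunks, 8192 metadata chunks and 65544 metadata sectors, i.e. a metadata device of roughly 32 MiB.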
+config DM_ZEROED + tristate "Zeroed target" + depends on BLK_DEV_DM + select DM_BUFIO + ---help--- + This target initializes all blocks with zeros. + config DM_UEVENT bool "DM uevents" depends on BLK_DEV_DM Index: linux/drivers/md/Makefile =================================================================== --- linux.orig/drivers/md/Makefile +++ linux/drivers/md/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o +obj-$(CONFIG_DM_ZEROED) += dm-zeroed.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o Index: linux/drivers/md/dm-zeroed.c =================================================================== --- /dev/null +++ linux/drivers/md/dm-zeroed.c @@ -0,0 +1,1153 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include + +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "zeroed" + +#define DM_ZEROED_SUPERBLOCK_MAGIC cpu_to_be32(0xF21) + +/* + * On-disk superblock format + */ +struct dm_zeroed_superblock { + __be32 magic; + __le32 sectors_per_data_chunk; + __le32 sectors_per_metadata_chunk; + __le32 pad; + __le64 device_sectors; +}; + +/* + * In-memory target structure + */ +struct dm_zeroed { + struct dm_dev *dev; + struct dm_dev *log; + + unsigned sectors_per_data_chunk; + unsigned sectors_per_metadata_chunk; + unsigned char sectors_per_data_chunk_bits; + unsigned char sectors_per_metadata_chunk_bits; + sector_t device_sectors; + + struct bio_set *bioset; + struct dm_io_client *dm_io; + struct workqueue_struct *workqueue; + struct dm_bufio_client *bufio; + + /* + * This tree holds all write requests that toggle log bits. + */ + struct mutex range_tree_lock; + struct rb_root range_tree; + struct list_head overlaping_requests; + + /* + * The queue of write requests that tohhle bits after their completion. + */ + spinlock_t flush_request_lock; + struct list_head flush_request_list; + struct work_struct flush_work; +}; + +/* + * A structure for one read or write request. + */ +struct dm_zeroed_request { + struct work_struct work; + + struct dm_zeroed *z; + + bio_end_io_t *original_bi_end_io; + void *original_bi_private; + sector_t original_sector; + unsigned original_n_sectors; + + atomic_t outstanding; + int error; + + struct rb_node tree_node; + struct list_head list_entry; +}; + +static void zeroed_work(struct work_struct *work); +static void read_end_io(struct bio *new_bio, int error); +static void read_dec_outstanding(struct dm_zeroed_request *rq); +static void zero_end_io(unsigned long error, void *context); +static void write_end_io(struct bio *bio, int error); +static void write_dec_outstanding(struct dm_zeroed_request *rq); +static void zeroed_flush(struct work_struct *work); +static void write_end_request(struct dm_zeroed_request *rq, int r); +static void resume_overlappnig_requests(struct dm_zeroed *z); + +static struct page_list zero_page_list; + +/* + * Returns a log block number for a given sector number. + */ +static sector_t log_block(struct dm_zeroed *z, sector_t sector) +{ + sector_t chunk = sector >> z->sectors_per_data_chunk_bits; + return (chunk >> + (z->sectors_per_metadata_chunk_bits + SECTOR_SHIFT + 3)) + 1; +} + +/* + * Returns a bit position in log for a given sector number. 
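To make the chunk-to-bitmap mapping concrete, here is a small userspace sketch (illustrative only, not part of the patch; the chunk sizes and the sample sector are example values) that mirrors the log_block()/log_position() arithmetic for 8-sector data chunks and 8-sector metadata blocks:

/* sketch of the sector -> (metadata block, bit) mapping -- illustrative only */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned data_chunk_bits = 3;		/* 8 sectors per data chunk */
	unsigned metadata_chunk_bits = 3;	/* 8 sectors per metadata block */
	uint64_t sector = 123456789;		/* arbitrary example sector */

	uint64_t chunk = sector >> data_chunk_bits;
	/* block 0 is the superblock, hence the "+ 1" */
	uint64_t block = (chunk >> (metadata_chunk_bits + SECTOR_SHIFT + 3)) + 1;
	unsigned bit = chunk & ((1U << (metadata_chunk_bits + SECTOR_SHIFT + 3)) - 1);

	printf("sector %llu -> chunk %llu -> metadata block %llu, bit %u\n",
	       (unsigned long long)sector, (unsigned long long)chunk,
	       (unsigned long long)block, bit);
	return 0;
}

With these sizes each metadata block carries 8 * 512 * 8 = 32768 bits, which is exactly the divisor hidden in the shift above.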
+ */ +static unsigned log_position(struct dm_zeroed *z, sector_t sector) +{ + sector_t chunk = sector >> z->sectors_per_data_chunk_bits; + return chunk & + ((z->sectors_per_metadata_chunk << (SECTOR_SHIFT + 3)) - 1); +} + +/* + * Read a log block with dm-bufio. + */ +static void *read_log_block(struct dm_zeroed *z, sector_t sector, unsigned *pos, + struct dm_buffer **bp) +{ + sector_t chunk = log_block(z, sector); + *pos = log_position(z, sector); + + return dm_bufio_read(z->bufio, chunk, bp); +} + +/* + * Get a log block from cache but don't read it from disk. + */ +static void *get_log_block(struct dm_zeroed *z, sector_t sector, unsigned *pos, + struct dm_buffer **bp) +{ + void *log; + sector_t chunk = log_block(z, sector); + *pos = log_position(z, sector); + + log = dm_bufio_get(z->bufio, chunk, bp); + if (!log) + dm_bufio_prefetch(z->bufio, chunk, 1); + return log; +} + +/* + * Read the superblock. + */ +static struct dm_zeroed_superblock *read_superblock(struct dm_zeroed *z, + bool allow_uninit, + struct dm_buffer **bp) +{ + struct dm_zeroed_superblock *s; + s = dm_bufio_read(z->bufio, 0, bp); + if (IS_ERR(s)) + return s; + if (s->magic != DM_ZEROED_SUPERBLOCK_MAGIC) { + if (allow_uninit) { + int i; + for (i = 0; i < 1 << SECTOR_SHIFT; i++) + if (((char *)s)[i] != 0) + goto bad_magic; + goto return_ok; + } +bad_magic: + DMERR("Bad superblock magic %x", be32_to_cpu(s->magic)); + dm_bufio_release(*bp); + return ERR_PTR(-EINVAL); + } +return_ok: + return s; +} + +/* + * Return the required size of log in sectors. + */ +static sector_t minimum_log_sectors(struct dm_zeroed *z, + sector_t device_sectors) +{ + sector_t log_blocks = + device_sectors ? log_block(z, device_sectors - 1) + 1 : 1; + return log_blocks << z->sectors_per_metadata_chunk_bits; +} + +/* + * Zero the requested range on the device. + * + * If fn != NULL, fn(context) is called on completion. + * If fn == NULL, the operation is performed synchronously. + */ +static int zero_sectors(struct dm_zeroed *z, sector_t start, sector_t count, + io_notify_fn fn, void *context) +{ + struct dm_io_request req; + struct dm_io_region dest; + + req.bi_rw = WRITE; + req.mem.type = DM_IO_PAGE_LIST; + req.mem.offset = 0; + req.mem.ptr.pl = &zero_page_list; + req.notify.fn = fn; + req.notify.context = context; + req.client = z->dm_io; + + dest.bdev = z->dev->bdev; + dest.sector = start; + dest.count = count; + + return dm_io(&req, 1, &dest, NULL); +} + +/* + * Issue cache flush on the device. + */ +static int issue_device_flush_sync(struct dm_zeroed *z) +{ + struct dm_io_request req; + struct dm_io_region dest; + + req.bi_rw = REQ_FLUSH; + req.mem.type = DM_IO_KMEM; + req.mem.ptr.addr = NULL; + req.notify.fn = NULL; + req.client = z->dm_io; + + dest.bdev = z->dev->bdev; + dest.sector = 0; + dest.count = 0; + + return dm_io(&req, 1, &dest, NULL); +} + +/* + * Zero the last chunk when extending the device. + * If the device size wasn't a multiple of chunk size and we extend the device, + * we must zero a part of the last chunk. 
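The "zero the rest of the last chunk" case relies on the usual power-of-two padding idiom, -x & (chunk - 1). A tiny sketch of that idiom (illustrative only, not part of the patch):

/* sketch of the pad-to-chunk-boundary idiom used when extending -- illustrative only */
#include <assert.h>
#include <stdint.h>

/* sectors from "sectors" up to the next multiple of chunk (chunk is a power of two) */
static uint64_t pad_to_chunk(uint64_t sectors, uint64_t chunk)
{
	return -sectors & (chunk - 1);
}

int main(void)
{
	/* with 8-sector chunks: a 21-sector device needs 3 more sectors to finish its last chunk */
	assert(pad_to_chunk(21, 8) == 3);
	/* already aligned: nothing to zero */
	assert(pad_to_chunk(24, 8) == 0);
	return 0;
}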
+ */ +static int zero_trailing_chunk(struct dm_zeroed *z, sector_t device_sectors) +{ + if (z->device_sectors & (z->sectors_per_data_chunk - 1)) { + int r; + unsigned n_sectors; + + n_sectors = -z->device_sectors & + (z->sectors_per_data_chunk - 1); + if (n_sectors > device_sectors - z->device_sectors) + n_sectors = device_sectors - z->device_sectors; + + r = zero_sectors(z, z->device_sectors, n_sectors, + NULL, NULL); + if (unlikely(r)) + return r; + r = issue_device_flush_sync(z); + if (unlikely(r)) + return r; + } + + return 0; +} + +/* + * Perform device extension. + */ +static int extend_device(struct dm_zeroed *z, sector_t device_sectors) +{ + int r; + sector_t s = z->device_sectors; + + r = zero_trailing_chunk(z, device_sectors); + if (r) + return r; + + do { + void *log; + unsigned pos; + struct dm_buffer *bp; + + log = read_log_block(z, s, &pos, &bp); + if (IS_ERR(log)) + return PTR_ERR(log); + + if (!pos) { + memset(log, 0, + z->sectors_per_metadata_chunk << SECTOR_SHIFT); + s += + z->sectors_per_metadata_chunk << (SECTOR_SHIFT + 3); + } else while (pos < + z->sectors_per_metadata_chunk << (SECTOR_SHIFT + 3)) { + __clear_bit_le(pos, log); + s++; + pos++; + } + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } while (s && s < device_sectors); + + return 0; +} + +/* + * A target constructor. + */ +static int zeroed_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_zeroed *z; + unsigned long sectors_per_data_chunk; + unsigned long sectors_per_metadata_chunk; + char *endstr; + + struct dm_buffer *bp; + struct dm_zeroed_superblock *superblock; + + if (argc != 4) { + ti->error = "Invalid argument count"; + r = -EINVAL; + goto bad; + } + + sectors_per_data_chunk = simple_strtoul(argv[0], &endstr, 10); + if (!*argv[0] || *endstr || + !sectors_per_data_chunk || + sectors_per_data_chunk & (sectors_per_data_chunk - 1) || + sectors_per_data_chunk > INT_MAX >> SECTOR_SHIFT) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad; + } + + sectors_per_metadata_chunk = simple_strtoul(argv[1], &endstr, 10); + if (!*argv[0] || *endstr || + !sectors_per_metadata_chunk || + sectors_per_metadata_chunk & (sectors_per_metadata_chunk - 1) || + sectors_per_metadata_chunk > INT_MAX >> (SECTOR_SHIFT + 3)) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad; + } + + z = kmalloc(sizeof(struct dm_zeroed), GFP_KERNEL); + if (!z) { + ti->error = "Could not allocate memory"; + r = -ENOMEM; + goto bad; + } + ti->private = z; + + z->sectors_per_data_chunk = sectors_per_data_chunk; + z->sectors_per_data_chunk_bits = __ffs(z->sectors_per_data_chunk); + z->sectors_per_metadata_chunk = sectors_per_metadata_chunk; + z->sectors_per_metadata_chunk_bits = __ffs(z->sectors_per_metadata_chunk); + + mutex_init(&z->range_tree_lock); + z->range_tree = RB_ROOT; + INIT_LIST_HEAD(&z->overlaping_requests); + + spin_lock_init(&z->flush_request_lock); + INIT_LIST_HEAD(&z->flush_request_list); + INIT_WORK(&z->flush_work, zeroed_flush); + + z->bioset = bioset_create(1, 0); + if (!z->bioset) { + ti->error = "Could not create bioset"; + r = -ENOMEM; + goto bad_bioset; + } + + z->dm_io = dm_io_client_create(); + if (IS_ERR(z->dm_io)) { + ti->error = "Could not create dm-io client"; + r = PTR_ERR(z->dm_io); + goto bad_dm_io; + } + + z->workqueue = alloc_workqueue("dm-zeroed", WQ_MEM_RECLAIM, 2); + if (!z->workqueue) { + ti->error = "Could not create workqueue"; + r = -ENOMEM; + goto bad_workqueue; + } + + r = dm_get_device(ti, argv[2], dm_table_get_mode(ti->table), &z->dev); 
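The constructor's chunk-size checks boil down to "a non-zero power of two within range". A userspace sketch of the same rules (illustrative only, not part of the patch; the sample inputs are arbitrary):

/* sketch of the chunk-size validation in the constructor -- illustrative only */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define SECTOR_SHIFT 9

/* a data chunk size is accepted when it is a non-zero power of two within range */
static bool valid_data_chunk(unsigned long sectors)
{
	return sectors && !(sectors & (sectors - 1)) &&
	       sectors <= (unsigned long)(INT_MAX >> SECTOR_SHIFT);
}

int main(void)
{
	const char *examples[] = { "8", "7", "0", "4096" };

	for (int i = 0; i < 4; i++) {
		char *end;
		unsigned long sectors = strtoul(examples[i], &end, 10);
		bool ok = *examples[i] && !*end && valid_data_chunk(sectors);

		printf("%s -> %s\n", examples[i], ok ? "accepted" : "rejected");
	}
	return 0;
}

"8" and "4096" pass, "7" fails the power-of-two test and "0" is rejected outright, matching the constructor's error path.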
+ if (r) { + ti->error = "Could not open underlying device"; + goto bad_dev; + } + + r = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &z->log); + if (r) { + ti->error = "Could not open log device"; + goto bad_log; + } + + z->bufio = dm_bufio_client_create(z->log->bdev, + z->sectors_per_metadata_chunk << SECTOR_SHIFT, + 1, 0, NULL, NULL); + if (IS_ERR(z->bufio)) { + r = PTR_ERR(z->bufio); + ti->error = "Unable create bufio"; + goto bad_bufio; + } + + superblock = read_superblock(z, true, &bp); + if (IS_ERR(superblock)) { + r = PTR_ERR(superblock); + ti->error = "Unable to read superblock"; + goto bad_superblock; + } + + if (superblock->magic != DM_ZEROED_SUPERBLOCK_MAGIC) { + superblock->magic = DM_ZEROED_SUPERBLOCK_MAGIC; + superblock->sectors_per_data_chunk = + cpu_to_le32(z->sectors_per_data_chunk); + superblock->sectors_per_metadata_chunk = + cpu_to_le32(z->sectors_per_metadata_chunk); + superblock->device_sectors = cpu_to_le64(0); + dm_bufio_mark_buffer_dirty(bp); + } + + if (le32_to_cpu(superblock->sectors_per_data_chunk) != + z->sectors_per_data_chunk) { + dm_bufio_release(bp); + r = -EINVAL; + ti->error = "Invalid chunk size"; + goto bad_superblock; + } + + if (le32_to_cpu(superblock->sectors_per_metadata_chunk) != + z->sectors_per_metadata_chunk) { + dm_bufio_release(bp); + r = -EINVAL; + ti->error = "Invalid metadata chunk size"; + goto bad_superblock; + } + + z->device_sectors = le64_to_cpu(superblock->device_sectors); + dm_bufio_release(bp); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->per_bio_data_size = sizeof(struct dm_zeroed_request); + r = dm_set_target_max_io_len(ti, z->sectors_per_metadata_chunk * + 8 * z->sectors_per_data_chunk); + if (r) { + ti->error = "Couldn't set max_io_len"; + goto bad_superblock; + } + + return 0; + +bad_superblock: + dm_bufio_client_destroy(z->bufio); +bad_bufio: + dm_put_device(ti, z->log); +bad_log: + dm_put_device(ti, z->dev); +bad_dev: + destroy_workqueue(z->workqueue); +bad_workqueue: + dm_io_client_destroy(z->dm_io); +bad_dm_io: + bioset_free(z->bioset); +bad_bioset: + kfree(z); +bad: + return r; +} + +/* + * A target destructor. + */ +static void zeroed_dtr(struct dm_target *ti) +{ + struct dm_zeroed *z = ti->private; + + destroy_workqueue(z->workqueue); + dm_bufio_client_destroy(z->bufio); + dm_put_device(ti, z->log); + dm_put_device(ti, z->dev); + dm_io_client_destroy(z->dm_io); + bioset_free(z->bioset); + kfree(z); +} + +/* + * A resume function. Device extending or shrinking is detected at this point. 
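For reference, this is roughly what the constructor leaves in the first 512 bytes of a freshly initialized metadata device, modelled in userspace with explicit byte order (illustrative only, not part of the patch; it assumes the struct dm_zeroed_superblock fields sit back to back at offsets 0, 4, 8, 12 and 16):

/* sketch of the on-disk superblock written for a fresh metadata device -- illustrative only */
#include <stdint.h>
#include <stdio.h>

static void put_be32(uint8_t *p, uint32_t v)
{
	p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

static void put_le32(uint8_t *p, uint32_t v)
{
	p[0] = v; p[1] = v >> 8; p[2] = v >> 16; p[3] = v >> 24;
}

static void put_le64(uint8_t *p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		p[i] = v >> (8 * i);
}

int main(void)
{
	uint8_t sb[512] = { 0 };

	put_be32(sb + 0, 0xF21);	/* magic (stored big-endian) */
	put_le32(sb + 4, 8);		/* sectors_per_data_chunk */
	put_le32(sb + 8, 8);		/* sectors_per_metadata_chunk */
	put_le32(sb + 12, 0);		/* pad (left zero) */
	put_le64(sb + 16, 0);		/* device_sectors, filled in on resume */

	for (int i = 0; i < 24; i++)
		printf("%02x%s", sb[i], i % 8 == 7 ? "\n" : " ");
	return 0;
}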
+ */ +static void zeroed_resume(struct dm_target *ti) +{ + struct dm_zeroed *z = ti->private; + + sector_t device_sectors = ti->len; + sector_t log_sectors = + i_size_read(z->log->bdev->bd_inode) >> SECTOR_SHIFT; + + sector_t needed_log_sectors = minimum_log_sectors(z, device_sectors); + + if (log_sectors < needed_log_sectors) { + DMERR("Log is too small: %Lx < %Lx (device sectors %Lx)", + (unsigned long long)log_sectors, + (unsigned long long)needed_log_sectors, + (unsigned long long)device_sectors); + goto skip_extend; + } + + if (device_sectors != z->device_sectors) { + int r; + struct dm_zeroed_superblock *s; + struct dm_buffer *bp; + + if (device_sectors > z->device_sectors) { + if (extend_device(z, device_sectors)) + goto skip_extend; + } + + r = dm_bufio_write_dirty_buffers(z->bufio); + if (r) { + DMERR("Error writing dirty buffers: %d", r); + goto skip_extend; + } + r = dm_bufio_issue_flush(z->bufio); + if (r) { + DMERR("Error flushing disk cache: %d", r); + goto skip_extend; + } + + s = read_superblock(z, false, &bp); + if (IS_ERR(s)) + goto skip_extend; + s->device_sectors = cpu_to_le64(device_sectors); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + z->device_sectors = device_sectors; + } +skip_extend: + return; +} + +/* + * Advance a bio by the specified number of bytes. + * Increase bi_sector, decrease bi_size and advance the vector. + */ +static void advance_bio(struct bio *bio, unsigned n_bytes) +{ + unsigned n_sectors; + + BUG_ON(n_bytes & ((1 << SECTOR_SHIFT) - 1)); + + n_sectors = n_bytes >> SECTOR_SHIFT; + + bio->bi_sector += n_sectors; + bio->bi_size -= n_bytes; +next_bvec: + BUG_ON(bio->bi_idx >= bio->bi_vcnt); + if (bio_iovec(bio)->bv_len > n_bytes) { + bio_iovec(bio)->bv_len -= n_bytes; + } else { + n_bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + if (n_bytes) { + cond_resched(); + goto next_bvec; + } + } +} + +/* + * Test n bits at a specified position in the log. + * Return true if all the bits are set. + */ +static bool test_log_bits(struct dm_zeroed *z, void *log, + unsigned pos, unsigned n) +{ + BUG_ON(pos + n > z->sectors_per_metadata_chunk << (SECTOR_SHIFT + 3)); + do { + if (!(pos & (BITS_PER_LONG - 1)) && n >= BITS_PER_LONG) { + long val = ((long *)log)[pos / BITS_PER_LONG]; + if (unlikely(val != -1L)) + return false; + pos += BITS_PER_LONG; + n -= BITS_PER_LONG; + } else if (!(pos & 7) && n >= 8) { + u8 val = ((u8 *)log)[pos / 8]; + if (unlikely(val != 0xff)) + return false; + pos += 8; + n -= 8; + } else { + if (unlikely(!test_bit_le(pos, log))) + return false; + pos++; + n--; + } + cond_resched(); + } while (n); + return true; +} + +/* + * Check if a specified range overlaps with an existing range. + * If insert != NULL, add this request to the rb-tree, if it is non-overlapping. + */ +static bool range_check(struct dm_zeroed *z, + sector_t sector, unsigned n_sectors, + struct dm_zeroed_request *insert) +{ + struct rb_node **p = &z->range_tree.rb_node; + struct rb_node *parent = NULL; + while (*p) { + parent = *p; +#define node rb_entry(parent, struct dm_zeroed_request, tree_node) + if (sector + n_sectors <= node->original_sector) + p = &node->tree_node.rb_left; + else if (sector >= + node->original_sector + node->original_n_sectors) + p = &node->tree_node.rb_right; + else + return true; +#undef node + } + if (insert) { + rb_link_node(&insert->tree_node, parent, p); + rb_insert_color(&insert->tree_node, &z->range_tree); + } + return false; +} + +/* + * The map function. 
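range_check() is a plain interval-overlap test on half-open sector ranges kept in the rb-tree. A minimal sketch of the test itself (illustrative only, not part of the patch):

/* sketch of the interval test behind range_check() -- illustrative only */
#include <assert.h>
#include <stdint.h>

/* half-open ranges [a, a+an) and [b, b+bn) overlap unless one ends before the other starts */
static int ranges_overlap(uint64_t a, unsigned an, uint64_t b, unsigned bn)
{
	return !(a + an <= b || b + bn <= a);
}

int main(void)
{
	assert(!ranges_overlap(0, 8, 8, 8));	/* adjacent chunks do not overlap */
	assert(ranges_overlap(0, 16, 8, 8));	/* a write spanning the second chunk does */
	return 0;
}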
+ * + * Note: we can't read device log here, because it would deadlock. + * So we only perform get_log_block and if the block is not found in + * cache, we queue the request to the workqueue. + */ +static int zeroed_map(struct dm_target *ti, struct bio *bio) +{ + struct dm_zeroed *z = ti->private; + void *log; + unsigned pos, pos_end; + struct dm_buffer *bp; + struct dm_zeroed_request *rq; + + bio->bi_bdev = z->dev->bdev; + bio->bi_sector = dm_target_offset(ti, bio->bi_sector); + if (unlikely(!bio->bi_size) || unlikely((bio->bi_rw & REQ_DISCARD) != 0)) + return DM_MAPIO_REMAPPED; + + if (unlikely(bio->bi_sector + bio_sectors(bio) > z->device_sectors)) { + DMERR("bio out of device size, bi_sector %Lx, bi_size %x, device_sectors %Lx", + (unsigned long long)bio->bi_sector, + bio->bi_size, + (unsigned long long)z->device_sectors); + return -EIO; + } + + log = get_log_block(z, bio->bi_sector, &pos, &bp); + if (unlikely(!log)) + goto queue_to_thread; + if (unlikely(IS_ERR(log))) { + DMERR("unable to access log block for sector %Lx: %d", + (unsigned long long)bio->bi_sector, + (int)PTR_ERR(log)); + return PTR_ERR(log); + } + pos_end = log_position(z, bio->bi_sector + bio_sectors(bio) - 1); + if (likely(test_log_bits(z, log, pos, pos_end - pos + 1))) { + + dm_bufio_release(bp); + + if (unlikely((bio->bi_rw & RW_MASK) == WRITE)) { + /* + * Make sure that test_log_bits is not reordered with + * z->range_tree.rb_node != NULL + */ + smp_rmb(); + + if (unlikely(z->range_tree.rb_node != NULL)) { + mutex_lock(&z->range_tree_lock); + if (unlikely(range_check(z, bio->bi_sector, + bio_sectors(bio), + NULL))) { + mutex_unlock(&z->range_tree_lock); + goto queue_to_thread; + } + mutex_unlock(&z->range_tree_lock); + } + } + + return DM_MAPIO_REMAPPED; + } + dm_bufio_release(bp); + +queue_to_thread: + rq = dm_per_bio_data(bio, sizeof(struct dm_zeroed_request)); + rq->z = z; + INIT_WORK(&rq->work, zeroed_work); + queue_work(z->workqueue, &rq->work); + + return DM_MAPIO_SUBMITTED; +} + +/* + * A continuation of zeroed_map. 
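The fast path above hinges on "are all bits for the chunks touched by this bio already set?". A simplified userspace sketch of that check over a little-endian bitmap (illustrative only, not part of the patch; test_log_bits() additionally takes byte- and word-sized strides):

/* sketch of the "are all chunks in this request already written?" check -- illustrative only */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* return true when bits [pos, pos + n) are all set in a little-endian bitmap */
static bool all_bits_set(const uint8_t *bitmap, unsigned pos, unsigned n)
{
	while (n--) {
		if (!(bitmap[pos / 8] & (1u << (pos % 8))))
			return false;
		pos++;
	}
	return true;
}

int main(void)
{
	uint8_t bitmap[2] = { 0xff, 0x01 };	/* chunks 0..8 written, 9..15 not */

	assert(all_bits_set(bitmap, 0, 9));	/* fast path: remap straight to the data device */
	assert(!all_bits_set(bitmap, 4, 8));	/* slow path: goes through the workqueue */
	return 0;
}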
+ */ +static void zeroed_work(struct work_struct *work) +{ + struct dm_zeroed_request *rq = + container_of(work, struct dm_zeroed_request, work); + struct dm_zeroed *z = rq->z; + struct bio *bio = dm_bio_from_per_bio_data(rq, + sizeof(struct dm_zeroed_request)); + void *log; + unsigned pos, pos_end; + struct dm_buffer *bp; + + struct bio *new_bio; + + log = read_log_block(z, bio->bi_sector, &pos, &bp); + if (unlikely(IS_ERR(log))) { + DMERR("unable to access log block for sector %Lx: %d", + (unsigned long long)bio->bi_sector, + (int)PTR_ERR(log)); + bio_endio(bio, PTR_ERR(log)); + return; + } + pos_end = log_position(z, bio->bi_sector + bio_sectors(bio) - 1); + if (likely(test_log_bits(z, log, pos, pos_end - pos + 1))) { + + dm_bufio_release(bp); + + if (unlikely((bio->bi_rw & RW_MASK) == WRITE)) { + /* + * Make sure that test_log_bits is not reordered with + * z->range_tree.rb_node != NULL + */ + smp_rmb(); + + if (unlikely(z->range_tree.rb_node != NULL)) { + mutex_lock(&z->range_tree_lock); + if (unlikely(range_check(z, bio->bi_sector, + bio_sectors(bio), + NULL))) { + list_add_tail(&rq->list_entry, + &z->overlaping_requests); + mutex_unlock(&z->range_tree_lock); + return; + } + mutex_unlock(&z->range_tree_lock); + } + } + + generic_make_request(bio); + return; + } + + rq->error = 0; + + if ((bio->bi_rw & RW_MASK) == WRITE) { + unsigned pre_sectors, post_sectors; + + if (test_bit_le(log_position(z, bio->bi_sector), log)) + pre_sectors = 0; + else + pre_sectors = bio->bi_sector & + (z->sectors_per_data_chunk - 1); + + if (test_bit_le(log_position(z, + bio->bi_sector + bio_sectors(bio) - 1), log)) + post_sectors = 0; + else { + post_sectors = -(bio->bi_sector + bio_sectors(bio)) & + (z->sectors_per_data_chunk - 1); + if (unlikely(bio->bi_sector + bio_sectors(bio) + + (u64)post_sectors > z->device_sectors)) + post_sectors = z->device_sectors - + (bio->bi_sector + bio_sectors(bio)); + } + + dm_bufio_release(bp); + + rq->original_sector = bio->bi_sector - pre_sectors; + rq->original_n_sectors = bio_sectors(bio) + + pre_sectors + post_sectors; + mutex_lock(&z->range_tree_lock); + if (unlikely(range_check(z, rq->original_sector, + rq->original_n_sectors, rq))) { + list_add_tail(&rq->list_entry, &z->overlaping_requests); + mutex_unlock(&z->range_tree_lock); + return; + } + mutex_unlock(&z->range_tree_lock); + + atomic_set(&rq->outstanding, 2 + !!pre_sectors + !!post_sectors); + + if (unlikely(pre_sectors != 0)) + zero_sectors(z, bio->bi_sector - pre_sectors, + pre_sectors, zero_end_io, rq); + + rq->original_bi_end_io = bio->bi_end_io; + rq->original_bi_private = bio->bi_private; + bio->bi_end_io = write_end_io; + bio->bi_private = rq; + generic_make_request(bio); + + if (unlikely(post_sectors != 0)) + zero_sectors(z, bio->bi_sector + bio_sectors(bio), + post_sectors, zero_end_io, rq); + + write_dec_outstanding(rq); + + return; + } + + atomic_set(&rq->outstanding, 1); + + zero_fill_bio(bio); + + new_bio = NULL; + while (bio->bi_size) { + unsigned i, n_sectors, n_bytes; + + cond_resched(); + + i = log_position(z, bio->bi_sector); + n_sectors = z->sectors_per_data_chunk - + (bio->bi_sector & (z->sectors_per_data_chunk - 1)); + n_bytes = n_sectors << SECTOR_SHIFT; + + if (unlikely(n_bytes > bio->bi_size)) { + n_sectors = bio->bi_size >> SECTOR_SHIFT; + n_bytes = bio->bi_size; + } + + if (test_bit_le(i, log)) { + unsigned len; + if (!new_bio) { + new_bio = bio_alloc_bioset(GFP_NOIO, + bio->bi_vcnt - bio->bi_idx, z->bioset); + new_bio->bi_bdev = bio->bi_bdev; + new_bio->bi_sector = bio->bi_sector; + 
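The write path's pre_sectors/post_sectors computation pads an unaligned write out to chunk boundaries so that whole chunks can be marked as written. A small sketch with example numbers (illustrative only, not part of the patch; in the target the padding is only applied when the corresponding bit is still clear, and the tail is clamped to the device size):

/* sketch of head/tail padding for an unaligned write into unwritten chunks -- illustrative only */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t chunk = 8;			/* sectors per data chunk */
	uint64_t sector = 21, n_sectors = 10;	/* example write: sectors 21..30 */

	/* zero from the start of the first chunk up to the write ... */
	uint64_t pre = sector & (chunk - 1);
	/* ... and from the end of the write up to the end of the last chunk */
	uint64_t post = -(sector + n_sectors) & (chunk - 1);

	printf("zero %llu sectors before and %llu after, then set bits for chunks %llu..%llu\n",
	       (unsigned long long)pre, (unsigned long long)post,
	       (unsigned long long)(sector / chunk),
	       (unsigned long long)((sector + n_sectors - 1) / chunk));
	return 0;
}

For this example it reports 5 sectors of head padding and 1 sector of tail padding, covering chunks 2..3.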
new_bio->bi_end_io = read_end_io; + new_bio->bi_private = rq; + } + len = min(n_bytes, bio_iovec(bio)->bv_len); + if (!bio_add_page(new_bio, bio_page(bio), len, + bio_offset(bio))) + goto submit_new_bio; + advance_bio(bio, len); + } else { + advance_bio(bio, n_bytes); + if (new_bio) { +submit_new_bio: + atomic_inc(&rq->outstanding); + submit_bio(READ, new_bio); + new_bio = NULL; + } + } + } + if (new_bio) + goto submit_new_bio; + + dm_bufio_release(bp); + + read_dec_outstanding(rq); +} + +/* + * End of read request. + */ +static void read_end_io(struct bio *new_bio, int error) +{ + struct dm_zeroed_request *rq = new_bio->bi_private; + + if (unlikely(error)) + rq->error = error; + + bio_put(new_bio); + + read_dec_outstanding(rq); +} + +/* + * Decrease the outstanding counter on read requests. + * If it reaches zero, the bio is finished. + */ +static void read_dec_outstanding(struct dm_zeroed_request *rq) +{ + if (atomic_dec_and_test(&rq->outstanding)) { + int error = rq->error; + struct bio *bio = dm_bio_from_per_bio_data(rq, + sizeof(struct dm_zeroed_request)); + bio_endio(bio, error); + } +} + +/* + * The end of zero request performed by dm-io. + */ +static void zero_end_io(unsigned long error, void *context) +{ + struct dm_zeroed_request *rq = context; + + if (unlikely(error != 0)) + rq->error = -EIO; + + write_dec_outstanding(rq); +} + +/* + * The end of write request. + */ +static void write_end_io(struct bio *bio, int error) +{ + struct dm_zeroed_request *rq = bio->bi_private; + + bio->bi_end_io = rq->original_bi_end_io; + bio->bi_private = rq->original_bi_private; + + if (unlikely(error)) + rq->error = error; + + write_dec_outstanding(rq); +} + +/* + * Decrease the outstanding count on write requests. + * If it reaches zero, the request is queued to zeroed_flush. + */ +static void write_dec_outstanding(struct dm_zeroed_request *rq) +{ + if (atomic_dec_and_test(&rq->outstanding)) { + struct dm_zeroed *z = rq->z; + + unsigned long flags; + + spin_lock_irqsave(&z->flush_request_lock, flags); + list_add_tail(&rq->list_entry, &z->flush_request_list); + spin_unlock_irqrestore(&z->flush_request_lock, flags); + + queue_work(z->workqueue, &z->flush_work); + } +} + +/* + * This function processes finished write requests. + * We sync hardware write cache (to make the requests really finished). + * We set bits in the log. + * We sync the log. + * Finally we return write requests to device mapper as finished. + */ +static void zeroed_flush(struct work_struct *work) +{ + struct dm_zeroed *z = + container_of(work, struct dm_zeroed, flush_work); + struct list_head list; + struct dm_zeroed_request *rq, *rqn; + int r; + + spin_lock_irq(&z->flush_request_lock); + if (list_empty(&z->flush_request_list)) { + spin_unlock_irq(&z->flush_request_lock); + return; + } + list = z->flush_request_list; + INIT_LIST_HEAD(&z->flush_request_list); + list.next->prev = &list; + list.prev->next = &list; + spin_unlock_irq(&z->flush_request_lock); + + r = issue_device_flush_sync(z); + if (unlikely(r)) + goto return_error; + + /* + * Pair with smp_rmb, make sure that other processes see + * z->range_tree.rb_node != NULL before they see __set_bit_le. + * In practice, this smp_wmb is almost useless because + * there were a lot of operations since rb_link_node and + * so z->range_tree.rb_node != NULL is already visible. 
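Completion of a write is driven by the per-request "outstanding" counter: the optional head zeroing, the write itself, the optional tail zeroing and the submitting thread each drop one reference, and whoever drops the last one hands the request to the flush worker. A userspace sketch of the pattern using C11 atomics (illustrative only, not part of the patch):

/* sketch of the "outstanding sub-I/O" completion pattern -- illustrative only */
#include <stdatomic.h>
#include <stdio.h>

struct request {
	atomic_int outstanding;		/* sub-I/Os still in flight, plus the submitter */
	int error;
};

/* every sub-I/O (and the submitting thread) calls this exactly once */
static void dec_outstanding(struct request *rq)
{
	if (atomic_fetch_sub(&rq->outstanding, 1) == 1)
		printf("request finished, error=%d\n", rq->error);	/* bio_endio() in the target */
}

int main(void)
{
	struct request rq = { .outstanding = 3, .error = 0 };	/* e.g. pre-zero + write + submitter */

	dec_outstanding(&rq);	/* pre-zero done */
	dec_outstanding(&rq);	/* write done */
	dec_outstanding(&rq);	/* submitter drops its reference; this one completes the request */
	return 0;
}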
+ */ + smp_wmb(); + + list_for_each_entry_safe(rq, rqn, &list, list_entry) { + void *log; + unsigned pos, pos_end; + struct dm_buffer *bp; + + if (unlikely(rq->error)) { + list_del(&rq->list_entry); + write_end_request(rq, rq->error); + continue; + } + + log = read_log_block(z, rq->original_sector, &pos, &bp); + if (unlikely(IS_ERR(log))) { + list_del(&rq->list_entry); + write_end_request(rq, PTR_ERR(log)); + continue; + } + pos_end = log_position(z, rq->original_sector + + rq->original_n_sectors - 1); + for (; pos <= pos_end; pos++) + __set_bit_le(pos, log); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + + r = dm_bufio_write_dirty_buffers(z->bufio); + if (unlikely(r)) + goto return_error; + r = dm_bufio_issue_flush(z->bufio); + if (unlikely(r)) + goto return_error; + + r = 0; +return_error: + list_for_each_entry_safe(rq, rqn, &list, list_entry) { + list_del(&rq->list_entry); + write_end_request(rq, r); + } + resume_overlappnig_requests(z); +} + +/* + * Finish one write request. + * Remove it from the rb-tree, if that enables other held requests to be + * resubmitted, resubmit them. + * Finally, report the request as finished. + */ +static void write_end_request(struct dm_zeroed_request *rq, int r) +{ + struct dm_zeroed *z = rq->z; + struct bio *bio; + + mutex_lock(&z->range_tree_lock); + rb_erase(&rq->tree_node, &z->range_tree); + mutex_unlock(&z->range_tree_lock); + + bio = dm_bio_from_per_bio_data(rq, sizeof(struct dm_zeroed_request)); + bio_endio(bio, r); + + cond_resched(); +} + +/* + * Check the list of overlapping requests. The requests that are no longer + * overlappnig are resubmitted. + */ +static void resume_overlappnig_requests(struct dm_zeroed *z) +{ + struct dm_zeroed_request *rq, *rqn; + mutex_lock(&z->range_tree_lock); + list_for_each_entry_safe(rq, rqn, &z->overlaping_requests, list_entry) { + struct bio *bio = dm_bio_from_per_bio_data(rq, + sizeof(struct dm_zeroed_request)); + if (!range_check(z, bio->bi_sector, bio_sectors(bio), NULL)) { + list_del(&rq->list_entry); + queue_work(z->workqueue, &rq->work); + } + cond_resched(); + } + mutex_unlock(&z->range_tree_lock); +} + +/* + * The merge method. Pass the merge request to the device queue. + */ +static int zeroed_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct dm_zeroed *z = ti->private; + struct request_queue *q = bdev_get_queue(z->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = z->dev->bdev; + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +/* + * Iterate devices. + * We return only the underlying device, not the log device, + * because requests are never routed to the log device. + */ +static int zeroed_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + struct dm_zeroed *z = ti->private; + + return fn(ti, z->dev, 0, ti->len, data); +} + +static void zeroed_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_zeroed *z = ti->private; + + blk_limits_io_opt(limits, z->sectors_per_data_chunk << SECTOR_SHIFT); +} + +static struct target_type zeroed_target = { + .name = "zeroed", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = zeroed_ctr, + .dtr = zeroed_dtr, + .map = zeroed_map, + .merge = zeroed_merge, + .resume = zeroed_resume, + .iterate_devices = zeroed_iterate_devices, + .io_hints = zeroed_io_hints, +}; + +/* + * Module initializetion. 
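zeroed_flush() completes writes in a strict order: flush the data device, set the bitmap bits for each finished request, write out and flush the dirty metadata buffers, and only then end the bios. The bit-marking step is the simple loop sketched below (illustrative only, not part of the patch):

/* sketch of marking a completed write's chunks in the bitmap -- illustrative only */
#include <assert.h>
#include <stdint.h>

/* set bits [pos, pos_end] in a little-endian bitmap, as zeroed_flush() does per request */
static void set_bits_range(uint8_t *bitmap, unsigned pos, unsigned pos_end)
{
	for (; pos <= pos_end; pos++)
		bitmap[pos / 8] |= 1u << (pos % 8);
}

int main(void)
{
	uint8_t bitmap[2] = { 0, 0 };

	set_bits_range(bitmap, 3, 10);		/* a write covering chunks 3..10 */
	assert(bitmap[0] == 0xf8);		/* bits 3..7 */
	assert(bitmap[1] == 0x07);		/* bits 8..10 */
	return 0;
}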
+ */
+static int __init dm_zeroed_init(void)
+{
+	int r;
+
+	zero_page_list.next = &zero_page_list;
+	zero_page_list.page = ZERO_PAGE(0);
+
+	r = dm_register_target(&zeroed_target);
+	if (r < 0) {
+		DMERR("Target register failed %d", r);
+		goto bad_target;
+	}
+
+	return 0;
+
+bad_target:
+	return r;
+}
+
+/*
+ * Module termination.
+ */
+static void __exit dm_zeroed_exit(void)
+{
+	dm_unregister_target(&zeroed_target);
+}
+
+module_init(dm_zeroed_init)
+module_exit(dm_zeroed_exit)
+
+MODULE_AUTHOR("Mikulas Patocka ");
+MODULE_DESCRIPTION(DM_NAME " zeroed target");
+MODULE_LICENSE("GPL");
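Finally, a userspace sketch of the metadata-device preparation described in the patch header, i.e. clearing the first 512 bytes before the target is created for the first time (illustrative only; the device path is an example):

/* sketch: prepare a metadata device for first use by clearing its first 512 bytes */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	static const char zeroes[512];
	const char *metadata_dev = "/dev/vg/zeroed_meta";	/* example path */
	int fd = open(metadata_dev, O_WRONLY);

	if (fd < 0 || pwrite(fd, zeroes, sizeof(zeroes), 0) != (ssize_t)sizeof(zeroes)) {
		perror(metadata_dev);
		return 1;
	}
	/* make sure the zeroes reach the device before the target is created */
	if (fsync(fd)) {
		perror("fsync");
		return 1;
	}
	close(fd);
	return 0;
}

The same effect can be had with e.g. dd if=/dev/zero of=<metadata device> bs=512 count=1 followed by a sync.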