New snapshot implementation. This implementation has shared storage and an unlimited number of snapshots. Usage: Create two logical volumes, one for the origin and one for snapshots. (assume /dev/mapper/vg1-lv1 for the origin and /dev/mapper/vg1-lv2 for the snapshot in these examples) Clear the first sector of the snapshot volume: dd if=/dev/zero of=/dev/mapper/vg1-lv2 bs=4096 count=1 Load the shared snapshot driver: echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnapshot /dev/mapper/vg1-lv1 /dev/mapper/vg1-lv2 4096|dmsetup create ms (4096 is the chunk size. You can place a different number there) This creates the origin store on /dev/mapper/ms. If the store was zeroed, it creates a new structure, otherwise it loads the existing structure. Once this is done, you should no longer access /dev/mapper/vg1-lv1 and /dev/mapper/vg1-lv2 and only use /dev/mapper/ms. Create a new snapshot: dmsetup message /dev/mapper/ms 0 create (snapshots have IDs assigned from 0 upwards --- it prints the newly created ID to syslog ... in the final version it will use status to publish the ID) Attach the snapshot: echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnap-snap /dev/mapper/vg1-lv1 0|dmsetup create ms0 (that '0' is the snapshot id ... you can use a different number) This attaches the snapshot '0' on /dev/mapper/ms0 See status: dmsetup status prints this information about the multisnapshot device: - 0 on active storage, error number on error (ENOSPC, EIO, etc.) - the last created snapshot number - total number of chunks on the device - total number of allocated chunks - the number of chunks allocated for metadata - the number of snapshots - existing snapshot IDs Unload it: dmsetup remove ms dmsetup remove ms0 ... etc. 
(note, once you unload the origin, the snapshots become inaccessible - the devices exist but they return -EIO on everything) Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 6 drivers/md/Makefile | 2 drivers/md/dm-multisnap-alloc.c | 384 +++++++++++++++ drivers/md/dm-multisnap-btree.c | 409 ++++++++++++++++ drivers/md/dm-multisnap-commit.c | 182 +++++++ drivers/md/dm-multisnap-io.c | 395 ++++++++++++++++ drivers/md/dm-multisnap-snaps.c | 189 +++++++ drivers/md/dm-multisnap-struct.h | 118 ++++ drivers/md/dm-multisnap.c | 949 +++++++++++++++++++++++++++++++++++++++ drivers/md/dm-multisnap.h | 254 ++++++++++ 10 files changed, 2888 insertions(+) Index: linux-2.6.28-rc5-devel/drivers/md/Kconfig =================================================================== --- linux-2.6.28-rc5-devel.orig/drivers/md/Kconfig 2008-11-25 16:09:59.000000000 +0100 +++ linux-2.6.28-rc5-devel/drivers/md/Kconfig 2008-11-25 16:12:40.000000000 +0100 @@ -258,6 +258,12 @@ config DM_SNAPSHOT ---help--- Allow volume managers to take writable snapshots of a device. +config DM_MULTISNAPSHOT + tristate "Multisnapshot target" + depends on BLK_DEV_DM + ---help--- + A new implementation of snapshots. 
+ config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM Index: linux-2.6.28-rc5-devel/drivers/md/Makefile =================================================================== --- linux-2.6.28-rc5-devel.orig/drivers/md/Makefile 2008-11-25 16:12:37.000000000 +0100 +++ linux-2.6.28-rc5-devel/drivers/md/Makefile 2008-11-25 16:12:40.000000000 +0100 @@ -6,6 +6,7 @@ dm-mod-objs := dm.o dm-table.o dm-target dm-ioctl.o dm-io.o dm-kcopyd.o dm-bufio.o dm-multipath-objs := dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o +dm-multisnapshot-objs := dm-multisnap.o dm-multisnap-alloc.o dm-multisnap-btree.o dm-multisnap-commit.o dm-multisnap-snaps.o dm-multisnap-io.o dm-mirror-objs := dm-raid1.o md-mod-objs := md.o bitmap.o raid456-objs := raid5.o raid6algos.o raid6recov.o raid6tables.o \ @@ -35,6 +36,7 @@ obj-$(CONFIG_DM_DELAY) += dm-delay.o obj-$(CONFIG_DM_LOOP) += dm-loop.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o +obj-$(CONFIG_DM_MULTISNAPSHOT) += dm-multisnapshot.o obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_ZERO) += dm-zero.o Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap.c 2008-12-01 20:04:09.000000000 +0100 @@ -0,0 +1,949 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap.h" + +#define MESG_STR(x) x, sizeof(x) + +void dm_multisnap_set_error(struct dm_multisnap *s, int error) +{ + if (!s->error) + s->error = error; +} + +int dm_multisnap_has_error(struct dm_multisnap *s) +{ + return s->error; +} + +static struct tmp_remap *find_tmp_remap(struct dm_multisnap *s, chunk_t block) +{ + struct tmp_remap *t; + struct hlist_node *hn; + unsigned hash = TMP_REMAP_HASH(block); + hlist_for_each_entry(t, hn, &s->tmp_remap[hash], hash_list) + if (t->old == block) + return t; + return NULL; +} + +void *dm_multisnap_read_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp) +{ + void *buf; + struct tmp_remap *t; + cond_resched(); + t = find_tmp_remap(s, block); + if (t) + block = t->new; + buf = dm_bufio_read(s->bufio, block, bp); + if (unlikely(IS_ERR(buf))) { + DMERR("dm_multisnap_read_block: error read chunk %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +int dm_multisnap_block_is_uncommitted(struct dm_multisnap *s, chunk_t chunk) +{ + struct tmp_remap *t = find_tmp_remap(s, chunk); + return t && t->uncommitted; +} + +void *dm_multisnap_duplicate_block(struct dm_multisnap *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp) +{ + void *buf; + struct tmp_remap *t = find_tmp_remap(s, old_chunk); + if (t) { + if (unlikely(t->bitmap_idx != bitmap_idx)) { + DMERR("dm_multisnap_duplicate_block: bitmap_idx doesn't match, %X != %X", t->bitmap_idx, bitmap_idx); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + dm_multisnap_free_block(s, t->new); + t->new = new_chunk; + } else { + if (unlikely(list_empty(&s->free_tmp_remaps))) { + DMERR("dm_multisnap_duplicate_block: all remap blocks used"); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + t = list_first_entry(&s->free_tmp_remaps, struct tmp_remap, list); + t->new = new_chunk; + t->old = old_chunk; + t->bitmap_idx = bitmap_idx; + 
hlist_add_head(&t->hash_list, &s->tmp_remap[TMP_REMAP_HASH(old_chunk)]); + s->n_used_tmp_remaps++; + } + list_del(&t->list); + if (bitmap_idx == CB_BITMAP_IDX_NONE) + list_add_tail(&t->list, &s->used_bt_tmp_remaps); + else + list_add_tail(&t->list, &s->used_bitmap_tmp_remaps); + t->uncommitted = 1; + dm_bufio_release_move(*bp, new_chunk); + buf = dm_bufio_read(s->bufio, new_chunk, bp); + if (IS_ERR(buf)) { + DMERR("dm_multisnap_duplicate_block: error reading chunk %Lx", (unsigned long long)new_chunk); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +void dm_multisnap_delete_remapped_block(struct dm_multisnap *s, chunk_t chunk) +{ + struct tmp_remap *t = find_tmp_remap(s, chunk); + if (likely(!t)) + return; + dm_multisnap_free_tmp_remap(s, t); +} + +void dm_multisnap_free_tmp_remap(struct dm_multisnap *s, struct tmp_remap *t) +{ + list_del(&t->list); + hlist_del(&t->hash_list); + s->n_used_tmp_remaps--; + list_add(&t->list, &s->free_tmp_remaps); +} + +void *dm_multisnap_make_block(struct dm_multisnap *s, chunk_t new_chunk, struct dm_buffer **bp) +{ + void *buf; + /* !!! 
FIXME: add it to the list of recently allocated blocks */ + + buf = dm_bufio_new(s->bufio, new_chunk, bp); + if (unlikely(IS_ERR(buf))) { + DMERR("dm_multisnap_make_block: error creating new block at chunk %Lx", (unsigned long long)new_chunk); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +int dm_multisnap_is_cb_block(struct dm_multisnap *s, chunk_t block) +{ + if (block < FIRST_CB_BLOCK) return 0; + return sector_div(block, CB_STRIDE) == FIRST_CB_BLOCK % CB_STRIDE; +} + +static void initialize_device(struct dm_multisnap *s) +{ + int r; + struct dm_buffer *bp; + struct multisnap_superblock *sb; + struct multisnap_commit_block *cb; + chunk_t cb_block; + chunk_t block_to_write; + __u64 dev_size; + + dev_size = i_size_read(s->snapshot->bdev->bd_inode) >> s->chunk_shift; + if ((dev_size + CB_STRIDE) != (chunk_t)(dev_size + CB_STRIDE)) { + DMERR("initialize_device: device is too large. Compile kernel with 64-bit sector numbers"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + s->dev_size = dev_size; + s->total_allocated = 0; + s->data_allocated = 0; + + block_to_write = SB_BLOCK + 1; + +/* Write btree */ + dm_multisnap_create_btree(s, &block_to_write); + if (dm_multisnap_has_error(s)) + return; + +/* Write bitmaps */ + dm_multisnap_create_bitmaps(s, block_to_write); + if (dm_multisnap_has_error(s)) + return; + +/* Write commit blocks */ + if (FIRST_CB_BLOCK >= dev_size) { + DMERR("initialize_device: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + for (cb_block = FIRST_CB_BLOCK; cb_block < s->dev_size; cb_block += CB_STRIDE) { + cb = dm_bufio_new(s->bufio, cb_block, &bp); + if (IS_ERR(cb)) { + DMERR("initialize_device: can't allocate commit block at %Lx", (unsigned long long)cb_block); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + memset(cb, 0, s->chunk_size); + cb->signature = CB_SIGNATURE; + cb->sequence = cpu_to_le64(cb_block == FIRST_CB_BLOCK); + if (cb_block == FIRST_CB_BLOCK) { + 
cb->snapshot_num = cpu_to_le32(0); + write_48(cb, dev_size, s->dev_size); + write_48(cb, total_allocated, s->total_allocated); + write_48(cb, data_allocated, s->data_allocated); + write_48(cb, bitmap_root, s->bitmap_root); + write_48(cb, bt_root, s->bt_root); + cb->bt_depth = s->bt_depth; + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + r = dm_bufio_write_dirty_buffers(s->bufio); + if (r) { + DMERR("initialize_device: write error when initializing device"); + dm_multisnap_set_error(s, r); + return; + } + +/* Write super block */ + sb = dm_bufio_new(s->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + DMERR("initialize_device: can't allocate super block"); + dm_multisnap_set_error(s, PTR_ERR(sb)); + return; + } + memset(sb, 0, s->chunk_size); + sb->signature = SB_SIGNATURE; + sb->chunk_size = cpu_to_le32(s->chunk_size); + sb->commit_block = cpu_to_le64(FIRST_CB_BLOCK); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->bufio); + if (r) { + DMERR("initialize_device: can't write super block"); + dm_multisnap_set_error(s, r); + return; + } +} + +static void load_commit_block(struct dm_multisnap *s) +{ + struct dm_buffer *bp; + struct multisnap_commit_block *cb; + chunk_t cb_addr = s->sb_commit_block; + __u64 sequence; + __u64 dev_size; + int bitmap_depth; + unsigned i; + s->valid_commit_block = 0; + s->commit_sequence = 0; + +try_next: + cb = dm_bufio_read(s->bufio, cb_addr, &bp); + if (IS_ERR(cb)) { + DMERR("load_commit_block: can't read commit block %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + if (cb->signature != CB_SIGNATURE) { + dm_bufio_release(bp); + DMERR("load_commit_block: bad signature on commit block %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + sequence = le64_to_cpu(cb->sequence); + dev_size = read_48(cb, dev_size); + + dm_bufio_release(bp); + + if (sequence > s->commit_sequence) { + s->commit_sequence 
= sequence; + s->valid_commit_block = cb_addr; + if ((__u64)cb_addr + CB_STRIDE < dev_size) { + cb_addr += CB_STRIDE; + goto try_next; + } + } + if (!s->valid_commit_block) { + DMERR("load_commit_block: no valid commit block"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + cb = dm_bufio_read(s->bufio, s->valid_commit_block, &bp); + if (IS_ERR(cb)) { + DMERR("load_commit_block: can't re-read commit block %Lx", (unsigned long long)s->valid_commit_block); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + if (cb->signature != CB_SIGNATURE) { + dm_bufio_release(bp); + DMERR("load_commit_block: bad signature when re-reading commit block %Lx", (unsigned long long)s->valid_commit_block); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + dev_size = read_48(cb, dev_size); + s->total_allocated = read_48(cb, total_allocated); + s->data_allocated = read_48(cb, data_allocated); + s->bitmap_root = read_48(cb, bitmap_root); + s->alloc_rover = read_48(cb, alloc_rover); + s->bt_root = read_48(cb, bt_root); + s->snapshot_num = le32_to_cpu(cb->snapshot_num); + s->bt_depth = cb->bt_depth; + + if (s->bt_depth > MAX_BT_DEPTH || !s->bt_depth) { + dm_bufio_release(bp); + DMERR("load_commit_block: invalid b+-tree depth in commit block %Lx", (unsigned long long)s->valid_commit_block); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + for (i = 0; i < TMP_REMAP_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->tmp_remap[i]); + s->n_used_tmp_remaps = 0; + INIT_LIST_HEAD(&s->used_bitmap_tmp_remaps); + INIT_LIST_HEAD(&s->used_bt_tmp_remaps); + INIT_LIST_HEAD(&s->free_tmp_remaps); + + for (i = 0; i < N_REMAPS; i++) { + struct tmp_remap *t = &s->tmp_remap_store[i]; + if (read_48(&cb->tmp_remap[i], old)) { + t->old = read_48(&cb->tmp_remap[i], old); + t->new = read_48(&cb->tmp_remap[i], new); + t->uncommitted = 0; + t->bitmap_idx = le32_to_cpu(cb->tmp_remap[i].bitmap_idx); + hlist_add_head(&t->hash_list, &s->tmp_remap[TMP_REMAP_HASH(t->old)]); + if (t->bitmap_idx == 
CB_BITMAP_IDX_NONE) + list_add(&t->list, &s->used_bt_tmp_remaps); + else + list_add(&t->list, &s->used_bitmap_tmp_remaps); + s->n_used_tmp_remaps++; + } else { + list_add(&t->list, &s->free_tmp_remaps); + } + } + + dm_bufio_release(bp); + + if ((dev_size + CB_STRIDE) != (chunk_t)(dev_size + CB_STRIDE)) { + DMERR("load_commit_block: device is too large. Compile kernel with 64-bit sector numbers"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + bitmap_depth = dm_multisnap_bitmap_depth(s->chunk_size, dev_size); + if (bitmap_depth < 0) { + DMERR("load_commit_block: device is too large"); + dm_multisnap_set_error(s, bitmap_depth); + return; + } + s->dev_size = dev_size; + s->bitmap_depth = bitmap_depth; + + return; +} + +DEFINE_SPINLOCK(dm_multisnap_bio_list_lock); + +static DEFINE_MUTEX(all_multisnapshots_lock); +static LIST_HEAD(all_multisnapshots); + +static struct dm_multisnap *find_multisnapshot(struct block_device *origin) +{ + struct dm_multisnap *s; + list_for_each_entry(s, &all_multisnapshots, list_all) + if (s->origin->bdev == origin) + return s; + return NULL; +} + +static struct dm_multisnap *create_instance(struct dm_dev *origin, struct dm_dev *snapshot, unsigned chunk_size, char **error) +{ + int r; + unsigned i; + struct dm_multisnap *s; + struct dm_buffer *bp; + struct multisnap_superblock *sb; + int initialized; + + mutex_lock(&all_multisnapshots_lock); + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + *error = "Could not allocate multisnapshot structure"; + r = -ENOMEM; + goto cant_allocate; + } + s->origin = origin; + s->snapshot = snapshot; + s->error = 0; + s->chunk_size = chunk_size; + s->chunk_shift = ffs(chunk_size) - 1; + INIT_LIST_HEAD(&s->all_snaps); + mutex_init(&s->master_lock); + INIT_WORK(&s->work, dm_multisnap_work); + bio_list_init(&s->bios); + atomic_set(&s->n_kcopyd_jobs, 0); + INIT_LIST_HEAD(&s->pes_waiting_for_commit); + s->active_snapshots = RB_ROOT; + s->pending_mempool_allocation_failed = 0; + s->n_preallocated_blocks 
= 0; + for (i = 0; i < PENDING_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->pending_hash[i]); + + s->tmp_chunk = kmalloc(s->chunk_size + sizeof(struct dm_multisnap_bt_entry), GFP_KERNEL); + if (!s->tmp_chunk) { + *error = "Could not allocate temporary chunk"; + r = -ENOMEM; + goto bad_tmp_chunk; + } + + s->pending_pool = mempool_create_slab_pool(PENDING_MEMPOOL_SIZE, dm_multisnap_pending_exception_cache); + if (!s->pending_pool) { + *error = "Could not allocate mempool for pending exceptions"; + r = -ENOMEM; + goto bad_pending_pool; + } + + r = dm_kcopyd_client_create(MULTISNAP_KCOPYD_PAGES, &s->kcopyd); + if (r) { + *error = "Could not create kcopyd client"; + goto bad_kcopyd; + } + + s->wq = create_singlethread_workqueue("kmultisnapd"); + if (!s->wq) { + *error = "Could not create kernel thread"; + r = -ENOMEM; + goto bad_thread; + } + + s->bufio = dm_bufio_client_create(s->snapshot->bdev, chunk_size); + if (IS_ERR(s->bufio)) { + *error = "Could not create bufio client"; + r = PTR_ERR(s->bufio); + goto bad_bufio; + } + + initialized = 0; +re_read: + sb = dm_bufio_read(s->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + *error = "Could not read superblock"; + r = PTR_ERR(sb); + goto bad_superblock; + } + if (sb->signature != SB_SIGNATURE) { + dm_bufio_release(bp); + if (initialized) { + *error = "Invalid signature after initialization"; + r = -EIO; + goto bad_superblock; + } + initialize_device(s); + if (dm_multisnap_has_error(s)) { + *error = "Can't initialize device"; + r = dm_multisnap_has_error(s); + goto bad_superblock; + } + initialized = 1; + goto re_read; + } + if (le32_to_cpu(sb->chunk_size) != chunk_size) { + *error = "Bad chunk size"; + r = -EINVAL; + dm_bufio_release(bp); + goto bad_superblock; + } + s->sb_commit_block = le64_to_cpu(sb->commit_block); + dm_bufio_release(bp); + + load_commit_block(s); + if (dm_multisnap_has_error(s)) { + *error = "Unable to load commit block"; + r = dm_multisnap_has_error(s); + goto bad_superblock; + } + + 
dm_multisnap_read_snapshots(s); + if (dm_multisnap_has_error(s)) { + *error = "Could not read snapshot list"; + r = dm_multisnap_has_error(s); + goto bad_superblock; + } + + list_add(&s->list_all, &all_multisnapshots); + + mutex_unlock(&all_multisnapshots_lock); + + return s; + +bad_superblock: + dm_bufio_client_destroy(s->bufio); +bad_bufio: + flush_workqueue(s->wq); + destroy_workqueue(s->wq); +bad_thread: + dm_kcopyd_client_destroy(s->kcopyd); +bad_kcopyd: + mempool_destroy(s->pending_pool); +bad_pending_pool: + kfree(s->tmp_chunk); +bad_tmp_chunk: + kfree(s); +cant_allocate: + mutex_unlock(&all_multisnapshots_lock); + + return ERR_PTR(r); +} + +static int multisnap_origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + const char *origin_path; + const char *snapshot_path; + char *chunk_size_str; + unsigned long chunk_size; + + struct dm_dev *origin, *snapshot; + + struct dm_multisnap *s; + + if (argc != 3) { + ti->error = "Requires exactly 3 arguments"; + r = -EINVAL; + goto bad_arguments; + } + + origin_path = argv[0]; + snapshot_path = argv[1]; + chunk_size_str = argv[2]; + chunk_size = simple_strtoul(chunk_size_str, &chunk_size_str, 10); + if (*chunk_size_str || chunk_size < 512 || chunk_size > PAGE_SIZE || chunk_size & (chunk_size - 1)) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad_arguments; + } + + r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &origin); + if (r) { + ti->error = "Could not get origin device"; + goto bad_origin; + } + + r = dm_get_device(ti, snapshot_path, 0, 0, FMODE_READ | FMODE_WRITE, &snapshot); + if (r) { + ti->error = "Could not get snapshot device"; + goto bad_snapshot; + } + + s = create_instance(origin, snapshot, chunk_size, &ti->error); + + if (IS_ERR(s)) { + r = PTR_ERR(s); + goto bad_instance; + } + + ti->private = s; + ti->split_io = chunk_size >> SECTOR_SHIFT; + + return 0; + +bad_instance: + dm_put_device(ti, snapshot); +bad_snapshot: + dm_put_device(ti, origin); 
+bad_origin: +bad_arguments: + return r; +} + +static void multisnap_origin_dtr(struct dm_target *ti) +{ + struct dm_multisnap *s = ti->private; + struct dm_multisnap_snap *sn; + unsigned i; + +#if 0 + /* test allocator */ + { + chunk_t result; + int r = dm_multisnap_alloc_block(s, &result); + if (r) printk("alloc error %d\n", r); + else printk("allocated block %Lx\n", (unsigned long long)result); + } + /* test btree-create */ + { + struct bt_key key = { 0x4567, 0x89, 0xab }; + int r = dm_multisnap_add_to_btree(s, &key, 0x1234); + printk("added: %d\n", r); + } + /* test btree-read */ + { + chunk_t result; + struct bt_key key = { 0x4567, 0x9a, 0x9a }; + int r = dm_multisnap_find_in_btree(s, &key, &result); + printk("read btree: %d, %Lx\n", r, (unsigned long long)result); + } +#endif + + mutex_lock(&all_multisnapshots_lock); + + /* Make sure that any more IOs won't be submitted by snapshot targets */ + list_for_each_entry(sn, &s->all_snaps, list_snaps) { + spin_lock(&dm_multisnap_bio_list_lock); + sn->s = NULL; + spin_unlock(&dm_multisnap_bio_list_lock); + } + list_del(&s->all_snaps); + + /* Wait for IOs on snapshots for this origin to finish */ +poll_for_ios: + spin_lock(&dm_multisnap_bio_list_lock); + if (!bio_list_empty(&s->bios)) { + spin_unlock(&dm_multisnap_bio_list_lock); + flush_workqueue(s->wq); + msleep(1); + goto poll_for_ios; + } + spin_unlock(&dm_multisnap_bio_list_lock); + + mutex_lock(&s->master_lock); + for (i = 0; i < PENDING_HASH_SIZE; i++) + if (!hlist_empty(&s->pending_hash[i])) { + mutex_unlock(&s->master_lock); + msleep(1); + goto poll_for_ios; + } + mutex_unlock(&s->master_lock); + + /* + * When the list is empty, flush the queue once again to make sure + * that the thread is idle. + */ + flush_workqueue(s->wq); + + /* + * And now it is guaranteed that there is nothing executing under us. 
+ */ + destroy_workqueue(s->wq); + s->wq = NULL; /* catch possible bugs */ + dm_kcopyd_client_destroy(s->kcopyd); + s->kcopyd = NULL; /* catch possible bugs */ + + dm_multisnap_commit(s); + + i = 0; + while (!list_empty(&s->used_bitmap_tmp_remaps)) { + struct tmp_remap *t = list_first_entry(&s->used_bitmap_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + hlist_del(&t->hash_list); + i++; + } + while (!list_empty(&s->used_bt_tmp_remaps)) { + struct tmp_remap *t = list_first_entry(&s->used_bt_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + hlist_del(&t->hash_list); + i++; + } + BUG_ON(i != s->n_used_tmp_remaps); + while (!list_empty(&s->free_tmp_remaps)) { + struct tmp_remap *t = list_first_entry(&s->free_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + i++; + } + BUG_ON(i != N_REMAPS); + + for (i = 0; i < TMP_REMAP_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&s->tmp_remap[i])); + + dm_bufio_client_destroy(s->bufio); + dm_multisnap_destroy_snapshot_tree(s); + dm_put_device(ti, s->origin); + dm_put_device(ti, s->snapshot); + + list_del(&s->list_all); + + mempool_destroy(s->pending_pool); + kfree(s->tmp_chunk); + kfree(s); + + mutex_unlock(&all_multisnapshots_lock); +} + +static int multisnap_origin_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct dm_multisnap *s = ti->private; + + /* do the most common case quickly */ + if (likely(bio_rw(bio) != WRITE)) { + bio->bi_bdev = s->origin->bdev; + return DM_MAPIO_REMAPPED; + } + + /* abuse bi_phys_segments field */ + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio->bi_phys_segments = SNAPID_T_ORIGIN; + + dm_multisnap_enqueue_bio(s, bio); + wakeup_kmultisnapd(s); + + return DM_MAPIO_SUBMITTED; +} + +static int multisnap_origin_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_multisnap *s = ti->private; + int r; + + mutex_lock(&s->master_lock); + + if (argc == 1 && !strnicmp(argv[0], MESG_STR("create"))) { + if ((r = 
dm_multisnap_has_error(s))) + goto unlock_ret; + + r = dm_multisnap_create_snapshot(s); + if (r) + goto unlock_ret; + + r = dm_multisnap_has_error(s); + goto unlock_ret; + } + + DMWARN("unrecognised message received."); + r = -EINVAL; + +unlock_ret: + mutex_unlock(&s->master_lock); + + return r; +} + +void dm_multisnap_adjust_string(char **result, unsigned *maxlen) +{ + unsigned len = strlen(*result); + *result += len; + *maxlen -= len; +} + +static int multisnap_origin_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) +{ + struct dm_multisnap *s = ti->private; + + mutex_lock(&s->master_lock); + + switch (type) { + case STATUSTYPE_INFO: + /* metadata/data/total */ + snprintf(result, maxlen, "%d %u %Lu %Lu %Lu", -dm_multisnap_has_error(s), s->snapshot_num ? s->snapshot_num - 1 : 0, (unsigned long long)s->dev_size, (unsigned long long)s->total_allocated, (unsigned long long)(s->total_allocated - s->data_allocated)); + dm_multisnap_adjust_string(&result, &maxlen); + dm_multisnap_snaps_status(s, result, maxlen); + dm_multisnap_adjust_string(&result, &maxlen); + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s %s %u", s->origin->name, s->snapshot->name, s->chunk_size); + dm_multisnap_adjust_string(&result, &maxlen); + break; + } + + mutex_unlock(&s->master_lock); + + /* If there's no space left in the buffer, ask for larger size */ + return maxlen <= 1; +} + +static int multisnap_snap_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + const char *origin_path; + char *snapid_str; + unsigned long snapid; + + struct dm_dev *origin; + + struct dm_multisnap *s; + struct dm_multisnap_snap *sn; + + if (argc != 2) { + ti->error = "Requires exactly 2 arguments"; + r = -EINVAL; + goto bad_arguments; + } + + origin_path = argv[0]; + snapid_str = argv[1]; + snapid = simple_strtoul(snapid_str, &snapid_str, 10); + if (*snapid_str || snapid > SNAPID_T_MAX) { + ti->error = "Invalid snapshot id"; + r = -EINVAL; + goto 
bad_arguments; + } + + r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &origin); + if (r) { + ti->error = "Could not get origin device"; + goto bad_origin; + } + mutex_lock(&all_multisnapshots_lock); + s = find_multisnapshot(origin->bdev); + if (!s) { + r = -ENXIO; + ti->error = "Origin target not loaded"; + goto origin_not_loaded; + } + + if (!dm_multisnap_test_snapshot(s, snapid)) { + r = -ENOENT; + ti->error = "Snapshot with this id doesn't exist"; + goto snapid_doesnt_exist; + } + + sn = kmalloc(sizeof(*sn), GFP_KERNEL); + if (!sn) { + ti->error = "Could not allocate multisnapshot_snap structure"; + r = -ENOMEM; + goto cant_allocate; + } + sn->s = s; + sn->snapid = snapid; + list_add(&sn->list_snaps, &s->all_snaps); + strlcpy(sn->origin_name, origin->name, sizeof sn->origin_name); + + mutex_unlock(&all_multisnapshots_lock); + + dm_put_device(ti, origin); + + ti->private = sn; + ti->split_io = s->chunk_size >> SECTOR_SHIFT; + + return 0; + +cant_allocate: +snapid_doesnt_exist: + dm_put_device(ti, origin); +origin_not_loaded: + mutex_unlock(&all_multisnapshots_lock); +bad_origin: +bad_arguments: + return r; +} + +static void multisnap_snap_dtr(struct dm_target *ti) +{ + struct dm_multisnap_snap *sn = ti->private; + + mutex_lock(&all_multisnapshots_lock); + + list_del(&sn->list_snaps); + kfree(sn); + + mutex_unlock(&all_multisnapshots_lock); +} + +static int multisnap_snap_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct dm_multisnap_snap *sn = ti->private; + struct dm_multisnap *s; + + /* abuse bi_phys_segments field */ + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio->bi_phys_segments = sn->snapid; + + spin_lock(&dm_multisnap_bio_list_lock); + s = sn->s; + if (!s) { + spin_unlock(&dm_multisnap_bio_list_lock); + return -EIO; + } + dm_multisnap_enqueue_bio_unlocked(s, bio); + spin_unlock(&dm_multisnap_bio_list_lock); + + wakeup_kmultisnapd(s); + + return DM_MAPIO_SUBMITTED; +} + +static int 
multisnap_snap_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) +{ + struct dm_multisnap_snap *sn = ti->private; + switch (type) { + case STATUSTYPE_INFO: + /* metadata/data/total */ + result[0] = 0; + dm_multisnap_adjust_string(&result, &maxlen); + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s %u", sn->origin_name, sn->snapid); + dm_multisnap_adjust_string(&result, &maxlen); + break; + } + /* If there's no space left in the buffer, ask for larger size */ + return maxlen <= 1; +} + +static struct target_type multisnap_origin_target = { + .name = "multisnapshot", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = multisnap_origin_ctr, + .dtr = multisnap_origin_dtr, + .map = multisnap_origin_map, + .message = multisnap_origin_message, + .status = multisnap_origin_status, +}; + +static struct target_type multisnap_snap_target = { + .name = "multisnap-snap", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = multisnap_snap_ctr, + .dtr = multisnap_snap_dtr, + .map = multisnap_snap_map, + .status = multisnap_snap_status, +}; + +static int __init dm_multisnapshot_init(void) +{ + int r; + + dm_multisnap_pending_exception_cache = kmem_cache_create( + "pending_cache", sizeof(struct dm_multisnap_pending_exception), + __alignof__(struct dm_multisnap_pending_exception), + 0, dm_multisnap_pending_exception_ctor); + if (!dm_multisnap_pending_exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad_exception_cache; + } + + r = dm_register_target(&multisnap_origin_target); + if (r < 0) { + DMERR("multisnap_origin_target target register failed %d", r); + goto bad_multisnap_origin_target; + } + + r = dm_register_target(&multisnap_snap_target); + if (r < 0) { + DMERR("multisnap_snap_target target register failed %d", r); + goto bad_multisnap_snap_target; + } + + return 0; + +bad_multisnap_snap_target: + dm_unregister_target(&multisnap_origin_target); +bad_multisnap_origin_target: + 
kmem_cache_destroy(dm_multisnap_pending_exception_cache); +bad_exception_cache: + return r; +} + +static void __exit dm_multisnapshot_exit(void) +{ + dm_unregister_target(&multisnap_origin_target); + dm_unregister_target(&multisnap_snap_target); + kmem_cache_destroy(dm_multisnap_pending_exception_cache); +} + +/* Module hooks */ +module_init(dm_multisnapshot_init); +module_exit(dm_multisnapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " multisnapshot target"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_LICENSE("GPL"); Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap.h 2008-12-01 20:01:02.000000000 +0100 @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#ifndef DM_MULTISNAP_H +#define DM_MULTISNAP_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-bio-list.h" + +#include "dm-multisnap-struct.h" + +#define EFSERROR EPERM + +#define DM_MSG_PREFIX "multisnapshot" + +#define TMP_REMAP_HASH_SIZE 256 +#define TMP_REMAP_HASH(c) ((c) & (TMP_REMAP_HASH_SIZE - 1)) + +#define PENDING_HASH_SIZE 256 +#define PENDING_HASH(c) ((c) & (PENDING_HASH_SIZE - 1)) + +#define MULTISNAP_KCOPYD_PAGES (((1UL << 20) >> PAGE_SHIFT) ? 
: 1) + +#define PENDING_MEMPOOL_SIZE 256 + + +#define SNAPID_T_SENTINEL 0xffffffff +#define SNAPID_T_ORIGIN 0xffffffff +#define SNAPID_T_MAX 0xfffffffe + +#define CHUNK_BITS 48 +#define CHUNK_T_SENTINEL ((chunk_t)(1LL << CHUNK_BITS) - 1) +#define CHUNK_T_SNAP_PRESENT ((chunk_t)(1LL << CHUNK_BITS) - 1) +#define CHUNK_T_MAX ((chunk_t)(1LL << CHUNK_BITS) - 2) + +typedef sector_t chunk_t; +typedef __u32 snapid_t; +typedef __u32 bitmap_t; + +#define read_48(struc, entry) (le32_to_cpu((struc)->entry##1) | ((chunk_t)le16_to_cpu((struc)->entry##2) << 31 << 1)) +#define write_48(struc, entry, val) do { (struc)->entry##1 = cpu_to_le32(val); (struc)->entry##2 = cpu_to_le16((chunk_t)(val) >> 31 >> 1); } while (0) + +struct tmp_remap { + /* List entry for tmp_remap */ + struct hlist_node hash_list; + /* List entry for used_tmp_remaps/free_tmp_remaps */ + struct list_head list; + chunk_t old; + chunk_t new; + bitmap_t bitmap_idx; + int uncommitted; +}; + +struct dm_multisnap { + struct dm_dev *origin; + struct dm_dev *snapshot; + struct dm_bufio_client *bufio; + + int error; + + chunk_t dev_size; + unsigned chunk_size; + unsigned chunk_shift; + unsigned bitmap_depth; + snapid_t snapshot_num; + unsigned char bt_depth; + + chunk_t bitmap_root; + chunk_t alloc_rover; + chunk_t bt_root; + chunk_t sb_commit_block; + chunk_t valid_commit_block; + + chunk_t total_allocated; + chunk_t data_allocated; + + __u64 commit_sequence; + + void *tmp_chunk; + + struct mutex master_lock; + struct workqueue_struct *wq; + struct work_struct work; + struct bio_list bios; + + struct rb_root active_snapshots; + + mempool_t *pending_pool; + + struct dm_kcopyd_client *kcopyd; + atomic_t n_kcopyd_jobs; + + /* This may only be accessed from kcopyd callback, it has no locking */ + struct list_head pes_waiting_for_commit; + + /* List heads for struct tmp_remap->list */ + unsigned n_used_tmp_remaps; + struct list_head used_bitmap_tmp_remaps; + struct list_head used_bt_tmp_remaps; + struct list_head 
free_tmp_remaps; + /* List head for struct tmp_remap->hash_list */ + struct hlist_head tmp_remap[TMP_REMAP_HASH_SIZE]; + struct tmp_remap tmp_remap_store[N_REMAPS]; + + /* List head for struct dm_multisnap_pending_exception->hash_list */ + struct hlist_head pending_hash[PENDING_HASH_SIZE]; + + int pending_mempool_allocation_failed; + + chunk_t preallocated_blocks[MAX_BITMAP_DEPTH * 2]; + unsigned n_preallocated_blocks; + + /* List entry for all_multisnapshots */ + struct list_head list_all; + + /* List head for struct dm_multisnap_snap->list_snaps */ + struct list_head all_snaps; +}; + +struct dm_multisnap_snap { + struct dm_multisnap *s; + snapid_t snapid; + /* List entry for struct dm_multisnap->list_all */ + struct list_head list_snaps; + char origin_name[16]; +}; + +struct bt_key { + chunk_t chunk; + snapid_t snap_from; + snapid_t snap_to; +}; + +struct path_element { + chunk_t block; + unsigned idx; +}; + +struct dm_multisnap_pending_exception { + struct dm_multisnap *s; + struct bt_key key; + struct bio_list bios; + + /* List entry for struct dm_multisnap->pending_hash */ + struct hlist_node hash_list; + + /* Link when more exceptions are copied with one kcopyd call */ + struct dm_multisnap_pending_exception *link; + + /* List entry for struct dm_multisnap->pes_waiting_for_commit */ + struct list_head list; +}; + + +static inline chunk_t sector_to_chunk(struct dm_multisnap *s, sector_t sector) +{ + return sector >> (s->chunk_shift - SECTOR_SHIFT); +} + +static inline sector_t chunk_to_sector(struct dm_multisnap *s, chunk_t chunk) +{ + return chunk << (s->chunk_shift - SECTOR_SHIFT); +} + + +extern spinlock_t dm_multisnap_bio_list_lock; + +static inline void wakeup_kmultisnapd(struct dm_multisnap *s) +{ + queue_work(s->wq, &s->work); +} + +static inline void dm_multisnap_enqueue_bio_unlocked(struct dm_multisnap *s, struct bio *bio) +{ + bio_list_add(&s->bios, bio); +} + +static inline void dm_multisnap_enqueue_bio(struct dm_multisnap *s, struct bio *bio) +{ + 
spin_lock(&dm_multisnap_bio_list_lock); + dm_multisnap_enqueue_bio_unlocked(s, bio); + spin_unlock(&dm_multisnap_bio_list_lock); +} + +/* dm-multisnap.c */ + +void dm_multisnap_set_error(struct dm_multisnap *s, int error); +int dm_multisnap_has_error(struct dm_multisnap *s); + +void *dm_multisnap_read_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp); +void *dm_multisnap_duplicate_block(struct dm_multisnap *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp); +void dm_multisnap_delete_remapped_block(struct dm_multisnap *s, chunk_t chunk); +void dm_multisnap_free_tmp_remap(struct dm_multisnap *s, struct tmp_remap *t); +void *dm_multisnap_make_block(struct dm_multisnap *s, chunk_t new_chunk, struct dm_buffer **bp); +int dm_multisnap_block_is_uncommitted(struct dm_multisnap *s, chunk_t block); + +int dm_multisnap_is_cb_block(struct dm_multisnap *s, chunk_t block); + +void dm_multisnap_adjust_string(char **result, unsigned *maxlen); + +/* dm-multisnap-alloc.c */ + +void dm_multisnap_create_bitmaps(struct dm_multisnap *s, chunk_t start); +int dm_multisnap_alloc_blocks(struct dm_multisnap *s, chunk_t *results, unsigned n_blocks); +void *dm_multisnap_alloc_duplicate_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp, void *ptr); +void *dm_multisnap_alloc_make_block(struct dm_multisnap *s, chunk_t *result, struct dm_buffer **bp); +void dm_multisnap_free_block_immediate(struct dm_multisnap *s, chunk_t block); +void dm_multisnap_free_block(struct dm_multisnap *s, chunk_t block); +void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap); + +/* dm-multisnap-btree.c */ + +void dm_multisnap_create_btree(struct dm_multisnap *s, chunk_t *start); +int dm_multisnap_find_in_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t *result); +void dm_multisnap_add_to_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t new_chunk); +void 
dm_multisnap_bt_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap); + +/* dm-multisnap-commit.c */ + +void dm_multisnap_transaction_mark(struct dm_multisnap *s); +void dm_multisnap_commit(struct dm_multisnap *s); + +/* dm-multisnap-io.c */ + +extern struct kmem_cache *dm_multisnap_pending_exception_cache; +void dm_multisnap_pending_exception_ctor(void *); + +void dm_multisnap_work(struct work_struct *work); + +/* dm-multisnap-snaps.c */ + +int dm_multisnap_test_snapshot(struct dm_multisnap *s, snapid_t id); +int dm_multisnap_find_next_snapid_range(struct dm_multisnap *s, snapid_t id, snapid_t *from, snapid_t *to); + +void dm_multisnap_destroy_snapshot_tree(struct dm_multisnap *s); +void dm_multisnap_read_snapshots(struct dm_multisnap *s); +int dm_multisnap_create_snapshot(struct dm_multisnap *s); + +void dm_multisnap_snaps_status(struct dm_multisnap *s, char *result, unsigned maxlen); + +#endif Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-struct.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-struct.h 2008-11-27 00:06:45.000000000 +0100 @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_MULTISNAP_STRUCT_H +#define DM_MULTISNAP_STRUCT_H + +#include +#include + +#define SB_BLOCK 0 + +#define SB_SIGNATURE cpu_to_be32(0xF6015342) + +struct multisnap_superblock { + __u32 signature; + __u32 chunk_size; + __u64 commit_block; +}; + +#define FIRST_CB_BLOCK 1 +#define CB_STRIDE 1024 + +#define CB_SIGNATURE cpu_to_be32(0xF6014342) + +struct commit_block_tmp_remap { + __u32 old1; + __u16 old2; + __u16 new2; + __u32 new1; + __u32 bitmap_idx; +}; + +#define CB_BITMAP_IDX_RESERVED 0xffffffff +#define CB_BITMAP_IDX_NONE 0xffffffff + +#define N_REMAPS 28 + +struct multisnap_commit_block { + __u32 signature; + __u32 snapshot_num; + + __u64 sequence; + + __u32 dev_size1; + __u16 dev_size2; + __u16 total_allocated2; + + __u32 total_allocated1; + __u32 data_allocated1; + + __u16 data_allocated2; + __u16 bitmap_root2; + __u32 bitmap_root1; + + __u32 alloc_rover1; + __u16 alloc_rover2; + __u16 bt_root2; + + __u32 bt_root1; + __u8 bt_depth; + + __u8 pad[11]; + + struct commit_block_tmp_remap tmp_remap[N_REMAPS]; +}; + +#define MAX_BITMAP_DEPTH 6 + +static inline int dm_multisnap_bitmap_depth(unsigned chunk_size, __u64 device_size) +{ + unsigned depth = 0; + __u64 entries = chunk_size * 8; + while (entries < device_size) { + depth++; + entries *= chunk_size / 8; + if (!entries) + return -ERANGE; + } + + if (depth > MAX_BITMAP_DEPTH) + return -ERANGE; + + return depth; +} + +/* B+-tree entry. 
Sorted by orig_chunk and snap_from/to */ + +#define MAX_BT_DEPTH 12 + +struct dm_multisnap_bt_entry { + __u32 orig_chunk1; + __u16 orig_chunk2; + __u16 new_chunk2; + __u32 new_chunk1; + __u32 snap_from; + __u32 snap_to; +}; + +#define BT_SIGNATURE cpu_to_be32(0xF6014254) + +struct dm_multisnap_bt_node { + __u32 signature; + __u16 n_entries; + __u16 pad1; + struct dm_multisnap_bt_entry entries[0]; +}; + +static inline unsigned dm_multisnap_btree_entries(unsigned chunk_size) +{ + return (chunk_size - sizeof(struct dm_multisnap_bt_node)) / sizeof(struct dm_multisnap_bt_entry); +} + +#endif Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-alloc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-alloc.c 2008-11-27 00:21:33.000000000 +0100 @@ -0,0 +1,384 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap.h" + +#define rshift_roundup(val, bits) (((val) + ((chunk_t)1 << (bits)) - 1) >> (bits)) + +#define BITS_PER_BYTE_SHIFT 3 +#define BYTES_PER_POINTER_SHIFT 3 + +void dm_multisnap_create_bitmaps(struct dm_multisnap *s, chunk_t writing_block) +{ + int r; + struct dm_buffer *bp; + chunk_t direct_bitmap_blocks, total_bitmap_blocks, total_preallocated_blocks; + chunk_t lower_depth_block; + unsigned i, d; + chunk_t ii; + + r = dm_multisnap_bitmap_depth(s->chunk_size, s->dev_size); + if (r < 0) { + DMERR("dm_multisnap_create_bitmaps: device is too large"); + dm_multisnap_set_error(s, r); + return; + } + s->bitmap_depth = r; + + direct_bitmap_blocks = rshift_roundup(s->dev_size, s->chunk_shift + BITS_PER_BYTE_SHIFT); + + if (direct_bitmap_blocks >= CB_BITMAP_IDX_RESERVED) { + DMERR("dm_multisnap_create_bitmaps: device is too large"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + + total_bitmap_blocks = 0; + for (i = 0; i <= s->bitmap_depth; i++) { + unsigned shift = (s->chunk_shift - BYTES_PER_POINTER_SHIFT) * i; + total_bitmap_blocks += rshift_roundup(direct_bitmap_blocks, shift); + } + total_preallocated_blocks = writing_block + total_bitmap_blocks; + for (ii = 0; ii < total_preallocated_blocks; ii++) { + if (dm_multisnap_is_cb_block(s, ii)) + total_preallocated_blocks++; + } + + if (total_preallocated_blocks >= s->dev_size) { + DMERR("dm_multisnap_create_bitmaps: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + +/* Write direct bitmap blocks */ + + lower_depth_block = writing_block; + for (ii = 0; ii < direct_bitmap_blocks; ii++, writing_block++) { + void *bmp; + while (dm_multisnap_is_cb_block(s, writing_block)) + writing_block++; + bmp = dm_bufio_new(s->bufio, writing_block, &bp); + if (IS_ERR(bmp)) { + DMERR("dm_multisnap_create_bitmaps: can't create direct bitmap block at %Lx", (unsigned long long)writing_block); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return; + } + memset(bmp, 0, s->chunk_size); 
+ for (i = 0; i < s->chunk_size << BITS_PER_BYTE_SHIFT; i++) { + chunk_t block_to_test = (ii << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | i; + if (block_to_test < total_preallocated_blocks || block_to_test >= s->dev_size || dm_multisnap_is_cb_block(s, block_to_test)) { + generic___set_le_bit(i, bmp); + s->total_allocated++; + } + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + +/* Write indirect bitmap blocks */ + + for (d = 1; d <= s->bitmap_depth; d++) { + chunk_t this_depth_block = writing_block; + for (ii = 0; ii < rshift_roundup(direct_bitmap_blocks, d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)); ii++, writing_block++) { + __u64 *bmp; + while (dm_multisnap_is_cb_block(s, writing_block)) + writing_block++; + bmp = dm_bufio_new(s->bufio, writing_block, &bp); + if (IS_ERR(bmp)) { + DMERR("dm_multisnap_create_bitmaps: can't create indirect bitmap block at %Lx", (unsigned long long)writing_block); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return; + } + for (i = 0; i < s->chunk_size >> BYTES_PER_POINTER_SHIFT; i++) { + if (((ii << d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)) | (i << (d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) >= direct_bitmap_blocks) { + bmp[i] = cpu_to_le64(0); + continue; + } + while (dm_multisnap_is_cb_block(s, lower_depth_block)) + lower_depth_block++; + bmp[i] = cpu_to_le64(lower_depth_block); + lower_depth_block++; + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + lower_depth_block = this_depth_block; + } + + s->bitmap_root = writing_block - 1; + + return; +} + +static void *map_bitmap(struct dm_multisnap *s, bitmap_t bitmap, struct dm_buffer **bp, chunk_t *block, struct path_element *path) +{ + __u64 *bmp; + unsigned idx; + unsigned d = s->bitmap_depth; + *block = s->bitmap_root; + while (1) { + chunk_t new_block; + bmp = dm_multisnap_read_block(s, *block, bp); + if (unlikely(IS_ERR(bmp))) { + DMERR("map_bitmap: can't read bitmap at %Lx, depth %d/%d, index %Lx", (unsigned long 
long)*block, s->bitmap_depth - d, s->bitmap_depth, (unsigned long long)bitmap); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return NULL; + } + if (!d) + return bmp; + + idx = (bitmap >> ((d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) & ((s->chunk_size - 1) >> BYTES_PER_POINTER_SHIFT); + + if (unlikely(path != NULL)) { + path[s->bitmap_depth - d].block = *block; + path[s->bitmap_depth - d].idx = idx; + } + + new_block = le64_to_cpu(bmp[idx]); + + dm_bufio_release(*bp); + if (!new_block) { + DMERR("map_bitmap: accessing bitmap out of range, bitmap %x", bitmap); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + *block = new_block; + + d--; + } +} + +static int find_bit(const void *bmp, unsigned start, unsigned end, int wide_search) +{ + const void *p; + unsigned bit; + if (unlikely(start >= end)) + return -ENOSPC; + if (likely(!generic_test_le_bit(start, bmp))) + return start; + if (likely(wide_search)) { + p = memchr(bmp + (start >> 3), 0, (end >> 3) - (start >> 3)); + if (p) { + bit = ((const __u8 *)p - (const __u8 *)bmp) << 3; + while (bit > start && !generic_test_le_bit(bit - 1, bmp)) + bit--; + goto ret_bit; + } + } + bit = generic_find_next_zero_le_bit(bmp, end, start); + ret_bit: + if (bit >= end) + return -ENOSPC; + return bit; +} + +int dm_multisnap_alloc_blocks(struct dm_multisnap *s, chunk_t *results, unsigned n_blocks) +{ + void *bmp; + struct dm_buffer *bp; + chunk_t block; + int wrap_around = 0; + int start_bit; + int wide_search; + int i; + bitmap_t bitmap_no; + + bitmap_no = s->alloc_rover >> (s->chunk_shift + BITS_PER_BYTE_SHIFT); +next_bitmap: + bmp = map_bitmap(s, bitmap_no, &bp, &block, NULL); + if (unlikely(!bmp)) + return -1; + + wide_search = 1; +find_again: + start_bit = s->alloc_rover & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1); + + for (i = 0; i < n_blocks; i++) { + int bit = find_bit(bmp, start_bit, s->chunk_size << BITS_PER_BYTE_SHIFT, wide_search); + if (unlikely(bit < 0)) { +bit_find_failed: + if (i && wide_search) 
{ + wide_search = 0; + goto find_again; + } + dm_bufio_release(bp); + s->alloc_rover = (chunk_t)++bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT); + if (unlikely(s->alloc_rover >= s->dev_size)) { + s->alloc_rover = 0; + bitmap_no = 0; + wrap_around++; + if (wrap_around >= 2) { + DMERR("snapshot overflow"); + dm_multisnap_set_error(s, -ENOSPC); + return -1; + } + } + goto next_bitmap; + } + results[i] = ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit; + start_bit = bit + 1; + } + + if (!dm_multisnap_block_is_uncommitted(s, block)) { + int bit = find_bit(bmp, start_bit, s->chunk_size << BITS_PER_BYTE_SHIFT, wide_search); + if (unlikely(bit < 0)) + goto bit_find_failed; + + bmp = dm_multisnap_duplicate_block(s, block, ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit, bitmap_no, &bp); + if (unlikely(!bmp)) + return -1; + + generic___set_le_bit(bit, bmp); + s->total_allocated++; + start_bit = bit + 1; + } + + for (i = 0; i < n_blocks; i++) + generic___set_le_bit(results[i] & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp); + s->total_allocated += n_blocks; + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + s->alloc_rover = (s->alloc_rover & ~(chunk_t)((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1)) + start_bit; + if (unlikely(s->alloc_rover >= s->dev_size)) + s->alloc_rover = 0; + + return 0; +} + +void *dm_multisnap_alloc_duplicate_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp, void *ptr) +{ + int r; + chunk_t new_chunk; + void *data; + + if (dm_multisnap_block_is_uncommitted(s, block)) + return ptr; + + dm_bufio_release(*bp); + + r = dm_multisnap_alloc_blocks(s, &new_chunk, 1); + if (r) + return NULL; + + data = dm_multisnap_read_block(s, block, bp); + if (!data) + return NULL; + + return dm_multisnap_duplicate_block(s, block, new_chunk, CB_BITMAP_IDX_NONE, bp); +} + +void *dm_multisnap_alloc_make_block(struct dm_multisnap *s, chunk_t *result, struct dm_buffer **bp) +{ + int r = 
dm_multisnap_alloc_blocks(s, result, 1); + if (unlikely(r < 0)) + return NULL; + + return dm_multisnap_make_block(s, *result, bp); +} + +void dm_multisnap_free_block_immediate(struct dm_multisnap *s, chunk_t block) +{ + void *bmp; + struct dm_buffer *bp; + bitmap_t bitmap_no; + + if (unlikely(block >= s->dev_size)) { + DMERR("dm_multisnap_free_block_immediate: freeing invalid block %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + if (block + 1 == s->alloc_rover) + s->alloc_rover = block; + + bitmap_no = block >> (s->chunk_shift + BITS_PER_BYTE_SHIFT); + + bmp = map_bitmap(s, bitmap_no, &bp, &block, NULL); + if (!bmp) + return; + + generic___clear_le_bit(block & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp); + s->total_allocated--; + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + + +void dm_multisnap_free_block(struct dm_multisnap *s, chunk_t block) +{ + /* !!! FIXME: reclaim space */ +} + +void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap) +{ + chunk_t block; + struct dm_buffer *bp; + __u64 *new_block; + struct path_element path[MAX_BITMAP_DEPTH]; + int results_ptr; + + chunk_t new_blockn; + int i; + + /* + * Preallocate twice the required amount of blocks, so that resolving + * the next tmp_remap (created here, in dm_multisnap_alloc_blocks) + * doesn't have to allocate anything. 
+ */ + if (s->n_preallocated_blocks < s->bitmap_depth) { + if (unlikely(dm_multisnap_alloc_blocks(s, s->preallocated_blocks + s->n_preallocated_blocks, s->bitmap_depth * 2 - s->n_preallocated_blocks) < 0)) + return; + s->n_preallocated_blocks = s->bitmap_depth * 2; + } + results_ptr = 0; + + new_block = map_bitmap(s, tmp_remap->bitmap_idx, &bp, &block, path); + if (unlikely(!new_block)) + return; + + dm_bufio_release(bp); + + new_blockn = tmp_remap->new; + for (i = s->bitmap_depth - 1; i >= 0; i--) { + chunk_t block_to_free; + int remapped = 0; + __u64 *bmp = dm_multisnap_read_block(s, path[i].block, &bp); + if (unlikely(IS_ERR(bmp))) + return; + + if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) { + remapped = 1; + dm_bufio_release_move(bp, s->preallocated_blocks[results_ptr]); + bmp = dm_multisnap_read_block(s, s->preallocated_blocks[results_ptr], &bp); + if (unlikely(IS_ERR(bmp))) + return; + /* !!! FIXME: add to a list of newly allocated blocks */ + } + + block_to_free = le64_to_cpu(bmp[path[i].idx]); + bmp[path[i].idx] = cpu_to_le64(new_blockn); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + dm_multisnap_free_block(s, block_to_free); + + if (!remapped) + goto skip_it; + new_blockn = s->preallocated_blocks[results_ptr]; + results_ptr++; + } + + dm_multisnap_free_block(s, s->bitmap_root); + s->bitmap_root = new_blockn; + +skip_it: + memmove(s->preallocated_blocks, s->preallocated_blocks + results_ptr, (s->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t)); +} Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-commit.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-commit.c 2008-11-27 00:06:36.000000000 +0100 @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap.h" + +static void dm_multisnap_finalize_tmp_remaps(struct dm_multisnap *s) +{ + struct tmp_remap *t; + int i; + + while (s->n_used_tmp_remaps) { + if (dm_multisnap_has_error(s)) + return; + if (s->n_used_tmp_remaps < N_REMAPS - 1) { +/* prefer btree remaps ... if there are none, do bitmap remaps */ + if (!list_empty(&s->used_bt_tmp_remaps)) { + t = container_of(s->used_bt_tmp_remaps.next, struct tmp_remap, list); + dm_multisnap_bt_finalize_tmp_remap(s, t); + dm_multisnap_free_tmp_remap(s, t); + continue; + } + } + +/* else: 0 or 1 free remaps : finalize bitmaps */ + if (!list_empty(&s->used_bitmap_tmp_remaps)) { + t = container_of(s->used_bitmap_tmp_remaps.next, struct tmp_remap, list); + dm_multisnap_bitmap_finalize_tmp_remap(s, t); + dm_multisnap_free_tmp_remap(s, t); + continue; + } else { + DMERR("dm_multisnap_finalize_tmp_remaps: no bitmap tmp remaps, n_used_tmp_remaps %u", s->n_used_tmp_remaps); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + } + + if (dm_multisnap_has_error(s)) + return; + + for (i = s->n_preallocated_blocks - 1; i >= 0; i--) + dm_multisnap_free_block_immediate(s, s->preallocated_blocks[i]); + s->n_preallocated_blocks = 0; +} + +void dm_multisnap_transaction_mark(struct dm_multisnap *s) +{ + /* + * Accounting: + * max number of modified/allocated blocks during btree add: + * s->bt_depth * 2 + 1 + * one additional entry for newly allocated data chunk + * one additional entry for bitmap finalization + */ + if (unlikely(N_REMAPS - s->n_used_tmp_remaps < s->bt_depth * 2 + 3)) + dm_multisnap_finalize_tmp_remaps(s); +} + +void dm_multisnap_commit(struct dm_multisnap *s) +{ + struct tmp_remap *t; + chunk_t cb_addr; + chunk_t cb_div, cb_offset; + struct multisnap_commit_block *cb; + struct multisnap_superblock *sb; + unsigned idx; + struct dm_buffer *bp; + int r; + + if (dm_multisnap_has_error(s)) { + /* !!! 
FIXME: write error to superblock */ + return; + } + + list_for_each_entry(t, &s->used_bitmap_tmp_remaps, list) + t->uncommitted = 0; + + list_for_each_entry(t, &s->used_bt_tmp_remaps, list) + t->uncommitted = 0; + + if (unlikely((r = dm_bufio_write_dirty_buffers(s->bufio)) < 0)) { + DMERR("dm_multisnap_commit: error writing data"); + dm_multisnap_set_error(s, r); + return; + } + + cb_addr = s->alloc_rover; + + if (cb_addr < FIRST_CB_BLOCK) + cb_addr = FIRST_CB_BLOCK; + cb_div = cb_addr - FIRST_CB_BLOCK; + cb_offset = sector_div(cb_div, CB_STRIDE); + cb_addr += CB_STRIDE - cb_offset; + if (cb_offset < CB_STRIDE / 2 || cb_addr >= s->dev_size) + cb_addr -= CB_STRIDE; + + cb = dm_bufio_new(s->bufio, cb_addr, &bp); + if (IS_ERR(cb)) { + DMERR("dm_multisnap_commit: can't allocate new commit block at %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + + s->commit_sequence++; + + cb->signature = CB_SIGNATURE; + cb->snapshot_num = cpu_to_le32(s->snapshot_num); + cb->sequence = cpu_to_le64(s->commit_sequence); + write_48(cb, dev_size, s->dev_size); + write_48(cb, total_allocated, s->total_allocated); + write_48(cb, data_allocated, s->data_allocated); + write_48(cb, bitmap_root, s->bitmap_root); + write_48(cb, alloc_rover, s->alloc_rover); + write_48(cb, bt_root, s->bt_root); + cb->bt_depth = s->bt_depth; + memset(cb->pad, 0, sizeof cb->pad); + idx = 0; + list_for_each_entry(t, &s->used_bitmap_tmp_remaps, list) { + BUG_ON(idx >= N_REMAPS); + write_48(&cb->tmp_remap[idx], old, t->old); + write_48(&cb->tmp_remap[idx], new, t->new); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx); + idx++; + } + list_for_each_entry(t, &s->used_bt_tmp_remaps, list) { + BUG_ON(idx >= N_REMAPS); + write_48(&cb->tmp_remap[idx], old, t->old); + write_48(&cb->tmp_remap[idx], new, t->new); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx); + idx++; + } + for (; idx < N_REMAPS; idx++) { + write_48(&cb->tmp_remap[idx], old, 0); + 
write_48(&cb->tmp_remap[idx], new, 0); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(0); + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->bufio); + if (unlikely(r < 0)) { + DMERR("dm_multisnap_commit: can't write commit block at %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, r); + return; + } + + if (likely(cb_addr == s->valid_commit_block) || + likely(cb_addr == s->valid_commit_block + CB_STRIDE)) + goto return_success; + + sb = dm_bufio_read(s->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + DMERR("dm_multisnap_commit: can't read super block"); + dm_multisnap_set_error(s, PTR_ERR(sb)); + return; + } + + if (unlikely(sb->signature != SB_SIGNATURE)) { + dm_bufio_release(bp); + DMERR("dm_multisnap_commit: invalid super block signature when committing"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + sb->commit_block = cpu_to_le64(cb_addr); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->bufio); + if (unlikely(r < 0)) { + DMERR("dm_multisnap_commit: can't write super block"); + dm_multisnap_set_error(s, r); + return; + } + +return_success: + s->valid_commit_block = cb_addr; + return; +} Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-btree.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-btree.c 2008-11-26 22:08:19.000000000 +0100 @@ -0,0 +1,409 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap.h" + + +static void add_at_idx(struct dm_multisnap_bt_node *node, unsigned index, struct bt_key *key, chunk_t new_chunk); + +static struct dm_multisnap_bt_node *dm_multisnap_read_btnode(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp) +{ + struct dm_multisnap_bt_node *node; + + node = dm_multisnap_read_block(s, block, bp); + if (unlikely(!node)) + return NULL; + + if (unlikely(node->signature != BT_SIGNATURE)) { + dm_bufio_release(*bp); + DMERR("dm_multisnap_read_btnode: bad signature on btree node %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + + if (unlikely((unsigned)(le16_to_cpu(node->n_entries) - 1) >= dm_multisnap_btree_entries(s->chunk_size))) { + dm_bufio_release(*bp); + DMERR("dm_multisnap_read_btnode: bad number of entries in btree node %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + + return node; +} + +void dm_multisnap_create_btree(struct dm_multisnap *s, chunk_t *writing_block) +{ + struct dm_buffer *bp; + struct dm_multisnap_bt_node *node; + struct bt_key new_key; + + while (dm_multisnap_is_cb_block(s, *writing_block)) + (*writing_block)++; + + if (*writing_block >= s->dev_size) { + DMERR("dm_multisnap_create_btree: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + + node = dm_bufio_new(s->bufio, *writing_block, &bp); + if (IS_ERR(node)) { + DMERR("dm_multisnap_create_btree: can't create direct bitmap block at %Lx", (unsigned long long)*writing_block); + dm_multisnap_set_error(s, PTR_ERR(node)); + return; + } + memset(node, 0, s->chunk_size); + node->signature = BT_SIGNATURE; + node->n_entries = cpu_to_le16(0); + + /* + * A btree node must have at least one entry --- so create this empty + * one + */ + new_key.snap_from = new_key.snap_to = SNAPID_T_SENTINEL; + new_key.chunk = CHUNK_T_SENTINEL; + add_at_idx(node, 0, &new_key, 0); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + 
s->bt_root = *writing_block; + s->bt_depth = 1; + (*writing_block)++; +} + +static int compare_key(struct dm_multisnap_bt_entry *e, struct bt_key *key) +{ + chunk_t orig_chunk = read_48(e, orig_chunk); + if (orig_chunk < key->chunk) + return -1; + if (orig_chunk > key->chunk) + return 1; + + if (le32_to_cpu(e->snap_to) < key->snap_from) + return -1; + if (le32_to_cpu(e->snap_from) > key->snap_to) + return 1; + + return 0; +} + +/* + * Returns: 0 - found, 1 - not found + * *result - if found, then the first entry in the requested range + * - if not found, then the first entry after the requested range + */ + +static int binary_search(struct dm_multisnap_bt_node *node, struct bt_key *key, unsigned *result) +{ + int c; + int first = 0; + int last = le16_to_cpu(node->n_entries) - 1; + + while (1) { + int middle = (first + last) >> 1; + struct dm_multisnap_bt_entry *e = &node->entries[middle]; + + c = compare_key(e, key); + + if (first == last) + break; + + if (c < 0) + first = middle + 1; + else + last = middle; + } + + *result = first; + return !!c; +} + +/* + * Returns: 0 - found, 1 - not found, -1 - error + * In case of not error (0 or 1 is returned), the node and held buffer for + * this node is returned. 
+ */ + +static int walk_btree(struct dm_multisnap *s, struct bt_key *key, struct dm_multisnap_bt_node **nodep, struct dm_buffer **bp, struct path_element path[MAX_BT_DEPTH]) +{ +#define node (*nodep) + int r; + chunk_t block = s->bt_root; + unsigned d = 0; + while (1) { + path[d].block = block; + node = dm_multisnap_read_btnode(s, block, bp); + if (!node) + return -1; + if (d != s->bt_depth - 1) { + struct dm_multisnap_bt_entry *be = &node->entries[le16_to_cpu(node->n_entries) - 1]; + if (unlikely(read_48(be, orig_chunk) != CHUNK_T_SENTINEL) || + unlikely(le32_to_cpu(be->snap_from) != SNAPID_T_SENTINEL) || + unlikely(le32_to_cpu(be->snap_to) != SNAPID_T_SENTINEL)) { + dm_bufio_release(*bp); + DMERR("walk_btree: node at %Lx in depth %d doesn't have sentinel record, search for %Lx, %x-%x", (unsigned long long)block, d, (unsigned long long)key->chunk, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + return -1; + } + } + r = binary_search(node, key, &path[d].idx); + block = read_48(&node->entries[path[d].idx], new_chunk); + if (++d == s->bt_depth) + break; + dm_bufio_release(*bp); + } + if (unlikely(compare_key(&node->entries[path[s->bt_depth - 1].idx], key) < 0)) + path[s->bt_depth - 1].idx++; + return r; +#undef node +} + +int dm_multisnap_find_in_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t *result) +{ + struct dm_multisnap_bt_node *node; + struct path_element path[MAX_BT_DEPTH]; + struct dm_buffer *bp; + + int r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r < 0)) + return r; + + if (!r) { + struct dm_multisnap_bt_entry *entry = &node->entries[path[s->bt_depth - 1].idx]; + *result = read_48(entry, new_chunk); + key->chunk = read_48(entry, orig_chunk); + key->snap_from = le32_to_cpu(entry->snap_from); + key->snap_to = le32_to_cpu(entry->snap_to); + } + dm_bufio_release(bp); + + return r; +} + +static void add_at_idx(struct dm_multisnap_bt_node *node, unsigned index, struct bt_key *key, chunk_t new_chunk) +{ + 
memmove(&node->entries[index + 1], &node->entries[index], (le16_to_cpu(node->n_entries) - index) * sizeof(struct dm_multisnap_bt_entry)); + write_48(&node->entries[index], orig_chunk, key->chunk); + if (sizeof(chunk_t) == 4 && unlikely(key->chunk > CHUNK_T_MAX)) + node->entries[index].orig_chunk2 = cpu_to_le16(0xffff); + write_48(&node->entries[index], new_chunk, new_chunk); + node->entries[index].snap_from = cpu_to_le32(key->snap_from); + node->entries[index].snap_to = cpu_to_le32(key->snap_to); + node->n_entries = cpu_to_le16(le16_to_cpu(node->n_entries) + 1); +} + +static void middle_key(struct dm_multisnap_bt_entry *e1, struct dm_multisnap_bt_entry *e2, struct bt_key *result) +{ + __u32 snap1, snap2; + chunk_t chunk1 = read_48(e1, orig_chunk); + chunk_t chunk2 = read_48(e2, orig_chunk); + result->chunk = (chunk1 + chunk2) >> 1; + if (chunk1 != chunk2) { + result->snap_from = result->snap_to = (snapid_t)-1; + } else { + snap1 = le32_to_cpu(e1->snap_to); + snap2 = le32_to_cpu(e2->snap_from); + result->snap_from = result->snap_to = (snap1 + snap2) >> 1; + } +} + +void dm_multisnap_add_to_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t new_chunk) +{ + struct dm_multisnap_bt_node *node; + struct dm_buffer *bp; + struct path_element path[MAX_BT_DEPTH]; + unsigned depth; + + unsigned split_entries, split_index, split_offset, split_size; + struct bt_key new_key; + chunk_t new_root; + + int r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r != 1)) { + if (r >= 0) { + dm_bufio_release(bp); + DMERR("dm_multisnap_add_to_btree: adding key that already exists: %Lx, %x-%x", (unsigned long long)key->chunk, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + } + return; + } + + depth = s->bt_depth - 1; + +go_up: + node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node); + if (unlikely(!node)) + return; + + if (likely(le16_to_cpu(node->n_entries) < dm_multisnap_btree_entries(s->chunk_size))) { + add_at_idx(node, 
path[depth].idx, key, new_chunk); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + return; + } + memcpy(s->tmp_chunk, node, s->chunk_size); + add_at_idx(s->tmp_chunk, path[depth].idx, key, new_chunk); + + split_entries = le16_to_cpu(((struct dm_multisnap_bt_node *)s->tmp_chunk)->n_entries); + split_index = split_entries / 2; + split_offset = sizeof(struct dm_multisnap_bt_node) + split_index * sizeof(struct dm_multisnap_bt_entry); + split_size = sizeof(struct dm_multisnap_bt_node) + split_entries * sizeof(struct dm_multisnap_bt_entry); + memcpy(node, s->tmp_chunk, sizeof(struct dm_multisnap_bt_node)); + memcpy((char *)node + sizeof(struct dm_multisnap_bt_node), (char *)s->tmp_chunk + split_offset, split_size - split_offset); + memset((char *)node + sizeof(struct dm_multisnap_bt_node) + split_size - split_offset, 0, s->chunk_size - (sizeof(struct dm_multisnap_bt_node) + split_size - split_offset)); + node->n_entries = cpu_to_le16(split_entries - split_index); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + node = dm_multisnap_alloc_make_block(s, &new_chunk, &bp); + if (unlikely(!node)) + return; + + memcpy(node, s->tmp_chunk, split_offset); + memset((char *)node + split_offset, 0, s->chunk_size - split_offset); + node->n_entries = cpu_to_le16(split_index); + + if (likely(depth == s->bt_depth - 1)) { + middle_key(&((struct dm_multisnap_bt_node *)s->tmp_chunk)->entries[split_index - 1], &((struct dm_multisnap_bt_node *)s->tmp_chunk)->entries[split_index], &new_key); + } else { + /* + * Warning: when we'll delete btree entries, pay very much + * attention to not create entry that spans this border. 
+ */ + struct dm_multisnap_bt_entry *last_one = &node->entries[split_index - 1]; + new_key.chunk = read_48(last_one, orig_chunk); + new_key.snap_from = le32_to_cpu(last_one->snap_from); + new_key.snap_to = le32_to_cpu(last_one->snap_to); + write_48(last_one, orig_chunk, CHUNK_T_SENTINEL); + last_one->snap_from = last_one->snap_to = cpu_to_le32(SNAPID_T_SENTINEL); + } + key = &new_key; + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (depth--) { + node = dm_multisnap_read_btnode(s, path[depth].block, &bp); + if (unlikely(!node)) + return; + goto go_up; + } + + if (s->bt_depth >= MAX_BT_DEPTH) { + DMERR("dm_multisnap_add_to_btree: max b+-tree depth reached"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + node = dm_multisnap_alloc_make_block(s, &new_root, &bp); + if (unlikely(!node)) + return; + + memset(node, 0, s->chunk_size); + node->signature = BT_SIGNATURE; + node->n_entries = cpu_to_le16(0); + add_at_idx(node, 0, &new_key, new_chunk); + new_key.snap_from = new_key.snap_to = SNAPID_T_SENTINEL; + new_key.chunk = CHUNK_T_SENTINEL; + add_at_idx(node, 1, &new_key, path[0].block); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + s->bt_root = new_root; + s->bt_depth++; +} + +void dm_multisnap_bt_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap) +{ + struct dm_buffer *bp; + struct dm_multisnap_bt_node *node; + struct bt_key key; + struct path_element path[MAX_BT_DEPTH]; + int results_ptr; + + chunk_t new_blockn; + int r; + int i; + + if (s->n_preallocated_blocks < s->bt_depth) { + if (dm_multisnap_alloc_blocks(s, s->preallocated_blocks + s->n_preallocated_blocks, s->bt_depth - s->n_preallocated_blocks) < 0) + return; + s->n_preallocated_blocks = s->bt_depth; + } + results_ptr = 0; + + /* + * Read the key from this node --- we'll walk the btree according + * to this key to find a path from the root. 
+ */ + node = dm_multisnap_read_btnode(s, tmp_remap->new, &bp); + if (!node) + return; + key.chunk = read_48(&node->entries[0], orig_chunk); + key.snap_from = key.snap_to = le32_to_cpu(node->entries[0].snap_from); + dm_bufio_release(bp); + + r = walk_btree(s, &key, &node, &bp, path); + if (r < 0) + return; + + dm_bufio_release(bp); + + for (i = s->bt_depth - 1; i >= 0; i--) + if (path[i].block == tmp_remap->old) + goto found; + + DMERR("block %Lx/%Lx was not found in btree when searching for %Lx/%x", (unsigned long long)tmp_remap->old, (unsigned long long)tmp_remap->new, (unsigned long long)key.chunk, key.snap_from); + for (i = 0; i < s->bt_depth; i++) + DMERR("path[%d]: %Lx/%x", i, (unsigned long long)path[i].block, path[i].idx); + dm_multisnap_set_error(s, -EFSERROR); + return; + + found: + + new_blockn = tmp_remap->new; + for (i--; i >= 0; i--) { + chunk_t block_to_free; + int remapped = 0; + node = dm_multisnap_read_btnode(s, path[i].block, &bp); + if (!node) + return; + if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) { + remapped = 1; + dm_bufio_release_move(bp, s->preallocated_blocks[results_ptr]); + node = dm_multisnap_read_btnode(s, s->preallocated_blocks[results_ptr], &bp); + if (!node) + return; + /* !!! 
FIXME: add to a list of newly allocated blocks */ + } + block_to_free = read_48(&node->entries[path[i].idx], new_chunk); + write_48(&node->entries[path[i].idx], new_chunk, new_blockn); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (block_to_free != tmp_remap->old) + dm_multisnap_delete_remapped_block(s, block_to_free); + dm_multisnap_free_block(s, block_to_free); + + if (!remapped) + goto skip_it; + new_blockn = s->preallocated_blocks[results_ptr]; + results_ptr++; + } + + if (s->bt_root != tmp_remap->old) + dm_multisnap_delete_remapped_block(s, s->bt_root); + dm_multisnap_free_block(s, s->bt_root); + s->bt_root = new_blockn; + +skip_it: + memmove(s->preallocated_blocks, s->preallocated_blocks + results_ptr, (s->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t)); +} + Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-snaps.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-snaps.c 2008-11-27 00:30:13.000000000 +0100 @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap.h" + +struct snapshot_range { + struct rb_node node; + snapid_t from; + snapid_t to; +}; + +static struct snapshot_range *rb_find_insert_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to, int add) +{ + struct snapshot_range *new; + struct rb_node **p = &s->active_snapshots.rb_node; + struct rb_node *parent = NULL; + while (*p) { + parent = *p; +#define rn rb_entry(parent, struct snapshot_range, node) + if (to < rn->from) { + if (to == rn->from - 1 && add) { + rn->from = from; + return rn; + } + p = &rn->node.rb_left; + } else if (from > rn->to) { + if (from == rn->to + 1 && add) { + rn->to = to; + return rn; + } + p = &rn->node.rb_right; + } else { + if (!add) + return rn; + else { + DMERR("rb_insert_snapshot: inserting overlapping entry: (%u,%u) overlaps (%u,%u)", from, to, rn->from, rn->to); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + } +#undef rn + } + if (!add) + return NULL; + + new = kmalloc(sizeof(struct snapshot_range), GFP_KERNEL); + if (!new) { + DMERR("rb_insert_snapshot: can't allocate memory for snapshot descriptor"); + dm_multisnap_set_error(s, -ENOMEM); + return NULL; + } + + new->from = from; + new->to = to; + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &s->active_snapshots); + + return new; +} + +static struct snapshot_range *rb_find_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to) +{ + return rb_find_insert_snapshot(s, from, to, 0); +} + +static int rb_insert_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to) +{ + struct snapshot_range *rn; + rn = rb_find_insert_snapshot(s, from, to, 1); + if (!rn) + return -1; + return 0; +} + +int dm_multisnap_test_snapshot(struct dm_multisnap *s, snapid_t id) +{ + return !!rb_find_snapshot(s, id, id); +} + +int dm_multisnap_find_next_snapid_range(struct dm_multisnap *s, snapid_t id, snapid_t *from, snapid_t *to) +{ + struct snapshot_range *rn; + rn = rb_find_snapshot(s, id, SNAPID_T_MAX); + if (!rn) + return 
1; + *from = rn->from; + *to = rn->to; + return 0; +} + +void dm_multisnap_destroy_snapshot_tree(struct dm_multisnap *s) +{ + struct rb_node *root; + while ((root = s->active_snapshots.rb_node)) { +#define rn rb_entry(root, struct snapshot_range, node) + rb_erase(root, &s->active_snapshots); + kfree(rn); +#undef rn + } +} + +void dm_multisnap_read_snapshots(struct dm_multisnap *s) +{ + struct bt_key snap_key; + chunk_t ignore; + int r; + + dm_multisnap_destroy_snapshot_tree(s); + + snap_key.snap_from = 0; +find_next: + snap_key.snap_to = SNAPID_T_MAX; + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + + r = dm_multisnap_find_in_btree(s, &snap_key, &ignore); + + if (unlikely(r < 0)) + return; + + if (!r) { + printk("inserting snapid %d-%d\n", snap_key.snap_from, snap_key.snap_to); + if (unlikely(snap_key.snap_to > SNAPID_T_MAX)) { + DMERR("dm_multisnap_read_snapshots: invalid snapshot id"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + r = rb_insert_snapshot(s, snap_key.snap_from, snap_key.snap_to); + if (unlikely(r < 0)) + return; + snap_key.snap_from = snap_key.snap_to + 1; + goto find_next; + } +} + +int dm_multisnap_create_snapshot(struct dm_multisnap *s) +{ + int r; + struct bt_key snap_key; + + if (s->snapshot_num > SNAPID_T_MAX) { + DMERR("dm_multisnap_create_snapshot: 2^32 snapshot limit reached"); + return -ENOSPC; + } + + r = rb_insert_snapshot(s, s->snapshot_num, s->snapshot_num); + if (r < 0) + return dm_multisnap_has_error(s); + + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + snap_key.snap_from = s->snapshot_num; + snap_key.snap_to = s->snapshot_num; + dm_multisnap_add_to_btree(s, &snap_key, 0); + if (dm_multisnap_has_error(s)) + return dm_multisnap_has_error(s); + + printk("multisnapshot: created snapshot with ID %u\n", s->snapshot_num); + + s->snapshot_num++; + + dm_multisnap_transaction_mark(s); + dm_multisnap_commit(s); + + return 0; +} + +void dm_multisnap_snaps_status(struct dm_multisnap *s, char *result, unsigned maxlen) +{ + snapid_t n_snaps = 0; + 
+ snapid_t from, to; + snapid_t snapid = 0; + for (snapid = 0; !dm_multisnap_find_next_snapid_range(s, snapid, &from, &to); snapid = to + 1) + n_snaps += to - from + 1; + + snprintf(result, maxlen, " %u", n_snaps); + dm_multisnap_adjust_string(&result, &maxlen); + + for (snapid = 0; !dm_multisnap_find_next_snapid_range(s, snapid, &from, &to); snapid = to + 1) + for (; from <= to; from++) { + snprintf(result, maxlen, " %u", from); + dm_multisnap_adjust_string(&result, &maxlen); + } +} Index: linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-io.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.28-rc5-devel/drivers/md/dm-multisnap-io.c 2008-12-01 20:23:04.000000000 +0100 @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap.h" + +#define MAX_RANGES_TO_REMAP DM_KCOPYD_MAX_REGIONS + +static void dm_multisnap_process_bios(struct dm_multisnap *s); + +struct kmem_cache *dm_multisnap_pending_exception_cache; + +void dm_multisnap_pending_exception_ctor(void *pe_) +{ + struct dm_multisnap_pending_exception *pe = pe_; + bio_list_init(&pe->bios); +} + +#define GFP_PENDING_EXCEPTION GFP_NOIO + +static struct dm_multisnap_pending_exception *dm_multisnap_alloc_pending_exception(struct dm_multisnap *s, struct bt_key *key) +{ + struct dm_multisnap_pending_exception *pe; + + /* + * Warning, we don't want to wait. Because we are holding master_lock + * and taking this lock is needed to complete the exception. + * + * If an allocation failure happens, we must go up, drop the lock, + * try dummy mempool allocation and go here again. 
+ */ + pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION & ~__GFP_WAIT); + if (!pe) + return NULL; + + pe->s = s; + memcpy(&pe->key, key, sizeof(struct bt_key)); + hlist_add_head(&pe->hash_list, &s->pending_hash[PENDING_HASH(pe->key.chunk)]); + return pe; +} + +static void dm_multisnap_free_pending_exception(struct dm_multisnap_pending_exception *pe) +{ + hlist_del(&pe->hash_list); + mempool_free(pe, pe->s->pending_pool); +} + +static void dm_multisnap_wait_for_pending_exception(struct dm_multisnap *s) +{ + /* + * Wait until there is something in the mempool. Free it immediately. + */ + struct dm_multisnap_pending_exception *pe; + + pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION | __GFP_WAIT); + mempool_free(pe, s->pending_pool); +} + +static void dm_multisnap_enqueue_bio_list(struct dm_multisnap *s, struct bio_list *bl) +{ + struct bio *bio; + while ((bio = bio_list_pop(bl))) + dm_multisnap_enqueue_bio(s, bio); +} + +static int get_snapshots_to_remap(struct dm_multisnap *s, chunk_t chunk, struct bt_key result[MAX_RANGES_TO_REMAP]) +{ + int n = 0; + int r; + snapid_t snapid, from, to; + chunk_t sink; + for (snapid = 0; !dm_multisnap_find_next_snapid_range(s, snapid, &from, &to); snapid = to + 1) { + struct bt_key key; +next_btree_search: + + if (dm_multisnap_has_error(s)) + return -1; + + key.chunk = chunk; + key.snap_from = from; + key.snap_to = to; + r = dm_multisnap_find_in_btree(s, &key, &sink); + if (unlikely(r < 0)) + return -1; + + if (r) { + result[n].chunk = chunk; + result[n].snap_from = from; + result[n].snap_to = to; + n++; + return n; + } + + if (key.snap_from > from) { + result[n].chunk = chunk; + result[n].snap_from = from; + result[n].snap_to = key.snap_from - 1; + n++; + if (n == MAX_RANGES_TO_REMAP) + return n; + } + + if (key.snap_to < to) { + from = key.snap_to + 1; + goto next_btree_search; + } + } + return n; +} + +static struct dm_multisnap_pending_exception *make_pending_exception(struct dm_multisnap *s, struct bt_key 
*key, struct dm_io_region *dest) +{ + int r; + chunk_t new_chunk; + struct dm_multisnap_pending_exception *pe; + + pe = dm_multisnap_alloc_pending_exception(s, key); + if (unlikely(!pe)) + return NULL; + + r = dm_multisnap_alloc_blocks(s, &new_chunk, 1); + if (unlikely(r < 0)) + goto free_exception_return; + s->data_allocated++; + + dm_multisnap_add_to_btree(s, key, new_chunk); + if (dm_multisnap_has_error(s)) + goto free_exception_return; + + dest->bdev = s->snapshot->bdev; + dest->sector = chunk_to_sector(s, new_chunk); + dest->count = s->chunk_size >> SECTOR_SHIFT; + + dm_multisnap_transaction_mark(s); + + return pe; + +free_exception_return: + dm_multisnap_free_pending_exception(pe); + + dm_multisnap_transaction_mark(s); + + return NULL; +} + +static void remap_callback(int read_err, unsigned long write_err, void *pe_); + +static void start_remap(struct dm_multisnap *s, chunk_t chunk, struct bt_key *snapshots, int n_snapshots, struct bio *bio, sector_t origin_sectors) +{ + struct dm_io_region dests[MAX_RANGES_TO_REMAP]; + struct dm_io_region src; + struct dm_multisnap_pending_exception *exc_list; + int i; + + exc_list = NULL; + for (i = 0; i < n_snapshots; i++) { + struct dm_multisnap_pending_exception *pe; + pe = make_pending_exception(s, &snapshots[i], &dests[i]); + + if (unlikely(!pe)) { + if (unlikely(dm_multisnap_has_error(s))) { + if (i) + break; + /* !!! 
FIXME: maybe requeue after we drop the snapshot store */ + bio_endio(bio, -EIO); + return; + } + + s->pending_mempool_allocation_failed = 1; + if (i) + break; + dm_multisnap_enqueue_bio(s, bio); + return; + } + + pe->link = exc_list; + exc_list = pe; + if (likely(bio != NULL)) { + bio_list_add(&pe->bios, bio); + bio = NULL; + } + } + n_snapshots = i; + + src.bdev = s->origin->bdev; + src.sector = chunk_to_sector(s, chunk); + src.count = s->chunk_size >> SECTOR_SHIFT; + + if (unlikely(src.sector + src.count > origin_sectors)) { + BUG_ON(src.sector >= origin_sectors); + src.count = origin_sectors - src.sector; + for (i = 0; i < n_snapshots; i++) + dests[i].count = src.count; + } + + atomic_inc(&s->n_kcopyd_jobs); + + dm_kcopyd_copy(s->kcopyd, &src, n_snapshots, dests, 0, remap_callback, exc_list); +} + +static void remap_callback(int read_err, unsigned long write_err, void *pe_) +{ + struct dm_multisnap_pending_exception *pe = pe_; + struct dm_multisnap *s = pe->s; + + if (unlikely((read_err | write_err) != 0)) { + DMERR("remap_callback: kcopyd I/O error: %d, %lx", read_err, write_err); + /* !!! FIXME: drop the snapshot ? */ + } + + do { + list_add_tail(&pe->list, &s->pes_waiting_for_commit); + pe = pe->link; + } while (pe); + + if (atomic_dec_and_test(&s->n_kcopyd_jobs)) { + + /* We need to commit stuff */ + mutex_lock(&s->master_lock); + if (unlikely(atomic_read(&s->n_kcopyd_jobs))) { + /* Not yet ... kmultisnapd has just added something */ + mutex_unlock(&s->master_lock); + return; + } + + dm_multisnap_commit(s); + + do { + pe = container_of(s->pes_waiting_for_commit.next, struct dm_multisnap_pending_exception, list); + list_del(&pe->list); + dm_multisnap_enqueue_bio_list(s, &pe->bios); + dm_multisnap_free_pending_exception(pe); + } while (!list_empty(&s->pes_waiting_for_commit)); + + /* + * Process the bios that we have just added to the queue. + * It's faster to process them now than to hand them over to + * kmultisnapd. 
+ */ + dm_multisnap_process_bios(s); + + mutex_unlock(&s->master_lock); + + blk_unplug(bdev_get_queue(s->origin->bdev)); + } +} + +static int check_pending_io(struct dm_multisnap *s, struct bio *bio, chunk_t chunk, snapid_t from, snapid_t to) +{ + struct dm_multisnap_pending_exception *pe; + struct hlist_node *hn; + hlist_for_each_entry(pe, hn, &s->pending_hash[PENDING_HASH(chunk)], hash_list) { + if (pe->key.chunk == chunk && + pe->key.snap_from <= to && + pe->key.snap_to >= from) { + bio_list_add(&pe->bios, bio); + return 1; + } + cond_resched(); + } + return 0; +} + +static void do_origin_write(struct dm_multisnap *s, struct bio *bio, sector_t origin_sectors) +{ + int r; + chunk_t chunk; + struct bt_key snapshots_to_remap[MAX_RANGES_TO_REMAP]; + + /* reads are processed directly in multisnap_origin_map */ + BUG_ON(bio_rw(bio) != WRITE); + + if (unlikely(dm_multisnap_has_error(s))) { + r = -EIO; /* !!! FIXME: maybe allow it, if we drop snapshot store */ + goto err_endio; + } + + chunk = sector_to_chunk(s, bio->bi_sector); + r = get_snapshots_to_remap(s, chunk, snapshots_to_remap); + if (unlikely(r < 0)) { + r = -EIO; /* !!! FIXME: maybe allow it, if we drop snapshot store */ + goto err_endio; + } + + if (likely(!r)) { + if (unlikely(check_pending_io(s, bio, chunk, 0, SNAPID_T_MAX))) + return; + bio->bi_bdev = s->origin->bdev; + generic_make_request(bio); + return; + } + + start_remap(s, chunk, snapshots_to_remap, r, bio, origin_sectors); + return; + +err_endio: + bio_endio(bio, r); + return; +} + +static void do_snapshot_io(struct dm_multisnap *s, struct bio *bio, snapid_t id) +{ + struct bt_key key; + chunk_t chunk; + chunk_t result; + int r; + + if (unlikely(dm_multisnap_has_error(s))) { + bio_endio(bio, -EIO); + return; + } + + if (bio_rw(bio) == WRITE) { + bio_endio(bio, -EROFS); /* !!! 
FIXME: todo: writes */ + return; + } + + chunk = sector_to_chunk(s, bio->bi_sector); + key.chunk = chunk; + key.snap_from = id; + key.snap_to = id; + r = dm_multisnap_find_in_btree(s, &key, &result); + if (unlikely(r < 0)) { + bio_endio(bio, -EIO); + return; + } + + if (r) { + /* not found in the snapshot */ + /* !!! FIXME: track i/o in-progress */ + bio->bi_bdev = s->origin->bdev; + } else { + if (unlikely(check_pending_io(s, bio, chunk, id, id))) + return; + bio->bi_bdev = s->snapshot->bdev; + bio->bi_sector &= (s->chunk_size >> SECTOR_SHIFT) - 1; + bio->bi_sector |= chunk_to_sector(s, result); + } + generic_make_request(bio); +} + +static void dm_multisnap_process_bios(struct dm_multisnap *s) +{ + struct bio *bio; + sector_t origin_sectors; + +again: + cond_resched(); + + spin_lock(&dm_multisnap_bio_list_lock); + bio = bio_list_pop(&s->bios); + spin_unlock(&dm_multisnap_bio_list_lock); + + if (unlikely(!bio)) + return; + + origin_sectors = i_size_read(s->origin->bdev->bd_inode) >> SECTOR_SHIFT; + if (bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > origin_sectors) { + DMERR("dm_multisnap_process_bios: access out of device, flags %lx, sector %Lx, size %x, origin sectors %Lx", bio->bi_flags, (unsigned long long)bio->bi_sector, bio->bi_size, (unsigned long long)origin_sectors); + bio_endio(bio, -EIO); + goto next_bio; + } + + if (likely(bio->bi_phys_segments == SNAPID_T_ORIGIN)) + do_origin_write(s, bio, origin_sectors); + else + do_snapshot_io(s, bio, bio->bi_phys_segments); + +next_bio: + if (!bio_list_empty(&s->bios)) { + if (likely(!s->pending_mempool_allocation_failed)) + goto again; + wakeup_kmultisnapd(s); + } +} + +void dm_multisnap_work(struct work_struct *work) +{ + struct dm_multisnap *s = container_of(work, struct dm_multisnap, work); + + mutex_lock(&s->master_lock); + dm_multisnap_process_bios(s); + mutex_unlock(&s->master_lock); + + if (unlikely(s->pending_mempool_allocation_failed)) { + s->pending_mempool_allocation_failed = 0; + 
dm_multisnap_wait_for_pending_exception(s); + } + + blk_unplug(bdev_get_queue(s->origin->bdev)); +}