New snapshot implementation. This implementation has shared storage and supports an unlimited number of snapshots. The work is split into two modules: dm-multisnapshot.ko - the general module dm-store-mikulas.ko - the snapshot store The modularity allows other snapshot stores to be loaded. Usage: Create two logical volumes, one for the origin and one for the snapshots. (assume /dev/mapper/vg1-lv1 for the origin and /dev/mapper/vg1-lv2 for the snapshots in these examples) Clear the beginning of the snapshot volume (the first 4096 bytes): dd if=/dev/zero of=/dev/mapper/vg1-lv2 bs=4096 count=1 Load the shared snapshot driver: echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnapshot mikulas /dev/mapper/vg1-lv1 /dev/mapper/vg1-lv2 4096|dmsetup create ms (4096 is the chunk size, given in 512-byte sectors; it must be a power of two. You can use a different number there) This creates the origin store on /dev/mapper/ms. If the store was zeroed, it creates a new structure; otherwise it loads the existing structure. Once this is done, you should no longer access /dev/mapper/vg1-lv1 and /dev/mapper/vg1-lv2 directly and should only use /dev/mapper/ms. Create a new snapshot: dmsetup message /dev/mapper/ms 0 create (snapshots have IDs assigned from 0 upwards --- it prints the newly created ID to syslog ... in the final version it will use status to publish the ID) Attach the snapshot: echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnap-snap /dev/mapper/vg1-lv1 0|dmsetup create ms0 (that '0' is the snapshot id ... you can use a different number) This attaches the snapshot '0' on /dev/mapper/ms0 Delete the snapshot: dmsetup message /dev/mapper/ms 0 delete 0 (the parameter after "delete" is the snapshot id) See status: dmsetup status prints the following information about the multisnapshot device: - 0 on active storage, error number on error (ENOSPC, EIO, etc.) - the last created snapshot number - total number of chunks on the device - total number of allocated chunks - the number of chunks allocated for metadata - the number of snapshots - existing snapshot IDs Unload it: dmsetup remove ms dmsetup remove ms0 ... 
etc. (note, once you unload the origin, the snapshots become inaccessible - the devices exist but they return -EIO on everything) Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 21 drivers/md/Makefile | 8 drivers/md/dm-multisnap-alloc.c | 438 ++++++++++++ drivers/md/dm-multisnap-blocks.c | 150 ++++ drivers/md/dm-multisnap-btree.c | 674 ++++++++++++++++++ drivers/md/dm-multisnap-commit.c | 192 +++++ drivers/md/dm-multisnap-delete.c | 104 ++ drivers/md/dm-multisnap-freelist.c | 227 ++++++ drivers/md/dm-multisnap-io.c | 186 +++++ drivers/md/dm-multisnap-mikulas-struct.h | 147 ++++ drivers/md/dm-multisnap-mikulas.c | 495 +++++++++++++ drivers/md/dm-multisnap-mikulas.h | 181 +++++ drivers/md/dm-multisnap-snaps.c | 290 ++++++++ drivers/md/dm-multisnap.c | 1105 +++++++++++++++++++++++++++++++ drivers/md/dm-multisnap.h | 191 +++++ 15 files changed, 4409 insertions(+) Index: linux-2.6.30-rc5-fast/drivers/md/Kconfig =================================================================== --- linux-2.6.30-rc5-fast.orig/drivers/md/Kconfig 2009-05-11 13:34:50.000000000 +0200 +++ linux-2.6.30-rc5-fast/drivers/md/Kconfig 2009-05-19 14:36:24.000000000 +0200 @@ -233,6 +233,27 @@ config DM_SNAPSHOT ---help--- Allow volume managers to take writable snapshots of a device. +config DM_MULTISNAPSHOT + tristate "Multisnapshot target" + depends on BLK_DEV_DM + ---help--- + A new implementation of snapshots allowing sharing storage + between several snapshots. + + A submenu allows to select a specific shared snapshot store + driver. + +config DM_MULTISNAPSHOT_MIKULAS + tristate "Mikulas' snapshot store" + depends on DM_MULTISNAPSHOT + ---help--- + Mikulas Patocka's snapshot store. + + A log-structured storage allowing unlimited number of snapshots. + + This is work under development. So far it doesn't support snapshot + deletion and reclaiming of allocated space. 
+
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
Index: linux-2.6.30-rc5-fast/drivers/md/Makefile
===================================================================
--- linux-2.6.30-rc5-fast.orig/drivers/md/Makefile	2009-05-11 13:35:00.000000000 +0200
+++ linux-2.6.30-rc5-fast/drivers/md/Makefile	2009-05-19 14:36:24.000000000 +0200
@@ -7,6 +7,12 @@ dm-mod-y	+= dm.o dm-table.o dm-target.o
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
+dm-multisnapshot-y += dm-multisnap.o
+dm-store-mikulas-y += dm-multisnap-mikulas.o dm-multisnap-alloc.o \
+			dm-multisnap-blocks.o dm-multisnap-btree.o \
+			dm-multisnap-commit.o dm-multisnap-delete.o \
+			dm-multisnap-freelist.o dm-multisnap-io.o \
+			dm-multisnap-snaps.o
 dm-mirror-y	+= dm-raid1.o
 dm-log-clustered-y \
 		+= dm-log-cluster.o dm-log-cluster-transfer.o
@@ -42,6 +48,8 @@ obj-$(CONFIG_DM_LOOP)		+= dm-loop.o
 obj-$(CONFIG_DM_IOBAND)		+= dm-ioband.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
+obj-$(CONFIG_DM_MULTISNAPSHOT)	+= dm-multisnapshot.o
+obj-$(CONFIG_DM_MULTISNAPSHOT_MIKULAS) += dm-store-mikulas.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_CLUSTERED)	+= dm-log-clustered.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap.c	2009-05-19 14:36:56.000000000 +0200
@@ -0,0 +1,1105 @@
+/*
+ * Copyright (C) 2008 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */ + +#include "dm-multisnap.h" + +#define MESG_STR(x) x, sizeof(x) + +static void dm_multisnap_process_bios(struct dm_multisnap *s); + +void dm_multisnap_set_error(struct dm_multisnap *s, int error) +{ + if (!s->error) + s->error = error; +} +EXPORT_SYMBOL(dm_multisnap_set_error); + +int dm_multisnap_has_error(struct dm_multisnap *s) +{ + return s->error; +} +EXPORT_SYMBOL(dm_multisnap_has_error); + +static DEFINE_MUTEX(all_multisnapshots_lock); +static LIST_HEAD(all_multisnapshots); + +static chunk_t sector_to_chunk(struct dm_multisnap *s, sector_t sector) +{ + return sector >> (s->chunk_shift - SECTOR_SHIFT); +} + +static sector_t chunk_to_sector(struct dm_multisnap *s, chunk_t chunk) +{ + return chunk << (s->chunk_shift - SECTOR_SHIFT); +} + +/* --- bio list --- */ + +static DEFINE_SPINLOCK(dm_multisnap_bio_list_lock); + +static void wakeup_kmultisnapd(struct dm_multisnap *s) +{ + queue_work(s->wq, &s->work); +} + +static void dm_multisnap_enqueue_bio_unlocked(struct dm_multisnap *s, struct bio *bio) +{ + bio_list_add(&s->bios, bio); +} + +static void dm_multisnap_enqueue_bio(struct dm_multisnap *s, struct bio *bio) +{ + spin_lock(&dm_multisnap_bio_list_lock); + dm_multisnap_enqueue_bio_unlocked(s, bio); + spin_unlock(&dm_multisnap_bio_list_lock); +} + +static void dm_multisnap_enqueue_bio_list(struct dm_multisnap *s, struct bio_list *bl) +{ + struct bio *bio; + while ((bio = bio_list_pop(bl))) + dm_multisnap_enqueue_bio(s, bio); +} + +/* --- pending_exception_cache --- */ + +static struct kmem_cache *dm_multisnap_pending_exception_cache; + +#define GFP_PENDING_EXCEPTION GFP_NOIO + +static void dm_multisnap_pending_exception_ctor(void *pe_) +{ + struct dm_multisnap_pending_exception *pe = pe_; + bio_list_init(&pe->bios); +} + +static struct dm_multisnap_pending_exception *dm_multisnap_alloc_pending_exception(struct dm_multisnap *s, chunk_t chunk) +{ + struct dm_multisnap_pending_exception *pe; + /* + * Warning, we don't want to wait. 
Because we are holding master_lock
+	 * and taking this lock is needed to complete the exception.
+	 *
+	 * If an allocation failure happens, we must go up, drop the lock,
+	 * try dummy mempool allocation and go here again.
+	 */
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION & ~__GFP_WAIT);
+	if (unlikely(!pe))
+		return NULL;
+
+	pe->s = s;
+	pe->chunk = chunk;
+	hlist_add_head(&pe->hash_list, &s->pending_hash[PENDING_HASH(chunk)]);
+	return pe;
+}
+
+static void dm_multisnap_free_pending_exception(struct dm_multisnap_pending_exception *pe)
+{
+	hlist_del(&pe->hash_list);
+	mempool_free(pe, pe->s->pending_pool);
+}
+
+static void dm_multisnap_wait_for_pending_exception(struct dm_multisnap *s)
+{
+	/*
+	 * Wait until there is something in the mempool. Free it immediately.
+	 *
+	 * The allocation is done with __GFP_WAIT, so it may sleep.
+	 * dm_multisnap_work calls this only after it has released
+	 * master_lock, which is the lock needed to complete pending
+	 * exceptions and return them to the pool.
+	 */
+	struct dm_multisnap_pending_exception *pe;
+
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION | __GFP_WAIT);
+	mempool_free(pe, s->pending_pool);
+}
+
+static int check_pending_io(struct dm_multisnap *s, struct bio *bio, chunk_t chunk, snapid_t snapid)
+{
+	struct dm_multisnap_pending_exception *pe;
+	struct hlist_node *hn;
+	hlist_for_each_entry(pe, hn, &s->pending_hash[PENDING_HASH(chunk)], hash_list) {
+		if (pe->chunk == chunk) {
+			int i;
+			if (snapid == SNAPID_T_ORIGIN)
+				goto conflict;
+			for (i = 0; i < pe->n_descs; i++) {
+				if (s->store->check_conflict(s, &pe->desc[i], snapid))
+					goto conflict;
+			}
+		}
+		cond_resched();
+	}
+	return 0;
+
+conflict:
+	bio_list_add(&pe->bios, bio);
+	return 1;
+}
+
+/* --- kcopyd callback --- */
+
+static void remap_callback(int read_err, unsigned long write_err, void *pe_)
+{
+	struct dm_multisnap_pending_exception *pe = pe_;
+	struct dm_multisnap *s = pe->s;
+
+	if (unlikely((read_err | write_err) != 0)) {
+		DMERR("remap_callback: kcopyd I/O error: %d, %lx", read_err, write_err);
+		dm_multisnap_set_error(s, -EIO);
+	}
+
+	list_add_tail(&pe->list, &s->pes_waiting_for_commit);
+
+	if (atomic_dec_and_test(&s->n_kcopyd_jobs)) {
+
+		/* 
We need to commit stuff */ + mutex_lock(&s->master_lock); + if (unlikely(atomic_read(&s->n_kcopyd_jobs))) { + /* Not yet ... kmultisnapd has just added something */ + mutex_unlock(&s->master_lock); + return; + } + + s->store->commit(s); + + do { + pe = container_of(s->pes_waiting_for_commit.next, struct dm_multisnap_pending_exception, list); + list_del(&pe->list); + dm_multisnap_enqueue_bio_list(s, &pe->bios); + dm_multisnap_free_pending_exception(pe); + } while (!list_empty(&s->pes_waiting_for_commit)); + + /* + * Process the bios that we have just added to the queue. + * It's faster to process them now than to hand them over to + * kmultisnapd. + */ + dm_multisnap_process_bios(s); + + mutex_unlock(&s->master_lock); + + blk_unplug(bdev_get_queue(s->origin->bdev)); + blk_unplug(bdev_get_queue(s->snapshot->bdev)); + } +} + +static void drain_kcopyd_jobs(struct dm_multisnap *s) +{ + while (atomic_read(&s->n_kcopyd_jobs)) + msleep(1); + smp_mb(); +} + +static void dispatch_kcopyd(struct dm_multisnap *s, struct dm_multisnap_pending_exception *pe, int from_snapshot, chunk_t chunk, struct bio *bio, struct dm_io_region *dests, unsigned n_dests) +{ + unsigned i; + struct dm_io_region src; + sector_t origin_sectors = i_size_read(s->origin->bdev->bd_inode) >> SECTOR_SHIFT; + + pe->n_descs = n_dests; + + bio_list_add(&pe->bios, bio); + + src.bdev = likely(!from_snapshot) ? 
s->origin->bdev : s->snapshot->bdev; + src.sector = chunk_to_sector(s, chunk); + src.count = s->chunk_size >> SECTOR_SHIFT; + + if (likely(!from_snapshot) && unlikely(src.sector + src.count > origin_sectors)) { + BUG_ON(src.sector >= origin_sectors); + src.count = origin_sectors - src.sector; + for (i = 0; i < pe->n_descs; i++) + dests[i].count = src.count; + } + + atomic_inc(&s->n_kcopyd_jobs); + + dm_kcopyd_copy(s->kcopyd, &src, n_dests, dests, 0, remap_callback, pe); +} + +/* --- bio processing --- */ + +static void do_origin_write(struct dm_multisnap *s, struct bio *bio) +{ + int r; + unsigned i; + chunk_t chunk, new_chunk; + struct dm_multisnap_pending_exception *pe; + struct dm_io_region dests[MAX_CHUNKS_TO_REMAP]; + + /* reads are processed directly in multisnap_origin_map */ + BUG_ON(bio_rw(bio) != WRITE); + + if (unlikely(dm_multisnap_has_error(s))) + goto err_endio; + + s->store->reset_query(s); + + chunk = sector_to_chunk(s, bio->bi_sector); + + r = s->store->query_next_remap(s, chunk); + if (unlikely(r < 0)) + goto err_endio; + + if (likely(!r)) { + /* There is nothing to remap */ + + if (unlikely(check_pending_io(s, bio, chunk, SNAPID_T_ORIGIN))) + return; + bio->bi_bdev = s->origin->bdev; + generic_make_request(bio); + return; + } + + pe = dm_multisnap_alloc_pending_exception(s, chunk); + if (unlikely(!pe)) { + s->pending_mempool_allocation_failed = 1; + dm_multisnap_enqueue_bio(s, bio); + return; + } + + i = 0; + goto midcycle; + for (; i < MAX_CHUNKS_TO_REMAP; i++) { + r = s->store->query_next_remap(s, chunk); + if (unlikely(r < 0)) + goto free_err_endio; + if (likely(!r)) + break; + +midcycle: + s->store->add_next_remap(s, &pe->desc[i], &new_chunk); + if (unlikely(dm_multisnap_has_error(s))) + goto free_err_endio; + + dests[i].bdev = s->snapshot->bdev; + dests[i].sector = chunk_to_sector(s, new_chunk); + dests[i].count = s->chunk_size >> SECTOR_SHIFT; + } + + dispatch_kcopyd(s, pe, 0, chunk, bio, dests, i); + return; + +free_err_endio: + 
dm_multisnap_free_pending_exception(pe); +err_endio: + r = -EIO; /* !!! FIXME: maybe allow it, if we drop snapshot store */ + bio_endio(bio, r); + return; +} + +static void do_snapshot_io(struct dm_multisnap *s, struct bio *bio, snapid_t id) +{ + chunk_t chunk, result, copy_from; + int r; + struct dm_multisnap_pending_exception *pe; + struct dm_io_region dest; + + if (unlikely(bio_rw(bio) == WRITE) && unlikely(!s->store->make_chunk_writeable)) + goto err_endio; + + if (unlikely(dm_multisnap_has_error(s))) + goto err_endio; + + chunk = sector_to_chunk(s, bio->bi_sector); + r = s->store->find_snapshot_chunk(s, id, chunk, &result); + if (unlikely(r < 0)) + goto err_endio; + + if (!r) { + if (unlikely(bio_rw(bio) == WRITE)) { + pe = dm_multisnap_alloc_pending_exception(s, chunk); + if (unlikely(!pe)) + goto failed_pe_allocation; + + s->store->add_next_remap(s, &pe->desc[0], &result); + if (unlikely(dm_multisnap_has_error(s))) + goto free_err_endio; + + dest.bdev = s->snapshot->bdev; + dest.sector = chunk_to_sector(s, result); + dest.count = s->chunk_size >> SECTOR_SHIFT; + + dispatch_kcopyd(s, pe, 0, chunk, bio, &dest, 1); + return; + } + + /* not found in the snapshot */ + /* !!! 
FIXME: track i/o in-progress */ + bio->bi_bdev = s->origin->bdev; + } else { + if (unlikely(check_pending_io(s, bio, chunk, id))) + return; + + if (unlikely(bio_rw(bio) == WRITE) && r == 1) { + copy_from = result; + + pe = dm_multisnap_alloc_pending_exception(s, chunk); + if (unlikely(!pe)) + goto failed_pe_allocation; + + s->store->make_chunk_writeable(s, &pe->desc[0], &result); + if (unlikely(dm_multisnap_has_error(s))) + goto free_err_endio; + + dest.bdev = s->snapshot->bdev; + dest.sector = chunk_to_sector(s, result); + dest.count = s->chunk_size >> SECTOR_SHIFT; + + dispatch_kcopyd(s, pe, 1, copy_from, bio, &dest, 1); + return; + } + + bio->bi_bdev = s->snapshot->bdev; + bio->bi_sector &= (s->chunk_size >> SECTOR_SHIFT) - 1; + bio->bi_sector |= chunk_to_sector(s, result); + } + generic_make_request(bio); + return; + +free_err_endio: + dm_multisnap_free_pending_exception(pe); +err_endio: + r = -EIO; + bio_endio(bio, r); + return; + +failed_pe_allocation: + s->pending_mempool_allocation_failed = 1; + dm_multisnap_enqueue_bio(s, bio); + return; +} + +static void dm_multisnap_process_bios(struct dm_multisnap *s) +{ + struct bio *bio; + sector_t origin_sectors; + +again: + cond_resched(); + + if (!list_empty(&s->background_works)) { + struct dm_multisnap_background_work *bw = list_entry(s->background_works.next, struct dm_multisnap_background_work, list); + list_del(&bw->list); + bw->queued = 0; + bw->work(s, bw); + + cond_resched(); + } + + spin_lock(&dm_multisnap_bio_list_lock); + bio = bio_list_pop(&s->bios); + spin_unlock(&dm_multisnap_bio_list_lock); + + if (unlikely(!bio)) + return; + + origin_sectors = i_size_read(s->origin->bdev->bd_inode) >> SECTOR_SHIFT; + if (bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > origin_sectors) { + DMERR("dm_multisnap_process_bios: access out of device, flags %lx, sector %Lx, size %x, origin sectors %Lx", bio->bi_flags, (unsigned long long)bio->bi_sector, bio->bi_size, (unsigned long long)origin_sectors); + bio_endio(bio, 
-EIO); + goto next_bio; + } + + if (likely(bio->bi_phys_segments == SNAPID_T_ORIGIN)) + do_origin_write(s, bio); + else + do_snapshot_io(s, bio, bio->bi_phys_segments); + +next_bio: + if (!bio_list_empty(&s->bios) || !list_empty(&s->background_works)) { + if (likely(!bio_list_empty(&s->bios)) && + likely(!s->pending_mempool_allocation_failed) && + likely(list_empty(&s->master_lock.wait_list))) + goto again; + wakeup_kmultisnapd(s); + } +} + +void dm_multisnap_queue_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw) +{ + BUG_ON(!mutex_is_locked(&s->master_lock)); + + if (bw->queued) { + BUG_ON(bw->queued != 1); + return; + } + + bw->queued = 1; + list_add(&bw->list, &s->background_works); + wakeup_kmultisnapd(s); +} +EXPORT_SYMBOL(dm_multisnap_queue_work); + +void dm_multisnap_cancel_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw) +{ + BUG_ON(!mutex_is_locked(&s->master_lock)); + + if (!bw->queued) + return; + + bw->queued = 0; + list_del(&bw->list); +} +EXPORT_SYMBOL(dm_multisnap_cancel_work); + +static void dm_multisnap_work(struct work_struct *work) +{ + struct dm_multisnap *s = container_of(work, struct dm_multisnap, work); + + mutex_lock(&s->master_lock); + dm_multisnap_process_bios(s); + mutex_unlock(&s->master_lock); + + if (unlikely(s->pending_mempool_allocation_failed)) { + s->pending_mempool_allocation_failed = 0; + dm_multisnap_wait_for_pending_exception(s); + } + + blk_unplug(bdev_get_queue(s->origin->bdev)); + blk_unplug(bdev_get_queue(s->snapshot->bdev)); +} + +static struct dm_multisnap *find_multisnapshot(struct block_device *origin) +{ + struct dm_multisnap *s; + list_for_each_entry(s, &all_multisnapshots, list_all) + if (s->origin->bdev == origin) + return s; + return NULL; +} + +/* --- exception stores --- */ + +static DEFINE_MUTEX(exception_stores_lock); +static LIST_HEAD(all_exception_stores); + +static struct dm_multisnap_exception_store *dm_multisnap_find_exception_store(const char *name) +{ + struct 
dm_multisnap_exception_store *store; + + list_for_each_entry(store, &all_exception_stores, list) + if (!strcmp(store->name, name)) + return store; + + return NULL; +} + +static int dm_multisnap_exception_store_active(struct dm_multisnap_exception_store *find) +{ + struct dm_multisnap_exception_store *store; + + list_for_each_entry(store, &all_exception_stores, list) + if (store == find) + return 1; + + return 0; +} + +int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store) +{ + mutex_lock(&exception_stores_lock); + + BUG_ON(dm_multisnap_exception_store_active(store)); + + if (dm_multisnap_find_exception_store(store->name)) { + mutex_unlock(&exception_stores_lock); + return -EEXIST; + } + list_add(&store->list, &all_exception_stores); + + mutex_unlock(&exception_stores_lock); + + return 0; +} +EXPORT_SYMBOL(dm_multisnap_register_exception_store); + +void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store) +{ + mutex_lock(&exception_stores_lock); + + BUG_ON(!dm_multisnap_exception_store_active(store)); + list_del(&store->list); + + mutex_unlock(&exception_stores_lock); +} +EXPORT_SYMBOL(dm_multisnap_unregister_exception_store); + +static struct dm_multisnap_exception_store *dm_multisnap_get_exception_store(const char *name) +{ + struct dm_multisnap_exception_store *store; + + mutex_lock(&exception_stores_lock); + + store = dm_multisnap_find_exception_store(name); + if (store) { + if (!try_module_get(store->module)) + store = NULL; + } + + mutex_unlock(&exception_stores_lock); + + return store; +} + +static void dm_multisnap_put_exception_store(struct dm_multisnap_exception_store *store) +{ + mutex_lock(&exception_stores_lock); + + BUG_ON(!dm_multisnap_exception_store_active(store)); + module_put(store->module); + + mutex_unlock(&exception_stores_lock); +} + +/* --- target methods --- */ + +static int multisnap_origin_ctr(struct dm_target *ti, unsigned argc, char **argv) +{ + int r; + int i; + const char 
*store_name;
+	const char *origin_path;
+	const char *snapshot_path;
+	const char *chunk_size_str;
+	unsigned long chunk_size;
+
+	struct dm_multisnap *s;
+
+	/*
+	 * Table line: <store name> <origin dev> <snapshot dev> <chunk size>.
+	 * Anything after these four arguments is handed to the exception
+	 * store's init_exception_store method.
+	 */
+	if (argc < 4) {
+		ti->error = "Requires at least 4 arguments";
+		r = -EINVAL;
+		goto bad_arguments;
+	}
+
+	store_name = argv[0];
+	origin_path = argv[1];
+	snapshot_path = argv[2];
+	chunk_size_str = argv[3];
+	argv += 4;
+	argc -= 4;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		ti->error = "Can't allocate multisnapshot structure";
+		r = -ENOMEM;
+		goto bad_s;
+	}
+
+	/*
+	 * If the store is not registered yet, try to load its module
+	 * ("dm-store-<name>") and look it up once more.
+	 */
+	s->store = dm_multisnap_get_exception_store(store_name);
+	if (!s->store) {
+		request_module("dm-store-%s", store_name);
+		s->store = dm_multisnap_get_exception_store(store_name);
+		if (!s->store) {
+			ti->error = "Can't get exception store type";
+			r = -ENOENT;
+			goto bad_store;
+		}
+	}
+
+	s->error = 0;
+	mutex_init(&s->master_lock);
+	INIT_WORK(&s->work, dm_multisnap_work);
+	bio_list_init(&s->bios);
+	INIT_LIST_HEAD(&s->background_works);
+	atomic_set(&s->n_kcopyd_jobs, 0);
+	INIT_LIST_HEAD(&s->pes_waiting_for_commit);
+	for (i = 0; i < PENDING_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->pending_hash[i]);
+	s->pending_mempool_allocation_failed = 0;
+	s->last_snapid = 0;
+	INIT_LIST_HEAD(&s->all_snaps);
+
+	r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &s->origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+
+	r = dm_get_device(ti, snapshot_path, 0, 0, FMODE_READ | FMODE_WRITE, &s->snapshot);
+	if (r) {
+		ti->error = "Could not get snapshot device";
+		goto bad_snapshot;
+	}
+
+	/*
+	 * The chunk size argument is in 512-byte sectors. It must be a
+	 * non-zero power of two and must not overflow an int once it is
+	 * converted to bytes below.
+	 */
+	chunk_size = simple_strtoul(chunk_size_str, (char **)&chunk_size_str, 10);
+	if (*chunk_size_str || !chunk_size || chunk_size & (chunk_size - 1) ||
+	    chunk_size > INT_MAX / 512) {
+		ti->error = "Invalid chunk size";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	chunk_size *= 512;
+	if (chunk_size < bdev_hardsect_size(s->origin->bdev) ||
+	    chunk_size < bdev_hardsect_size(s->snapshot->bdev)) {
+		ti->error = "Chunk size smaller 
than device block size"; + r = -EINVAL; + goto bad_chunk_size; + } + s->chunk_size = chunk_size; + s->chunk_shift = ffs(chunk_size) - 1; + + s->wq = create_singlethread_workqueue("kmultisnapd"); + if (!s->wq) { + ti->error = "Could not create kernel thread"; + r = -ENOMEM; + goto bad_thread; + } + + s->pending_pool = mempool_create_slab_pool(PENDING_MEMPOOL_SIZE, dm_multisnap_pending_exception_cache); + if (!s->pending_pool) { + ti->error = "Could not allocate mempool for pending exceptions"; + r = -ENOMEM; + goto bad_pending_pool; + } + + r = dm_kcopyd_client_create(MULTISNAP_KCOPYD_PAGES, &s->kcopyd); + if (r) { + ti->error = "Could not create kcopyd client"; + goto bad_kcopyd; + } + + mutex_lock(&s->master_lock); + r = s->store->init_exception_store(s, argc, argv, &ti->error); + if (r) { + mutex_unlock(&s->master_lock); + goto exception_store_error; + } + + ti->private = s; + ti->split_io = s->chunk_size >> SECTOR_SHIFT; + mutex_unlock(&s->master_lock); + + mutex_lock(&all_multisnapshots_lock); + list_add(&s->list_all, &all_multisnapshots); + mutex_unlock(&all_multisnapshots_lock); + + return 0; + +exception_store_error: + dm_kcopyd_client_destroy(s->kcopyd); +bad_kcopyd: + mempool_destroy(s->pending_pool); +bad_pending_pool: + flush_workqueue(s->wq); + destroy_workqueue(s->wq); +bad_thread: +bad_chunk_size: + dm_put_device(ti, s->snapshot); +bad_snapshot: + dm_put_device(ti, s->origin); +bad_origin: + dm_multisnap_put_exception_store(s->store); +bad_store: + kfree(s); +bad_s: +bad_arguments: + return r; +} + +static void multisnap_origin_dtr(struct dm_target *ti) +{ + struct dm_multisnap *s = ti->private; + struct dm_multisnap_snap *sn; + unsigned i; + + mutex_lock(&all_multisnapshots_lock); + + /* Make sure that any more IOs won't be submitted by snapshot targets */ + list_for_each_entry(sn, &s->all_snaps, list_snaps) { + spin_lock(&dm_multisnap_bio_list_lock); + sn->s = NULL; + spin_unlock(&dm_multisnap_bio_list_lock); + } + list_del(&s->all_snaps); + + /* 
Wait for IOs on snapshots for this origin to finish */ +poll_for_ios: + spin_lock(&dm_multisnap_bio_list_lock); + if (!bio_list_empty(&s->bios)) { + spin_unlock(&dm_multisnap_bio_list_lock); + flush_workqueue(s->wq); + msleep(1); + goto poll_for_ios; + } + spin_unlock(&dm_multisnap_bio_list_lock); + + mutex_lock(&s->master_lock); + for (i = 0; i < PENDING_HASH_SIZE; i++) + if (!hlist_empty(&s->pending_hash[i])) { + mutex_unlock(&s->master_lock); + msleep(1); + goto poll_for_ios; + } + mutex_unlock(&s->master_lock); + + flush_workqueue(s->wq); + + mutex_lock(&s->master_lock); + s->store->commit(s); + s->store->exit_exception_store(s); + list_del(&s->list_all); + mutex_unlock(&s->master_lock); + + flush_workqueue(s->wq); + + mutex_unlock(&all_multisnapshots_lock); + + dm_kcopyd_client_destroy(s->kcopyd); + s->kcopyd = NULL; + mempool_destroy(s->pending_pool); + s->pending_pool = NULL; + destroy_workqueue(s->wq); + s->wq = NULL; + dm_put_device(ti, s->snapshot); + s->snapshot = NULL; + dm_put_device(ti, s->origin); + s->origin = NULL; + dm_multisnap_put_exception_store(s->store); + kfree(s); +} + +static int multisnap_origin_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) +{ + struct dm_multisnap *s = ti->private; + + /* do the most common case quickly */ + if (likely(bio_rw(bio) != WRITE)) { + bio->bi_bdev = s->origin->bdev; + return DM_MAPIO_REMAPPED; + } + + /* abuse bi_phys_segments field */ + bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio->bi_phys_segments = SNAPID_T_ORIGIN; + + dm_multisnap_enqueue_bio(s, bio); + wakeup_kmultisnapd(s); + + return DM_MAPIO_SUBMITTED; +} + +static int multisnap_origin_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct dm_multisnap *s = ti->private; + int r; + + mutex_lock(&all_multisnapshots_lock); + mutex_lock(&s->master_lock); + + if (argc == 1 && !strnicmp(argv[0], MESG_STR("create"))) { + drain_kcopyd_jobs(s); + + if ((r = dm_multisnap_has_error(s))) + goto unlock_ret; + + r = 
s->store->create_snapshot(s, &s->last_snapid); + if (r) + goto unlock_ret; + + r = dm_multisnap_has_error(s); + goto unlock_ret; + } + if (argc == 2 && !strnicmp(argv[0], MESG_STR("delete"))) { + char *snapid_end; + unsigned long snapid; + struct dm_multisnap_snap *sn; + struct bio *bio, *next; + + drain_kcopyd_jobs(s); + + snapid = simple_strtoul(argv[1], &snapid_end, 10); + if (!*argv[1] || *snapid_end || snapid == SNAPID_T_ORIGIN) { + DMWARN("invalid snapshot id."); + r = -EINVAL; + goto unlock_ret; + } + + if (!s->store->delete_snapshot) { + DMERR("snapshot store doesn't support delete"); + r = -EOPNOTSUPP; + goto unlock_ret; + } + + if ((r = dm_multisnap_has_error(s))) + goto unlock_ret; + + /* Kick off possibly attached snapshot */ + list_for_each_entry(sn, &s->all_snaps, list_snaps) { + if (sn->snapid == snapid) { + spin_lock(&dm_multisnap_bio_list_lock); + sn->s = NULL; + spin_unlock(&dm_multisnap_bio_list_lock); + } + } + + /* Terminate bios queued for this snapshot so far */ + spin_lock(&dm_multisnap_bio_list_lock); + bio = bio_list_get(&s->bios); + spin_unlock(&dm_multisnap_bio_list_lock); + for (; bio; bio = next) { + next = bio->bi_next; + bio->bi_next = NULL; + if (bio->bi_phys_segments == snapid) + bio_endio(bio, -EIO); + else + dm_multisnap_enqueue_bio(s, bio); + } + + if (!s->store->snapshot_exists(s, snapid)) { + DMWARN("snapshot with this id doesn't exists."); + r = -EINVAL; + goto unlock_ret; + } + + r = s->store->delete_snapshot(s, snapid); + if (r) + goto unlock_ret; + + r = dm_multisnap_has_error(s); + goto unlock_ret; + } + + DMWARN("unrecognised message received."); + r = -EINVAL; + +unlock_ret: + mutex_unlock(&s->master_lock); + mutex_unlock(&all_multisnapshots_lock); + + return r; +} + +static int multisnap_origin_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) +{ + struct dm_multisnap *s = ti->private; + + mutex_lock(&s->master_lock); + + switch (type) { + case STATUSTYPE_INFO: + /* metadata/data/total */ 
+		snprintf(result, maxlen, "%d %u", -dm_multisnap_has_error(s), s->last_snapid);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		s->store->status_info(s, result, maxlen);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %s %s %u", s->store->name, s->origin->name, s->snapshot->name, s->chunk_size / 512);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		if (s->store->status_table)
+			s->store->status_table(s, result, maxlen);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	}
+
+	mutex_unlock(&s->master_lock);
+
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+/*
+ * Constructor of the "multisnap-snap" target.
+ *
+ * Table line: <origin dev> <snapshot id>.
+ *
+ * The origin device is opened only to find the multisnapshot origin target
+ * that is loaded on the same block device; the reference is dropped again
+ * before returning, on success as well as on every error path. On success
+ * a dm_multisnap_snap is linked into the origin's all_snaps list and stored
+ * in ti->private.
+ *
+ * Returns 0 on success or a negative error code with ti->error set.
+ */
+static int multisnap_snap_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	const char *origin_path;
+	char *snapid_str;
+	unsigned long snapid;
+
+	struct dm_dev *origin;
+
+	struct dm_multisnap *s;
+	struct dm_multisnap_snap *sn;
+
+	if (argc != 2) {
+		ti->error = "Requires exactly 2 arguments";
+		r = -EINVAL;
+		goto bad_arguments;
+	}
+
+	origin_path = argv[0];
+	snapid_str = argv[1];
+	snapid = simple_strtoul(snapid_str, &snapid_str, 10);
+	/*
+	 * Same validation as the "delete" message: reject an empty string,
+	 * trailing garbage and the id reserved for the origin. Bios tagged
+	 * with SNAPID_T_ORIGIN are processed as origin writes.
+	 */
+	if (!*argv[1] || *snapid_str || snapid == SNAPID_T_ORIGIN) {
+		ti->error = "Invalid snapshot id";
+		r = -EINVAL;
+		goto bad_arguments;
+	}
+
+	r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+	mutex_lock(&all_multisnapshots_lock);
+	s = find_multisnapshot(origin->bdev);
+	if (!s) {
+		r = -ENXIO;
+		ti->error = "Origin target not loaded";
+		goto origin_not_loaded;
+	}
+
+	mutex_lock(&s->master_lock);
+	if (!s->store->snapshot_exists(s, snapid)) {
+		mutex_unlock(&s->master_lock);
+		r = -ENOENT;
+		ti->error = "Snapshot with this id doesn't exist";
+		goto snapid_doesnt_exist;
+	}
+	mutex_unlock(&s->master_lock);
+
+	sn = kmalloc(sizeof(*sn), GFP_KERNEL);
+	if (!sn) {
+		ti->error = "Could not allocate multisnapshot_snap structure";
+		r = -ENOMEM;
+		goto cant_allocate;
+	}
+	sn->s = s;
+	sn->snapid = snapid;
+	list_add(&sn->list_snaps, &s->all_snaps);
+	strlcpy(sn->origin_name, origin->name, sizeof sn->origin_name);
+
+	/*
+	 * Read s->chunk_size while all_multisnapshots_lock is still held:
+	 * multisnap_origin_dtr takes the same lock before it tears the
+	 * origin down and frees it.
+	 */
+	ti->private = sn;
+	ti->split_io = s->chunk_size >> SECTOR_SHIFT;
+
+	mutex_unlock(&all_multisnapshots_lock);
+
+	dm_put_device(ti, origin);
+
+	return 0;
+
+	/*
+	 * Every failure after dm_get_device succeeded must drop the device
+	 * reference, including the case where no origin target was found.
+	 */
+cant_allocate:
+snapid_doesnt_exist:
+origin_not_loaded:
+	mutex_unlock(&all_multisnapshots_lock);
+	dm_put_device(ti, origin);
+bad_origin:
+bad_arguments:
+	return r;
+}
+
+/*
+ * Destructor of the "multisnap-snap" target: unlink the snapshot from the
+ * list it was added to in the constructor and free it.
+ */
+static void multisnap_snap_dtr(struct dm_target *ti)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	list_del(&sn->list_snaps);
+	kfree(sn);
+
+	mutex_unlock(&all_multisnapshots_lock);
+}
+
+/*
+ * Map function of the "multisnap-snap" target.
+ *
+ * The bio is tagged with the snapshot id and queued on the origin's bio
+ * list for kmultisnapd. sn->s is cleared when the origin target is
+ * destroyed or when this snapshot is deleted; in that case the bio is
+ * failed with -EIO.
+ */
+static int multisnap_snap_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+	struct dm_multisnap *s;
+
+	/* abuse bi_phys_segments field */
+	bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+	bio->bi_phys_segments = sn->snapid;
+
+	spin_lock(&dm_multisnap_bio_list_lock);
+	s = sn->s;
+	if (!s) {
+		spin_unlock(&dm_multisnap_bio_list_lock);
+		return -EIO;
+	}
+	dm_multisnap_enqueue_bio_unlocked(s, bio);
+	spin_unlock(&dm_multisnap_bio_list_lock);
+
+	wakeup_kmultisnapd(s);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Status function of the "multisnap-snap" target.
+ *
+ * STATUSTYPE_INFO yields an empty string; STATUSTYPE_TABLE yields the
+ * origin name and the snapshot id, i.e. the constructor arguments.
+ * Returns non-zero when the buffer was too small.
+ */
+static int multisnap_snap_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+	switch (type) {
+	case STATUSTYPE_INFO:
+		/* metadata/data/total */
+		result[0] = 0;
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %u", sn->origin_name, sn->snapid);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	}
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+static struct target_type multisnap_origin_target = {
+	.name = "multisnapshot",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = 
multisnap_origin_ctr, + .dtr = multisnap_origin_dtr, + .map = multisnap_origin_map, + .message = multisnap_origin_message, + .status = multisnap_origin_status, +}; + +static struct target_type multisnap_snap_target = { + .name = "multisnap-snap", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = multisnap_snap_ctr, + .dtr = multisnap_snap_dtr, + .map = multisnap_snap_map, + .status = multisnap_snap_status, +}; + +static int __init dm_multisnapshot_init(void) +{ + int r; + + dm_multisnap_pending_exception_cache = kmem_cache_create( + "pending_cache", sizeof(struct dm_multisnap_pending_exception), + __alignof__(struct dm_multisnap_pending_exception), + 0, dm_multisnap_pending_exception_ctor); + if (!dm_multisnap_pending_exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad_exception_cache; + } + + r = dm_register_target(&multisnap_origin_target); + if (r < 0) { + DMERR("multisnap_origin_target target register failed %d", r); + goto bad_multisnap_origin_target; + } + + r = dm_register_target(&multisnap_snap_target); + if (r < 0) { + DMERR("multisnap_snap_target target register failed %d", r); + goto bad_multisnap_snap_target; + } + + return 0; + +bad_multisnap_snap_target: + dm_unregister_target(&multisnap_origin_target); +bad_multisnap_origin_target: + kmem_cache_destroy(dm_multisnap_pending_exception_cache); +bad_exception_cache: + return r; +} + +static void __exit dm_multisnapshot_exit(void) +{ + dm_unregister_target(&multisnap_origin_target); + dm_unregister_target(&multisnap_snap_target); + kmem_cache_destroy(dm_multisnap_pending_exception_cache); +} + +/* Module hooks */ +module_init(dm_multisnapshot_init); +module_exit(dm_multisnapshot_exit); + +MODULE_DESCRIPTION(DM_NAME " multisnapshot target"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_LICENSE("GPL"); Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas.c =================================================================== --- /dev/null 1970-01-01 
00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas.c 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap-mikulas.h" + +static void load_commit_block(struct dm_multisnap *s) +{ + struct dm_buffer *bp; + struct multisnap_commit_block *cb; + __u64 dev_size; + int bitmap_depth; + unsigned i; + + cb = dm_bufio_read(s->p->bufio, s->p->valid_commit_block, &bp); + if (IS_ERR(cb)) { + DMERR("load_commit_block: can't re-read commit block %Lx", (unsigned long long)s->p->valid_commit_block); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + if (cb->signature != CB_SIGNATURE) { + dm_bufio_release(bp); + DMERR("load_commit_block: bad signature when re-reading commit block %Lx", (unsigned long long)s->p->valid_commit_block); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + dev_size = read_48(cb, dev_size); + s->p->snapshot_num = le32_to_cpu(cb->snapshot_num); + s->p->total_allocated = read_48(cb, total_allocated); + s->p->data_allocated = read_48(cb, data_allocated); + s->p->bitmap_root = read_48(cb, bitmap_root); + s->p->alloc_rover = read_48(cb, alloc_rover); + s->p->freelist_ptr = read_48(cb, freelist); + s->p->delete_rover = read_48(cb, delete_rover); + s->p->bt_root = read_48(cb, bt_root); + s->p->bt_depth = cb->bt_depth; + s->p->flags = cb->flags; + + if (s->p->bt_depth > MAX_BT_DEPTH || !s->p->bt_depth) { + dm_bufio_release(bp); + DMERR("load_commit_block: invalid b+-tree depth in commit block %Lx", (unsigned long long)s->p->valid_commit_block); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + for (i = 0; i < TMP_REMAP_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->p->tmp_remap[i]); + s->p->n_used_tmp_remaps = 0; + INIT_LIST_HEAD(&s->p->used_bitmap_tmp_remaps); + INIT_LIST_HEAD(&s->p->used_bt_tmp_remaps); + INIT_LIST_HEAD(&s->p->free_tmp_remaps); + + for (i = 0; i < N_REMAPS; 
i++) { + struct tmp_remap *t = &s->p->tmp_remap_store[i]; + if (read_48(&cb->tmp_remap[i], old)) { + t->old = read_48(&cb->tmp_remap[i], old); + t->new = read_48(&cb->tmp_remap[i], new); + t->uncommitted = 0; + t->bitmap_idx = le32_to_cpu(cb->tmp_remap[i].bitmap_idx); + hlist_add_head(&t->hash_list, &s->p->tmp_remap[TMP_REMAP_HASH(t->old)]); + if (t->bitmap_idx == CB_BITMAP_IDX_NONE) + list_add(&t->list, &s->p->used_bt_tmp_remaps); + else + list_add(&t->list, &s->p->used_bitmap_tmp_remaps); + s->p->n_used_tmp_remaps++; + } else { + list_add(&t->list, &s->p->free_tmp_remaps); + } + } + + dm_bufio_release(bp); + + if ((dev_size + CB_STRIDE) != (chunk_t)(dev_size + CB_STRIDE)) { + DMERR("load_commit_block: device is too large. Compile kernel with 64-bit sector numbers"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + bitmap_depth = dm_multisnap_bitmap_depth(s->chunk_size, dev_size); + if (bitmap_depth < 0) { + DMERR("load_commit_block: device is too large"); + dm_multisnap_set_error(s, bitmap_depth); + return; + } + s->p->dev_size = dev_size; + s->p->bitmap_depth = bitmap_depth; + s->p->btree_entries = dm_multisnap_btree_entries(s->chunk_size); + + dm_multisnap_load_freelist(s); +} + +static void find_commit_block(struct dm_multisnap *s) +{ + struct dm_buffer *bp; + struct multisnap_commit_block *cb; + chunk_t cb_addr = s->p->sb_commit_block; + __u64 sequence; + __u64 dev_size; + s->p->valid_commit_block = 0; + s->p->commit_sequence = 0; + +try_next: + cb = dm_bufio_read(s->p->bufio, cb_addr, &bp); + if (IS_ERR(cb)) { + DMERR("find_commit_block: can't read commit block %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + if (cb->signature != CB_SIGNATURE) { + dm_bufio_release(bp); + DMERR("find_commit_block: bad signature on commit block %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + sequence = le64_to_cpu(cb->sequence); + dev_size = read_48(cb, dev_size); + + 
dm_bufio_release(bp); + + if (sequence > s->p->commit_sequence) { + s->p->commit_sequence = sequence; + s->p->valid_commit_block = cb_addr; + if ((__u64)cb_addr + CB_STRIDE < dev_size) { + cb_addr += CB_STRIDE; + goto try_next; + } + } + if (!s->p->valid_commit_block) { + DMERR("find_commit_block: no valid commit block"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } +} + +static void initialize_device(struct dm_multisnap *s) +{ + int r; + struct dm_buffer *bp; + struct multisnap_superblock *sb; + struct multisnap_commit_block *cb; + chunk_t cb_block; + chunk_t block_to_write; + __u64 dev_size; + + dev_size = i_size_read(s->snapshot->bdev->bd_inode) >> s->chunk_shift; + if ((dev_size + CB_STRIDE) != (chunk_t)(dev_size + CB_STRIDE)) { + DMERR("initialize_device: device is too large. Compile kernel with 64-bit sector numbers"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + s->p->dev_size = dev_size; + s->p->total_allocated = 0; + s->p->data_allocated = 0; + + block_to_write = SB_BLOCK + 1; + +/* Write btree */ + dm_multisnap_create_btree(s, &block_to_write); + if (dm_multisnap_has_error(s)) + return; + +/* Write bitmaps */ + dm_multisnap_create_bitmaps(s, block_to_write); + if (dm_multisnap_has_error(s)) + return; + +/* Write commit blocks */ + if (FIRST_CB_BLOCK >= dev_size) { + DMERR("initialize_device: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + for (cb_block = FIRST_CB_BLOCK; cb_block < s->p->dev_size; cb_block += CB_STRIDE) { + cb = dm_bufio_new(s->p->bufio, cb_block, &bp); + if (IS_ERR(cb)) { + DMERR("initialize_device: can't allocate commit block at %Lx", (unsigned long long)cb_block); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + memset(cb, 0, s->chunk_size); + cb->signature = CB_SIGNATURE; + cb->sequence = cpu_to_le64(cb_block == FIRST_CB_BLOCK); + if (cb_block == FIRST_CB_BLOCK) { + cb->snapshot_num = cpu_to_le32(0); + write_48(cb, dev_size, s->p->dev_size); + write_48(cb, total_allocated, 
s->p->total_allocated); + write_48(cb, data_allocated, s->p->data_allocated); + write_48(cb, bitmap_root, s->p->bitmap_root); + write_48(cb, freelist, 0); + write_48(cb, delete_rover, 0); + write_48(cb, bt_root, s->p->bt_root); + cb->bt_depth = s->p->bt_depth; + cb->flags = 0; + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + r = dm_bufio_write_dirty_buffers(s->p->bufio); + if (r) { + DMERR("initialize_device: write error when initializing device"); + dm_multisnap_set_error(s, r); + return; + } + +/* Write super block */ + sb = dm_bufio_new(s->p->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + DMERR("initialize_device: can't allocate super block"); + dm_multisnap_set_error(s, PTR_ERR(sb)); + return; + } + memset(sb, 0, s->chunk_size); + sb->signature = SB_SIGNATURE; + sb->chunk_size = cpu_to_le32(s->chunk_size); + sb->commit_block = cpu_to_le64(FIRST_CB_BLOCK); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->p->bufio); + if (r) { + DMERR("initialize_device: can't write super block"); + dm_multisnap_set_error(s, r); + return; + } +} + +static int read_super(struct dm_multisnap *s, char **error) +{ + struct dm_buffer *bp; + struct multisnap_superblock *sb; + int initialized; + + initialized = 0; +re_read: + sb = dm_bufio_read(s->p->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + *error = "Could not read superblock"; + return PTR_ERR(sb); + } + + if (sb->signature != SB_SIGNATURE) { + int i; + if (initialized) { + *error = "Invalid signature after initialization"; + return -EIO; + } + for (i = 0; i < 1 << SECTOR_SHIFT; i++) { + if (((char *)sb)[i]) { + dm_bufio_release(bp); + *error = "Uninitialized device"; + return -ENXIO; + } + } + dm_bufio_release(bp); + initialize_device(s); + if (dm_multisnap_has_error(s)) { + *error = "Can't initialize device"; + return dm_multisnap_has_error(s); + } + initialized = 1; + goto re_read; + } + if (le32_to_cpu(sb->chunk_size) != s->chunk_size) { + dm_bufio_release(bp); + 
*error = "Bad chunk size"; + return -EINVAL; + } + s->p->sb_commit_block = le64_to_cpu(sb->commit_block); + dm_bufio_release(bp); + + find_commit_block(s); + + if (dm_multisnap_has_error(s)) { + *error = "Unable to find commit block"; + return dm_multisnap_has_error(s); + } + + load_commit_block(s); + + if (dm_multisnap_has_error(s)) { + *error = "Unable to load commit block"; + return dm_multisnap_has_error(s); + } + + dm_multisnap_read_snapshots(s); + if (dm_multisnap_has_error(s)) { + *error = "Could not read snapshot list"; + return dm_multisnap_has_error(s); + } + + return 0; +} + +static void dm_multisnap_status_table(struct dm_multisnap *s, char *result, unsigned maxlen) +{ +} + +/*#define PRINT_BTREE*/ + +#ifdef PRINT_BTREE +static int print_btree_callback(struct dm_multisnap *s, struct dm_multisnap_bt_entry *bt, void *cookie) +{ + printk("entry: %Lx, %x-%x -> %Lx\n", (unsigned long long)read_48(bt, orig_chunk), le32_to_cpu(bt->snap_from), le32_to_cpu(bt->snap_to), (unsigned long long)read_48(bt, new_chunk)); + return 0; +} + +static void print_btree(struct dm_multisnap *s) +{ + struct bt_key key = { 0, 0, 0 }; + int r = dm_multisnap_list_btree(s, &key, print_btree_callback, NULL); + printk("list ended: %d\n", r); +} +#endif + +/*#define PRINT_BITMAPS*/ + +#ifdef PRINT_BITMAPS +static void print_bitmaps(struct dm_multisnap *s) +{ + chunk_t c; + printk("allocated:"); + for (c = 0; c < s->p->dev_size; c += s->chunk_size * 8) { + struct dm_buffer *bp; + unsigned i; + void *bmp = dm_multisnap_map_bitmap(s, c >> (s->chunk_shift + 3), &bp, NULL, NULL); + if (!bmp) + continue; + for (i = 0; i < s->chunk_size * 8; i++) if (generic_test_le_bit(i, bmp)) { + chunk_t block = c + i; + if (!dm_multisnap_is_cb_block(s, block)) + printk(" %Lx", (unsigned long long)block); + cond_resched(); + } + + dm_bufio_release(bp); + } + printk("\n"); +} +#endif + +static int dm_multisnap_mikulas_init(struct dm_multisnap *s, unsigned argc, char **argv, char **error) +{ + int r; + + if 
(argc != 0) { + *error = "Bad number of arguments"; + r = -EINVAL; + goto bad_arguments; + } + + s->p = kmalloc(sizeof(struct exception_store_private), GFP_KERNEL); + if (!s->p) { + *error = "Can't allocate private exception store structure"; + r = -ENOMEM; + goto bad_private; + } + + s->p->active_snapshots = RB_ROOT; + s->p->n_preallocated_blocks = 0; + s->p->query_active = 0; + + s->p->delete_work.work = dm_multisnap_background_delete; + s->p->delete_work.queued = 0; + s->p->delete_commit_count = 0; + + s->p->tmp_chunk = vmalloc(s->chunk_size + sizeof(struct dm_multisnap_bt_entry)); + if (!s->p->tmp_chunk) { + *error = "Can't allocate temporary chunk"; + r = -ENOMEM; + goto bad_tmp_chunk; + } + + s->p->freelist = vmalloc(s->chunk_size); + if (!s->p->freelist) { + *error = "Can't allocate freelist"; + r = -ENOMEM; + goto bad_freelist; + } + + s->p->bufio = dm_bufio_client_create(s->snapshot->bdev, s->chunk_size); + if (IS_ERR(s->p->bufio)) { + *error = "Can't create bufio client"; + r = PTR_ERR(s->p->bufio); + goto bad_bufio; + } + + r = read_super(s, error); + if (r) + goto bad_super; + + if (s->p->flags & (MULTISNAP_FLAG_DELETING | MULTISNAP_FLAG_PENDING_DELETE)) + dm_multisnap_queue_work(s, &s->p->delete_work); + +#ifdef PRINT_BTREE + print_btree(s); +#endif +#ifdef PRINT_BITMAPS + print_bitmaps(s); +#endif + + return 0; + +bad_super: + dm_bufio_client_destroy(s->p->bufio); +bad_bufio: + vfree(s->p->freelist); +bad_freelist: + vfree(s->p->tmp_chunk); +bad_tmp_chunk: + kfree(s->p); +bad_private: +bad_arguments: + return r; +} + +static void dm_multisnap_mikulas_exit(struct dm_multisnap *s) +{ + int i; + + dm_multisnap_cancel_work(s, &s->p->delete_work); + + i = 0; + while (!list_empty(&s->p->used_bitmap_tmp_remaps)) { + struct tmp_remap *t = list_first_entry(&s->p->used_bitmap_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + hlist_del(&t->hash_list); + i++; + } + + while (!list_empty(&s->p->used_bt_tmp_remaps)) { + struct tmp_remap *t = 
list_first_entry(&s->p->used_bt_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + hlist_del(&t->hash_list); + i++; + } + + BUG_ON(i != s->p->n_used_tmp_remaps); + while (!list_empty(&s->p->free_tmp_remaps)) { + struct tmp_remap *t = list_first_entry(&s->p->free_tmp_remaps, struct tmp_remap, list); + list_del(&t->list); + i++; + } + BUG_ON(i != N_REMAPS); + + for (i = 0; i < TMP_REMAP_HASH_SIZE; i++) + BUG_ON(!hlist_empty(&s->p->tmp_remap[i])); + + dm_bufio_client_destroy(s->p->bufio); + vfree(s->p->freelist); + vfree(s->p->tmp_chunk); + kfree(s->p); + s->p = NULL; /* catch bugs if someone attempts to reuse it */ +} + +struct dm_multisnap_exception_store dm_multisnap_mikulas_store = { + .name = "mikulas", + .module = THIS_MODULE, + .init_exception_store = dm_multisnap_mikulas_init, + .exit_exception_store = dm_multisnap_mikulas_exit, + .status_info = dm_multisnap_status_info, + .status_table = dm_multisnap_status_table, + .create_snapshot = dm_multisnap_create_snapshot, + .delete_snapshot = dm_multisnap_delete_snapshot, + .snapshot_exists = dm_multisnap_snapshot_exists, + .find_snapshot_chunk = dm_multisnap_find_snapshot_chunk, + .reset_query = dm_multisnap_reset_query, + .query_next_remap = dm_multisnap_query_next_remap, + .add_next_remap = dm_multisnap_add_next_remap, + .make_chunk_writeable = dm_multisnap_make_chunk_writeable, + .check_conflict = dm_multisnap_check_conflict, + .commit = dm_multisnap_commit, +}; + +static int __init dm_multisnapshot_mikulas_module_init(void) +{ + BUG_ON(sizeof(struct multisnap_commit_block) != 512); + return dm_multisnap_register_exception_store(&dm_multisnap_mikulas_store); +} + +static void __exit dm_multisnapshot_mikulas_module_exit(void) +{ + dm_multisnap_unregister_exception_store(&dm_multisnap_mikulas_store); +} + +module_init(dm_multisnapshot_mikulas_module_init); +module_exit(dm_multisnapshot_mikulas_module_exit); + +MODULE_DESCRIPTION(DM_NAME " multisnapshot Mikulas' exceptions store"); +MODULE_AUTHOR("Mikulas 
Patocka"); +MODULE_LICENSE("GPL"); + Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap.h 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,191 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#ifndef DM_MULTISNAP_H +#define DM_MULTISNAP_H + +#include +#include +#include +#include +#include +#include + +#define EFSERROR EPERM + +#define DM_MSG_PREFIX "multisnapshot" + +#define PENDING_HASH_SIZE 256 +#define PENDING_HASH(c) ((c) & (PENDING_HASH_SIZE - 1)) + +#define MULTISNAP_KCOPYD_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1) + +#define MAX_CHUNKS_TO_REMAP DM_KCOPYD_MAX_REGIONS + +#define PENDING_MEMPOOL_SIZE 256 + +#define SNAPID_T_ORIGIN 0xffffffff +#define SNAPID_T_SENTINEL 0xffffffff +#define SNAPID_T_MAX 0xfffffffe + +typedef sector_t chunk_t; +typedef __u32 snapid_t; + +struct dm_multisnap { + struct exception_store_private *p; + struct dm_multisnap_exception_store *store; + + struct dm_dev *origin; + struct dm_dev *snapshot; + + int error; + + unsigned chunk_size; + unsigned chunk_shift; + + struct mutex master_lock; + struct workqueue_struct *wq; + struct work_struct work; + struct bio_list bios; + struct list_head background_works; + + mempool_t *pending_pool; + + struct dm_kcopyd_client *kcopyd; + atomic_t n_kcopyd_jobs; + + /* This may only be accessed from kcopyd callback, it has no locking */ + struct list_head pes_waiting_for_commit; + + /* List head for struct dm_multisnap_pending_exception->hash_list */ + struct hlist_head pending_hash[PENDING_HASH_SIZE]; + + int pending_mempool_allocation_failed; + + /* The last created snapshot id */ + snapid_t last_snapid; + + /* List head for struct dm_multisnap_snap->list_snaps */ + struct list_head all_snaps; + + /* List entry for all_multisnapshots */ + struct 
list_head list_all; + +}; + +struct dm_multisnap_snap { + struct dm_multisnap *s; /* multisnapshot this snapshot is attached to; multisnap_snap_map fails the bio with -EIO when this is NULL */ + snapid_t snapid; /* snapshot id given as the second constructor argument */ + /* List entry for struct dm_multisnap->all_snaps */ + struct list_head list_snaps; + char origin_name[16]; /* origin->name copied with strlcpy by the constructor, echoed back by STATUSTYPE_TABLE */ +}; + +union chunk_descriptor { + __u64 bitmask; + struct { + snapid_t from; + snapid_t to; + } range; +}; + +struct dm_multisnap_pending_exception { + /* List entry for struct dm_multisnap->pending_hash */ + struct hlist_node hash_list; + + struct dm_multisnap *s; + struct bio_list bios; + + chunk_t chunk; + + int n_descs; + union chunk_descriptor desc[MAX_CHUNKS_TO_REMAP]; + + /* List entry for struct dm_multisnap->pes_waiting_for_commit */ + struct list_head list; +}; + +struct dm_multisnap_background_work { + struct list_head list; + void (*work)(struct dm_multisnap *, struct dm_multisnap_background_work *); + int queued; +}; + +struct dm_multisnap_exception_store { + struct list_head list; + struct module *module; + const char *name; + + /* < 0 - error */ + int (*init_exception_store)(struct dm_multisnap *s, unsigned argc, char **argv, char **error); + + void (*exit_exception_store)(struct dm_multisnap *s); + void (*status_info)(struct dm_multisnap *s, char *result, unsigned maxlen); + void (*status_table)(struct dm_multisnap *s, char *result, unsigned maxlen); + /* < 0 - error */ + int (*create_snapshot)(struct dm_multisnap *s, snapid_t *snapid); + + /* < 0 - error */ + int (*delete_snapshot)(struct dm_multisnap *s, snapid_t snapid); + + /* 0 - doesn't exist, 1 - exists */ + int (*snapshot_exists)(struct dm_multisnap *s, snapid_t snapid); + + /* 0 - not found, 1 - found (read-only), 2 - found (writeable), < 0 - error */ + int (*find_snapshot_chunk)(struct dm_multisnap *s, snapid_t id, chunk_t chunk, chunk_t *result); + + /* + * Chunk interface between exception store and generic code. + * Allowed sequences: + * + * - first call reset_query + * then repeatedly query next exception to make with query_next_remap + * and add it to btree with add_next_remap.
This can be repeated until + * query_next_remap indicates that it has nothing more or until all 8 + * kcopyd slots are filled. + * + * - call find_snapshot_chunk, if it returns 0, you can call + * add_next_remap to add the chunk to the btree. + * + * - call find_snapshot_chunk, if it returns 1 (shared chunk), call + * make_chunk_writeable to relocate that chunk. + */ + + void (*reset_query)(struct dm_multisnap *s); + int (*query_next_remap)(struct dm_multisnap *s, chunk_t chunk); + void (*add_next_remap)(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk); + void (*make_chunk_writeable)(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk); + int (*check_conflict)(struct dm_multisnap *s, union chunk_descriptor *cd, snapid_t snapid); + + void (*commit)(struct dm_multisnap *s); +}; + +static inline void dm_multisnap_adjust_string(char **result, unsigned *maxlen) +{ + unsigned len = strlen(*result); + *result += len; + *maxlen -= len; +} + +static inline int dm_multisnap_can_commit(struct dm_multisnap *s) +{ + return !atomic_read(&s->n_kcopyd_jobs); +} + +/* dm-multisnap.c */ + +void dm_multisnap_set_error(struct dm_multisnap *s, int error); +int dm_multisnap_has_error(struct dm_multisnap *s); + +void dm_multisnap_queue_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw); +void dm_multisnap_cancel_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw); + +int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store); +void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store); + +#endif Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-blocks.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-blocks.c 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. 
+ * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap-mikulas.h" + +static struct tmp_remap *find_tmp_remap(struct dm_multisnap *s, chunk_t block) +{ + struct tmp_remap *t; + struct hlist_node *hn; + unsigned hash = TMP_REMAP_HASH(block); + hlist_for_each_entry(t, hn, &s->p->tmp_remap[hash], hash_list) { + if (t->old == block) + return t; + cond_resched(); + } + return NULL; +} + +void *dm_multisnap_read_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp) +{ + void *buf; + struct tmp_remap *t; + cond_resched(); + t = find_tmp_remap(s, block); + if (t) + block = t->new; + buf = dm_bufio_read(s->p->bufio, block, bp); + if (unlikely(IS_ERR(buf))) { + DMERR("dm_multisnap_read_block: error read chunk %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +int dm_multisnap_block_is_uncommitted(struct dm_multisnap *s, chunk_t chunk) +{ + struct tmp_remap *t = find_tmp_remap(s, chunk); + return t && t->uncommitted; +} + +void *dm_multisnap_duplicate_block(struct dm_multisnap *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp, chunk_t *to_free_ptr) +{ + chunk_t to_free_val; + void *buf; + struct tmp_remap *t; + + if (!to_free_ptr) + to_free_ptr = &to_free_val; + *to_free_ptr = 0; + + t = find_tmp_remap(s, old_chunk); + if (t) { + if (unlikely(t->bitmap_idx != bitmap_idx)) { + DMERR("dm_multisnap_duplicate_block: bitmap_idx doesn't match, %X != %X", t->bitmap_idx, bitmap_idx); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + *to_free_ptr = t->new; + t->new = new_chunk; + } else { + if (unlikely(list_empty(&s->p->free_tmp_remaps))) { + DMERR("dm_multisnap_duplicate_block: all remap blocks used"); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + t = list_first_entry(&s->p->free_tmp_remaps, struct tmp_remap, list); + t->new = new_chunk; + t->old = old_chunk; + t->bitmap_idx = bitmap_idx; + 
hlist_add_head(&t->hash_list, &s->p->tmp_remap[TMP_REMAP_HASH(old_chunk)]); + s->p->n_used_tmp_remaps++; + } + list_del(&t->list); + if (bitmap_idx == CB_BITMAP_IDX_NONE) + list_add_tail(&t->list, &s->p->used_bt_tmp_remaps); + else + list_add_tail(&t->list, &s->p->used_bitmap_tmp_remaps); + t->uncommitted = 1; + dm_bufio_release_move(*bp, new_chunk); + + if (to_free_ptr == &to_free_val && to_free_val) + dm_multisnap_free_block(s, to_free_val, 0); + + buf = dm_bufio_read(s->p->bufio, new_chunk, bp); + if (IS_ERR(buf)) { + DMERR("dm_multisnap_duplicate_block: error reading chunk %Lx", (unsigned long long)new_chunk); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +void dm_multisnap_free_tmp_remap(struct dm_multisnap *s, struct tmp_remap *t) +{ + list_del(&t->list); + hlist_del(&t->hash_list); + s->p->n_used_tmp_remaps--; + list_add(&t->list, &s->p->free_tmp_remaps); +} + +void *dm_multisnap_make_block(struct dm_multisnap *s, chunk_t new_chunk, struct dm_buffer **bp) +{ + void *buf; + /* !!! 
TODO: add it to the list of recently allocated blocks */ + + buf = dm_bufio_new(s->p->bufio, new_chunk, bp); + if (unlikely(IS_ERR(buf))) { + DMERR("dm_multisnap_make_block: error creating new block at chunk %Lx", (unsigned long long)new_chunk); + dm_multisnap_set_error(s, PTR_ERR(buf)); + return NULL; + } + return buf; +} + +void dm_multisnap_free_block_and_duplicates(struct dm_multisnap *s, chunk_t chunk) +{ /* free chunk and, when a tmp_remap exists for it, also the remapped copy and the remap entry itself */ + struct tmp_remap *t = find_tmp_remap(s, chunk); + if (t) { + dm_multisnap_free_block(s, t->new, 0); + dm_multisnap_free_tmp_remap(s, t); + } + dm_multisnap_free_block(s, chunk, 0); +} + +int dm_multisnap_is_cb_block(struct dm_multisnap *s, chunk_t block) +{ /* non-zero when block is a commit block slot, i.e. FIRST_CB_BLOCK plus a multiple of CB_STRIDE (relies on sector_div yielding the remainder) */ + if (block < FIRST_CB_BLOCK) return 0; + return sector_div(block, CB_STRIDE) == FIRST_CB_BLOCK % CB_STRIDE; +} + +void dm_multisnap_init_stop_cycles(stop_cycles_t *cy) +{ /* resets only element 1 of the detector state; element 0 is left as it was */ + (*cy)[1] = 0; +} + +int dm_multisnap_stop_cycles(struct dm_multisnap *s, stop_cycles_t *cy, chunk_t key) +{ /* Returns -1 and sets -EFSERROR when key equals (*cy)[0] while (*cy)[1] is non-zero, 0 otherwise. NOTE(review): nothing in the code shown here ever stores a key into (*cy)[0] or makes (*cy)[1] non-zero, so unless callers update *cy themselves this check can never fire -- confirm that the counter/key update is not missing. */ + if (unlikely((*cy)[0] == key) && unlikely((*cy)[1] != 0)) { + DMERR("dm_multisnap_stop_cycles: cycle detected at chunk %Lx", (unsigned long long)key); + dm_multisnap_set_error(s, -EFSERROR); + return -1; + } + return 0; +} Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-btree.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-btree.c 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,674 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL.
+ */ + +#include "dm-multisnap-mikulas.h" + +static void add_at_idx(struct dm_multisnap_bt_node *node, unsigned index, struct bt_key *key, chunk_t new_chunk); + +static struct dm_multisnap_bt_node *dm_multisnap_read_btnode(struct dm_multisnap *s, int depth, chunk_t block, struct dm_buffer **bp) +{ + struct dm_multisnap_bt_node *node; + + BUG_ON((unsigned)depth >= s->p->bt_depth); + + node = dm_multisnap_read_block(s, block, bp); + if (unlikely(!node)) + return NULL; + + if (unlikely(node->signature != BT_SIGNATURE)) { + dm_bufio_release(*bp); + DMERR("dm_multisnap_read_btnode: bad signature on btree node %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + + if (unlikely((unsigned)(le32_to_cpu(node->n_entries) - 1) >= s->p->btree_entries)) { + dm_bufio_release(*bp); + DMERR("dm_multisnap_read_btnode: bad number of entries in btree node %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + + if (depth != s->p->bt_depth - 1) { + struct dm_multisnap_bt_entry *be = &node->entries[le32_to_cpu(node->n_entries) - 1]; + if (unlikely(read_48(be, orig_chunk) != CHUNK_T_SENTINEL) || + unlikely(le32_to_cpu(be->snap_from) != SNAPID_T_SENTINEL) || + unlikely(le32_to_cpu(be->snap_to) != SNAPID_T_SENTINEL)) { + dm_bufio_release(*bp); + DMERR("dm_multisnap_read_btnode: node at %Lx in depth %d doesn't have sentinel record", (unsigned long long)block, depth); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + } + + return node; +} + +void dm_multisnap_create_btree(struct dm_multisnap *s, chunk_t *writing_block) +{ + struct dm_buffer *bp; + struct dm_multisnap_bt_node *node; + struct bt_key new_key; + + while (dm_multisnap_is_cb_block(s, *writing_block)) + (*writing_block)++; + + if (*writing_block >= s->p->dev_size) { + DMERR("dm_multisnap_create_btree: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + + node = dm_bufio_new(s->p->bufio, *writing_block, &bp); + if 
(IS_ERR(node)) { + DMERR("dm_multisnap_create_btree: 't create direct bitmap block at %Lx", (unsigned long long)*writing_block); + dm_multisnap_set_error(s, PTR_ERR(node)); + return; + } + memset(node, 0, s->chunk_size); + node->signature = BT_SIGNATURE; + node->n_entries = cpu_to_le32(0); + + /* + * A btree node must have at least one entry --- so create this empty + * one + */ + new_key.snap_from = new_key.snap_to = SNAPID_T_SENTINEL; + new_key.chunk = CHUNK_T_SENTINEL; + add_at_idx(node, 0, &new_key, 0); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + s->p->bt_root = *writing_block; + s->p->bt_depth = 1; + (*writing_block)++; +} + +static int compare_key(struct dm_multisnap_bt_entry *e, struct bt_key *key) +{ + chunk_t orig_chunk = read_48(e, orig_chunk); + if (orig_chunk < key->chunk) + return -1; + if (orig_chunk > key->chunk) + return 1; + + if (le32_to_cpu(e->snap_to) < key->snap_from) + return -1; + if (le32_to_cpu(e->snap_from) > key->snap_to) + return 1; + + return 0; +} + +/* + * Returns: 1 - found, 0 - not found + * *result - if found, then the first entry in the requested range + * - if not found, then the first entry after the requested range + */ + +static int binary_search(struct dm_multisnap_bt_node *node, struct bt_key *key, unsigned *result) +{ + int c; + int first = 0; + int last = le32_to_cpu(node->n_entries) - 1; + + while (1) { + int middle = (first + last) >> 1; + struct dm_multisnap_bt_entry *e = &node->entries[middle]; + + c = compare_key(e, key); + + if (first == last) + break; + + if (c < 0) + first = middle + 1; + else + last = middle; + } + + *result = first; + return !c; +} + +/* + * Returns: 1 - found, 0 - not found, -1 - error + * In case of not error (0 or 1 is returned), the node and held buffer for + * this node is returned. 
+ */ + +static int walk_btree(struct dm_multisnap *s, struct bt_key *key, struct dm_multisnap_bt_node **nodep, struct dm_buffer **bp, struct path_element path[MAX_BT_DEPTH]) +{ +#define node (*nodep) + int r; + chunk_t block = s->p->bt_root; + unsigned d = 0; + while (1) { + path[d].block = block; + node = dm_multisnap_read_btnode(s, d, block, bp); + if (!node) + return -1; + path[d].n_entries = le32_to_cpu(node->n_entries); + r = binary_search(node, key, &path[d].idx); + block = read_48(&node->entries[path[d].idx], new_chunk); + if (++d == s->p->bt_depth) + break; + dm_bufio_release(*bp); + } + if (unlikely(compare_key(&node->entries[path[s->p->bt_depth - 1].idx], key) < 0)) + path[s->p->bt_depth - 1].idx++; + return r; +#undef node +} + +/* + * Returns: 1 - found, 0 - not found, -1 - error + * In case the node is found, key contains updated key and result contains + * the resulting chunk. + */ + +int dm_multisnap_find_in_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t *result) +{ + struct dm_multisnap_bt_node *node; + struct path_element path[MAX_BT_DEPTH]; + struct dm_buffer *bp; + + int r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r < 0)) + return r; + + if (r) { + struct dm_multisnap_bt_entry *entry = &node->entries[path[s->p->bt_depth - 1].idx]; + *result = read_48(entry, new_chunk); + key->chunk = read_48(entry, orig_chunk); + key->snap_from = le32_to_cpu(entry->snap_from); + key->snap_to = le32_to_cpu(entry->snap_to); + } + dm_bufio_release(bp); + + return r; +} + +int dm_multisnap_list_btree(struct dm_multisnap *s, struct bt_key *key, int (*call)(struct dm_multisnap *, struct dm_multisnap_bt_entry *, void *), void *cookie) +{ + struct dm_multisnap_bt_node *node; + struct path_element path[MAX_BT_DEPTH]; + struct dm_buffer *bp; + int depth; + int i; + int r; + + r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r < 0)) + return r; + +list_next_node: + for (i = path[s->p->bt_depth - 1].idx; i < le32_to_cpu(node->n_entries); i++) 
{ + cond_resched(); + r = call(s, &node->entries[i], cookie); + if (unlikely(r)) { + dm_bufio_release(bp); + return r; + } + } + dm_bufio_release(bp); + + for (depth = s->p->bt_depth - 2; depth >= 0; depth--) { + int idx; + node = dm_multisnap_read_btnode(s, depth, path[depth].block, &bp); + if (!node) + return -1; + idx = path[depth].idx + 1; + if (idx < le32_to_cpu(node->n_entries)) { + r = compare_key(&node->entries[idx], key); + if (unlikely(r <= 0)) { + DMERR("dm_multisnap_list_btree: non-monotonic btree: node %Lx, index %x", (unsigned long long)path[depth].block, idx); + dm_bufio_release(bp); + dm_multisnap_set_error(s, -EFSERROR); + return 0; + } + path[depth].idx = idx; + do { + depth++; + path[depth].block = read_48(&node->entries[path[depth - 1].idx], new_chunk); + path[depth].idx = 0; + dm_bufio_release(bp); + node = dm_multisnap_read_btnode(s, depth, path[depth].block, &bp); + if (!node) + return -1; + path[depth].n_entries = le32_to_cpu(node->n_entries); + } while (depth < s->p->bt_depth - 1); + goto list_next_node; + } + dm_bufio_release(bp); + } + + return 0; +} + +static void add_at_idx(struct dm_multisnap_bt_node *node, unsigned index, struct bt_key *key, chunk_t new_chunk) +{ + memmove(&node->entries[index + 1], &node->entries[index], (le32_to_cpu(node->n_entries) - index) * sizeof(struct dm_multisnap_bt_entry)); + write_48(&node->entries[index], orig_chunk, key->chunk); + if (sizeof(chunk_t) == 4 && unlikely(key->chunk > CHUNK_T_MAX)) + node->entries[index].orig_chunk2 = cpu_to_le16(0xffff); + write_48(&node->entries[index], new_chunk, new_chunk); + node->entries[index].snap_from = cpu_to_le32(key->snap_from); + node->entries[index].snap_to = cpu_to_le32(key->snap_to); + node->n_entries = cpu_to_le32(le32_to_cpu(node->n_entries) + 1); +} + +void dm_multisnap_add_to_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t new_chunk) +{ + struct dm_multisnap_bt_node *node; + struct dm_buffer *bp; + struct path_element path[MAX_BT_DEPTH]; + int 
depth; + + unsigned split_entries, split_index, split_offset, split_size; + struct bt_key new_key; + struct dm_multisnap_bt_entry *last_one; + chunk_t new_root; + + int r = walk_btree(s, key, &node, &bp, path); + + if (unlikely(r)) { + if (r > 0) { + dm_bufio_release(bp); + DMERR("dm_multisnap_add_to_btree: adding key that already exists: %Lx, %x-%x", (unsigned long long)key->chunk, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + } + return; + } + + depth = s->p->bt_depth - 1; + +go_up: + node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node); + if (unlikely(!node)) + return; + + if (likely(le32_to_cpu(node->n_entries) < s->p->btree_entries)) { + add_at_idx(node, path[depth].idx, key, new_chunk); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + return; + } + memcpy(s->p->tmp_chunk, node, s->chunk_size); + add_at_idx(s->p->tmp_chunk, path[depth].idx, key, new_chunk); + + split_entries = le32_to_cpu(((struct dm_multisnap_bt_node *)s->p->tmp_chunk)->n_entries); + split_index = split_entries / 2; + split_offset = sizeof(struct dm_multisnap_bt_node) + split_index * sizeof(struct dm_multisnap_bt_entry); + split_size = sizeof(struct dm_multisnap_bt_node) + split_entries * sizeof(struct dm_multisnap_bt_entry); + memcpy(node, s->p->tmp_chunk, sizeof(struct dm_multisnap_bt_node)); + memcpy((char *)node + sizeof(struct dm_multisnap_bt_node), (char *)s->p->tmp_chunk + split_offset, split_size - split_offset); + memset((char *)node + sizeof(struct dm_multisnap_bt_node) + split_size - split_offset, 0, s->chunk_size - (sizeof(struct dm_multisnap_bt_node) + split_size - split_offset)); + node->n_entries = cpu_to_le32(split_entries - split_index); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + node = dm_multisnap_alloc_make_block(s, &new_chunk, &bp); + if (unlikely(!node)) + return; + + memcpy(node, s->p->tmp_chunk, split_offset); + memset((char *)node + split_offset, 0, s->chunk_size - split_offset); + 
node->n_entries = cpu_to_le32(split_index); + + last_one = &node->entries[split_index - 1]; + new_key.chunk = read_48(last_one, orig_chunk); + new_key.snap_from = le32_to_cpu(last_one->snap_to); + new_key.snap_to = le32_to_cpu(last_one->snap_to); + if (unlikely(depth != s->p->bt_depth - 1)) { + write_48(last_one, orig_chunk, CHUNK_T_SENTINEL); + last_one->snap_from = last_one->snap_to = cpu_to_le32(SNAPID_T_SENTINEL); + } + + key = &new_key; + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (depth--) { + node = dm_multisnap_read_btnode(s, depth, path[depth].block, &bp); + if (unlikely(!node)) + return; + goto go_up; + } + + if (s->p->bt_depth >= MAX_BT_DEPTH) { + DMERR("dm_multisnap_add_to_btree: max b+-tree depth reached"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + node = dm_multisnap_alloc_make_block(s, &new_root, &bp); + if (unlikely(!node)) + return; + + memset(node, 0, s->chunk_size); + node->signature = BT_SIGNATURE; + node->n_entries = cpu_to_le32(0); + add_at_idx(node, 0, &new_key, new_chunk); + new_key.snap_from = new_key.snap_to = SNAPID_T_SENTINEL; + new_key.chunk = CHUNK_T_SENTINEL; + add_at_idx(node, 1, &new_key, path[0].block); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + s->p->bt_root = new_root; + s->p->bt_depth++; +} + +static void dm_multisnap_fixup_backlimits(struct dm_multisnap *s, struct path_element path[MAX_BT_DEPTH], int depth, chunk_t old_chunk, snapid_t old_snapid, chunk_t new_chunk, snapid_t new_snapid) +{ + int idx; + struct dm_multisnap_bt_node *node; + struct dm_buffer *bp; + + if (old_chunk == new_chunk && old_snapid == new_snapid) + return; + + for (depth--; depth >= 0; depth--) { + if (path[depth].idx != path[depth].n_entries - 1) + goto got_it; + } + DMERR("dm_multisnap_fixup_backlimits: the last entry modified, %Lx/%x -> %Lx/%x", (unsigned long long)old_chunk, old_snapid, (unsigned long long)new_chunk, new_snapid); + dm_multisnap_set_error(s, -EFSERROR); + return; + +got_it: + 
node = dm_multisnap_read_btnode(s, depth, path[depth].block, &bp); + if (unlikely(!node)) + return; + + node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node); + if (unlikely(!node)) + return; + + idx = path[depth].idx; + + if (read_48(&node->entries[idx], orig_chunk) != old_chunk || + le32_to_cpu(node->entries[idx].snap_from) != old_snapid || + le32_to_cpu(node->entries[idx].snap_to) != old_snapid) { + dm_bufio_release(bp); + DMERR("dm_multisnap_fixup_backlimits: btree limit does not match, block %Lx, idx %x, orig_chunk %Lx, snap_from %x, snap_to %x, want %Lx, %x", (unsigned long long)path[depth].block, idx, (unsigned long long)read_48(&node->entries[idx], orig_chunk), (unsigned)le32_to_cpu(node->entries[idx].snap_from), (unsigned)le32_to_cpu(node->entries[idx].snap_to), (unsigned long long)old_chunk, old_snapid); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + write_48(&node->entries[idx], orig_chunk, new_chunk); + node->entries[idx].snap_from = node->entries[idx].snap_to = cpu_to_le32(new_snapid); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +void dm_multisnap_restrict_btree_entry(struct dm_multisnap *s, struct bt_key *key) +{ + struct dm_multisnap_bt_node *node; + struct path_element path[MAX_BT_DEPTH]; + struct dm_buffer *bp; + int idx; + struct dm_multisnap_bt_entry *entry; + snapid_t from, to, new_to; + + int r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r < 0)) + return; + + if (!r) { + dm_bufio_release(bp); + DMERR("dm_multisnap_restrict_btree_entry: unknown key: %Lx, %x-%x", (unsigned long long)key->chunk, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + node = dm_multisnap_alloc_duplicate_block(s, path[s->p->bt_depth - 1].block, &bp, node); + if (unlikely(!node)) + return; + + idx = path[s->p->bt_depth - 1].idx; + entry = &node->entries[idx]; + from = le32_to_cpu(entry->snap_from); + to = new_to = le32_to_cpu(entry->snap_to); + if (key->snap_from == from && 
key->snap_to < to) + entry->snap_from = cpu_to_le32(key->snap_to + 1); + else if (key->snap_from > from && key->snap_to == to) + new_to = entry->snap_to = cpu_to_le32(key->snap_from - 1); + else { + dm_bufio_release(bp); + DMERR("dm_multisnap_restrict_btree_entry: invali range to restruct: %Lx, %x-%x %x-%x", (unsigned long long)key->chunk, from, to, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (unlikely(idx == path[s->p->bt_depth - 1].n_entries - 1)) + dm_multisnap_fixup_backlimits(s, path, s->p->bt_depth - 1, key->chunk, to, key->chunk, new_to); +} + +void dm_multisnap_extend_btree_entry(struct dm_multisnap *s, struct bt_key *key) +{ + struct dm_multisnap_bt_node *node; + struct path_element path[MAX_BT_DEPTH]; + struct dm_buffer *bp; + int idx; + struct dm_multisnap_bt_entry *entry; + snapid_t from, to, new_to; + + int r = walk_btree(s, key, &node, &bp, path); + if (unlikely(r < 0)) + return; + + if (!r) { + dm_bufio_release(bp); + DMERR("dm_multisnap_extend_btree_entry: unknown key: %Lx, %x-%x", (unsigned long long)key->chunk, key->snap_from, key->snap_to); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + node = dm_multisnap_alloc_duplicate_block(s, path[s->p->bt_depth - 1].block, &bp, node); + if (unlikely(!node)) + return; + + idx = path[s->p->bt_depth - 1].idx; + entry = &node->entries[idx]; + from = le32_to_cpu(entry->snap_from); + to = new_to = le32_to_cpu(entry->snap_to); + if (key->snap_from < from) + entry->snap_from = cpu_to_le32(key->snap_from); + if (key->snap_to > to) + new_to = entry->snap_to = cpu_to_le32(key->snap_to); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (unlikely(idx == path[s->p->bt_depth - 1].n_entries - 1)) + dm_multisnap_fixup_backlimits(s, path, s->p->bt_depth - 1, key->chunk, to, key->chunk, new_to); +} + +void dm_multisnap_delete_from_btree(struct dm_multisnap *s, struct bt_key *key) +{ + struct 
dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+	int idx;
+	struct dm_multisnap_bt_entry *entry;
+	snapid_t from, to;
+	int depth, n_entries;
+
+	struct dm_multisnap_bt_entry *last_one;
+	chunk_t last_one_chunk;
+	snapid_t last_one_snap_to;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return;
+
+	if (unlikely(!r)) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_delete_from_btree: unknown key: %Lx, %x-%x", (unsigned long long)key->chunk, key->snap_from, key->snap_to);
+		dm_multisnap_set_error(s, -EFSERROR);
+		return;
+	}
+
+	depth = s->p->bt_depth - 1;
+
+	idx = path[depth].idx;
+	entry = &node->entries[idx];
+	from = le32_to_cpu(entry->snap_from);
+	to = le32_to_cpu(entry->snap_to);
+	/* Only an entry whose snapshot range equals the key's range may be deleted. */
+	if (unlikely(from != key->snap_from) || unlikely(to != key->snap_to)) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_delete_from_btree: invalid range to delete: %Lx, %x-%x %x-%x", (unsigned long long)key->chunk, from, to, key->snap_from, key->snap_to);
+		dm_multisnap_set_error(s, -EFSERROR);
+		return;
+	}
+
+	/*
+	 * A node with a single entry would become empty: free it and delete
+	 * the entry pointing to it from the parent instead.
+	 */
+	while (unlikely((n_entries = le32_to_cpu(node->n_entries)) == 1)) {
+		dm_bufio_release(bp);
+		if (unlikely(!depth)) {
+			DMERR("dm_multisnap_delete_from_btree: b-tree is empty");
+			dm_multisnap_set_error(s, -EFSERROR);
+			return;
+		}
+		dm_multisnap_free_block_and_duplicates(s, path[depth].block);
+		depth--;
+		node = dm_multisnap_read_btnode(s, depth, path[depth].block, &bp);
+		if (!node)
+			return;
+	}
+
+	/* Work on a writable duplicate of the node before modifying it. */
+	node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node);
+	if (unlikely(!node))
+		return;
+
+	idx = path[depth].idx;
+
+	/*{
+		int x;
+		printk("before:\n");
+		for (x = 0; x < n_entries; x++)
+			printk("%Lx, %x-%x -> %Lx\n", read_48(&node->entries[x], orig_chunk), le32_to_cpu(node->entries[x].snap_from), le32_to_cpu(node->entries[x].snap_to), read_48(&node->entries[x], new_chunk));
+	}*/
+
+	memmove(node->entries + idx, node->entries + idx + 1, (n_entries - idx - 1) * sizeof(struct
dm_multisnap_bt_entry)); + n_entries--; + memset(node->entries + n_entries, 0, sizeof(struct dm_multisnap_bt_entry)); + + node->n_entries = cpu_to_le32(n_entries); + + /*{ + int x; + printk("after:\n"); + for (x = 0; x < n_entries; x++) + printk("%Lx, %x-%x -> %Lx\n", read_48(&node->entries[x], orig_chunk), le32_to_cpu(node->entries[x].snap_from), le32_to_cpu(node->entries[x].snap_to), read_48(&node->entries[x], new_chunk)); + }*/ + + last_one = &node->entries[n_entries - 1]; + last_one_chunk = read_48(last_one, orig_chunk); + last_one_snap_to = le32_to_cpu(last_one->snap_to); + if (depth != s->p->bt_depth - 1) { + /* Keep the sentinel */ + write_48(last_one, orig_chunk, CHUNK_T_SENTINEL); + last_one->snap_from = last_one->snap_to = cpu_to_le32(SNAPID_T_SENTINEL); + } + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (unlikely(idx == n_entries)) + dm_multisnap_fixup_backlimits(s, path, depth, key->chunk, key->snap_to, last_one_chunk, last_one_snap_to); +} + +void dm_multisnap_bt_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap) +{ + struct dm_buffer *bp; + struct dm_multisnap_bt_node *node; + struct bt_key key; + struct path_element path[MAX_BT_DEPTH]; + int results_ptr; + + chunk_t new_blockn; + int r; + int i; + + if (s->p->n_preallocated_blocks < s->p->bt_depth) { + if (dm_multisnap_alloc_blocks(s, s->p->preallocated_blocks + s->p->n_preallocated_blocks, s->p->bt_depth - s->p->n_preallocated_blocks, 0) < 0) + return; + s->p->n_preallocated_blocks = s->p->bt_depth; + } + results_ptr = 0; + + /* + * Read the key from this node --- we'll walk the btree according + * to this key to find a path from the root. 
+ */ + node = dm_multisnap_read_btnode(s, s->p->bt_depth - 1, tmp_remap->new, &bp); + if (!node) + return; + key.chunk = read_48(&node->entries[0], orig_chunk); + key.snap_from = key.snap_to = le32_to_cpu(node->entries[0].snap_from); + dm_bufio_release(bp); + + r = walk_btree(s, &key, &node, &bp, path); + if (r < 0) + return; + + dm_bufio_release(bp); + + for (i = s->p->bt_depth - 1; i >= 0; i--) + if (path[i].block == tmp_remap->old) + goto found; + + DMERR("block %Lx/%Lx was not found in btree when searching for %Lx/%x", (unsigned long long)tmp_remap->old, (unsigned long long)tmp_remap->new, (unsigned long long)key.chunk, key.snap_from); + for (i = 0; i < s->p->bt_depth; i++) + DMERR("path[%d]: %Lx/%x", i, (unsigned long long)path[i].block, path[i].idx); + dm_multisnap_set_error(s, -EFSERROR); + return; + + found: + + dm_multisnap_free_block(s, tmp_remap->old, 0); + + new_blockn = tmp_remap->new; + for (i--; i >= 0; i--) { + int remapped = 0; + node = dm_multisnap_read_btnode(s, i, path[i].block, &bp); + if (!node) + return; + if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) { + remapped = 1; + dm_bufio_release_move(bp, s->p->preallocated_blocks[results_ptr]); + dm_multisnap_free_block_and_duplicates(s, path[i].block); + node = dm_multisnap_read_btnode(s, i, s->p->preallocated_blocks[results_ptr], &bp); + if (!node) + return; + /* !!! 
TODO: add to a list of newly allocated blocks */ + } + write_48(&node->entries[path[i].idx], new_chunk, new_blockn); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (!remapped) + goto skip_it; + new_blockn = s->p->preallocated_blocks[results_ptr]; + results_ptr++; + } + + s->p->bt_root = new_blockn; + +skip_it: + memmove(s->p->preallocated_blocks, s->p->preallocated_blocks + results_ptr, (s->p->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t)); +} + Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-commit.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-commit.c 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap-mikulas.h" + +static void dm_multisnap_finalize_tmp_remaps(struct dm_multisnap *s) +{ + struct tmp_remap *t; + int i; + + while (s->p->n_used_tmp_remaps) { + if (dm_multisnap_has_error(s)) + return; + if (s->p->n_used_tmp_remaps < N_REMAPS - 1) { +/* prefer btree remaps ... 
if there are none, do bitmap remaps */ + if (!list_empty(&s->p->used_bt_tmp_remaps)) { + t = container_of(s->p->used_bt_tmp_remaps.next, struct tmp_remap, list); + dm_multisnap_bt_finalize_tmp_remap(s, t); + dm_multisnap_free_tmp_remap(s, t); + continue; + } + } + +/* else: 0 or 1 free remaps : finalize bitmaps */ + if (!list_empty(&s->p->used_bitmap_tmp_remaps)) { + t = container_of(s->p->used_bitmap_tmp_remaps.next, struct tmp_remap, list); + dm_multisnap_bitmap_finalize_tmp_remap(s, t); + dm_multisnap_free_tmp_remap(s, t); + continue; + } else { + DMERR("dm_multisnap_finalize_tmp_remaps: no bitmap tmp remaps, n_used_tmp_remaps %u", s->p->n_used_tmp_remaps); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + } + + if (dm_multisnap_has_error(s)) + return; + + for (i = s->p->n_preallocated_blocks - 1; i >= 0; i--) + dm_multisnap_free_blocks_immediate(s, s->p->preallocated_blocks[i], 1); + s->p->n_preallocated_blocks = 0; +} + +void dm_multisnap_transaction_mark(struct dm_multisnap *s) +{ + /* + * Accounting: + * max number of modified/allocated blocks during btree add: + * s->p->bt_depth * 2 + 1 + * one additional entry for newly allocated data chunk + * one additional entry for bitmap finalization + */ + if (unlikely(N_REMAPS - s->p->n_used_tmp_remaps < s->p->bt_depth * 2 + 3)) + dm_multisnap_finalize_tmp_remaps(s); +} + +void dm_multisnap_commit(struct dm_multisnap *s) +{ + struct tmp_remap *t; + chunk_t cb_addr; + chunk_t cb_div, cb_offset; + struct multisnap_commit_block *cb; + struct multisnap_superblock *sb; + unsigned idx; + struct dm_buffer *bp; + int r; + + dm_multisnap_transaction_mark(s); + + dm_multisnap_flush_freelist_before_commit(s); + + if (dm_multisnap_has_error(s)) { + /* !!! 
FIXME: write error to superblock */ + return; + } + + list_for_each_entry(t, &s->p->used_bitmap_tmp_remaps, list) + t->uncommitted = 0; + + list_for_each_entry(t, &s->p->used_bt_tmp_remaps, list) + t->uncommitted = 0; + + if (unlikely((r = dm_bufio_write_dirty_buffers(s->p->bufio)) < 0)) { + DMERR("dm_multisnap_commit: error writing data"); + dm_multisnap_set_error(s, r); + return; + } + + cb_addr = s->p->alloc_rover; + + if (cb_addr < FIRST_CB_BLOCK) + cb_addr = FIRST_CB_BLOCK; + cb_div = cb_addr - FIRST_CB_BLOCK; + cb_offset = sector_div(cb_div, CB_STRIDE); + cb_addr += CB_STRIDE - cb_offset; + if (cb_offset < CB_STRIDE / 2 || cb_addr >= s->p->dev_size) + cb_addr -= CB_STRIDE; + + cb = dm_bufio_new(s->p->bufio, cb_addr, &bp); + if (IS_ERR(cb)) { + DMERR("dm_multisnap_commit: can't allocate new commit block at %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, PTR_ERR(cb)); + return; + } + + s->p->commit_sequence++; + + cb->signature = CB_SIGNATURE; + cb->snapshot_num = cpu_to_le32(s->p->snapshot_num); + cb->sequence = cpu_to_le64(s->p->commit_sequence); + write_48(cb, dev_size, s->p->dev_size); + write_48(cb, total_allocated, s->p->total_allocated); + write_48(cb, data_allocated, s->p->data_allocated); + write_48(cb, bitmap_root, s->p->bitmap_root); + write_48(cb, alloc_rover, s->p->alloc_rover); + write_48(cb, freelist, s->p->freelist_ptr); + write_48(cb, delete_rover, s->p->delete_rover); + write_48(cb, bt_root, s->p->bt_root); + cb->bt_depth = s->p->bt_depth; + cb->flags = s->p->flags; + memset(cb->pad, 0, sizeof cb->pad); + idx = 0; + list_for_each_entry(t, &s->p->used_bitmap_tmp_remaps, list) { + BUG_ON(idx >= N_REMAPS); + write_48(&cb->tmp_remap[idx], old, t->old); + write_48(&cb->tmp_remap[idx], new, t->new); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx); + idx++; + } + list_for_each_entry(t, &s->p->used_bt_tmp_remaps, list) { + BUG_ON(idx >= N_REMAPS); + write_48(&cb->tmp_remap[idx], old, t->old); + 
write_48(&cb->tmp_remap[idx], new, t->new); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx); + idx++; + } + for (; idx < N_REMAPS; idx++) { + write_48(&cb->tmp_remap[idx], old, 0); + write_48(&cb->tmp_remap[idx], new, 0); + cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(0); + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->p->bufio); + if (unlikely(r < 0)) { + DMERR("dm_multisnap_commit: can't write commit block at %Lx", (unsigned long long)cb_addr); + dm_multisnap_set_error(s, r); + return; + } + + if (likely(cb_addr == s->p->valid_commit_block) || + likely(cb_addr == s->p->valid_commit_block + CB_STRIDE)) + goto return_success; + + sb = dm_bufio_read(s->p->bufio, SB_BLOCK, &bp); + if (IS_ERR(sb)) { + DMERR("dm_multisnap_commit: can't read super block"); + dm_multisnap_set_error(s, PTR_ERR(sb)); + return; + } + + if (unlikely(sb->signature != SB_SIGNATURE)) { + dm_bufio_release(bp); + DMERR("dm_multisnap_commit: invalid super block signature when committing"); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + sb->commit_block = cpu_to_le64(cb_addr); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + r = dm_bufio_write_dirty_buffers(s->p->bufio); + if (unlikely(r < 0)) { + DMERR("dm_multisnap_commit: can't write super block"); + dm_multisnap_set_error(s, r); + return; + } + +return_success: + s->p->valid_commit_block = cb_addr; + + dm_multisnap_load_freelist(s); + + return; +} Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas-struct.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas-struct.h 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#ifndef DM_MULTISNAP_MIKULAS_STRUCT_H +#define DM_MULTISNAP_MIKULAS_STRUCT_H + +#include +#include + +#define SB_BLOCK 0 + +#define SB_SIGNATURE cpu_to_be32(0xF6015342) + +struct multisnap_superblock { + __u32 signature; + __u32 chunk_size; + __u64 commit_block; +}; + +#define FIRST_CB_BLOCK 1 +#define CB_STRIDE 1024 /* !!! TODO: make it configurable ? */ + +#define CB_SIGNATURE cpu_to_be32(0xF6014342) + +struct commit_block_tmp_remap { + __u32 old1; + __u16 old2; + __u16 new2; + __u32 new1; + __u32 bitmap_idx; +}; + +#define CB_BITMAP_IDX_MAX 0xfffffffd +#define CB_BITMAP_IDX_NONE 0xfffffffe +#define CB_BITMAP_IDX_FREE 0xffffffff + +#define N_REMAPS 27 + +struct multisnap_commit_block { + __u32 signature; + __u32 snapshot_num; + __u64 sequence; + + __u32 dev_size1; + __u16 dev_size2; + __u16 total_allocated2; + __u32 total_allocated1; + __u32 data_allocated1; + + __u16 data_allocated2; + __u16 bitmap_root2; + __u32 bitmap_root1; + __u32 alloc_rover1; + __u16 alloc_rover2; + __u16 freelist2; + + __u32 freelist1; + __u32 delete_rover1; + __u16 delete_rover2; + __u16 bt_root2; + __u32 bt_root1; + + __u8 bt_depth; + __u8 flags; + __u8 pad[14]; + + struct commit_block_tmp_remap tmp_remap[N_REMAPS]; +}; + +#define MULTISNAP_FLAG_DELETING 0x01 +#define MULTISNAP_FLAG_PENDING_DELETE 0x02 + +#define MAX_BITMAP_DEPTH 6 + +static inline int dm_multisnap_bitmap_depth(unsigned chunk_size, __u64 device_size) +{ + unsigned depth = 0; + __u64 entries = chunk_size * 8; + while (entries < device_size) { + depth++; + entries *= chunk_size / 8; + if (!entries) + return -ERANGE; + } + + if (depth > MAX_BITMAP_DEPTH) + return -ERANGE; + + return depth; +} + +/* B+-tree entry. 
Sorted by orig_chunk and snap_from/to */ + +#define MAX_BT_DEPTH 12 + +struct dm_multisnap_bt_entry { + __u32 orig_chunk1; + __u16 orig_chunk2; + __u16 new_chunk2; + __u32 new_chunk1; + __u32 snap_from; + __u32 snap_to; +}; + +#define BT_SIGNATURE cpu_to_be32(0xF6014254) + +struct dm_multisnap_bt_node { + __u32 signature; + __u32 n_entries; + struct dm_multisnap_bt_entry entries[0]; +}; + +static inline unsigned dm_multisnap_btree_entries(unsigned chunk_size) +{ + return (chunk_size - sizeof(struct dm_multisnap_bt_node)) / sizeof(struct dm_multisnap_bt_entry); +} + +struct dm_multisnap_freelist_entry { + __u32 block1; + __u16 block2; + __u16 run_length; +}; + +#define FREELIST_RL_MASK 0x7fff +#define FREELIST_DATA_FLAG 0x8000 + +#define FL_SIGNATURE cpu_to_be32(0xF601464C) + +struct dm_multisnap_freelist { + __u32 signature; + __u32 backlink1; + __u16 backlink2; + __u32 n_entries; + struct dm_multisnap_freelist_entry entries[0]; +}; + +static inline unsigned dm_multisnap_freelist_entries(unsigned chunk_size) +{ + return (chunk_size - sizeof(struct dm_multisnap_freelist)) / sizeof(struct dm_multisnap_freelist); +} + +#endif Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-mikulas.h 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */
+
+#ifndef DM_MULTISNAP_MIKULAS_H
+#define DM_MULTISNAP_MIKULAS_H
+
+#include "dm-multisnap.h"
+#include "dm-multisnap-mikulas-struct.h"
+/*
+ * NOTE(review): the two #include directives below carry no header name in
+ * this copy of the patch (the text after "#include" is missing). Restore the
+ * names from the original posting before using this header.
+ */
+#include
+#include
+
+typedef __u32 bitmap_t;	/* number of a direct bitmap block, see dm_multisnap_map_bitmap */
+/*
+ * 48-bit values are kept in two little-endian fields: a 32-bit field named
+ * <entry>1 holding the low bits and a 16-bit field named <entry>2 holding
+ * bits 32-47. read_48 combines them into a chunk_t, write_48 splits a value
+ * into the two fields. write_48 evaluates "val" twice.
+ * The "<< 31 << 1" / ">> 31 >> 1" form is presumably used so that the shift
+ * count stays below 32 - TODO confirm the reason.
+ */
+#define read_48(struc, entry) (le32_to_cpu((struc)->entry##1) | ((chunk_t)le16_to_cpu((struc)->entry##2) << 31 << 1))
+#define write_48(struc, entry, val) do { (struc)->entry##1 = cpu_to_le32(val); (struc)->entry##2 = cpu_to_le16((chunk_t)(val) >> 31 >> 1); } while (0)
+
+#define TMP_REMAP_HASH_SIZE 256	/* must be a power of two: TMP_REMAP_HASH masks with SIZE - 1 */
+#define TMP_REMAP_HASH(c) ((c) & (TMP_REMAP_HASH_SIZE - 1))
+/*
+ * Chunk numbers are CHUNK_BITS wide. The all-ones value is reserved:
+ * CHUNK_T_SNAP_PRESENT is the b-tree key chunk under which the ranges of
+ * existing snapshot IDs are stored (see dm-multisnap-snaps.c), and
+ * CHUNK_T_SENTINEL is defined to the same value. CHUNK_T_MAX is the value
+ * just below it.
+ */
+#define CHUNK_BITS 48
+#define CHUNK_T_SENTINEL ((chunk_t)(1LL << CHUNK_BITS) - 1)
+#define CHUNK_T_SNAP_PRESENT ((chunk_t)(1LL << CHUNK_BITS) - 1)
+#define CHUNK_T_MAX ((chunk_t)(1LL << CHUNK_BITS) - 2)
+
+struct tmp_remap {
+	/* List entry for tmp_remap */
+	struct hlist_node hash_list;
+	/* List entry for used_tmp_remaps/free_tmp_remaps */
+	struct list_head list;
+	chunk_t old;	/* presumably the block that was duplicated - confirm in dm-multisnap-blocks.c */
+	chunk_t new;	/* block number stored into the parent pointer by dm_multisnap_bitmap_finalize_tmp_remap */
+	bitmap_t bitmap_idx;	/* bitmap number handed to dm_multisnap_map_bitmap when finalizing */
+	int uncommitted;
+};
+/*
+ * B-tree key: a chunk number plus an inclusive range of snapshot IDs.
+ */
+struct bt_key {
+	chunk_t chunk;
+	snapid_t snap_from;
+	snapid_t snap_to;
+};
+/*
+ * One step of the path recorded by dm_multisnap_map_bitmap while it descends
+ * the bitmap tree: the block visited, the pointer index taken in it and the
+ * number of pointers that block holds.
+ */
+struct path_element {
+	chunk_t block;
+	unsigned idx;
+	unsigned n_entries;
+};
+
+struct exception_store_private {
+	struct dm_bufio_client *bufio;
+
+	chunk_t dev_size;	/* size of the snapshot store in chunks */
+	unsigned bitmap_depth;	/* number of indirect levels above the direct bitmap blocks */
+	unsigned btree_entries;
+	snapid_t snapshot_num;	/* ID that dm_multisnap_create_snapshot hands out next */
+	__u8 bt_depth;
+	__u8 flags;
+
+	chunk_t bitmap_root;	/* block number of the top bitmap block */
+	chunk_t alloc_rover;	/* block where dm_multisnap_alloc_blocks starts searching */
+	chunk_t bt_root;
+	chunk_t sb_commit_block;
+	chunk_t valid_commit_block;
+	chunk_t delete_rover;
+
+	chunk_t total_allocated;	/* blocks marked allocated in the bitmaps */
+	chunk_t data_allocated;	/* part of total_allocated that holds snapshot data; the rest is reported as metadata */
+
+	__u64 commit_sequence;
+
+	void *tmp_chunk;
+
+	struct rb_root active_snapshots;	/* tree of struct snapshot_range, see dm-multisnap-snaps.c */
+
+	/* Used during query/add remap */
+	chunk_t query_snapid;
+	struct bt_key query_new_key;
+	unsigned char query_active;	/* 0 = nothing pending, 1 = dm_multisnap_add_next_remap may follow, 2 = dm_multisnap_make_chunk_writeable may follow */
+	chunk_t query_block_from;
+	chunk_t query_block_to;
+
+	/* List heads for struct tmp_remap->list */
+	unsigned n_used_tmp_remaps;
+	struct list_head used_bitmap_tmp_remaps;
+	struct list_head used_bt_tmp_remaps;
+	struct list_head free_tmp_remaps;
+	/* List head for struct tmp_remap->hash_list */
+	struct hlist_head tmp_remap[TMP_REMAP_HASH_SIZE];
+	struct tmp_remap tmp_remap_store[N_REMAPS];
+
+	unsigned n_preallocated_blocks;	/* valid entries in preallocated_blocks[] */
+	chunk_t preallocated_blocks[MAX_BITMAP_DEPTH * 2];	/* blocks reserved by dm_multisnap_bitmap_finalize_tmp_remap */
+
+	struct dm_multisnap_freelist *freelist;	/* freelist block currently being filled in memory */
+	chunk_t freelist_ptr;	/* first block of the freelist chain walked by dm_multisnap_load_freelist */
+
+	struct dm_multisnap_background_work delete_work;
+	unsigned delete_commit_count;
+};
+
+/* dm-multisnap-alloc.c */
+
+void dm_multisnap_create_bitmaps(struct dm_multisnap *s, chunk_t start);
+void *dm_multisnap_map_bitmap(struct dm_multisnap *s, bitmap_t bitmap, struct dm_buffer **bp, chunk_t *block, struct path_element *path);
+int dm_multisnap_alloc_blocks(struct dm_multisnap *s, chunk_t *results, unsigned n_blocks, int flags);
+#define ALLOC_DRY 1	/* flag for dm_multisnap_alloc_blocks: pick blocks but do not mark them in the bitmap */
+void *dm_multisnap_alloc_duplicate_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp, void *ptr);
+void *dm_multisnap_alloc_make_block(struct dm_multisnap *s, chunk_t *result, struct dm_buffer **bp);
+void dm_multisnap_free_blocks_immediate(struct dm_multisnap *s, chunk_t block, unsigned n_blocks);
+void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap);
+
+/* dm-multisnap-blocks.c */
+
+void *dm_multisnap_read_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp);
+int dm_multisnap_block_is_uncommitted(struct dm_multisnap *s, chunk_t block);
+void *dm_multisnap_duplicate_block(struct dm_multisnap *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp, chunk_t *to_free);
+void dm_multisnap_free_tmp_remap(struct dm_multisnap *s, struct tmp_remap *t);
+void *dm_multisnap_make_block(struct dm_multisnap *s, chunk_t new_chunk, struct dm_buffer **bp);
+void dm_multisnap_free_block_and_duplicates(struct dm_multisnap *s, chunk_t chunk);
+
+int dm_multisnap_is_cb_block(struct dm_multisnap *s, chunk_t block);
+
+typedef chunk_t stop_cycles_t[2];	/* state for dm_multisnap_init_stop_cycles / dm_multisnap_stop_cycles */
+
+void dm_multisnap_init_stop_cycles(stop_cycles_t *cy);
+int dm_multisnap_stop_cycles(struct dm_multisnap *s, stop_cycles_t *cy, chunk_t key);
+
+/* dm-multisnap-btree.c */
+
+void dm_multisnap_create_btree(struct dm_multisnap *s, chunk_t *start);
+int dm_multisnap_find_in_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t *result);
+void dm_multisnap_add_to_btree(struct dm_multisnap *s, struct bt_key *key, chunk_t new_chunk);
+void dm_multisnap_restrict_btree_entry(struct dm_multisnap *s, struct bt_key *key);
+void dm_multisnap_extend_btree_entry(struct dm_multisnap *s, struct bt_key *key);
+void dm_multisnap_delete_from_btree(struct dm_multisnap *s, struct bt_key *key);
+void dm_multisnap_bt_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap);
+int dm_multisnap_list_btree(struct dm_multisnap *s, struct bt_key *key, int (*call)(struct dm_multisnap *, struct dm_multisnap_bt_entry *, void *), void *cookie);
+
+/* dm-multisnap-commit.c */
+
+void dm_multisnap_transaction_mark(struct dm_multisnap *s);
+void dm_multisnap_commit(struct dm_multisnap *s);
+
+/* dm-multisnap-delete.c */
+
+void dm_multisnap_background_delete(struct dm_multisnap *s, struct dm_multisnap_background_work *bw);
+
+/* dm-multisnap-freelist.c */
+
+void dm_multisnap_free_block(struct dm_multisnap *s, chunk_t block, unsigned flags);
+int dm_multisnap_check_allocated_block(struct dm_multisnap *s, chunk_t block);
+void dm_multisnap_flush_freelist_before_commit(struct dm_multisnap *s);
+void dm_multisnap_load_freelist(struct dm_multisnap *s);
+
+/* dm-multisnap-io.c */
+
+int dm_multisnap_find_snapshot_chunk(struct dm_multisnap *s, snapid_t snapid, chunk_t chunk, chunk_t *result);
+void dm_multisnap_reset_query(struct dm_multisnap *s);
+int dm_multisnap_query_next_remap(struct dm_multisnap *s, chunk_t chunk);
+void dm_multisnap_add_next_remap(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+void dm_multisnap_make_chunk_writeable(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+int dm_multisnap_check_conflict(struct dm_multisnap *s, union chunk_descriptor *cd, snapid_t snapid);
+
+/* dm-multisnap-snaps.c */
+
+int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid);
+int dm_multisnap_find_next_snapid_range(struct dm_multisnap *s, snapid_t snapid, snapid_t *from, snapid_t *to);
+
+void dm_multisnap_destroy_snapshot_tree(struct dm_multisnap *s);
+void dm_multisnap_read_snapshots(struct dm_multisnap *s);
+int dm_multisnap_create_snapshot(struct dm_multisnap *s, snapid_t *snapid);
+int dm_multisnap_delete_snapshot(struct dm_multisnap *s, snapid_t snapid);
+
+void dm_multisnap_status_info(struct dm_multisnap *s, char *result, unsigned maxlen);
+
+#endif
Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-snaps.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-snaps.c	2009-05-11 13:35:00.000000000 +0200
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2008 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */ + +#include "dm-multisnap-mikulas.h" + +struct snapshot_range { + struct rb_node node; + snapid_t from; + snapid_t to; +}; + +static struct snapshot_range *rb_find_insert_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to, int add) +{ + struct snapshot_range *new; + struct snapshot_range *found = NULL; + struct rb_node **p = &s->p->active_snapshots.rb_node; + struct rb_node *parent = NULL; + while (*p) { + parent = *p; +#define rn rb_entry(parent, struct snapshot_range, node) + if (to < rn->from) { + if (to == rn->from - 1 && add) { + rn->from = from; + return rn; + } +go_left: + p = &rn->node.rb_left; + } else if (from > rn->to) { + if (from == rn->to + 1 && add) { + rn->to = to; + return rn; + } + p = &rn->node.rb_right; + } else { + if (!add) { + found = rn; + /* If there is range query, we need to find the leftmost node */ + if (from < rn->from) + goto go_left; + break; + } else { + dm_multisnap_set_error(s, -EFSERROR); + DMERR("rb_insert_snapshot: inserting overlapping entry: (%u,%u) overlaps (%u,%u)", from, to, rn->from, rn->to); + return NULL; + } + } +#undef rn + } + if (!add) + return found; + + new = kmalloc(sizeof(struct snapshot_range), GFP_KERNEL); + if (!new) { + DMERR("rb_insert_snapshot: can't allocate memory for snapshot descriptor"); + dm_multisnap_set_error(s, -ENOMEM); + return NULL; + } + + new->from = from; + new->to = to; + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &s->p->active_snapshots); + + return new; +} + +static struct snapshot_range *rb_find_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to) +{ + return rb_find_insert_snapshot(s, from, to, 0); +} + +static int rb_insert_snapshot(struct dm_multisnap *s, snapid_t from, snapid_t to) +{ + struct snapshot_range *rn; + rn = rb_find_insert_snapshot(s, from, to, 1); + if (!rn) + return -1; + return 0; +} + +static int rb_delete_snapshot(struct dm_multisnap *s, snapid_t snapid) +{ + struct snapshot_range *sr = rb_find_snapshot(s, snapid, 
snapid); + + if (!sr) { + dm_multisnap_set_error(s, -EFSERROR); + DMERR("rb_delete_snapshot: deleting non-existing snapid %u", snapid); + return -1; + } + + if (sr->from < snapid) { + snapid_t orig_to = sr->to; + sr->to = snapid - 1; + if (orig_to > snapid) { + if (rb_insert_snapshot(s, snapid + 1, orig_to)) { + sr->to = orig_to; + return -1; + } + } + } else { + if (sr->to > snapid) { + sr->from = snapid + 1; + } else { + rb_erase(&sr->node, &s->p->active_snapshots); + kfree(sr); + } + } + return 0; +} + +int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid) +{ + return !!rb_find_snapshot(s, snapid, snapid); +} + +int dm_multisnap_find_next_snapid_range(struct dm_multisnap *s, snapid_t snapid, snapid_t *from, snapid_t *to) +{ + struct snapshot_range *rn; + rn = rb_find_snapshot(s, snapid, SNAPID_T_MAX); + if (!rn) + return 0; + *from = rn->from; + *to = rn->to; + return 1; +} + +void dm_multisnap_destroy_snapshot_tree(struct dm_multisnap *s) +{ + struct rb_node *root; + while ((root = s->p->active_snapshots.rb_node)) { +#define rn rb_entry(root, struct snapshot_range, node) + rb_erase(root, &s->p->active_snapshots); + kfree(rn); +#undef rn + } +} + +void dm_multisnap_read_snapshots(struct dm_multisnap *s) +{ + struct bt_key snap_key; + chunk_t ignore; + int r; + + dm_multisnap_destroy_snapshot_tree(s); + + snap_key.snap_from = 0; +find_next: + snap_key.snap_to = SNAPID_T_MAX; + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + + r = dm_multisnap_find_in_btree(s, &snap_key, &ignore); + + if (unlikely(r < 0)) + return; + + if (r) { + printk("inserting snapid %d-%d\n", snap_key.snap_from, snap_key.snap_to); + if (unlikely(snap_key.snap_to > SNAPID_T_MAX)) { + dm_multisnap_set_error(s, -EFSERROR); + DMERR("dm_multisnap_read_snapshots: invalid snapshot id"); + return; + } + r = rb_insert_snapshot(s, snap_key.snap_from, snap_key.snap_to); + if (unlikely(r < 0)) + return; + snap_key.snap_from = snap_key.snap_to + 1; + goto find_next; + } +} + +int 
dm_multisnap_create_snapshot(struct dm_multisnap *s, snapid_t *snapid) +{ + int r; + struct bt_key snap_key; + + if (s->p->snapshot_num > SNAPID_T_MAX) { + DMERR("dm_multisnap_create_snapshot: 2^32 snapshot limit reached"); + return -ENOSPC; + } + + r = rb_insert_snapshot(s, s->p->snapshot_num, s->p->snapshot_num); + if (r < 0) + return dm_multisnap_has_error(s); + + if (s->p->snapshot_num && dm_multisnap_snapshot_exists(s, s->p->snapshot_num - 1)) { + /* Extend existing key range */ + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + snap_key.snap_from = s->p->snapshot_num - 1; + snap_key.snap_to = s->p->snapshot_num; + dm_multisnap_extend_btree_entry(s, &snap_key); + } else { + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + snap_key.snap_from = s->p->snapshot_num; + snap_key.snap_to = s->p->snapshot_num; + dm_multisnap_add_to_btree(s, &snap_key, 0); + } + if (dm_multisnap_has_error(s)) + return dm_multisnap_has_error(s); + + printk("multisnapshot: created snapshot with ID %u\n", s->p->snapshot_num); + + *snapid = s->p->snapshot_num++; + + dm_multisnap_transaction_mark(s); + dm_multisnap_commit(s); + + return 0; +} + +int dm_multisnap_delete_snapshot(struct dm_multisnap *s, snapid_t snapid) +{ + int r; + struct bt_key snap_key; + chunk_t ignore; + + r = rb_delete_snapshot(s, snapid); + if (r < 0) + return dm_multisnap_has_error(s); + + snap_key.chunk = CHUNK_T_SNAP_PRESENT; + snap_key.snap_from = snapid; + snap_key.snap_to = snapid; + + r = dm_multisnap_find_in_btree(s, &snap_key, &ignore); + if (r <= 0) { + if (!r) { + dm_multisnap_set_error(s, -EFSERROR); + DMERR("dm_multisnap_delete_snapshot: snapshot id %u not found in b-tree", snapid); + } + return dm_multisnap_has_error(s); + } + + if (snap_key.snap_from < snapid) { + snap_key.snap_from = snapid; + dm_multisnap_restrict_btree_entry(s, &snap_key); + + dm_multisnap_transaction_mark(s); + + if (dm_multisnap_has_error(s)) + return dm_multisnap_has_error(s); + + if (snap_key.snap_to > snapid) { + snap_key.snap_from = snapid + 
1; + dm_multisnap_add_to_btree(s, &snap_key, 0); + } + } else { + if (snap_key.snap_to > snapid) { + snap_key.snap_from = snapid; + snap_key.snap_to = snapid; + dm_multisnap_restrict_btree_entry(s, &snap_key); + } else { + dm_multisnap_delete_from_btree(s, &snap_key); + } + } + + dm_multisnap_transaction_mark(s); + + s->p->flags |= MULTISNAP_FLAG_PENDING_DELETE; + dm_multisnap_queue_work(s, &s->p->delete_work); + + dm_multisnap_commit(s); + + return 0; +} + +void dm_multisnap_status_info(struct dm_multisnap *s, char *result, unsigned maxlen) +{ + snapid_t n_snaps = 0; + + snapid_t from, to; + snapid_t snapid = 0; + for (snapid = 0; dm_multisnap_find_next_snapid_range(s, snapid, &from, &to); snapid = to + 1) + n_snaps += to - from + 1; + + snprintf(result, maxlen, " %Lu %Lu %Lu %u", (unsigned long long)s->p->dev_size, (unsigned long long)s->p->total_allocated, (unsigned long long)(s->p->total_allocated - s->p->data_allocated), n_snaps); + dm_multisnap_adjust_string(&result, &maxlen); + + for (snapid = 0; dm_multisnap_find_next_snapid_range(s, snapid, &from, &to); snapid = to + 1) + for (; from <= to; from++) { + snprintf(result, maxlen, " %u", from); + dm_multisnap_adjust_string(&result, &maxlen); + } +} + Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-alloc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-alloc.c 2009-05-11 13:35:00.000000000 +0200 @@ -0,0 +1,438 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
+ */ + +#include "dm-multisnap-mikulas.h" + +#define rshift_roundup(val, bits) (((val) + ((chunk_t)1 << (bits)) - 1) >> (bits)) + +#define BITS_PER_BYTE_SHIFT 3 +#define BYTES_PER_POINTER_SHIFT 3 + +void dm_multisnap_create_bitmaps(struct dm_multisnap *s, chunk_t writing_block) +{ + int r; + struct dm_buffer *bp; + chunk_t direct_bitmap_blocks, total_bitmap_blocks, total_preallocated_blocks; + chunk_t lower_depth_block; + unsigned i, d; + chunk_t ii; + + r = dm_multisnap_bitmap_depth(s->chunk_size, s->p->dev_size); + if (r < 0) { + DMERR("dm_multisnap_create_bitmaps: device is too large"); + dm_multisnap_set_error(s, r); + return; + } + s->p->bitmap_depth = r; + + direct_bitmap_blocks = rshift_roundup(s->p->dev_size, s->chunk_shift + BITS_PER_BYTE_SHIFT); + + if (direct_bitmap_blocks > CB_BITMAP_IDX_MAX) { + DMERR("dm_multisnap_create_bitmaps: device is too large"); + dm_multisnap_set_error(s, -ERANGE); + return; + } + + total_bitmap_blocks = 0; + for (i = 0; i <= s->p->bitmap_depth; i++) { + unsigned shift = (s->chunk_shift - BYTES_PER_POINTER_SHIFT) * i; + total_bitmap_blocks += rshift_roundup(direct_bitmap_blocks, shift); + } + total_preallocated_blocks = writing_block + total_bitmap_blocks; + for (ii = 0; ii < total_preallocated_blocks; ii++) { + if (dm_multisnap_is_cb_block(s, ii)) + total_preallocated_blocks++; + } + + if (total_preallocated_blocks >= s->p->dev_size) { + DMERR("dm_multisnap_create_bitmaps: device is too small"); + dm_multisnap_set_error(s, -ENOSPC); + return; + } + +/* Write direct bitmap blocks */ + + lower_depth_block = writing_block; + for (ii = 0; ii < direct_bitmap_blocks; ii++, writing_block++) { + void *bmp; + while (dm_multisnap_is_cb_block(s, writing_block)) + writing_block++; + bmp = dm_bufio_new(s->p->bufio, writing_block, &bp); + if (IS_ERR(bmp)) { + DMERR("dm_multisnap_create_bitmaps: can't create direct bitmap block at %Lx", (unsigned long long)writing_block); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return; + } + 
memset(bmp, 0, s->chunk_size); + for (i = 0; i < s->chunk_size << BITS_PER_BYTE_SHIFT; i++) { + chunk_t block_to_test = (ii << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | i; + if (block_to_test >= s->p->dev_size) { + generic___set_le_bit(i, bmp); + } else if (block_to_test < total_preallocated_blocks || dm_multisnap_is_cb_block(s, block_to_test)) { + generic___set_le_bit(i, bmp); + s->p->total_allocated++; + } + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + +/* Write indirect bitmap blocks */ + + for (d = 1; d <= s->p->bitmap_depth; d++) { + chunk_t this_depth_block = writing_block; + for (ii = 0; ii < rshift_roundup(direct_bitmap_blocks, d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)); ii++, writing_block++) { + __u64 *bmp; + while (dm_multisnap_is_cb_block(s, writing_block)) + writing_block++; + bmp = dm_bufio_new(s->p->bufio, writing_block, &bp); + if (IS_ERR(bmp)) { + DMERR("dm_multisnap_create_bitmaps: can't create indirect bitmap block at %Lx", (unsigned long long)writing_block); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return; + } + for (i = 0; i < s->chunk_size >> BYTES_PER_POINTER_SHIFT; i++) { + if (((ii << d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)) | (i << (d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) >= direct_bitmap_blocks) { + bmp[i] = cpu_to_le64(0); + continue; + } + while (dm_multisnap_is_cb_block(s, lower_depth_block)) + lower_depth_block++; + bmp[i] = cpu_to_le64(lower_depth_block); + lower_depth_block++; + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + lower_depth_block = this_depth_block; + } + + s->p->bitmap_root = writing_block - 1; +} + +void *dm_multisnap_map_bitmap(struct dm_multisnap *s, bitmap_t bitmap, struct dm_buffer **bp, chunk_t *block, struct path_element *path) +{ + __u64 *bmp; + unsigned idx; + unsigned d = s->p->bitmap_depth; + chunk_t blk = s->p->bitmap_root; + while (1) { + bmp = dm_multisnap_read_block(s, blk, bp); + if (unlikely(IS_ERR(bmp))) { + 
DMERR("dm_multisnap_map_bitmap: can't read bitmap at %Lx, depth %d/%d, index %Lx", (unsigned long long)blk, s->p->bitmap_depth - d, s->p->bitmap_depth, (unsigned long long)bitmap); + dm_multisnap_set_error(s, PTR_ERR(bmp)); + return NULL; + } + if (!d) { + if (block) + *block = blk; + return bmp; + } + + idx = (bitmap >> ((d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) & ((s->chunk_size - 1) >> BYTES_PER_POINTER_SHIFT); + + if (unlikely(path != NULL)) { + path[s->p->bitmap_depth - d].block = blk; + path[s->p->bitmap_depth - d].idx = idx; + path[s->p->bitmap_depth - d].n_entries = s->chunk_size >> BYTES_PER_POINTER_SHIFT; + } + + blk = le64_to_cpu(bmp[idx]); + + dm_bufio_release(*bp); + if (!blk) { + DMERR("dm_multisnap_map_bitmap: accessing bitmap out of range, bitmap %x", bitmap); + dm_multisnap_set_error(s, -EFSERROR); + return NULL; + } + + d--; + } +} + +static int find_bit(const void *bmp, unsigned start, unsigned end, int wide_search) +{ + const void *p; + unsigned bit; + if (unlikely(start >= end)) + return -ENOSPC; + if (likely(!generic_test_le_bit(start, bmp))) + return start; + if (likely(wide_search)) { + p = memchr(bmp + (start >> 3), 0, (end >> 3) - (start >> 3)); + if (p) { + bit = ((const __u8 *)p - (const __u8 *)bmp) << 3; + while (bit > start && !generic_test_le_bit(bit - 1, bmp)) + bit--; + goto ret_bit; + } + } + bit = generic_find_next_zero_le_bit(bmp, end, start); + ret_bit: + if (bit >= end) + return -ENOSPC; + return bit; +} + +int dm_multisnap_alloc_blocks(struct dm_multisnap *s, chunk_t *results, unsigned n_blocks, int flags) +{ + void *bmp; + struct dm_buffer *bp; + chunk_t block; + int wrap_around = 0; + int start_bit; + int wide_search; + int i; + bitmap_t bitmap_no; + int c; + int bit; + chunk_t to_free = 0; + + bitmap_no = s->p->alloc_rover >> (s->chunk_shift + BITS_PER_BYTE_SHIFT); +next_bitmap: + bmp = dm_multisnap_map_bitmap(s, bitmap_no, &bp, &block, NULL); + if (unlikely(!bmp)) + return -1; + + wide_search = 1; +find_again: 
+ start_bit = s->p->alloc_rover & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1); + + for (i = 0; i < n_blocks; i++) { +find_another_bit: + bit = find_bit(bmp, start_bit, s->chunk_size << BITS_PER_BYTE_SHIFT, wide_search); + if (unlikely(bit < 0)) { +bit_find_failed: + if (wide_search) { + wide_search = 0; + goto find_again; + } + dm_bufio_release(bp); + s->p->alloc_rover = (chunk_t)++bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT); + if (unlikely(s->p->alloc_rover >= s->p->dev_size)) { + s->p->alloc_rover = 0; + bitmap_no = 0; + wrap_around++; + if (wrap_around >= 2) { + DMERR("snapshot overflow"); + dm_multisnap_set_error(s, -ENOSPC); + return -1; + } + } + goto next_bitmap; + } + results[i] = ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit; + start_bit = bit + 1; + dm_bufio_release(bp); + + c = dm_multisnap_check_allocated_block(s, results[i]); + if (dm_multisnap_has_error(s)) + return -1; + + bmp = dm_multisnap_read_block(s, block, &bp); + if (unlikely(!bmp)) + return -1; + + if (c) + goto find_another_bit; + } + + if (flags & ALLOC_DRY) + goto bp_release_return; + + if (!dm_multisnap_block_is_uncommitted(s, block)) { + chunk_t new_block; +find_another_bit_for_bitmap: + bit = find_bit(bmp, start_bit, s->chunk_size << BITS_PER_BYTE_SHIFT, wide_search); + if (unlikely(bit < 0)) + goto bit_find_failed; + + new_block = ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit; + start_bit = bit + 1; + + dm_bufio_release(bp); + c = dm_multisnap_check_allocated_block(s, new_block); + if (dm_multisnap_has_error(s)) + return -1; + + bmp = dm_multisnap_read_block(s, block, &bp); + if (unlikely(!bmp)) + return -1; + + if (c) + goto find_another_bit_for_bitmap; + + /* + * Warning: record the address of a block to free in a special + * variable. + * + * If we freed it here, that could recurse back to + * dm_multisnap_alloc_blocks and corrupt allocations. 
Free it + * later when we are done with the allocation and all the + * allocated blocks are marked in the bitmap. + */ + bmp = dm_multisnap_duplicate_block(s, block, new_block, bitmap_no, &bp, &to_free); + if (unlikely(!bmp)) + return -1; + + generic___set_le_bit(bit, bmp); + s->p->total_allocated++; + } + + for (i = 0; i < n_blocks; i++) + generic___set_le_bit(results[i] & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp); + s->p->total_allocated += n_blocks; + + dm_bufio_mark_buffer_dirty(bp); + +bp_release_return: + dm_bufio_release(bp); + + s->p->alloc_rover = (s->p->alloc_rover & ~(chunk_t)((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1)) + start_bit; + if (unlikely(s->p->alloc_rover >= s->p->dev_size)) + s->p->alloc_rover = 0; + + if (unlikely(to_free != 0)) + dm_multisnap_free_block(s, to_free, 0); + + return 0; +} + +void *dm_multisnap_alloc_duplicate_block(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp, void *ptr) +{ + int r; + chunk_t new_chunk; + void *data; + + if (dm_multisnap_block_is_uncommitted(s, block)) + return ptr; + + dm_bufio_release(*bp); + + r = dm_multisnap_alloc_blocks(s, &new_chunk, 1, 0); + if (r) + return NULL; + + data = dm_multisnap_read_block(s, block, bp); + if (!data) + return NULL; + + return dm_multisnap_duplicate_block(s, block, new_chunk, CB_BITMAP_IDX_NONE, bp, NULL); +} + +void *dm_multisnap_alloc_make_block(struct dm_multisnap *s, chunk_t *result, struct dm_buffer **bp) +{ + int r = dm_multisnap_alloc_blocks(s, result, 1, 0); + if (unlikely(r < 0)) + return NULL; + + return dm_multisnap_make_block(s, *result, bp); +} + +void dm_multisnap_free_blocks_immediate(struct dm_multisnap *s, chunk_t block, unsigned n_blocks) +{ + void *bmp; + struct dm_buffer *bp; + + if (!n_blocks) + return; + + if (unlikely(block + n_blocks > s->p->dev_size)) { + DMERR("dm_multisnap_free_block_immediate: freeing invalid blocks %Lx, %x", (unsigned long long)block, n_blocks); + dm_multisnap_set_error(s, -EFSERROR); + return; + } + + if 
(block + n_blocks == s->p->alloc_rover) + s->p->alloc_rover = block; + + do { + bitmap_t bitmap_no = block >> (s->chunk_shift + BITS_PER_BYTE_SHIFT); + + bmp = dm_multisnap_map_bitmap(s, bitmap_no, &bp, NULL, NULL); + if (!bmp) + return; + + do { + generic___clear_le_bit(block & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp); + s->p->total_allocated--; + n_blocks--; + block++; + } while (n_blocks && (block & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1))); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } while (unlikely(n_blocks != 0)); +} + + +void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_multisnap *s, struct tmp_remap *tmp_remap) +{ + chunk_t block; + struct dm_buffer *bp; + __u64 *new_block; + struct path_element path[MAX_BITMAP_DEPTH]; + int results_ptr; + + chunk_t new_blockn; + int i; + + /* + * Preallocate twice the required amount of blocks, so that resolving + * the next tmp_remap (created here, in dm_multisnap_alloc_blocks) + * doesn't have to allocate anything. + */ + if (s->p->n_preallocated_blocks < s->p->bitmap_depth) { + if (unlikely(dm_multisnap_alloc_blocks(s, s->p->preallocated_blocks + s->p->n_preallocated_blocks, s->p->bitmap_depth * 2 - s->p->n_preallocated_blocks, 0) < 0)) + return; + s->p->n_preallocated_blocks = s->p->bitmap_depth * 2; + } + results_ptr = 0; + + new_block = dm_multisnap_map_bitmap(s, tmp_remap->bitmap_idx, &bp, &block, path); + if (unlikely(!new_block)) + return; + + dm_bufio_release(bp); + + new_blockn = tmp_remap->new; + for (i = s->p->bitmap_depth - 1; i >= 0; i--) { + chunk_t block_to_free; + int remapped = 0; + __u64 *bmp = dm_multisnap_read_block(s, path[i].block, &bp); + if (unlikely(IS_ERR(bmp))) + return; + + if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) { + remapped = 1; + dm_bufio_release_move(bp, s->p->preallocated_blocks[results_ptr]); + bmp = dm_multisnap_read_block(s, s->p->preallocated_blocks[results_ptr], &bp); + if (unlikely(IS_ERR(bmp))) + return; + /* !!! 
TODO: add to a list of newly allocated blocks */ + } + + block_to_free = le64_to_cpu(bmp[path[i].idx]); + bmp[path[i].idx] = cpu_to_le64(new_blockn); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + dm_multisnap_free_block(s, block_to_free, 0); + + if (!remapped) + goto skip_it; + new_blockn = s->p->preallocated_blocks[results_ptr]; + results_ptr++; + } + + dm_multisnap_free_block(s, s->p->bitmap_root, 0); + s->p->bitmap_root = new_blockn; + +skip_it: + memmove(s->p->preallocated_blocks, s->p->preallocated_blocks + results_ptr, (s->p->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t)); +} Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-io.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-io.c 2009-05-11 13:35:01.000000000 +0200 @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap-mikulas.h" + +/* + * This function will check if there is remapping for a given snapid/chunk. + * It returns 1 if remapping exists and is read-only (shared by other snapshots) + * and 2 if it exists and is read-write (not shared by anyone). + */ + +int dm_multisnap_find_snapshot_chunk(struct dm_multisnap *s, snapid_t snapid, chunk_t chunk, chunk_t *result) +{ + int r; + struct bt_key key; + key.chunk = chunk; + key.snap_from = snapid; + key.snap_to = snapid; + r = dm_multisnap_find_in_btree(s, &key, result); + if (!r) { + s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = snapid; + s->p->query_new_key.snap_to = snapid; + s->p->query_active = 1; + } + if (r > 0) { + snapid_t find_from, find_to; + /* + * !!! 
TODO: this branch could be done conditionally + * only for write requests + */ + if (key.snap_from < snapid) { + if (likely(dm_multisnap_find_next_snapid_range(s, key.snap_from, &find_from, &find_to))) { + if (find_from < snapid) { + s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = snapid; + s->p->query_new_key.snap_to = key.snap_to; + s->p->query_block_from = key.snap_from; + s->p->query_block_to = key.snap_to; + s->p->query_active = 2; + return 1; + } + if (unlikely(find_from > snapid)) + BUG(); /* SNAPID not in our tree */ + if (find_to > snapid && key.snap_to > snapid) { + s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = key.snap_from; + s->p->query_new_key.snap_to = snapid; + s->p->query_block_from = key.snap_from; + s->p->query_block_to = key.snap_to; + s->p->query_active = 2; + return 1; + } + } else { + /* we're asking for a SNAPID not in our tree */ + BUG(); + } + } + if (key.snap_to > snapid) { + if (likely(dm_multisnap_find_next_snapid_range(s, snapid + 1, &find_from, &find_to))) { + if (find_from <= key.snap_to) { + s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = key.snap_from; + s->p->query_new_key.snap_to = snapid; + s->p->query_block_from = key.snap_from; + s->p->query_block_to = key.snap_to; + s->p->query_active = 2; + return 1; + } + } + } + return 2; + } + return r; +} + +void dm_multisnap_reset_query(struct dm_multisnap *s) +{ + s->p->query_active = 0; + + s->p->query_snapid = 0; +} + +int dm_multisnap_query_next_remap(struct dm_multisnap *s, chunk_t chunk) +{ + int r; + chunk_t sink; + snapid_t from, to; + + s->p->query_active = 0; + + while (dm_multisnap_find_next_snapid_range(s, s->p->query_snapid, &from, &to)) { + struct bt_key key; +next_btree_search: + if (dm_multisnap_has_error(s)) + return -1; + key.chunk = chunk; + key.snap_from = from; + key.snap_to = to; + r = dm_multisnap_find_in_btree(s, &key, &sink); + if (unlikely(r < 0)) + return -1; + + if (!r) { + 
s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = from; + s->p->query_new_key.snap_to = to; + s->p->query_active = 1; + return 1; + } + + if (key.snap_from > from) { + s->p->query_new_key.chunk = chunk; + s->p->query_new_key.snap_from = from; + s->p->query_new_key.snap_to = key.snap_from - 1; + s->p->query_active = 1; + return 1; + } + + if (key.snap_to < to) { + from = key.snap_to + 1; + goto next_btree_search; + } + + s->p->query_snapid = to + 1; + } + + return 0; +} + +void dm_multisnap_add_next_remap(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk) +{ + int r; + + BUG_ON(s->p->query_active != 1); + s->p->query_active = 0; + + cd->range.from = s->p->query_new_key.snap_from; + cd->range.to = s->p->query_new_key.snap_to; + + r = dm_multisnap_alloc_blocks(s, new_chunk, 1, 0); + if (unlikely(r < 0)) + return; + + s->p->data_allocated++; + + dm_multisnap_add_to_btree(s, &s->p->query_new_key, *new_chunk); + dm_multisnap_transaction_mark(s); +} + +void dm_multisnap_make_chunk_writeable(struct dm_multisnap *s, union chunk_descriptor *cd, chunk_t *new_chunk) +{ + int r; + + BUG_ON(s->p->query_active != 2); + s->p->query_active = 0; + + cd->range.from = s->p->query_block_from; + cd->range.to = s->p->query_block_to; + + r = dm_multisnap_alloc_blocks(s, new_chunk, 1, 0); + if (unlikely(r < 0)) + return; + + s->p->data_allocated++; + + dm_multisnap_restrict_btree_entry(s, &s->p->query_new_key); + dm_multisnap_transaction_mark(s); + + if (unlikely(dm_multisnap_has_error(s))) + return; + + dm_multisnap_add_to_btree(s, &s->p->query_new_key, *new_chunk); + dm_multisnap_transaction_mark(s); +} + +int dm_multisnap_check_conflict(struct dm_multisnap *s, union chunk_descriptor *cd, snapid_t snapid) +{ + return snapid >= cd->range.from && snapid <= cd->range.to; +} + Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-freelist.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 
+0000 +++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-freelist.c 2009-05-11 13:35:01.000000000 +0200 @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2008 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-multisnap-mikulas.h" + +static void init_freelist(struct dm_multisnap_freelist *fl, unsigned chunk_size) +{ + memset(fl, 0, chunk_size); + fl->signature = FL_SIGNATURE; + write_48(fl, backlink, 0); + fl->n_entries = cpu_to_le32(0); +} + +static int add_to_freelist(struct dm_multisnap *s, chunk_t block, unsigned flags) +{ + int i; + struct dm_multisnap_freelist *fl = s->p->freelist; + for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) { + chunk_t x = read_48(&fl->entries[i], block); + unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK; + unsigned f = le16_to_cpu(fl->entries[i].run_length) & FREELIST_DATA_FLAG; + if (block >= x && block < x + r) { + DMERR("add_to_freelist: freeing already free block %Lx (%Lx - %x)", (unsigned long long)block, (unsigned long long)x, r); + dm_multisnap_set_error(s, -EFSERROR); + return -1; + } + if (likely(r < FREELIST_RL_MASK) && likely(f == flags)) { + if (block == x - 1) { + write_48(&fl->entries[i], block, x - 1); + goto inc_length; + } + if (block == x + r) { +inc_length: + fl->entries[i].run_length = cpu_to_le16((r + 1) | f); + return 1; + } + } + } + if ((i = le32_to_cpu(fl->n_entries)) < dm_multisnap_freelist_entries(s->chunk_size)) { + fl->n_entries = cpu_to_le32(i + 1); + write_48(&fl->entries[i], block, block); + fl->entries[i].run_length = cpu_to_le16(1 | flags); + return 1; + } + return 0; +} + +static struct dm_multisnap_freelist *read_freelist(struct dm_multisnap *s, chunk_t block, struct dm_buffer **bp) +{ + struct dm_multisnap_freelist *fl; + fl = dm_bufio_read(s->p->bufio, block, bp); + if (IS_ERR(fl)) { + DMERR("read_freelist: can't read freelist block %Lx", (unsigned long long)block); + dm_multisnap_set_error(s, PTR_ERR(fl)); + return 
NULL;
+	}
+	if (fl->signature != FL_SIGNATURE) {
+		dm_bufio_release(*bp);
+		DMERR("read_freelist: bad signature freelist block %Lx", (unsigned long long)block);
+		dm_multisnap_set_error(s, -EFSERROR);
+		return NULL;
+	}
+	if (le32_to_cpu(fl->n_entries) > dm_multisnap_freelist_entries(s->chunk_size)) {
+		dm_bufio_release(*bp);
+		DMERR("read_freelist: bad number of entries in freelist block %Lx", (unsigned long long)block);
+		dm_multisnap_set_error(s, -EFSERROR);
+		return NULL;
+	}
+	return fl;
+}
+/* Spill the in-memory freelist into a newly allocated block and restart it empty, backlinked to that block. */
+static void alloc_write_freelist(struct dm_multisnap *s)
+{
+	chunk_t new_block;
+	struct dm_multisnap_freelist *fl;
+	struct dm_buffer *bp;
+
+	if (dm_multisnap_alloc_blocks(s, &new_block, 1, ALLOC_DRY))	/* non-zero result: return without writing anything */
+		return;
+
+	fl = dm_bufio_new(s->p->bufio, new_block, &bp);
+	if (IS_ERR(fl)) {
+		DMERR("alloc_write_freelist: can't make new freelist block %Lx", (unsigned long long)new_block);
+		dm_multisnap_set_error(s, PTR_ERR(fl));
+		return;
+	}
+	/* The new buffer receives a full chunk_size copy of the in-memory freelist. */
+	memcpy(fl, s->p->freelist, s->chunk_size);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	/* Reinitialise the in-memory freelist; its backlink records the block just filled. */
+	init_freelist(s->p->freelist, s->chunk_size);
+	write_48(s->p->freelist, backlink, new_block);
+}
+/* Add 'block' (with 'flags') to the in-memory freelist; if that fails, spill the freelist and try once more. */
+void dm_multisnap_free_block(struct dm_multisnap *s, chunk_t block, unsigned flags)
+{
+	if (likely(add_to_freelist(s, block, flags)))	/* non-zero means the entry was accepted */
+		return;
+
+	alloc_write_freelist(s);
+	if (dm_multisnap_has_error(s))
+		return;
+
+	if (add_to_freelist(s, block, flags))
+		return;
+	/* Still rejected after the spill attempt: treated as a fatal bug. */
+	BUG();
+}
+/* Return 1 if 'block' lies inside any run recorded in freelist 'fl', 0 otherwise. */
+static int check_against_freelist(struct dm_multisnap_freelist *fl, chunk_t block)
+{
+	int i;
+	for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) {
+		chunk_t x = read_48(&fl->entries[i], block);	/* first block of the run */
+		unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK;	/* length of the run */
+		if (block - x >= 0 && unlikely(block - x < r))	/* NOTE(review): ">= 0" is always true if chunk_t is unsigned - confirm */
+			return 1;
+	}
+	return 0;
+}
+/* Walk freelist blocks from 'fl_block' along their backlinks: 1 = 'block' found, 0 = not found, -1 = failure. */
+static int check_against_freelist_chain(struct dm_multisnap *s, chunk_t fl_block, chunk_t block)
+{
+	stop_cycles_t cy;
+	dm_multisnap_init_stop_cycles(&cy);
+
+	while (unlikely(fl_block != 0)) {
+		int c;
+		struct dm_buffer *bp;
+		struct dm_multisnap_freelist *fl;
+
+		if (dm_multisnap_stop_cycles(s, &cy, fl_block))
+			return -1;
+		/* The freelist block itself also counts as a hit. */
+		if (unlikely(block == fl_block))
+			return 1;
+
+		fl = read_freelist(s, fl_block, &bp);
+		if (unlikely(!fl))
+			return -1;
+		c = check_against_freelist(fl, block);
+		fl_block = read_48(fl, backlink);	/* fetch the next link before the buffer is released */
+		dm_bufio_release(bp);
+		if (unlikely(c))
+			return c;
+	}
+	return 0;
+}
+/* Test 'block' against the in-memory freelist and two on-disk chains; returns the first non-zero result (1 or -1), else 0. */
+int dm_multisnap_check_allocated_block(struct dm_multisnap *s, chunk_t block)
+{
+	int c;
+
+	c = check_against_freelist(s->p->freelist, block);
+	if (unlikely(c))
+		return c;
+	/* First chain: starts at the backlink of the in-memory freelist. */
+	c = check_against_freelist_chain(s, read_48(s->p->freelist, backlink), block);
+	if (unlikely(c))
+		return c;
+	/* Second chain: starts at s->p->freelist_ptr. */
+	c = check_against_freelist_chain(s, s->p->freelist_ptr, block);
+	if (unlikely(c))
+		return c;
+
+	return 0;
+}
+/* Spill the in-memory freelist; unless an error is set, freelist_ptr then takes the in-memory freelist's backlink. */
+void dm_multisnap_flush_freelist_before_commit(struct dm_multisnap *s)
+{
+	alloc_write_freelist(s);
+
+	if (dm_multisnap_has_error(s))
+		return;
+
+	s->p->freelist_ptr = read_48(s->p->freelist, backlink);
+}
+/* Pass every run in 'fl' to dm_multisnap_free_blocks_immediate(); runs carrying FREELIST_DATA_FLAG also reduce data_allocated. */
+static void free_blocks_in_freelist(struct dm_multisnap *s, struct dm_multisnap_freelist *fl)
+{
+	int i;
+	for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) {
+		chunk_t x = read_48(&fl->entries[i], block);
+		unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK;
+		unsigned f = le16_to_cpu(fl->entries[i].run_length) & FREELIST_DATA_FLAG;
+		dm_multisnap_free_blocks_immediate(s, x, r);
+		if (likely(f & FREELIST_DATA_FLAG))
+			s->p->data_allocated -= r;
+	}
+}
+/* Follow the chain starting at freelist_ptr, freeing the runs of each block read, then reset the in-memory freelist. */
+void dm_multisnap_load_freelist(struct dm_multisnap *s)
+{
+	chunk_t fl_block = s->p->freelist_ptr;
+
+	stop_cycles_t cy;
+	dm_multisnap_init_stop_cycles(&cy);
+
+	while (fl_block) {
+		struct dm_buffer *bp;
+		struct dm_multisnap_freelist *fl;
+
+		if (dm_multisnap_stop_cycles(s, &cy, fl_block))
+			break;
+		/* Stop as soon as an error has been recorded on 's'. */
+		if (dm_multisnap_has_error(s))
+			break;
+
+		fl = read_freelist(s, fl_block, &bp);
+		if (!fl)
+			break;
+		memcpy(s->p->freelist, fl, s->chunk_size);
+		dm_bufio_release(bp);
+		/* Work on the in-memory copy: the buffer has already been released. */
+		free_blocks_in_freelist(s, s->p->freelist);
+		fl_block = read_48(s->p->freelist, backlink);
+	}
+
+	init_freelist(s->p->freelist, s->chunk_size);
+}
Index: linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-delete.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.30-rc5-fast/drivers/md/dm-multisnap-delete.c	2009-05-11 13:35:01.000000000 +0200
@@ -0,0 +1,104 @@
+/*
+ * Copyright (C) 2008 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+#define COMMIT_AFTER 1024	/* delete_step() commits on every COMMIT_AFTER-th step at which a commit is possible */
+/* Cookie shared between delete_step() and list_callback(). */
+struct list_cookie {
+	struct bt_key key;	/* key of the btree entry last visited by list_callback() */
+	chunk_t new_chunk;	/* filled in only when RET_PROCESS_FREE is returned */
+};
+/* Non-zero return values of list_callback(), tested by delete_step(). */
+#define RET_END 1	/* the entry's chunk exceeds CHUNK_T_MAX */
+#define RET_PROCESS_FREE 2	/* the entry is to be deleted by the caller */
+/* Listing callback used by delete_step(): copies the entry's key into the cookie and classifies the entry. */
+static int list_callback(struct dm_multisnap *s, struct dm_multisnap_bt_entry *bt, void *cookie)
+{
+	struct list_cookie *lc = cookie;
+	snapid_t found_from, found_to;
+
+	lc->key.chunk = read_48(bt, orig_chunk);
+	lc->key.snap_from = le32_to_cpu(bt->snap_from);
+	lc->key.snap_to = le32_to_cpu(bt->snap_to);
+
+	if (unlikely(lc->key.chunk > CHUNK_T_MAX))
+		return RET_END;
+	/* Record the chunk reached; delete_step() starts its next listing from delete_rover. */
+	s->p->delete_rover = lc->key.chunk;
+
+	if (!dm_multisnap_find_next_snapid_range(s, lc->key.snap_from, &found_from, &found_to) || found_from > lc->key.snap_to) {
+		/*
+		 * This range maps unused snapshots, delete it.
+		 * But we can't do it now, so submit it to the caller.
+		 */
+		lc->new_chunk = read_48(bt, new_chunk);
+		return RET_PROCESS_FREE;
+	}
+
+	return 0;
+}
+/* One step of background deletion: list the btree from delete_rover and, if an entry is handed back, delete it and free its chunk. */
+static void delete_step(struct dm_multisnap *s)
+{
+	struct bt_key key;
+	int r;
+	struct list_cookie lc;
+	/* Start the listing at the chunk recorded in delete_rover. */
+	key.chunk = s->p->delete_rover;
+	key.snap_from = 0;
+	key.snap_to = 0;
+
+	r = dm_multisnap_list_btree(s, &key, list_callback, &lc);
+	/* Negative result: abandon this step; MULTISNAP_FLAG_DELETING stays set. */
+	if (unlikely(r < 0))
+		return;
+	/* 0 or RET_END: the job is finished, so leave the DELETING state. */
+	if (unlikely(r <= RET_END)) {
+		s->p->flags &= ~MULTISNAP_FLAG_DELETING;
+
+		/* If we finished the job and there is no pending I/O, commit */
+		if (dm_multisnap_can_commit(s))
+			dm_multisnap_commit(s);
+
+		return;
+	}
+
+	if (unlikely(dm_multisnap_has_error(s)))
+		return;
+	/* Here r > RET_END (presumably RET_PROCESS_FREE): lc describes the entry to remove. */
+	dm_multisnap_delete_from_btree(s, &lc.key);
+
+	dm_multisnap_transaction_mark(s);
+
+	dm_multisnap_free_block(s, lc.new_chunk, FREELIST_DATA_FLAG);
+	/* Commit only on every COMMIT_AFTER-th step at which a commit is possible. */
+	if (dm_multisnap_can_commit(s)) {
+		if (++s->p->delete_commit_count >= COMMIT_AFTER) {
+			s->p->delete_commit_count = 0;
+			dm_multisnap_commit(s);
+		}
+	}
+}
+/* Run one delete step, or turn a pending delete into an active one, then queue s->p->delete_work; 'bw' is not used here. */
+void dm_multisnap_background_delete(struct dm_multisnap *s, struct dm_multisnap_background_work *bw)
+{
+	if (unlikely(dm_multisnap_has_error(s)))
+		return;
+
+	if (s->p->flags & MULTISNAP_FLAG_DELETING) {
+		delete_step(s);
+	} else if (s->p->flags & MULTISNAP_FLAG_PENDING_DELETE) {
+		s->p->flags &= ~MULTISNAP_FLAG_PENDING_DELETE;
+		s->p->flags |= MULTISNAP_FLAG_DELETING;
+		s->p->delete_rover = 0;	/* the first delete_step() lists from chunk 0 */
+	} else
+		return;
+	/* Reached unless neither flag was set. */
+	dm_multisnap_queue_work(s, &s->p->delete_work);
+}