Common code for multisnapshot target.

This is the common code, shared by all exception stores.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 Documentation/device-mapper/dm-multisnapshot.txt |  153 +
 drivers/md/Kconfig                               |   10 
 drivers/md/Makefile                              |    2 
 drivers/md/dm-multisnap-private.h                |  163 +
 drivers/md/dm-multisnap.c                        | 2060 +++++++++++++++++++++++
 drivers/md/dm-multisnap.h                        |  183 ++
 6 files changed, 2571 insertions(+)

Index: linux-2.6.34-rc4-fast/drivers/md/Kconfig
===================================================================
--- linux-2.6.34-rc4-fast.orig/drivers/md/Kconfig	2010-04-13 16:28:24.000000000 +0200
+++ linux-2.6.34-rc4-fast/drivers/md/Kconfig	2010-04-14 13:36:38.000000000 +0200
@@ -258,6 +258,16 @@ config DM_SNAPSHOT
        ---help---
          Allow volume managers to take writable snapshots of a device.
 
+config DM_MULTISNAPSHOT
+	tristate "Multisnapshot target"
+	depends on BLK_DEV_DM
+	---help---
+	  A new implementation of snapshots that allows storage to be
+	  shared between several snapshots.
+
+	  A submenu allows selecting a specific shared snapshot store
+	  driver.
+
 config DM_MIRROR
        tristate "Mirror target"
        depends on BLK_DEV_DM
Index: linux-2.6.34-rc4-fast/drivers/md/Makefile
===================================================================
--- linux-2.6.34-rc4-fast.orig/drivers/md/Makefile	2010-04-13 16:28:24.000000000 +0200
+++ linux-2.6.34-rc4-fast/drivers/md/Makefile	2010-04-14 13:36:38.000000000 +0200
@@ -7,6 +7,7 @@ dm-mod-y	+= dm.o dm-table.o dm-target.o 
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
+dm-multisnapshot-y += dm-multisnap.o
 dm-mirror-y	+= dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
@@ -42,6 +43,7 @@ obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipa
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
+obj-$(CONFIG_DM_MULTISNAPSHOT)	+= dm-multisnapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: linux-2.6.34-rc4-fast/drivers/md/dm-multisnap.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.34-rc4-fast/drivers/md/dm-multisnap.c	2010-04-14 00:56:32.000000000 +0200
@@ -0,0 +1,2060 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-private.h"
+
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+
+static void dm_multisnap_process_bios(struct dm_multisnap *s);
+
+/* --- locking --- */
+
+static void dm_multisnap_lock(struct dm_multisnap *s)
+{
+	mutex_lock(&s->master_lock);
+	if (s->p && s->store->store_lock_acquired)
+		/*
+		 * Flags is currently unused, it will be used to flush cache
+		 * in clustered environment
+		 */
+		s->store->store_lock_acquired(s->p, 0);
+}
+
+static void dm_multisnap_unlock(struct dm_multisnap *s)
+{
+	mutex_unlock(&s->master_lock);
+}
+
+static int dm_multisnap_lock_contended(struct dm_multisnap *s)
+{
+	return !list_empty(&s->master_lock.wait_list);
+}
+
+static void dm_multisnap_assert_locked(struct dm_multisnap *s)
+{
+	BUG_ON(!mutex_is_locked(&s->master_lock));
+}
+
+void dm_multisnap_status_lock(struct dm_multisnap *s)
+{
+	mutex_lock(&s->status_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_status_lock);
+
+void dm_multisnap_status_unlock(struct dm_multisnap *s)
+{
+	mutex_unlock(&s->status_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_status_unlock);
+
+void dm_multisnap_status_assert_locked(struct dm_multisnap *s)
+{
+	BUG_ON(!mutex_is_locked(&s->status_lock));
+}
+EXPORT_SYMBOL(dm_multisnap_status_assert_locked);
+
+/* --- helper functions to access internal state --- */
+
+/*
+ * These tiny functions are used to access internal state of dm_multisnap.
+ *
+ * We access these fields with functions and don't export struct dm_multisnap
+ * to exception store drivers, so that changes to "struct dm_multisnap" don't
+ * change the ABI.
+ */
+
+struct block_device *dm_multisnap_snapshot_bdev(struct dm_multisnap *s)
+{
+	return s->snapshot->bdev;
+}
+EXPORT_SYMBOL(dm_multisnap_snapshot_bdev);
+
+unsigned dm_multisnap_chunk_size(struct dm_multisnap *s)
+{
+	return s->chunk_size;
+}
+EXPORT_SYMBOL(dm_multisnap_chunk_size);
+
+void dm_multisnap_set_error(struct dm_multisnap *s, int error)
+{
+	if (!s->error)
+		s->error = error;
+
+	/*
+	 * Dump the stack on all errors, except space overflow.
+	 *
+	 * Space overflow can happen normally; other errors may mean that
+	 * there is a bug in the code, so getting a stack dump is valuable.
+	 */
+	if (error != -ENOSPC)
+		dump_stack();
+}
+EXPORT_SYMBOL(dm_multisnap_set_error);
+
+int dm_multisnap_has_error(struct dm_multisnap *s)
+{
+	return s->error;
+}
+EXPORT_SYMBOL(dm_multisnap_has_error);
+
+int dm_multisnap_drop_on_error(struct dm_multisnap *s)
+{
+	return !(s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR);
+}
+EXPORT_SYMBOL(dm_multisnap_drop_on_error);
+
+static DEFINE_MUTEX(all_multisnapshots_lock);
+static LIST_HEAD(all_multisnapshots);
+
+static chunk_t sector_to_chunk(struct dm_multisnap *s, sector_t sector)
+{
+	return sector >> (s->chunk_shift - SECTOR_SHIFT);
+}
+
+static sector_t chunk_to_sector(struct dm_multisnap *s, chunk_t chunk)
+{
+	return chunk << (s->chunk_shift - SECTOR_SHIFT);
+}
+
+int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid)
+{
+	return snapid == s->store->get_next_snapid(s->p, snapid);
+}
+EXPORT_SYMBOL(dm_multisnap_snapshot_exists);
+
+static long dm_multisnap_jobs_in_flight(struct dm_multisnap *s)
+{
+	return s->kcopyd_jobs_submitted_count - s->kcopyd_jobs_last_commit_count;
+}
+
+/* --- snapids --- */
+
+/*
+ * Any reading/writing of snapids in table/status/message must go
+ * through these functions, so that snapid format for userspace can
+ * be overridden.
+ */
+
+static void print_snapid(struct dm_multisnap *s, char *string,
+			 unsigned maxlen, snapid_t snapid)
+{
+	if (s->store->print_snapid)
+		s->store->print_snapid(s->p, string, maxlen, snapid);
+	else
+		snprintf(string, maxlen, "%llu", (unsigned long long)snapid);
+}
+
+static int read_snapid(struct dm_multisnap *s, char *string,
+		       snapid_t *snapid, char **error)
+{
+	if (s->store->read_snapid)
+		return s->store->read_snapid(s->p, string, snapid, error);
+	else {
+		int r;
+
+		char *argv_array[1] = { string };
+		char **argv = argv_array;
+		unsigned argc = 1;
+		__u64 unsigned_int64;
+
+		r = dm_multisnap_get_uint64(&argv, &argc, &unsigned_int64, error);
+		if (r)
+			return r;
+
+		*snapid = unsigned_int64;
+		return 0;
+	}
+}
+
+/* --- bio list --- */
+
+static DEFINE_SPINLOCK(dm_multisnap_bio_list_lock);
+
+static void wakeup_kmultisnapd(struct dm_multisnap *s)
+{
+	queue_work(s->wq, &s->work);
+}
+
+static void dm_multisnap_enqueue_bio_unlocked(struct dm_multisnap *s, struct bio *bio)
+{
+	/*
+	 * Writes are appended to queue 1, everything else to queue 0.
+	 * The "_unlocked" caller is expected to hold the bio list lock.
+	 */
+	unsigned idx = (bio_rw(bio) == WRITE) ? 1 : 0;
+	bio_list_add(&s->queue[idx].bios, bio);
+}
+
+static void dm_multisnap_enqueue_bio(struct dm_multisnap *s, struct bio *bio)
+{
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	dm_multisnap_enqueue_bio_unlocked(s, bio);
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+}
+
+static void dm_multisnap_enqueue_bio_list(struct dm_multisnap *s, struct bio_list *bl)
+{
+	struct bio *bio;
+	while ((bio = bio_list_pop(bl))) {
+		dm_multisnap_enqueue_bio(s, bio);
+		cond_resched();
+	}
+}
+
+static struct bio *dm_multisnap_dequeue_bio(struct dm_multisnap *s)
+{
+	struct bio *bio;
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	/* Too many kcopyd jobs since the last commit: serve only queue 0. */
+#ifdef DM_MULTISNAP_MAX_REMAPS
+	if (dm_multisnap_jobs_in_flight(s) >= DM_MULTISNAP_MAX_REMAPS) {
+		s->current_queue = 0;
+		goto test_current_queue;
+	}
+#endif
+	/* Flip the preferred queue on every call so the two queues alternate. */
+	s->current_queue ^= 1;
+	/* Try the queue that was current before the flip first ... */
+	bio = bio_list_pop(&s->queue[s->current_queue ^ 1].bios);
+	if (bio)
+		goto ret;
+	/* ... then the other one; the result is NULL if nothing was queued. */
+#ifdef DM_MULTISNAP_MAX_REMAPS
+test_current_queue:
+#endif
+	bio = bio_list_pop(&s->queue[s->current_queue].bios);
+
+ret:
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	return bio;
+}
+
+static int dm_multisnap_bio_queue_empty(struct dm_multisnap *s)
+{
+	unsigned i;
+	/* Returns non-zero iff none of the bio queues holds a bio. */
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+
+	for (i = 0; i < DM_MULTISNAP_N_QUEUES; i++)
+		if (!bio_list_empty(&s->queue[i].bios))
+			break;
+
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+	/* The loop ran to completion only if every queue was empty. */
+	return i == DM_MULTISNAP_N_QUEUES;
+}
+
+static void dm_multisnap_bio_dequeue_all(struct dm_multisnap *s, struct bio_list *bl)
+{
+	unsigned i;
+
+	bio_list_init(bl);
+	/* Move every queued bio onto "bl", leaving all queues empty. */
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+
+	for (i = 0; i < DM_MULTISNAP_N_QUEUES; i++) {
+		bio_list_merge(bl, &s->queue[i].bios);
+		bio_list_init(&s->queue[i].bios);
+	}
+
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+}
+
+static void dm_multisnap_init_bio_queues(struct dm_multisnap *s)
+{
+	unsigned i;
+	for (i = 0; i < DM_MULTISNAP_N_QUEUES; i++)
+		bio_list_init(&s->queue[i].bios);
+	s->current_queue = 0;
+}
+
+/* Reduce the size of the bio */
+
+static void bio_trim(struct bio *bio, unsigned size)
+{
+	unsigned i;
+	bio->bi_size = size;
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		if (size <= bio->bi_io_vec[i].bv_len) {
+			bio->bi_io_vec[i].bv_len = size;
+			bio->bi_vcnt = i + 1;
+			bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+			return;
+		}
+		size -= bio->bi_io_vec[i].bv_len;
+	}
+	BUG();
+}
+
+/* --- encode 64-bit snapids in bio */
+
+static snapid_t bio_get_snapid(struct bio *bio)
+{
+	return ((__u64)bio->bi_seg_front_size << 32) | bio->bi_seg_back_size;
+}
+
+static void bio_put_snapid(struct bio *bio, snapid_t snapid)
+{
+	bio->bi_seg_front_size = (__u64)snapid >> 32;
+	bio->bi_seg_back_size = snapid;
+}
+
+/* --- tracked chunks --- */
+
+static struct kmem_cache *tracked_chunk_cache;
+
+static int chunk_is_tracked(struct dm_multisnap *s, chunk_t chunk)
+{
+	struct dm_multisnap_tracked_chunk *c;
+	struct hlist_node *hn;
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	/* Return 1 if the chunk's hash bucket has an entry for it, else 0. */
+	hlist_for_each_entry(c, hn,
+	    &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
+		if (likely(c->chunk == chunk)) {
+			spin_unlock_irq(&dm_multisnap_bio_list_lock);
+			return 1;
+		}
+	}
+
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	return 0;
+}
+
+/* --- pending exception cache --- */
+
+static struct kmem_cache *pending_exception_cache;
+
+#define GFP_PENDING_EXCEPTION	GFP_NOIO
+
+static void pending_exception_ctor(void *pe_)
+{
+	struct dm_multisnap_pending_exception *pe = pe_;
+	bio_list_init(&pe->bios);
+}
+
+static struct dm_multisnap_pending_exception *
+dm_multisnap_alloc_pending_exception(struct dm_multisnap *s, chunk_t chunk)
+{
+	struct dm_multisnap_pending_exception *pe;
+	/*
+	 * Warning: we must not wait here, because we are holding master_lock
+	 * and taking this lock is needed to complete the exception.
+	 *
+	 * If an allocation failure happens, we must go up, drop the lock,
+	 * try a dummy mempool allocation and come back here again.
+	 */
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION & ~__GFP_WAIT);
+	if (unlikely(!pe))
+		return NULL;
+
+	pe->s = s;
+	pe->chunk = chunk;
+	hlist_add_head(&pe->hash_list, &s->pending_hash[DM_PENDING_HASH(chunk)]);
+	return pe;
+}
+
+static void dm_multisnap_free_pending_exception(struct dm_multisnap_pending_exception *pe)
+{
+	hlist_del(&pe->hash_list);
+	mempool_free(pe, pe->s->pending_pool);
+}
+
+static void dm_multisnap_wait_for_pending_exception(struct dm_multisnap *s)
+{
+	/*
+	 * Wait until there is something in the mempool. Free it immediately.
+	 */
+	struct dm_multisnap_pending_exception *pe;
+
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION | __GFP_WAIT);
+	mempool_free(pe, s->pending_pool);
+}
+
+/*
+ * Check if the chunk+snapid conflicts with any pending exception.
+ *
+ * If it does, queue the bio on the pending exception.
+ */
+static int check_pending_io(struct dm_multisnap *s, struct bio *bio,
+			    chunk_t chunk, snapid_t snapid)
+{
+	struct dm_multisnap_pending_exception *pe;
+	struct hlist_node *hn;
+	hlist_for_each_entry(pe, hn, &s->pending_hash[DM_PENDING_HASH(chunk)], hash_list) {
+		if (pe->chunk == chunk) {
+			int i;
+			if (snapid == DM_SNAPID_T_ORIGIN)
+				goto conflict;
+			for (i = 0; i < pe->n_descs; i++) {
+				if (s->store->check_conflict(s->p, &pe->desc[i], snapid))
+					goto conflict;
+			}
+		}
+		cond_resched();
+	}
+	return 0;
+
+conflict:
+	bio_list_add(&pe->bios, bio);
+	return 1;
+}
+
+/* --- commit --- */
+
+/*
+ * Test if commit can be performed. If these two variables are not equal,
+ * there are some pending kcopyd jobs and we must not commit.
+ */
+int dm_multisnap_can_commit(struct dm_multisnap *s)
+{
+	return s->kcopyd_jobs_submitted_count == s->kcopyd_jobs_finished_count;
+}
+EXPORT_SYMBOL(dm_multisnap_can_commit);
+
+/*
+ * Call exception store commit method.
+ * This can be called only if dm_multisnap_can_commit returned true;
+ * master_lock must be locked.
+ */
+void dm_multisnap_call_commit(struct dm_multisnap *s)
+{
+	s->kcopyd_jobs_last_commit_count = s->kcopyd_jobs_finished_count;
+	s->store->commit(s->p);
+	s->commit_sequence++;
+}
+EXPORT_SYMBOL(dm_multisnap_call_commit);
+
+/*
+ * Force commit at this point. It is guaranteed that commit happened when
+ * this function exits.
+ * master_lock must be unlocked.
+ *
+ * If the commit cannot be performed immediately (because there are pending
+ * chunks being copied), the function drops the lock and polls. It won't
+ * livelock --- either it will be possible to do the commit or someone
+ * has done the commit already (commit_sequence changed).
+ *
+ * The polling is justified because this function is only called when deleting
+ * a snapshot or when suspending the origin with postsuspend. These functions
+ * are not performance-critical, thus 1ms delay won't cause a performance
+ * problem.
+ */
+static int dm_multisnap_force_commit(struct dm_multisnap *s)
+{
+	int err;
+	unsigned commit_sequence;
+
+	dm_multisnap_lock(s);
+
+	commit_sequence = s->commit_sequence;
+
+	while (!dm_multisnap_can_commit(s)) {
+		dm_multisnap_unlock(s);
+		msleep(1);
+		dm_multisnap_lock(s);
+		if (s->commit_sequence != commit_sequence)
+			goto unlock_ret;
+	}
+
+	dm_multisnap_call_commit(s);
+
+unlock_ret:
+	err = dm_multisnap_has_error(s);
+	dm_multisnap_unlock(s);
+
+	return err;
+}
+
+/* --- kcopyd callback --- */
+
+static void remap_callback(int read_err, unsigned long write_err, void *pe_)
+{
+	struct dm_multisnap_pending_exception *pe = pe_;
+	struct dm_multisnap *s = pe->s;
+
+	if (unlikely((read_err | write_err) != 0))
+		DM_MULTISNAP_SET_ERROR(s, -EIO, ("%s: kcopyd I/O error: %d, %lx",
+						 __func__, read_err, write_err));
+
+	list_add_tail(&pe->list, &s->pes_waiting_for_commit);
+
+	s->kcopyd_jobs_finished_count++;
+
+	/* If there are more jobs pending, don't commit */
+	if (!dm_multisnap_can_commit(s))
+		return;
+
+	if (s->store->prepare_for_commit)
+		s->store->prepare_for_commit(s->p);
+
+	dm_multisnap_lock(s);
+
+	/* Recheck after the lock was taken */
+	if (unlikely(!dm_multisnap_can_commit(s))) {
+		/* Not yet ... kmultisnapd has just added something */
+		dm_multisnap_unlock(s);
+		return;
+	}
+
+	/* We need to commit stuff */
+
+	dm_multisnap_call_commit(s);
+
+	do {
+		pe = container_of(s->pes_waiting_for_commit.next,
+				  struct dm_multisnap_pending_exception, list);
+
+		/*
+		 * When we are about to free the pending exception, we must
+		 * wait for all reads to the appropriate chunk to finish.
+		 *
+		 * This prevents the following race condition:
+		 * - someone reads the chunk in the snapshot with no exception
+		 * - that read is remapped directly to the origin, the read
+		 *	is delayed for some reason
+		 * - someone else writes to the origin, this triggers realloc
+		 * - the realloc finishes
+		 * - the write is dispatched to the origin
+		 * - the read submitted first is dispatched and reads modified
+		 *	data
+		 *
+		 * This race is very improbable (non-shared snapshots have this
+		 * race too and it has never been reported, except in
+		 * artificially simulated cases). So we use active waiting with
+		 * msleep(1).
+		 */
+		while (chunk_is_tracked(s, pe->chunk))
+			msleep(1);
+
+		list_del(&pe->list);
+		dm_multisnap_enqueue_bio_list(s, &pe->bios);
+		dm_multisnap_free_pending_exception(pe);
+	} while (!list_empty(&s->pes_waiting_for_commit));
+
+	/*
+	 * Process the bios that we have just added to the queue.
+	 * It's faster to process them now than to hand them over to
+	 * kmultisnapd.
+	 */
+	dm_multisnap_process_bios(s);
+
+	dm_multisnap_unlock(s);
+
+	blk_unplug(bdev_get_queue(s->origin->bdev));
+	blk_unplug(bdev_get_queue(s->snapshot->bdev));
+}
+
+static void dispatch_kcopyd(struct dm_multisnap *s,
+			    struct dm_multisnap_pending_exception *pe,
+			    int from_snapshot, chunk_t chunk, struct bio *bio,
+			    struct dm_io_region *dests, unsigned n_dests)
+{
+	unsigned i;
+	struct dm_io_region src;
+
+	pe->n_descs = n_dests;
+
+	bio_list_add(&pe->bios, bio);
+
+	src.bdev = likely(!from_snapshot) ? s->origin->bdev : s->snapshot->bdev;
+	src.sector = chunk_to_sector(s, chunk);
+	src.count = s->chunk_size >> SECTOR_SHIFT;
+
+	if (likely(!from_snapshot) &&
+	    unlikely(src.sector + src.count > s->origin_sectors)) {
+		if (src.sector >= s->origin_sectors)
+			src.count = 0;
+		else
+			src.count = s->origin_sectors - src.sector;
+
+		for (i = 0; i < pe->n_descs; i++)
+			dests[i].count = src.count;
+	}
+
+	s->kcopyd_jobs_submitted_count++;
+
+	dm_kcopyd_copy(s->kcopyd, &src, n_dests, dests, 0, remap_callback, pe);
+}
+
+/* --- bio processing --- */
+
+/*
+ * Process bio on the origin.
+ * Reads and barriers never go here, they are dispatched directly.
+ */
+static void do_origin_write(struct dm_multisnap *s, struct bio *bio)
+{
+	int r;
+	unsigned i;
+	chunk_t chunk, new_chunk;
+	struct dm_multisnap_pending_exception *pe;
+	struct dm_io_region dests[DM_MULTISNAP_MAX_CHUNKS_TO_REMAP];
+
+	/* reads are processed directly in multisnap_origin_map */
+	BUG_ON(bio_rw(bio) != WRITE);
+
+	if (bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > s->origin_sectors) {
+		DMERR("%s: access beyond end of device, flags %lx, "
+		      "sector %llx, size %x, origin sectors %llx",
+		      __func__,
+		      bio->bi_flags,
+		      (unsigned long long)bio->bi_sector,
+		      bio->bi_size,
+		      (unsigned long long)s->origin_sectors);
+		bio_endio(bio, -EIO);
+		return;
+	}
+
+	if (unlikely(dm_multisnap_has_error(s)))
+		goto err_endio;
+
+	s->store->reset_query(s->p);
+
+	chunk = sector_to_chunk(s, bio->bi_sector);
+
+	r = s->store->query_next_remap(s->p, chunk);
+	if (unlikely(r < 0))
+		goto err_endio;
+
+	if (likely(!r)) {
+		/* There is nothing to remap */
+		if (unlikely(check_pending_io(s, bio, chunk, DM_SNAPID_T_ORIGIN)))
+			return;
+dispatch_write:
+		bio->bi_bdev = s->origin->bdev;
+		generic_make_request(bio);
+		return;
+	}
+
+	pe = dm_multisnap_alloc_pending_exception(s, chunk);
+	if (unlikely(!pe)) {
+		s->pending_mempool_allocation_failed = 1;
+		dm_multisnap_enqueue_bio(s, bio);
+		return;
+	}
+
+	/*
+	 * Jump to the middle of the cycle.
+	 * We already asked for the first remap, so we skip it in the first
+	 * iteration. Changing the cycle to start with add_next_remap would
+	 * make the code less readable because it wouldn't follow the natural
+	 * flow of operations, so we use this goto instead.
+	 */
+	i = 0;
+	goto skip_query_next_remap;
+	for (; i < DM_MULTISNAP_MAX_CHUNKS_TO_REMAP; i++) {
+		r = s->store->query_next_remap(s->p, chunk);
+		if (unlikely(r < 0))
+			goto free_err_endio;
+		if (likely(!r))
+			break;
+
+skip_query_next_remap:
+		s->store->add_next_remap(s->p, &pe->desc[i], &new_chunk);
+		if (unlikely(dm_multisnap_has_error(s)))
+			goto free_err_endio;
+
+		dests[i].bdev = s->snapshot->bdev;
+		dests[i].sector = chunk_to_sector(s, new_chunk);
+		dests[i].count = s->chunk_size >> SECTOR_SHIFT;
+	}
+
+	dispatch_kcopyd(s, pe, 0, chunk, bio, dests, i);
+	return;
+
+free_err_endio:
+	dm_multisnap_free_pending_exception(pe);
+err_endio:
+	r = -EIO;
+	if (!(s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR))
+		goto dispatch_write;
+
+	bio_endio(bio, r);
+	return;
+}
+
+/*
+ * Process bio on the snapshot.
+ * Barriers never go here, they are dispatched directly.
+ */
+static void do_snapshot_io(struct dm_multisnap *s, struct bio *bio, snapid_t id)
+{
+	chunk_t chunk, result, copy_from;
+	int r;
+	struct dm_multisnap_pending_exception *pe;
+	struct dm_io_region dest;
+
+	if (unlikely(!s->store->make_chunk_writeable) &&
+	    unlikely(bio_rw(bio) == WRITE))
+		goto err_endio;
+
+	if (unlikely(dm_multisnap_has_error(s)))
+		goto err_endio;
+
+	chunk = sector_to_chunk(s, bio->bi_sector);
+	r = s->store->find_snapshot_chunk(s->p, id, chunk,
+					  bio_rw(bio) == WRITE, &result);
+	if (unlikely(r < 0))
+		goto err_endio;
+
+	if (!r) {
+		/* Not found in the snapshot */
+		if (likely(bio_rw(bio) != WRITE)) {
+			union map_info *map_context;
+			struct dm_multisnap_tracked_chunk *c;
+
+			if (unlikely(bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > s->origin_sectors)) {
+				zero_fill_bio(bio);
+				if (bio->bi_sector >= s->origin_sectors) {
+					bio_endio(bio, 0);
+					return;
+				}
+				bio_trim(bio, (s->origin_sectors - bio->bi_sector) << SECTOR_SHIFT);
+			}
+
+			/*
+			 * Redirect reads to the origin.
+			 * Record the bio in the hash of tracked bios.
+			 * This prevents read-vs-realloc race.
+			 *
+			 * An important requirement is that when any bio is
+			 * added to tracked_chunk_hash, the bio must be finished
+			 * and removed from the hash without taking master_lock.
+			 *
+			 * So we add it immediately before submitting the bio
+			 * with generic_make_request.
+			 */
+			bio->bi_bdev = s->origin->bdev;
+
+			map_context = dm_get_mapinfo(bio);
+			BUG_ON(!map_context);
+			c = map_context->ptr;
+
+			spin_lock_irq(&dm_multisnap_bio_list_lock);
+			BUG_ON(!hlist_unhashed(&c->node));
+			hlist_add_head(&c->node, &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(c->chunk)]);
+			spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		} else {
+			pe = dm_multisnap_alloc_pending_exception(s, chunk);
+			if (unlikely(!pe))
+				goto failed_pe_allocation;
+
+			s->store->add_next_remap(s->p, &pe->desc[0], &result);
+			if (unlikely(dm_multisnap_has_error(s)))
+				goto free_err_endio;
+
+			dest.bdev = s->snapshot->bdev;
+			dest.sector = chunk_to_sector(s, result);
+			dest.count = s->chunk_size >> SECTOR_SHIFT;
+
+			dispatch_kcopyd(s, pe, 0, chunk, bio, &dest, 1);
+			return;
+		}
+	} else {
+		/* Found in the snapshot */
+		if (unlikely(check_pending_io(s, bio, chunk, id)))
+			return;
+
+		if (unlikely(bio_rw(bio) == WRITE) && r == 1) {
+			copy_from = result;
+
+			pe = dm_multisnap_alloc_pending_exception(s, chunk);
+			if (unlikely(!pe))
+				goto failed_pe_allocation;
+
+			s->store->make_chunk_writeable(s->p, &pe->desc[0], &result);
+			if (unlikely(dm_multisnap_has_error(s)))
+				goto free_err_endio;
+
+			dest.bdev = s->snapshot->bdev;
+			dest.sector = chunk_to_sector(s, result);
+			dest.count = s->chunk_size >> SECTOR_SHIFT;
+
+			dispatch_kcopyd(s, pe, 1, copy_from, bio, &dest, 1);
+			return;
+		}
+
+		bio->bi_bdev = s->snapshot->bdev;
+		bio->bi_sector &= (s->chunk_size >> SECTOR_SHIFT) - 1;
+		bio->bi_sector |= chunk_to_sector(s, result);
+	}
+	generic_make_request(bio);
+	return;
+
+free_err_endio:
+	dm_multisnap_free_pending_exception(pe);
+err_endio:
+	r = -EIO;
+	bio_endio(bio, r);
+	return;
+
+failed_pe_allocation:
+	s->pending_mempool_allocation_failed = 1;
+	dm_multisnap_enqueue_bio(s, bio);
+	return;
+}
+
+/*
+ * The main routine used to process everything in the thread.
+ * It must be called with master_lock held.
+ * It is usually called from the worker thread, but can also be called
+ * from other places (for example kcopyd callback), assuming that the caller
+ * holds master_lock.
+ */
+static void dm_multisnap_process_bios(struct dm_multisnap *s)
+{
+	struct bio *bio;
+	snapid_t snapid;
+
+again:
+	cond_resched();
+
+	if (!list_empty(&s->background_works)) {
+		struct dm_multisnap_background_work *bw =
+			list_entry(s->background_works.next,
+				   struct dm_multisnap_background_work, list);
+		list_del(&bw->list);
+		bw->queued = 0;
+		bw->work(s->p, bw);
+
+		cond_resched();
+	}
+
+	bio = dm_multisnap_dequeue_bio(s);
+	if (unlikely(!bio))
+		return;
+
+	snapid = bio_get_snapid(bio);
+	if (snapid == DM_SNAPID_T_ORIGIN)
+		do_origin_write(s, bio);
+	else
+		do_snapshot_io(s, bio, snapid);
+
+	if (likely(!s->pending_mempool_allocation_failed) &&
+	    likely(!dm_multisnap_lock_contended(s)))
+		goto again;
+
+	if (!dm_multisnap_bio_queue_empty(s))
+		wakeup_kmultisnapd(s);
+}
+
+/*
+ * Background-job routines exported for exception store drivers.
+ *
+ * Jobs queued with these routines will be executed in the background,
+ * with the master lock held.
+ */
+
+void dm_multisnap_queue_work(struct dm_multisnap *s,
+			     struct dm_multisnap_background_work *bw)
+{
+	dm_multisnap_assert_locked(s);
+
+	if (bw->queued) {
+		BUG_ON(bw->queued != 1);
+		return;
+	}
+
+	bw->queued = 1;
+	list_add(&bw->list, &s->background_works);
+	wakeup_kmultisnapd(s);
+}
+EXPORT_SYMBOL(dm_multisnap_queue_work);
+
+void dm_multisnap_cancel_work(struct dm_multisnap *s,
+			      struct dm_multisnap_background_work *bw)
+{
+	dm_multisnap_assert_locked(s);
+
+	if (!bw->queued)
+		return;
+
+	bw->queued = 0;
+	list_del(&bw->list);
+}
+EXPORT_SYMBOL(dm_multisnap_cancel_work);
+
+/*
+ * The main work thread.
+ */
+static void dm_multisnap_work(struct work_struct *work)
+{
+	struct dm_multisnap *s = container_of(work, struct dm_multisnap, work);
+
+	dm_multisnap_lock(s);
+	dm_multisnap_process_bios(s);
+	dm_multisnap_unlock(s);
+
+	/*
+	 * If there was some mempool allocation failure we must wait, outside
+	 * the lock, until there is some free memory.
+	 * If this branch is taken, the work is already queued again, so it
+	 * reexecutes after finding some memory.
+	 */
+	if (unlikely(s->pending_mempool_allocation_failed)) {
+		s->pending_mempool_allocation_failed = 0;
+		dm_multisnap_wait_for_pending_exception(s);
+	}
+
+	blk_unplug(bdev_get_queue(s->origin->bdev));
+	blk_unplug(bdev_get_queue(s->snapshot->bdev));
+}
+
+static struct dm_multisnap *find_multisnapshot(struct block_device *origin)
+{
+	struct dm_multisnap *s;
+	list_for_each_entry(s, &all_multisnapshots, list_all)
+		if (s->origin->bdev == origin)
+			return s;
+	return NULL;
+}
+
+/* --- exception stores --- */
+
+static DEFINE_MUTEX(exception_stores_lock);
+static LIST_HEAD(all_exception_stores);
+
+static struct dm_multisnap_exception_store *
+dm_multisnap_find_exception_store(const char *name)
+{
+	struct dm_multisnap_exception_store *store;
+
+	list_for_each_entry(store, &all_exception_stores, list)
+		if (!strcmp(store->name, name))
+			return store;
+
+	return NULL;
+}
+
+static int dm_multisnap_exception_store_active(struct dm_multisnap_exception_store *find)
+{
+	struct dm_multisnap_exception_store *store;
+
+	list_for_each_entry(store, &all_exception_stores, list)
+		if (store == find)
+			return 1;
+
+	return 0;
+}
+
+int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+	/* Registering a store that is already on the list is a caller bug. */
+	BUG_ON(dm_multisnap_exception_store_active(store));
+	/* Store names must be unique: a duplicate name yields -EEXIST. */
+	if (dm_multisnap_find_exception_store(store->name)) {
+		mutex_unlock(&exception_stores_lock);
+		return -EEXIST;
+	}
+	list_add(&store->list, &all_exception_stores);
+
+	mutex_unlock(&exception_stores_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_register_exception_store);
+
+void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+
+	BUG_ON(!dm_multisnap_exception_store_active(store));
+	list_del(&store->list);
+
+	mutex_unlock(&exception_stores_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_unregister_exception_store);
+
+static struct dm_multisnap_exception_store *
+dm_multisnap_get_exception_store(const char *name)
+{
+	struct dm_multisnap_exception_store *store;
+
+	mutex_lock(&exception_stores_lock);
+	/* Find the store by name and pin its module; NULL if either fails. */
+	store = dm_multisnap_find_exception_store(name);
+	if (store) {
+		if (!try_module_get(store->module))
+			store = NULL;
+	}
+
+	mutex_unlock(&exception_stores_lock);
+
+	return store;
+}
+
+static void dm_multisnap_put_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+
+	BUG_ON(!dm_multisnap_exception_store_active(store));
+	module_put(store->module);
+
+	mutex_unlock(&exception_stores_lock);
+}
+
+/* --- argument parser --- */
+
+int dm_multisnap_get_string(char ***argv, unsigned *argc,
+			    char **string, char **error)
+{
+	if (!*argc) {
+		*error = "Not enough arguments";
+		return -EINVAL;
+	}
+	*string = *(*argv)++;
+	(*argc)--;
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_string);
+
+int dm_multisnap_get_uint64(char ***argv, unsigned *argc,
+			    __u64 *unsigned_int64, char **error)
+{
+	char *string;
+	int r = dm_multisnap_get_string(argv, argc, &string, error);
+	if (r)
+		return r;
+	if (!*string) {
+invalid_number:
+		*error = "Invalid number";
+		return -EINVAL;
+	}
+	*unsigned_int64 = simple_strtoull(string, &string, 10);
+	if (*string)
+		goto invalid_number;
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_uint64);
+
+int dm_multisnap_get_uint(char ***argv, unsigned *argc,
+			  unsigned *unsigned_int, char **error)
+{
+	__u64 unsigned_int64;
+	int r = dm_multisnap_get_uint64(argv, argc, &unsigned_int64, error);
+	if (r)
+		return r;
+	*unsigned_int = unsigned_int64;
+	if (unsigned_int64 != *unsigned_int) {
+		*error = "Number out of range";
+		return -ERANGE;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_uint);
+
+int dm_multisnap_get_argcount(char ***argv, unsigned *argc,
+			      unsigned *unsigned_int, char **error)
+{
+	int r = dm_multisnap_get_uint(argv, argc, unsigned_int, error);
+	if (r)
+		return r;
+	if (*unsigned_int > *argc) {
+		*error = "Not enough arguments";
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_argcount);
+
+void dm_multisnap_adjust_string(char **result, unsigned *maxlen)
+{
+	unsigned len = strlen(*result);
+	*result += len;
+	*maxlen -= len;
+}
+EXPORT_SYMBOL(dm_multisnap_adjust_string);
+
+/* --- target methods --- */
+
+static int compare_snapids(const void *p1, const void *p2)
+{
+	/* Three-way comparison of two snapids, ascending order. */
+	const snapid_t *lhs = p1;
+	const snapid_t *rhs = p2;
+
+	if (*lhs == *rhs)
+		return 0;
+	return *lhs < *rhs ? -1 : 1;
+}
+
+/* --- constructor & destructor --- */
+
+/*
+ * Origin constructor.  Arguments: origin, snapshot store, chunk size (x512 bytes),
+ * counted generic args, store name, counted store args [, counted snapids].
+ */
+static int multisnap_origin_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	int i;
+	char *origin_path;
+	char *snapshot_path;
+	unsigned chunk_size;
+	unsigned generic_args;
+	char *store_name;
+	unsigned store_args;
+	unsigned num_snapshots;
+	struct dm_multisnap *s, *ss;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	r = dm_multisnap_get_string(&argv, &argc, &origin_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_string(&argv, &argc, &snapshot_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_uint(&argv, &argc, &chunk_size, &ti->error);
+	if (r)
+		goto bad_arguments;
+
+	s = kmalloc(sizeof(struct dm_multisnap), GFP_KERNEL);
+	if (!s) {
+		ti->error = "Can't allocate multisnapshot structure";
+		r = -ENOMEM;
+		goto bad_s;
+	}
+
+	ti->private = s;
+	s->p = NULL;
+	s->error = 0;
+	s->flags = 0;
+	mutex_init(&s->master_lock);
+	mutex_init(&s->status_lock);
+	INIT_WORK(&s->work, dm_multisnap_work);
+	dm_multisnap_init_bio_queues(s);
+	INIT_LIST_HEAD(&s->background_works);
+	s->kcopyd_jobs_submitted_count = 0;
+	s->kcopyd_jobs_finished_count = 0;
+	s->kcopyd_jobs_last_commit_count = 0;
+	INIT_LIST_HEAD(&s->pes_waiting_for_commit);
+	s->commit_sequence = 0;
+	for (i = 0; i < DM_PENDING_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->pending_hash[i]);
+	s->pending_mempool_allocation_failed = 0;
+	s->new_snapid_valid = 0;
+	INIT_LIST_HEAD(&s->all_snaps);
+
+	r = dm_multisnap_get_argcount(&argv, &argc, &generic_args, &ti->error);
+	if (r)
+		goto bad_generic_arguments;	/* s is allocated by now and must be freed */
+	while (generic_args--) {
+		char *arg;
+		r = dm_multisnap_get_string(&argv, &argc, &arg, &ti->error);
+		if (r)
+			goto bad_generic_arguments;
+
+		/* Synchronize snapshot list against the list given in the target table */
+		if (!strcasecmp(arg, "sync-snapshots"))
+			s->flags |= DM_MULTISNAP_SYNC_SNAPSHOTS;
+		/* Don't drop the snapshot store on error, rather stop the origin */
+		else if (!strcasecmp(arg, "preserve-on-error"))
+			s->flags |= DM_MULTISNAP_PRESERVE_ON_ERROR;
+		else {
+			r = -EINVAL;
+			ti->error = "Invalid argument";
+			goto bad_generic_arguments;
+		}
+	}
+
+	r = dm_get_device(ti, origin_path, FMODE_READ | FMODE_WRITE, &s->origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+	s->origin_sectors = i_size_read(s->origin->bdev->bd_inode) >> SECTOR_SHIFT;
+
+	r = dm_get_device(ti, snapshot_path, FMODE_READ | FMODE_WRITE, &s->snapshot);
+	if (r) {
+		ti->error = "Could not get snapshot device";
+		goto bad_snapshot;
+	}
+
+	/*
+	 * Prevent multiple loads over the same devices.
+	 *
+	 * Currently, multisnapshot target is loaded just once, there is no
+	 * place where it would be reloaded (even lvchange --refresh doesn't
+	 * do it).  So there is no need to handle loading the target multiple
+	 * times for the same devices and "handover" of the exception store.
+	 *
+	 * As a safeguard to protect against possible data corruption from
+	 * userspace misbehavior, we check that there is no other target loaded
+	 * that has the origin or the snapshot store on the same devices.
+	 */
+	list_for_each_entry(ss, &all_multisnapshots, list_all)
+		if (ss->origin->bdev == s->origin->bdev ||
+		    ss->snapshot->bdev == s->snapshot->bdev) {
+			ti->error = "Another multisnapshot with the same devices";
+			r = -EINVAL;
+			goto bad_conflicting_snapshot;
+		}
+
+	/* Validate the chunk size */
+	if (chunk_size > INT_MAX / 512) {
+		ti->error = "Chunk size is too high";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	if (!is_power_of_2(chunk_size)) {
+		ti->error = "Chunk size is not power of two";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	chunk_size *= 512;
+	if (chunk_size < bdev_logical_block_size(s->origin->bdev) ||
+	    chunk_size < bdev_logical_block_size(s->snapshot->bdev)) {
+		ti->error = "Chunk size is smaller than device block size";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	s->chunk_size = chunk_size;
+	s->chunk_shift = ffs(chunk_size) - 1;
+
+	s->pending_pool = mempool_create_slab_pool(DM_PENDING_MEMPOOL_SIZE,
+						   pending_exception_cache);
+	if (!s->pending_pool) {
+		ti->error = "Could not allocate mempool for pending exceptions";
+		r = -ENOMEM;
+		goto bad_pending_pool;
+	}
+
+	s->tracked_chunk_pool = mempool_create_slab_pool(DM_TRACKED_CHUNK_POOL_SIZE,
+							 tracked_chunk_cache);
+	if (!s->tracked_chunk_pool) {
+		ti->error = "Could not allocate tracked_chunk mempool for tracking reads";
+		r = -ENOMEM;	/* r still holds 0 from the last dm_get_device */
+		goto bad_tracked_chunk_pool;
+	}
+	s->n_tracked_ios = 0;
+	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
+
+	r = dm_kcopyd_client_create(DM_MULTISNAP_KCOPYD_PAGES, &s->kcopyd);
+	if (r) {
+		ti->error = "Could not create kcopyd client";
+		goto bad_kcopyd;
+	}
+
+	r = dm_multisnap_get_string(&argv, &argc, &store_name, &ti->error);
+	if (r)
+		goto bad_store;
+	r = dm_multisnap_get_argcount(&argv, &argc, &store_args, &ti->error);
+	if (r)
+		goto bad_store;
+
+	s->store = dm_multisnap_get_exception_store(store_name);
+	if (!s->store) {
+		request_module("dm-store-%s", store_name);
+		s->store = dm_multisnap_get_exception_store(store_name);
+		if (!s->store) {
+			ti->error = "Can't get exception store type";
+			r = -ENOENT;
+			goto bad_store;
+		}
+	}
+
+	s->wq = create_singlethread_workqueue("kmultisnapd");
+	if (!s->wq) {
+		ti->error = "Could not create kernel thread";
+		r = -ENOMEM;
+		goto bad_thread;
+	}
+
+	dm_multisnap_lock(s);
+	r = s->store->init_exception_store(s, &s->p, store_args, argv, &ti->error);
+	if (r) {
+		s->p = NULL;
+		goto exception_store_error;
+	}
+
+	ti->split_io = s->chunk_size >> SECTOR_SHIFT;
+	ti->num_flush_requests = 1;
+	argv += store_args;
+	argc -= store_args;
+
+	/*
+	 * Synchronize snapshot IDs according to the table line:
+	 *	allocate IDs that are specified on the table line
+	 *	free IDs that are not specified on the table line
+	 */
+	if (s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS) {
+		snapid_t sn, n, *snapids;
+		r = dm_multisnap_get_argcount(&argv, &argc, &num_snapshots, &ti->error);
+		if (r)
+			goto error_syncing_snapshots;
+		snapids = vmalloc(sizeof(snapid_t) * (num_snapshots + 1));
+		if (!snapids) {
+			ti->error = "Could not allocate snapids array";
+			r = -ENOMEM;
+			goto error_syncing_snapshots;	/* store is up, master lock held */
+		}
+		for (n = 0; n < num_snapshots; n++) {
+			char *string;
+			r = dm_multisnap_get_string(&argv, &argc, &string, &ti->error);
+			if (r) {
+				vfree(snapids);
+				goto error_syncing_snapshots;
+			}
+			r = read_snapid(s, string, &snapids[n], &ti->error);
+			if (r) {
+				vfree(snapids);
+				goto error_syncing_snapshots;
+			}
+		}
+		snapids[num_snapshots] = DM_SNAPID_T_ORIGIN;
+
+		/* Delete the snapshots that shouldn't be there */
+		sort(snapids, num_snapshots, sizeof(snapid_t), compare_snapids, NULL);
+		sn = s->store->get_next_snapid(s->p, 0);
+		for (n = 0; n <= num_snapshots; n++) {
+			while (sn < snapids[n]) {
+				if (!dm_multisnap_has_error(s)) {
+					r = s->store->delete_snapshot(s->p, sn);
+					if (r && s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR) {
+						ti->error = "Could not delete snapshot";
+						vfree(snapids);
+						goto error_syncing_snapshots;
+					}
+				}
+				sn = s->store->get_next_snapid(s->p, sn + 1);
+				if (sn == DM_SNAPID_T_ORIGIN)
+					goto delete_done;
+			}
+			if (sn == snapids[n]) {
+				sn = s->store->get_next_snapid(s->p, sn + 1);
+				if (sn == DM_SNAPID_T_ORIGIN)
+					goto delete_done;
+			}
+		}
+delete_done:
+		/* Create the snapshots that should be there */
+		if (s->store->compare_snapids_for_create)
+			sort(snapids, num_snapshots, sizeof(snapid_t),
+			     s->store->compare_snapids_for_create, NULL);
+		for (n = 0; n <= num_snapshots; n++) {
+			if (!dm_multisnap_snapshot_exists(s, snapids[n])) {
+				if (!dm_multisnap_has_error(s)) {
+					r = s->store->create_snapshot(s->p, snapids[n]);
+					if (r && s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR) {
+						ti->error = "Could not create snapshot";
+						vfree(snapids);
+						goto error_syncing_snapshots;
+					}
+				}
+			}
+		}
+		vfree(snapids);
+	}
+
+	dm_multisnap_unlock(s);
+	list_add(&s->list_all, &all_multisnapshots);
+	mutex_unlock(&all_multisnapshots_lock);
+	return 0;
+
+error_syncing_snapshots:
+	s->store->exit_exception_store(s->p);
+	s->p = NULL;
+exception_store_error:
+	dm_multisnap_unlock(s);
+	destroy_workqueue(s->wq);
+bad_thread:
+	dm_multisnap_put_exception_store(s->store);
+bad_store:
+	dm_kcopyd_client_destroy(s->kcopyd);
+bad_kcopyd:
+	mempool_destroy(s->tracked_chunk_pool);
+bad_tracked_chunk_pool:
+	mempool_destroy(s->pending_pool);
+bad_pending_pool:
+bad_conflicting_snapshot:
+bad_chunk_size:
+	dm_put_device(ti, s->snapshot);
+bad_snapshot:
+	dm_put_device(ti, s->origin);
+bad_origin:
+bad_generic_arguments:
+	kfree(s);
+bad_s:
+bad_arguments:
+	mutex_unlock(&all_multisnapshots_lock);
+	return r;
+}
+
+static void multisnap_origin_dtr(struct dm_target *ti)
+{
+	struct dm_multisnap *s = ti->private;
+	struct dm_multisnap_snap *sn;
+	unsigned i;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	/* Make sure that no more IOs will be submitted by snapshot targets */
+	list_for_each_entry(sn, &s->all_snaps, list_snaps) {
+		spin_lock_irq(&dm_multisnap_bio_list_lock);
+		sn->s = NULL;
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+	}
+	list_del(&s->all_snaps);
+
+	/*
+	 * This code is called in the destructor, it is not performance
+	 * sensitive and thus we use polling with active waiting (msleep(1)).
+	 *
+	 * A possible 1ms delay on device destruction won't cause any trouble
+	 * and this polling is simpler and less bug-prone than using wait
+	 * queues.
+	 */
+poll_for_ios:
+	/* Wait for IOs on the snapshot */
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	if (s->n_tracked_ios) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		msleep(1);
+		goto poll_for_ios;
+	}
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	/* Make sure that there really are no outstanding IOs */
+	for (i = 0; i < DM_MULTISNAP_N_QUEUES; i++)
+		BUG_ON(!bio_list_empty(&s->queue[i].bios));
+	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
+
+	/* Wait for pending reallocations */
+	dm_multisnap_lock(s);
+	for (i = 0; i < DM_PENDING_HASH_SIZE; i++)
+		if (!hlist_empty(&s->pending_hash[i])) {
+			dm_multisnap_unlock(s);
+			msleep(1);
+			goto poll_for_ios;
+		}
+	dm_multisnap_unlock(s);
+
+	flush_workqueue(s->wq);
+
+	dm_multisnap_lock(s);
+	dm_multisnap_call_commit(s);
+	s->store->exit_exception_store(s->p);
+	s->p = NULL;
+	list_del(&s->list_all);
+	dm_multisnap_unlock(s);
+
+	destroy_workqueue(s->wq);
+	/* s->p was set to NULL above, so there is nothing of it left to free here */
+	dm_kcopyd_client_destroy(s->kcopyd);
+	mempool_destroy(s->tracked_chunk_pool);
+	mempool_destroy(s->pending_pool);
+	dm_put_device(ti, s->snapshot);
+	dm_put_device(ti, s->origin);
+	dm_multisnap_put_exception_store(s->store);
+
+	kfree(s);
+
+	mutex_unlock(&all_multisnapshots_lock);
+}
+
+/*
+ * Call fn on the origin device and, if that returned zero, on the snapshot store.
+ */
+static int multisnap_iterate_devices(struct dm_target *ti, struct dm_multisnap *s,
+				     iterate_devices_callout_fn fn, void *data)
+{
+	sector_t store_sectors;
+	int ret = fn(ti, s->origin, 0, s->origin_sectors, data);
+
+	if (ret)
+		return ret;
+
+	store_sectors = i_size_read(s->snapshot->bdev->bd_inode) >> SECTOR_SHIFT;
+	return fn(ti, s->snapshot, 0, store_sectors, data);
+}
+
+/* iterate_devices method of the origin target. */
+static int multisnap_origin_iterate_devices(struct dm_target *ti,
+					    iterate_devices_callout_fn fn, void *data)
+{
+	return multisnap_iterate_devices(ti, ti->private, fn, data);
+}
+
+/*
+ * iterate_devices method of a snapshot target; reports nothing when sn->s is NULL.
+ */
+static int multisnap_snap_iterate_devices(struct dm_target *ti,
+					  iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_multisnap_snap *snap = ti->private;
+	struct dm_multisnap *origin;
+	int ret = 0;
+
+	mutex_lock(&all_multisnapshots_lock);
+	origin = snap->s;
+	if (origin)
+		ret = multisnap_iterate_devices(ti, origin, fn, data);
+	mutex_unlock(&all_multisnapshots_lock);
+	return ret;
+}
+
+/*
+ * Map method of the origin target.  Anything that is not a write, and also
+ * empty barriers, is remapped straight to the origin device (the common
+ * case); other writes are tagged with the origin snapid and queued.
+ */
+static int multisnap_origin_map(struct dm_target *ti, struct bio *bio,
+				union map_info *map_context)
+{
+	struct dm_multisnap *s = ti->private;
+
+	if (unlikely(bio_rw(bio) == WRITE) && likely(!bio_empty_barrier(bio))) {
+		/* A real write: hand it over to kmultisnapd */
+		bio_put_snapid(bio, DM_SNAPID_T_ORIGIN);
+		dm_multisnap_enqueue_bio(s, bio);
+		wakeup_kmultisnapd(s);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	bio->bi_bdev = s->origin->bdev;
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * Message method of the origin target.  Understood messages are "create",
+ * "create_subsnap <snapid>" and "delete <snapid>"; all others get -EINVAL.
+ */
+static int multisnap_origin_message(struct dm_target *ti,
+				    unsigned argc, char **argv)
+{
+	struct dm_multisnap *s = ti->private;
+	char *error;
+	int r;
+	int subsnap = 0;
+	snapid_t subsnap_id = 0;
+
+	mutex_lock(&all_multisnapshots_lock);
+	dm_multisnap_lock(s);
+
+	if (argc == 2 && !strcasecmp(argv[0], "create_subsnap")) {
+		/* Create snapshot of snapshot: parse the master id, then share the "create" path */
+		r = read_snapid(s, argv[1], &subsnap_id, &error);
+		if (r) {
+			DMWARN("invalid snapshot id: %s", error);
+			goto unlock_ret;
+		}
+		subsnap = 1;
+		goto create_snapshot;
+	}
+
+	if (argc == 1 && !strcasecmp(argv[0], "create")) {
+create_snapshot:
+		/*
+		 * Prepare snapshot creation.
+		 *
+		 * We allocate a snapid, and return it in the status.
+		 *
+		 * The snapshot is really created in postsuspend method (to
+		 * make sure that possibly mounted filesystem is quiescent and
+		 * the snapshot will be consistent).
+		 */
+		r = dm_multisnap_has_error(s);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_status_lock(s);
+		s->new_snapid_valid = 0;	/* invalidate any previously prepared id */
+		dm_multisnap_status_unlock(s);
+
+		r = s->store->allocate_snapid(s->p, &s->new_snapid,
+					      subsnap, subsnap_id);
+		if (r)
+			goto unlock_ret;
+
+		r = dm_multisnap_has_error(s);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_status_lock(s);
+		s->new_snapid_valid = 1;	/* the status method prints new_snapid from now on */
+		dm_multisnap_status_unlock(s);
+
+		r = 0;
+		goto unlock_ret;
+	}
+
+	if (argc == 2 && !strcasecmp(argv[0], "delete")) {
+		/* Delete a snapshot: detach its target, fail its queued bios, then delete and commit */
+		snapid_t snapid;
+		struct dm_multisnap_snap *sn;
+		struct bio *bio;
+		struct bio_list all_bios;
+
+		r = read_snapid(s, argv[1], &snapid, &error);
+		if (r) {
+			DMWARN("invalid snapshot id: %s", error);
+			goto unlock_ret;
+		}
+
+		if (!s->store->delete_snapshot) {
+			DMERR("snapshot store doesn't support delete");
+			r = -EOPNOTSUPP;
+			goto unlock_ret;
+		}
+
+		r = dm_multisnap_has_error(s);
+		if (r)
+			goto unlock_ret;
+
+		/* Kick off possibly attached snapshot */
+		list_for_each_entry(sn, &s->all_snaps, list_snaps) {
+			if (sn->snapid == snapid) {
+				spin_lock_irq(&dm_multisnap_bio_list_lock);
+				sn->s = NULL;
+				spin_unlock_irq(&dm_multisnap_bio_list_lock);
+			}
+		}
+
+		/* Terminate bios queued for this snapshot so far */
+		dm_multisnap_bio_dequeue_all(s, &all_bios);
+		while ((bio = bio_list_pop(&all_bios))) {
+			if (bio_get_snapid(bio) == snapid)
+				bio_endio(bio, -EIO);
+			else
+				dm_multisnap_enqueue_bio(s, bio);	/* not ours: put it back */
+		}
+
+		if (!dm_multisnap_snapshot_exists(s, snapid)) {
+			DMWARN("snapshot with this id doesn't exists.");
+			r = -EINVAL;
+			goto unlock_ret;
+		}
+
+		r = s->store->delete_snapshot(s->p, snapid);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_unlock(s);
+
+		r = dm_multisnap_force_commit(s);
+
+		goto unlock2_ret;	/* the master lock has already been dropped above */
+	}
+
+	DMWARN("unrecognised message received.");
+	r = -EINVAL;
+
+unlock_ret:
+	dm_multisnap_unlock(s);
+unlock2_ret:
+	mutex_unlock(&all_multisnapshots_lock);
+
+	return r;
+}
+
+/* Print " <count>" and then " <id>" for every used snapshot ID into result */
+static void print_snapshot_ids(struct dm_multisnap *s, char *result, unsigned maxlen)
+{
+	snapid_t id, count = 0;
+
+	for (id = s->store->get_next_snapid(s->p, 0); id != DM_SNAPID_T_ORIGIN;
+	     id = s->store->get_next_snapid(s->p, id + 1))
+		count++;
+	snprintf(result, maxlen, " %llu", (unsigned long long)count);
+	dm_multisnap_adjust_string(&result, &maxlen);
+	for (id = s->store->get_next_snapid(s->p, 0); id != DM_SNAPID_T_ORIGIN;
+	     id = s->store->get_next_snapid(s->p, id + 1)) {
+		snprintf(result, maxlen, " ");
+		dm_multisnap_adjust_string(&result, &maxlen);
+		print_snapid(s, result, maxlen, id);
+		dm_multisnap_adjust_string(&result, &maxlen);
+	}
+}
+
+/*
+ * Status method of the origin target: INFO reports error state, prepared
+ * snapid, space usage and snapshot IDs; TABLE reproduces the table line.
+ */
+static int multisnap_origin_status(struct dm_target *ti, status_type_t type,
+				   char *result, unsigned maxlen)
+{
+	struct dm_multisnap *s = ti->private;
+
+	/*
+	 * Take the status lock rather than the master lock: the master lock
+	 * may be unobtainable while the underlying device is suspended.
+	 */
+	dm_multisnap_status_lock(s);
+
+	switch (type) {
+	case STATUSTYPE_INFO: {
+		unsigned long long total = 0, alloc = 0, meta = 0;
+		int to_sectors = s->chunk_shift - SECTOR_SHIFT;
+		snprintf(result, maxlen, "5 %d ", dm_multisnap_has_error(s));
+		dm_multisnap_adjust_string(&result, &maxlen);
+		if (s->new_snapid_valid)
+			print_snapid(s, result, maxlen, s->new_snapid);
+		else
+			snprintf(result, maxlen, "-");
+		dm_multisnap_adjust_string(&result, &maxlen);
+		if (s->store->get_space)
+			s->store->get_space(s->p, &total, &alloc, &meta);
+		/* The store counts in chunks, the status line is in sectors */
+		total <<= to_sectors;
+		alloc <<= to_sectors;
+		meta <<= to_sectors;
+		snprintf(result, maxlen, " %llu %llu %llu", total, alloc, meta);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		print_snapshot_ids(s, result, maxlen);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	}
+	case STATUSTYPE_TABLE: {
+		unsigned n_generic = !!(s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS) +
+				     !!(s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR);
+		snprintf(result, maxlen, "%s %s %u %u%s%s %s",
+			s->origin->name,
+			s->snapshot->name,
+			s->chunk_size / 512,
+			n_generic,
+			s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS ?
+				" sync-snapshots" : "",
+			s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR ?
+				" preserve-on-error" : "",
+			s->store->name);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		if (s->store->status_table)
+			s->store->status_table(s->p, result, maxlen);
+		else
+			snprintf(result, maxlen, " 0");
+		dm_multisnap_adjust_string(&result, &maxlen);
+		if (s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS) {
+			print_snapshot_ids(s, result, maxlen);
+			dm_multisnap_adjust_string(&result, &maxlen);
+		}
+		break;
+	}
+	}
+
+	dm_multisnap_status_unlock(s);
+
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+/*
+ * In postsuspend, we optionally create a snapshot that we prepared with
+ * a message.
+ */
+static void multisnap_origin_postsuspend(struct dm_target *ti)
+{
+	struct dm_multisnap *ms = ti->private;
+
+	dm_multisnap_lock(ms);
+	if (ms->new_snapid_valid && !dm_multisnap_has_error(ms)) {
+		/*
+		 * The return value is dropped: postsuspend cannot report
+		 * an error, but it is recorded in the error field anyway.
+		 */
+		ms->store->create_snapshot(ms->p, ms->new_snapid);
+		ms->new_snapid_valid = 0;
+	}
+	dm_multisnap_unlock(ms);
+
+	dm_multisnap_force_commit(ms);
+}
+
+/* Constructor of a snapshot target.  Arguments: <origin device> <snapshot id> */
+static int multisnap_snap_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	char *origin_path;
+	char *snapid_str;
+	snapid_t snapid;
+	int doesnt_exist;
+	struct dm_dev *origin;
+	struct dm_multisnap *s;
+	struct dm_multisnap_snap *sn;
+
+	r = dm_multisnap_get_string(&argv, &argc, &origin_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_string(&argv, &argc, &snapid_str, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_get_device(ti, origin_path, FMODE_READ | FMODE_WRITE, &origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+	mutex_lock(&all_multisnapshots_lock);
+	s = find_multisnapshot(origin->bdev);
+	if (!s) {
+		r = -ENXIO;
+		ti->error = "Origin target not loaded";
+		goto origin_not_loaded;
+	}
+
+	dm_multisnap_lock(s);
+
+	r = read_snapid(s, snapid_str, &snapid, &ti->error);
+	if (r) {
+		dm_multisnap_unlock(s);
+		goto snapid_doesnt_exist;
+	}
+
+	doesnt_exist = 0;
+	if (!dm_multisnap_snapshot_exists(s, snapid)) {
+		if (dm_multisnap_has_error(s) && dm_multisnap_drop_on_error(s)) {
+			/*
+			 * If there was an error, we don't know which snapshot
+			 * IDs are available. So we must accept it. But we
+			 * abort all accesses to this snapshot with an error.
+			 */
+			doesnt_exist = 1;
+		} else {
+			dm_multisnap_unlock(s);
+			r = -ENOENT;
+			ti->error = "Snapshot with this id doesn't exist";
+			goto snapid_doesnt_exist;
+		}
+	}
+	dm_multisnap_unlock(s);
+
+	/* NOTE(review): relies on sizeof(*sn) already covering the NUL of snapid_string - confirm */
+	sn = kmalloc(sizeof(*sn) + strlen(snapid_str), GFP_KERNEL);
+	if (!sn) {
+		ti->error = "Could not allocate multisnapshot_snap structure";
+		r = -ENOMEM;
+		goto cant_allocate;
+	}
+	sn->s = doesnt_exist ? NULL : s;
+	sn->snapid = snapid;
+	list_add(&sn->list_snaps, &s->all_snaps);
+	strlcpy(sn->origin_name, origin->name, sizeof sn->origin_name);
+	strcpy(sn->snapid_string, snapid_str);
+	/* Read s before unlocking: the origin destructor frees it under this mutex */
+	ti->private = sn;
+	ti->split_io = s->chunk_size >> SECTOR_SHIFT;
+	ti->num_flush_requests = 1;
+
+	mutex_unlock(&all_multisnapshots_lock);
+
+	dm_put_device(ti, origin);
+
+	return 0;
+
+cant_allocate:
+snapid_doesnt_exist:
+origin_not_loaded:
+	dm_put_device(ti, origin);
+	mutex_unlock(&all_multisnapshots_lock);
+bad_origin:
+bad_arguments:
+	return r;
+}
+
+/* Destructor of a snapshot target: unlinks the snapshot from the list it
+ * was added to in the constructor and frees it, all under the global mutex. */
+static void multisnap_snap_dtr(struct dm_target *ti)
+{
+	struct dm_multisnap_snap *snap = ti->private;
+
+	mutex_lock(&all_multisnapshots_lock);
+	list_del(&snap->list_snaps);
+	kfree(snap);
+	mutex_unlock(&all_multisnapshots_lock);
+}
+
+/*
+ * Each snapshot I/O is counted in n_tracked_ios in the origin and
+ * has 'struct dm_multisnap_tracked_chunk' allocated.
+ * dm_multisnap_tracked_chunk->node can be optionally linked into
+ * origin's hash of tracked I/Os.
+ */
+static int multisnap_snap_map(struct dm_target *ti, struct bio *bio,
+			      union map_info *map_context)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+	struct dm_multisnap *s;
+	struct dm_multisnap_tracked_chunk *c;
+
+	bio_put_snapid(bio, sn->snapid);
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	s = sn->s;
+	if (unlikely(!s)) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		return -EIO;	/* detached: origin unloaded or snapshot deleted */
+	}
+	/*
+	 * Count the I/O first: the origin destructor waits for n_tracked_ios to
+	 * reach zero, so the origin can't be unloaded if the lock is dropped below
+	 */
+	s->n_tracked_ios++;
+
+	c = mempool_alloc(s->tracked_chunk_pool, GFP_ATOMIC);
+	if (unlikely(!c)) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		c = mempool_alloc(s->tracked_chunk_pool, GFP_NOIO);	/* retried with the spinlock dropped */
+		spin_lock_irq(&dm_multisnap_bio_list_lock);
+	}
+	c->s = s;
+	c->chunk = sector_to_chunk(s, bio->bi_sector);
+	c->bio_rw = bio_rw(bio);
+	INIT_HLIST_NODE(&c->node);
+	map_context->ptr = c;	/* released again in multisnap_snap_end_io */
+
+	if (unlikely(bio_empty_barrier(bio))) {
+		bio->bi_bdev = s->snapshot->bdev;	/* empty barriers bypass the queue */
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		return DM_MAPIO_REMAPPED;
+	}
+
+	dm_multisnap_enqueue_bio_unlocked(s, bio);
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	wakeup_kmultisnapd(s);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/* end_io method of a snapshot target: drops the I/O count and frees the
+ * tracking record that multisnap_snap_map stored in map_context->ptr. */
+static int multisnap_snap_end_io(struct dm_target *ti, struct bio *bio,
+				 int error, union map_info *map_context)
+{
+	struct dm_multisnap_tracked_chunk *tracked = map_context->ptr;
+	struct dm_multisnap *origin = tracked->s;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&dm_multisnap_bio_list_lock, irq_flags);
+	origin->n_tracked_ios--;
+	if (!hlist_unhashed(&tracked->node))
+		hlist_del(&tracked->node);
+	mempool_free(tracked, origin->tracked_chunk_pool);
+	spin_unlock_irqrestore(&dm_multisnap_bio_list_lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Status method of a snapshot target.  INFO yields an empty string, TABLE
+ * yields "<origin name> <snapid string>" as saved by the constructor.
+ */
+static int multisnap_snap_status(struct dm_target *ti, status_type_t type,
+				 char *result, unsigned maxlen)
+{
+	struct dm_multisnap_snap *snap = ti->private;
+
+	if (type == STATUSTYPE_INFO) {
+		/* there is no status */
+		result[0] = 0;
+		dm_multisnap_adjust_string(&result, &maxlen);
+	} else if (type == STATUSTYPE_TABLE) {
+		snprintf(result, maxlen, "%s %s",
+			 snap->origin_name, snap->snapid_string);
+		dm_multisnap_adjust_string(&result, &maxlen);
+	}
+
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+static struct target_type multisnap_origin_target = {
+	.name		 = "multisnapshot",
+	.version	 = {1, 0, 0},
+	.module		 = THIS_MODULE,
+	.ctr		 = multisnap_origin_ctr,
+	.dtr		 = multisnap_origin_dtr,
+	.map		 = multisnap_origin_map,
+	.postsuspend	 = multisnap_origin_postsuspend,
+	.message	 = multisnap_origin_message,
+	.status		 = multisnap_origin_status,
+	.iterate_devices = multisnap_origin_iterate_devices,
+};
+
+static struct target_type multisnap_snap_target = {
+	.name		 = "multisnap-snap",
+	.version	 = {1, 0, 0},
+	.module		 = THIS_MODULE,
+	.ctr		 = multisnap_snap_ctr,
+	.dtr		 = multisnap_snap_dtr,
+	.map		 = multisnap_snap_map,
+	.end_io		 = multisnap_snap_end_io,
+	.status		 = multisnap_snap_status,
+	.iterate_devices = multisnap_snap_iterate_devices,
+};
+
+/* Module init: create both slab caches, then register the two targets. */
+static int __init dm_multisnapshot_init(void)
+{
+	int err = -ENOMEM;	/* result of either cache allocation failing below */
+
+	pending_exception_cache =
+		kmem_cache_create("dm_multisnap_pending_exception",
+				  sizeof(struct dm_multisnap_pending_exception),
+				  __alignof__(struct dm_multisnap_pending_exception),
+				  0, pending_exception_ctor);
+	if (!pending_exception_cache) {
+		DMERR("Couldn't create exception cache.");
+		goto fail;
+	}
+
+	tracked_chunk_cache = KMEM_CACHE(dm_multisnap_tracked_chunk, 0);
+	if (!tracked_chunk_cache) {
+		DMERR("Couldn't create cache to track chunks in use.");
+		goto fail_free_pending_cache;
+	}
+
+	err = dm_register_target(&multisnap_origin_target);
+	if (err < 0) {
+		DMERR("multisnapshot target register failed %d", err);
+		goto fail_free_tracked_cache;
+	}
+
+	err = dm_register_target(&multisnap_snap_target);
+	if (err < 0) {
+		DMERR("multisnap-snap target register failed %d", err);
+		goto fail_unregister_origin;
+	}
+
+	return 0;
+
+fail_unregister_origin:
+	dm_unregister_target(&multisnap_origin_target);
+fail_free_tracked_cache:
+	kmem_cache_destroy(tracked_chunk_cache);
+fail_free_pending_cache:
+	kmem_cache_destroy(pending_exception_cache);
+fail:
+	return err;
+}
+
+static void __exit dm_multisnapshot_exit(void)
+{
+	dm_unregister_target(&multisnap_origin_target);	/* both registered in dm_multisnapshot_init */
+	dm_unregister_target(&multisnap_snap_target);
+	kmem_cache_destroy(tracked_chunk_cache);	/* both caches created in dm_multisnapshot_init */
+	kmem_cache_destroy(pending_exception_cache);
+}
+
+/* Module hooks */
+module_init(dm_multisnapshot_init);
+module_exit(dm_multisnapshot_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multisnapshot target");
+MODULE_AUTHOR("Mikulas Patocka");
+MODULE_LICENSE("GPL");
Index: linux-2.6.34-rc4-fast/drivers/md/dm-multisnap.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.34-rc4-fast/drivers/md/dm-multisnap.h	2010-04-13 16:28:30.000000000 +0200
@@ -0,0 +1,183 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_H
+#define DM_MULTISNAP_H
+
+/*
+ * This file defines the interface between generic driver (dm-multisnap.c)
+ * and exception store drivers.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+
+#define EFSERROR	EPERM
+
+#define DM_MSG_PREFIX	"multisnapshot"
+
+#define DM_SNAPID_T_ORIGIN	0xffffffffffffffffULL
+
+typedef sector_t chunk_t;
+typedef __u64 snapid_t;
+
+struct dm_multisnap;		/* private to dm-multisnap.c */
+struct dm_exception_store;	/* private to the exception store driver */
+
+struct dm_multisnap_background_work {
+	struct list_head list;	/* presumably links into dm_multisnap.background_works - confirm */
+	void (*work)(struct dm_exception_store *, struct dm_multisnap_background_work *);
+	int queued;		/* NOTE(review): looks like an "is currently queued" flag - confirm */
+};
+
+union chunk_descriptor {
+	__u64 bitmask;		/* NOTE(review): which view is valid is presumably store-specific - confirm */
+	struct {
+		snapid_t from;
+		snapid_t to;
+	} range;
+};
+
+struct dm_multisnap_exception_store {
+	struct list_head list;
+	struct module *module;
+	const char *name;
+
+	/* Returns < 0 on error; argc/argv are the store-specific table arguments */
+	int (*init_exception_store)(struct dm_multisnap *dm, struct dm_exception_store **s,
+				    unsigned argc, char **argv, char **error);
+
+	void (*exit_exception_store)(struct dm_exception_store *s);
+
+	void (*store_lock_acquired)(struct dm_exception_store *s, int flags);
+
+	/* These two can override the format of snapids in the table. May be NULL */
+	void (*print_snapid)(struct dm_exception_store *s, char *string,
+			     unsigned maxlen, snapid_t snapid);
+	int (*read_snapid)(struct dm_exception_store *s, char *string,
+			   snapid_t *snapid, char **error);
+
+	/* Print the store-specific table arguments. May be NULL (" 0" is printed then) */
+	void (*status_table)(struct dm_exception_store *s, char *result, unsigned maxlen);
+
+	/* Report the space usage, counted in chunks. May be NULL (zeros are reported) */
+	void (*get_space)(struct dm_exception_store *s, unsigned long long *chunks_total,
+			  unsigned long long *chunks_allocated,
+			  unsigned long long *chunks_metadata_allocated);
+
+	/* Returns < 0 on error */
+	int (*allocate_snapid)(struct dm_exception_store *s, snapid_t *snapid,
+			       int snap_of_snap, snapid_t master);
+
+	/* Returns < 0 on error */
+	int (*create_snapshot)(struct dm_exception_store *s, snapid_t snapid);
+
+	/* Returns < 0 on error (may be NULL if not supported) */
+	int (*delete_snapshot)(struct dm_exception_store *s, snapid_t snapid);
+
+	/*
+	 * Get the first snapid at or after snapid in its argument.
+	 * If there are no more snapids, return DM_SNAPID_T_ORIGIN.
+	 */
+	snapid_t (*get_next_snapid)(struct dm_exception_store *s, snapid_t snapid);
+
+	/*
+	 * Comparison function passed to sort() to order snapshots for creation.
+	 * May be NULL if the standard (ascending snapid) ordering should be used.
+	 */
+	int (*compare_snapids_for_create)(const void *p1, const void *p2);
+
+	/* 0 - not found, 1 - found (read-only), 2 - found (writeable), < 0 - error */
+	int (*find_snapshot_chunk)(struct dm_exception_store *s, snapid_t snapid,
+				   chunk_t chunk, int write, chunk_t *result);
+
+	/*
+	 * Chunk interface between exception store and generic code.
+	 * Allowed sequences:
+	 *
+	 * - first call reset_query
+	 *   then repeatedly query next exception to make with query_next_remap
+	 *   and add it to btree with add_next_remap. This can be repeated until
+	 *   query_next_remap indicates that it has nothing more or until all 8
+	 *   kcopyd slots are filled.
+	 *
+	 * - call find_snapshot_chunk, if it returns 0, you can call
+	 *   add_next_remap to add the chunk to the btree.
+	 *
+	 * - call find_snapshot_chunk, if it returns 1 (shared chunk), call
+	 *   make_chunk_writeable to relocate that chunk.
+	 */
+
+	void (*reset_query)(struct dm_exception_store *s);
+	int (*query_next_remap)(struct dm_exception_store *s, chunk_t chunk);
+	void (*add_next_remap)(struct dm_exception_store *s,
+			       union chunk_descriptor *cd, chunk_t *new_chunk);
+
+	/* may be NULL if writeable snapshots are not supported */
+	void (*make_chunk_writeable)(struct dm_exception_store *s,
+				     union chunk_descriptor *cd, chunk_t *new_chunk);
+	int (*check_conflict)(struct dm_exception_store *s,
+			      union chunk_descriptor *cd, snapid_t snapid);
+
+	/* This is called without the lock, prior to commit */
+	void (*prepare_for_commit)(struct dm_exception_store *s);
+
+	/* Commit the transactions */
+	void (*commit)(struct dm_exception_store *s);
+};
+
+#define DM_MULTISNAP_SET_ERROR(dm, err, msg)				\
+do {									\
+	DMERR msg;							\
+	dm_multisnap_set_error(dm, err);				\
+} while (0)
+
+/* dm-multisnap.c */
+
+/* Access generic information about the snapshot */
+struct block_device *dm_multisnap_snapshot_bdev(struct dm_multisnap *s);
+unsigned dm_multisnap_chunk_size(struct dm_multisnap *s);
+void dm_multisnap_set_error(struct dm_multisnap *s, int error);
+int dm_multisnap_has_error(struct dm_multisnap *s);
+int dm_multisnap_drop_on_error(struct dm_multisnap *s);
+int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid);
+
+/* Lock status/table queries */
+void dm_multisnap_status_lock(struct dm_multisnap *s);
+void dm_multisnap_status_unlock(struct dm_multisnap *s);
+void dm_multisnap_status_assert_locked(struct dm_multisnap *s);
+
+/*
+ * Commit. dm_multisnap_call_commit can only be called
+ * if dm_multisnap_can_commit returns true
+ */
+int dm_multisnap_can_commit(struct dm_multisnap *s);
+void dm_multisnap_call_commit(struct dm_multisnap *s);
+
+/* Delayed work for delete/merge */
+void dm_multisnap_queue_work(struct dm_multisnap *s,
+			     struct dm_multisnap_background_work *bw);
+void dm_multisnap_cancel_work(struct dm_multisnap *s,
+			      struct dm_multisnap_background_work *bw);
+
+/* Parsing command line */
+int dm_multisnap_get_string(char ***argv, unsigned *argc,
+			    char **string, char **error);
+int dm_multisnap_get_uint64(char ***argv, unsigned *argc,
+			    __u64 *unsigned_int64, char **error);
+int dm_multisnap_get_uint(char ***argv, unsigned *argc,
+			  unsigned *unsigned_int, char **error);
+int dm_multisnap_get_argcount(char ***argv, unsigned *argc,
+			      unsigned *unsigned_int, char **error);
+void dm_multisnap_adjust_string(char **result, unsigned *maxlen);
+
+/* Register/unregister the exception store driver */
+int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store);
+void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store);
+
+#endif
Index: linux-2.6.34-rc4-fast/drivers/md/dm-multisnap-private.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.34-rc4-fast/drivers/md/dm-multisnap-private.h	2010-04-14 13:36:32.000000000 +0200
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_PRIVATE_H
+#define DM_MULTISNAP_PRIVATE_H
+
+#include "dm-multisnap.h"
+
+/*
+ * Private structures for dm-multisnap.c.
+ * This file should not be included by exception store drivers.
+ * Changes to this file do not change ABI.
+ */
+
+#include <linux/dm-kcopyd.h>
+
+#define DM_MULTISNAP_MAX_REMAPS		256
+/* Number of pages in 1MiB; falls back to 1 if that computes to zero */
+#define DM_MULTISNAP_KCOPYD_PAGES	(((1UL << 20) >> PAGE_SHIFT) ? : 1)
+
+#define DM_MULTISNAP_MAX_CHUNKS_TO_REMAP DM_KCOPYD_MAX_REGIONS
+/* The hash macros mask with (size - 1), so hash sizes must be powers of two */
+#define DM_PENDING_HASH_SIZE		256
+#define DM_PENDING_HASH(c)		((c) & (DM_PENDING_HASH_SIZE - 1))
+#define DM_PENDING_MEMPOOL_SIZE		256
+
+#define DM_TRACKED_CHUNK_HASH_SIZE	16
+#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+#define DM_TRACKED_CHUNK_POOL_SIZE	256
+
+struct dm_multisnap_bio_queue {
+	struct bio_list bios;	/* the bios held in this queue */
+};
+
+#define DM_MULTISNAP_N_QUEUES	2	/* size of dm_multisnap->queue[] */
+
+struct dm_multisnap {
+	struct dm_exception_store *p;
+	struct dm_multisnap_exception_store *store;
+
+	struct dm_dev *origin;
+	struct dm_dev *snapshot;
+
+	int error;
+
+	unsigned chunk_size;
+	unsigned char chunk_shift;
+
+	unsigned char flags;	/* DM_MULTISNAP_* */
+
+	sector_t origin_sectors;
+
+	struct mutex master_lock;
+	struct mutex status_lock;
+	struct workqueue_struct *wq;
+	struct work_struct work;
+
+	/* Queues are protected with dm_multisnap_bio_list_lock */
+	struct dm_multisnap_bio_queue queue[DM_MULTISNAP_N_QUEUES];
+	unsigned current_queue;	/* presumably an index into queue[] */
+
+	struct list_head background_works;
+
+	/* All snapshot IOs */
+	mempool_t *tracked_chunk_pool;
+
+	/* These two are protected with dm_multisnap_bio_list_lock */
+	long n_tracked_ios;
+	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+
+	mempool_t *pending_pool;
+
+	struct dm_kcopyd_client *kcopyd;
+
+	/*
+	 * The following two variables do a trick to avoid the need for
+	 * atomic operations.
+	 *
+	 * kcopyd_jobs_submitted_count is incremented each time a job is
+	 * submitted to kcopyd. master_lock protects it.
+	 *
+	 * kcopyd_jobs_finished_count is incremented each time a kcopyd
+	 * callback is called. The callback is single-threaded, so it needs
+	 * no protection.
+	 *
+	 * Both kcopyd_jobs_submitted_count and kcopyd_jobs_finished_count
+	 * can be updated simultaneously. But neither of these variables is
+	 * updated multiple times concurrently.
+	 *
+	 * When these two are equal, there are no jobs in flight. When they
+	 * are equal and master_lock is held, we know that there are no jobs
+	 * in flight and no new ones can be submitted --- i.e. we can commit.
+	 */
+	unsigned long kcopyd_jobs_submitted_count;
+	unsigned long kcopyd_jobs_finished_count;
+
+	/* The value of the counter on last commit */
+	unsigned long kcopyd_jobs_last_commit_count;
+
+	/* This may only be accessed from kcopyd callback, it has no locking */
+	struct list_head pes_waiting_for_commit;
+
+	/* Increased each time a commit happens */
+	unsigned commit_sequence;
+
+	/* List head for struct dm_multisnap_pending_exception->hash_list */
+	struct hlist_head pending_hash[DM_PENDING_HASH_SIZE];
+
+	char pending_mempool_allocation_failed;
+
+	/* The new snapshot id to be created */
+	char new_snapid_valid;
+	snapid_t new_snapid;
+
+	/* List head for struct dm_multisnap_snap->list_snaps */
+	struct list_head all_snaps;
+
+	/* List entry for all_multisnapshots */
+	struct list_head list_all;
+};
+
+/* struct dm_multisnap->flags */
+#define DM_MULTISNAP_SYNC_SNAPSHOTS		1
+#define DM_MULTISNAP_PRESERVE_ON_ERROR		2
+
+struct dm_multisnap_snap {
+	struct dm_multisnap *s;
+	snapid_t snapid;
+	/* List entry for struct dm_multisnap->all_snaps */
+	struct list_head list_snaps;
+	char origin_name[16];
+	char snapid_string[1];
+};
+
+struct dm_multisnap_tracked_chunk {
+	struct hlist_node node;	/* presumably on dm_multisnap->tracked_chunk_hash */
+	chunk_t chunk;
+	unsigned long bio_rw;
+	struct dm_multisnap *s;	/* the multisnapshot this entry refers to */
+};
+
+struct dm_multisnap_pending_exception {
+	/* List entry for struct dm_multisnap->pending_hash */
+	struct hlist_node hash_list;
+
+	struct dm_multisnap *s;
+	struct bio_list bios;
+
+	chunk_t chunk;
+
+	int n_descs;	/* presumably the number of used entries in desc[] */
+	union chunk_descriptor desc[DM_MULTISNAP_MAX_CHUNKS_TO_REMAP];
+
+	/* List entry for struct dm_multisnap->pes_waiting_for_commit */
+	struct list_head list;
+};
+
+#endif
Index: linux-2.6.34-rc4-fast/Documentation/device-mapper/dm-multisnapshot.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.34-rc4-fast/Documentation/device-mapper/dm-multisnapshot.txt	2010-04-13 16:28:30.000000000 +0200
@@ -0,0 +1,153 @@
+From snitzer@redhat.com Tue Mar 30 12:27:02 2010
+Date: Tue, 30 Mar 2010 12:26:21 -0400
+From: Mike Snitzer <snitzer@redhat.com>
+To: Mikulas Patocka <mpatocka@redhat.com>
+Cc: device-mapper development <dm-devel@redhat.com>, Alasdair G Kergon <agk@redhat.com>
+Subject: Re: Shared snapshots
+
+On Wed, Dec 16 2009 at  3:39pm -0500,
+Mike Snitzer <snitzer@redhat.com> wrote:
+
+> As an aside, I have some ideas for improving
+> Documentation/device-mapper/dm-multisnapshot.txt
+> I'll just send a patch and we can go from there.
+
+OK, here is the updated dm-multisnapshot.txt (finally):
+
+Device-mapper multiple snapshot support
+=======================================
+
+Device-mapper allows a single copy-on-write (COW) block device to be
+shared among multiple snapshots of an origin device.  This variant of dm
+snapshot is ideal for supporting high numbers of snapshots.
+
+There is a single dm target for the origin device:
+multisnapshot
+
+and associated shared COW storage modules:
+mikulas - supports 2^32 snapshots and 2^32 snapshots of snapshots with
+	  full consistency across crashes via journaling
+daniel  - only supports 64 snapshots and does not provide consistency
+          through journaling
+
+The snapshots within the shared COW use a single dm target:
+multisnap-snap
+
+*) multisnapshot <origin> <COW device> <chunksize>
+   <# generic args> <generic args> <shared COW store type>
+   <# shared COW store args> <shared COW store args>
+   [<# snapshot ids> <snapshot ids>]
+
+Table line arguments:
+- <origin> : origin device
+- <COW device> : shared COW store device
+- <chunksize> : chunk size in 512b sectors
+- <# generic args> : number of generic arguments
+- <generic args> : generic arguments
+	sync-snapshots --- synchronize snapshots according to the list
+	preserve-on-error --- halt the origin on error in the snapshot store
+- <shared COW store type> : shared COW store type
+	mikulas --- provided by the 'dm-store-mikulas' module
+	daniel --- provided by the 'dm-store-daniel' module
+- <# shared COW store args> : number of arguments for shared COW store type
+- <shared COW store args> : shared COW store arguments
+If 'sync-snapshots' was specified:
+- <# snapshot ids> : number of snapshot ids
+- <snapshot ids> : snapshot ids in desired sync order
+
+
+*) multisnap-snap <origin> <snapshot id>
+
+Table line arguments:
+- <origin> : origin device
+- <snapshot id> : id of the snapshot within the shared store
+
+
+Status output:
+*) multisnapshot <# output args> <errno> <new snapid>
+   <total_sectors> <sectors_allocated> <metadata_sectors>
+   <# snapshot ids> <snapshot ids>
+
+Status line output arguments:
+- <# output args> : number of output arguments before the
+                   snapshot id list
+- <errno> : error number associated with the first error that occurred in
+            the store (e.g. -EIO), 0 means the store is active with no errors
+- <new snapid> : snapshot id that will be used for next snapshot, '-' if
+                 no snapshot is in the process of being created
+- <total_sectors> : total size of the shared store in 512b sectors
+- <sectors_allocated> : number of sectors allocated for data and metadata
+- <metadata_sectors> : number of sectors allocated for metadata
+- <# snapshot ids> : number of snapshot ids
+- <snapshot ids> : snapshot ids for snapshots in the store
+
+
+Other tunables:
+*) multisnapshot (when using 'mikulas' store)
+The size of the metadata cache associated with the 'mikulas' shared COW
+store defaults to 2% of system memory or 25% of vmalloc memory
+(whichever is lower).  The size of the metadata cache may be overridden using
+the 'dm_bufio_cache_size' module parameter when loading the
+'dm-store-mikulas' module.  Alternatively, the size may be changed or
+queried after the module is loaded via sysfs:
+/sys/module/dm_store_mikulas/parameters/dm_bufio_cache_size
+
+
+DM messages:
+*) multisnapshot
+   - create : creates next new snapshot id, reports created id through 'status'
+     (the snapshot is created once the multisnapshot is suspended)
+   - create_subsnap <snapshot id> : create subsnapshot of specified snapshot
+   - delete <snapshot id> : delete the specified snapshot
+
+
+Usage
+=====
+*) Create two logical volumes, one for origin and one for snapshots.
+(The following examples assume /dev/sda for origin and /dev/sdb for snapshot)
+
+*) Clear the first 4096 bytes (eight 512-byte sectors) of the snapshot volume:
+dd if=/dev/zero of=/dev/sdb bs=4096 count=1
+(Otherwise the multisnapshot target's constructor will fail)
+
+*) Load the shared snapshot driver:
+ORIGIN_BDEV_SIZE=`blockdev --getsize /dev/sda`
+echo 0 $ORIGIN_BDEV_SIZE multisnapshot /dev/sda /dev/sdb 16 0 mikulas 0 | dmsetup create ms
+('16' is the chunk size in 512-byte sectors. The chunk size may range
+from 1 to 1024 512-byte sectors via lvm. DM's maximum chunk size is only
+limited by 32-bit integer size and available memory)
+
+This creates the multisnapshot device on /dev/mapper/ms. If the COW
+store was zeroed, it creates a new structure; otherwise it loads the
+existing structure.
+
+Once this is done, you should no longer access /dev/sda and
+/dev/sdb and only use /dev/mapper/ms.
+
+*) Create new snapshot:
+('0' in the following dmsetup message commands means sector arg isn't needed)
+dmsetup message /dev/mapper/ms 0 create
+	If you want to create snapshot-of-snapshot, use:
+	dmsetup message /dev/mapper/ms 0 create_subsnap <snapID>
+dmsetup status /dev/mapper/ms
+	(this will display the newly created snapshot ID)
+dmsetup suspend /dev/mapper/ms
+dmsetup resume /dev/mapper/ms
+
+*) Attach the snapshot:
+echo 0 $ORIGIN_BDEV_SIZE multisnap-snap /dev/sda <snapID> | dmsetup create ms0
+This attaches the snapshot with <snapID> to /dev/mapper/ms0
+
+*) Delete the snapshot:
+dmsetup message /dev/mapper/ms 0 delete <snapID>
+
+*) See shared store's status:
+dmsetup status /dev/mapper/ms
+(multisnapshot target's status output is documented above)
+
+*) Unload it:
+dmsetup remove ms
+dmsetup remove ms0
+... etc. (note, once you unload the origin, the snapshots become inaccessible
+- the devices exist but they return -EIO when accessed)
+