drivers/md/Kconfig | 25 drivers/md/Makefile | 4 drivers/md/dm-cache-metadata.c | 146 ++++ drivers/md/dm-cache-metadata.h | 21 drivers/md/dm-cache-policy-cleaner.c | 2 drivers/md/dm-cache-policy-hints.c | 772 ++++++++++++++++++++++++++ drivers/md/dm-cache-policy-internal.h | 13 drivers/md/dm-cache-policy-mq-era.c | 546 ++++++++++++++++++ drivers/md/dm-cache-policy-mq.c | 44 - drivers/md/dm-cache-policy.c | 17 drivers/md/dm-cache-policy.h | 20 drivers/md/dm-cache-target.c | 281 ++++++++- drivers/md/persistent-data/dm-block-manager.h | 5 13 files changed, 1796 insertions(+), 100 deletions(-) Index: linux/drivers/md/Kconfig =================================================================== --- linux.orig/drivers/md/Kconfig +++ linux/drivers/md/Kconfig @@ -282,6 +282,24 @@ config DM_CACHE_MQ This is meant to be a general purpose policy. It prioritises reads over writes. +config DM_CACHE_MQ_ERA + tristate "MQ-ERA Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE_MQ + default y + ---help--- + A wrapper for the MQ policy that adds an "era" property to + the per-cache-block metadata, to facilitate the implementation + of cache coherency validation and recovery tools. This mechanism + works as follows. There is a monotonically increasing 32-bit era + counter associated with each cache instance. Each cache block is + tagged with the era during which it was last written. A device + mapper message interface is provided to obtain the current era, + advance to the next era, and invalidate blocks from before or + after a given era. NOTE: this policy will provide the same + performance benefits as MQ but requires significantly more memory + to support the era mechanism. If you do not need era support, + use MQ rather than MQ-ERA. + config DM_CACHE_CLEANER tristate "Cleaner Cache Policy (EXPERIMENTAL)" depends on DM_CACHE @@ -290,6 +308,13 @@ config DM_CACHE_CLEANER A simple cache policy that writes back all data to the origin. Used when decommissioning a dm-cache. 
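For illustration only: the "device mapper message interface" mentioned in the MQ-ERA help text is the ordinary dm message path, which the cache target hands through to the policy's set_config_value as a key/value pair. A minimal userspace sketch using libdevmapper follows; the device name and era values are made-up examples, while the message keys are the ones mq-era registers later in this patch (increment_era_counter, unmap_blocks_from_earlier_eras, ...). The equivalent command line would be "dmsetup message <cache-dev> 0 <key> <value>".

#include <libdevmapper.h>

/*
 * Illustrative sketch: send one two-word policy message to a dm-cache
 * device.  Device name and era value below are hypothetical.
 */
static int send_cache_policy_message(const char *dm_name, const char *msg)
{
	struct dm_task *dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
	int r = -1;

	if (!dmt)
		return r;

	if (dm_task_set_name(dmt, dm_name) &&
	    dm_task_set_sector(dmt, 0) &&
	    dm_task_set_message(dmt, msg) &&
	    dm_task_run(dmt))
		r = 0;

	dm_task_destroy(dmt);
	return r;
}

/*
 * e.g. advance from era 5 to era 6, then drop blocks written before era 6:
 *   send_cache_policy_message("my-cache", "increment_era_counter 5");
 *   send_cache_policy_message("my-cache", "unmap_blocks_from_earlier_eras 6");
 */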
+config DM_CACHE_HINTS + tristate "Hint Size Test Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE + default y + ---help--- + A dumb cache policy just for the purpose to test variable hint size + config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM Index: linux/drivers/md/Makefile =================================================================== --- linux.orig/drivers/md/Makefile +++ linux/drivers/md/Makefile @@ -13,7 +13,9 @@ dm-log-userspace-y \ dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o dm-cache-mq-y += dm-cache-policy-mq.o +dm-cache-mq-era-y += dm-cache-policy-mq-era.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o +dm-cache-hints-y += dm-cache-policy-hints.o md-mod-y += md.o bitmap.o raid456-y += raid5.o @@ -51,6 +53,8 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o +obj-$(CONFIG_DM_CACHE_MQ_ERA) += dm-cache-mq-era.o +obj-$(CONFIG_DM_CACHE_HINTS) += dm-cache-hints.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o ifeq ($(CONFIG_DM_UEVENT),y) Index: linux/drivers/md/dm-cache-metadata.c =================================================================== --- linux.orig/drivers/md/dm-cache-metadata.c +++ linux/drivers/md/dm-cache-metadata.c @@ -113,6 +113,7 @@ struct dm_cache_metadata { char policy_name[CACHE_POLICY_NAME_SIZE]; unsigned policy_version[CACHE_POLICY_VERSION_SIZE]; size_t policy_hint_size; + void *policy_hint_value_buffer; struct dm_cache_statistics stats; }; @@ -198,7 +199,7 @@ static int superblock_lock(struct dm_cac /*----------------------------------------------------------------*/ -static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) { int r; unsigned i; @@ -214,10 +215,10 @@ static int __superblock_all_zeroes(struc return r; data_le = dm_block_data(b); - *result = 1; + *result = true; for (i = 0; i < sb_block_size; i++) { if (data_le[i] != zero) { - *result = 0; + *result = false; break; } } @@ -225,7 +226,7 @@ static int __superblock_all_zeroes(struc return dm_bm_unlock(b); } -static void __setup_mapping_info(struct dm_cache_metadata *cmd) +static int __setup_mapping_info(struct dm_cache_metadata *cmd) { struct dm_btree_value_type vt; @@ -237,11 +238,34 @@ static void __setup_mapping_info(struct dm_array_info_init(&cmd->info, cmd->tm, &vt); if (cmd->policy_hint_size) { - vt.size = sizeof(__le32); + if (cmd->policy_hint_size > DM_CACHE_POLICY_MAX_HINT_SIZE) { + DMERR("hint size is too large %d > %d", + (int) cmd->policy_hint_size, + (int) DM_CACHE_POLICY_MAX_HINT_SIZE); + return -EPERM; + } + + vt.size = cmd->policy_hint_size; dm_array_info_init(&cmd->hint_info, cmd->tm, &vt); - } + + cmd->policy_hint_value_buffer = kmalloc(cmd->policy_hint_size, GFP_KERNEL); + if (!cmd->policy_hint_value_buffer) { + DMERR("unable to allocate hint value buffer"); + return -ENOMEM; + } + } else + cmd->policy_hint_value_buffer = NULL; + + return 0; } +static void __teardown_mapping_info(struct dm_cache_metadata *cmd) +{ + if (cmd->policy_hint_value_buffer) + kfree(cmd->policy_hint_value_buffer); +} + + static int __write_initial_superblock(struct dm_cache_metadata *cmd) { int r; @@ -312,7 +336,9 @@ static int __format_metadata(struct dm_c return r; } - __setup_mapping_info(cmd); + r = __setup_mapping_info(cmd); + if (r < 0) + goto bad_mapping_info; r = dm_array_empty(&cmd->info, 
&cmd->root); if (r < 0) @@ -335,6 +361,8 @@ static int __format_metadata(struct dm_c return 0; bad: + __teardown_mapping_info(cmd); +bad_mapping_info: dm_tm_destroy(cmd->tm); dm_sm_destroy(cmd->metadata_sm); @@ -397,7 +425,10 @@ static int __open_metadata(struct dm_cac goto bad; } - __setup_mapping_info(cmd); + r = __setup_mapping_info(cmd); + if (r < 0) + goto bad; + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); sb_flags = le32_to_cpu(disk_super->flags); cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); @@ -411,7 +442,8 @@ bad: static int __open_or_format_metadata(struct dm_cache_metadata *cmd, bool format_device) { - int r, unformatted; + int r; + bool unformatted = false; r = __superblock_all_zeroes(cmd->bm, &unformatted); if (r) @@ -581,6 +613,7 @@ static int __commit_transaction(struct d disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]); disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]); + disk_super->policy_hint_size = cpu_to_le32(cmd->policy_hint_size); disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits); disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses); @@ -647,6 +680,7 @@ struct dm_cache_metadata *dm_cache_metad r = __create_persistent_data_objects(cmd, may_format_device); if (r) { + __teardown_mapping_info(cmd); kfree(cmd); return ERR_PTR(r); } @@ -663,22 +697,86 @@ struct dm_cache_metadata *dm_cache_metad void dm_cache_metadata_close(struct dm_cache_metadata *cmd) { __destroy_persistent_data_objects(cmd); + __teardown_mapping_info(cmd); kfree(cmd); } +/* + * Checks that the given cache block is either unmapped, or clean. + */ +static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) +{ + int r; + __le64 value; + dm_oblock_t ob; + unsigned flags; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); + if (r) { + DMERR("block_unmapped_or_clean failed"); + return r; + } + + unpack_value(value, &ob, &flags); + *result = !((flags & (1 << M_VALID)) && (flags & (1 << M_DIRTY))); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, + dm_cblock_t end, + bool *result) +{ + int r; + + while (begin != end) { + r = block_unmapped_or_clean(cmd, begin, result); + if (r) + return r; + + if (!*result) { + DMERR("cache block %llu is dirty", + (unsigned long long) from_cblock(begin)); + return 0; + } + + begin++; + } + + return 0; +} + int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { int r; + bool clean; __le64 null_mapping = pack_value(0, 0); down_write(&cmd->root_lock); __dm_bless_for_disk(&null_mapping); + + if (new_cache_size < cmd->cache_blocks) { + r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); + if (r) + goto out; + + if (!clean) { + DMERR("unable to shrink cache due to dirty blocks"); + r = -EINVAL; + goto out; + } + } + r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); if (!r) cmd->cache_blocks = new_cache_size; cmd->changed = true; + +out: up_write(&cmd->root_lock); return r; @@ -908,7 +1006,6 @@ static int __load_mapping(void *context, int r = 0; bool dirty; __le64 value; - __le32 hint_value = 0; dm_oblock_t oblock; unsigned flags; struct thunk *thunk = context; @@ -920,14 +1017,14 @@ static int __load_mapping(void *context, if (flags & M_VALID) { if (thunk->hints_valid) { r = 
dm_array_get_value(&cmd->hint_info, cmd->hint_root, - cblock, &hint_value); + cblock, cmd->policy_hint_value_buffer); if (r && r != -ENODATA) return r; } dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true; r = thunk->fn(thunk->context, oblock, to_cblock(cblock), - dirty, le32_to_cpu(hint_value), thunk->hints_valid); + dirty, cmd->policy_hint_value_buffer, thunk->hints_valid); } return r; @@ -1103,8 +1200,6 @@ int dm_cache_get_metadata_dev_size(struc static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) { int r; - __le32 value; - size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); const unsigned *policy_version = dm_cache_policy_get_version(policy); @@ -1113,6 +1208,8 @@ static int begin_hints(struct dm_cache_m return -EINVAL; if (!policy_unchanged(cmd, policy)) { + size_t hint_size; + strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); @@ -1131,11 +1228,11 @@ static int begin_hints(struct dm_cache_m if (r) return r; - value = cpu_to_le32(0); + memset(cmd->policy_hint_value_buffer, 0, hint_size); __dm_bless_for_disk(&value); r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0, from_cblock(cmd->cache_blocks), - &value, &cmd->hint_root); + cmd->policy_hint_value_buffer, &cmd->hint_root); if (r) return r; } @@ -1154,22 +1251,20 @@ int dm_cache_begin_hints(struct dm_cache return r; } -static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, - uint32_t hint) +static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, void *hint) + __dm_written_to_disk(hint) { int r; - __le32 value = cpu_to_le32(hint); - __dm_bless_for_disk(&value); r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, - from_cblock(cblock), &value, &cmd->hint_root); + from_cblock(cblock), hint, &cmd->hint_root); cmd->changed = true; return r; } -int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, - uint32_t hint) +int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, void *hint) + __dm_written_to_disk(hint) { int r; @@ -1182,3 +1277,8 @@ int dm_cache_save_hint(struct dm_cache_m return r; } + +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) +{ + return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); +} Index: linux/drivers/md/dm-cache-metadata.h =================================================================== --- linux.orig/drivers/md/dm-cache-metadata.h +++ linux/drivers/md/dm-cache-metadata.h @@ -87,7 +87,7 @@ int dm_cache_changed_this_transaction(st typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, - uint32_t hint, bool hint_valid); + void *hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy, load_mapping_fn fn, @@ -118,9 +118,10 @@ int dm_cache_get_metadata_dev_size(struc void dm_cache_dump(struct dm_cache_metadata *cmd); /* - * The policy is invited to save a 32bit hint value for every cblock (eg, - * for a hit count). These are stored against the policy name. If - * policies are changed, then hints will be lost. If the machine crashes, + * The policy is invited to save a hint (void* sequence of bytes) for every + * cblock (eg, for a hit count) and is reponsible to do endianess conversions. + * These are stored against the policy name. + * If policies are changed, then hints will be lost. If the machine crashes, * hints will be lost. 
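As a concrete illustration of the contract described in this comment: the struct layout and helper below are invented for the example; only policy_walk_fn, __dm_bless_for_disk() and cpu_to_le32() come from the code in this patch (mq_save_hints does the same for a single __le32). A policy with an 8-byte hint might hand it to the walk_mappings callback like this:

struct example_hint {
	__le32 hit_count;
	__le32 last_era;
};

/*
 * Sketch: convert an in-core hint to its on-disk (little-endian) form,
 * bless it for disk and pass it to the walk_mappings callback.  The
 * policy's registered hint_size would have to be
 * sizeof(struct example_hint).
 */
static int example_emit_hint(policy_walk_fn fn, void *context,
			     dm_cblock_t cblock, dm_oblock_t oblock,
			     uint32_t hit_count, uint32_t last_era)
{
	struct example_hint h = {
		.hit_count = cpu_to_le32(hit_count),
		.last_era  = cpu_to_le32(last_era),
	};

	__dm_bless_for_disk(&h);
	return fn(context, cblock, oblock, &h);
}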
* * The hints are indexed by the cblock, but many policies will not @@ -132,10 +133,18 @@ void dm_cache_dump(struct dm_cache_metad int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p); /* - * requests hints for every cblock and stores in the metadata device. + * Saves the hint for a given cblock in the metadata device. Policy + * modules must perform any endian conversions needed and bless the hints + * for disk. */ int dm_cache_save_hint(struct dm_cache_metadata *cmd, - dm_cblock_t cblock, uint32_t hint); + dm_cblock_t cblock, void *hint) + __dm_written_to_disk(hint); + +/* + * Query method. Are all the blocks in the cache clean? + */ +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); /*----------------------------------------------------------------*/ Index: linux/drivers/md/dm-cache-policy-cleaner.c =================================================================== --- linux.orig/drivers/md/dm-cache-policy-cleaner.c +++ linux/drivers/md/dm-cache-policy-cleaner.c @@ -274,7 +274,7 @@ static void add_cache_entry(struct polic static int wb_load_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + void *hint, bool hint_valid) { int r; struct policy *p = to_policy(pe); Index: linux/drivers/md/dm-cache-policy-hints.c =================================================================== --- /dev/null +++ linux/drivers/md/dm-cache-policy-hints.c @@ -0,0 +1,772 @@ +/* + * Copyright (C) 2013 Red Hat. All rights reserved. + * + * This file is released under the GPL. + * + * TESTING! NOT FOR PRODUCTION USE! + * + * "hints" policy to test variable hint size. + */ + +#include "dm.h" +#include "dm-cache-policy.h" +#include "dm-cache-policy-internal.h" + +#include +#include +#include + +#define DM_MSG_PREFIX "cache-policy-hints" + +/*----------------------------------------------------------------*/ + +static struct kmem_cache *hints_entry_cache; + +/*----------------------------------------------------------------*/ + +static unsigned next_power(unsigned n, unsigned min) +{ + return roundup_pow_of_two(max(n, min)); +} + +struct hash { + struct hlist_head *table; + dm_block_t hash_bits; + unsigned nr_buckets; +}; + +struct entry { + struct hlist_node hlist; + struct list_head list; + dm_oblock_t oblock; + dm_cblock_t cblock; +}; + +#define DEFAULT_HINT_SIZE DM_CACHE_POLICY_MAX_HINT_SIZE +struct policy { + struct dm_cache_policy policy; + struct mutex lock; + + sector_t origin_size, block_size; + + /* To optimize search in the allocation bitset */ + unsigned find_free_nr_words, find_free_last_word; + unsigned long *allocation_bitset; + + dm_cblock_t nr_cblocks_allocated; + dm_cblock_t cache_size; + + struct { + struct list_head free; /* Free cache entry list */ + struct list_head used; /* Used cache entry list */ + } queues; + + /* The cache hash */ + struct hash chash; + + void *hints_buffer; + unsigned hint_counter[4]; + + /* Flag to block (re)setting hint_size via the message interface */ + bool hint_size_set; +}; + +/*----------------------------------------------------------------------------*/ +/* Low-level queue function. */ +static struct entry *queue_pop(struct list_head *q) +{ + if (!list_empty(q)) { + struct list_head *elt = q->next; + + list_del(elt); + return list_entry(elt, struct entry, list); + } + + return NULL; +} +/*----------------------------------------------------------------------------*/ + +/* Allocate/free various resources. 
*/ +static int alloc_hash(struct hash *hash, unsigned elts) +{ + hash->nr_buckets = next_power(elts >> 4, 16); + hash->hash_bits = ffs(hash->nr_buckets) - 1; + hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); + + return hash->table ? 0 : -ENOMEM; +} + +static void free_hash(struct hash *hash) +{ + vfree(hash->table); +} + +/* Free/alloc basic cache entry structures. */ +static void __free_cache_entries(struct list_head *q) { + struct entry *e; + + while ((e = queue_pop(q))) + kmem_cache_free(hints_entry_cache, e); +} + +static void free_cache_entries(struct policy *p) +{ + __free_cache_entries(&p->queues.free); + __free_cache_entries(&p->queues.used); +} + +static int alloc_cache_blocks_with_hash(struct policy *p, unsigned cache_size) +{ + int r = -ENOMEM; + unsigned u = cache_size; + + p->nr_cblocks_allocated = to_cblock(0); + + while (u--) { + struct entry *e = kmem_cache_zalloc(hints_entry_cache, GFP_KERNEL); + + if (!e) + goto bad_cache_alloc; + + list_add(&e->list, &p->queues.free); + } + + /* Cache entries hash. */ + r = alloc_hash(&p->chash, cache_size); + if (r) + goto bad_cache_alloc; + + return 0; + +bad_cache_alloc: + free_cache_entries(p); + + return r; +} + +static void free_cache_blocks_and_hash(struct policy *p) +{ + free_hash(&p->chash); + free_cache_entries(p); +} + +static void alloc_cblock(struct policy *p, dm_cblock_t cblock) +{ + BUG_ON(from_cblock(cblock) >= from_cblock(p->cache_size)); + BUG_ON(test_bit(from_cblock(cblock), p->allocation_bitset)); + set_bit(from_cblock(cblock), p->allocation_bitset); +} + +static void free_cblock(struct policy *p, dm_cblock_t cblock) +{ + BUG_ON(from_cblock(cblock) >= from_cblock(p->cache_size)); + BUG_ON(!test_bit(from_cblock(cblock), p->allocation_bitset)); + clear_bit(from_cblock(cblock), p->allocation_bitset); +} + +/*----------------------------------------------------------------------------*/ +/* Low-level functions. */ +static struct policy *to_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct policy, policy); +} + +/*----------------------------------------------------------------*/ + +static unsigned bit_set_nr_words(unsigned long nr_cblocks) +{ + return dm_div_up(nr_cblocks, BITS_PER_LONG); +} + +static unsigned long *alloc_bitset(unsigned nr_cblocks) +{ + return vzalloc(sizeof(unsigned long) * bit_set_nr_words(nr_cblocks)); +} + +static void free_bitset(unsigned long *bits) +{ + vfree(bits); +} +/*----------------------------------------------------------------------------*/ + +/* Hash functions (lookup, insert, remove). */ +static struct entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) +{ + struct hash *hash = &p->chash; + unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); + struct entry *cur; + struct hlist_head *bucket = &hash->table[h]; + + hlist_for_each_entry(cur, bucket, hlist) { + if (cur->oblock == oblock) { + /* Move upfront bucket for faster access. */ + hlist_del(&cur->hlist); + hlist_add_head(&cur->hlist, bucket); + return cur; + } + } + + return NULL; +} + +static void insert_cache_hash_entry(struct policy *p, struct entry *e) +{ + unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); + + hlist_add_head(&e->hlist, &p->chash.table[h]); +} + +static void remove_cache_hash_entry(struct policy *p, struct entry *e) +{ + hlist_del(&e->hlist); +} + + +/*----------------------------------------------------------------------------*/ +/* + * This doesn't allocate the block. 
+ */ +static int __find_free_cblock(struct policy *p, unsigned begin, unsigned end, + dm_cblock_t *result, unsigned *last_word) +{ + int r = -ENOSPC; + unsigned w; + + for (w = begin; w < end; w++) { + /* + * ffz is undefined if no zero exists + */ + if (p->allocation_bitset[w] != ULONG_MAX) { + *last_word = w; + *result = to_cblock((w * BITS_PER_LONG) + ffz(p->allocation_bitset[w])); + if (from_cblock(*result) < from_cblock(p->cache_size)) + r = 0; + + break; + } + } + + return r; +} + +static int find_free_cblock(struct policy *p, dm_cblock_t *result) +{ + int r = __find_free_cblock(p, p->find_free_last_word, p->find_free_nr_words, result, &p->find_free_last_word); + + if (r == -ENOSPC && p->find_free_last_word) + r = __find_free_cblock(p, 0, p->find_free_last_word, result, &p->find_free_last_word); + + return r; +} + +static struct entry *alloc_cache_entry(struct policy *p) +{ + struct entry *e = queue_pop(&p->queues.free); + + if (e) { + BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); + p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); + } + + return e; +} + +static void alloc_cblock_and_insert_cache(struct policy *p, struct entry *e) +{ + alloc_cblock(p, e->cblock); + insert_cache_hash_entry(p, e); +} + +static void add_cache_entry(struct policy *p, struct entry *e) +{ + list_add_tail(&e->list, &p->queues.used); + alloc_cblock_and_insert_cache(p, e); +} + +static void remove_cache_entry(struct policy *p, struct entry *e) +{ + remove_cache_hash_entry(p, e); + free_cblock(p, e->cblock); +} + +static struct entry *evict_cache_entry(struct policy *p) +{ + struct entry *e = queue_pop(&p->queues.used); + + BUG_ON(!e); + remove_cache_entry(p, e); + + return e; +} + +static void get_cache_block(struct policy *p, dm_oblock_t oblock, struct bio *bio, + struct policy_result *result) +{ + struct entry *e = alloc_cache_entry(p); + + if (e) { + int r = find_free_cblock(p, &e->cblock); + + BUG_ON(r); + result->op = POLICY_NEW; + + } else { + e = evict_cache_entry(p); + result->old_oblock = e->oblock; + result->op = POLICY_REPLACE; + } + + result->cblock = e->cblock; + e->oblock = oblock; + add_cache_entry(p, e); +} + +static bool in_cache(struct policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + struct entry *e = lookup_cache_entry(p, oblock); + + if (!e) + return false; + + *cblock = e->cblock; + return true; +} + +/*----------------------------------------------------------------------------*/ + +/* Public interface (see dm-cache-policy.h */ +static int hints_map(struct dm_cache_policy *pe, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + int r = 0; + struct policy *p = to_policy(pe); + + result->op = POLICY_MISS; + + if (can_block) + mutex_lock(&p->lock); + + else if (!mutex_trylock(&p->lock)) + return -EWOULDBLOCK; + + + if (in_cache(p, oblock, &result->cblock)) + result->op = POLICY_HIT; + + else if (!can_migrate) + r = -EWOULDBLOCK; + + else + get_cache_block(p, oblock, bio, result); + + mutex_unlock(&p->lock); + + return r; +} + +static int hints_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + int r; + struct policy *p = to_policy(pe); + + if (!mutex_trylock(&p->lock)) + return -EWOULDBLOCK; + + if (!in_cache(p, oblock, cblock)) + r = -ENOENT; + + mutex_unlock(&p->lock); + + return r; +} + +static void hints_destroy(struct dm_cache_policy *pe) +{ + struct policy *p = to_policy(pe); + + 
free_bitset(p->allocation_bitset); + free_cache_blocks_and_hash(p); + kfree(p->hints_buffer); + kfree(p); +} + +/*----------------------------------------------------------------------------*/ + +/* Hints endianess conversions */ +#define __le8 uint8_t +struct hints_ptrs { + __le64 *le64_hints; + __le32 *le32_hints; + __le16 *le16_hints; + __le8 *le8_hints; + + uint64_t *u64_hints; + uint32_t *u32_hints; + uint16_t *u16_hints; + uint8_t *u8_hints; +}; + +typedef int (*hints_xfer_fn_t) (struct hints_ptrs*, unsigned, unsigned, bool); + +#define cpu_to_le8(x) (x) +#define le8_to_cpu(x) (x) + +#define HINTS_XFER(width) \ +static int hints_ ## width ## _xfer(struct hints_ptrs *p, unsigned idx, unsigned val, bool to_disk) \ +{ \ + if (to_disk) \ + p->le ## width ## _hints[idx] = cpu_to_le ## width(val); \ +\ + else { \ + p->u ## width ## _hints[idx] = le ## width ## _to_cpu(p->le ## width ## _hints[idx]); \ + if (p->u ## width ## _hints[idx] != val) { \ + DMERR_LIMIT("%s -- hint value %llu != %u", __func__, \ + (long long unsigned) p->u ## width ## _hints[idx], val); \ + return -EINVAL; \ + } \ + } \ +\ + return 0; \ +} + +HINTS_XFER(64) +HINTS_XFER(32) +HINTS_XFER(16) +HINTS_XFER(8) + +static void calc_hint_value_counters(struct policy *p) +{ + unsigned div, rest = dm_cache_policy_get_hint_size(&p->policy), u; + + for (u = 3, div = sizeof(uint64_t); rest; u--, div >>= 1) { + p->hint_counter[u] = rest / div; + rest -= p->hint_counter[u] * div; + } +} + +/* Macro to set hint ptr for width on LHS based on RHS width<<1 */ +#define PTR_INC(lhs, rhs, c) \ + inc = 2 * p->hint_counter[c]; \ + ptrs->le ## lhs ## _hints = (__le ## lhs *) ptrs->le ## rhs ## _hints + inc; \ + ptrs->u ## lhs ## _hints = (uint ## lhs ## _t *) ptrs->u ## rhs ## _hints + inc; + +static void set_hints_ptrs(struct policy *p, struct hints_ptrs *ptrs) +{ + unsigned inc; + + ptrs->le64_hints = p->hints_buffer; + ptrs->u64_hints = p->hints_buffer; + + PTR_INC(32, 64, 3) + PTR_INC(16, 32, 2) + PTR_INC( 8, 16, 1) +} + +static void __hints_xfer_disk(struct policy *p, bool to_disk) +{ + unsigned idx, u, val; + hints_xfer_fn_t hints_xfer_fns[] = { + hints_8_xfer, + hints_16_xfer, + hints_32_xfer, + hints_64_xfer + }; + + struct hints_ptrs hints_ptrs; + + if (!p->hint_size_set) { + calc_hint_value_counters(p); + p->hint_size_set = true; + } + + /* Must happen after calc_hint_value_counters()! */ + set_hints_ptrs(p, &hints_ptrs); + + val = 1; + u = ARRAY_SIZE(hints_xfer_fns); + while (u--) { + for (idx = 0; idx < p->hint_counter[u]; idx++) { + /* + * val only suitable because of 256 hint value limitation. + * + * An uint8_t maxes at 255, so we could theoretically + * test hint sizes up to 2023 bytes with this limitation. 
+ */ + if (hints_xfer_fns[u](&hints_ptrs, idx, val, to_disk)) + return; + + val++; + } + } + + return; +} + +static void hints_preset_and_to_disk(struct policy *p) +{ + __hints_xfer_disk(p, true); +} + +static void hints_from_disk_and_check(struct policy *p) +{ + __hints_xfer_disk(p, false); +} + +static int hints_load_mapping(struct dm_cache_policy *pe, + dm_oblock_t oblock, dm_cblock_t cblock, + void *hint, bool hint_valid) +{ + struct policy *p = to_policy(pe); + struct entry *e; + + e = alloc_cache_entry(p); + if (!e) + return -ENOMEM; + + e->cblock = cblock; + e->oblock = oblock; + + if (hint_valid) { + void *tmp = p->hints_buffer; + + p->hints_buffer = hint; + hints_from_disk_and_check(p); + p->hints_buffer = tmp; + } + + alloc_cblock_and_insert_cache(p, e); + + return 0; +} + +/* Walk mappings */ +static int hints_walk_mappings(struct dm_cache_policy *pe, policy_walk_fn fn, void *context) +{ + int r = 0; + struct policy *p = to_policy(pe); + struct entry *e; + + hints_preset_and_to_disk(p); + + mutex_lock(&p->lock); + + list_for_each_entry(e, &p->queues.used, list) { + r = fn(context, e->cblock, e->oblock, (void*) p->hints_buffer); + if (r) + break; + } + + mutex_unlock(&p->lock); + + return r; +} + +static struct entry *__hints_force_remove_mapping(struct policy *p, + dm_oblock_t oblock) +{ + struct entry *e = lookup_cache_entry(p, oblock); + + BUG_ON(!e); + + list_del(&e->list); + remove_cache_entry(p, e); + + return e; +} + +static void hints_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) +{ + struct policy *p = to_policy(pe); + struct entry *e; + + mutex_lock(&p->lock); + e = __hints_force_remove_mapping(p, oblock); + list_add_tail(&e->list, &p->queues.free); + + BUG_ON(!from_cblock(p->nr_cblocks_allocated)); + p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); + mutex_unlock(&p->lock); +} + +static void hints_force_mapping(struct dm_cache_policy *pe, + dm_oblock_t current_oblock, dm_oblock_t oblock) +{ + struct policy *p = to_policy(pe); + struct entry *e; + + mutex_lock(&p->lock); + + e = __hints_force_remove_mapping(p, current_oblock); + e->oblock = oblock; + add_cache_entry(p, e); + + mutex_unlock(&p->lock); +} + +static int hints_next_dirty_block(struct dm_cache_policy *pe, dm_oblock_t *oblock, dm_cblock_t *cblock) +{ + return -ENOENT; +} + +static dm_cblock_t hints_residency(struct dm_cache_policy *pe) +{ + /* FIXME: lock mutex, not sure we can block here. */ + return to_policy(pe)->nr_cblocks_allocated; +} + +static int hints_set_config_value(struct dm_cache_policy *pe, + const char *key, const char *value) +{ + if (!strcasecmp(key, "hint_size")) { + struct policy *p = to_policy(pe); + + if (p->hint_size_set) + return -EPERM; + + else { + unsigned tmp; + + if (kstrtou32(value, 10, &tmp)) + return -EINVAL; + + else { + int r = dm_cache_policy_set_hint_size(pe, tmp); + + if (!r) { + calc_hint_value_counters(p); + p->hint_size_set = true; + } + + return r; + } + } + } + + return -EINVAL; +} + +static int hints_emit_config_values(struct dm_cache_policy *pe, char *result, unsigned maxlen) +{ + ssize_t sz = 0; + + DMEMIT("hint_size %llu", (long long unsigned) dm_cache_policy_get_hint_size(pe)); + return 0; +} + +/* Init the policy plugin interface function pointers. 
*/ +static void init_policy_functions(struct policy *p) +{ + p->policy.destroy = hints_destroy; + p->policy.map = hints_map; + p->policy.lookup = hints_lookup; +#if 0 + p->policy.set_dirty = NULL; + p->policy.clear_dirty = NULL; +#endif + p->policy.load_mapping = hints_load_mapping; + p->policy.walk_mappings = hints_walk_mappings; + p->policy.remove_mapping = hints_remove_mapping; + p->policy.writeback_work = NULL; + p->policy.next_dirty_block = hints_next_dirty_block; + p->policy.force_mapping = hints_force_mapping; + p->policy.residency = hints_residency; + p->policy.tick = NULL; + p->policy.emit_config_values = hints_emit_config_values; + p->policy.set_config_value = hints_set_config_value; +} + +static struct dm_cache_policy *hints_policy_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t block_size) +{ + int r; + struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); + + if (!p) + return NULL; + + init_policy_functions(p); + + p->cache_size = cache_size; + p->find_free_nr_words = bit_set_nr_words(from_cblock(cache_size)); + p->find_free_last_word = 0; + p->block_size = block_size; + p->origin_size = origin_size; + mutex_init(&p->lock); + INIT_LIST_HEAD(&p->queues.free); + INIT_LIST_HEAD(&p->queues.used); + + /* Allocate cache entry structs and add them to free list. */ + r = alloc_cache_blocks_with_hash(p, from_cblock(cache_size)); + if (r) + goto bad_free_policy; + + /* Cache allocation bitset. */ + p->allocation_bitset = alloc_bitset(from_cblock(cache_size)); + if (!p->allocation_bitset) + goto bad_free_cache_blocks_and_hash; + + p->hints_buffer = kzalloc(DM_CACHE_POLICY_MAX_HINT_SIZE, GFP_KERNEL); + if (!p->hints_buffer) + goto bad_free_allocation_bitset; + + p->hint_size_set = false; + + return &p->policy; + +bad_free_allocation_bitset: + free_bitset(p->allocation_bitset); +bad_free_cache_blocks_and_hash: + free_cache_blocks_and_hash(p); +bad_free_policy: + kfree(p); + + return NULL; +} + +/*----------------------------------------------------------------------------*/ +static struct dm_cache_policy_type hints_policy_type = { + .name = "hints", + .version = {1, 0, 0}, + .hint_size = DEFAULT_HINT_SIZE, + .owner = THIS_MODULE, + .create = hints_policy_create +}; + +static int __init hints_init(void) +{ + int r = -ENOMEM; + + hints_entry_cache = kmem_cache_create("dm_hints_policy_cache_entry", + sizeof(struct entry), + __alignof__(struct entry), + 0, NULL); + if (hints_entry_cache) { + r = dm_cache_policy_register(&hints_policy_type); + if (r) + kmem_cache_destroy(hints_entry_cache); + + else { + DMINFO("version %u.%u.%u loaded", + hints_policy_type.version[0], + hints_policy_type.version[1], + hints_policy_type.version[2]); + } + } + + return r; +} + +static void __exit hints_exit(void) +{ + dm_cache_policy_unregister(&hints_policy_type); + kmem_cache_destroy(hints_entry_cache); +} + +module_init(hints_init); +module_exit(hints_exit); + +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("hint size test cache policy"); Index: linux/drivers/md/dm-cache-policy-internal.h =================================================================== --- linux.orig/drivers/md/dm-cache-policy-internal.h +++ linux/drivers/md/dm-cache-policy-internal.h @@ -41,7 +41,7 @@ static inline void policy_clear_dirty(st static inline int policy_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + void *hint, bool hint_valid) { return p->load_mapping(p, oblock, cblock, hint, hint_valid); } @@ -59,6 +59,13 @@ 
static inline int policy_writeback_work( return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT; } +static inline int policy_next_dirty_block(struct dm_cache_policy *p, + dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + return p->next_dirty_block ? p->next_dirty_block(p, oblock, cblock) : -ENOENT; +} + static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { return p->remove_mapping(p, oblock); @@ -87,7 +94,7 @@ static inline int policy_emit_config_val if (p->emit_config_values) return p->emit_config_values(p, result, maxlen); - DMEMIT("0"); + DMEMIT(" 0"); return 0; } @@ -119,6 +126,8 @@ const char *dm_cache_policy_get_name(str const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p); +#define DM_CACHE_POLICY_MAX_HINT_SIZE 256 /* Max 2023 for the policy hints test module to work */ +int dm_cache_policy_set_hint_size(struct dm_cache_policy *p, unsigned hint_size); size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p); /*----------------------------------------------------------------*/ Index: linux/drivers/md/dm-cache-policy-mq-era.c =================================================================== --- /dev/null +++ linux/drivers/md/dm-cache-policy-mq-era.c @@ -0,0 +1,546 @@ +/* + * Copyright 2013 NetApp, Inc. All Rights Reserved, contribution by + * Morgan Mears. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details + * + */ + +#include "dm-cache-policy.h" +#include "dm-cache-policy-internal.h" +#include "dm.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "cache-policy-mq-era" + +typedef uint32_t era_t; +#define MQ_ERA_MAX_ERA UINT_MAX + +struct mq_era_policy { + struct dm_cache_policy policy; + struct mutex lock; /* FIXME: spinlock? */ + struct dm_cache_policy *mq; + dm_cblock_t cache_size; + era_t *cb_to_era; + era_t era_counter; +}; + +/*----------------------------------------------------------------*/ + +static struct mq_era_policy *to_mq_era_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct mq_era_policy, policy); +} + +static int incr_era_counter(struct mq_era_policy *mq_era, const char *curr_era_counter_str) +{ + era_t curr_era_counter; + int r; + + /* + * If the era counter value provided by the user matches the current + * counter value while under lock, increment the counter (intention + * is to prevent races). Rollover problems are avoided by locking + * the counter at a maximum value (the application must take + * appropriate action on this error to preserve correction, but + * a properly behaved set of applications will never trigger it; + * the era counter is meant to increment less than once a second + * and is 32 bits. 
+ */ + + if (kstrtou32(curr_era_counter_str, 10, &curr_era_counter)) + return -EINVAL; + + mutex_lock(&mq_era->lock); + + if (mq_era->era_counter != curr_era_counter) + r = -ECANCELED; + else if (mq_era->era_counter >= MQ_ERA_MAX_ERA) + r = -EOVERFLOW; + else { + mq_era->era_counter++; + r = 0; + } + + mutex_unlock(&mq_era->lock); + + return r; +} + +struct nested_walk_ctx { + policy_walk_fn parent_fn; + void *parent_ctx; + struct mq_era_policy *mq_era; +}; + +static int nested_walk(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint) +{ + struct nested_walk_ctx *ctx = (struct nested_walk_ctx *)context; + + /* + * Inserted as a filter into walk_mappings so we can take additional + * actions in the shim. + */ + + DMDEBUG("calling parent walk_mappings function for cblock %u, " + "oblock %llu (era %u)", from_cblock(cblock), oblock, + ctx->mq_era->cb_to_era[from_cblock(cblock)]); + + /* + * XXX need to consolidate the hint being provided by our caller (mq) + * with the hint we want to preserve (era) once the hint size + * restriction goes away. + */ + + return (*ctx->parent_fn)(ctx->parent_ctx, cblock, oblock, + ctx->mq_era->cb_to_era[from_cblock(cblock)]); +} + +static int era_is_gt_value(era_t era, era_t value) +{ + return era > value; +} + +static int era_is_gte_value(era_t era, era_t value) +{ + return era >= value; +} + +static int era_is_lte_value(era_t era, era_t value) +{ + return era <= value; +} + +static int era_is_lt_value(era_t era, era_t value) +{ + return era < value; +} + +typedef int (*era_match_fn_t)(era_t, era_t); + +struct find_oblocks_ctx { + struct mq_era_policy *mq_era; + era_match_fn_t era_match_fn; + era_t test_era; + uint32_t matches; + uint32_t next_ob_idx; + dm_oblock_t *oblocks; +}; + +static int find_oblocks(void *context, dm_cblock_t cblock, + dm_oblock_t oblock, uint32_t hint) +{ + struct find_oblocks_ctx *ctx = (struct find_oblocks_ctx *)context; + era_t era; + + /* + * Assembles a list of oblocks that are currently in the cache and + * whose cblocks have eras that satisfy the given matching function + * (currently >, >=, <=, or <) + */ + + if (ctx->next_ob_idx >= ctx->matches) + return -EOVERFLOW; + + era = ctx->mq_era->cb_to_era[from_cblock(cblock)]; + if (ctx->era_match_fn(era, ctx->test_era)) { + DMDEBUG("cblock %u has era %u matching test_era %u; " + "recording oblock %llu at oblocks %u.", + from_cblock(cblock), era, ctx->test_era, + oblock, ctx->next_ob_idx); + ctx->oblocks[ctx->next_ob_idx++] = oblock; + ctx->mq_era->cb_to_era[from_cblock(cblock)] = 0; + } + + return 0; +} + +static int cond_unmap_by_era(struct mq_era_policy *mq_era, + const char *test_era_str, + era_match_fn_t era_match_fn) +{ + struct find_oblocks_ctx fo_ctx; + uint32_t cb_idx, matches, ob_idx, max_cb_idx; + era_t test_era; + int r; + + /* + * Unmap blocks with eras matching the given era, according to the + * given matching function. + */ + + if (kstrtou32(test_era_str, 10, &test_era)) + return -EINVAL; + + /* + * This is a little convoluted, but is not expected to be a common + * operation. 
+ */ + + mutex_lock(&mq_era->lock); + + /* While locked, count matches */ + max_cb_idx = from_cblock(mq_era->cache_size); + for (matches = 0, cb_idx = 0; cb_idx < max_cb_idx; cb_idx++) + if (era_match_fn(mq_era->cb_to_era[cb_idx], test_era)) + matches++; + + /* If there aren't any, we're done */ + if (matches == 0) { + r = 0; + goto out; + } + + /* Set up to find the origin block for each matching cache block */ + fo_ctx.mq_era = mq_era; + fo_ctx.era_match_fn = era_match_fn; + fo_ctx.test_era = test_era; + fo_ctx.matches = matches; + fo_ctx.next_ob_idx = 0; + fo_ctx.oblocks = kzalloc(sizeof(*fo_ctx.oblocks) * matches, GFP_KERNEL); + if (!fo_ctx.oblocks) { + r = -ENOMEM; + goto out; + } + + /* Go ahead and find the origins */ + r = mq_era->mq->walk_mappings(mq_era->mq, find_oblocks, &fo_ctx); + if (r) + goto free_and_out; + + /* Unmap each matching origin */ + for (ob_idx = 0; ob_idx < fo_ctx.next_ob_idx; ob_idx++) { + DMDEBUG("removing mapping for oblock %llu.", fo_ctx.oblocks[ob_idx]); + mq_era->mq->remove_mapping(mq_era->mq, fo_ctx.oblocks[ob_idx]); + } + +free_and_out: + kfree(fo_ctx.oblocks); +out: + mutex_unlock(&mq_era->lock); + return r; +} + +/* + * Public interface, via the policy struct. See dm-cache-policy.h for a + * description of these. + */ + +static void mq_era_destroy(struct dm_cache_policy *p) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + DMDEBUG("destroyed mq_era %p, mq %p.", mq_era, mq_era->mq); + mq_era->mq->destroy(mq_era->mq); + kfree(mq_era->cb_to_era); + kfree(mq_era); +} + +static int mq_era_map(struct dm_cache_policy *p, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + uint32_t cb_idx; + int r; + + result->op = POLICY_MISS; + + if (can_block) + mutex_lock(&mq_era->lock); + else if (!mutex_trylock(&mq_era->lock)) + return -EWOULDBLOCK; + + /* Check for a mapping */ + r = mq_era->mq->map(mq_era->mq, oblock, can_block, can_migrate, + discarded_oblock, bio, result); + + /* If we got a hit and this is a write, update the era for the block */ + if (!r && (bio_data_dir(bio) == WRITE) && (result->op == POLICY_HIT)) { + cb_idx = from_cblock(result->cblock); + BUG_ON(cb_idx >= from_cblock(mq_era->cache_size)); + /* XXX remove this */ + DMDEBUG("assigning era %u to cblock %u, oblock %llu due to write hit.", + mq_era->era_counter, result->cblock, oblock); + mq_era->cb_to_era[cb_idx] = mq_era->era_counter; + } + + mutex_unlock(&mq_era->lock); + + return r; +} + +static int mq_era_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, + dm_cblock_t *cblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + return mq_era->mq->lookup(mq_era->mq, oblock, cblock); +} + +static void mq_era_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + mq_era->mq->set_dirty(mq_era->mq, oblock); +} + +static void mq_era_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + mq_era->mq->clear_dirty(mq_era->mq, oblock); +} + +static int mq_era_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + uint32_t hint, bool hint_valid) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + int r; + + /* + * XXX need to consolidate the hint being provided by our caller (mq) + * with the hint we want to preserve (era) once the hint size + * restriction goes away. 
+ */ + + r = mq_era->mq->load_mapping(mq_era->mq, oblock, cblock, 0, 0); + if (!r && hint_valid && + (from_cblock(cblock) < from_cblock(mq_era->cache_size))) { + DMDEBUG("recovered era %u for cblock %u.", hint, cblock); + mq_era->cb_to_era[from_cblock(cblock)] = hint; + /* + * Make sure the era counter starts higher than the highest + * persisted era. + */ + if (hint >= mq_era->era_counter) { + mq_era->era_counter = hint; + if (mq_era->era_counter < MQ_ERA_MAX_ERA) + mq_era->era_counter++; + DMDEBUG("set era_counter to %u.", mq_era->era_counter); + } + } + + return r; +} + +static int mq_era_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, + void *context) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + struct nested_walk_ctx nested_walk_ctx = { + .parent_fn = fn, + .parent_ctx = context, + .mq_era = mq_era + }; + int r; + + /* XXX remove this */ + DMDEBUG("call to mq_era_walk_mappings"); + + mutex_lock(&mq_era->lock); + + r = mq_era->mq->walk_mappings(mq_era->mq, nested_walk, &nested_walk_ctx); + + mutex_unlock(&mq_era->lock); + + return r; +} + +static void mq_era_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + dm_cblock_t cblock; + + mutex_lock(&mq_era->lock); + + if (!mq_era->mq->lookup(mq_era->mq, oblock, &cblock)) { + DMDEBUG("zeroed era for cblock %u (oblock %llu) due to a call " + "to remove_mapping.", cblock, oblock); + mq_era->cb_to_era[from_cblock(cblock)] = 0; + } + + mq_era->mq->remove_mapping(mq_era->mq, oblock); + + mutex_unlock(&mq_era->lock); +} + +static int mq_era_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + return mq_era->mq->writeback_work(mq_era->mq, oblock, cblock); +} + +static void mq_era_force_mapping(struct dm_cache_policy *p, + dm_oblock_t current_oblock, + dm_oblock_t new_oblock) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + dm_cblock_t cblock; + + mutex_lock(&mq_era->lock); + + if (!mq_era->mq->lookup(mq_era->mq, current_oblock, &cblock)) { + DMDEBUG("assigning era %u to cblock %u, oblock %llu " + "(old_oblock %llu) due to force_mapping.", + mq_era->era_counter, cblock, new_oblock, + current_oblock); + mq_era->cb_to_era[from_cblock(cblock)] = mq_era->era_counter; + } + + mq_era->mq->force_mapping(mq_era->mq, current_oblock, new_oblock); + + mutex_unlock(&mq_era->lock); +} + +static dm_cblock_t mq_era_residency(struct dm_cache_policy *p) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + return mq_era->mq->residency(mq_era->mq); +} + +static void mq_era_tick(struct dm_cache_policy *p) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + mq_era->mq->tick(mq_era->mq); +} + +static int mq_era_set_config_value(struct dm_cache_policy *p, + const char *key, + const char *value) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + int r; + + if (!strcasecmp(key, "increment_era_counter")) + r = incr_era_counter(mq_era, value); + else if (!strcasecmp(key, "unmap_blocks_from_later_eras")) + r = cond_unmap_by_era(mq_era, value, era_is_gt_value); + else if (!strcasecmp(key, "unmap_blocks_from_this_era_and_later")) + r = cond_unmap_by_era(mq_era, value, era_is_gte_value); + else if (!strcasecmp(key, "unmap_blocks_from_this_era_and_earlier")) + r = cond_unmap_by_era(mq_era, value, era_is_lte_value); + else if (!strcasecmp(key, "unmap_blocks_from_earlier_eras")) + r = cond_unmap_by_era(mq_era, value, era_is_lt_value); + else + r = 
mq_era->mq->set_config_value(mq_era->mq, key, value); + + return r; +} + +static int mq_era_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned maxlen) +{ + struct mq_era_policy *mq_era = to_mq_era_policy(p); + ssize_t sz = 0; + DMEMIT("era_counter %u ", mq_era->era_counter); + return mq_era->mq->emit_config_values(mq_era->mq, result + sz, maxlen - sz); +} + +/* Init the policy plugin interface function pointers. */ +static void init_policy_functions(struct mq_era_policy *mq_era) +{ + mq_era->policy.destroy = mq_era_destroy; + mq_era->policy.map = mq_era_map; + mq_era->policy.lookup = mq_era_lookup; + mq_era->policy.set_dirty = mq_era_set_dirty; + mq_era->policy.clear_dirty = mq_era_clear_dirty; + mq_era->policy.load_mapping = mq_era_load_mapping; + mq_era->policy.walk_mappings = mq_era_walk_mappings; + mq_era->policy.remove_mapping = mq_era_remove_mapping; + mq_era->policy.writeback_work = mq_era_writeback_work; + mq_era->policy.force_mapping = mq_era_force_mapping; + mq_era->policy.residency = mq_era_residency; + mq_era->policy.tick = mq_era_tick; + mq_era->policy.emit_config_values = mq_era_emit_config_values; + mq_era->policy.set_config_value = mq_era_set_config_value; +} + +static struct dm_cache_policy *mq_era_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + struct mq_era_policy *mq_era = kzalloc(sizeof(*mq_era), GFP_KERNEL); + + if (!mq_era) + return NULL; + + init_policy_functions(mq_era); + mq_era->cache_size = cache_size; + mutex_init(&mq_era->lock); + + mq_era->cb_to_era = kzalloc(from_cblock(mq_era->cache_size) * + sizeof(*(mq_era->cb_to_era)), + GFP_KERNEL); + if (!mq_era->cb_to_era) + goto bad_alloc_cb_to_era; + mq_era->era_counter = 1; + + mq_era->mq = dm_cache_policy_create("mq", cache_size, origin_size, + cache_block_size); + if (!mq_era->mq) + goto bad_policy_create; + + DMDEBUG("created mq_era %p, mq %p.", mq_era, mq_era->mq); + + return &mq_era->policy; + +bad_policy_create: + kfree(mq_era->cb_to_era); +bad_alloc_cb_to_era: + kfree(mq_era); + + return NULL; +} + +/*----------------------------------------------------------------*/ + +static struct dm_cache_policy_type mq_era_policy_type = { + .name = "mq-era", + .version = {1, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_era_create +}; + +static int __init mq_era_init(void) +{ + int r; + + r = dm_cache_policy_register(&mq_era_policy_type); + if (!r) { + DMINFO("version %u.%u.%u loaded", + mq_era_policy_type.version[0], + mq_era_policy_type.version[1], + mq_era_policy_type.version[2]); + return 0; + } + + DMERR("register failed %d", r); + + dm_cache_policy_unregister(&mq_era_policy_type); + return -ENOMEM; +} + +static void __exit mq_era_exit(void) +{ + dm_cache_policy_unregister(&mq_era_policy_type); +} + +module_init(mq_era_init); +module_exit(mq_era_exit); + +MODULE_AUTHOR("Morgan Mears "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("mq-era cache policy"); Index: linux/drivers/md/dm-cache-policy-mq.c =================================================================== --- linux.orig/drivers/md/dm-cache-policy-mq.c +++ linux/drivers/md/dm-cache-policy-mq.c @@ -6,6 +6,7 @@ #include "dm-cache-policy.h" #include "dm.h" +#include "persistent-data/dm-btree.h" #include #include @@ -1030,7 +1031,7 @@ static void mq_clear_dirty(struct dm_cac static int mq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) + void *hint, bool hint_valid) { struct mq_policy *mq = to_mq_policy(p); struct 
entry *e; @@ -1043,38 +1044,45 @@ static int mq_load_mapping(struct dm_cac e->oblock = oblock; e->in_cache = true; e->dirty = true; /* this gets corrected in a minute */ - e->hit_count = hint_valid ? hint : 1; + e->hit_count = hint_valid ? le32_to_cpu(*((__le32 *) hint)) : 1; e->generation = mq->generation; push(mq, e); return 0; } +static int mq_save_hints(struct mq_policy *mq, struct queue *q, + policy_walk_fn fn, void *context) +{ + int r; + unsigned level; + struct entry *e; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, q->qs + level, list) { + __le32 value = cpu_to_le32(e->hit_count); + __dm_bless_for_disk(&value); + + r = fn(context, e->cblock, e->oblock, &value); + if (r) + return r; + } + + return 0; +} + static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, void *context) { struct mq_policy *mq = to_mq_policy(p); int r = 0; - struct entry *e; - unsigned level; mutex_lock(&mq->lock); - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache_clean.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache_dirty.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } + r = mq_save_hints(mq, &mq->cache_clean, fn, context); + if (!r) + r = mq_save_hints(mq, &mq->cache_dirty, fn, context); -out: mutex_unlock(&mq->lock); return r; Index: linux/drivers/md/dm-cache-policy.c =================================================================== --- linux.orig/drivers/md/dm-cache-policy.c +++ linux/drivers/md/dm-cache-policy.c @@ -81,8 +81,9 @@ int dm_cache_policy_register(struct dm_c int r; /* One size fits all for now */ - if (type->hint_size != 0 && type->hint_size != 4) { - DMWARN("hint size must be 0 or 4 but %llu supplied.", (unsigned long long) type->hint_size); + if (type->hint_size > DM_CACHE_POLICY_MAX_HINT_SIZE) { + DMWARN("hint size must be <= %llu but %llu supplied.", + (unsigned long long) DM_CACHE_POLICY_MAX_HINT_SIZE, (unsigned long long) type->hint_size); return -EINVAL; } @@ -166,4 +167,16 @@ size_t dm_cache_policy_get_hint_size(str } EXPORT_SYMBOL_GPL(dm_cache_policy_get_hint_size); +int dm_cache_policy_set_hint_size(struct dm_cache_policy *p, unsigned hint_size) +{ + struct dm_cache_policy_type *t = p->private; + + if (hint_size > DM_CACHE_POLICY_MAX_HINT_SIZE) + return -EPERM; + + t->hint_size = hint_size; + return 0; +} +EXPORT_SYMBOL_GPL(dm_cache_policy_set_hint_size); + /*----------------------------------------------------------------*/ Index: linux/drivers/md/dm-cache-policy.h =================================================================== --- linux.orig/drivers/md/dm-cache-policy.h +++ linux/drivers/md/dm-cache-policy.h @@ -8,6 +8,7 @@ #define DM_CACHE_POLICY_H #include "dm-cache-block-types.h" +#include "persistent-data/dm-btree.h" #include @@ -79,7 +80,8 @@ struct policy_result { }; typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock, - dm_oblock_t oblock, uint32_t hint); + dm_oblock_t oblock, void *hint) + __dm_written_to_disk(hint); /* * The cache policy object. Just a bunch of methods. It is envisaged that @@ -130,7 +132,7 @@ struct dm_cache_policy { * * Must not block. * - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors (-EWOULDBLOCK + * Returns 0 iff in cache, -ENOENT iff not, < 0 on error (-EWOULDBLOCK * would be typical). 
	 */
	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
@@ -146,7 +148,7 @@ struct dm_cache_policy {
	 * mapping from the metadata device into the policy.
	 */
	int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
-			    dm_cblock_t cblock, uint32_t hint, bool hint_valid);
+			    dm_cblock_t cblock, void *hint, bool hint_valid);
	int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
			     void *context);
@@ -159,7 +161,14 @@ struct dm_cache_policy {
	void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
			      dm_oblock_t new_oblock);
+	/*
+	 * writeback_work lets the cache target retrieve a dirty block to write back.
+	 *
+	 * next_dirty_block provides the next dirty block for background writeback,
+	 * allowing quicker eviction by avoiding demotion on cache block replacement.
+	 */
	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+	int (*next_dirty_block)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);

	/*
@@ -211,8 +220,7 @@ struct dm_cache_policy_type {
	/*
	 * Policies may store a hint for each cache block.
-	 * Currently the size of this hint must be 0 or 4 bytes but we
-	 * expect to relax this in future.
+	 * Currently the size of this hint must be no larger than DM_CACHE_POLICY_MAX_HINT_SIZE bytes.
	 */
	size_t hint_size;
@@ -227,4 +235,4 @@ void dm_cache_policy_unregister(struct d

/*----------------------------------------------------------------*/

-#endif /* DM_CACHE_POLICY_H */
+#endif /* DM_CACHE_POLICY_H */
Index: linux/drivers/md/dm-cache-target.c
===================================================================
--- linux.orig/drivers/md/dm-cache-target.c
+++ linux/drivers/md/dm-cache-target.c
@@ -104,14 +104,37 @@ static void unhook_bio(struct hook_info
 /*
  * FIXME: the cache is read/write for the time being.
  */
-enum cache_mode {
+enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
 };

+enum cache_io_mode {
+	/*
+	 * Data is written to cached blocks only.  These blocks are marked
+	 * dirty.  If you lose the cache device you will lose data.
+	 * Potential performance increase for both reads and writes.
+	 */
+	CM_IO_WRITEBACK,
+
+	/*
+	 * Data is written to both cache and origin.  Blocks are never
+	 * dirty.  Potential performance benefit for reads only.
+	 */
+	CM_IO_WRITETHROUGH,
+
+	/*
+	 * A degraded mode useful for various cache coherency situations
+	 * (eg, rolling back snapshots).  Reads and writes always go to the
+	 * origin.  If a write goes to a cached oblock, then the cache
+	 * block is invalidated.
+	 */
+	CM_IO_PASSTHROUGH
+};
+
 struct cache_features {
-	enum cache_mode mode;
-	bool write_through:1;
+	enum cache_metadata_mode mode;
+	enum cache_io_mode io_mode;
 };

 struct cache_stats {
@@ -562,9 +585,24 @@ static void save_stats(struct cache *cac
 #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
 #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

+static bool writethrough_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_WRITETHROUGH;
+}
+
+static bool writeback_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_WRITEBACK;
+}
+
+static bool passthrough_mode(struct cache_features *f)
+{
+	return f->io_mode == CM_IO_PASSTHROUGH;
+}
+
 static size_t get_per_bio_data_size(struct cache *cache)
 {
-	return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+	return writethrough_mode(&cache->features) ?
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; } static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) @@ -1130,6 +1168,32 @@ static void demote_then_promote(struct c quiesce_migration(mg); } +/* + * Invalidate a cache entry. No writeback occurs; any changes in the cache + * block are thrown away. + */ +static void invalidate(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + mg->demote = true; + mg->promote = false; + mg->requeue_holder = true; + mg->cache = cache; + mg->old_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = cell; + mg->new_ocell = NULL; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ @@ -1192,11 +1256,9 @@ static bool spare_migration_bandwidth(st return current_volume < cache->migration_threshold; } -static bool is_writethrough_io(struct cache *cache, struct bio *bio, - dm_cblock_t cblock) +static bool is_write_io(struct bio *bio) { - return bio_data_dir(bio) == WRITE && - cache->features.write_through && !is_dirty(cache, cblock); + return bio_data_dir(bio) == WRITE; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -1211,6 +1273,15 @@ static void inc_miss_counter(struct cach &cache->stats.read_miss : &cache->stats.write_miss); } +static void issue_cache_bio(struct cache *cache, struct bio *bio, + struct per_bio_data *pb, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_cache_dirty(cache, bio, oblock, cblock); + issue(cache, bio); +} + static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1222,7 +1293,8 @@ static void process_bio(struct cache *ca size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); - bool can_migrate = discarded_block || spare_migration_bandwidth(cache); + bool passthrough = passthrough_mode(&cache->features); + bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); /* * Check to see if that block is currently migrating. @@ -1243,15 +1315,39 @@ static void process_bio(struct cache *ca switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough) { + inc_miss_counter(cache, bio); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + /* + * Passthrough always maps to the origin, + * invalidating any cache blocks that are written + * to. 
+ */ + + if (is_write_io(bio)) { + atomic_inc(&cache->stats.demotion); + invalidate(cache, structs, block, lookup_result.cblock, new_ocell); + release_cell = false; + + } else { + /* FIXME: factor out issue_origin() */ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); + } + } else { + inc_hit_counter(cache, bio); + + if (is_write_io(bio) && + writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + issue(cache, bio); + } else + issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + } - issue(cache, bio); break; case POLICY_MISS: @@ -1798,7 +1894,7 @@ static int parse_block_size(struct cache static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, @@ -1823,10 +1919,13 @@ static int parse_features(struct cache_a arg = dm_shift_arg(as); if (!strcasecmp(arg, "writeback")) - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; else if (!strcasecmp(arg, "writethrough")) - cf->write_through = true; + cf->io_mode = CM_IO_WRITETHROUGH; + + else if (!strcasecmp(arg, "passthrough")) + cf->io_mode = CM_IO_PASSTHROUGH; else { *error = "Unrecognised cache feature requested"; @@ -2078,6 +2177,22 @@ static int cache_create(struct cache_arg } cache->cmd = cmd; + if (passthrough_mode(&cache->features)) { + bool all_clean; + + r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); + if (r) { + *error = "dm_cache_metadata_all_clean() failed"; + goto bad; + } + + if (!all_clean) { + *error = "Cannot enter passthrough mode unless all blocks are clean"; + r = -EINVAL; + goto bad; + } + } + spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); @@ -2291,17 +2406,38 @@ static int cache_map(struct dm_target *t return DM_MAPIO_SUBMITTED; } + r = DM_MAPIO_REMAPPED; switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough_mode(&cache->features)) { + if (is_write_io(bio)) { + /* + * We need to invalidate this block, so + * defer for the worker thread.
+ */ + cell_defer(cache, cell, true); + r = DM_MAPIO_SUBMITTED; + + } else { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + inc_miss_counter(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + } - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + } else { + inc_hit_counter(cache, bio); + + if (is_write_io(bio) && + writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); + else + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + + cell_defer(cache, cell, false); + + } break; case POLICY_MISS: @@ -2326,10 +2462,10 @@ static int cache_map(struct dm_target *t DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); bio_io_error(bio); - return DM_MAPIO_SUBMITTED; + r = DM_MAPIO_SUBMITTED; } - return DM_MAPIO_REMAPPED; + return r; } static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) @@ -2388,7 +2524,7 @@ static int write_discard_bitset(struct c } static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, - uint32_t hint) + void *hint) { struct cache *cache = context; return dm_cache_save_hint(cache->cmd, cblock, hint); @@ -2458,7 +2594,7 @@ static void cache_postsuspend(struct dm_ } static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, - bool dirty, uint32_t hint, bool hint_valid) + bool dirty, void *hint, bool hint_valid) { int r; struct cache *cache = context; @@ -2490,26 +2626,71 @@ static int load_discard(void *context, s return 0; } -static int cache_preresume(struct dm_target *ti) +static dm_cblock_t get_cache_dev_size(struct cache *cache) +{ + sector_t size = get_dev_size(cache->cache_dev); + (void) sector_div(size, cache->sectors_per_block); + return to_cblock(size); +} + +static bool can_resize(struct cache *cache, dm_cblock_t new_size) +{ + if (new_size > cache->cache_size) + return true; + + /* + * We can't drop a dirty block when shrinking the cache. + */ + for (; new_size < cache->cache_size; + new_size = to_cblock(from_cblock(new_size) + 1)) { + if (is_dirty(cache, new_size)) { + DMERR("unable to shrink cache; cache block %llu is dirty", + (unsigned long long) from_cblock(new_size)); + return false; + } + } + + return true; +} + +static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) +{ + int r; + + r = dm_cache_resize(cache->cmd, new_size); + if (r) { + DMERR("could not resize cache metadata"); + return r; + } + + cache->cache_size = new_size; + + return 0; +} + +static int cache_preresume_(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; - sector_t actual_cache_size = get_dev_size(cache->cache_dev); - (void) sector_div(actual_cache_size, cache->sectors_per_block); + dm_cblock_t csize = get_cache_dev_size(cache); /* * Check to see if the cache has resized.
*/ - if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { - cache->cache_size = to_cblock(actual_cache_size); - - r = dm_cache_resize(cache->cmd, cache->cache_size); - if (r) { - DMERR("could not resize cache metadata"); + if (!cache->sized) { + r = resize_cache_dev(cache, csize); + if (r) return r; - } cache->sized = true; + + } else if (csize != cache->cache_size) { + if (!can_resize(cache, csize)) + return -EINVAL; + + r = resize_cache_dev(cache, csize); + if (r) + return r; } if (!cache->loaded_mappings) { @@ -2536,6 +2717,14 @@ static int cache_preresume(struct dm_tar return r; } +static int cache_preresume(struct dm_target *ti) +{ + int r = cache_preresume_(ti); + if (r) + DMERR("cache_preresume failed"); + return r; +} + static void cache_resume(struct dm_target *ti) { struct cache *cache = ti->private; @@ -2602,10 +2791,18 @@ static void cache_status(struct dm_targe (unsigned long long) from_cblock(residency), cache->nr_dirty); - if (cache->features.write_through) + if (writethrough_mode(&cache->features)) DMEMIT("1 writethrough "); + + else if (passthrough_mode(&cache->features)) + DMEMIT("1 passthrough "); + + else if (writeback_mode(&cache->features)) + DMEMIT("1 writeback "); + else - DMEMIT("0 "); + DMERR("internal error: unknown io mode: %d", + (int) cache->features.io_mode); DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); if (sz < maxlen) { Index: linux/drivers/md/persistent-data/dm-block-manager.h =================================================================== --- linux.orig/drivers/md/persistent-data/dm-block-manager.h +++ linux/drivers/md/persistent-data/dm-block-manager.h @@ -114,6 +114,9 @@ int dm_bm_flush_and_unlock(struct dm_blo +/* + * Request data is prefetched into the cache. + */ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); /* * Switches the bm to a read only mode. Once read-only mode * has been entered the following functions will return -EPERM. *