From: Joe Thornber A cache policy that uses a multiqueue ordered by recent hit count to select which blocks should be promoted and demoted. This is meant to be a general purpose policy. It prioritises reads over writes. Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/cache-policies.txt | 72 + drivers/md/Kconfig | 10 drivers/md/Makefile | 2 drivers/md/dm-cache-policy-mq.c | 1195 +++++++++++++++++++++++++ 4 files changed, 1279 insertions(+) Index: linux/Documentation/device-mapper/cache-policies.txt =================================================================== --- /dev/null +++ linux/Documentation/device-mapper/cache-policies.txt @@ -0,0 +1,72 @@ +Guidance for writing policies +============================= + +Try to keep transactionality out of it. The core is careful to +avoid asking about anything that is migrating. This is a pain, but +makes it easier to write the policies. + +Mappings are loaded into the policy at construction time. + +Every bio that is mapped by the target is referred to the policy. +The policy can return a simple HIT or MISS or issue a migration. + +Currently there's no way for the policy to issue background work, +e.g. to start writing back dirty blocks that are going to be evicte +soon. + +Because we map bios, rather than requests it's easy for the policy +to get fooled by many small bios. For this reason the core target +issues periodic ticks to the policy. It's suggested that the policy +doesn't update states (eg, hit counts) for a block more than once +for each tick. The core ticks by watching bios complete, and so +trying to see when the io scheduler has let the ios run. + + +Overview of supplied cache replacement policies +=============================================== + +multiqueue +---------- + +This policy is the default. + +The multiqueue policy has two sets of 16 queues: one set for entries +waiting for the cache and another one for those in the cache. +Cache entries in the queues are aged based on logical time. Entry into +the cache is based on variable thresholds and queue selection is based +on hit count on entry. The policy aims to take different cache miss +costs into account and to adjust to varying load patterns automatically. + +Message and constructor argument pairs are: + 'sequential_threshold <#nr_sequential_ios>' and + 'random_threshold <#nr_random_ios>'. + +The sequential threshold indicates the number of contiguous I/Os +required before a stream is treated as sequential. The random threshold +is the number of intervening non-contiguous I/Os that must be seen +before the stream is treated as random again. + +The sequential and random thresholds default to 512 and 4 respectively. + +Large, sequential ios are probably better left on the origin device +since spindles tend to have good bandwidth. The io_tracker counts +contiguous I/Os to try to spot when the io is in one of these sequential +modes. + +Examples +======== + +The syntax for a table is: + cache + <#feature_args> []* + <#policy_args> []* + +The syntax to send a message using the dmsetup command is: + dmsetup message 0 sequential_threshold 1024 + dmsetup message 0 random_threshold 8 + +Using dmsetup: + dmsetup create blah --table "0 268435456 cache /dev/sdb /dev/sdc \ + /dev/sdd 512 0 mq 4 sequential_threshold 1024 random_threshold 8" + creates a 128GB large mapped device named 'blah' with the + sequential threshold set to 1024 and the random_threshold set to 8. Index: linux/drivers/md/Kconfig =================================================================== --- linux.orig/drivers/md/Kconfig +++ linux/drivers/md/Kconfig @@ -281,6 +281,16 @@ config DM_CACHE algorithms used to select which blocks are promoted, demoted, cleaned etc. It supports writeback and writethrough modes. +config DM_CACHE_MQ + tristate "MQ Cache Policy (EXPERIMENTAL)" + depends on DM_CACHE + default y + ---help--- + A cache policy that uses a multiqueue ordered by recent hit + count to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. + config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM Index: linux/drivers/md/Makefile =================================================================== --- linux.orig/drivers/md/Makefile +++ linux/drivers/md/Makefile @@ -12,6 +12,7 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-mq-y += dm-cache-policy-mq.o md-mod-y += md.o bitmap.o raid456-y += raid5.o @@ -46,6 +47,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o +obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o Index: linux/drivers/md/dm-cache-policy-mq.c =================================================================== --- /dev/null +++ linux/drivers/md/dm-cache-policy-mq.c @@ -0,0 +1,1195 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#include "dm-cache-policy.h" +#include "dm.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "cache-policy-mq" +#define MQ_VERSION "1.0.0" + +static struct kmem_cache *mq_entry_cache; + +/*----------------------------------------------------------------*/ + +static unsigned next_power(unsigned n, unsigned min) +{ + return roundup_pow_of_two(max(n, min)); +} + +/*----------------------------------------------------------------*/ + +static unsigned long *alloc_bitset(unsigned nr_entries) +{ + size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); + return vzalloc(s); +} + +static void free_bitset(unsigned long *bits) +{ + vfree(bits); +} + +/*----------------------------------------------------------------*/ + +/* + * Large, sequential ios are probably better left on the origin device since + * spindles tend to have good bandwidth. + * + * The io_tracker tries to spot when the io is in one of these sequential + * modes. + * + * Two thresholds to switch between random and sequential io mode are defaulting + * as follows and can be adjusted via the constructor and message interfaces. + */ +#define RANDOM_THRESHOLD_DEFAULT 4 +#define SEQUENTIAL_THRESHOLD_DEFAULT 512 + +enum io_pattern { + PATTERN_SEQUENTIAL, + PATTERN_RANDOM +}; + +struct io_tracker { + enum io_pattern pattern; + + unsigned nr_seq_samples; + unsigned nr_rand_samples; + unsigned thresholds[2]; + + dm_oblock_t last_end_oblock; +}; + +static void iot_init(struct io_tracker *t, + int sequential_threshold, int random_threshold) +{ + t->pattern = PATTERN_RANDOM; + t->nr_seq_samples = 0; + t->nr_rand_samples = 0; + t->last_end_oblock = 0; + t->thresholds[PATTERN_RANDOM] = random_threshold; + t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; +} + +static enum io_pattern iot_pattern(struct io_tracker *t) +{ + return t->pattern; +} + +static void iot_update_stats(struct io_tracker *t, struct bio *bio) +{ + if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) + t->nr_seq_samples++; + else { + /* + * Just one non-sequential IO is enough to reset the + * counters. + */ + if (t->nr_seq_samples) { + t->nr_seq_samples = 0; + t->nr_rand_samples = 0; + } + + t->nr_rand_samples++; + } + + t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); +} + +static void iot_check_for_pattern_switch(struct io_tracker *t) +{ + switch (t->pattern) { + case PATTERN_SEQUENTIAL: + if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { + t->pattern = PATTERN_RANDOM; + t->nr_seq_samples = t->nr_rand_samples = 0; + } + break; + + case PATTERN_RANDOM: + if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { + t->pattern = PATTERN_SEQUENTIAL; + t->nr_seq_samples = t->nr_rand_samples = 0; + } + break; + } +} + +static void iot_examine_bio(struct io_tracker *t, struct bio *bio) +{ + iot_update_stats(t, bio); + iot_check_for_pattern_switch(t); +} + +/*----------------------------------------------------------------*/ + + +/* + * This queue is divided up into different levels. Allowing us to push + * entries to the back of any of the levels. Think of it as a partially + * sorted queue. + */ +#define NR_QUEUE_LEVELS 16u + +struct queue { + struct list_head qs[NR_QUEUE_LEVELS]; +}; + +static void queue_init(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) + INIT_LIST_HEAD(q->qs + i); +} + +/* + * Insert an entry to the back of the given level. + */ +static void queue_push(struct queue *q, unsigned level, struct list_head *elt) +{ + list_add_tail(elt, q->qs + level); +} + +static void queue_remove(struct list_head *elt) +{ + list_del(elt); +} + +/* + * Shifts all regions down one level. This has no effect on the order of + * the queue. + */ +static void queue_shift_down(struct queue *q) +{ + unsigned level; + + for (level = 1; level < NR_QUEUE_LEVELS; level++) + list_splice_init(q->qs + level, q->qs + level - 1); +} + +/* + * Gives us the oldest entry of the lowest popoulated level. If the first + * level is emptied then we shift down one level. + */ +static struct list_head *queue_pop(struct queue *q) +{ + unsigned level; + struct list_head *r; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + if (!list_empty(q->qs + level)) { + r = q->qs[level].next; + list_del(r); + + /* have we just emptied the bottom level? */ + if (level == 0 && list_empty(q->qs)) + queue_shift_down(q); + + return r; + } + + return NULL; +} + +static struct list_head *list_pop(struct list_head *lh) +{ + struct list_head *r = lh->next; + + BUG_ON(!r); + list_del_init(r); + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Describes a cache entry. Used in both the cache and the pre_cache. + */ +struct entry { + struct hlist_node hlist; + struct list_head list; + dm_oblock_t oblock; + dm_cblock_t cblock; /* valid iff in_cache */ + + /* + * FIXME: pack these better + */ + bool in_cache:1; + unsigned hit_count; + unsigned generation; + unsigned tick; +}; + +struct mq_policy { + struct dm_cache_policy policy; + + /* protects everything */ + struct mutex lock; + dm_cblock_t cache_size; + struct io_tracker tracker; + + /* + * We maintain two queues of entries. The cache proper contains + * the currently active mappings. Whereas the pre_cache tracks + * blocks that are being hit frequently and potential candidates + * for promotion to the cache. + */ + struct queue pre_cache; + struct queue cache; + + /* + * Keeps track of time, incremented by the core. We use this to + * avoid attributing multiple hits within the same tick. + * + * Access to tick_protected should be done with the spin lock held. + * It's copied to tick at the start of the map function (within the + * mutex). + */ + spinlock_t tick_lock; + unsigned tick_protected; + unsigned tick; + + /* + * A count of the number of times the map function has been called + * and found an entry in the pre_cache or cache. Currently used to + * calculate the generation. + */ + unsigned hit_count; + + /* + * A generation is a longish period that is used to trigger some + * book keeping effects. eg, decrementing hit counts on entries. + * This is needed to allow the cache to evolve as io patterns + * change. + */ + unsigned generation; + unsigned generation_period; /* in lookups (will probably change) */ + + /* + * Entries in the pre_cache whose hit count passes the promotion + * threshold move to the cache proper. Working out the correct + * value for the promotion_threshold is crucial to this policy. + */ + unsigned promote_threshold; + + /* + * We need cache_size entries for the cache, and choose to have + * cache_size entries for the pre_cache too. One motivation for + * using the same size is to make the hit counts directly + * comparable between pre_cache and cache. + */ + unsigned nr_entries; + unsigned nr_entries_allocated; + struct list_head free; + + /* + * Cache blocks may be unallocated. We store this info in a + * bitset. + */ + unsigned long *allocation_bitset; + unsigned nr_cblocks_allocated; + unsigned find_free_nr_words; + unsigned find_free_last_word; + + /* + * The hash table allows us to quickly find an entry by origin + * block. Both pre_cache and cache entries are in here. + */ + unsigned nr_buckets; + dm_block_t hash_bits; + struct hlist_head *table; +}; + +/*----------------------------------------------------------------*/ +/* Free/alloc mq cache entry structures. */ +static void takeout_queue(struct list_head *lh, struct queue *q) +{ + unsigned level; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_splice(q->qs + level, lh); +} + +static void free_entries(struct mq_policy *mq) +{ + struct entry *e, *tmp; + + takeout_queue(&mq->free, &mq->pre_cache); + takeout_queue(&mq->free, &mq->cache); + + list_for_each_entry_safe(e, tmp, &mq->free, list) + kmem_cache_free(mq_entry_cache, e); +} + +static int alloc_entries(struct mq_policy *mq, unsigned elts) +{ + unsigned u = mq->nr_entries; + + INIT_LIST_HEAD(&mq->free); + mq->nr_entries_allocated = 0; + + while (u--) { + struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); + + if (!e) { + free_entries(mq); + return -ENOMEM; + } + + + list_add(&e->list, &mq->free); + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* + * Simple hash table implementation. Should replace with the standard hash + * table that's making its way upstream. + */ +static void hash_insert(struct mq_policy *mq, struct entry *e) +{ + unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); + + hlist_add_head(&e->hlist, mq->table + h); +} + +static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) +{ + unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); + struct hlist_head *bucket = mq->table + h; + struct entry *e; + + hlist_for_each_entry(e, bucket, hlist) + if (e->oblock == oblock) { + hlist_del(&e->hlist); + hlist_add_head(&e->hlist, bucket); + return e; + } + + return NULL; +} + +static void hash_remove(struct entry *e) +{ + hlist_del(&e->hlist); +} + +/*----------------------------------------------------------------*/ + +/* + * Allocates a new entry structure. The memory is allocated in one lump, + * so we just handing it out here. Returns NULL if all entries have + * already been allocated. Cannot fail otherwise. + */ +static struct entry *alloc_entry(struct mq_policy *mq) +{ + struct entry *e; + + if (mq->nr_entries_allocated >= mq->nr_entries) { + BUG_ON(!list_empty(&mq->free)); + return NULL; + } + + e = list_entry(list_pop(&mq->free), struct entry, list); + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + + mq->nr_entries_allocated++; + return e; +} + +/*----------------------------------------------------------------*/ + +/* + * Mark cache blocks allocated or not in the bitset. + */ +static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) +{ + BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); + BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); + + set_bit(from_cblock(cblock), mq->allocation_bitset); + mq->nr_cblocks_allocated++; +} + +static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) +{ + BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); + BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); + + clear_bit(from_cblock(cblock), mq->allocation_bitset); + mq->nr_cblocks_allocated--; +} + +static bool any_free_cblocks(struct mq_policy *mq) +{ + return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); +} + +/* + * Fills result out with a cache block that isn't in use, or return + * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is + * reponsible for that. + */ +static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, + dm_cblock_t *result, unsigned *last_word) +{ + int r = -ENOSPC; + unsigned w; + + for (w = begin; w < end; w++) { + /* + * ffz is undefined if no zero exists + */ + if (mq->allocation_bitset[w] != ~0UL) { + *last_word = w; + *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); + if (from_cblock(*result) < from_cblock(mq->cache_size)) + r = 0; + + break; + } + } + + return r; +} + +static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) +{ + int r; + + if (!any_free_cblocks(mq)) + return -ENOSPC; + + r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); + if (r == -ENOSPC && mq->find_free_last_word) + r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Now we get to the meat of the policy. This section deals with deciding + * when to to add entries to the pre_cache and cache, and move between + * them. + */ + +/* + * The queue level is based on the log2 of the hit count. + */ +static unsigned queue_level(struct entry *e) +{ + return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); +} + +/* + * Inserts the entry into the pre_cache or the cache. Ensures the cache + * block is marked as allocated if necc. Inserts into the hash table. Sets the + * tick which records when the entry was last moved about. + */ +static void push(struct mq_policy *mq, struct entry *e) +{ + e->tick = mq->tick; + hash_insert(mq, e); + + if (e->in_cache) { + alloc_cblock(mq, e->cblock); + queue_push(&mq->cache, queue_level(e), &e->list); + } else + queue_push(&mq->pre_cache, queue_level(e), &e->list); +} + +/* + * Removes an entry from pre_cache or cache. Removes from the hash table. + * Frees off the cache block if necc. + */ +static void del(struct mq_policy *mq, struct entry *e) +{ + queue_remove(&e->list); + hash_remove(e); + if (e->in_cache) + free_cblock(mq, e->cblock); +} + +/* + * Like del, except it removes the first entry in the queue (ie. the least + * recently used). + */ +static struct entry *pop(struct mq_policy *mq, struct queue *q) +{ + struct entry *e = container_of(queue_pop(q), struct entry, list); + + if (e) { + hash_remove(e); + + if (e->in_cache) + free_cblock(mq, e->cblock); + } + + return e; +} + +/* + * Has this entry already been updated? + */ +static bool updated_this_tick(struct mq_policy *mq, struct entry *e) +{ + return mq->tick == e->tick; +} + +/* + * The promotion threshold is adjusted every generation. As are the counts + * of the entries. + * + * At the moment the threshold is taken by averaging the hit counts of some + * of the entries in the cache (the first 20 entries of the first level). + * + * We can be much cleverer than this though. For example, each promotion + * could bump up the threshold helping to prevent churn. Much more to do + * here. + */ + +#define MAX_TO_AVERAGE 20 + +static void check_generation(struct mq_policy *mq) +{ + unsigned total = 0, nr = 0, count = 0, level; + struct list_head *head; + struct entry *e; + + if ((mq->hit_count >= mq->generation_period) && + (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { + + mq->hit_count = 0; + mq->generation++; + + for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { + head = mq->cache.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + } + + mq->promote_threshold = nr ? total / nr : 1; + if (mq->promote_threshold * nr < total) + mq->promote_threshold++; + } +} + +/* + * Whenever we use an entry we bump up it's hit counter, and push it to the + * back to it's current level. + */ +static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) +{ + if (updated_this_tick(mq, e)) + return; + + e->hit_count++; + mq->hit_count++; + check_generation(mq); + + /* generation adjustment, to stop the counts increasing forever. */ + /* FIXME: divide? */ + /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ + e->generation = mq->generation; + + del(mq, e); + push(mq, e); +} + +/* + * Demote the least recently used entry from the cache to the pre_cache. + * Returns the new cache entry to use, and the old origin block it was + * mapped to. + * + * We drop the hit count on the demoted entry back to 1 to stop it bouncing + * straight back into the cache if it's subsequently hit. There are + * various options here, and more experimentation would be good: + * + * - just forget about the demoted entry completely (ie. don't insert it + into the pre_cache). + * - divide the hit count rather that setting to some hard coded value. + * - set the hit count to a hard coded value other than 1, eg, is it better + * if it goes in at level 2? + */ +static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) +{ + dm_cblock_t result; + struct entry *demoted = pop(mq, &mq->cache); + + BUG_ON(!demoted); + result = demoted->cblock; + *oblock = demoted->oblock; + demoted->in_cache = false; + demoted->hit_count = 1; + push(mq, demoted); + + return result; +} + +/* + * We modify the basic promotion_threshold depending on the specific io. + * + * If the origin block has been discarded then there's no cost to copy it + * to the cache. + * + * We bias towards reads, since they can be demoted at no cost if they + * haven't been dirtied. + */ +#define DISCARDED_PROMOTE_THRESHOLD 1 +#define READ_PROMOTE_THRESHOLD 4 +#define WRITE_PROMOTE_THRESHOLD 8 + +static unsigned adjusted_promote_threshold(struct mq_policy *mq, + bool discarded_oblock, int data_dir) +{ + if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) + /* + * We don't need to do any copying at all, so give this a + * very low threshold. In practice this only triggers + * during initial population after a format. + */ + return DISCARDED_PROMOTE_THRESHOLD; + + return data_dir == READ ? + (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : + (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); +} + +static bool should_promote(struct mq_policy *mq, struct entry *e, + bool discarded_oblock, int data_dir) +{ + return e->hit_count >= + adjusted_promote_threshold(mq, discarded_oblock, data_dir); +} + +static int cache_entry_found(struct mq_policy *mq, + struct entry *e, + struct policy_result *result) +{ + requeue_and_update_tick(mq, e); + + if (e->in_cache) { + result->op = POLICY_HIT; + result->cblock = e->cblock; + } + + return 0; +} + +/* + * Moves and entry from the pre_cache to the cache. The main work is + * finding which cache block to use. + */ +static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, + struct policy_result *result) +{ + dm_cblock_t cblock; + + if (find_free_cblock(mq, &cblock) == -ENOSPC) { + result->op = POLICY_REPLACE; + cblock = demote_cblock(mq, &result->old_oblock); + } else + result->op = POLICY_NEW; + + result->cblock = e->cblock = cblock; + + del(mq, e); + e->in_cache = true; + push(mq, e); + + return 0; +} + +static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + int r = 0; + bool updated = updated_this_tick(mq, e); + + requeue_and_update_tick(mq, e); + + if ((!discarded_oblock && updated) || + !should_promote(mq, e, discarded_oblock, data_dir)) + result->op = POLICY_MISS; + else if (!can_migrate) + r = -EWOULDBLOCK; + else + r = pre_cache_to_cache(mq, e, result); + + return r; +} + +static void insert_in_pre_cache(struct mq_policy *mq, + dm_oblock_t oblock) +{ + struct entry *e = alloc_entry(mq); + + if (!e) + /* + * There's no spare entry structure, so we grab the least + * used one from the pre_cache. + */ + e = pop(mq, &mq->pre_cache); + + if (unlikely(!e)) { + DMWARN("couldn't pop from pre cache"); + return; + } + + e->in_cache = false; + e->oblock = oblock; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); +} + +static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, + struct policy_result *result) +{ + struct entry *e; + dm_cblock_t cblock; + + if (find_free_cblock(mq, &cblock) == -ENOSPC) { + result->op = POLICY_MISS; + insert_in_pre_cache(mq, oblock); + return; + } + + e = alloc_entry(mq); + if (unlikely(!e)) { + result->op = POLICY_MISS; + return; + } + + e->oblock = oblock; + e->cblock = cblock; + e->in_cache = true; + e->hit_count = 1; + e->generation = mq->generation; + push(mq, e); + + result->op = POLICY_NEW; + result->cblock = e->cblock; +} + +static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { + if (can_migrate) + insert_in_cache(mq, oblock, result); + else + return -EWOULDBLOCK; + } else { + insert_in_pre_cache(mq, oblock); + result->op = POLICY_MISS; + } + + return 0; +} + +/* + * Looks the oblock up in the hash table, then decides whether to put in + * pre_cache, or cache etc. + */ +static int map(struct mq_policy *mq, dm_oblock_t oblock, + bool can_migrate, bool discarded_oblock, + int data_dir, struct policy_result *result) +{ + int r = 0; + struct entry *e = hash_lookup(mq, oblock); + + if (e && e->in_cache) + r = cache_entry_found(mq, e, result); + else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) + result->op = POLICY_MISS; + else if (e) + r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, + data_dir, result); + else + r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, + data_dir, result); + + if (r == -EWOULDBLOCK) + result->op = POLICY_MISS; + + return r; +} + +/*----------------------------------------------------------------*/ + +/* + * Public interface, via the policy struct. See dm-cache-policy.h for a + * description of these. + */ + +static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) +{ + return container_of(p, struct mq_policy, policy); +} + +static void mq_destroy(struct dm_cache_policy *p) +{ + struct mq_policy *mq = to_mq_policy(p); + + free_bitset(mq->allocation_bitset); + kfree(mq->table); + free_entries(mq); + kfree(mq); +} + +static void copy_tick(struct mq_policy *mq) +{ + unsigned long flags; + + spin_lock_irqsave(&mq->tick_lock, flags); + mq->tick = mq->tick_protected; + spin_unlock_irqrestore(&mq->tick_lock, flags); +} + +static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, + bool can_block, bool can_migrate, bool discarded_oblock, + struct bio *bio, struct policy_result *result) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + result->op = POLICY_MISS; + + if (can_block) + mutex_lock(&mq->lock); + else if (!mutex_trylock(&mq->lock)) + return -EWOULDBLOCK; + + copy_tick(mq); + + iot_examine_bio(&mq->tracker, bio); + r = map(mq, oblock, can_migrate, discarded_oblock, + bio_data_dir(bio), result); + + mutex_unlock(&mq->lock); + + return r; +} + +static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + if (!mutex_trylock(&mq->lock)) + return -EWOULDBLOCK; + + e = hash_lookup(mq, oblock); + if (e && e->in_cache) { + *cblock = e->cblock; + r = 0; + } else + r = -ENOENT; + + mutex_unlock(&mq->lock); + + return r; +} + +static int mq_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + uint32_t hint, bool hint_valid) +{ + struct mq_policy *mq = to_mq_policy(p); + struct entry *e; + + e = alloc_entry(mq); + if (!e) + return -ENOMEM; + + e->cblock = cblock; + e->oblock = oblock; + e->in_cache = true; + e->hit_count = hint_valid ? hint : 1; + e->generation = mq->generation; + push(mq, e); + + return 0; +} + +static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, + void *context) +{ + struct mq_policy *mq = to_mq_policy(p); + int r = 0; + struct entry *e; + unsigned level; + + mutex_lock(&mq->lock); + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, &mq->cache.qs[level], list) { + r = fn(context, e->cblock, e->oblock, e->hit_count); + if (r) + goto out; + } + +out: + mutex_unlock(&mq->lock); + + return r; +} + +static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +{ + struct entry *e = hash_lookup(mq, oblock); + + BUG_ON(!e || !e->in_cache); + + del(mq, e); + e->in_cache = false; + push(mq, e); +} + +static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + remove_mapping(mq, oblock); + mutex_unlock(&mq->lock); +} + +static void force_mapping(struct mq_policy *mq, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct entry *e = hash_lookup(mq, current_oblock); + + BUG_ON(!e || !e->in_cache); + + del(mq, e); + e->oblock = new_oblock; + push(mq, e); +} + +static void mq_force_mapping(struct dm_cache_policy *p, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + force_mapping(mq, current_oblock, new_oblock); + mutex_unlock(&mq->lock); +} + +static dm_cblock_t mq_residency(struct dm_cache_policy *p) +{ + struct mq_policy *mq = to_mq_policy(p); + + /* FIXME: lock mutex, not sure we can block here */ + return to_cblock(mq->nr_cblocks_allocated); +} + +static void mq_tick(struct dm_cache_policy *p) +{ + struct mq_policy *mq = to_mq_policy(p); + unsigned long flags; + + spin_lock_irqsave(&mq->tick_lock, flags); + mq->tick_protected++; + spin_unlock_irqrestore(&mq->tick_lock, flags); +} + +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + struct mq_policy *mq = to_mq_policy(p); + enum io_pattern pattern; + unsigned long tmp; + + if (!strcasecmp(key, "random_threshold")) + pattern = PATTERN_RANDOM; + else if (!strcasecmp(key, "sequential_threshold")) + pattern = PATTERN_SEQUENTIAL; + else + return -EINVAL; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + mq->tracker.thresholds[pattern] = tmp; + + return 0; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) +{ + ssize_t sz = 0; + struct mq_policy *mq = to_mq_policy(p); + + DMEMIT("4 random_threshold %u sequential_threshold %u", + mq->tracker.thresholds[PATTERN_RANDOM], + mq->tracker.thresholds[PATTERN_SEQUENTIAL]); + + return 0; +} + +/* Init the policy plugin interface function pointers. */ +static void init_policy_functions(struct mq_policy *mq) +{ + mq->policy.destroy = mq_destroy; + mq->policy.map = mq_map; + mq->policy.lookup = mq_lookup; + mq->policy.load_mapping = mq_load_mapping; + mq->policy.walk_mappings = mq_walk_mappings; + mq->policy.remove_mapping = mq_remove_mapping; + mq->policy.writeback_work = NULL; + mq->policy.force_mapping = mq_force_mapping; + mq->policy.residency = mq_residency; + mq->policy.tick = mq_tick; + mq->policy.emit_config_values = mq_emit_config_values; + mq->policy.set_config_value = mq_set_config_value; +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + int r; + struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); + + if (!mq) + return NULL; + + init_policy_functions(mq); + iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); + + mq->cache_size = cache_size; + mq->tick_protected = 0; + mq->tick = 0; + mq->hit_count = 0; + mq->generation = 0; + mq->promote_threshold = 0; + mutex_init(&mq->lock); + spin_lock_init(&mq->tick_lock); + mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); + mq->find_free_last_word = 0; + + queue_init(&mq->pre_cache); + queue_init(&mq->cache); + mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); + + mq->nr_entries = 2 * from_cblock(cache_size); + r = alloc_entries(mq, mq->nr_entries); + if (r) + goto bad_cache_alloc; + + mq->nr_entries_allocated = 0; + mq->nr_cblocks_allocated = 0; + + mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); + mq->hash_bits = ffs(mq->nr_buckets) - 1; + mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); + if (!mq->table) + goto bad_alloc_table; + + mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); + if (!mq->allocation_bitset) + goto bad_alloc_bitset; + + return &mq->policy; + +bad_alloc_bitset: + kfree(mq->table); +bad_alloc_table: + free_entries(mq); +bad_cache_alloc: + kfree(mq); + + return NULL; +} + +/*----------------------------------------------------------------*/ + +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create +}; + +static struct dm_cache_policy_type default_policy_type = { + .name = "default", + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create +}; + +static int __init mq_init(void) +{ + int r; + + mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", + sizeof(struct entry), + __alignof__(struct entry), + 0, NULL); + if (!mq_entry_cache) + goto bad; + + r = dm_cache_policy_register(&mq_policy_type); + if (r) { + DMERR("register failed %d", r); + goto bad_register_mq; + } + + r = dm_cache_policy_register(&default_policy_type); + if (!r) { + DMINFO("version " MQ_VERSION " loaded"); + return 0; + } + + DMERR("register failed (as default) %d", r); + + dm_cache_policy_unregister(&mq_policy_type); +bad_register_mq: + kmem_cache_destroy(mq_entry_cache); +bad: + return -ENOMEM; +} + +static void __exit mq_exit(void) +{ + dm_cache_policy_unregister(&mq_policy_type); + dm_cache_policy_unregister(&default_policy_type); + + kmem_cache_destroy(mq_entry_cache); +} + +module_init(mq_init); +module_exit(mq_exit); + +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("mq cache policy"); + +MODULE_ALIAS("dm-cache-default");