commit 069fde0a47de07db85bb1f3356b8f75e64292499
Author: Mike Snitzer
Date:   Wed Oct 2 17:54:35 2013 -0400

    dm-cache-thin-dev-prereq.patch -- throwaway

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155..6bbb32f 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -151,6 +151,20 @@ static void queue_init(struct queue *q)
 }
 
 /*
+ * Checks to see if the queue is empty.
+ */
+static bool queue_empty(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+		if (!list_empty(q->qs + i))
+			return false;
+
+	return true;
+}
+
+/*
  * Insert an entry to the back of the given level.
  */
 static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -224,6 +238,7 @@ struct entry {
 	 * FIXME: pack these better
 	 */
 	bool in_cache:1;
+	bool dirty:1;
 	unsigned hit_count;
 	unsigned generation;
 	unsigned tick;
@@ -244,7 +259,8 @@ struct mq_policy {
 	 * for promotion to the cache.
 	 */
 	struct queue pre_cache;
-	struct queue cache;
+	struct queue cache_clean;
+	struct queue cache_dirty;
 
 	/*
 	 * Keeps track of time, incremented by the core.  We use this to
@@ -311,7 +327,7 @@ struct mq_policy {
 /*----------------------------------------------------------------*/
 
 /* Free/alloc mq cache entry structures. */
-static void takeout_queue(struct list_head *lh, struct queue *q)
+static void concat_queue(struct list_head *lh, struct queue *q)
 {
 	unsigned level;
 
@@ -323,8 +339,9 @@ static void free_entries(struct mq_policy *mq)
 {
 	struct entry *e, *tmp;
 
-	takeout_queue(&mq->free, &mq->pre_cache);
-	takeout_queue(&mq->free, &mq->cache);
+	concat_queue(&mq->free, &mq->pre_cache);
+	concat_queue(&mq->free, &mq->cache_clean);
+	concat_queue(&mq->free, &mq->cache_dirty);
 
 	list_for_each_entry_safe(e, tmp, &mq->free, list)
 		kmem_cache_free(mq_entry_cache, e);
@@ -438,6 +455,11 @@ static bool any_free_cblocks(struct mq_policy *mq)
 	return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
 }
 
+static bool any_clean_cblocks(struct mq_policy *mq)
+{
+	return !queue_empty(&mq->cache_clean);
+}
+
 /*
  * Fills result out with a cache block that isn't in use, or return
  * -ENOSPC.  This does _not_ mark the cblock as allocated, the caller is
@@ -508,7 +530,8 @@ static void push(struct mq_policy *mq, struct entry *e)
 
 	if (e->in_cache) {
 		alloc_cblock(mq, e->cblock);
-		queue_push(&mq->cache, queue_level(e), &e->list);
+		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+			   queue_level(e), &e->list);
 	} else
 		queue_push(&mq->pre_cache, queue_level(e), &e->list);
 }
@@ -531,14 +554,16 @@ static void del(struct mq_policy *mq, struct entry *e)
  */
 static struct entry *pop(struct mq_policy *mq, struct queue *q)
 {
-	struct entry *e = container_of(queue_pop(q), struct entry, list);
+	struct entry *e;
+	struct list_head *h = queue_pop(q);
 
-	if (e) {
-		hash_remove(e);
+	if (!h)
+		return NULL;
 
-		if (e->in_cache)
-			free_cblock(mq, e->cblock);
-	}
+	e = container_of(h, struct entry, list);
+	hash_remove(e);
+	if (e->in_cache)
+		free_cblock(mq, e->cblock);
 
 	return e;
 }
@@ -578,7 +603,16 @@ static void check_generation(struct mq_policy *mq)
 		mq->generation++;
 
 		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
-			head = mq->cache.qs + level;
+			head = mq->cache_clean.qs + level;
+			list_for_each_entry(e, head, list) {
+				nr++;
+				total += e->hit_count;
+
+				if (++count >= MAX_TO_AVERAGE)
+					break;
+			}
+
+			head = mq->cache_dirty.qs + level;
 			list_for_each_entry(e, head, list) {
 				nr++;
 				total += e->hit_count;
@@ -631,19 +665,28 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock)
 {
-	dm_cblock_t result;
-	struct entry *demoted = pop(mq, &mq->cache);
+	struct entry *demoted = pop(mq, &mq->cache_clean);
+
+	if (!demoted)
+		/*
+		 * We could get a block from mq->cache_dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
 
-	BUG_ON(!demoted);
-	result = demoted->cblock;
+	*cblock = demoted->cblock;
 	*oblock = demoted->oblock;
 	demoted->in_cache = false;
+	demoted->dirty = false;
 	demoted->hit_count = 1;
 	push(mq, demoted);
 
-	return result;
+	return 0;
 }
 
 /*
@@ -662,17 +705,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 static unsigned adjusted_promote_threshold(struct mq_policy *mq,
 					   bool discarded_oblock, int data_dir)
 {
-	if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
-		/*
-		 * We don't need to do any copying at all, so give this a
-		 * very low threshold.  In practice this only triggers
-		 * during initial population after a format.
-		 */
-		return DISCARDED_PROMOTE_THRESHOLD;
+	if (data_dir == WRITE) {
+		if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
+			/*
+			 * We don't need to do any copying at all, so give this a
+			 * very low threshold.
+			 */
+			return DISCARDED_PROMOTE_THRESHOLD;
+		} else
+			return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
+	}
 
-	return data_dir == READ ?
-		(mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
-		(mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
+	return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
 }
 
 static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -697,17 +741,22 @@ static int cache_entry_found(struct mq_policy *mq,
 }
 
 /*
- * Moves and entry from the pre_cache to the cache.  The main work is
+ * Moves an entry from the pre_cache to the cache.  The main work is
  * finding which cache block to use.
  */
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 			      struct policy_result *result)
 {
+	int r;
 	dm_cblock_t cblock;
 
 	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
 		result->op = POLICY_REPLACE;
-		cblock = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, &result->old_oblock, &cblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return 0;
+		}
 	} else
 		result->op = POLICY_NEW;
 
@@ -715,6 +764,7 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 	del(mq, e);
 	e->in_cache = true;
+	e->dirty = false;
 	push(mq, e);
 
 	return 0;
@@ -740,6 +790,17 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 	return r;
 }
 
+static void insert_entry_in_pre_cache(struct mq_policy *mq,
+				      struct entry *e, dm_oblock_t oblock)
+{
+	e->in_cache = false;
+	e->dirty = false;
+	e->oblock = oblock;
+	e->hit_count = 1;
+	e->generation = mq->generation;
+	push(mq, e);
+}
+
 static void insert_in_pre_cache(struct mq_policy *mq,
 				dm_oblock_t oblock)
 {
@@ -757,39 +818,51 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 		return;
 	}
 
-	e->in_cache = false;
-	e->oblock = oblock;
-	e->hit_count = 1;
-	e->generation = mq->generation;
-	push(mq, e);
+	insert_entry_in_pre_cache(mq, e, oblock);
 }
 
 static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 			    struct policy_result *result)
 {
+	int r;
 	struct entry *e;
 	dm_cblock_t cblock;
 
 	if (find_free_cblock(mq, &cblock) == -ENOSPC) {
-		result->op = POLICY_MISS;
-		insert_in_pre_cache(mq, oblock);
-		return;
-	}
+		r = demote_cblock(mq, &result->old_oblock, &cblock);
+		if (unlikely(r)) {
+			result->op = POLICY_MISS;
+			insert_in_pre_cache(mq, oblock);
+			return;
+		}
 
-	e = alloc_entry(mq);
-	if (unlikely(!e)) {
-		result->op = POLICY_MISS;
-		return;
+		/*
+		 * This will always succeed, since we've just demoted.
+		 */
+		e = pop(mq, &mq->pre_cache);
+		result->op = POLICY_REPLACE;
+
+	} else {
+		e = alloc_entry(mq);
+		if (unlikely(!e))
+			e = pop(mq, &mq->pre_cache);
+
+		if (unlikely(!e)) {
+			result->op = POLICY_MISS;
+			return;
+		}
+
+		result->op = POLICY_NEW;
 	}
 
 	e->oblock = oblock;
 	e->cblock = cblock;
 	e->in_cache = true;
+	e->dirty = false;
 	e->hit_count = 1;
 	e->generation = mq->generation;
 	push(mq, e);
 
-	result->op = POLICY_NEW;
 	result->cblock = e->cblock;
 }
 
@@ -915,6 +988,46 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
 	return r;
 }
 
+// FIXME: can these block?
+// FIXME: duplication
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	struct entry *e;
+
+	mutex_lock(&mq->lock);
+	e = hash_lookup(mq, oblock);
+	if (!e)
+		DMWARN("mq_set_dirty called for a block that isn't in the cache");
+	else {
+		BUG_ON(!e->in_cache);
+
+		del(mq, e);
+		e->dirty = true;
+		push(mq, e);
+	}
+	mutex_unlock(&mq->lock);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct mq_policy *mq = to_mq_policy(p);
+	struct entry *e;
+
+	mutex_lock(&mq->lock);
+	e = hash_lookup(mq, oblock);
+	if (!e)
+		DMWARN("mq_clear_dirty called for a block that isn't in the cache");
+	else {
+		BUG_ON(!e->in_cache);
+
+		del(mq, e);
+		e->dirty = false;
+		push(mq, e);
+	}
+	mutex_unlock(&mq->lock);
+}
+
 static int mq_load_mapping(struct dm_cache_policy *p,
 			   dm_oblock_t oblock, dm_cblock_t cblock,
 			   uint32_t hint, bool hint_valid)
@@ -929,6 +1042,7 @@ static int mq_load_mapping(struct dm_cache_policy *p,
 	e->cblock = cblock;
 	e->oblock = oblock;
 	e->in_cache = true;
+	e->dirty = true;	/* this gets corrected in a minute */
 	e->hit_count = hint_valid ? hint : 1;
 	e->generation = mq->generation;
 	push(mq, e);
@@ -947,7 +1061,14 @@ static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
 
 	mutex_lock(&mq->lock);
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each_entry(e, &mq->cache.qs[level], list) {
+		list_for_each_entry(e, &mq->cache_clean.qs[level], list) {
+			r = fn(context, e->cblock, e->oblock, e->hit_count);
+			if (r)
+				goto out;
+		}
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		list_for_each_entry(e, &mq->cache_dirty.qs[level], list) {
 			r = fn(context, e->cblock, e->oblock, e->hit_count);
 			if (r)
 				goto out;
@@ -959,22 +1080,67 @@ out:
 	return r;
 }
 
+static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+	struct entry *e = hash_lookup(mq, oblock);
+
+	BUG_ON(!e || !e->in_cache);
+
+	del(mq, e);
+	e->in_cache = false;
+	e->dirty = false;
+	push(mq, e);
+}
+
 static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
 {
 	struct mq_policy *mq = to_mq_policy(p);
-	struct entry *e;
 
 	mutex_lock(&mq->lock);
+	remove_mapping(mq, oblock);
+	mutex_unlock(&mq->lock);
+}
 
-	e = hash_lookup(mq, oblock);
+static int mq_writeback_work_(struct mq_policy *mq, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock)
+{
+	struct entry *e = pop(mq, &mq->cache_dirty);
 
-	BUG_ON(!e || !e->in_cache);
+	if (e) {
+#if 0
+		/*
+		 * mq->tick - 1 because we don't want a flurry of
+		 * writebacks every time the tick rolls over.
+		 */
+		if (e->tick >= (mq->tick - 1))
+			push(mq, e);
+
+		else {
+#endif
+			*oblock = e->oblock;
+			*cblock = e->cblock;
+			e->dirty = false;
+			push(mq, e);
+			return 0;
+#if 0
+		}
+#endif
+	}
 
-	del(mq, e);
-	e->in_cache = false;
-	push(mq, e);
+	return -ENODATA;
+}
 
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			     dm_cblock_t *cblock)
+{
+	int r;
+	struct mq_policy *mq = to_mq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = mq_writeback_work_(mq, oblock, cblock);
 	mutex_unlock(&mq->lock);
+
+	return r;
 }
 
 static void force_mapping(struct mq_policy *mq,
@@ -986,6 +1152,7 @@
 
 	del(mq, e);
 	e->oblock = new_oblock;
+	e->dirty = true;
 	push(mq, e);
 }
 
@@ -1057,10 +1224,12 @@ static void init_policy_functions(struct mq_policy *mq)
 	mq->policy.destroy = mq_destroy;
 	mq->policy.map = mq_map;
 	mq->policy.lookup = mq_lookup;
+	mq->policy.set_dirty = mq_set_dirty;
+	mq->policy.clear_dirty = mq_clear_dirty;
 	mq->policy.load_mapping = mq_load_mapping;
 	mq->policy.walk_mappings = mq_walk_mappings;
 	mq->policy.remove_mapping = mq_remove_mapping;
-	mq->policy.writeback_work = NULL;
+	mq->policy.writeback_work = mq_writeback_work;
 	mq->policy.force_mapping = mq_force_mapping;
 	mq->policy.residency = mq_residency;
 	mq->policy.tick = mq_tick;
@@ -1093,7 +1262,9 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 	mq->find_free_last_word = 0;
 
 	queue_init(&mq->pre_cache);
-	queue_init(&mq->cache);
+	queue_init(&mq->cache_clean);
+	queue_init(&mq->cache_dirty);
+
 	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
 
 	mq->nr_entries = 2 * from_cblock(cache_size);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 33369ca..63fc042 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -130,8 +130,8 @@ struct dm_cache_policy {
 	 *
 	 * Must not block.
 	 *
-	 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
-	 * (-EWOULDBLOCK would be typical).
+	 * Returns 0 if in cache, -ENOENT if not, < 0 for other errors (-EWOULDBLOCK
+	 * would be typical).
 	 */
 	int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2956976..022e926 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -61,6 +61,35 @@ static void free_bitset(unsigned long *bits)
 
 /*----------------------------------------------------------------*/
 
+/*
+ * There are a couple of places where we let a bio run, but want to do some
+ * work before calling its endio function.  We do this by temporarily
+ * changing the endio fn.
+ */
+struct hook_info {
+	bio_end_io_t *bi_end_io;
+	void *bi_private;
+};
+
+static void hook_bio(struct hook_info *h, struct bio *bio,
+		     bio_end_io_t *bi_end_io,
+		     void *bi_private)
+{
+	h->bi_end_io = bio->bi_end_io;
+	h->bi_private = bio->bi_private;
+
+	bio->bi_end_io = bi_end_io;
+	bio->bi_private = bi_private;
+}
+
+static void unhook_bio(struct hook_info *h, struct bio *bio)
+{
+	bio->bi_end_io = h->bi_end_io;
+	bio->bi_private = h->bi_private;
+}
+
+/*----------------------------------------------------------------*/
+
 #define PRISON_CELLS 1024
 #define MIGRATION_POOL_SIZE 128
 #define COMMIT_PERIOD HZ
@@ -211,7 +240,7 @@ struct per_bio_data {
 	 */
 	struct cache *cache;
 	dm_cblock_t cblock;
-	bio_end_io_t *saved_bi_end_io;
+	struct hook_info hook_info;
 	struct dm_bio_details bio_details;
 };
 
@@ -228,6 +257,7 @@ struct dm_cache_migration {
 	bool writeback:1;
 	bool demote:1;
 	bool promote:1;
+	bool requeue_holder:1;
 
 	struct dm_bio_prison_cell *old_ocell;
 	struct dm_bio_prison_cell *new_ocell;
@@ -662,7 +692,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
 static void writethrough_endio(struct bio *bio, int err)
 {
 	struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
-	bio->bi_end_io = pb->saved_bi_end_io;
+	unhook_bio(&pb->hook_info, bio);
 
 	if (err) {
 		bio_endio(bio, err);
@@ -693,9 +723,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 
 	pb->cache = cache;
 	pb->cblock = cblock;
-	pb->saved_bi_end_io = bio->bi_end_io;
+	hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
 	dm_bio_record(&pb->bio_details, bio);
-	bio->bi_end_io = writethrough_endio;
 
 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
 }
@@ -782,6 +811,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 	unsigned long flags;
 	struct cache *cache = mg->cache;
 
+	/* FIXME: what if mg->err? */
+
 	if (mg->writeback) {
 		cell_defer(cache, mg->old_ocell, false);
 		clear_dirty(cache, mg->old_oblock, mg->cblock);
@@ -836,7 +867,12 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
 			cleanup_migration(mg);
 
 	} else {
-		cell_defer(cache, mg->new_ocell, true);
+		if (mg->requeue_holder)
+			cell_defer(cache, mg->new_ocell, true);
+		else {
+			bio_endio(mg->new_ocell->holder, 0);
+			cell_defer(cache, mg->new_ocell, false);
+		}
 		clear_dirty(cache, mg->new_oblock, mg->cblock);
 		cleanup_migration(mg);
 	}
@@ -885,6 +921,42 @@ static void issue_copy_real(struct dm_cache_migration *mg)
 		migration_failure(mg);
 }
 
+static void overwrite_endio(struct bio *bio, int err)
+{
+	struct dm_cache_migration *mg = bio->bi_private;
+	struct cache *cache = mg->cache;
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+	unsigned long flags;
+
+	if (err)
+		mg->err = true;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_add_tail(&mg->list, &cache->completed_migrations);
+	unhook_bio(&pb->hook_info, bio);
+	mg->requeue_holder = false;
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	wake_worker(cache);
+}
+
+static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(mg->cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
+	remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+	generic_make_request(bio);
+}
+
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+	return (bio_data_dir(bio) == WRITE) &&
+		(bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
+}
+
 static void avoid_copy(struct dm_cache_migration *mg)
 {
 	atomic_inc(&mg->cache->stats.copies_avoided);
@@ -899,8 +971,17 @@ static void issue_copy(struct dm_cache_migration *mg)
 	if (mg->writeback || mg->demote)
 		avoid = !is_dirty(cache, mg->cblock) ||
 			is_discarded_oblock(cache, mg->old_oblock);
-	else
+	else {
+		struct bio *bio = mg->new_ocell->holder;
+
 		avoid = is_discarded_oblock(cache, mg->new_oblock);
+#if 0
+		if (!avoid && bio_writes_complete_block(cache, bio)) {
+			issue_overwrite(mg, bio);
+			return;
+		}
+#endif
+	}
 
 	avoid ? avoid_copy(mg) : issue_copy_real(mg);
 }
@@ -991,6 +1072,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
 	mg->writeback = false;
 	mg->demote = false;
 	mg->promote = true;
+	mg->requeue_holder = true;
 	mg->cache = cache;
 	mg->new_oblock = oblock;
 	mg->cblock = cblock;
@@ -1012,6 +1094,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
 	mg->writeback = true;
 	mg->demote = false;
 	mg->promote = false;
+	mg->requeue_holder = true;
 	mg->cache = cache;
 	mg->old_oblock = oblock;
 	mg->cblock = cblock;
@@ -1035,6 +1118,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
 	mg->writeback = false;
 	mg->demote = true;
 	mg->promote = true;
+	mg->requeue_holder = true;
 	mg->cache = cache;
 	mg->old_oblock = old_oblock;
 	mg->new_oblock = new_oblock;
@@ -2005,7 +2089,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	atomic_set(&cache->nr_migrations, 0);
 	init_waitqueue_head(&cache->migration_wait);
 
-	r = -ENOMEM;
 	cache->nr_dirty = 0;
 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
 	if (!cache->dirty_bitset) {
@@ -2198,6 +2281,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
 		       bio, &lookup_result);
 	if (r == -EWOULDBLOCK) {
+		// FIXME: we should check to see if there's any spare migration bandwidth here
 		cell_defer(cache, cell, true);
 		return DM_MAPIO_SUBMITTED;
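
As a rough illustration of the policy change above (this is not part of the patch and not kernel code): the sketch below models the clean/dirty queue split in plain user-space C. The single-level FIFO, the demote_block()/writeback_work() names and the main() driver are invented for this example; only the behaviour mirrors the patch, namely that push() routes entries by their dirty flag, demotion takes a clean entry or reports -ENOSPC, and writeback work pops the oldest dirty entry, cleans it and re-queues it on the clean queue.

/*
 * Illustrative sketch only -- not the kernel code above.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	unsigned block;
	bool dirty;
	struct entry *next;
};

/* Single-level FIFO; the real policy keeps NR_QUEUE_LEVELS of these. */
struct queue {
	struct entry *head, **tail;
};

static void queue_init(struct queue *q)
{
	q->head = NULL;
	q->tail = &q->head;
}

static void queue_push(struct queue *q, struct entry *e)
{
	e->next = NULL;
	*q->tail = e;
	q->tail = &e->next;
}

/* Remove and return the oldest entry, or NULL if the queue is empty. */
static struct entry *queue_pop(struct queue *q)
{
	struct entry *e = q->head;

	if (e) {
		q->head = e->next;
		if (!q->head)
			q->tail = &q->head;
	}
	return e;
}

static struct queue cache_clean, cache_dirty;

/* Counterpart of push() in the patch: route by the dirty flag. */
static void push(struct entry *e)
{
	queue_push(e->dirty ? &cache_dirty : &cache_clean, e);
}

/* Demotion never waits for writeback: clean entries only. */
static int demote_block(unsigned *block)
{
	struct entry *e = queue_pop(&cache_clean);

	if (!e)
		return -ENOSPC;
	*block = e->block;
	free(e);
	return 0;
}

/* Writeback work: the oldest dirty entry becomes clean and is re-queued. */
static int writeback_work(unsigned *block)
{
	struct entry *e = queue_pop(&cache_dirty);

	if (!e)
		return -ENODATA;
	*block = e->block;
	e->dirty = false;
	push(e);
	return 0;
}

int main(void)
{
	unsigned i, b;

	queue_init(&cache_clean);
	queue_init(&cache_dirty);

	for (i = 0; i < 4; i++) {
		struct entry *e = malloc(sizeof(*e));

		if (!e)
			return 1;
		e->block = i;
		e->dirty = (i % 2) != 0;	/* odd blocks start out dirty */
		push(e);
	}

	while (writeback_work(&b) == 0)
		printf("wrote back block %u, now clean\n", b);

	while (demote_block(&b) == 0)
		printf("demoted clean block %u\n", b);

	return 0;
}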