--- drivers/md/dm-raid1.c | 158 +++++++++++++++------- drivers/md/dm-region-hash.c | 311 +++++++++++++++++++++----------------------- 2 files changed, 261 insertions(+), 208 deletions(-) Index: linux-2.6.29-rc7/drivers/md/dm-raid1.c =================================================================== --- linux-2.6.29-rc7.orig/drivers/md/dm-raid1.c 2009-03-05 12:08:57.000000000 +0000 +++ linux-2.6.29-rc7/drivers/md/dm-raid1.c 2009-03-05 12:10:34.000000000 +0000 @@ -109,11 +109,6 @@ static void delayed_wake(struct mirror_s add_timer(&ms->timer); } -static void wakeup_all_recovery_waiters(void *context) -{ - wake_up_all(&_kmirrord_recovery_stopped); -} - static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) { unsigned long flags; @@ -139,6 +134,11 @@ static void dispatch_bios(void *context, queue_bio(ms, bio, WRITE); } +static region_t bio_to_region(struct mirror_set *ms, struct bio *bio) +{ + return dm_rh_sector_to_region(ms->rh, bio->bi_sector - ms->ti->begin); +} + #define MIN_READ_RECORDS 20 struct dm_raid1_read_record { struct mirror *m; @@ -167,7 +167,7 @@ static void bio_set_m(struct bio *bio, s static struct mirror *get_default_mirror(struct mirror_set *ms) { - return &ms->mirror[atomic_read(&ms->default_mirror)]; + return ms->mirror + atomic_read(&ms->default_mirror); } static void set_default_mirror(struct mirror *m) @@ -247,8 +247,7 @@ static void recovery_complete(int read_e void *context) { struct dm_region *reg = context; - struct mirror_set *ms = dm_rh_region_context(reg); - int m, bit = 0; + struct mirror_set *ms = dm_rh_get_region_context(reg); if (read_err) { /* Read error means the failure of default mirror. */ @@ -257,15 +256,18 @@ static void recovery_complete(int read_e } if (write_err) { + int bit, m; + DMERR_LIMIT("Write error during recovery (error = 0x%lx)", write_err); /* * Bits correspond to devices (excluding default mirror). * The default mirror cannot change during recovery. 
*/ - for (m = 0; m < ms->nr_mirrors; m++) { + for (bit = m = 0; m < ms->nr_mirrors; m++) { if (&ms->mirror[m] == get_default_mirror(ms)) continue; + if (test_bit(bit, &write_err)) fail_mirror(ms->mirror + m, DM_RAID1_SYNC_ERROR); @@ -273,27 +275,27 @@ static void recovery_complete(int read_e } } - dm_rh_recovery_end(reg, !(read_err || write_err)); + dm_rh_recovery_end(ms->rh, reg, read_err || write_err); } static int recover(struct mirror_set *ms, struct dm_region *reg) { int r; unsigned i; - struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; - struct mirror *m; unsigned long flags = 0; region_t key = dm_rh_get_region_key(reg); - sector_t region_size = dm_rh_get_region_size(ms->rh); + struct dm_region_hash *rh = ms->rh; + sector_t region_size = dm_rh_get_region_size(rh); + struct mirror *m; + struct dm_io_region from, to[ms->nr_mirrors - 1], *dest; /* fill in the source */ m = get_default_mirror(ms); from.bdev = m->dev->bdev; - from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); - if (key == (ms->nr_regions - 1)) { + from.sector = m->offset + dm_rh_region_to_sector(rh, key); + if (key == ms->nr_regions - 1) { /* - * The final region may be smaller than - * region_size. + * The final region may be smaller than region_size. */ from.count = ms->ti->len & (region_size - 1); if (!from.count) @@ -303,28 +305,31 @@ static int recover(struct mirror_set *ms /* fill in the destinations */ for (i = 0, dest = to; i < ms->nr_mirrors; i++) { - if (&ms->mirror[i] == get_default_mirror(ms)) + m = ms->mirror + i; + if (m == get_default_mirror(ms)) continue; - m = ms->mirror + i; dest->bdev = m->dev->bdev; - dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); + dest->sector = m->offset + dm_rh_region_to_sector(rh, key); dest->count = from.count; dest++; } - /* hand to kcopyd */ + /* Keep mirror set reference in region context for callback function. */ + dm_rh_set_region_context(reg, ms); + + /* Hand to kcopyd. 
*/ if (!errors_handled(ms)) set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, recovery_complete, reg); - return r; } static void do_recovery(struct mirror_set *ms) { + struct dm_region_hash *rh = ms->rh; struct dm_region *reg; struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); int r; @@ -332,15 +337,17 @@ static void do_recovery(struct mirror_se /* * Start quiescing some regions. */ - dm_rh_recovery_prepare(ms->rh); + r = dm_rh_recovery_prepare(rh); + if (r == -ESRCH) + wake_up_all(&_kmirrord_recovery_stopped); /* * Copy any already quiesced regions. */ - while ((reg = dm_rh_recovery_start(ms->rh))) { + while ((reg = dm_rh_recovery_start(rh))) { r = recover(ms, reg); if (r) - dm_rh_recovery_end(reg, 0); + dm_rh_recovery_end(rh, reg, r); } /* @@ -461,13 +468,14 @@ static void read_async_bio(struct mirror BUG_ON(dm_io(&io_req, 1, &io, NULL)); } -static inline int region_in_sync(struct mirror_set *ms, region_t region, - int may_block) +static inline int region_in_sync(struct dm_region_hash *rh, + region_t region, int may_block) { - int state = dm_rh_get_state(ms->rh, region, may_block); + int state = dm_rh_get_state(rh, region, may_block); return state == DM_RH_CLEAN || state == DM_RH_DIRTY; } + static void do_reads(struct mirror_set *ms, struct bio_list *reads) { region_t region; @@ -481,7 +489,7 @@ static void do_reads(struct mirror_set * /* * We can only read balance if the region is in sync. 
*/ - if (likely(region_in_sync(ms, region, 1))) + if (likely(region_in_sync(ms->rh, region, 1))) m = choose_mirror(ms, bio->bi_sector); else if (m && atomic_read(&m->error_count)) m = NULL; @@ -504,6 +512,56 @@ static void do_reads(struct mirror_set * * NOSYNC: increment pending, just write to the default mirror *---------------------------------------------------------------*/ +/* __bio_mark_nosync + * @ms + * @bio + * @done + * @error + * + * The bio was written on some mirror(s) but failed on other mirror(s). + * We can successfully endio the bio but should avoid the region being + * marked clean by setting the state DM_RH_NOSYNC. + * + * This function is _not_ safe in interrupt context! + */ +static void __bio_mark_nosync(struct mirror_set *ms, struct bio *bio) +{ + struct dm_region_hash *rh = ms->rh; + region_t region = bio_to_region(ms, bio); + int recovering = dm_rh_get_state(rh, region, 0) == DM_RH_RECOVERING; + + ms->in_sync = 0; + + /* + * Region hash entry should exist because write was in-flight. + * + * The log'll be informed about the state change via the region hash. + */ + dm_rh_set_state(rh, region, DM_RH_NOSYNC, 0); + + /* + * Possible cases: + * 1) DM_RH_DIRTY + * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed + * 3) DM_RH_RECOVERING: flushing pending writes + * Either case, the region should have not been connected to list. + */ + bio_endio(bio, 0); + if (recovering) { + /* + * Dispatch the bios before we call 'wake_up_all'. + * This is important because if we are suspending, + * we want to know that recovery is complete and + * the work queue is flushed. If we wake_up_all + * before we dispatch_bios (queue bios and call wake()), + * then we risk suspending before the work queue + * has been properly flushed. 
+ */ + dm_rh_dispatch_bios(rh, region, 0); + if (!dm_rh_recovery_in_flight(rh)) + wake_up_all(&_kmirrord_recovery_stopped); + } +} static void write_callback(unsigned long error, void *context) { @@ -545,6 +603,7 @@ static void write_callback(unsigned long spin_lock_irqsave(&ms->lock, flags); if (!ms->failures.head) should_wake = 1; + bio_list_add(&ms->failures, bio); spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) @@ -577,13 +636,13 @@ static void do_write(struct mirror_set * * to the mirror set in write_callback(). */ bio_set_m(bio, get_default_mirror(ms)); - BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); } static void do_writes(struct mirror_set *ms, struct bio_list *writes) { - int state; + enum dm_rh_region_states state; + struct dm_region_hash *rh = ms->rh; struct bio *bio; struct bio_list sync, nosync, recover, *this_list = NULL; struct bio_list requeue; @@ -602,7 +661,7 @@ static void do_writes(struct mirror_set bio_list_init(&requeue); while ((bio = bio_list_pop(writes))) { - region = dm_rh_bio_to_region(ms->rh, bio); + region = bio_to_region(ms, bio); if (log->type->is_remote_recovering && log->type->is_remote_recovering(log, region)) { @@ -644,9 +703,9 @@ static void do_writes(struct mirror_set * be written to (writes to recover regions are going to * be delayed). */ - dm_rh_inc_pending(ms->rh, &sync); - dm_rh_inc_pending(ms->rh, &nosync); - ms->log_failure = dm_rh_flush(ms->rh) ? 1 : 0; + dm_rh_inc_pending(rh, &sync); + dm_rh_inc_pending(rh, &nosync); + ms->log_failure = dm_rh_flush(rh) ? 1 : 0; /* * Dispatch io. 
@@ -661,7 +720,7 @@ static void do_writes(struct mirror_set do_write(ms, bio); while ((bio = bio_list_pop(&recover))) - dm_rh_delay(ms->rh, bio); + dm_rh_delay(rh, bio); while ((bio = bio_list_pop(&nosync))) { map_bio(get_default_mirror(ms), bio); @@ -677,10 +736,8 @@ static void do_failures(struct mirror_se return; if (!ms->log_failure) { - while ((bio = bio_list_pop(failures))) { - ms->in_sync = 0; - dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); - } + while ((bio = bio_list_pop(failures))) + __bio_mark_nosync(ms, bio); return; } @@ -802,10 +859,8 @@ static struct mirror_set *alloc_context( return NULL; } - ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, - wakeup_all_recovery_waiters, - ms->ti->begin, MAX_RECOVERY, - dl, region_size, ms->nr_regions); + ms->rh = dm_region_hash_create(MAX_RECOVERY, dispatch_bios, ms, wakeup_mirrord, + ms, dl, region_size, ms->nr_regions); if (IS_ERR(ms->rh)) { ti->error = "Error creating dirty region hash"; dm_io_client_destroy(ms->io_client); @@ -858,8 +913,8 @@ static int get_mirror(struct mirror_set * Create dirty log: log_type #log_params */ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, - unsigned argc, char **argv, - unsigned *args_used) + unsigned argc, char **argv, + unsigned *args_used) { unsigned param_count; struct dm_dirty_log *dl; @@ -1061,12 +1116,12 @@ static int mirror_map(struct dm_target * if (rw == WRITE) { /* Save region for mirror_end_io() handler */ - map_context->ll = dm_rh_bio_to_region(ms->rh, bio); + map_context->ll = bio_to_region(ms, bio); queue_bio(ms, bio, rw); return DM_MAPIO_SUBMITTED; } - r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); + r = log->type->in_sync(log, bio_to_region(ms, bio), 0); if (r < 0 && r != -EWOULDBLOCK) return r; @@ -1114,7 +1169,11 @@ static int mirror_end_io(struct dm_targe * We need to dec pending if this was a write. 
*/ if (rw == WRITE) { - dm_rh_dec(ms->rh, map_context->ll); + int r = dm_rh_dec(ms->rh, map_context->ll); + + if (r) + wakeup_mirrord(ms); + return error; } @@ -1215,6 +1274,7 @@ static void mirror_resume(struct dm_targ if (log->type->resume && log->type->resume(log)) /* FIXME: need better error handling */ DMWARN("log resume failed"); + dm_rh_start_recovery(ms->rh); } Index: linux-2.6.29-rc7/drivers/md/dm-region-hash.c =================================================================== --- linux-2.6.29-rc7.orig/drivers/md/dm-region-hash.c 2009-03-05 12:08:41.000000000 +0000 +++ linux-2.6.29-rc7/drivers/md/dm-region-hash.c 2009-03-05 12:10:34.000000000 +0000 @@ -21,9 +21,15 @@ /*----------------------------------------------------------------- * Region hash * - * The mirror splits itself up into discrete regions. Each - * region can be in one of three states: clean, dirty, - * nosync. There is no need to put clean regions in the hash. + * A storage set (eg. RAID1, RAID5) splits itself up into discrete regions. + * Each region can be in one of three states: + * + * o clean + * o dirty, + * o nosync. + * + * There is no need to put clean regions in the hash. + * * * In addition to being present in the hash table a region _may_ * be present on one of three lists. @@ -34,14 +40,13 @@ * hash table. * * quiesced_regions: These regions have been spun down, ready - * for recovery. rh_recovery_start() will remove regions from - * this list and hand them to kmirrord, which will schedule the - * recovery io with kcopyd. + * for recovery. dm_rh_recovery_start() will remove regions from + * this list and hand them to the caller, which will schedule the + * recovery io. * - * recovered_regions: Regions that kcopyd has successfully + * recovered_regions: Regions that the caller has successfully * recovered. dm_rh_update_states() will now schedule any delayed - * io, up the recovery_count, and remove the region from the - * hash. 
+ * io, up the recovery_count, and remove the region from the hash. * * There are 2 locks: * A rw spin lock 'hash_lock' protects just the hash table, @@ -55,6 +60,14 @@ * context, so all other uses will have to suspend local irqs. *---------------------------------------------------------------*/ struct dm_region_hash { + /* Callback function to dispatch queued writes on recovered regions. */ + void (*dispatch)(void *context, struct bio_list *bios, int error); + void *dispatch_context; + + /* Callback function to wakeup callers worker thread. */ + void (*wakeup_mirrord)(void *context); + void *wake_context; + uint32_t region_size; unsigned region_shift; @@ -79,24 +92,12 @@ struct dm_region_hash { struct list_head quiesced_regions; struct list_head recovered_regions; struct list_head failed_recovered_regions; - - void *context; - sector_t target_begin; - - /* Callback function to schedule bios writes */ - void (*dispatch_bios)(void *context, struct bio_list *bios); - - /* Callback function to wakeup callers worker thread. */ - void (*wakeup_workers)(void *context); - - /* Callback function to wakeup callers recovery waiters. */ - void (*wakeup_all_recovery_waiters)(void *context); }; struct dm_region { - struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ region_t key; - int state; + enum dm_rh_region_states state; + void *context; /* Caller context. 
*/ struct list_head hash_list; struct list_head list; @@ -113,24 +114,21 @@ static region_t dm_rh_sector_to_region(s return sector >> rh->region_shift; } -sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) -{ - return region << rh->region_shift; -} -EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); - region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) { - return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); + return dm_rh_sector_to_region(rh, bio->bi_sector); } EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); -void *dm_rh_region_context(struct dm_region *reg) +sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) { - return reg->rh->context; + return region << rh->region_shift; } -EXPORT_SYMBOL_GPL(dm_rh_region_context); +EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); +/* + * Retrieval fns. + */ region_t dm_rh_get_region_key(struct dm_region *reg) { return reg->key; @@ -184,18 +182,17 @@ struct dm_region_hash *dm_region_hash_cr return ERR_PTR(-ENOMEM); } - rh->context = context; - rh->dispatch_bios = dispatch_bios; - rh->wakeup_workers = wakeup_workers; - rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; - rh->target_begin = target_begin; rh->max_recovery = max_recovery; + rh->dispatch = dispatch; + rh->dispatch_context = dispatch_context; + rh->wakeup_mirrord = wakeup_mirrord; + rh->wake_context = wake_context; rh->log = log; rh->region_size = region_size; rh->region_shift = ffs(region_size) - 1; rwlock_init(&rh->hash_lock); rh->mask = nr_buckets - 1; rh->nr_buckets = nr_buckets; rh->shift = ffs(nr_buckets); if (rh->shift - 1 > ARRAY_SIZE(dm_region_hash_primes)) @@ -240,9 +235,9 @@ void dm_region_hash_destroy(struct dm_re struct dm_region *reg, *nreg; BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { - list_for_each_entry_safe(reg, nreg, rh->buckets + h, - hash_list) { + list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) { 
BUG_ON(atomic_read(&reg->pending)); mempool_free(reg, rh->region_pool); } @@ -297,7 +292,6 @@ static struct dm_region *__rh_alloc(stru nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? DM_RH_CLEAN : DM_RH_NOSYNC; - nreg->rh = rh; nreg->key = region; INIT_LIST_HEAD(&nreg->list); atomic_set(&nreg->pending, 0); @@ -363,88 +357,35 @@ int dm_rh_get_state(struct dm_region_has } EXPORT_SYMBOL_GPL(dm_rh_get_state); -static void complete_resync_work(struct dm_region *reg, int success) +void dm_rh_set_state(struct dm_region_hash *rh, region_t region, + enum dm_rh_region_states state, int may_block) { - struct dm_region_hash *rh = reg->rh; - - rh->log->type->set_region_sync(rh->log, reg->key, success); - - /* - * Dispatch the bios before we call 'wake_up_all'. - * This is important because if we are suspending, - * we want to know that recovery is complete and - * the work queue is flushed. If we wake_up_all - * before we dispatch_bios (queue bios and call wake()), - * then we risk suspending before the work queue - * has been properly flushed. - */ - rh->dispatch_bios(rh->context, &reg->delayed_bios); - if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); - up(&rh->recovery_count); -} - -/* dm_rh_mark_nosync - * @ms - * @bio - * @done - * @error - * - * The bio was written on some mirror(s) but failed on other mirror(s). - * We can successfully endio the bio but should avoid the region being - * marked clean by setting the state DM_RH_NOSYNC. - * - * This function is _not_ safe in interrupt context! - */ -void dm_rh_mark_nosync(struct dm_region_hash *rh, - struct bio *bio, unsigned done, int error) -{ - unsigned long flags; - struct dm_dirty_log *log = rh->log; struct dm_region *reg; - region_t region = dm_rh_bio_to_region(rh, bio); - int recovering = 0; + struct dm_dirty_log *log = rh->log; - /* We must inform the log that the sync count has changed. 
*/ - log->type->set_region_sync(log, region, 0); + if (state == DM_RH_NOSYNC) + log->type->set_region_sync(log, region, 0); + else if (state == DM_RH_CLEAN) + log->type->clear_region(log, region); + else if (state == DM_RH_DIRTY) + log->type->mark_region(log, region); read_lock(&rh->hash_lock); reg = __rh_find(rh, region); + reg->state = state; read_unlock(&rh->hash_lock); - - /* region hash entry should exist because write was in-flight */ - BUG_ON(!reg); - BUG_ON(!list_empty(&reg->list)); - - spin_lock_irqsave(&rh->region_lock, flags); - /* - * Possible cases: - * 1) DM_RH_DIRTY - * 2) DM_RH_NOSYNC: was dirty, other preceeding writes failed - * 3) DM_RH_RECOVERING: flushing pending writes - * Either case, the region should have not been connected to list. - */ - recovering = (reg->state == DM_RH_RECOVERING); - reg->state = DM_RH_NOSYNC; - BUG_ON(!list_empty(&reg->list)); - spin_unlock_irqrestore(&rh->region_lock, flags); - - bio_endio(bio, error); - if (recovering) - complete_resync_work(reg, 0); } -EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); +EXPORT_SYMBOL_GPL(dm_rh_set_state); void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) { struct dm_region *reg, *next; - LIST_HEAD(clean); LIST_HEAD(recovered); LIST_HEAD(failed_recovered); /* - * Quickly grab the lists. + * Quickly grab the lists and remove any regions from hash. 
*/ write_lock_irq(&rh->hash_lock); spin_lock(&rh->region_lock); @@ -466,7 +407,7 @@ void dm_rh_update_states(struct dm_regio list_splice_init(&rh->failed_recovered_regions, &failed_recovered); - list_for_each_entry(reg, &failed_recovered, list) + list_for_each_entry(reg, &failed_recovered, list) list_del(&reg->hash_list); } @@ -480,12 +421,24 @@ void dm_rh_update_states(struct dm_regio */ list_for_each_entry_safe(reg, next, &recovered, list) { rh->log->type->clear_region(rh->log, reg->key); - complete_resync_work(reg, 1); + rh->log->type->set_region_sync(rh->log, reg->key, 1); + + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + &reg->delayed_bios, 0); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } list_for_each_entry_safe(reg, next, &failed_recovered, list) { - complete_resync_work(reg, errors_handled ? 0 : 1); + rh->log->type->set_region_sync(rh->log, reg->key, + errors_handled ? 0 : 1); + if (reg->delayed_bios.head) + rh->dispatch(rh->dispatch_context, + &reg->delayed_bios, -EIO); + + up(&rh->recovery_count); mempool_free(reg, rh->region_pool); } @@ -494,54 +447,53 @@ void dm_rh_update_states(struct dm_regio mempool_free(reg, rh->region_pool); } - rh->log->type->flush(rh->log); + dm_rh_flush(rh); } EXPORT_SYMBOL_GPL(dm_rh_update_states); -static void rh_inc(struct dm_region_hash *rh, region_t region) +void dm_rh_inc(struct dm_region_hash *rh, region_t region) { struct dm_region *reg; read_lock(&rh->hash_lock); reg = __rh_find(rh, region); - - spin_lock_irq(&rh->region_lock); - atomic_inc(&reg->pending); - if (reg->state == DM_RH_CLEAN) { - reg->state = DM_RH_DIRTY; - list_del_init(&reg->list); /* take off the clean list */ - spin_unlock_irq(&rh->region_lock); - rh->log->type->mark_region(rh->log, reg->key); - } else - spin_unlock_irq(&rh->region_lock); + spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_DIRTY; + list_del_init(&reg->list); /* Take off the clean list. 
*/ + spin_unlock_irq(&rh->region_lock); + } + atomic_inc(&reg->pending); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_inc); void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) { struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) - rh_inc(rh, dm_rh_bio_to_region(rh, bio)); + dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } EXPORT_SYMBOL_GPL(dm_rh_inc_pending); -void dm_rh_dec(struct dm_region_hash *rh, region_t region) +int dm_rh_dec(struct dm_region_hash *rh, region_t region) { - unsigned long flags; + int r = 0; struct dm_region *reg; - int should_wake = 0; read_lock(&rh->hash_lock); reg = __rh_lookup(rh, region); read_unlock(&rh->hash_lock); - spin_lock_irqsave(&rh->region_lock, flags); + BUG_ON(!reg); + if (atomic_dec_and_test(&reg->pending)) { + unsigned long flags; + /* * There is no pending I/O for this region. * We can move the region to corresponding list for next action. @@ -553,19 +505,19 @@ void dm_rh_dec(struct dm_region_hash *rh * until the region is recovered or the map is reloaded. */ - /* do nothing for DM_RH_NOSYNC */ - if (reg->state == DM_RH_RECOVERING) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == DM_RH_RECOVERING) list_add_tail(&reg->list, &rh->quiesced_regions); - } else if (reg->state == DM_RH_DIRTY) { + else if (reg->state == DM_RH_DIRTY) { reg->state = DM_RH_CLEAN; list_add(&reg->list, &rh->clean_regions); } - should_wake = 1; + spin_unlock_irqrestore(&rh->region_lock, flags); + + r = 1; } - spin_unlock_irqrestore(&rh->region_lock, flags); - if (should_wake) - rh->wakeup_workers(rh->context); + return r; } EXPORT_SYMBOL_GPL(dm_rh_dec); @@ -586,44 +538,49 @@ static int __rh_recovery_prepare(struct return r; /* - * Get this region, and start it quiescing by setting the - * recovering flag. + * Get this region, and start it quiescing by setting + * the recovering flag. 
*/ read_lock(&rh->hash_lock); reg = __rh_find(rh, region); read_unlock(&rh->hash_lock); spin_lock_irq(&rh->region_lock); + reg->state = DM_RH_RECOVERING; /* Already quiesced ? */ - if (atomic_read(&reg->pending)) - list_del_init(&reg->list); - else - list_move(&reg->list, &rh->quiesced_regions); + list_del_init(&reg->list); + if (!atomic_read(&reg->pending)) + list_add(&reg->list, &rh->quiesced_regions); spin_unlock_irq(&rh->region_lock); - return 1; } -void dm_rh_recovery_prepare(struct dm_region_hash *rh) +int dm_rh_recovery_prepare(struct dm_region_hash *rh) { - /* Extra reference to avoid race with dm_rh_stop_recovery */ + int r = 0; + + /* Extra reference to avoid race with dm_rh_stop_recovery */ atomic_inc(&rh->recovery_in_flight); while (!down_trylock(&rh->recovery_count)) { atomic_inc(&rh->recovery_in_flight); + if (__rh_recovery_prepare(rh) <= 0) { atomic_dec(&rh->recovery_in_flight); up(&rh->recovery_count); + r = -ENOENT; break; } } /* Drop the extra reference */ if (atomic_dec_and_test(&rh->recovery_in_flight)) - rh->wakeup_all_recovery_waiters(rh->context); + r = -ESRCH; + + return r; } EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); @@ -638,28 +595,32 @@ struct dm_region *dm_rh_recovery_start(s if (!list_empty(&rh->quiesced_regions)) { reg = list_entry(rh->quiesced_regions.next, struct dm_region, list); - list_del_init(&reg->list); /* remove from the quiesced list */ + list_del_init(&reg->list); /* Remove from the quiesced list. */ } - spin_unlock_irq(&rh->region_lock); + spin_unlock_irq(&rh->region_lock); return reg; } EXPORT_SYMBOL_GPL(dm_rh_recovery_start); -void dm_rh_recovery_end(struct dm_region *reg, int success) +/* + * Put region on list of recovered ones. 
+ */ +void dm_rh_recovery_end(struct dm_region_hash *rh, struct dm_region *reg, + int error) { - struct dm_region_hash *rh = reg->rh; - spin_lock_irq(&rh->region_lock); - if (success) - list_add(&reg->list, &reg->rh->recovered_regions); - else { + if (error) { reg->state = DM_RH_NOSYNC; - list_add(&reg->list, &reg->rh->failed_recovered_regions); - } + list_add(&reg->list, &rh->failed_recovered_regions); + } else + list_add(&reg->list, &rh->recovered_regions); + + atomic_dec(&rh->recovery_in_flight); spin_unlock_irq(&rh->region_lock); - rh->wakeup_workers(rh->context); + rh->wakeup_mirrord(rh->wake_context); + BUG_ON(atomic_read(&rh->recovery_in_flight) < 0); } EXPORT_SYMBOL_GPL(dm_rh_recovery_end); @@ -676,21 +637,53 @@ int dm_rh_flush(struct dm_region_hash *r } EXPORT_SYMBOL_GPL(dm_rh_flush); -void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +void dm_rh_delay_by_region(struct dm_region_hash *rh, + struct bio *bio, region_t region) { struct dm_region *reg; + /* FIXME: locking. */ read_lock(&rh->hash_lock); - reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); + reg = __rh_find(rh, region); bio_list_add(&reg->delayed_bios, bio); read_unlock(&rh->hash_lock); } +EXPORT_SYMBOL_GPL(dm_rh_delay_by_region); + +void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) +{ + dm_rh_delay_by_region(rh, bio, + dm_rh_bio_to_region(rh, bio)); +} EXPORT_SYMBOL_GPL(dm_rh_delay); +void dm_rh_dispatch_bios(struct dm_region_hash *rh, + region_t region, int error) +{ + struct dm_region *reg; + struct bio_list delayed_bios; + + /* FIXME: locking. 
*/ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + BUG_ON(!reg); + delayed_bios = reg->delayed_bios; + bio_list_init(&reg->delayed_bios); + read_unlock(&rh->hash_lock); + + if (delayed_bios.head) + rh->dispatch(rh->dispatch_context, &delayed_bios, error); + + up(&rh->recovery_count); +} +EXPORT_SYMBOL_GPL(dm_rh_dispatch_bios); + void dm_rh_stop_recovery(struct dm_region_hash *rh) { int i; + rh->wakeup_mirrord(rh->wake_context); + /* wait for any recovering regions */ for (i = 0; i < rh->max_recovery; i++) down(&rh->recovery_count); @@ -704,7 +697,7 @@ void dm_rh_start_recovery(struct dm_regi for (i = 0; i < rh->max_recovery; i++) up(&rh->recovery_count); - rh->wakeup_workers(rh->context); + rh->wakeup_mirrord(rh->wake_context); } EXPORT_SYMBOL_GPL(dm_rh_start_recovery);