From: Joe Thornber Add read-only and fail-io modes to thin provisioning. If a transaction commit fails the pool's metadata device will transition to "read-only" mode. If a commit fails once already in read-only mode the transition to "fail-io" mode occurs. Once in fail-io mode the pool and all associated thin devices will report a status of "Fail". Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/thin-provisioning.txt | 24 + drivers/md/dm-thin.c | 416 ++++++++++++++++------ 2 files changed, 344 insertions(+), 96 deletions(-) Index: linux-3.5/Documentation/device-mapper/thin-provisioning.txt =================================================================== --- linux-3.5.orig/Documentation/device-mapper/thin-provisioning.txt +++ linux-3.5/Documentation/device-mapper/thin-provisioning.txt @@ -231,6 +231,9 @@ i) Constructor no_discard_passdown: Don't pass discards down to the underlying data device, but just remove the mapping. + read_only: Don't allow any changes to be made to the pool + metadata. + Data block size must be between 64KB (128 sectors) and 1GB (2097152 sectors) inclusive. @@ -239,7 +242,7 @@ ii) Status / / - + [no_]discard_passdown ro|rw transaction id: A 64-bit number used by userspace to help synchronise with metadata @@ -257,6 +260,21 @@ ii) Status held root. This feature is not yet implemented so '-' is always returned. + discard_passdown|no_discard_passdown + Whether or not discards are actually being passed down to the + underlying device. When this is enabled when loading the table, + it can get disabled if the underlying device doesn't support it. + + ro|rw + If the pool encounters certain types of device failures it will + drop into a read-only metadata mode in which no changes to + the pool metadata (like allocating new blocks) are permitted. + + In serious cases where even a read-only mode is deemed unsafe + no further I/O will be permitted and the status will just + contain the string 'Fail'. The userspace recovery tools + should then be used. + iii) Messages create_thin @@ -329,3 +347,7 @@ regain some space then send the 'trim' m ii) Status + + If the pool has encountered device errors and failed, the status + will just contain the string 'Fail'. The userspace recovery + tools should then be used. Index: linux-3.5/drivers/md/dm-thin.c =================================================================== --- linux-3.5.orig/drivers/md/dm-thin.c +++ linux-3.5/drivers/md/dm-thin.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Red Hat UK. + * Copyright (C) 2011-2012 Red Hat UK. * * This file is released under the GPL. */ @@ -496,12 +496,27 @@ static void build_virtual_key(struct dm_ */ struct dm_thin_new_mapping; +/* + * The pool runs in 3 modes. Ordered in degraded order for comparisons. + */ +enum pool_mode { + PM_WRITE, /* metadata may be changed */ + PM_READ_ONLY, /* metadata may not be changed */ + PM_FAIL, /* all I/O fails */ +}; + struct pool_features { + enum pool_mode mode; + unsigned zero_new_blocks:1; unsigned discard_enabled:1; unsigned discard_passdown:1; }; +struct thin_c; +typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -542,8 +557,17 @@ struct pool { struct dm_thin_new_mapping *next_mapping; mempool_t *mapping_pool; mempool_t *endio_hook_pool; + + process_bio_fn process_bio; + process_bio_fn process_discard; + + process_mapping_fn process_prepared_mapping; + process_mapping_fn process_prepared_discard; }; +static enum pool_mode get_pool_mode(struct pool *pool); +static void set_pool_mode(struct pool *pool, enum pool_mode mode); + /* * Target context for a pool. */ @@ -718,16 +742,28 @@ static void issue(struct thin_c *tc, str struct pool *pool = tc->pool; unsigned long flags; + if (!bio_triggers_commit(tc, bio)) { + generic_make_request(bio); + return; + } + /* - * Batch together any FUA/FLUSH bios we find and then issue - * a single commit for them in process_deferred_bios(). + * Complete bio with an error if earlier I/O caused changes to + * the metadata that can't be committed e.g, due to I/O errors + * on the metadata device. */ - if (bio_triggers_commit(tc, bio)) { - spin_lock_irqsave(&pool->lock, flags); - bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); - } else - generic_make_request(bio); + if (dm_thin_aborted_changes(tc->td)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in process_deferred_bios(). + */ + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_flush_bios, bio); + spin_unlock_irqrestore(&pool->lock, flags); } static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) @@ -864,6 +900,14 @@ static void cell_defer_except(struct thi wake_worker(pool); } +static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) +{ + if (m->bio) + m->bio->bi_end_io = m->saved_bi_end_io; + cell_error(m->cell); + list_del(&m->list); + mempool_free(m, m->tc->pool->mapping_pool); +} static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; @@ -908,18 +952,20 @@ out: mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_discard(struct dm_thin_new_mapping *m) +static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) { - int r; struct thin_c *tc = m->tc; - r = dm_thin_remove_block(tc->td, m->virt_block); - if (r) - DMERR("dm_thin_remove_block() failed"); + bio_io_error(m->bio); + cell_defer_except(tc, m->cell); + cell_defer_except(tc, m->cell2); + mempool_free(m, tc->pool->mapping_pool); +} + +static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) +{ + struct thin_c *tc = m->tc; - /* - * Pass the discard down to the underlying device? - */ if (m->pass_discard) remap_and_issue(tc, m->bio, m->data_block); else @@ -930,8 +976,20 @@ static void process_prepared_discard(str mempool_free(m, tc->pool->mapping_pool); } +static void process_prepared_discard(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_block(tc->td, m->virt_block); + if (r) + DMERR("dm_thin_remove_block() failed"); + + process_prepared_discard_passdown(m); +} + static void process_prepared(struct pool *pool, struct list_head *head, - void (*fn)(struct dm_thin_new_mapping *)) + process_mapping_fn *fn) { unsigned long flags; struct list_head maps; @@ -943,7 +1001,7 @@ static void process_prepared(struct pool spin_unlock_irqrestore(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &maps, list) - fn(m); + (*fn)(m); } /* @@ -1109,6 +1167,35 @@ static void schedule_zero(struct thin_c } } +static int commit(struct pool *pool) +{ + int r; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) + DMERR("commit failed, error = %d", r); + + return r; +} + +/* + * A non-zero return indicates read_only or fail_io mode. + * Many callers don't care about the return value. + */ +static int commit_or_fallback(struct pool *pool) +{ + int r; + + if (get_pool_mode(pool) != PM_WRITE) + return -EINVAL; + + r = commit(pool); + if (r) + set_pool_mode(pool, PM_READ_ONLY); + + return r; +} + static int alloc_data_block(struct thin_c *tc, dm_block_t *result) { int r; @@ -1137,12 +1224,7 @@ static int alloc_data_block(struct thin_ * Try to commit to see if that will free up some * more space. */ - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - return r; - } + (void) commit_or_fallback(pool); r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); if (r) @@ -1373,6 +1455,7 @@ static void provision_block(struct thin_ default: DMERR("%s: alloc_data_block() failed, error = %d", __func__, r); + set_pool_mode(tc->pool, PM_READ_ONLY); cell_error(cell); break; } @@ -1430,6 +1513,49 @@ static void process_bio(struct thin_c *t } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + int r; + int rw = bio_data_dir(bio); + dm_block_t block = get_bio_block(tc, bio); + struct dm_thin_lookup_result lookup_result; + + r = dm_thin_find_block(tc->td, block, 1, &lookup_result); + switch (r) { + case 0: + if (lookup_result.shared && (rw == WRITE) && bio->bi_size) + bio_io_error(bio); + else + remap_and_issue(tc, bio, lookup_result.block); + break; + + case -ENODATA: + if (rw != READ) { + bio_io_error(bio); + break; + } + + if (tc->origin_dev) { + remap_to_origin_and_issue(tc, bio); + break; + } + + zero_fill_bio(bio); + bio_endio(bio, 0); + break; + + default: + DMERR("dm_thin_find_block() failed, error = %d", r); + bio_io_error(bio); + break; + } +} + +static void process_bio_fail(struct thin_c *tc, struct bio *bio) +{ + bio_io_error(bio); +} + static int need_commit_due_to_time(struct pool *pool) { return jiffies < pool->last_commit_jiffies || @@ -1441,7 +1567,6 @@ static void process_deferred_bios(struct unsigned long flags; struct bio *bio; struct bio_list bios; - int r; bio_list_init(&bios); @@ -1468,9 +1593,9 @@ static void process_deferred_bios(struct } if (bio->bi_rw & REQ_DISCARD) - process_discard(tc, bio); + pool->process_discard(tc, bio); else - process_bio(tc, bio); + pool->process_bio(tc, bio); } /* @@ -1486,10 +1611,7 @@ static void process_deferred_bios(struct if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) return; - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); + if (commit_or_fallback(pool)) { while ((bio = bio_list_pop(&bios))) bio_io_error(bio); return; @@ -1504,8 +1626,8 @@ static void do_worker(struct work_struct { struct pool *pool = container_of(ws, struct pool, worker); - process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); - process_prepared(pool, &pool->prepared_discards, process_prepared_discard); + process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); process_deferred_bios(pool); } @@ -1522,6 +1644,52 @@ static void do_waker(struct work_struct /*----------------------------------------------------------------*/ +static enum pool_mode get_pool_mode(struct pool *pool) +{ + return pool->pf.mode; +} + +static void set_pool_mode(struct pool *pool, enum pool_mode mode) +{ + int r; + + pool->pf.mode = mode; + + switch (mode) { + case PM_FAIL: + DMERR("switching pool to failure mode"); + pool->process_bio = process_bio_fail; + pool->process_discard = process_bio_fail; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_fail; + break; + + case PM_READ_ONLY: + DMERR("switching pool to read-only mode"); + r = dm_pool_abort_metadata(pool->pmd); + if (r) { + DMERR("aborting transaction failed"); + set_pool_mode(pool, PM_FAIL); + } else { + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_discard; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_passdown; + } + break; + + case PM_WRITE: + pool->process_bio = process_bio; + pool->process_discard = process_discard; + pool->process_prepared_mapping = process_prepared_mapping; + pool->process_prepared_discard = process_prepared_discard; + break; + } +} + +/*----------------------------------------------------------------*/ + /* * Mapping functions. */ @@ -1567,6 +1735,12 @@ static int thin_bio_map(struct dm_target struct dm_thin_lookup_result result; map_context->ptr = thin_hook_bio(tc, bio); + + if (get_pool_mode(tc->pool) == PM_FAIL) { + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { thin_defer_bio(tc, bio); return DM_MAPIO_SUBMITTED; @@ -1603,14 +1777,35 @@ static int thin_bio_map(struct dm_target break; case -ENODATA: + if (get_pool_mode(tc->pool) == PM_READ_ONLY) { + /* + * This block isn't provisioned, and we have no way + * of doing so. Just error it. + */ + bio_io_error(bio); + r = DM_MAPIO_SUBMITTED; + break; + } + /* fall through */ + + case -EWOULDBLOCK: /* * In future, the failed dm_thin_find_block above could * provide the hint to load the metadata into cache. */ - case -EWOULDBLOCK: thin_defer_bio(tc, bio); r = DM_MAPIO_SUBMITTED; break; + + default: + /* + * Must always call bio_io_error on failure. + * dm_thin_find_block can fail with -EINVAL if the + * pool is switched to fail-io mode. + */ + bio_io_error(bio); + r = DM_MAPIO_SUBMITTED; + break; } return r; @@ -1647,15 +1842,26 @@ static int bind_control_target(struct po { struct pool_c *pt = ti->private; + /* + * We want to make sure that degraded pools are never upgraded. + */ + enum pool_mode old_mode = pool->pf.mode; + enum pool_mode new_mode = pt->pf.mode; + + if (old_mode > new_mode) + new_mode = old_mode; + pool->ti = ti; pool->low_water_blocks = pt->low_water_blocks; pool->pf = pt->pf; + set_pool_mode(pool, new_mode); /* * If discard_passdown was enabled verify that the data device * supports discards. Disable discard_passdown if not; otherwise * -EOPNOTSUPP will be returned. */ + /* FIXME: pull this out into a sep fn. */ if (pt->pf.discard_passdown) { struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); if (!q || !blk_queue_discard(q)) { @@ -1681,6 +1887,7 @@ static void unbind_control_target(struct /* Initialize pool features. */ static void pool_features_init(struct pool_features *pf) { + pf->mode = PM_WRITE; pf->zero_new_blocks = 1; pf->discard_enabled = 1; pf->discard_passdown = 1; @@ -1711,14 +1918,16 @@ static struct kmem_cache *_endio_hook_ca static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, - unsigned long block_size, char **error) + unsigned long block_size, + int read_only, char **error) { int r; void *err_p; struct pool *pool; struct dm_pool_metadata *pmd; + bool format_device = read_only ? false : true; - pmd = dm_pool_metadata_open(metadata_dev, block_size, true); + pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); if (IS_ERR(pmd)) { *error = "Error creating metadata object"; return (struct pool *)pmd; @@ -1835,8 +2044,8 @@ static void __pool_dec(struct pool *pool static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, - unsigned long block_size, char **error, - int *created) + unsigned long block_size, int read_only, + char **error, int *created) { struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); @@ -1857,7 +2066,7 @@ static struct pool *__pool_find(struct m __pool_inc(pool); } else { - pool = pool_create(pool_md, metadata_dev, block_size, error); + pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); *created = 1; } } @@ -1908,19 +2117,23 @@ static int parse_pool_features(struct dm arg_name = dm_shift_arg(as); argc--; - if (!strcasecmp(arg_name, "skip_block_zeroing")) { + if (!strcasecmp(arg_name, "skip_block_zeroing")) pf->zero_new_blocks = 0; - continue; - } else if (!strcasecmp(arg_name, "ignore_discard")) { + + else if (!strcasecmp(arg_name, "ignore_discard")) pf->discard_enabled = 0; - continue; - } else if (!strcasecmp(arg_name, "no_discard_passdown")) { + + else if (!strcasecmp(arg_name, "no_discard_passdown")) pf->discard_passdown = 0; - continue; - } - ti->error = "Unrecognised pool feature requested"; - r = -EINVAL; + else if (!strcasecmp(arg_name, "read_only")) + pf->mode = PM_READ_ONLY; + + else { + ti->error = "Unrecognised pool feature requested"; + r = -EINVAL; + break; + } } return r; @@ -2013,7 +2226,7 @@ static int pool_ctr(struct dm_target *ti } pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, - block_size, &ti->error, &pool_created); + block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); goto out_free_pt; @@ -2146,15 +2359,12 @@ static int pool_preresume(struct dm_targ r = dm_pool_resize_data_dev(pool->pmd, data_size); if (r) { DMERR("failed to resize data device"); + /* FIXME Stricter than necessary: Rollback transaction instead here */ + set_pool_mode(pool, PM_READ_ONLY); return r; } - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - return r; - } + (void) commit_or_fallback(pool); } return 0; @@ -2177,19 +2387,12 @@ static void pool_resume(struct dm_target static void pool_postsuspend(struct dm_target *ti) { - int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; cancel_delayed_work(&pool->waker); flush_workqueue(pool->wq); - - r = dm_pool_commit_metadata(pool->pmd); - if (r < 0) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ - } + (void) commit_or_fallback(pool); } static int check_arg_count(unsigned argc, unsigned args_required) @@ -2323,12 +2526,7 @@ static int process_reserve_metadata_snap if (r) return r; - r = dm_pool_commit_metadata(pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - return r; - } + (void) commit_or_fallback(pool); r = dm_pool_reserve_metadata_snap(pool->pmd); if (r) @@ -2389,16 +2587,32 @@ static int pool_message(struct dm_target else DMWARN("Unrecognised thin pool target message received: %s", argv[0]); - if (!r) { - r = dm_pool_commit_metadata(pool->pmd); - if (r) - DMERR("%s message: dm_pool_commit_metadata() failed, error = %d", - argv[0], r); - } + if (!r) + (void) commit_or_fallback(pool); return r; } +static void emit_flags(struct pool_features *pf, char *result, + unsigned sz, unsigned maxlen) +{ + unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + + !pf->discard_passdown + (pf->mode == PM_READ_ONLY); + DMEMIT("%u ", count); + + if (!pf->zero_new_blocks) + DMEMIT("skip_block_zeroing "); + + if (!pf->discard_enabled) + DMEMIT("ignore_discard "); + + if (!pf->discard_passdown) + DMEMIT("no_discard_passdown "); + + if (pf->mode == PM_READ_ONLY) + DMEMIT("read_only "); +} + /* * Status line is: * / @@ -2407,7 +2621,7 @@ static int pool_message(struct dm_target static int pool_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen) { - int r, count; + int r; unsigned sz = 0; uint64_t transaction_id; dm_block_t nr_free_blocks_data; @@ -2422,6 +2636,11 @@ static int pool_status(struct dm_target switch (type) { case STATUSTYPE_INFO: + if (get_pool_mode(pool) == PM_FAIL) { + DMEMIT("Fail"); + break; + } + r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); if (r) @@ -2457,9 +2676,19 @@ static int pool_status(struct dm_target (unsigned long long)nr_blocks_data); if (held_root) - DMEMIT("%llu", held_root); + DMEMIT("%llu ", held_root); + else + DMEMIT("- "); + + if (pool->pf.mode == PM_READ_ONLY) + DMEMIT("ro "); else - DMEMIT("-"); + DMEMIT("rw "); + + if (pool->pf.discard_enabled && pool->pf.discard_passdown) + DMEMIT("discard_passdown"); + else + DMEMIT("no_discard_passdown"); break; @@ -2469,20 +2698,7 @@ static int pool_status(struct dm_target format_dev_t(buf2, pt->data_dev->bdev->bd_dev), (unsigned long)pool->sectors_per_block, (unsigned long long)pt->low_water_blocks); - - count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + - !pt->pf.discard_passdown; - DMEMIT("%u ", count); - - if (!pool->pf.zero_new_blocks) - DMEMIT("skip_block_zeroing "); - - if (!pool->pf.discard_enabled) - DMEMIT("ignore_discard "); - - if (!pt->pf.discard_passdown) - DMEMIT("no_discard_passdown "); - + emit_flags(&pt->pf, result, sz, maxlen); break; } @@ -2542,7 +2758,7 @@ static struct target_type pool_target = .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -2647,6 +2863,11 @@ static int thin_ctr(struct dm_target *ti } __pool_inc(tc->pool); + if (get_pool_mode(tc->pool) == PM_FAIL) { + ti->error = "Couldn't open thin device, Pool is in fail mode"; + goto bad_thin_open; + } + r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); if (r) { ti->error = "Couldn't open thin internal device"; @@ -2755,6 +2976,11 @@ static int thin_status(struct dm_target char buf[BDEVNAME_SIZE]; struct thin_c *tc = ti->private; + if (get_pool_mode(tc->pool) == PM_FAIL) { + DMEMIT("Fail"); + return 0; + } + if (!tc->td) DMEMIT("-"); else { @@ -2823,7 +3049,7 @@ static void thin_io_hints(struct dm_targ static struct target_type thin_target = { .name = "thin", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr,