Various thin provisioning fixes and cleanups.

METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8) is the upper bound on the
size of the metadata device: the metadata space map keeps a single index
block holding 255 index entries, each index entry describes 2^14 (16k)
metadata blocks, and every 4k metadata block (THIN_METADATA_BLOCK_SIZE)
covers 8 512-byte sectors - just under 16GiB of metadata in total.

Summary of the changes:

- Checksum the whole superblock with dm_bm_checksum() using a
  SUPERBLOCK_CSUM_XOR salt.

- Stop caching a write-locked superblock in struct dm_pool_metadata; the
  superblock is only locked while __commit_transaction() is actually
  writing it, and dm_pool_metadata_open() now allocates the pmd before
  creating the block manager (init_pmd()).

- Switch to dm_btree_empty()/dm_btree_del(); creating a snapshot now just
  increments the reference count on the origin's mapping tree root
  instead of cloning the btree.

- Remove the incomplete 'trim' pool message and
  dm_pool_trim_thin_device().

- Express the pool's low water mark and the pool status output in data
  blocks rather than 512-byte sectors, and update the documentation to
  match.

- Batch REQ_FLUSH/REQ_FUA bios in the worker so that one metadata commit
  covers all of them.

- Protect the pool table with a mutex, key it by metadata device as well
  as by pool mapped_device, and keep pools in the table for their whole
  lifetime instead of only while resumed.

- Warn and return -ENOSPC only once when the data device runs out of
  space; queued bios are retried on resume, and a thin device suspended
  with noflush requeues its deferred bios.

---
 Documentation/device-mapper/thin-provisioning.txt |   25 -
 drivers/md/dm-thin-metadata.c                     |  305 ++++++--------
 drivers/md/dm-thin-metadata.h                     |   11 
 drivers/md/dm-thin.c                              |  473 ++++++++++++++--------
 4 files changed, 464 insertions(+), 350 deletions(-)

Index: linux-3.1-rc9/Documentation/device-mapper/thin-provisioning.txt
===================================================================
--- linux-3.1-rc9.orig/Documentation/device-mapper/thin-provisioning.txt
+++ linux-3.1-rc9/Documentation/device-mapper/thin-provisioning.txt
@@ -88,16 +88,15 @@ Using an existing pool device
 	 $data_block_size $low_water_mark"
 
 $data_block_size gives the smallest unit of disk space that can be
-allocated at a time. As with all sizes passed to device-mapper, this
-is expressed in units of 512-byte sectors. People primarily
-interested in thin provisioning may want to use a value such as 1024.
-People doing lots of snapshotting may want a smaller value such as
-128. $data_block_size must be the same for the lifetime of the
+allocated at a time expressed in units of 512-byte sectors. People
+primarily interested in thin provisioning may want to use a value such
+as 1024. People doing lots of snapshotting may want a smaller value
+such as 128. $data_block_size must be the same for the lifetime of the
 metadata device.
 
-$low_water_mark is expressed in 512-byte sectors. If free space on
-the data device drops below this level then a dm event will be
-triggered which a userspace daemon should catch allowing it to
+$low_water_mark is expressed in blocks of size $data_block_size. If
+free space on the data device drops below this level then a dm event
+will be triggered which a userspace daemon should catch allowing it to
 extend the pool device. Only one such event will be sent.
 
 FIXME - Do we get a second event after a table reload when you're
@@ -177,7 +176,7 @@ Reference
 i) Constructor
 
     thin-pool <metadata dev> <data dev> \
-	      <data block size (sectors)> <low water mark (sectors)> [<number of feature args> [<arg>]*]
+	      <data block size (sectors)> <low water mark (blocks)> [<number of feature args> [<arg>]*]
 
     Optional feature arguments:
     - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
@@ -187,16 +186,16 @@ i) Constructor
 
 ii) Status
 
-    <transaction id> <used metadata sectors>/<total metadata sectors>
-    <used data sectors>/<total data sectors>
+    <transaction id> <used metadata blocks>/<total metadata blocks>
+    <used data blocks>/<total data blocks>
 
     transaction id:
 	A 64-bit number used by userspace to help synchronise with metadata
 	from volume managers.
 
-    used data sectors / total data sectors
-	If the number of free sectors drops below the pool's low water mark a
+    used data blocks / total data blocks
+	If the number of free blocks drops below the pool's low water mark a
 	dm event will be sent to userspace. This event is edge-triggered and
 	it will occur only once after each resume so volume manager writers
 	should register for the event and then check the target's status.
 
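For illustration, with this change the low water mark in the documentation's
example table line above is given in data blocks rather than sectors. A
hypothetical invocation (the 10 GiB pool length, the 1024-sector block size
and the threshold of 100 blocks are made-up values; only the argument order
comes from this patch):

	# $data_block_size = 1024 sectors (512 KiB)
	# $low_water_mark  = 100 data blocks (50 MiB); before this patch 100 meant 100 sectors
	dmsetup create pool \
		--table "0 20971520 thin-pool $metadata_dev $data_dev 1024 100"

When fewer than 100 blocks remain unallocated the pool sends a single dm
event, which a userspace daemon should catch so that it can extend the data
device and reload the table.
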
Index: linux-3.1-rc9/drivers/md/dm-thin-metadata.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/dm-thin-metadata.c +++ linux-3.1-rc9/drivers/md/dm-thin-metadata.c @@ -77,7 +77,6 @@ #define THIN_SUPERBLOCK_MAGIC 27022010 #define THIN_SUPERBLOCK_LOCATION 0 #define THIN_VERSION 1 -#define THIN_METADATA_BLOCK_SIZE 4096 #define THIN_METADATA_CACHE_SIZE 64 #define SECTOR_TO_BLOCK_SHIFT 3 @@ -174,7 +173,6 @@ struct dm_pool_metadata { struct rw_semaphore root_lock; uint32_t time; int need_commit; - struct dm_block *sblock; dm_block_t root; dm_block_t details_root; struct list_head thin_devices; @@ -200,6 +198,8 @@ struct dm_thin_device { * superblock validator *--------------------------------------------------------------*/ +#define SUPERBLOCK_CSUM_XOR 160774 + static void sb_prepare_for_write(struct dm_block_validator *v, struct dm_block *b, size_t block_size) @@ -207,7 +207,9 @@ static void sb_prepare_for_write(struct struct thin_disk_superblock *disk_super = dm_block_data(b); disk_super->blocknr = cpu_to_le64(dm_block_location(b)); - disk_super->csum = cpu_to_le32(dm_block_csum_data(&disk_super->flags, sizeof(*disk_super) - sizeof(__le32))); + disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); } static int sb_check(struct dm_block_validator *v, @@ -231,7 +233,9 @@ static int sb_check(struct dm_block_vali return -EILSEQ; } - csum_le = cpu_to_le32(dm_block_csum_data(&disk_super->flags, sizeof(*disk_super) - sizeof(__le32))); + csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, + block_size - sizeof(__le32), + SUPERBLOCK_CSUM_XOR)); if (csum_le != disk_super->csum) { DMERR("sb_check failed: csum %u: wanted %u", le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); @@ -319,7 +323,7 @@ static void subtree_dec(void *context, v memcpy(&root_le, value, sizeof(root_le)); root = le64_to_cpu(root_le); - if (dm_btree_destroy(info, root)) + if (dm_btree_del(info, root)) DMERR("btree delete failed\n"); } @@ -361,13 +365,13 @@ static int superblock_all_zeroes(struct return dm_bm_unlock(b); } -static struct dm_pool_metadata *alloc_pmd(struct dm_block_manager *bm, - dm_block_t nr_blocks, int create) +static int init_pmd(struct dm_pool_metadata *pmd, + struct dm_block_manager *bm, + dm_block_t nr_blocks, int create) { int r; struct dm_space_map *sm, *data_sm; struct dm_transaction_manager *tm; - struct dm_pool_metadata *pmd = NULL; struct dm_block *sblock; if (create) { @@ -375,7 +379,7 @@ static struct dm_pool_metadata *alloc_pm &sb_validator, &tm, &sm, &sblock); if (r < 0) { DMERR("tm_create_with_sm failed"); - return ERR_PTR(r); + return r; } data_sm = dm_sm_disk_create(tm, nr_blocks); @@ -384,18 +388,6 @@ static struct dm_pool_metadata *alloc_pm r = PTR_ERR(data_sm); goto bad; } - - r = dm_tm_pre_commit(tm); - if (r < 0) { - DMERR("couldn't pre commit"); - goto bad_data_sm; - } - - r = dm_tm_commit(tm, sblock); - if (r < 0) { - DMERR("couldn't commit"); - goto bad_data_sm; - } } else { struct thin_disk_superblock *disk_super = NULL; size_t space_map_root_offset = @@ -406,7 +398,7 @@ static struct dm_pool_metadata *alloc_pm SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); if (r < 0) { DMERR("tm_open_with_sm failed"); - return ERR_PTR(r); + return r; } disk_super = dm_block_data(sblock); @@ -417,14 +409,12 @@ static struct dm_pool_metadata *alloc_pm r = PTR_ERR(data_sm); goto bad; } - - dm_tm_unlock(tm, sblock); } - pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); - if (!pmd) { - DMERR("could 
not allocate metadata struct"); - r = -ENOMEM; + + r = dm_tm_unlock(tm, sblock); + if (r < 0) { + DMERR("couldn't unlock superblock"); goto bad_data_sm; } @@ -436,11 +426,9 @@ static struct dm_pool_metadata *alloc_pm if (!pmd->nb_tm) { DMERR("could not create clone tm"); r = -ENOMEM; - goto bad_pmd; + goto bad_data_sm; } - pmd->sblock = NULL; - pmd->info.tm = tm; pmd->info.levels = 2; pmd->info.value_type.context = pmd->data_sm; @@ -484,17 +472,15 @@ static struct dm_pool_metadata *alloc_pm pmd->details_root = 0; INIT_LIST_HEAD(&pmd->thin_devices); - return pmd; + return 0; -bad_pmd: - kfree(pmd); bad_data_sm: dm_sm_destroy(data_sm); bad: dm_tm_destroy(tm); dm_sm_destroy(sm); - return ERR_PTR(r); + return r; } static int __begin_transaction(struct dm_pool_metadata *pmd) @@ -502,22 +488,23 @@ static int __begin_transaction(struct dm int r; u32 features; struct thin_disk_superblock *disk_super; + struct dm_block *sblock; /* * __maybe_commit_transaction() resets these */ - WARN_ON(pmd->sblock); WARN_ON(pmd->need_commit); /* - * superblock is unlocked via dm_tm_commit() + * We re-read the superblock every time. Shouldn't need to do this + * really. */ - r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, - &sb_validator, &pmd->sblock); + r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); if (r) return r; - disk_super = dm_block_data(pmd->sblock); + disk_super = dm_block_data(sblock); pmd->time = le32_to_cpu(disk_super->time); pmd->root = le64_to_cpu(disk_super->data_mapping_root); pmd->details_root = le64_to_cpu(disk_super->device_details_root); @@ -530,24 +517,27 @@ static int __begin_transaction(struct dm DMERR("could not access metadata due to " "unsupported optional features (%lx).", (unsigned long)features); - return -EINVAL; + r = -EINVAL; + goto out; } /* * Check for read-only metadata to skip the following RDWR checks. */ if (get_disk_ro(pmd->bdev->bd_disk)) - return 0; + goto out; features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; if (features) { DMERR("could not access metadata RDWR due to " "unsupported optional features (%lx).", (unsigned long)features); - return -EINVAL; + r = -EINVAL; } - return 0; +out: + dm_bm_unlock(sblock); + return r; } static int __write_changed_details(struct dm_pool_metadata *pmd) @@ -587,21 +577,18 @@ static int __write_changed_details(struc return 0; } -/* - * If there is data waiting to be committed, commit it. - * Returns 1 if commit took place, 0 if not, or < 0 on error. - */ -static int __maybe_commit_transaction(struct dm_pool_metadata *pmd) +static int __commit_transaction(struct dm_pool_metadata *pmd) { /* * FIXME: Associated pool should be made read-only on failure. */ int r; - size_t len; + size_t metadata_len, data_len; struct thin_disk_superblock *disk_super; + struct dm_block *sblock; /* - * thin_disk_superblock is assumed not to exceed a 512-byte sector. + * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 
*/ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); @@ -612,57 +599,53 @@ static int __maybe_commit_transaction(st if (!pmd->need_commit) goto out; + r = dm_sm_commit(pmd->data_sm); + if (r < 0) + goto out; + r = dm_tm_pre_commit(pmd->tm); if (r < 0) goto out; - r = dm_sm_root_size(pmd->metadata_sm, &len); + r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); if (r < 0) goto out; - disk_super = dm_block_data(pmd->sblock); + r = dm_sm_root_size(pmd->data_sm, &data_len); + if (r < 0) + goto out; + + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + goto out; + + disk_super = dm_block_data(sblock); disk_super->time = cpu_to_le32(pmd->time); disk_super->data_mapping_root = cpu_to_le64(pmd->root); disk_super->device_details_root = cpu_to_le64(pmd->details_root); disk_super->trans_id = cpu_to_le64(pmd->trans_id); disk_super->flags = cpu_to_le32(pmd->flags); - r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, len); + r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, + metadata_len); if (r < 0) - goto out; + goto out_locked; - r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, len); + r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, + data_len); if (r < 0) - goto out; + goto out_locked; - r = dm_tm_commit(pmd->tm, pmd->sblock); - if (!r) { - r = 1; - pmd->sblock = NULL; + r = dm_tm_commit(pmd->tm, sblock); + if (!r) pmd->need_commit = 0; - } out: return r; -} - -int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) -{ - int r; - down_write(&pmd->root_lock); - - r = __maybe_commit_transaction(pmd); - if (r <= 0) - goto out; - - /* - * Open the next transaction. - */ - r = __begin_transaction(pmd); - -out: - up_write(&pmd->root_lock); +out_locked: + dm_bm_unlock(sblock); return r; } @@ -675,24 +658,40 @@ struct dm_pool_metadat sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; struct dm_block_manager *bm; int create; + struct dm_block *sblock; + pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); + if (!pmd) { + DMERR("could not allocate metadata struct"); + return ERR_PTR(-ENOMEM); + } + + /* + * Max hex locks: + * 3 for btree insert + + * 2 for btree lookup used within space map + */ bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, - THIN_METADATA_CACHE_SIZE, 6); + THIN_METADATA_CACHE_SIZE, 5); if (!bm) { DMERR("could not create block manager"); + kfree(pmd); return ERR_PTR(-ENOMEM); } r = superblock_all_zeroes(bm, &create); if (r) { dm_block_manager_destroy(bm); + kfree(pmd); return ERR_PTR(r); } - pmd = alloc_pmd(bm, 0, create); - if (IS_ERR(pmd)) { + + r = init_pmd(pmd, bm, 0, create); + if (r) { dm_block_manager_destroy(bm); - return pmd; + kfree(pmd); + return ERR_PTR(r); } pmd->bdev = bdev; @@ -706,13 +705,12 @@ struct dm_pool_metadat /* * Create. 
*/ - if (!pmd->sblock) { - r = __begin_transaction(pmd); - if (r < 0) - goto bad; - } + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + goto bad; - disk_super = dm_block_data(pmd->sblock); + disk_super = dm_block_data(sblock); disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); disk_super->version = cpu_to_le32(THIN_VERSION); disk_super->time = 0; @@ -720,11 +718,15 @@ struct dm_pool_metadata *dm_pool_metadat disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); disk_super->data_block_size = cpu_to_le32(data_block_size); - r = dm_btree_create(&pmd->info, &pmd->root); + r = dm_bm_unlock(sblock); if (r < 0) goto bad; - r = dm_btree_create(&pmd->details_info, &pmd->details_root); + r = dm_btree_empty(&pmd->info, &pmd->root); + if (r < 0) + goto bad; + + r = dm_btree_empty(&pmd->details_info, &pmd->details_root); if (r < 0) { DMERR("couldn't create devices root"); goto bad; @@ -770,14 +772,10 @@ int dm_pool_metadata_close(struct dm_poo return -EBUSY; } - if (pmd->sblock) { - r = __maybe_commit_transaction(pmd); - if (r < 0) - DMWARN("%s: __maybe_commit_transaction() failed, error = %d", - __func__, r); - if (pmd->sblock) - dm_tm_unlock(pmd->tm, pmd->sblock); - } + r = __commit_transaction(pmd); + if (r < 0) + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); dm_tm_destroy(pmd->tm); dm_tm_destroy(pmd->nb_tm); @@ -805,7 +803,7 @@ static int __open_device(struct dm_pool_ struct disk_device_details details_le; /* - * If the device is already open, just increment its open_count. + * Check the device isn't already open. */ list_for_each_entry(td2, &pmd->thin_devices, list) if (td2->id == dev) { @@ -823,9 +821,6 @@ static int __open_device(struct dm_pool_ if (r != -ENODATA || !create) return r; - /* - * New device. - */ changed = 1; details_le.mapped_blocks = 0; details_le.transaction_id = cpu_to_le64(pmd->trans_id); @@ -874,7 +869,7 @@ static int __create_thin(struct dm_pool_ /* * Create an empty btree for the mappings. 
*/ - r = dm_btree_create(&pmd->bl_info, &dev_root); + r = dm_btree_empty(&pmd->bl_info, &dev_root); if (r) return r; @@ -885,7 +880,7 @@ static int __create_thin(struct dm_pool_ __dm_bless_for_disk(&value); r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); if (r) { - dm_btree_destroy(&pmd->bl_info, dev_root); + dm_btree_del(&pmd->bl_info, dev_root); return r; } @@ -893,7 +888,7 @@ static int __create_thin(struct dm_pool_ if (r) { __close_device(td); dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); - dm_btree_destroy(&pmd->bl_info, dev_root); + dm_btree_del(&pmd->bl_info, dev_root); return r; } td->changed = 1; @@ -938,7 +933,7 @@ static int __create_snap(struct dm_pool_ dm_thin_id dev, dm_thin_id origin) { int r; - dm_block_t origin_root, snap_root; + dm_block_t origin_root; uint64_t key = origin, dev_key = dev; struct dm_thin_device *td; struct disk_device_details details_le; @@ -956,18 +951,16 @@ static int __create_snap(struct dm_pool_ return r; origin_root = le64_to_cpu(value); - /* clone the origin */ - r = dm_btree_clone(&pmd->bl_info, origin_root, &snap_root); - if (r) - return r; + /* clone the origin, an inc will do */ + dm_tm_inc(pmd->tm, origin_root); /* insert into the main mapping tree */ - value = cpu_to_le64(snap_root); + value = cpu_to_le64(origin_root); __dm_bless_for_disk(&value); key = dev; r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); if (r) { - dm_btree_destroy(&pmd->bl_info, snap_root); + dm_tm_dec(pmd->tm, origin_root); return r; } @@ -1049,46 +1042,6 @@ int dm_pool_delete_thin_device(struct dm return r; } -static int __trim_thin_dev(struct dm_thin_device *td, sector_t new_size) -{ - struct dm_pool_metadata *pmd = td->pmd; - /* FIXME: convert new size to blocks */ - uint64_t key[2] = { td->id, new_size - 1 }; - - td->changed = 1; - - /* - * We need to truncate all the extraneous mappings. - * - * FIXME: We have to be careful to do this atomically. - * Perhaps clone the bottom layer first so we can revert? - */ - return dm_btree_delete_gt(&pmd->info, pmd->root, key, &pmd->root); -} - -// FIXME Incomplete implementation. Finish or remove it before final submission. 
-int dm_pool_trim_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, - sector_t new_size) -{ - int r; - struct dm_thin_device *td; - - down_write(&pmd->root_lock); - r = __open_device(pmd, dev, 1, &td); - if (r) - DMERR("couldn't open virtual device"); - else { - r = __trim_thin_dev(td, new_size); - __close_device(td); - } - - /* FIXME: update mapped_blocks */ - - up_write(&pmd->root_lock); - - return r; -} - int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, uint64_t current_id, uint64_t new_id) @@ -1117,17 +1070,34 @@ int dm_pool_get_metadata_transaction_id( return 0; } +static int __get_held_metadata_root(struct dm_pool_metadata *pmd, + dm_block_t *result) +{ + int r; + struct thin_disk_superblock *disk_super; + struct dm_block *sblock; + + r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, + &sb_validator, &sblock); + if (r) + return r; + + disk_super = dm_block_data(sblock); + *result = le64_to_cpu(disk_super->held_root); + + return dm_bm_unlock(sblock); +} + int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, dm_block_t *result) { - struct thin_disk_superblock *disk_super; + int r; down_read(&pmd->root_lock); - disk_super = dm_block_data(pmd->sblock); - *result = le64_to_cpu(disk_super->held_root); + r = __get_held_metadata_root(pmd, result); up_read(&pmd->root_lock); - return 0; + return r; } int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, @@ -1275,6 +1245,25 @@ int dm_pool_alloc_data_block(struct dm_p return r; } +int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) +{ + int r; + + down_write(&pmd->root_lock); + + r = __commit_transaction(pmd); + if (r <= 0) + goto out; + + /* + * Open the next transaction. + */ + r = __begin_transaction(pmd); +out: + up_write(&pmd->root_lock); + return r; +} + int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) { int r; Index: linux-3.1-rc9/drivers/md/dm-thin-metadata.h =================================================================== --- linux-3.1-rc9.orig/drivers/md/dm-thin-metadata.h +++ linux-3.1-rc9/drivers/md/dm-thin-metadata.h @@ -9,6 +9,9 @@ #include "persistent-data/dm-block-manager.h" +/* FIXME: need metadata blocksize function later... */ +#define THIN_METADATA_BLOCK_SIZE 4096 + /*----------------------------------------------------------------*/ struct dm_pool_metadata; @@ -64,14 +67,6 @@ int dm_pool_delete_thin_device(struct dm dm_thin_id dev); /* - * Thin devices don't have a size, however they do keep track of the - * highest mapped block. This trimming function allows the user to remove - * mappings above a certain virtual block. - */ -int dm_pool_trim_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, - sector_t new_size); - -/* * Commits _all_ metadata changes: device creation, deletion, mapping * updates. 
*/ Index: linux-3.1-rc9/drivers/md/dm-thin.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/dm-thin.c +++ linux-3.1-rc9/drivers/md/dm-thin.c @@ -468,15 +468,17 @@ struct pool { struct dm_target *ti; /* Only set if a pool target is bound */ struct mapped_device *pool_md; + struct block_device *md_dev; struct dm_pool_metadata *pmd; uint32_t sectors_per_block; unsigned block_shift; dm_block_t offset_mask; - dm_block_t low_water_mark; + dm_block_t low_water_blocks; unsigned zero_new_blocks:1; unsigned low_water_triggered:1; /* A dm event has been sent */ + unsigned no_free_space:1; /* An ENOSPC warning has been issued */ struct bio_prison *prison; struct dm_kcopyd_client *copier; @@ -484,8 +486,11 @@ struct pool { struct workqueue_struct *wq; struct work_struct worker; + unsigned ref_count; + spinlock_t lock; struct bio_list deferred_bios; + struct bio_list deferred_flush_bios; struct list_head prepared_mappings; struct bio_list retry_on_resume_list; @@ -493,11 +498,8 @@ struct pool { struct deferred_set ds; /* FIXME: move to thin_c */ struct new_mapping *next_mapping; - mempool_t *mapping_pool; mempool_t *endio_hook_pool; - - atomic_t ref_count; }; /* @@ -510,7 +512,7 @@ struct pool_c { struct dm_dev *metadata_dev; struct dm_target_callbacks callbacks; - sector_t low_water_mark; + dm_block_t low_water_blocks; unsigned zero_new_blocks:1; }; @@ -528,45 +530,59 @@ struct thin_c { /*----------------------------------------------------------------*/ /* - * A global list that uses a struct mapped_device as a key. + * A global list of pools that uses a struct mapped_device as a key. */ static struct dm_thin_pool_table { - spinlock_t lock; + struct mutex mutex; struct list_head pools; } dm_thin_pool_table; static void pool_table_init(void) { - spin_lock_init(&dm_thin_pool_table.lock); - + mutex_init(&dm_thin_pool_table.mutex); INIT_LIST_HEAD(&dm_thin_pool_table.pools); } -static void pool_table_insert(struct pool *pool) +static void __pool_table_insert(struct pool *pool) { - spin_lock(&dm_thin_pool_table.lock); + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); list_add(&pool->list, &dm_thin_pool_table.pools); - spin_unlock(&dm_thin_pool_table.lock); } -static void pool_table_remove(struct pool *pool) +static void __pool_table_remove(struct pool *pool) { - spin_lock(&dm_thin_pool_table.lock); + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); list_del(&pool->list); - spin_unlock(&dm_thin_pool_table.lock); } -static struct pool *pool_table_lookup(struct mapped_device *md) +static struct pool *__pool_table_lookup(struct mapped_device *md) { struct pool *pool = NULL, *tmp; - spin_lock(&dm_thin_pool_table.lock); - list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { if (tmp->pool_md == md) { pool = tmp; break; } - spin_unlock(&dm_thin_pool_table.lock); + } + + return pool; +} + +static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev) +{ + struct pool *pool = NULL, *tmp; + + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + + list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) { + if (tmp->md_dev == md_dev) { + pool = tmp; + break; + } + } return pool; } @@ -597,23 +613,27 @@ static void remap(struct thin_c *tc, str static void remap_and_issue(struct thin_c *tc, struct bio *bio, dm_block_t block) { - if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { - int r = 
dm_pool_commit_metadata(tc->pool->pmd); - if (r) { - DMERR("%s: dm_pool_commit_metadata() failed, error = %d", - __func__, r); - bio_io_error(bio); - return; - } - } + struct pool *pool = tc->pool; + unsigned long flags; remap(tc, bio, block); - generic_make_request(bio); + + /* + * Batch together any FUA/FLUSH bios we find and then issue + * a single commit for them in process_deferred_bios(). + */ + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_flush_bios, bio); + spin_unlock_irqrestore(&pool->lock, flags); + } else + generic_make_request(bio); } /* - * wake_worker() is used by thin_defer_bio and pool_preresume to continue - * deferred IO processing after pool resume. + * wake_worker() is used when new work is queued and when + * pool_resume is ready to continue deferred IO + * processing. */ static void wake_worker(struct pool *pool) { @@ -625,7 +645,6 @@ static void wake_worker(struct pool *poo /* * Bio endio functions. */ - struct endio_hook { struct thin_c *tc; bio_end_io_t *saved_bi_end_io; @@ -737,14 +756,14 @@ static void cell_defer(struct thin_c *tc spin_lock_irqsave(&pool->lock, flags); cell_release(cell, &pool->deferred_bios); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irqrestore(&tc->pool->lock, flags); wake_worker(pool); } /* - * As above, but ignoring @exception (a write bio that covers - * the block) because it has already been processed. + * Same as cell_defer above, except it omits one particular detainee, + * a write bio that covers the block and has already been processed. */ static void cell_defer_except(struct thin_c *tc, struct cell *cell, struct bio *exception) @@ -805,6 +824,7 @@ static void process_prepared_mapping(str } else cell_defer(tc, m->cell, m->data_block); + list_del(&m->list); mempool_free(m, tc->pool->mapping_pool); } @@ -812,14 +832,14 @@ static void process_prepared_mappings(st { unsigned long flags; struct list_head maps; - struct new_mapping *m; + struct new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); spin_lock_irqsave(&pool->lock, flags); list_splice_init(&pool->prepared_mappings, &maps); spin_unlock_irqrestore(&pool->lock, flags); - list_for_each_entry(m, &maps, list) + list_for_each_entry_safe(m, tmp, &maps, list) process_prepared_mapping(m); } @@ -935,11 +955,13 @@ static void schedule_zero(struct thin_c */ if (!pool->zero_new_blocks) process_prepared_mapping(m); + else if (io_overwrites_block(pool, bio)) { m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); dm_get_mapinfo(bio)->ptr = m; remap_and_issue(tc, bio, data_block); + } else { int r; struct dm_io_region to; @@ -957,21 +979,6 @@ static void schedule_zero(struct thin_c } } -/* - * If we have run out of space, queue bios until the device is - * resumed, presumably after having been reloaded with more space. 
- */ -static void retry_when_resumed(struct bio *bio) -{ - struct thin_c *tc = dm_get_mapinfo(bio)->ptr; - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - bio_list_add(&pool->retry_on_resume_list, bio); - spin_unlock_irqrestore(&pool->lock, flags); -} - static int alloc_data_block(struct thin_c *tc, dm_block_t *result) { int r; @@ -983,13 +990,49 @@ static int alloc_data_block(struct thin_ if (r) return r; - if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) { + if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { + DMWARN("%s: reached low water mark, sending event.", + dm_device_name(pool->pool_md)); spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = 1; spin_unlock_irqrestore(&pool->lock, flags); dm_table_event(pool->ti->table); } + if (!free_blocks) { + if (pool->no_free_space) + return -ENOSPC; + else { + /* + * Try to commit to see if that will free up some + * more space. + */ + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + return r; + } + + r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); + if (r) + return r; + + /* + * If we still have no space we set a flag to avoid + * doing all this checking and return -ENOSPC. + */ + if (!free_blocks) { + DMWARN("%s: no free space available.", + dm_device_name(pool->pool_md)); + spin_lock_irqsave(&pool->lock, flags); + pool->no_free_space = 1; + spin_unlock_irqrestore(&pool->lock, flags); + return -ENOSPC; + } + } + } + r = dm_pool_alloc_data_block(pool->pmd, result); if (r) return r; @@ -997,6 +1040,21 @@ static int alloc_data_block(struct thin_ return 0; } +/* + * If we have run out of space, queue bios until the device is + * resumed, presumably after having been reloaded with more space. + */ +static void retry_on_resume(struct bio *bio) +{ + struct thin_c *tc = dm_get_mapinfo(bio)->ptr; + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->retry_on_resume_list, bio); + spin_unlock_irqrestore(&pool->lock, flags); +} + static void no_space(struct cell *cell) { struct bio *bio; @@ -1006,7 +1064,7 @@ static void no_space(struct cell *cell) cell_release(cell, &bios); while ((bio = bio_list_pop(&bios))) - retry_when_resumed(bio); + retry_on_resume(bio); } static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, @@ -1040,8 +1098,8 @@ static void process_shared_bio(struct th struct dm_thin_lookup_result *lookup_result) { struct cell *cell; - struct cell_key key; struct pool *pool = tc->pool; + struct cell_key key; /* * If cell is already occupied, then sharing is already in the process @@ -1133,6 +1191,10 @@ static void process_bio(struct thin_c *t * one that puts bios into a cell, and we know there were * no preceding bios. */ + /* + * TODO: this will probably have to change when discard goes + * back in. + */ cell_release_singleton(cell, bio); if (lookup_result.shared) @@ -1157,6 +1219,7 @@ static void process_deferred_bios(struct unsigned long flags; struct bio *bio; struct bio_list bios; + int r; bio_list_init(&bios); @@ -1167,22 +1230,45 @@ static void process_deferred_bios(struct while ((bio = bio_list_pop(&bios))) { struct thin_c *tc = dm_get_mapinfo(bio)->ptr; - /* - * If we've got no free new_mapping structs, and processing this bio - * might require one, we pause until there are some prepared mappings to - * process. 
+ * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. */ if (ensure_next_mapping(pool)) { spin_lock_irqsave(&pool->lock, flags); bio_list_merge(&pool->deferred_bios, &bios); spin_unlock_irqrestore(&pool->lock, flags); - return; + break; } - process_bio(tc, bio); } + + /* + * If there are any deferred flush bios, we must commit + * the metadata before issuing them. + */ + bio_list_init(&bios); + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&bios, &pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_bios); + spin_unlock_irqrestore(&pool->lock, flags); + + if (bio_list_empty(&bios)) + return; + + r = dm_pool_commit_metadata(pool->pmd); + if (r) { + DMERR("%s: dm_pool_commit_metadata() failed, error = %d", + __func__, r); + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + return; + } + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); } static void do_worker(struct work_struct *ws) @@ -1215,8 +1301,7 @@ static void thin_defer_bio(struct thin_c } /* - * Non-blocking function designed to be called from the target's map - * function. + * Non-blocking function called from the thin target's map function. */ static int thin_bio_map(struct dm_target *ti, struct bio *bio, union map_info *map_context) @@ -1256,7 +1341,7 @@ static int thin_bio_map(struct dm_target * ensure a consistent application view * (i.e. lockfs). * - * More distant ancestors are irrelevant: the + * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ thin_defer_bio(tc, bio); @@ -1309,8 +1394,7 @@ static int bind_control_target(struct po struct pool_c *pt = ti->private; pool->ti = ti; - pool->low_water_mark = dm_sector_div_up(pt->low_water_mark, - pool->sectors_per_block); + pool->low_water_blocks = pt->low_water_blocks; pool->zero_new_blocks = pt->zero_new_blocks; dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev); @@ -1326,8 +1410,10 @@ static void unbind_control_target(struct /*---------------------------------------------------------------- * Pool creation *--------------------------------------------------------------*/ -static void pool_destroy(struct pool *pool) +static void __pool_destroy(struct pool *pool) { + __pool_table_remove(pool); + if (dm_pool_metadata_close(pool->pmd) < 0) DMWARN("%s: dm_pool_metadata_close() failed.", __func__); @@ -1339,13 +1425,13 @@ static void pool_destroy(struct pool *po if (pool->next_mapping) mempool_free(pool->next_mapping, pool->mapping_pool); - mempool_destroy(pool->mapping_pool); mempool_destroy(pool->endio_hook_pool); kfree(pool); } -static struct pool *pool_create(struct block_device *metadata_dev, +static struct pool *pool_create(struct mapped_device *pool_md, + struct block_device *metadata_dev, unsigned long block_size, char **error) { int r; @@ -1370,7 +1456,7 @@ static struct pool *pool_create(struct b pool->sectors_per_block = block_size; pool->block_shift = ffs(block_size) - 1; pool->offset_mask = block_size - 1; - pool->low_water_mark = 0; + pool->low_water_blocks = 0; pool->zero_new_blocks = 1; pool->prison = prison_create(PRISON_CELLS); if (!pool->prison) { @@ -1401,8 +1487,10 @@ static struct pool *pool_create(struct b INIT_WORK(&pool->worker, do_worker); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_bios); + bio_list_init(&pool->deferred_flush_bios); INIT_LIST_HEAD(&pool->prepared_mappings); pool->low_water_triggered = 0; + pool->no_free_space = 0; 
bio_list_init(&pool->retry_on_resume_list); ds_init(&pool->ds); @@ -1422,7 +1510,10 @@ static struct pool *pool_create(struct b err_p = ERR_PTR(-ENOMEM); goto bad_endio_hook_pool; } - atomic_set(&pool->ref_count, 1); + pool->ref_count = 1; + pool->pool_md = pool_md; + pool->md_dev = metadata_dev; + __pool_table_insert(pool); return pool; @@ -1443,29 +1534,38 @@ bad_pool: return err_p; } -static void pool_inc(struct pool *pool) +static void __pool_inc(struct pool *pool) { - atomic_inc(&pool->ref_count); + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + pool->ref_count++; } -static void pool_dec(struct pool *pool) +static void __pool_dec(struct pool *pool) { - if (atomic_dec_and_test(&pool->ref_count)) - pool_destroy(pool); + BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); + BUG_ON(!pool->ref_count); + if (!--pool->ref_count) + __pool_destroy(pool); } -static struct pool *pool_find(struct mapped_device *pool_md, - struct block_device *metadata_dev, - unsigned long block_size, - char **error) +static struct pool *__pool_find(struct mapped_device *pool_md, + struct block_device *metadata_dev, + unsigned long block_size, char **error) { - struct pool *pool; + struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); - pool = pool_table_lookup(pool_md); - if (pool) - pool_inc(pool); - else - pool = pool_create(metadata_dev, block_size, error); + if (pool) { + if (pool->pool_md != pool_md) + return ERR_PTR(-EBUSY); + __pool_inc(pool); + + } else { + pool = __pool_table_lookup(pool_md); + if (pool) + __pool_inc(pool); + else + pool = pool_create(pool_md, metadata_dev, block_size, error); + } return pool; } @@ -1534,12 +1634,12 @@ static int pool_is_congested(struct dm_t /* * thin-pool - * - * - * [<#feature args> []*] + * + * + * [<#feature args> []*] * * Optional feature arguments are: - * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. + * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 
*/ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -1550,13 +1650,16 @@ static int pool_ctr(struct dm_target *ti struct dm_arg_set as; struct dm_dev *data_dev; unsigned long block_size; - dm_block_t low_water; + dm_block_t low_water_blocks; struct dm_dev *metadata_dev; sector_t metadata_dev_size; + mutex_lock(&dm_thin_pool_table.mutex); + if (argc < 4) { ti->error = "Invalid argument count"; - return -EINVAL; + r = -EINVAL; + goto out_unlock; } as.argc = argc; as.argv = argv; @@ -1564,7 +1667,7 @@ static int pool_ctr(struct dm_target *ti r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev); if (r) { ti->error = "Error opening metadata block device"; - return r; + goto out_unlock; } metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; @@ -1589,8 +1692,7 @@ static int pool_ctr(struct dm_target *ti goto out; } - if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) || - !low_water) { + if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { ti->error = "Invalid low water mark"; r = -EINVAL; goto out; @@ -1607,39 +1709,44 @@ static int pool_ctr(struct dm_target *ti if (r) goto out; - pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, - block_size, &ti->error); - if (IS_ERR(pool)) { - r = PTR_ERR(pool); - goto out; - } - pt = kzalloc(sizeof(*pt), GFP_KERNEL); if (!pt) { - pool_destroy(pool); r = -ENOMEM; goto out; } + + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + block_size, &ti->error); + if (IS_ERR(pool)) { + r = PTR_ERR(pool); + goto out_free_pt; + } + pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; pt->data_dev = data_dev; - pt->low_water_mark = low_water; + pt->low_water_blocks = low_water_blocks; pt->zero_new_blocks = pf.zero_new_blocks; ti->num_flush_requests = 1; ti->num_discard_requests = 0; - ti->discards_supported = 0; ti->private = pt; pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); + mutex_unlock(&dm_thin_pool_table.mutex); + return 0; +out_free_pt: + kfree(pt); out: dm_put_device(ti, data_dev); out_metadata: dm_put_device(ti, metadata_dev); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); return r; } @@ -1648,13 +1755,15 @@ static void pool_dtr(struct dm_target *t { struct pool_c *pt = ti->private; - unbind_control_target(pt->pool, ti); - pool_dec(pt->pool); + mutex_lock(&dm_thin_pool_table.mutex); + unbind_control_target(pt->pool, ti); + __pool_dec(pt->pool); dm_put_device(ti, pt->metadata_dev); dm_put_device(ti, pt->data_dev); - kfree(pt); + + mutex_unlock(&dm_thin_pool_table.mutex); } static void __requeue_bios(struct pool *pool) @@ -1680,7 +1789,6 @@ static int pool_preresume(struct dm_targ struct pool_c *pt = ti->private; struct pool *pool = pt->pool; dm_block_t data_size, sb_data_size; - unsigned long flags; /* * Take control of the pool object. @@ -1716,20 +1824,21 @@ static int pool_preresume(struct dm_targ } } - spin_lock_irqsave(&pool->lock, flags); + return 0; +} + +static void pool_resume(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + + spin_lock_irq(&pool->lock); pool->low_water_triggered = 0; + pool->no_free_space = 0; __requeue_bios(pool); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); wake_worker(pool); - - /* - * The pool object is only present if the pool is active. 
- */ - pool->pool_md = dm_table_get_md(ti->table); - pool_table_insert(pool); - - return 0; } static void pool_postsuspend(struct dm_target *ti) @@ -1741,14 +1850,11 @@ static void pool_postsuspend(struct dm_t flush_workqueue(pool->wq); r = dm_pool_commit_metadata(pool->pmd); - if (r) { + if (r < 0) { DMERR("%s: dm_pool_commit_metadata() failed, error = %d", __func__, r); /* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/ } - - pool_table_remove(pool); - pool->pool_md = NULL; } static int check_arg_count(unsigned argc, unsigned args_required) @@ -1845,34 +1951,6 @@ static int process_delete_mesg(unsigned return r; } -static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool) -{ - dm_thin_id dev_id; - sector_t new_size; - int r; - - r = check_arg_count(argc, 3); - if (r) - return r; - - r = read_dev_id(argv[1], &dev_id, 1); - if (r) - return r; - - if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) { - DMWARN("trim device %s: Invalid new size: %s sectors.", - argv[1], argv[2]); - return -EINVAL; - } - - r = dm_pool_trim_thin_device(pool->pmd, dev_id, - dm_sector_div_up(new_size, pool->sectors_per_block)); - if (r) - DMWARN("Attempt to trim thin device %s failed.", argv[1]); - - return r; -} - static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) { dm_thin_id old_id, new_id; @@ -1925,9 +2003,6 @@ static int pool_message(struct dm_target else if (!strcasecmp(argv[0], "delete")) r = process_delete_mesg(argc, argv, pool); - else if (!strcasecmp(argv[0], "trim")) - r = process_trim_mesg(argc, argv, pool); - else if (!strcasecmp(argv[0], "set_transaction_id")) r = process_set_transaction_id_mesg(argc, argv, pool); @@ -1994,13 +2069,11 @@ static int pool_status(struct dm_target if (r) return r; - DMEMIT("%llu %llu/%llu %llu/%llu", (unsigned long long)transaction_id, - (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata) * - pool->sectors_per_block, - (unsigned long long)nr_blocks_metadata * pool->sectors_per_block, - (unsigned long long)(nr_blocks_data - nr_free_blocks_data) * - pool->sectors_per_block, - (unsigned long long)nr_blocks_data * pool->sectors_per_block); + DMEMIT("%llu %llu/%llu %llu/%llu ", (unsigned long long)transaction_id, + (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), + (unsigned long long)nr_blocks_metadata, + (unsigned long long)(nr_blocks_data - nr_free_blocks_data), + (unsigned long long)nr_blocks_data); if (held_root) DMEMIT("%llu", held_root); @@ -2014,7 +2087,7 @@ static int pool_status(struct dm_target format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), format_dev_t(buf2, pt->data_dev->bdev->bd_dev), (unsigned long)pool->sectors_per_block, - (unsigned long long)pt->low_water_mark); + (unsigned long long)pt->low_water_blocks); DMEMIT("%u ", !pool->zero_new_blocks); @@ -2067,6 +2140,7 @@ static struct target_type pool_target = .map = pool_map, .postsuspend = pool_postsuspend, .preresume = pool_preresume, + .resume = pool_resume, .message = pool_message, .status = pool_status, .merge = pool_merge, @@ -2074,16 +2148,21 @@ static struct target_type pool_target = .io_hints = pool_io_hints, }; -/*----------------------------------------------------------------*/ - +/*---------------------------------------------------------------- + * Thin target methods + *--------------------------------------------------------------*/ static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; - pool_dec(tc->pool); + mutex_lock(&dm_thin_pool_table.mutex); + + 
__pool_dec(tc->pool); dm_pool_close_thin_device(tc->td); dm_put_device(ti, tc->pool_dev); kfree(tc); + + mutex_unlock(&dm_thin_pool_table.mutex); } /* @@ -2101,15 +2180,19 @@ static int thin_ctr(struct dm_target *ti struct dm_dev *pool_dev; struct mapped_device *pool_md; + mutex_lock(&dm_thin_pool_table.mutex); + if (argc != 2) { ti->error = "Invalid argument count"; - return -EINVAL; + r = -EINVAL; + goto out_unlock; } tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); if (!tc) { ti->error = "Out of memory"; - return -ENOMEM; + r = -ENOMEM; + goto out_unlock; } r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); @@ -2132,13 +2215,13 @@ static int thin_ctr(struct dm_target *ti goto bad_common; } - tc->pool = pool_table_lookup(pool_md); + tc->pool = __pool_table_lookup(pool_md); if (!tc->pool) { ti->error = "Couldn't find pool object"; r = -EINVAL; goto bad_pool_lookup; } - pool_inc(tc->pool); + __pool_inc(tc->pool); r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); if (r) { @@ -2153,16 +2236,20 @@ static int thin_ctr(struct dm_target *ti dm_put(pool_md); + mutex_unlock(&dm_thin_pool_table.mutex); + return 0; bad_thin_open: - pool_dec(tc->pool); + __pool_dec(tc->pool); bad_pool_lookup: dm_put(pool_md); bad_common: dm_put_device(ti, tc->pool_dev); bad_pool_dev: kfree(tc); +out_unlock: + mutex_unlock(&dm_thin_pool_table.mutex); return r; } @@ -2175,6 +2262,40 @@ static int thin_map(struct dm_target *ti return thin_bio_map(ti, bio, map_context); } +static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) +{ + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + bio_list_merge(&bios, master); + bio_list_init(master); + + while ((bio = bio_list_pop(&bios))) { + if (dm_get_mapinfo(bio)->ptr == tc) + bio_endio(bio, DM_ENDIO_REQUEUE); + else + bio_list_add(master, bio); + } +} + +static void requeue_io(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + __requeue_bio_list(tc, &pool->deferred_bios); + __requeue_bio_list(tc, &pool->retry_on_resume_list); + spin_unlock_irqrestore(&pool->lock, flags); +} + +static void thin_postsuspend(struct dm_target *ti) +{ + if (dm_noflush_suspending(ti)) + requeue_io((struct thin_c *)ti->private); +} + /* * */ @@ -2222,9 +2343,18 @@ static int thin_status(struct dm_target static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { + int r; + dm_block_t blocks; struct thin_c *tc = ti->private; - return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data); + r = dm_pool_get_data_dev_size(tc->pool->pmd, &blocks); + if (r) + return r; + + if (blocks) + return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); + + return 0; } static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -2242,6 +2372,7 @@ static struct target_type thin_target = .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, + .postsuspend = thin_postsuspend, .status = thin_status, .iterate_devices = thin_iterate_devices, .io_hints = thin_io_hints,