---
 drivers/md/multisnap/Kconfig               |   10 
 drivers/md/multisnap/Makefile              |    7 
 drivers/md/multisnap/dm-thinp-cct.c        |  249 +++++++++++
 drivers/md/multisnap/dm-thinp-delete.c     |  128 +++++
 drivers/md/multisnap/dm-thinp-directory.c  |  466 +++++++++++++++++++++
 drivers/md/multisnap/dm-thinp-radix.c      |  432 +++++++++++++++++++
 drivers/md/multisnap/dm-thinp-spacemap.c   |  627 +++++++++++++++++++++++++++++
 drivers/md/multisnap/dm-thinp-struct.c     |  141 ++++++
 drivers/md/multisnap/dm-thinp-struct.h     |  213 +++++++++
 drivers/md/multisnap/dm-thinp-unit-tests.c |  241 +++++++++++
 drivers/md/multisnap/dm-thinp.c            |  577 ++++++++++++++++++++++++++
 drivers/md/multisnap/dm-thinp.h            |  254 +++++++++++
 12 files changed, 3345 insertions(+)

Index: linux-3.9-rc8-fast/drivers/md/multisnap/Kconfig
===================================================================
--- linux-3.9-rc8-fast.orig/drivers/md/multisnap/Kconfig	2013-04-22 17:04:52.000000000 +0200
+++ linux-3.9-rc8-fast/drivers/md/multisnap/Kconfig	2013-04-22 17:04:52.000000000 +0200
@@ -35,3 +35,13 @@ config DM_MULTISNAPSHOT_DANIEL
 	  So far it doesn't support maintaining consistency across crashes;
 	  journaling is under development.
 
+config DM_MULTISNAPSHOT_THINP
+	tristate "Thin-provisioning store"
+	depends on DM_MULTISNAPSHOT
+	select DM_BUFIO
+	---help---
+	  Thin-provisioning snapshot store. This data store accepts no
+	  origin; all data are stored in the snapshot store.
+	  It allows unlimited snapshots and unlimited
+	  snapshots-of-snapshots.
+
Index: linux-3.9-rc8-fast/drivers/md/multisnap/Makefile
===================================================================
--- linux-3.9-rc8-fast.orig/drivers/md/multisnap/Makefile	2013-04-22 17:04:52.000000000 +0200
+++ linux-3.9-rc8-fast/drivers/md/multisnap/Makefile	2013-04-22 17:04:52.000000000 +0200
@@ -13,3 +13,10 @@ obj-$(CONFIG_DM_MULTISNAPSHOT_ROLLING) +
 dm-store-daniel-y	+= dm-multisnap-daniel.o
 obj-$(CONFIG_DM_MULTISNAPSHOT_DANIEL)	+= dm-store-daniel.o
+
+dm-store-thinp-y	+= dm-thinp.o dm-thinp-cct.o dm-thinp-delete.o \
+			   dm-thinp-directory.o dm-thinp-radix.o \
+			   dm-thinp-spacemap.o dm-thinp-struct.o \
+			   dm-thinp-unit-tests.o
+
+obj-$(CONFIG_DM_MULTISNAPSHOT_THINP)	+= dm-store-thinp.o
Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp.c	2013-04-22 21:18:17.000000000 +0200
@@ -0,0 +1,577 @@
+/*
+ * Copyright (C) 2011 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-thinp.h"
+
+/*
+ * Set various internal variables that depend on the metadata or data chunk size.
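+ *
+ * A worked example, for illustration only, assuming 4096-byte metadata
+ * chunks, 65536-byte data chunks and the default SPACEMAP_ENTRIES_BITS = 6
+ * and RADIX_TREE_ENTRIES_BITS = 5 (so both on-disk structures are 512 bytes):
+ *	metadata_chunk_shift = ffs(4096) - 1 = 12
+ *	data_div_metadata = 16, data_div_metadata_bits = 4
+ *	spacemaps_per_chunk_bits = radix_tree_nodes_per_chunk_bits
+ *		= ffs(4096 / 512) - 1 = 3
+ *	snapshot_directory_entries = (4096 - 16) / 32 = 127
+ *	chunks_per_bitmap_bits = 6 + 4 + 3 = 13, i.e. one spacemap chunk
+ *		covers 8192 metadata chunks
+ *	cct_n_chunks = 4 * 65536 / 4096 = 64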
+ */ +static void initialize_chunk_size(struct dm_exception_store *s) +{ + s->metadata_chunk_shift = ffs(s->metadata_chunk_size) - 1; + s->spacemaps_per_chunk_bits = ffs(s->metadata_chunk_size / + SPACEMAP_TOTAL_SIZE) - 1; + s->radix_tree_nodes_per_chunk_bits = ffs(s->metadata_chunk_size / + RADIX_TREE_NODE_TOTAL_SIZE) - 1; + + s->snapshot_directory_entries = + (s->metadata_chunk_size - sizeof(struct snapshot_directory)) / + sizeof(struct snapshot_directory_entry); + + s->data_div_metadata = s->data_chunk_size / s->metadata_chunk_size; + s->data_div_metadata_bits = ffs(s->data_div_metadata) - 1; + + s->radix_tree_node_mask = (1 << (s->radix_tree_nodes_per_chunk_bits + + RADIX_TREE_ENTRIES_BITS)) - 1; + + s->chunks_per_bitmap_bits = SPACEMAP_ENTRIES_BITS + + s->data_div_metadata_bits + s->spacemaps_per_chunk_bits; + + s->cct_n_chunks = 4 * CCT_ENTRIES / s->metadata_chunk_size; + if (!s->cct_n_chunks) + s->cct_n_chunks = 1; +} + +/* + * Select metadata chunk size. + */ +static int select_metadata_chunk_size(struct dm_exception_store *s) +{ + struct block_device *bdev = dm_multisnap_snapshot_bdev(s->dm); + + s->metadata_chunk_size = s->data_chunk_size / 16; + if (s->metadata_chunk_size < 1 << SECTOR_SHIFT) + s->metadata_chunk_size = 1 << SECTOR_SHIFT; + if (s->metadata_chunk_size < MIN_METACHUNK_SIZE) + s->metadata_chunk_size = MIN_METACHUNK_SIZE; + if (s->metadata_chunk_size < bdev_logical_block_size(bdev)) + s->metadata_chunk_size = bdev_logical_block_size(bdev); + +test_again: + initialize_chunk_size(s); + + if (1 << s->chunks_per_bitmap_bits <= s->cct_n_chunks) { + /* + * If crash count table doesn't fit between two bitmaps, + * increase chunk size. + */ + if (s->metadata_chunk_size == s->data_chunk_size) + return -ERANGE; + s->metadata_chunk_size <<= 1; + goto test_again; + } + + return 0; +} + +/* + * Get the number of metadata chunks. + */ +static int get_size(struct dm_exception_store *s, chunk_t *size) +{ + u64 dev_size; + + dev_size = i_size_read(dm_multisnap_snapshot_bdev(s->dm)->bd_inode) >> s->metadata_chunk_shift; + + if (dev_size > MAX_DEV_SIZE) + return -EFBIG; + + *size = dev_size; + + if (dev_size != *size) + return -EOPNOTSUPP; + + return 0; +} + +/* + * Test device size and possibly extend it. + */ +static void dm_thinp_lock_acquired(struct dm_exception_store *s, int flags) +{ + int r; + chunk_t new_size; + + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + + if (!dm_multisnap_can_commit(s->dm)) + return; + + r = get_size(s, &new_size); + if (unlikely(r)) + return; + + if (unlikely(new_size != s->size)) { + if (unlikely(new_size < s->size)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EINVAL, + ("%s: device shrank", __func__)); + return; + } + dm_thinp_spacemap_extend(s, new_size); + } +} + +/* + * Return space usage. + */ +static void dm_thinp_get_space(struct dm_exception_store *s, + unsigned long long *chunks_total, + unsigned long long *chunks_allocated, + unsigned long long *chunks_metadata_allocated) +{ + dm_multisnap_status_assert_locked(s->dm); + + *chunks_total = s->size >> s->data_div_metadata_bits; + *chunks_allocated = s->allocated[MODE_METADATA] + + s->allocated[MODE_DATA]; + *chunks_metadata_allocated = s->allocated[MODE_METADATA]; +} + +/* + * Initialize the whole exception store. 
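+ *
+ * The layout written below, in metadata chunks:
+ *	chunk 0				the superblock
+ *	chunk 1				the first spacemap
+ *	chunks 2 .. 1 + cct_n_chunks	the crash count table
+ *	chunk 2 + cct_n_chunks		the first snapshot directory block
+ * The used area is rounded up to a whole data chunk and accounted as
+ * allocated metadata.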
+ */ +static void initialize_device(struct dm_exception_store *s) +{ + struct dm_buffer *bp; + struct thinp_superblock *sb; + + chunk_t used_chunks; + chunk_t total_chunks; + + chunk_t device_size; + int r; + + r = get_size(s, &device_size); + if (r) { + DM_MULTISNAP_SET_ERROR(s->dm, r, + ("%s: device is too large, %s", + __func__, + r == -EOPNOTSUPP ? + "compile kernel with 64-bit sector numbers" : + "increase chunk size")); + return; + } + + /* Chunk 1 is for the spacemap */ + + s->cct_chunk = 2; + + s->directory_chunk = s->cct_chunk + s->cct_n_chunks; + + used_chunks = s->directory_chunk + 1; + + total_chunks = (used_chunks + s->data_div_metadata - 1) & + ~(chunk_t)(s->data_div_metadata - 1); + + if (total_chunks > device_size) { + DM_MULTISNAP_SET_ERROR(s->dm, -ENOSPC, + ("%s: device is too small", __func__)); + return; + } + + s->allocated[MODE_METADATA] = total_chunks >> s->data_div_metadata_bits; + s->allocated[MODE_DATA] = 0; + + dm_thinp_create_directory(s); + if (dm_multisnap_has_error(s->dm)) + return; + + dm_thinp_create_spacemap(s, used_chunks, total_chunks); + if (dm_multisnap_has_error(s->dm)) + return; + + dm_thinp_create_cct(s); + if (dm_multisnap_has_error(s->dm)) + return; + + if (dm_thinp_write_dirty_buffers(s)) + return; + + sb = dm_bufio_new(s->bufio, 0, &bp); + if (IS_ERR(sb)) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(sb), + ("%s: can't allocate cct at %llx", + __func__, (unsigned long long)0)); + return; + } + + sb->signature = SUPERBLOCK_SIGNATURE; + sb->metadata_chunk_size = cpu_to_le32(s->metadata_chunk_size); + sb->data_chunk_size = cpu_to_le32(s->data_chunk_size); + sb->spacemap_total_size = cpu_to_le32(SPACEMAP_TOTAL_SIZE); + sb->radix_tree_node_total_size = cpu_to_le32(RADIX_TREE_NODE_TOTAL_SIZE); + sb->cc = cpu_to_le16(s->cc); + sb->size[0] = cpu_to_le64(total_chunks); + cc_make_invalid(&sb->size_txc, &sb->size_cc); + write_48(sb, cct, s->cct_chunk); + write_48(sb, directory, s->directory_chunk); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (dm_thinp_write_dirty_buffers(s)) + return; +} + +/* + * Load existing exception store into memory. + */ +static void load_device(struct dm_exception_store *s) +{ + struct thinp_superblock *sb; + struct dm_buffer *bp; + u64 size; + + dm_thinp_load_cct(s); + if (dm_multisnap_has_error(s->dm)) + return; + + sb = dm_thinp_read_superblock(s, &bp); + if (!sb) + return; + + size = le64_to_cpu(sb->size[cc_valid(s, sb->size_txc, sb->size_cc)]); + + if (size & (s->data_div_metadata - 1)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: device size is not multiple of data chunk size", + __func__)); + return; + } + + if (size != (chunk_t)size) { + DM_MULTISNAP_SET_ERROR(s->dm, -ERANGE, + ("%s: device is too large. Compile kernel with 64-bit sector numbers", + __func__)); + return; + } + s->size = size; + + s->directory_chunk = read_48(sb, directory); + + dm_bufio_release(bp); + + dm_thinp_load_directory(s); + if (dm_multisnap_has_error(s->dm)) + return; + + dm_thinp_mount(s); +} + +/* + * Initialize one instance of the driver. 
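+ *
+ * The metadata chunk size is not known until the superblock has been read,
+ * so the constructor first creates a bufio client with the device's logical
+ * block size just to probe the superblock. If the first sector is all
+ * zeros, the device is formatted from scratch; otherwise the stored
+ * geometry is validated. Either way, the bufio client is then re-created
+ * with the real metadata chunk size.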
+ */ +static int dm_thinp_init(struct dm_multisnap *dm, + struct dm_exception_store **sp, + unsigned argc, char **argv, char **error) +{ + int r; + struct dm_exception_store *s; + struct block_device *bdev; + + struct thinp_superblock *sb; + struct dm_buffer *bp; + + s = vzalloc(sizeof(struct dm_exception_store)); + if (!s) { + *error = "Could not allocate private area"; + r = -ENOMEM; + goto bad_private; + } + *sp = s; + s->dm = dm; + + s->delete_work.work = dm_thinp_background_delete; + s->delete_work.queued = 0; + s->delete_commit_count = 0; + + dm_thinp_init_directory(s); + dm_thinp_init_spacemap(s); + + s->data_chunk_size = dm_multisnap_chunk_size(dm); + s->data_chunk_shift = ffs(s->data_chunk_size) - 1; + + if (s->data_chunk_size < MIN_METACHUNK_SIZE) { + *error = "Too small chunk size"; + r = -EINVAL; + goto bad_arguments; + } + + while (argc) { + char *string; + r = dm_multisnap_get_string(&argv, &argc, &string, error); + if (r) + goto bad_arguments; + /* + * Add test for future arguments here. + * Also, regenerate the arguments in the "status_table" + * callback. + */ + { + *error = "Unknown parameter"; + r = -EINVAL; + goto bad_arguments; + } + } + + bdev = dm_multisnap_snapshot_bdev(s->dm); + + s->bufio = dm_bufio_client_create(bdev, bdev_logical_block_size(bdev), + 1, 0, NULL, NULL); + if (IS_ERR(s->bufio)) { + *error = "Can't create bufio client"; + r = PTR_ERR(s->bufio); + goto bad_bufio; + } + + sb = dm_bufio_read(s->bufio, 0, &bp); + if (IS_ERR(sb)) { + *error = "Can't read superblock"; + r = PTR_ERR(sb); + goto bad_superblock; + } + + if (sb->signature != SUPERBLOCK_SIGNATURE) { + unsigned i; + for (i = 0; i < 1 << SECTOR_SHIFT; i++) { + if (((char *)sb)[i]) { + dm_bufio_release(bp); + *error = "Uninitialized device"; + r = -ENXIO; + goto bad_superblock; + } + } + dm_bufio_release(bp); + + r = select_metadata_chunk_size(s); + if (r) { + *error = "Chunk size too small"; + goto bad_superblock; + } + + dm_bufio_client_destroy(s->bufio); + s->bufio = dm_bufio_client_create(bdev, s->metadata_chunk_size, + 1, 0, NULL, NULL); + if (IS_ERR(s->bufio)) { + *error = "Can't create bufio client"; + r = PTR_ERR(s->bufio); + goto bad_bufio; + } + +#ifdef UNIT_TEST_BUFIO + r = dm_thinp_unit_test_bufio(s); + if (r) { + *error = "Bufio unit test failed"; + goto bad_initialize; + } +#endif + + initialize_device(s); + if (dm_multisnap_has_error(s->dm)) { + *error = "Can't initialize device"; + r = dm_multisnap_has_error(s->dm); + goto bad_initialize; + } + } else { + if (le32_to_cpu(sb->spacemap_total_size) != SPACEMAP_TOTAL_SIZE) { + dm_bufio_release(bp); + *error = "Bad spacemap size"; + r = -EINVAL; + goto bad_superblock; + } + if (le32_to_cpu(sb->radix_tree_node_total_size) != RADIX_TREE_NODE_TOTAL_SIZE) { + dm_bufio_release(bp); + *error = "Bad radix tree node size"; + r = -EINVAL; + goto bad_superblock; + } + if (le32_to_cpu(sb->data_chunk_size) != s->data_chunk_size) { + dm_bufio_release(bp); + *error = "Bad data chunk size"; + r = -EINVAL; + goto bad_superblock; + } + s->metadata_chunk_size = le32_to_cpu(sb->metadata_chunk_size); + s->cc = le16_to_cpu(sb->cc); + s->cct_chunk = read_48(sb, cct); + + dm_bufio_release(bp); + + if (s->metadata_chunk_size & (s->metadata_chunk_size - 1) || + s->metadata_chunk_size > s->data_chunk_size || + s->metadata_chunk_size < 1 << SECTOR_SHIFT || + s->metadata_chunk_size < s->data_chunk_size / 16) { + *error = "Bad metadata chunk size"; + r = -EFSERROR; + goto bad_superblock; + } + + if (s->metadata_chunk_size < bdev_logical_block_size(bdev)) { + 
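+			/*
+			 * The stored metadata chunk size must cover at least
+			 * one logical block; dm-bufio cannot transfer smaller
+			 * units than the device can address.
+			 */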
*error = "Metadata chunk size is smaller than device block size"; + r = -EINVAL; + goto bad_superblock; + } + + initialize_chunk_size(s); + + dm_bufio_client_destroy(s->bufio); + s->bufio = dm_bufio_client_create(bdev, s->metadata_chunk_size, + 1, 0, NULL, NULL); + if (IS_ERR(s->bufio)) { + *error = "Can't create bufio client"; + r = PTR_ERR(s->bufio); + goto bad_bufio; + } + } + + s->node_to_clone = vmalloc(s->metadata_chunk_size); + if (!s->node_to_clone) { + *error = "Can't allocate internal structure"; + r = -ENOMEM; + goto bad_initialize; + } + + load_device(s); + if (dm_multisnap_has_error(s->dm)) { + *error = "Can't load exception store"; + r = dm_multisnap_has_error(s->dm); + goto bad_load; + } + + /* Extend the store */ + dm_thinp_lock_acquired(s, 0); + +#ifdef UNIT_TEST_SPACEMAP + dm_thinp_unit_test_spacemap(s); + if (dm_multisnap_has_error(s->dm)) { + *error = "Spacemap unit test failed"; + r = dm_multisnap_has_error(s->dm); + goto bad_load; + } +#endif + + if (!hlist_empty(&s->deleting_snapshots)) + dm_multisnap_queue_work(s->dm, &s->delete_work); + + return 0; + +bad_load: + vfree(s->node_to_clone); +bad_initialize: +bad_superblock: + dm_bufio_client_destroy(s->bufio); +bad_bufio: +bad_arguments: + dm_thinp_done_spacemap(s); + dm_thinp_done_directory(s); + vfree(s); +bad_private: + return r; +} + +/* + * Free one instance of the driver. + */ +static void dm_thinp_exit(struct dm_exception_store *s) +{ + dm_multisnap_cancel_work(s->dm, &s->delete_work); + +#ifdef UNIT_TEST_SPACEMAP + dm_thinp_unit_test_spacemap(s); +#endif + + dm_thinp_unmount(s); + vfree(s->node_to_clone); + dm_bufio_client_destroy(s->bufio); + dm_thinp_done_spacemap(s); + dm_thinp_done_directory(s); + vfree(s); +} + +/* + * This function is optional. It flushes all buffers prior to commit, so that + * most metadata writes are not done under the lock. + * + * This function is not run under the lock, so it may race with anything. + */ +static void dm_thinp_prepare_for_commit(struct dm_exception_store *s) +{ + dm_thinp_write_dirty_buffers(s); +} + +static struct dm_multisnap_exception_store dm_thinp_store = { + .name = "thinp", + .module = THIS_MODULE, + .init_exception_store = dm_thinp_init, + .exit_exception_store = dm_thinp_exit, + .store_lock_acquired = dm_thinp_lock_acquired, + .get_space = dm_thinp_get_space, + .allocate_snapid = dm_thinp_allocate_snapid, + .create_snapshot = dm_thinp_create_snapshot, + .delete_snapshot = dm_thinp_delete_snapshot, + .get_next_snapid = dm_thinp_get_next_snapid, + .find_snapshot_chunk = dm_thinp_find_snapshot_chunk, + .add_next_remap = dm_thinp_make_chunk_writeable, + .make_chunk_writeable = dm_thinp_make_chunk_writeable, + .check_conflict = dm_thinp_check_conflict, + .prepare_for_commit = dm_thinp_prepare_for_commit, + .commit = dm_thinp_commit, +}; + +/* + * Init the whole module. 
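+ *
+ * The sizeof() self-checks below refuse to register the exception store if
+ * the compiler produced a structure layout that disagrees with the on-disk
+ * constants (e.g. because of unexpected padding).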
+ */ +static int __init dm_multisnapshot_thinp_module_init(void) +{ + int r; + + r = 0; + if (sizeof(struct spacemap) != SPACEMAP_TOTAL_SIZE) { + DMERR("thinp module miscompiled, sizeof(struct spacemap): %lx", + (unsigned long)sizeof(struct spacemap)); + r = -EINVAL; + } + if (sizeof(struct thin_radix_tree_node) != RADIX_TREE_NODE_TOTAL_SIZE) { + DMERR("thinp module miscompiled, sizeof(struct thin_radix_tree_node): %lx", + (unsigned long)sizeof(struct thin_radix_tree_node)); + r = -EINVAL; + } + if (r) + goto bad_self_check; + + r = dm_multisnap_register_exception_store(&dm_thinp_store); + if (r) + goto cant_register; + + return 0; + +cant_register: +bad_self_check: + return r; +} + +/* + * Exit the whole module. + */ +static void __exit dm_multisnapshot_thinp_module_exit(void) +{ + dm_multisnap_unregister_exception_store(&dm_thinp_store); +} + +size_t bla(void) +{ + return RADIX_TREE_NODE_TOTAL_SIZE - 32 - 2 * RADIX_TREE_ENTRIES * sizeof(struct radix_tree_pointer); +} + +module_init(dm_multisnapshot_thinp_module_init); +module_exit(dm_multisnapshot_thinp_module_exit); + +MODULE_DESCRIPTION(DM_NAME " thin-provisioning exceptions store"); +MODULE_AUTHOR("Mikulas Patocka"); +MODULE_LICENSE("GPL"); Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp.h 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#ifndef DM_MULTISNAP_THINP_H +#define DM_MULTISNAP_THINP_H + +/* internal checks */ +#define DM_THINP_CHECKING + +#include "dm-multisnap.h" +#include "dm-thinp-struct.h" + +#include "../dm-bufio.h" + +#include +#include +#include +#include + +#define SNAPSHOT_HASH_SIZE 32768 + +#define ALLOC_FLAGS_MODE 1 +#define MODE_METADATA 0 +#define MODE_DATA 1 +#define ALLOC_FLAGS_METADATA_FULL 2 + +struct spacemap_stats { + char valid; + chunk_t free[2]; /* indexed by MODE_ */ +}; + +struct dm_exception_store { + struct dm_multisnap *dm; + struct dm_bufio_client *bufio; + + unsigned metadata_chunk_size; + unsigned char metadata_chunk_shift; + + unsigned data_chunk_size; + unsigned char data_chunk_shift; + + unsigned char data_div_metadata; + unsigned char data_div_metadata_bits; + + unsigned snapshot_directory_entries; + + unsigned char spacemaps_per_chunk_bits; + unsigned char radix_tree_nodes_per_chunk_bits; + unsigned radix_tree_node_mask; + + unsigned char chunks_per_bitmap_bits; + unsigned cct_n_chunks; + + chunk_t size; + chunk_t allocated[2]; /* Indexed by MODE_* */ + + u16 cc; + u32 txc; + chunk_t cct_chunk; + + struct snapshot_entry *query_snapshot; + chunk_t query_chunk; + + chunk_t directory_chunk; + + unsigned min_free_snapid; + unsigned max_used_snapid; + + struct spacemap_stats *spacemap_stats; + + void *node_to_clone; + + struct hlist_head deleting_snapshots; + struct dm_multisnap_background_work delete_work; + unsigned delete_commit_count; + + u32 cct[CCT_ENTRIES]; + + struct hlist_head snapshots[SNAPSHOT_HASH_SIZE]; +}; + +#define read_48(struc, entry) (le32_to_cpu((struc)->entry##1) |\ + ((chunk_t)le16_to_cpu((struc)->entry##2) << 31 << 1)) + +#define write_48(struc, entry, val) do { (struc)->entry##1 = cpu_to_le32(val); \ + (struc)->entry##2 = cpu_to_le16((chunk_t)(val) >> 31 >> 1); } while (0) + +static inline void write_radix(struct radix_tree_pointer *rp, chunk_t chunk) +{ + rp->ptr1 
= cpu_to_le16(chunk); + rp->ptr2 = cpu_to_le16(chunk >> 16); + rp->ptr3 = cpu_to_le16(chunk >> 31 >> 1); +} + +static inline chunk_t read_radix(struct radix_tree_pointer *rp) +{ + return le16_to_cpu(rp->ptr1) | + ((u32)le16_to_cpu(rp->ptr2) << 16) | + ((chunk_t)le16_to_cpu(rp->ptr3) << 31 << 1); +} + +/* dm-thinp-cct.c */ + +static inline void cc_make_invalid(__le32 *txc, __le16 *cc) +{ + *txc = cpu_to_le32(0x80000000); + *cc = cpu_to_le16(CCT_AUX_ENTRIES); +} + +static inline int cc_valid(struct dm_exception_store *s, __le32 txc_, __le16 cc_) +{ + u32 txc = le32_to_cpu(txc_); + u16 cc = le16_to_cpu(cc_); + return (s32)(s->cct[cc] - txc) >= 0; +} + +static inline int cc_current(struct dm_exception_store *s, __le32 txc_, __le16 cc_) +{ + u32 txc = le32_to_cpu(txc_); + u16 cc = le16_to_cpu(cc_); + return !((cc ^ s->cc) | ((txc ^ s->txc) & 0x7fffffff)); +} + +static inline void cc_set_current(struct dm_exception_store *s, + __le32 *txc, __le16 *cc) +{ + *txc = cpu_to_le32((cc_valid(s, *txc, *cc) << 31) | s->txc); + *cc = cpu_to_le16(s->cc); +} + +static inline void cc_set_current_valid(struct dm_exception_store *s, + __le32 *txc, __le16 *cc) +{ + *txc = cpu_to_le32(s->txc); + *cc = cpu_to_le16(s->cc); +} + +void dm_thinp_create_cct(struct dm_exception_store *s); +void dm_thinp_load_cct(struct dm_exception_store *s); + +void dm_thinp_mount(struct dm_exception_store *s); +void dm_thinp_unmount(struct dm_exception_store *s); +void dm_thinp_commit(struct dm_exception_store *s); + +/* dm-thinp-delete.c */ + +void dm_thinp_background_delete(struct dm_exception_store *s, + struct dm_multisnap_background_work *bw); + +/* dm-thinp-directory.c */ + +struct snapshot_entry { + struct hlist_node e; + chunk_t root; + unsigned char depth; + unsigned char tag; + u32 snapid; + + chunk_t directory_chunk; + unsigned directory_offset; +}; + +void dm_thinp_init_directory(struct dm_exception_store *s); +void dm_thinp_done_directory(struct dm_exception_store *s); + +struct snapshot_entry *dm_thinp_find_snapshot_entry_must_succeed( + struct dm_exception_store *s, u32 snapid); + +chunk_t dm_thinp_snapshot_pivot(struct dm_exception_store *s, u32 snapid); + +void dm_thinp_create_directory(struct dm_exception_store *s); +void dm_thinp_load_directory(struct dm_exception_store *s); + +void dm_thinp_update_directory_entry(struct dm_exception_store *s, + struct snapshot_entry *se); + +int dm_thinp_allocate_snapid(struct dm_exception_store *s, snapid_t *snapid, + int snap_of_snap, snapid_t master); +int dm_thinp_create_snapshot(struct dm_exception_store *s, snapid_t snapid, + int snap_of_snap, snapid_t master); +int dm_thinp_delete_snapshot(struct dm_exception_store *s, snapid_t snapid); +snapid_t dm_thinp_get_next_snapid(struct dm_exception_store *s, + snapid_t snapid); +void dm_thinp_free_snapshot(struct dm_exception_store *s, + struct snapshot_entry *se); + +/* dm-thinp-radix.c */ + +void dm_thinp_radix_tree_node_ptr_set_refcount(struct dm_exception_store *s, + struct thin_radix_tree_node *rn, + int adjust); +chunk_t dm_thinp_alloc_radix_tree_node(struct dm_exception_store *s, + chunk_t goal, u8 depth, + chunk_t init_pointer); +void dm_thinp_radix_tree_set_refcount(struct dm_exception_store *s, + chunk_t chunk, u8 depth, int adjust); +void dm_thinp_radix_tree_node_set_pointer(struct dm_exception_store *s, + struct snapshot_entry *snapshot, + chunk_t node, unsigned offset, + u8 depth, chunk_t new_node); + +int dm_thinp_find_snapshot_chunk(struct dm_exception_store *s, snapid_t snapid, + chunk_t chunk, int write, chunk_t 
*result); +void dm_thinp_make_chunk_writeable(struct dm_exception_store *s, + union chunk_descriptor *cd, + chunk_t *new_chunk); +int dm_thinp_check_conflict(struct dm_exception_store *s, + union chunk_descriptor *cd, snapid_t snapid); + +/* dm-thinp-spacemap.c */ + +void dm_thinp_create_spacemap(struct dm_exception_store *s, + chunk_t free_from, chunk_t free_to); +void dm_thinp_spacemap_extend(struct dm_exception_store *s, chunk_t new_size); +chunk_t dm_thinp_spacemap_alloc_metadata(struct dm_exception_store *s, + chunk_t goal); +chunk_t dm_thinp_spacemap_alloc_data(struct dm_exception_store *s, + chunk_t goal); +int dm_thinp_spacemap_get_set_refcount(struct dm_exception_store *s, + chunk_t chunk, int adjust); +void dm_thinp_spacemap_free_metadata(struct dm_exception_store *s, + chunk_t chunk); + +void dm_thinp_init_spacemap(struct dm_exception_store *s); +void dm_thinp_done_spacemap(struct dm_exception_store *s); + +/* dm-thinp-struct.c */ + +struct thinp_superblock *dm_thinp_read_superblock(struct dm_exception_store *s, + struct dm_buffer **bp); +struct spacemap *dm_thinp_read_spacemap(struct dm_exception_store *s, + chunk_t ptr, struct dm_buffer **bp); +struct snapshot_directory *dm_thinp_read_directory(struct dm_exception_store *s, + chunk_t ptr, + struct dm_buffer **bp); +struct thin_radix_tree_node *dm_thinp_read_radix_tree_node( + struct dm_exception_store *s, + chunk_t ptr, u8 depth, struct dm_buffer **bp); +int dm_thinp_write_dirty_buffers(struct dm_exception_store *s); + +/* dm-thinp-unit-tests.c */ + +/*#define UNIT_TEST_BUFIO*/ +#define UNIT_TEST_SPACEMAP + +int dm_thinp_unit_test_bufio(struct dm_exception_store *s); +void dm_thinp_unit_test_spacemap(struct dm_exception_store *s); + +#endif Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-unit-tests.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-unit-tests.c 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,241 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
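+ *
+ * This file holds two optional self-tests: a dm-bufio read/write pattern
+ * test (UNIT_TEST_BUFIO) and a consistency check of the space maps
+ * (UNIT_TEST_SPACEMAP). Both are switched on and off in dm-thinp.h.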
+ */
+
+#include "dm-thinp.h"
+
+#ifdef UNIT_TEST_BUFIO
+
+typedef unsigned seed_t;
+
+static void set_random(seed_t *seed, int x)
+{
+	if (!x)
+		x = 1234567;
+	*seed = x;
+}
+
+static unsigned get_random(seed_t *seed)
+{
+	return *seed = 36969 * (*seed & 65535) + (*seed >> 16);
+}
+
+#define BUFIO_MAX_CHUNKS	1000
+
+static int bufio_pattern(struct dm_exception_store *s, int generate, int pass, chunk_t chunk)
+{
+	int r;
+	unsigned i;
+	seed_t pattern;
+	struct dm_buffer *bp;
+	unsigned char *block;
+
+	if (generate && chunk & 1)
+		block = dm_bufio_new(s->bufio, chunk, &bp);
+	else
+		block = dm_bufio_read(s->bufio, chunk, &bp);
+	if (IS_ERR(block)) {
+		printk("dm_bufio_read failed (%Lx,%d) !\n",
+		       (unsigned long long)chunk, pass);
+		return PTR_ERR(block);
+	}
+	set_random(&pattern, chunk + (pass << 24));
+	r = 0;
+	for (i = 0; i < s->metadata_chunk_size; i++) {
+		if (generate) {
+			block[i] = get_random(&pattern);
+		} else {
+			if (block[i] != (unsigned char)get_random(&pattern)) {
+				printk("dm_bufio pattern failure at %Lx/%x !\n",
+				       (unsigned long long)chunk, i);
+				r = -EILSEQ;
+				break;
+			}
+		}
+	}
+	if (generate)
+		dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	return r;
+}
+
+int dm_thinp_unit_test_bufio(struct dm_exception_store *s)
+{
+	int r = 0;
+	chunk_t *array;
+	chunk_t n_chunks = i_size_read(dm_multisnap_snapshot_bdev(s->dm)->bd_inode) >> s->metadata_chunk_shift;
+	int pass;
+	printk("unit test: bufio\n");
+	if (!n_chunks) {
+		printk("empty device\n");
+		return 0;
+	}
+
+	array = kmalloc(sizeof(chunk_t) * BUFIO_MAX_CHUNKS, GFP_KERNEL);
+	if (!array) {
+		printk("alloc failed !\n");
+		return -ENOMEM;
+	}
+
+	for (pass = 0; pass < 2; pass++) {
+		seed_t seed;
+		unsigned n, limit = min((chunk_t)BUFIO_MAX_CHUNKS, n_chunks);
+		set_random(&seed, 1);
+		for (n = 0; n < limit; n++) {
+			unsigned i;
+			chunk_t chunk = get_random(&seed) % n_chunks;
+test_chunk_again:
+			for (i = 0; i < n; i++) {
+				cond_resched();
+				if (array[i] == chunk) {
+					chunk = (chunk + 1) % n_chunks;
+					goto test_chunk_again;
+				}
+			}
+			array[n] = chunk;
+			/*printk("testing: %u - %Lx\n", n, (unsigned long long)chunk);*/
+			r = bufio_pattern(s, 1, pass, chunk);
+			if (r)
+				goto failed;
+			i = get_random(&seed) % (n + 1);
+			r = bufio_pattern(s, 0, pass, array[i]);
+			if (r)
+				goto failed;
+			if (pass == 1) {
+				if (n == limit / 4) {
+					dm_bufio_drop_buffers(s->bufio);
+				}
+				if (n == limit / 2) {
+					r = dm_bufio_write_dirty_buffers(s->bufio);
+					if (r) {
+						printk("dm_bufio_write_dirty_buffers failed !\n");
+						goto failed;
+					}
+				}
+				if (n == limit * 3 / 4) {
+					dm_bufio_write_dirty_buffers_async(s->bufio);
+				}
+			}
+		}
+		for (n = 0; n < limit; n++) {
+			r = bufio_pattern(s, 0, pass, array[n]);
+			if (r)
+				goto failed;
+		}
+	}
+
+failed:
+	kfree(array);
+
+	if (r)
+		printk("TEST FAILED\n");
+
+	return r;
+}
+
+#endif
+
+
+#ifdef UNIT_TEST_SPACEMAP
+
+void dm_thinp_unit_test_spacemap(struct dm_exception_store *s)
+{
+	chunk_t chunk;
+	chunk_t metadata_allocated = 0;
+	chunk_t data_allocated = 0;
+
+	printk("unit test: spacemap\n");
+
+	for (chunk = 0; chunk < s->size; chunk += ((chunk_t)1 << s->chunks_per_bitmap_bits)) {
+		unsigned idx = chunk >> s->chunks_per_bitmap_bits;
+		chunk_t spacemap_chunk = !chunk ? 1 : chunk;
+		struct spacemap *sm;
+		struct dm_buffer *bp;
+		unsigned i, j;
+		chunk_t stats[2] = { 0, 0 };
+
+		sm = dm_thinp_read_spacemap(s, spacemap_chunk, &bp);
+		for (i = 0; i < 1 << s->spacemaps_per_chunk_bits; i++) {
+			struct spacemap *sm2 = sm + i;
+			unsigned version = cc_valid(s, sm2->txc, sm2->cc);
+			struct spacemap_store *st = &sm2->spacemap[version];
+			for (j = 0; j < SPACEMAP_ENTRIES; j++) {
+				u16 map = le16_to_cpu(st->map[j]);
+				chunk_t self;
+				self = chunk | ((((chunk_t)i << SPACEMAP_ENTRIES_BITS) | j) << s->data_div_metadata_bits);
+				if (self >= s->size && st->tags[j] != SPACEMAP_TAG_METADATA_FULL) {
+					DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+						("%s: free space beyond end: %llx >= %llx, tag %02x, map %04x",
+						__func__,
+						(unsigned long long)self,
+						(unsigned long long)s->size,
+						st->tags[j],
+						map));
+				}
+				switch (st->tags[j]) {
+				case SPACEMAP_TAG_FREE:
+					if (map) {
+invalid:
+						DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+							("%s: invalid entry at %llx/%x/%x: tag %02x, map %04x",
+							__func__,
+							(unsigned long long)spacemap_chunk,
+							i, j,
+							st->tags[j],
+							map));
+					}
+					stats[MODE_METADATA]++;
+					stats[MODE_DATA]++;
+					break;
+				case SPACEMAP_TAG_METADATA_PARTIAL:
+					metadata_allocated++;
+					if (map & -(1 << s->data_div_metadata))
+						goto invalid;
+					stats[MODE_METADATA]++;
+					break;
+				case SPACEMAP_TAG_METADATA_FULL:
+					if (self < s->size)
+						metadata_allocated++;
+					if (map != ((1 << s->data_div_metadata) - 1))
+						goto invalid;
+					break;
+				default:
+					data_allocated++;
+					if (!st->tags[j] && !map)
+						goto invalid;
+					break;
+				}
+			}
+		}
+		dm_bufio_release(bp);
+		if (s->spacemap_stats && s->spacemap_stats[idx].valid) {
+			if (stats[0] != s->spacemap_stats[idx].free[0] ||
+			    stats[1] != s->spacemap_stats[idx].free[1]) {
+				DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+					("%s: botched stats: %llx != %llx || %llx != %llx",
+					__func__,
+					(unsigned long long)stats[0],
+					(unsigned long long)s->spacemap_stats[idx].free[0],
+					(unsigned long long)stats[1],
+					(unsigned long long)s->spacemap_stats[idx].free[1]));
+			}
+		}
+	}
+
+	if (metadata_allocated != s->allocated[MODE_METADATA] ||
+	    data_allocated != s->allocated[MODE_DATA]) {
+		DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+			("%s: blocks miscounted: %llx != %llx || %llx != %llx",
+			__func__,
+			(unsigned long long)metadata_allocated,
+			(unsigned long long)s->allocated[MODE_METADATA],
+			(unsigned long long)data_allocated,
+			(unsigned long long)s->allocated[MODE_DATA]));
+	}
+}
+
+#endif
Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-struct.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-struct.h	2013-04-23 00:20:45.000000000 +0200
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2011 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_THINP_STRUCT_H
+#define DM_MULTISNAP_THINP_STRUCT_H
+
+#include
+#include
+
+/*
+ * Description of on-disk format
+ *
+ * The device is composed of metadata chunks and data chunks. Metadata chunks
+ * are smaller than or equal to data chunks:
+ *
+ * data_chunk_size / 16 <= metadata_chunk_size <= data_chunk_size
+ *
+ * Crash counts are used to maintain consistency. All consistency-related
+ * structures have two versions and a txc/cc pair to determine which one is
+ * valid.
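+ *
+ * A sketch of the validity test (this mirrors cc_valid() in dm-thinp.h):
+ * the (txc, cc) pair stored next to the two versions is compared against
+ * the in-memory crash count table:
+ *
+ *	valid_version = (s32)(s->cct[cc] - txc) >= 0;
+ *
+ * A version is valid once the transaction count recorded for its crash
+ * count has reached the version's txc; an update that was not committed
+ * before a crash therefore falls back to the older version automatically.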
+ *
+ * The device begins with the superblock (struct thinp_superblock). The
+ * superblock contains the chunk sizes, pointers to the crash count table and
+ * the snapshot directory, and other general information.
+ *
+ * Snapshots are stored in the snapshot directory. The snapshot directory is a
+ * linear sequence of blocks (struct snapshot_directory); each block contains
+ * several structures (struct snapshot_directory_entry), each of which
+ * describes one snapshot.
+ * SNAPSHOT_TAG_FREE --- the structure is free and doesn't contain any snapshot.
+ * SNAPSHOT_TAG_ACTIVE --- the structure contains information about an existing
+ * snapshot.
+ * SNAPSHOT_TAG_DELETED --- the snapshot was deleted, but not yet deallocated.
+ * The driver walks all SNAPSHOT_TAG_DELETED snapshots in the background and
+ * deallocates their space. Once the space is completely deallocated, the
+ * structure moves to the SNAPSHOT_TAG_FREE state.
+ *
+ * A radix tree is used to map snapshot data. Each radix tree node is stored in
+ * "struct thin_radix_tree_node" structures. If the metadata chunk size is greater
+ * than sizeof(struct thin_radix_tree_node), multiple consecutive "thin_radix_tree_node"
+ * structures are used to describe one radix tree node. The snapshot directory
+ * contains a pointer to the root radix tree node.
+ *
+ * Each radix tree node has a reference count. When we take a snapshot of a
+ * snapshot, the reference count of the root radix tree node is increased and
+ * both snapshots point to this node. A radix tree node is writeable only if its
+ * reference count is 1. If its reference count is greater than 1, the node must
+ * be copied before writing.
+ *
+ * The free space map is described by "struct spacemap" structures. One or more
+ * of these structures are packed into one metadata block. The structures are at
+ * fixed locations. Each structure has the arrays "tags" and "map"; each data
+ * chunk is described by exactly one entry in the "tags" and "map" arrays. For
+ * data chunks, these arrays hold a reference count; for the smaller metadata
+ * chunks, they hold a bitmap of used metadata chunks.
+ * The following combinations are possible:
+ * tag == SPACEMAP_TAG_FREE, map == 0 --- the whole data chunk is free.
+ * tag == SPACEMAP_TAG_METADATA_PARTIAL --- the data chunk keeps metadata and is
+ * partially free; a bitmap of free and used metadata is stored in the "map" entry.
+ * tag == SPACEMAP_TAG_METADATA_FULL --- the data chunk keeps metadata and it is
+ * full.
+ * tag == other values --- the data chunk keeps data and has reference count
+ * (tag << 16) + map.
+ */
+
+/* Tunable. Minimum is 6 */
+#define SPACEMAP_ENTRIES_BITS 6
+
+/* Tunable. Minimum is 5 */
+#define RADIX_TREE_ENTRIES_BITS 5
+
+#define MAX_DEV_SIZE (1ULL << 48)
+
+#define SNAPSHOT_TAG_FREE 0
+#define SNAPSHOT_TAG_ACTIVE 1
+#define SNAPSHOT_TAG_DELETED 2
+
+struct snapshot_root {
+	__le32 root1;
+	__le16 root2;
+	__u8 depth;
+	__u8 tag; /* SNAPSHOT_TAG_* */
+	__le32 snapid;
+};
+
+struct snapshot_directory_entry {
+	struct snapshot_root root[2];
+	__le32 txc;
+	__le16 cc;
+	__u8 pad[2];
+};
+
+#define SNAPSHOT_DIRECTORY_SIGNATURE cpu_to_be32(0xFFFF5344)
+
+struct snapshot_directory {
+	__be32 signature;
+	__le32 next1;
+	__le16 next2;
+	__le16 next_cc;
+	__le32 next_txc;
+	struct snapshot_directory_entry entry[0];
+};
+
+struct radix_tree_pointer {
+	__le16 ptr1;
+	__le16 ptr2;
+	__le16 ptr3;
+} __attribute__((packed));
+
+#define RADIX_TREE_NODE_SIGNATURE cpu_to_be32(0xFFFF524E)
+
+#define RADIX_TREE_ENTRIES (1 << RADIX_TREE_ENTRIES_BITS)
+#define RADIX_TREE_NODE_TOTAL_SIZE (16 << RADIX_TREE_ENTRIES_BITS)
+
+struct thin_radix_tree_node {
+	__be32 signature;
+
+	/* refcount is used only on the first node in a metadata chunk */
+	__le32 refcount[2];
+	__le32 refcount_txc;
+	__le16 refcount_cc;
+
+	__le16 pointers_cc;
+	__le32 pointers_txc;
+	__le32 self1;
+	__le16 self2;
+	__u8 depth;
+	__u8 pad1;
+	struct radix_tree_pointer pointers[2][RADIX_TREE_ENTRIES];
+
+	__u8 pad[RADIX_TREE_NODE_TOTAL_SIZE - 32 - 2 * RADIX_TREE_ENTRIES * sizeof(struct radix_tree_pointer)];
+};
+
+#define SPACEMAP_SIGNATURE cpu_to_be32(0xFFFF534D)
+
+#define SPACEMAP_ENTRIES (1 << SPACEMAP_ENTRIES_BITS)
+#define SPACEMAP_TOTAL_SIZE (8 << SPACEMAP_ENTRIES_BITS)
+
+struct spacemap_store {
+	/* SPACEMAP_TAG_*, or data refcount bits 16-23 */
+	__u8 tags[SPACEMAP_ENTRIES];
+
+	/* metadata bitmap or data refcount bits 0-15 */
+	__le16 map[SPACEMAP_ENTRIES];
+};
+
+#define SPACEMAP_TAG_FREE 0xff
+#define SPACEMAP_TAG_METADATA_PARTIAL 0xfe
+#define SPACEMAP_TAG_METADATA_FULL 0xfd
+#define SPACEMAP_TAG_REFCOUNT 0xfc /* 0 ... 0xfc */
+
+#define MAX_SNAPSHOTS 0xfcffff
+
+struct spacemap {
+	__be32 signature;
+	__le32 txc;
+	__le16 cc;
+	__u8 pad[6];
+	struct spacemap_store spacemap[2];
+	__u8 pad2[SPACEMAP_TOTAL_SIZE - 16 - 2 * sizeof(struct spacemap_store)];
+};
+
+#define CCT_SIGNATURE cpu_to_be32(0xFFFF4343)
+
+#define CCT_ENTRIES 65536
+
+#define CCT_AUX_STRIDE (512 / 4)
+#define CCT_AUX_ENTRIES ((sizeof(struct cct_aux) + 3) / 4)
+
+/* Auxiliary information stored in every 512 bytes of the crash count table */
+
+struct cct_aux {
+	__be32 signature;
+	__le32 metadata_allocated1;
+	__le16 metadata_allocated2;
+	__le16 data_allocated2;
+	__le32 data_allocated1;
+};
+
+#define SUPERBLOCK_SIGNATURE cpu_to_be32(0xFFFF5342)
+
+struct thinp_superblock {
+	__be32 signature;
+
+	__le32 error;
+	__le32 metadata_chunk_size;
+	__le32 data_chunk_size;
+
+	/* depends on SPACEMAP_ENTRIES_BITS */
+	__le32 spacemap_total_size;
+	/* depends on RADIX_TREE_ENTRIES_BITS */
+	__le32 radix_tree_node_total_size;
+
+	__le16 cc;
+	__le16 size_cc;
+	__le32 size_txc;
+
+	__le64 size[2];
+
+	__le32 cct1;
+	__le16 cct2;
+	__le16 directory2;
+	__le32 directory1;
+};
+
+#define MIN_METACHUNK_SIZE max(SPACEMAP_TOTAL_SIZE, RADIX_TREE_NODE_TOTAL_SIZE)
+
+#endif
Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-directory.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-directory.c	2013-04-22 17:04:52.000000000 +0200
@@ -0,0 +1,466 @@
+/*
+ * Copyright (C) 2011 Red Hat Czech, s.r.o.
+ * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-thinp.h" + +static unsigned snapid_hash(u32 snapid) +{ + return snapid & (SNAPSHOT_HASH_SIZE - 1); +} + +static struct snapshot_entry *dm_thinp_find_snapshot_entry( + struct dm_exception_store *s, u32 snapid) +{ + struct snapshot_entry *se; + + hlist_for_each_entry(se, &s->snapshots[snapid_hash(snapid)], e) + if (se->snapid == snapid) + return se; + + return NULL; +} + +struct snapshot_entry *dm_thinp_find_snapshot_entry_must_succeed( + struct dm_exception_store *s, u32 snapid) +{ + struct snapshot_entry *snapshot; + + snapshot = dm_thinp_find_snapshot_entry(s, snapid); + if (!snapshot) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: snapshot %x not found", + __func__, (unsigned)snapid)); + return NULL; + } + + return snapshot; +} + +/* + * Distribute snapshots across the device. + * + * 0 4 2 6 1 5 3 7 + */ +chunk_t dm_thinp_snapshot_pivot(struct dm_exception_store *s, u32 snapid) +{ + int bits; + chunk_t goal; + if (!snapid) { + goal = 0; + goto ret; + } + bits = fls(snapid); + snapid = bitrev32(snapid) >> (32 - bits); + goal = (s->size >> bits) * snapid; +ret: + /*printk("pivot for %x: %lld (%lld)\n", snapid, goal, s->size);*/ + return goal; +} + +static void dm_thinp_create_directory_at(struct dm_exception_store *s, + chunk_t directory_chunk) +{ + struct dm_buffer *bp; + struct snapshot_directory *dir; + + dir = dm_bufio_new(s->bufio, directory_chunk, &bp); + if (IS_ERR(dir)) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(dir), + ("%s: can't allocate snapshot directory at %llx", + __func__, (unsigned long long)directory_chunk)); + return; + } + memset(dir, 0, s->metadata_chunk_size); + + dir->signature = SNAPSHOT_DIRECTORY_SIGNATURE; + cc_make_invalid(&dir->next_txc, &dir->next_cc); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +void dm_thinp_create_directory(struct dm_exception_store *s) +{ + dm_thinp_create_directory_at(s, s->directory_chunk); +} + +static void create_snapshot_entry(struct dm_exception_store *s, + chunk_t directory_chunk, + unsigned directory_offset, + struct snapshot_root *sr) +{ + struct snapshot_entry *se; + u32 snapid = le32_to_cpu(sr->snapid); + + if (sr->tag != SNAPSHOT_TAG_ACTIVE && sr->tag != SNAPSHOT_TAG_DELETED) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: invalid tag in snapshot directory, chunk %llx, entry %x: %02x", + __func__, + (unsigned long long)directory_chunk, + directory_offset, + sr->tag)); + return; + } + + if (snapid >= MAX_SNAPSHOTS) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: invalid snapid in snapshot directory, chunk %llx, entry %x: %08x", + __func__, + (unsigned long long)directory_chunk, + directory_offset, + snapid)); + return; + } + + se = dm_thinp_find_snapshot_entry(s, snapid); + if (se) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: duplicate snapid in snapshot directory, chunk %llx, entry %x: %08x", + __func__, + (unsigned long long)directory_chunk, + directory_offset, + snapid)); + return; + } + + se = kmalloc(sizeof(struct snapshot_entry), + dm_multisnap_initializing(s->dm) ? 
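+		     /*
+		      * GFP_KERNEL is safe only while the store is being
+		      * loaded; later on this may be reached from the I/O
+		      * path, where allocations must not recurse into the
+		      * block layer.
+		      */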
GFP_KERNEL : GFP_NOIO); + if (!se) { + DM_MULTISNAP_SET_ERROR(s->dm, -ENOMEM, + ("%s: can't allocate snapshot entry", __func__)); + return; + } + + se->root = read_48(sr, root); + se->depth = sr->depth; + se->tag = sr->tag; + se->snapid = snapid; + se->directory_chunk = directory_chunk; + se->directory_offset = directory_offset; + + dm_multisnap_status_lock(s->dm); + if (sr->tag == SNAPSHOT_TAG_ACTIVE) { + hlist_add_head(&se->e, &s->snapshots[snapid_hash(se->snapid)]); + if (snapid > s->max_used_snapid) + s->max_used_snapid = snapid; + } else { + hlist_add_head(&se->e, &s->deleting_snapshots); + } + dm_multisnap_status_unlock(s->dm); +} + +void dm_thinp_load_directory(struct dm_exception_store *s) +{ + chunk_t directory_chunk = s->directory_chunk; + struct dm_multisnap_stop_cycles cy; + dm_multisnap_init_stop_cycles(&cy); + + while (1) { + struct dm_buffer *bp; + struct snapshot_directory *dir; + unsigned i; + + if (dm_multisnap_stop_cycles(s->dm, &cy, directory_chunk)) + return; + + dir = dm_thinp_read_directory(s, directory_chunk, &bp); + if (!dir) + return; + + for (i = 0; i < s->snapshot_directory_entries; i++) { + struct snapshot_directory_entry *se = &dir->entry[i]; + struct snapshot_root *sr = + &se->root[cc_valid(s, se->txc, se->cc)]; + if (sr->tag == SNAPSHOT_TAG_FREE) + continue; + create_snapshot_entry(s, directory_chunk, i, sr); + if (dm_multisnap_has_error(s->dm)) { + dm_bufio_release(bp); + return; + } + } + + if (!cc_valid(s, dir->next_txc, dir->next_cc)) { + dm_bufio_release(bp); + break; + } + directory_chunk = read_48(dir, next); + + dm_bufio_release(bp); + } +} + +static struct snapshot_root *start_modifty_directory_entry( + struct dm_exception_store *s, + struct snapshot_directory_entry *se) +{ + unsigned idx = cc_valid(s, se->txc, se->cc); + if (unlikely(!cc_current(s, se->txc, se->cc))) { + memcpy(&se->root[idx ^ 1], &se->root[idx], + sizeof(struct snapshot_root)); + cc_set_current(s, &se->txc, &se->cc); + idx ^= 1; + } + return &se->root[idx]; +} + +void dm_thinp_update_directory_entry(struct dm_exception_store *s, + struct snapshot_entry *se) +{ + struct dm_buffer *bp; + struct snapshot_directory *dir; + struct snapshot_root *sr; + + dir = dm_thinp_read_directory(s, se->directory_chunk, &bp); + if (!dir) + return; + + sr = start_modifty_directory_entry(s, &dir->entry[se->directory_offset]); + write_48(sr, root, se->root); + sr->depth = se->depth; + sr->tag = se->tag; + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +/* + * Allocate ID for new snapshot (possibly snapshot of another snapshot). + */ +int dm_thinp_allocate_snapid(struct dm_exception_store *s, snapid_t *snapid, + int snap_of_snap, snapid_t master) +{ + snapid_t sn; + + if (snap_of_snap) { + if (master >= MAX_SNAPSHOTS || + !dm_thinp_find_snapshot_entry(s, master)) { + DMERR("%s: invalid master snapshot %llx", + __func__, (unsigned long long)master); + return -EINVAL; + } + } + + for (sn = s->min_free_snapid; sn < MAX_SNAPSHOTS; sn++) { + if (!dm_thinp_find_snapshot_entry(s, sn)) { + s->min_free_snapid = sn + 1; + *snapid = sn; + return 0; + } + } + return -ENOSPC; +} + +/* + * Create snapshot with a given ID. 
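+ *
+ * There are two cases, visible in the body below: a snapshot of the origin
+ * gets a freshly allocated radix tree root placed near
+ * dm_thinp_snapshot_pivot(snapid), while a snapshot of a snapshot shares
+ * the master's root and merely increments its reference count, so no
+ * per-chunk copying is done at creation time.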
+ */ +int dm_thinp_create_snapshot(struct dm_exception_store *s, snapid_t snapid, + int snap_of_snap, snapid_t master) +{ + chunk_t radix_tree_root; + unsigned radix_tree_depth; + + chunk_t directory_chunk, new_directory_chunk; + unsigned i; + + struct dm_buffer *bp; + struct snapshot_directory *dir; + struct snapshot_directory_entry *se; + struct snapshot_root *sr; + + struct dm_multisnap_stop_cycles cy; + + if (snapid >= MAX_SNAPSHOTS) { + DMERR("%s: invalid snapshot id %llx", + __func__, (unsigned long long)snapid); + return -EINVAL; + } + + if (dm_thinp_find_snapshot_entry(s, snapid)) { + DMERR("%s: snapshot %x already exists", + __func__, (unsigned)snapid); + return -EINVAL; + } + + if (!snap_of_snap) { + radix_tree_root = dm_thinp_alloc_radix_tree_node(s, + dm_thinp_snapshot_pivot(s, snapid), 0, 0); + if (!radix_tree_root) + return dm_multisnap_has_error(s->dm); + radix_tree_depth = 0; + } else { + struct snapshot_entry *m = + dm_thinp_find_snapshot_entry(s, master); + if (master >= MAX_SNAPSHOTS || !m) { + DMERR("%s: invalid master snapshot %llx", + __func__, (unsigned long long)master); + return -EINVAL; + } + dm_thinp_radix_tree_set_refcount(s, m->root, m->depth, 1); + if (dm_multisnap_has_error(s->dm)) + return dm_multisnap_has_error(s->dm); + radix_tree_root = m->root; + radix_tree_depth = m->depth; + } + + directory_chunk = s->directory_chunk; + dm_multisnap_init_stop_cycles(&cy); + + while (1) { + if (dm_multisnap_stop_cycles(s->dm, &cy, directory_chunk)) + return -EFSERROR; + + dir = dm_thinp_read_directory(s, directory_chunk, &bp); + if (!dir) + return dm_multisnap_has_error(s->dm); + + for (i = 0; i < s->snapshot_directory_entries; i++) { + se = &dir->entry[i]; + sr = &se->root[cc_valid(s, se->txc, se->cc)]; + if (sr->tag != SNAPSHOT_TAG_FREE) + continue; + sr = start_modifty_directory_entry(s, se); + dm_bufio_mark_buffer_dirty(bp); + if (se->root[0].tag == SNAPSHOT_TAG_FREE && + se->root[1].tag == SNAPSHOT_TAG_FREE) + goto found_it; + } + + if (!cc_valid(s, dir->next_txc, dir->next_cc)) { + dm_bufio_release(bp); + break; + } + directory_chunk = read_48(dir, next); + + dm_bufio_release(bp); + } + + new_directory_chunk = dm_thinp_spacemap_alloc_metadata(s, + directory_chunk + 1); + if (!new_directory_chunk) + return dm_multisnap_has_error(s->dm); + + dm_thinp_create_directory_at(s, new_directory_chunk); + if (dm_multisnap_has_error(s->dm)) + return dm_multisnap_has_error(s->dm); + + dir = dm_thinp_read_directory(s, directory_chunk, &bp); + if (!dir) + return dm_multisnap_has_error(s->dm); + + write_48(dir, next, new_directory_chunk); + cc_set_current_valid(s, &dir->next_txc, &dir->next_cc); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + directory_chunk = new_directory_chunk; + i = 0; + + dir = dm_thinp_read_directory(s, directory_chunk, &bp); + if (!dir) + return dm_multisnap_has_error(s->dm); + + se = &dir->entry[i]; + sr = start_modifty_directory_entry(s, se); + +found_it: + write_48(sr, root, radix_tree_root); + sr->depth = radix_tree_depth; + sr->tag = SNAPSHOT_TAG_ACTIVE; + sr->snapid = cpu_to_le32(snapid); + + create_snapshot_entry(s, directory_chunk, i, sr); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + return 0; +} + +int dm_thinp_delete_snapshot(struct dm_exception_store *s, snapid_t snapid) +{ + struct snapshot_entry *se; + + se = dm_thinp_find_snapshot_entry(s, snapid); + if (!se || snapid >= MAX_SNAPSHOTS) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: snapshot id %llx not found", + __func__, (unsigned long long)snapid)); 
+ return -EFSERROR; + } + + dm_multisnap_status_lock(s->dm); + + if (snapid < s->min_free_snapid) + s->min_free_snapid = snapid; + + se->tag = SNAPSHOT_TAG_DELETED; + dm_thinp_update_directory_entry(s, se); + + hlist_del(&se->e); + hlist_add_head(&se->e, &s->deleting_snapshots); + + dm_multisnap_status_unlock(s->dm); + + dm_multisnap_queue_work(s->dm, &s->delete_work); + + return 0; +} + +/* + * If "snapid" is valid snapshot ID, return snapid. + * Otherwise, return the next valid snapshot ID. + * If there is no next valid snapshot ID, return DM_SNAPID_T_ORIGIN. + */ +snapid_t dm_thinp_get_next_snapid(struct dm_exception_store *s, snapid_t snapid) +{ + snapid_t sn; + if (snapid >= MAX_SNAPSHOTS) + return DM_SNAPID_T_ORIGIN; + for (sn = snapid; sn <= s->max_used_snapid; sn++) { + if (dm_thinp_find_snapshot_entry(s, sn)) + return sn; + } + return DM_SNAPID_T_ORIGIN; +} + +void dm_thinp_free_snapshot(struct dm_exception_store *s, + struct snapshot_entry *se) +{ + dm_multisnap_status_lock(s->dm); + hlist_del(&se->e); + kfree(se); + dm_multisnap_status_unlock(s->dm); +} + +static void free_hlist(struct dm_exception_store *s, struct hlist_head *h) +{ + while (!hlist_empty(h)) { + struct snapshot_entry *se = hlist_entry(h->first, + struct snapshot_entry, e); + dm_thinp_free_snapshot(s, se); + } +} + +void dm_thinp_init_directory(struct dm_exception_store *s) +{ + unsigned i; + for (i = 0; i < SNAPSHOT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&s->snapshots[i]); + INIT_HLIST_HEAD(&s->deleting_snapshots); +} + +void dm_thinp_done_directory(struct dm_exception_store *s) +{ + unsigned i; + for (i = 0; i < SNAPSHOT_HASH_SIZE; i++) + free_hlist(s, &s->snapshots[i]); + free_hlist(s, &s->deleting_snapshots); +} + Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-spacemap.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-spacemap.c 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,627 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. 
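+ *
+ * Chunk addressing used throughout this file (see spacemap_location()
+ * below): the high bits of a chunk number select the spacemap chunk (each
+ * one covers 1 << chunks_per_bitmap_bits metadata chunks), the next bits
+ * select the "struct spacemap" within that chunk, and the low bits select
+ * the entry and the bit within the entry. The spacemap that would fall on
+ * chunk 0 is kept at chunk 1, because chunk 0 holds the superblock.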
+ */ + +#include "dm-thinp.h" + +static void adjust_allocated(struct dm_exception_store *s, int mode, int adjust) +{ + dm_multisnap_status_lock(s->dm); + s->allocated[mode] += adjust; + dm_multisnap_status_unlock(s->dm); +} + +static void spacemap_location(struct dm_exception_store *s, chunk_t chunk, + chunk_t *spacemap, chunk_t *spacemap_idx, + unsigned *subblk) +{ + *spacemap = chunk & ~(((chunk_t)1 << s->chunks_per_bitmap_bits) - 1); + if (!*spacemap) *spacemap = 1; + + *spacemap_idx = *spacemap >> s->chunks_per_bitmap_bits; + + chunk &= ((chunk_t)1 << s->chunks_per_bitmap_bits) - 1; + *subblk = chunk >> (SPACEMAP_ENTRIES_BITS + s->data_div_metadata_bits); +} + +static struct spacemap_store *start_modify_spacemap( + struct dm_exception_store *s, + struct spacemap *sm, + struct spacemap_store **other) +{ + unsigned idx = cc_valid(s, sm->txc, sm->cc); + if (unlikely(!cc_current(s, sm->txc, sm->cc))) { + memcpy(&sm->spacemap[idx ^ 1], &sm->spacemap[idx], + sizeof(struct spacemap_store)); + cc_set_current(s, &sm->txc, &sm->cc); + idx ^= 1; +#ifdef DM_THINP_CHECKING + if (unlikely(idx != cc_valid(s, sm->txc, sm->cc))) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: internal error", __func__)); + } +#endif + if (other) + *other = NULL; + } else { + if (other) + *other = &sm->spacemap[idx ^ 1]; + } + return &sm->spacemap[idx]; +} + +static void spacemap_ptr_free_metadata(struct dm_exception_store *s, + struct spacemap_store *store, + chunk_t chunk) +{ + unsigned bit, offset; + u8 tag; + u16 map; + unsigned idx; + + offset = (chunk >> s->data_div_metadata_bits) & (SPACEMAP_ENTRIES - 1); + bit = chunk & (s->data_div_metadata - 1); + + tag = store->tags[offset]; + map = le16_to_cpu(store->map[offset]); + if (unlikely(tag == SPACEMAP_TAG_METADATA_FULL)) { + if (unlikely(map != ((1 << s->data_div_metadata) - 1))) + goto bad_entry; + } else if (likely(tag == SPACEMAP_TAG_METADATA_PARTIAL)) { + if (unlikely(map & -(1 << s->data_div_metadata))) + goto bad_entry; + } else { +bad_entry: + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad entry when freeing %llx: tag %02x, map %04x", + __func__, (unsigned long long)chunk, tag, map)); + return; + } + + if (!(map & (1 << bit))) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: freeing metadata chunk %Lx: chunk already free", + __func__, (unsigned long long)chunk)); + return; + } + map &= ~(1 << bit); + store->map[offset] = cpu_to_le16(map); + + idx = chunk >> s->chunks_per_bitmap_bits; + if (s->spacemap_stats && s->spacemap_stats[idx].valid) { + if (store->tags[offset] == SPACEMAP_TAG_METADATA_FULL) + s->spacemap_stats[idx].free[MODE_METADATA]++; + if (!map) + s->spacemap_stats[idx].free[MODE_DATA]++; + } + if (!map) { + store->tags[offset] = SPACEMAP_TAG_FREE; + if (likely(chunk < s->size)) + adjust_allocated(s, MODE_METADATA, -1); + } else { + store->tags[offset] = SPACEMAP_TAG_METADATA_PARTIAL; + } +} + +static void dm_thinp_create_spacemap_at(struct dm_exception_store *s, + chunk_t location, + chunk_t free_from, chunk_t free_to) +{ + struct dm_buffer *bp; + struct spacemap *sm; + struct spacemap_store *store; + unsigned i, j; + chunk_t ch; + + sm = dm_bufio_new(s->bufio, location, &bp); + if (IS_ERR(sm)) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(sm), + ("%s: can't allocate spacemap at %llx", + __func__, (unsigned long long)location)); + return; + } + memset(sm, 0, s->metadata_chunk_size); + + for (i = 0; i < 1 << s->spacemaps_per_chunk_bits; i++) { + sm[i].signature = SPACEMAP_SIGNATURE; + cc_make_invalid(&sm[i].txc, &sm[i].cc); + store = 
&sm[i].spacemap[0]; + memset(&store->tags, SPACEMAP_TAG_METADATA_FULL, SPACEMAP_ENTRIES); + for (j = 0; j < SPACEMAP_ENTRIES; j++) + store->map[j] = cpu_to_le16((1 << s->data_div_metadata) - 1); + } + + for (ch = free_from; ch < free_to; ch++) { + unsigned map; + chunk_t location_cpy; + chunk_t idx; + spacemap_location(s, ch, &location_cpy, &idx, &map); + if (map >= 1 << s->spacemaps_per_chunk_bits) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: internal error, map %u, max maps %u", + __func__, + map, 1 << s->spacemaps_per_chunk_bits)); + goto ret; + } + if (location_cpy != location) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: internal error, location doesn't match: %llx != %llx", + __func__, + (unsigned long long)location_cpy, + (unsigned long long)location)); + goto ret; + } + store = &sm[map].spacemap[0]; + if (ch == location) + continue; + spacemap_ptr_free_metadata(s, store, ch); + } + +ret: + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +void dm_thinp_create_spacemap(struct dm_exception_store *s, + chunk_t free_from, chunk_t free_to) +{ + dm_thinp_create_spacemap_at(s, 1, free_from, free_to); +} + +void dm_thinp_spacemap_extend(struct dm_exception_store *s, chunk_t new_size) +{ + struct dm_buffer *bp; + struct spacemap *sm; + struct spacemap_store *st; + struct thinp_superblock *sb; + unsigned idx; + + if (s->spacemap_stats) { + vfree(s->spacemap_stats); + s->spacemap_stats = NULL; + } + + while (s->size < new_size) { + chunk_t spacemap; + chunk_t idx; + unsigned subblk; + + spacemap_location(s, s->size, &spacemap, &idx, &subblk); + if (spacemap < s->size) { + sm = dm_thinp_read_spacemap(s, spacemap, &bp); + if (!sm) + return; + sm += subblk; + st = start_modify_spacemap(s, sm, NULL); + do { + spacemap_ptr_free_metadata(s, st, s->size); + dm_multisnap_status_lock(s->dm); + s->size++; + dm_multisnap_status_unlock(s->dm); + } while (s->size < new_size && + s->size & ((SPACEMAP_ENTRIES << + s->data_div_metadata_bits) - 1)); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } else if (spacemap == s->size) { + chunk_t limit = spacemap + + ((chunk_t)1 << s->chunks_per_bitmap_bits); + if (limit > new_size) + limit = new_size; + dm_thinp_create_spacemap_at(s, spacemap, s->size, + limit); + dm_multisnap_status_lock(s->dm); + s->size = limit; + dm_multisnap_status_unlock(s->dm); + adjust_allocated(s, MODE_METADATA, 1); + } else { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: internal error, extending botched: %llx != %llx", + __func__, + (unsigned long long)spacemap, + (unsigned long long)s->size)); + break; + } + } + + sb = dm_thinp_read_superblock(s, &bp); + if (!sb) + return; + idx = cc_valid(s, sb->size_txc, sb->size_cc); + if (!cc_current(s, sb->size_txc, sb->size_cc)) { + cc_set_current(s, &sb->size_txc, &sb->size_cc); + idx ^= 1; + } + sb->size[idx] = cpu_to_le64(s->size); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + dm_thinp_commit(s); +} + +static noinline void alloc_stats(struct dm_exception_store *s) +{ + u64 vm_size = (u64)(s->size >> s->chunks_per_bitmap_bits) * + sizeof(struct spacemap_stats); + if (vm_size != (unsigned long)vm_size) + return; + s->spacemap_stats = __vmalloc(vm_size, GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); +} + +static noinline void make_stats(struct dm_exception_store *s, + struct spacemap *sm, + struct spacemap_stats *stats) +{ + unsigned i, j; + stats->valid = 1; + stats->free[MODE_METADATA] = 0; + stats->free[MODE_DATA] = 0; + for (i = 0; i < 1 << s->spacemaps_per_chunk_bits; i++, 
sm++) { + struct spacemap_store *st = &sm->spacemap[ + cc_valid(s, sm->txc, sm->cc)]; + for (j = 0; j < SPACEMAP_ENTRIES; j++) { + __u8 tag = st->tags[j]; + if (tag == SPACEMAP_TAG_FREE) { + stats->free[MODE_METADATA]++; + stats->free[MODE_DATA]++; + } else if (tag == SPACEMAP_TAG_METADATA_PARTIAL) { + stats->free[MODE_METADATA]++; + } + } + } +} + +static chunk_t try_one_spacemap_once(struct dm_exception_store *s, chunk_t goal, + int flags) +{ + chunk_t spacemap, idx; + unsigned subblk; + + struct spacemap *sm; + struct dm_buffer *bp; + + spacemap_location(s, goal, &spacemap, &idx, &subblk); + if (likely(s->spacemap_stats != NULL) && + likely(s->spacemap_stats[idx].valid) && + likely(!s->spacemap_stats[idx].free[flags & ALLOC_FLAGS_MODE])) + return 0; + + sm = dm_thinp_read_spacemap(s, spacemap, &bp); + if (unlikely(!sm)) + return 0; + if (likely(s->spacemap_stats != NULL)) { + if (unlikely(!s->spacemap_stats[idx].valid)) + make_stats(s, sm, &s->spacemap_stats[idx]); +#ifdef DM_THINP_CHECKING + else { + chunk_t st0 = s->spacemap_stats[idx].free[0]; + chunk_t st1 = s->spacemap_stats[idx].free[1]; + make_stats(s, sm, &s->spacemap_stats[idx]); + if (unlikely(st0 != s->spacemap_stats[idx].free[0]) || + unlikely(st1 != s->spacemap_stats[idx].free[1])) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: botched stats: %llx != %llx || %llx != %llx", + __func__, + (unsigned long long)st0, + (unsigned long long)s->spacemap_stats[idx].free[0], + (unsigned long long)st1, + (unsigned long long)s->spacemap_stats[idx].free[1])); + return 0; + } + } +#endif + } + + sm += subblk; + do { + int offset; + unsigned goal_offset; + u8 *result; + chunk_t result_chunk; + struct spacemap_store *st = &sm->spacemap[ + cc_valid(s, sm->txc, sm->cc)]; + + u8 tag = flags & (MODE_DATA | ALLOC_FLAGS_METADATA_FULL) ? + SPACEMAP_TAG_FREE : SPACEMAP_TAG_METADATA_PARTIAL; + + goal_offset = (goal >> s->data_div_metadata) & + (SPACEMAP_ENTRIES - 1); + if (st->tags[goal_offset] == tag) { + result = &st->tags[goal_offset]; + goto got_result; + } + + offset = -1; + +try_next_offset: + offset++; + result = memchr(st->tags + offset, tag, + SPACEMAP_ENTRIES - offset); + if (likely(result != NULL)) { + unsigned val; + struct spacemap_store *other_st; +got_result: + offset = result - st->tags; + + result_chunk = ((chunk_t)idx << s->chunks_per_bitmap_bits) | + subblk << (SPACEMAP_ENTRIES_BITS + s->data_div_metadata_bits) | + offset << s->data_div_metadata_bits; + + st = start_modify_spacemap(s, sm, &other_st); + + if (other_st) { + if (unlikely(other_st->tags[offset] < tag)) + goto try_next_offset; + if (other_st->tags[offset] == + SPACEMAP_TAG_METADATA_PARTIAL) { + if (unlikely(other_st->map[offset] & + ~st->map[offset])) + goto try_next_offset; + } + } + + /* !!! FIXME: check other map */ + + val = le16_to_cpu(st->map[offset]); + if (likely(tag == SPACEMAP_TAG_FREE)) { + if (unlikely(val != 0)) { +bad_entry: + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad entry, spacemap %llx, subblk %x, offset %x: tag %02x, map %04x", + __func__, + (unsigned long long)spacemap, + subblk, offset, tag, val)); + goto release_ret_0; + } + if (likely(flags & MODE_DATA)) { + st->tags[offset] = 0; + st->map[offset] = cpu_to_le16(1); + if (likely(s->spacemap_stats != NULL)) { + s->spacemap_stats[idx].free[MODE_METADATA]--; + s->spacemap_stats[idx].free[MODE_DATA]--; + } + } else { + st->tags[offset] = s->data_div_metadata_bits ? 
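+					/*
+					 * If metadata and data chunks have the
+					 * same size, a single metadata
+					 * allocation already fills the whole
+					 * entry.
+					 */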
+ SPACEMAP_TAG_METADATA_PARTIAL : + SPACEMAP_TAG_METADATA_FULL; + st->map[offset] = cpu_to_le16(0x0001); + if (likely(s->spacemap_stats != NULL)) { + s->spacemap_stats[idx].free[MODE_DATA]--; + if (!s->data_div_metadata_bits) + s->spacemap_stats[idx].free[MODE_METADATA]--; + } + } + if (likely(flags & MODE_DATA)) + adjust_allocated(s, MODE_DATA, 1); + else + adjust_allocated(s, MODE_METADATA, 1); + } else { + unsigned bit; + bit = ffs(~val) - 1; + if (unlikely(bit >= s->data_div_metadata)) + goto bad_entry; + val |= 1 << bit; + st->map[offset] = cpu_to_le16(val); + result_chunk |= bit; + if (unlikely(val == ((1 << s->data_div_metadata) - 1))) { + st->tags[offset] = SPACEMAP_TAG_METADATA_FULL; + if (likely(s->spacemap_stats != NULL)) + s->spacemap_stats[idx].free[MODE_METADATA]--; + } + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + return result_chunk; + } + + sm++; + subblk++; + goal = 0; + } while (subblk < 1 << s->spacemaps_per_chunk_bits); + +release_ret_0: + dm_bufio_release(bp); + + return 0; +} + +static chunk_t try_one_spacemap(struct dm_exception_store *s, chunk_t goal, + int flags) +{ + chunk_t result; + + result = try_one_spacemap_once(s, goal, flags); + if (likely(result != 0)) + return result; + + if (flags & MODE_DATA) + return 0; + + return try_one_spacemap_once(s, goal, flags | ALLOC_FLAGS_METADATA_FULL); +} + +static chunk_t spacemap_alloc(struct dm_exception_store *s, chunk_t goal, + int flags) +{ + chunk_t result; + chunk_t end; + + if (unlikely(!s->spacemap_stats)) + alloc_stats(s); + + if (unlikely(goal >= s->size)) + goal = 0; + + result = try_one_spacemap(s, goal, flags); + if (likely(result != 0)) + return result; + + end = goal & ~(((chunk_t)1 << s->chunks_per_bitmap_bits) - 1); + + goal = end; + do { + goal += (chunk_t)1 << s->chunks_per_bitmap_bits; + if (unlikely(goal >= s->size)) + goal = 0; + + result = try_one_spacemap(s, goal, flags); + if (likely(result != 0)) + return result; + + } while (goal != end); + + DM_MULTISNAP_SET_ERROR(s->dm, -ENOSPC, ("space overflow")); + + return 0; +} + +chunk_t dm_thinp_spacemap_alloc_metadata(struct dm_exception_store *s, + chunk_t goal) +{ + return spacemap_alloc(s, goal, MODE_METADATA); +} + +chunk_t dm_thinp_spacemap_alloc_data(struct dm_exception_store *s, + chunk_t goal) +{ + return spacemap_alloc(s, goal, MODE_DATA); +} + +void dm_thinp_init_spacemap(struct dm_exception_store *s) +{ + s->spacemap_stats = NULL; +} + +int dm_thinp_spacemap_get_set_refcount(struct dm_exception_store *s, + chunk_t chunk, int adjust) +{ + chunk_t spacemap_chunk, idx; + unsigned subblk; + unsigned offset; + u8 tag; + unsigned refcount; + struct spacemap *sm; + struct spacemap_store *st; + struct dm_buffer *bp; + + if (unlikely(chunk >= s->size)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: too big pointer %llx >= %llx", + __func__, + (unsigned long long)chunk, + (unsigned long long)s->size)); + return -1; + } + if (unlikely((chunk & (s->data_div_metadata - 1)) != 0)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: unaligned pointer %llx", + __func__, + (unsigned long long)chunk)); + return -1; + } + + spacemap_location(s, chunk, &spacemap_chunk, &idx, &subblk); + + sm = dm_thinp_read_spacemap(s, spacemap_chunk, &bp); + if (unlikely(!sm)) + return -1; + + if (likely(!adjust)) + st = &sm[subblk].spacemap[ + cc_valid(s, sm[subblk].txc, sm[subblk].cc)]; + else + st = start_modify_spacemap(s, &sm[subblk], NULL); + + offset = (chunk >> s->data_div_metadata_bits) & (SPACEMAP_ENTRIES - 1); + + tag = st->tags[offset]; + 
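+	/*
+	 * For data chunks, the tag and map fields together hold a 32-bit
+	 * reference count: map[offset] carries the low 16 bits and the tag
+	 * byte supplies the high bits, so any tag above
+	 * SPACEMAP_TAG_REFCOUNT cannot be a valid refcount.
+	 */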
if (unlikely(tag > SPACEMAP_TAG_REFCOUNT)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: invalid tag on chunk %llx: %02x", + __func__, (unsigned long long)chunk, tag)); + dm_bufio_release(bp); + return -1; + } + refcount = le16_to_cpu(st->map[offset]) + (tag << 16); + if (unlikely(!refcount)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: zero refcount on chunk %llx", + __func__, (unsigned long long)chunk)); + if (adjust) + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + return -1; + } + + if (unlikely(adjust)) { + refcount += adjust; + if (unlikely(refcount > MAX_SNAPSHOTS)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: refcount overflow on chunk %llx: %x + %x", + __func__, (unsigned long long)chunk, + refcount - adjust, adjust)); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + return -1; + } + st->map[offset] = cpu_to_le16(refcount); + if (refcount) + st->tags[offset] = refcount >> 16; + else { + st->tags[offset] = SPACEMAP_TAG_FREE; + adjust_allocated(s, MODE_DATA, -1); + if (s->spacemap_stats && s->spacemap_stats[idx].valid) { + s->spacemap_stats[idx].free[MODE_METADATA]++; + s->spacemap_stats[idx].free[MODE_DATA]++; + } + } + dm_bufio_mark_buffer_dirty(bp); + } + + dm_bufio_release(bp); + + return refcount; +} + +void dm_thinp_spacemap_free_metadata(struct dm_exception_store *s, + chunk_t chunk) +{ + chunk_t spacemap_chunk, idx; + unsigned subblk; + struct spacemap *sm; + struct spacemap_store *st; + struct dm_buffer *bp; + + if (unlikely(chunk >= s->size)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: too big pointer %llx >= %llx", + __func__, + (unsigned long long)chunk, + (unsigned long long)s->size)); + return ; + } + + spacemap_location(s, chunk, &spacemap_chunk, &idx, &subblk); + + sm = dm_thinp_read_spacemap(s, spacemap_chunk, &bp); + if (unlikely(!sm)) + return; + + st = start_modify_spacemap(s, &sm[subblk], NULL); + + spacemap_ptr_free_metadata(s, st, chunk); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +void dm_thinp_done_spacemap(struct dm_exception_store *s) +{ + if (s->spacemap_stats) + vfree(s->spacemap_stats); +} Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-cct.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-cct.c 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-thinp.h" + +/* + * Create initial crash count table. + */ +void dm_thinp_create_cct(struct dm_exception_store *s) +{ + chunk_t chunk; + for (chunk = s->cct_chunk; chunk < s->cct_chunk + s->cct_n_chunks; + chunk++) { + struct dm_buffer *bp; + void *ptr; + ptr = dm_bufio_new(s->bufio, chunk, &bp); + if (IS_ERR(ptr)) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(ptr), + ("%s: can't allocate cct at %llx", + __func__, (unsigned long long)chunk)); + return; + } + memset(ptr, 0, s->metadata_chunk_size); + if (chunk == s->cct_chunk) { + struct cct_aux *cct_aux = ptr; + cct_aux->signature = CCT_SIGNATURE; + write_48(cct_aux, metadata_allocated, + s->allocated[MODE_METADATA]); + write_48(cct_aux, data_allocated, + s->allocated[MODE_DATA]); + } + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + } + s->cc = CCT_AUX_ENTRIES; +} + +/* + * Load crash count table into memory. 
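+ *
+ * The table is an array of CCT_ENTRIES little-endian 32-bit transaction
+ * counts.  In every group of CCT_AUX_STRIDE entries, the first
+ * CCT_AUX_ENTRIES slots are reserved for a struct cct_aux carrying a
+ * signature and the 48-bit allocation counters; increase_cc() below skips
+ * these slots, so no crash count ever indexes them.  After the table is
+ * loaded, the current transaction count is picked up from s->cct[s->cc].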
+ */
+void dm_thinp_load_cct(struct dm_exception_store *s)
+{
+	unsigned i;
+
+	for (i = 0; i < s->cct_n_chunks; i++) {
+		struct dm_buffer *bp;
+		void *p = dm_bufio_read(s->bufio, s->cct_chunk + i, &bp);
+		if (IS_ERR(p)) {
+			DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(p),
+				("%s: can't read cct at %llx",
+				__func__,
+				(unsigned long long)(s->cct_chunk + i)));
+			return;
+		}
+		memcpy((char *)s->cct + (i << s->metadata_chunk_shift),
+		       p,
+		       min(s->metadata_chunk_size, (unsigned)sizeof(s->cct)));
+		dm_bufio_release(bp);
+	}
+
+	for (i = 0; i < CCT_ENTRIES; i += CCT_AUX_STRIDE) {
+		struct cct_aux *cct_aux;
+		cct_aux = (struct cct_aux *)&s->cct[i];
+		if (cct_aux->signature == CCT_SIGNATURE) {
+			s->allocated[MODE_METADATA] =
+				read_48(cct_aux, metadata_allocated);
+			s->allocated[MODE_DATA] =
+				read_48(cct_aux, data_allocated);
+		} else {
+			if (!i) {
+				DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+					("%s: no crash count table signature",
+					__func__));
+				return;
+			}
+		}
+	}
+
+	for (i = 0; i < CCT_ENTRIES; i++)
+		s->cct[i] = le32_to_cpu(s->cct[i]);
+
+	s->txc = s->cct[s->cc];
+}
+
+/*
+ * Increase the crash count.
+ * Skip over values that index the auxiliary records embedded in the crash count table.
+ */
+static u16 increase_cc(struct dm_exception_store *s, u16 cc)
+{
+	do {
+		cc++;
+	} while (unlikely((cc & (CCT_AUX_STRIDE - 1)) < CCT_AUX_ENTRIES));
+	return cc;
+}
+
+/*
+ * Write a crash count to disk.
+ *
+ * If "increase" is nonzero, write the increased value.
+ */
+static void dm_thinp_write_cc(struct dm_exception_store *s, int increase)
+{
+	struct thinp_superblock *sb;
+	struct dm_buffer *bp;
+	u16 cc;
+
+	sb = dm_thinp_read_superblock(s, &bp);
+	if (unlikely(!sb))
+		return;
+
+	cc = s->cc;
+	if (increase)
+		cc = increase_cc(s, cc);
+	sb->cc = cpu_to_le16(cc);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	dm_thinp_write_dirty_buffers(s);
+}
+
+/*
+ * Start a new transaction.
+ * If the transaction count overflows, increase the crash count.
+ */
+static void dm_thinp_new_transaction(struct dm_exception_store *s)
+{
+	s->txc++;
+	if (unlikely(s->txc == 0x80000000)) {
+		s->cc = increase_cc(s, s->cc);
+		dm_thinp_write_cc(s, 1);
+		s->txc = 0;
+	}
+	s->cct[s->cc] = s->txc;
+}
+
+/*
+ * Do a "mount" operation on the crash count table.
+ */
+void dm_thinp_mount(struct dm_exception_store *s)
+{
+	dm_thinp_write_cc(s, 1);
+	dm_thinp_new_transaction(s);
+}
+
+/*
+ * Do an "unmount" operation on the crash count table.
+ */
+void dm_thinp_unmount(struct dm_exception_store *s)
+{
+	dm_thinp_write_cc(s, 0);
+}
+
+/*
+ * Write the crash count table value at a given index.
+ *
+ * Also, write auxiliary data.
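+ *
+ * Entries are 4 bytes wide, so, for example, with a 512-byte metadata
+ * chunk, entry 200 lands in relative chunk (200 * 4) >> 9 = 1, at entry
+ * offset 200 & 127 = 72 within that chunk, and the auxiliary record is
+ * rewritten at the start of the CCT_AUX_STRIDE-aligned group containing
+ * that entry.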
+ */
+static void write_cct(struct dm_exception_store *s, u16 write_cc)
+{
+	struct cct_aux *cct_aux;
+	unsigned rel_chunk, rel_offset, aux_offset;
+	struct dm_buffer *bp;
+	__le32 *cct;
+
+	rel_chunk = (write_cc * 4) >> s->metadata_chunk_shift;
+	rel_offset = (write_cc & (s->metadata_chunk_size / 4 - 1));
+	aux_offset = rel_offset & ~(CCT_AUX_STRIDE - 1);
+
+	cct = dm_bufio_read(s->bufio, s->cct_chunk + rel_chunk, &bp);
+	if (unlikely(IS_ERR(cct))) {
+		DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(cct),
+			("%s: can't read cct at %llx",
+			__func__,
+			(unsigned long long)(s->cct_chunk + rel_chunk)));
+		return;
+	}
+	cct[rel_offset] = cpu_to_le32(s->cct[write_cc]);
+
+	cct_aux = (struct cct_aux *)&cct[aux_offset];
+	cct_aux->signature = CCT_SIGNATURE;
+	write_48(cct_aux, metadata_allocated, s->allocated[MODE_METADATA]);
+	write_48(cct_aux, data_allocated, s->allocated[MODE_DATA]);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	dm_thinp_write_dirty_buffers(s);
+}
+
+/*
+ * Write the "error" field in the superblock.
+ */
+static noinline void update_error_in_superblock(struct dm_exception_store *s)
+{
+	struct thinp_superblock *sb;
+	struct dm_buffer *bp;
+
+	sb = dm_thinp_read_superblock(s, &bp);
+	if (unlikely(!sb))
+		return;
+
+	if (sb->error == cpu_to_le32(dm_multisnap_has_error(s->dm))) {
+		dm_bufio_release(bp);
+		return;
+	}
+	sb->error = cpu_to_le32(dm_multisnap_has_error(s->dm));
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	dm_thinp_write_dirty_buffers(s);
+}
+
+/*
+ * Permanently commit the changes.
+ *
+ * The changes must not be committed before this function is called.
+ * They must be committed when this function exits.
+ *
+ * Error is stored in the error flag, there is no return code.
+ */
+void dm_thinp_commit(struct dm_exception_store *s)
+{
+	dm_thinp_write_dirty_buffers(s);
+	if (unlikely(dm_multisnap_has_error(s->dm))) {
+		if (!dm_multisnap_drop_on_error(s->dm) ||
+		    dm_multisnap_has_error(s->dm) == -ENOSPC)
+			return;
+		update_error_in_superblock(s);
+		return;
+	}
+
+	/*
+	printk("!!! COMMIT: cc %x, txc %x\n", s->cc, s->txc);
+	printk("allocated: %lld / %lld / %lld, percent %d\n",
+		s->size >> s->data_div_metadata_bits,
+		s->allocated[MODE_DATA],
+		s->allocated[MODE_METADATA],
+		(unsigned)((s->allocated[MODE_DATA] + s->allocated[MODE_METADATA]) * 100 / (s->size >> s->data_div_metadata_bits)));
+	*/
+
+	write_cct(s, s->cc);
+
+	dm_thinp_new_transaction(s);
+}
Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-radix.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-radix.c	2013-04-22 17:04:52.000000000 +0200
@@ -0,0 +1,432 @@
+/*
+ * Copyright (C) 2011 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-thinp.h"
+
+chunk_t dm_thinp_alloc_radix_tree_node(struct dm_exception_store *s,
+				       chunk_t goal, u8 depth,
+				       chunk_t init_pointer)
+{
+	chunk_t chunk;
+	struct dm_buffer *bp;
+	struct thin_radix_tree_node *rn;
+	struct radix_tree_pointer *rp;
+	unsigned i;
+
+	chunk = dm_thinp_spacemap_alloc_metadata(s, goal);
+	if (unlikely(!chunk))
+		return 0;
+
+	rn = dm_bufio_new(s->bufio, chunk, &bp);
+	if (IS_ERR(rn)) {
+		DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(rn),
+			("%s: can't allocate radix tree node at %llx",
+			__func__, (unsigned long long)chunk));
+		return 0;
+	}
+
+	memset(rn, 0, s->metadata_chunk_size);
+
+	rn->refcount[0] = cpu_to_le32(1);
+	cc_make_invalid(&rn->refcount_txc, &rn->refcount_cc);
+	write_48(rn, self, chunk);
+	rn->depth = depth;
+
+	for (i = 0; i < 1 << s->radix_tree_nodes_per_chunk_bits; i++) {
+		rn[i].signature = RADIX_TREE_NODE_SIGNATURE;
+		cc_make_invalid(&rn[i].pointers_txc, &rn[i].pointers_cc);
+		if (!i) {
+			rp = &rn[i].pointers[0][0];
+			write_radix(rp, init_pointer);
+		}
+	}
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	return chunk;
+}
+
+void dm_thinp_radix_tree_node_ptr_set_refcount(struct dm_exception_store *s,
+					       struct thin_radix_tree_node *rn,
+					       int adjust)
+{
+	unsigned refcount;
+	unsigned idx = cc_valid(s, rn->refcount_txc, rn->refcount_cc);
+	if (!cc_current(s, rn->refcount_txc, rn->refcount_cc)) {
+		rn->refcount[idx ^ 1] = rn->refcount[idx];
+		cc_set_current(s, &rn->refcount_txc, &rn->refcount_cc);
+		idx ^= 1;
+	}
+	refcount = le32_to_cpu(rn->refcount[idx]);
+	if (unlikely(!refcount)) {
+		DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+			("%s: zero refcount on radix tree node %llx (add %x)",
+			__func__,
+			(unsigned long long)read_48(rn, self),
+			adjust));
+		return;
+	}
+	refcount += adjust;
+	if (unlikely(refcount > MAX_SNAPSHOTS)) {
+		DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR,
+			("%s: refcount overflow on radix tree node %llx: %x + %x",
+			__func__,
+			(unsigned long long)read_48(rn, self),
+			refcount - adjust, adjust));
+		return;
+	}
+	rn->refcount[idx] = cpu_to_le32(refcount);
+}
+
+void dm_thinp_radix_tree_set_refcount(struct dm_exception_store *s,
+				      chunk_t chunk, u8 depth, int adjust)
+{
+	struct thin_radix_tree_node *rn;
+	struct dm_buffer *bp;
+
+	rn = dm_thinp_read_radix_tree_node(s, chunk, depth, &bp);
+	if (unlikely(!rn))
+		return;
+	dm_thinp_radix_tree_node_ptr_set_refcount(s, rn, adjust);
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+}
+
+static chunk_t dm_thinp_clone_radix_tree_node(struct dm_exception_store *s,
+					      chunk_t goal,
+					      struct thin_radix_tree_node *orig)
+{
+	chunk_t chunk;
+	struct dm_buffer *bp;
+	struct thin_radix_tree_node *rn;
+	struct thin_radix_tree_node *rn2;
+	unsigned i, j;
+
+	chunk = dm_thinp_spacemap_alloc_metadata(s, goal);
+	if (unlikely(!chunk))
+		return 0;
+
+	rn = dm_bufio_new(s->bufio, chunk, &bp);
+	if (IS_ERR(rn)) {
+		DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(rn),
+			("%s: can't allocate radix tree node at %llx",
+			__func__, (unsigned long long)chunk));
+		return 0;
+	}
+
+	memcpy(rn, orig, s->metadata_chunk_size);
+
+	rn->refcount[0] = cpu_to_le32(1);
+	cc_make_invalid(&rn->refcount_txc, &rn->refcount_cc);
+	write_48(rn, self, chunk);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	/* bump up reference counts */
+	rn2 = orig;
+	for (i = 0; i < 1 << s->radix_tree_nodes_per_chunk_bits; i++, rn2++) {
+		struct radix_tree_pointer *rp =
+			rn2->pointers[cc_valid(s,
+				rn2->pointers_txc, rn2->pointers_cc)];
+		for (j = 0; j < RADIX_TREE_ENTRIES; j++) {
+			chunk_t 
pointer = read_radix(&rp[j]); + if (!pointer) + continue; + if (likely(!orig->depth)) { + if (unlikely(dm_thinp_spacemap_get_set_refcount( + s, pointer, 1) == -1)) + return 0; + } else { + dm_thinp_radix_tree_set_refcount(s, pointer, + orig->depth - 1, 1); + if (unlikely(dm_multisnap_has_error(s->dm))) + return 0; + } + } + } + + return chunk; +} + +static struct radix_tree_pointer *start_modify_radix_tree_node( + struct dm_exception_store *s, + struct thin_radix_tree_node *rn) +{ + unsigned idx; + + idx = cc_valid(s, rn->pointers_txc, rn->pointers_cc); + if (!cc_current(s, rn->pointers_txc, rn->pointers_cc)) { + memcpy(&rn->pointers[idx ^ 1], &rn->pointers[idx], + RADIX_TREE_ENTRIES * sizeof(struct radix_tree_pointer)); + cc_set_current(s, &rn->pointers_txc, &rn->pointers_cc); + idx ^= 1; +#ifdef DM_THINP_CHECKING + if (unlikely(idx != + cc_valid(s, rn->pointers_txc, rn->pointers_cc))) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: internal error", __func__)); + } +#endif + } + return rn->pointers[idx]; +} + +void dm_thinp_radix_tree_node_set_pointer(struct dm_exception_store *s, + struct snapshot_entry *snapshot, + chunk_t node, unsigned offset, + u8 depth, chunk_t new_node) +{ + struct thin_radix_tree_node *rn; + struct dm_buffer *bp; + struct radix_tree_pointer *ptr; + + if (unlikely(!node)) { + snapshot->root = new_node; + if (!new_node) { + snapshot->tag = SNAPSHOT_TAG_FREE; + snapshot->depth = 0; + } + dm_thinp_update_directory_entry(s, snapshot); + return; + } + + rn = dm_thinp_read_radix_tree_node(s, node, depth, &bp); + if (unlikely(!rn)) + return; + + rn += offset >> RADIX_TREE_ENTRIES_BITS; + + ptr = start_modify_radix_tree_node(s, rn); + + write_radix(&ptr[offset & (RADIX_TREE_ENTRIES - 1)], new_node); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); +} + +/* + * Look up a translation for a given chunk. + * "write" is the hint that tells that the lookup is for writing. 
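+ *
+ * Each tree level consumes radix_tree_nodes_per_chunk_bits +
+ * RADIX_TREE_ENTRIES_BITS bits of the chunk number; the upper bits of
+ * each extracted offset select one of the nodes packed into a metadata
+ * chunk, the lower bits select an entry within that node.  A chunk
+ * number that does not fit into (depth + 1) such levels is unmapped.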
+ * + * returns: + * -1 --- error + * 0 --- not found + * 1 --- read-only mapping (shared with other snapshots) exists + * 2 --- read-write mapping exists + */ +int dm_thinp_find_snapshot_chunk(struct dm_exception_store *s, snapid_t snapid, + chunk_t chunk, int write, chunk_t *result) +{ + struct snapshot_entry *snapshot; + int bits, depth; + chunk_t node; + + snapshot = dm_thinp_find_snapshot_entry_must_succeed(s, snapid); + if (unlikely(!snapshot)) + return -1; + + s->query_snapshot = snapshot; + s->query_chunk = chunk; + + bits = (snapshot->depth + 1) * + (s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS); + + if (unlikely((chunk >> bits) != 0)) + return 0; + + node = snapshot->root; + depth = snapshot->depth; + + for ( + bits -= s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS; + bits >= 0; + bits -= s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS, + depth--) { + unsigned offset = (chunk >> bits) & s->radix_tree_node_mask; + unsigned version; + struct thin_radix_tree_node *rn; + struct dm_buffer *bp; + + rn = dm_thinp_read_radix_tree_node(s, node, depth, &bp); + if (unlikely(!rn)) + return -1; + if (write) { + if (unlikely(le32_to_cpu(rn->refcount[cc_valid(s, + rn->refcount_txc, rn->refcount_cc)]) != 1)) + write = 0; + } + rn += offset >> RADIX_TREE_ENTRIES_BITS; + version = cc_valid(s, rn->pointers_txc, rn->pointers_cc); + node = read_radix(&rn->pointers + [version][offset & (RADIX_TREE_ENTRIES - 1)]); + dm_bufio_release(bp); + if (unlikely(!node)) + return 0; + } + + if (unlikely((node & (s->data_div_metadata - 1)) != 0)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: unaligned pointer on snapshot %x, chunk %llx: %llx", + __func__, snapshot->snapid, + (unsigned long long)chunk, + (unsigned long long)node)); + return -1; + } + + if (write) { + int refcount = dm_thinp_spacemap_get_set_refcount(s, node, 0); + if (unlikely(refcount == -1)) + return -1; + if (unlikely(refcount != 1)) + write = 0; + } + + *result = node >> s->data_div_metadata_bits; + + return write + 1; +} + +/* + * This is used for both add_next_remap and make_chunk_writeable callbacks. + * + * This makes a writeable chunk, either allocating it or doing copy-on-write. + * + * Store the new location in "new_chunk". 
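+ *
+ * The walk first grows the tree (allocating a new root that points at
+ * the old one) until the chunk number fits within the current depth,
+ * then descends.  A node shared with another snapshot (refcount != 1)
+ * is cloned and the parent pointer redirected to the private copy; a
+ * shared or missing data chunk is replaced with a freshly allocated one
+ * near its parent node.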
+ */ +void dm_thinp_make_chunk_writeable(struct dm_exception_store *s, + union chunk_descriptor *cd, + chunk_t *new_chunk) +{ + struct snapshot_entry *snapshot = s->query_snapshot; + chunk_t chunk = s->query_chunk; + int bits, depth; + chunk_t node; + chunk_t sub_node; + chunk_t up_node; + unsigned up_offset; + chunk_t new_node; + +test_depth_again: + bits = (snapshot->depth + 1) * + (s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS); + + if (unlikely((chunk >> bits) != 0)) { + chunk_t new_root; + new_root = dm_thinp_alloc_radix_tree_node(s, snapshot->root, + snapshot->depth + 1, snapshot->root); + if (unlikely(!new_root)) + return; + snapshot->root = new_root; + snapshot->depth++; + dm_thinp_update_directory_entry(s, snapshot); + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + goto test_depth_again; + } + + up_node = 0; + up_offset = 0; + node = snapshot->root; + depth = snapshot->depth; + + for ( + bits -= s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS; + bits >= 0; + bits -= s->radix_tree_nodes_per_chunk_bits + RADIX_TREE_ENTRIES_BITS, + depth--) { + unsigned offset = (chunk >> bits) & s->radix_tree_node_mask; + unsigned version; + struct thin_radix_tree_node *rn, *rn2; + struct dm_buffer *bp; + + rn = dm_thinp_read_radix_tree_node(s, node, depth, &bp); + if (unlikely(!rn)) + return; + + rn2 = rn + (offset >> RADIX_TREE_ENTRIES_BITS); + version = cc_valid(s, rn2->pointers_txc, rn2->pointers_cc); + sub_node = read_radix(&rn2->pointers + [version][offset & (RADIX_TREE_ENTRIES - 1)]); + + if (unlikely(le32_to_cpu(rn->refcount[cc_valid(s, + rn->refcount_txc, rn->refcount_cc)]) != 1)) { + + memcpy(s->node_to_clone, rn, s->metadata_chunk_size); + dm_thinp_radix_tree_node_ptr_set_refcount(s, rn, -1); + + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + + new_node = dm_thinp_clone_radix_tree_node(s, + up_node ? up_node : + dm_thinp_snapshot_pivot(s, snapshot->snapid), + s->node_to_clone); + if (unlikely(!new_node)) + return; + + dm_thinp_radix_tree_node_set_pointer(s, snapshot, + up_node, up_offset, depth + 1, new_node); + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + } else { + dm_bufio_release(bp); + } + + up_node = node; + up_offset = offset; + + node = sub_node; + if (node) { + if (!depth) { + int refcount = + dm_thinp_spacemap_get_set_refcount(s, + node, 0); + if (unlikely(refcount == -1)) + return; + if (likely(refcount != 1)) { + dm_thinp_spacemap_get_set_refcount(s, + node, -1); + goto alloc_new_data_node; + } + } + } else { + if (likely(!depth)) { +alloc_new_data_node: + new_node = dm_thinp_spacemap_alloc_data(s, + up_node); + if (unlikely(!new_node)) + return; + } else { + new_node = dm_thinp_alloc_radix_tree_node(s, + up_node, depth - 1, 0); + if (unlikely(!new_node)) + return; + } + dm_thinp_radix_tree_node_set_pointer(s, snapshot, + up_node, up_offset, depth, new_node); + node = new_node; + } + } + + *new_chunk = node >> s->data_div_metadata_bits; +} + +/* + * Test if "snapid" belongs to the set of snapids described by "cd". 
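+ *
+ * Chunks in this store can be shared by an arbitrary set of snapshots,
+ * so the test conservatively reports a conflict for every snapid.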
+ */ +int dm_thinp_check_conflict(struct dm_exception_store *s, + union chunk_descriptor *cd, snapid_t snapid) +{ + return 1; +} + + Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-struct.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-struct.c 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-thinp.h" + +struct thinp_superblock *dm_thinp_read_superblock(struct dm_exception_store *s, + struct dm_buffer **bp) +{ + struct thinp_superblock *sb = dm_bufio_read(s->bufio, 0, bp); + if (unlikely(IS_ERR(sb))) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(sb), + ("%s: can't read superblock: %d", + __func__, (int)PTR_ERR(sb))); + return NULL; + } + if (unlikely(sb->signature != SUPERBLOCK_SIGNATURE)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad signature on superblock: %08x", + __func__, be32_to_cpu(sb->signature))); + dm_bufio_release(*bp); + return NULL; + } + return sb; +} + +struct spacemap *dm_thinp_read_spacemap(struct dm_exception_store *s, + chunk_t ptr, struct dm_buffer **bp) +{ + struct spacemap *sm = dm_bufio_read(s->bufio, ptr, bp); + unsigned i; + if (unlikely(IS_ERR(sm))) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(sm), + ("%s: can't read spacemap %llx: %d", + __func__, (unsigned long long)ptr, (int)PTR_ERR(sm))); + return NULL; + } + for (i = 0; i < +#ifdef DM_THINP_CHECKING + 1 << s->spacemaps_per_chunk_bits +#else + 1 +#endif + ; i++) { + if (unlikely(sm[i].signature != SPACEMAP_SIGNATURE)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad signature on bitmap %llx/%x: %08x", + __func__, (unsigned long long)ptr, i, + be32_to_cpu(sm[i].signature))); + dm_bufio_release(*bp); + return NULL; + } + } + return sm; +} + +struct snapshot_directory *dm_thinp_read_directory(struct dm_exception_store *s, + chunk_t ptr, + struct dm_buffer **bp) +{ + struct snapshot_directory *dir = dm_bufio_read(s->bufio, ptr, bp); + if (unlikely(IS_ERR(dir))) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(dir), + ("%s: can't read snapshot directory %llx: %d", + __func__, (unsigned long long)ptr, (int)PTR_ERR(dir))); + return NULL; + } + if (unlikely(dir->signature != SNAPSHOT_DIRECTORY_SIGNATURE)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad signature on snapshot directory %llx: %08x", + __func__, (unsigned long long)ptr, + be32_to_cpu(dir->signature))); + dm_bufio_release(*bp); + return NULL; + } + return dir; +} + +struct thin_radix_tree_node *dm_thinp_read_radix_tree_node( + struct dm_exception_store *s, + chunk_t ptr, u8 depth, struct dm_buffer **bp) +{ + struct thin_radix_tree_node *rn = dm_bufio_read(s->bufio, ptr, bp); + unsigned i; + if (unlikely(IS_ERR(rn))) { + DM_MULTISNAP_SET_ERROR(s->dm, PTR_ERR(rn), + ("%s: can't read radix tree node %llx: %d", + __func__, (unsigned long long)ptr, (int)PTR_ERR(rn))); + return NULL; + } + for (i = 0; i < +#ifdef DM_THINP_CHECKING + 1 << s->radix_tree_nodes_per_chunk_bits +#else + 1 +#endif + ; i++) { + if (unlikely(rn[i].signature != RADIX_TREE_NODE_SIGNATURE)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad signature on radix tree node %llx/%x: %08x", + __func__, (unsigned long long)ptr, i, + be32_to_cpu(rn[i].signature))); + goto error_ret; + } + } + if (unlikely(read_48(rn, self) != ptr)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad self pointer on 
radix tree node %llx: %llx", + __func__, (unsigned long long)ptr, + (unsigned long long)read_48(rn, self))); + goto error_ret; + } + if (unlikely(rn->depth != depth)) { + DM_MULTISNAP_SET_ERROR(s->dm, -EFSERROR, + ("%s: bad depth on radix tree node %llx: %x != %x", + __func__, (unsigned long long)ptr, + rn->depth, depth)); + goto error_ret; + } + return rn; + +error_ret: + dm_bufio_release(*bp); + return NULL; +} + +int dm_thinp_write_dirty_buffers(struct dm_exception_store *s) +{ + int r; + r = dm_bufio_write_dirty_buffers(s->bufio); + if (r) { + DM_MULTISNAP_SET_ERROR(s->dm, r, + ("%s: error flushing cache", __func__)); + } + return r; +} + Index: linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-delete.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-3.9-rc8-fast/drivers/md/multisnap/dm-thinp-delete.c 2013-04-22 17:04:52.000000000 +0200 @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2011 Red Hat Czech, s.r.o. + * + * Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include "dm-thinp.h" + +#define MAX_WORK_IN_ONE_STEP 64 +#define COMMIT_AFTER 64 + +/* + * Perform background delete of a specified snapshot. + * + * This function does only a partial delete, because we must not lock the store + * for long periods of time. It is called over and over again, until the + * snapshot is finally deleted. + */ +static void delete_step(struct dm_exception_store *s, struct snapshot_entry *se) +{ + chunk_t up_node; + unsigned up_offset; + + chunk_t node; + int depth; + struct thin_radix_tree_node *rn; + struct dm_buffer *bp; + unsigned refcount; + + unsigned i, j; + + unsigned work_in_one_step = 0; + + up_node = 0; + up_offset = 0; + node = se->root; + depth = se->depth; + +go_down: + rn = dm_thinp_read_radix_tree_node(s, node, depth, &bp); + if (unlikely(!rn)) + return; + + refcount = le32_to_cpu(rn->refcount[ + cc_valid(s, rn->refcount_txc, rn->refcount_cc)]); + + if (unlikely(refcount != 1)) { + dm_thinp_radix_tree_node_ptr_set_refcount(s, rn, -1); + dm_bufio_mark_buffer_dirty(bp); + dm_bufio_release(bp); + goto go_up; + } + + memcpy(s->node_to_clone, rn, s->metadata_chunk_size); + dm_bufio_release(bp); + + for (i = 0, rn = s->node_to_clone; + i < 1 << s->radix_tree_nodes_per_chunk_bits; + i++, rn++) { + struct radix_tree_pointer *rp = + rn->pointers[ + cc_valid(s, rn->pointers_txc, rn->pointers_cc)]; + for (j = 0; j < RADIX_TREE_ENTRIES; j++, rp++) { + chunk_t chunk = read_radix(rp); + unsigned offset; + if (!chunk) + continue; + offset = (i << RADIX_TREE_ENTRIES_BITS) | j; + if (depth) { + up_node = node; + up_offset = offset; + node = chunk; + depth--; + goto go_down; + } + dm_thinp_spacemap_get_set_refcount(s, chunk, -1); + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + dm_thinp_radix_tree_node_set_pointer(s, se, node, + offset, depth, 0); + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + work_in_one_step++; + if (work_in_one_step > MAX_WORK_IN_ONE_STEP) + return; + } + } + + dm_thinp_spacemap_free_metadata(s, node); + +go_up: + dm_thinp_radix_tree_node_set_pointer(s, se, up_node, up_offset, + depth + 1, 0); + if (se->tag == SNAPSHOT_TAG_FREE) + dm_thinp_free_snapshot(s, se); +} + +/* + * The delete callback. 
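+ *
+ * Takes the first snapshot from the deleting list, runs one bounded
+ * delete_step, commits after every COMMIT_AFTER invocations (or once
+ * the list is empty) and requeues itself.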
+ */ +void dm_thinp_background_delete(struct dm_exception_store *s, + struct dm_multisnap_background_work *bw) +{ + struct snapshot_entry *se; + + if (unlikely(dm_multisnap_has_error(s->dm))) + return; + + if (unlikely(hlist_empty(&s->deleting_snapshots))) + return; + + se = hlist_entry(s->deleting_snapshots.first, struct snapshot_entry, e); + + delete_step(s, se); + + if (dm_multisnap_can_commit(s->dm)) { + if (unlikely(++s->delete_commit_count >= COMMIT_AFTER) || + unlikely(hlist_empty(&s->deleting_snapshots))) { + s->delete_commit_count = 0; + dm_multisnap_call_commit(s->dm); + } + } + + dm_multisnap_queue_work(s->dm, &s->delete_work); +} +
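
The same per-level bit arithmetic drives dm_thinp_find_snapshot_chunk(),
dm_thinp_make_chunk_writeable() and delete_step() above. As a standalone
illustration of how a chunk number decomposes into per-level offsets, here
is a small userspace sketch; the constants are made-up example values, not
the ones the store derives from its chunk sizes:

#include <stdio.h>

#define NODES_PER_CHUNK_BITS	3	/* example for s->radix_tree_nodes_per_chunk_bits */
#define ENTRIES_BITS		5	/* example for RADIX_TREE_ENTRIES_BITS */
#define LEVEL_BITS		(NODES_PER_CHUNK_BITS + ENTRIES_BITS)
#define NODE_MASK		((1 << LEVEL_BITS) - 1)	/* s->radix_tree_node_mask */

int main(void)
{
	unsigned long long chunk = 0x12345;	/* chunk number to translate */
	int depth = 2;				/* a depth-2 tree has 3 levels */
	int bits = (depth + 1) * LEVEL_BITS;

	if (chunk >> bits) {
		/* the tree would first have to grow, as in
		   dm_thinp_make_chunk_writeable() */
		printf("chunk lies beyond the tree's capacity\n");
		return 0;
	}
	for (bits -= LEVEL_BITS; bits >= 0; bits -= LEVEL_BITS, depth--) {
		unsigned offset = (chunk >> bits) & NODE_MASK;

		printf("depth %d: node-in-chunk %u, entry %u\n", depth,
		       offset >> ENTRIES_BITS,
		       offset & ((1 << ENTRIES_BITS) - 1));
	}
	return 0;
}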