--- drivers/md/persistent-data/Kconfig | 1 drivers/md/persistent-data/Makefile | 2 drivers/md/persistent-data/dm-block-manager.c | 1222 ++++++-------------- drivers/md/persistent-data/dm-block-manager.h | 33 drivers/md/persistent-data/dm-btree-internal.h | 6 drivers/md/persistent-data/dm-btree-remove.c | 109 - drivers/md/persistent-data/dm-btree-spine.c | 38 drivers/md/persistent-data/dm-btree.c | 72 - drivers/md/persistent-data/dm-btree.h | 19 drivers/md/persistent-data/dm-space-map-checker.c | 437 +++++++ drivers/md/persistent-data/dm-space-map-checker.h | 26 drivers/md/persistent-data/dm-space-map-common.c | 704 +++++++++++ drivers/md/persistent-data/dm-space-map-common.h | 52 drivers/md/persistent-data/dm-space-map-disk.c | 590 ++------- drivers/md/persistent-data/dm-space-map-metadata.c | 432 ------- drivers/md/persistent-data/dm-space-map.h | 16 drivers/md/persistent-data/dm-transaction-manager.c | 96 - 17 files changed, 1978 insertions(+), 1877 deletions(-) Index: linux-3.1-rc9/drivers/md/persistent-data/Kconfig =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/Kconfig +++ linux-3.1-rc9/drivers/md/persistent-data/Kconfig @@ -2,6 +2,7 @@ config DM_PERSISTENT_DATA tristate depends on BLK_DEV_DM && EXPERIMENTAL select LIBCRC32C + select DM_BUFIO ---help--- Library providing immutable on-disk data structure support for device-mapper targets such as the thin provisioning target. Index: linux-3.1-rc9/drivers/md/persistent-data/Makefile =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/Makefile +++ linux-3.1-rc9/drivers/md/persistent-data/Makefile @@ -1,6 +1,8 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ dm-block-manager.o \ + dm-space-map-checker.o \ + dm-space-map-common.o \ dm-space-map-disk.o \ dm-space-map-metadata.o \ dm-transaction-manager.o \ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-block-manager.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.c @@ -5,843 +5,453 @@ */ #include "dm-block-manager.h" #include "dm-persistent-data-internal.h" +#include "../dm-bufio.h" -#include +#include #include #include +#include #include +#include #define DM_MSG_PREFIX "block manager" /*----------------------------------------------------------------*/ -#define SECTOR_SIZE (1 << SECTOR_SHIFT) -#define MAX_CACHE_SIZE 16U - -enum dm_block_state { - BS_EMPTY, - BS_CLEAN, - BS_READING, - BS_WRITING, - BS_READ_LOCKED, - BS_READ_LOCKED_DIRTY, /* Block was dirty before it was read locked. */ - BS_WRITE_LOCKED, - BS_DIRTY, - BS_ERROR -}; - -struct dm_block { - struct list_head list; - struct hlist_node hlist; - - dm_block_t where; - struct dm_block_validator *validator; - void *data; - wait_queue_head_t io_q; - unsigned read_lock_count; - unsigned write_lock_pending; - enum dm_block_state state; - - /* - * Extra flags like REQ_FLUSH and REQ_FUA can be set here. This is - * mainly as to avoid a race condition in flush_and_unlock() where - * the newly-unlocked superblock may have been submitted for a - * write before the write_all_dirty() call is made. - */ - int io_flags; - - /* - * Sadly we need an up pointer so we can get to the bm on io - * completion. 
- */ - struct dm_block_manager *bm; -}; -struct dm_block_manager { - struct block_device *bdev; - unsigned cache_size; - unsigned max_held_per_thread; - unsigned block_size; /* In bytes */ - dm_block_t nr_blocks; - - /* - * This will trigger every time an io completes. - */ - wait_queue_head_t io_q; +/* + * This is a read/write semaphore with a couple of differences. + * + * i) There is a restriction on the number of concurrent read locks that + * may be held at once. This is just an implementation detail. + * + * ii) Recursive locking attempts are detected and return EINVAL. A stack + * trace is also emitted for the previous lock acquisition. + * + * iii) Priority is given to write locks. + */ +#define MAX_HOLDERS 4 +#define MAX_STACK 10 - struct dm_io_client *io; +typedef unsigned long stack_entries[MAX_STACK]; - /* - * Protects all the lists and the hash table. - */ +struct block_lock { spinlock_t lock; + __s32 count; + struct list_head waiters; + struct task_struct *holders[MAX_HOLDERS]; + +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace traces[MAX_HOLDERS]; + stack_entries entries[MAX_HOLDERS]; +#endif +}; - unsigned error_count; - unsigned available_count; - unsigned reading_count; - unsigned writing_count; - - struct list_head empty_list; /* No block assigned */ - struct list_head clean_list; /* Unlocked and clean */ - struct list_head dirty_list; /* Unlocked and dirty */ - struct list_head error_list; - - char buffer_cache_name[32]; - struct kmem_cache *buffer_cache; /* The buffers that store the raw data */ - - /* - * Hash table of cached blocks, holds everything that isn't in the - * BS_EMPTY state. - */ - unsigned hash_size; - unsigned hash_mask; - - struct hlist_head buckets[0]; /* Must be last member of struct. */ +struct waiter { + struct list_head list; + struct task_struct *task; + int wants_write; }; -dm_block_t dm_block_location(struct dm_block *b) +static unsigned __find_holder(struct block_lock *lock, + struct task_struct *task) { - return b->where; -} -EXPORT_SYMBOL_GPL(dm_block_location); + unsigned i; -void *dm_block_data(struct dm_block *b) -{ - return b->data; + for (i = 0; i < MAX_HOLDERS; i++) + if (lock->holders[i] == task) + break; + + BUG_ON(i == MAX_HOLDERS); + return i; } -EXPORT_SYMBOL_GPL(dm_block_data); -/*---------------------------------------------------------------- - * Hash table - *--------------------------------------------------------------*/ -static struct dm_block *__find_block(struct dm_block_manager *bm, dm_block_t b) +/* call this *after* you increment lock->count */ +static void __add_holder(struct block_lock *lock, struct task_struct *task) { - unsigned bucket = dm_hash_block(b, bm->hash_mask); - struct dm_block *blk; - struct hlist_node *n; + unsigned h = __find_holder(lock, NULL); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + struct stack_trace *t; +#endif - hlist_for_each_entry(blk, n, bm->buckets + bucket, hlist) - if (blk->where == b) - return blk; + get_task_struct(task); + lock->holders[h] = task; - return NULL; +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + t = lock->traces + h; + t->nr_entries = 0; + t->max_entries = MAX_STACK; + t->entries = lock->entries[h]; + t->skip = 2; + save_stack_trace(t); +#endif } -static void __insert_block(struct dm_block_manager *bm, struct dm_block *b) +/* call this *before* you decrement lock->count */ +static void __del_holder(struct block_lock *lock, struct task_struct *task) { - unsigned bucket = dm_hash_block(b->where, bm->hash_mask); - - hlist_add_head(&b->hlist, bm->buckets + 
bucket); + unsigned h = __find_holder(lock, task); + lock->holders[h] = NULL; + put_task_struct(task); } -/*---------------------------------------------------------------- - * Block state: - * __transition() handles transition of a block between different states. - * Study this to understand the state machine. - * - * Alternatively install graphviz and run: - * grep DOT dm-block-manager.c | grep -v ' ' | - * sed -e 's/.*DOT: //' -e 's/\*\///' | - * dot -Tps -o states.ps - * - * Assumes bm->lock is held. - *--------------------------------------------------------------*/ -static void __transition(struct dm_block *b, enum dm_block_state new_state) +static int __check_holder(struct block_lock *lock) { - /* DOT: digraph BlockStates { */ - struct dm_block_manager *bm = b->bm; + unsigned i; +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + static struct stack_trace t; + static stack_entries entries; +#endif - switch (new_state) { - case BS_EMPTY: - /* DOT: error -> empty */ - /* DOT: clean -> empty */ - BUG_ON(!((b->state == BS_ERROR) || - (b->state == BS_CLEAN))); - hlist_del(&b->hlist); - list_move(&b->list, &bm->empty_list); - b->write_lock_pending = 0; - b->read_lock_count = 0; - b->io_flags = 0; - b->validator = NULL; - - if (b->state == BS_ERROR) { - bm->error_count--; - bm->available_count++; + for (i = 0; i < MAX_HOLDERS; i++) { + if (lock->holders[i] == current) { + DMERR("recursive lock detected in pool metadata"); +#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING + DMERR("previously held here:"); + print_stack_trace(lock->traces + i, 4); + + DMERR("subsequent acquisition attempted here:"); + t.nr_entries = 0; + t.max_entries = MAX_STACK; + t.entries = entries; + t.skip = 3; + save_stack_trace(&t); + print_stack_trace(&t, 4); +#endif + return -EINVAL; } - break; + } - case BS_CLEAN: - /* DOT: reading -> clean */ - /* DOT: writing -> clean */ - /* DOT: read_locked -> clean */ - BUG_ON(!((b->state == BS_READING) || - (b->state == BS_WRITING) || - (b->state == BS_READ_LOCKED))); - switch (b->state) { - case BS_READING: - BUG_ON(!bm->reading_count); - bm->reading_count--; - break; + return 0; +} - case BS_WRITING: - BUG_ON(!bm->writing_count); - bm->writing_count--; - b->io_flags = 0; - break; +static void __wait(struct waiter *w) +{ + for (;;) { + set_task_state(current, TASK_UNINTERRUPTIBLE); - default: + if (!w->task) break; - } - list_add_tail(&b->list, &bm->clean_list); - bm->available_count++; - break; - - case BS_READING: - /* DOT: empty -> reading */ - BUG_ON(!(b->state == BS_EMPTY)); - __insert_block(bm, b); - list_del(&b->list); - bm->available_count--; - bm->reading_count++; - break; - - case BS_WRITING: - /* DOT: dirty -> writing */ - BUG_ON(!(b->state == BS_DIRTY)); - list_del(&b->list); - bm->writing_count++; - break; - - case BS_READ_LOCKED: - /* DOT: clean -> read_locked */ - BUG_ON(!(b->state == BS_CLEAN)); - list_del(&b->list); - bm->available_count--; - break; - - case BS_READ_LOCKED_DIRTY: - /* DOT: dirty -> read_locked_dirty */ - BUG_ON(!((b->state == BS_DIRTY))); - list_del(&b->list); - break; - - case BS_WRITE_LOCKED: - /* DOT: dirty -> write_locked */ - /* DOT: clean -> write_locked */ - BUG_ON(!((b->state == BS_DIRTY) || - (b->state == BS_CLEAN))); - list_del(&b->list); - - if (b->state == BS_CLEAN) - bm->available_count--; - break; - - case BS_DIRTY: - /* DOT: write_locked -> dirty */ - /* DOT: read_locked_dirty -> dirty */ - BUG_ON(!((b->state == BS_WRITE_LOCKED) || - (b->state == BS_READ_LOCKED_DIRTY))); - list_add_tail(&b->list, &bm->dirty_list); - break; - - case 
BS_ERROR: - /* DOT: writing -> error */ - /* DOT: reading -> error */ - BUG_ON(!((b->state == BS_WRITING) || - (b->state == BS_READING))); - bm->error_count++; - list_add_tail(&b->list, &bm->error_list); - break; + + schedule(); } - b->state = new_state; - /* DOT: } */ + set_task_state(current, TASK_RUNNING); } -/*---------------------------------------------------------------- - * Low-level io. - *--------------------------------------------------------------*/ -typedef void (completion_fn)(unsigned long error, struct dm_block *b); - -static void submit_io(struct dm_block *b, int rw, - completion_fn fn) +static void __wake_waiter(struct waiter *w) { - struct dm_block_manager *bm = b->bm; - struct dm_io_request req; - struct dm_io_region region; - unsigned sectors_per_block = bm->block_size >> SECTOR_SHIFT; - - region.bdev = bm->bdev; - region.sector = b->where * sectors_per_block; - region.count = sectors_per_block; - - req.bi_rw = rw; - req.mem.type = DM_IO_KMEM; - req.mem.offset = 0; - req.mem.ptr.addr = b->data; - req.notify.fn = (void (*)(unsigned long, void *)) fn; - req.notify.context = b; - req.client = bm->io; + struct task_struct *task; - if (dm_io(&req, 1, ®ion, NULL) < 0) - fn(1, b); + list_del(&w->list); + task = w->task; + smp_mb(); + w->task = NULL; + wake_up_process(task); } -/*---------------------------------------------------------------- - * High-level io. - *--------------------------------------------------------------*/ -static void __complete_io(unsigned long error, struct dm_block *b) +/* + * We either wake a few readers or a single writer. + */ +static void __wake_many(struct block_lock *lock) { - struct dm_block_manager *bm = b->bm; + struct waiter *w, *tmp; - if (error) { - DMERR("io error = %lu, block = %llu", - error , (unsigned long long)b->where); - __transition(b, BS_ERROR); - } else - __transition(b, BS_CLEAN); + BUG_ON(lock->count < 0); + list_for_each_entry_safe(w, tmp, &lock->waiters, list) { + if (lock->count >= MAX_HOLDERS) + return; - wake_up(&b->io_q); - wake_up(&bm->io_q); -} + if (w->wants_write) { + if (lock->count > 0) + return; /* still read locked */ -static void complete_io(unsigned long error, struct dm_block *b) -{ - struct dm_block_manager *bm = b->bm; - unsigned long flags; + lock->count = -1; + __add_holder(lock, w->task); + __wake_waiter(w); + return; + } - spin_lock_irqsave(&bm->lock, flags); - __complete_io(error, b); - spin_unlock_irqrestore(&bm->lock, flags); + lock->count++; + __add_holder(lock, w->task); + __wake_waiter(w); + } } -static void read_block(struct dm_block *b) +static void bl_init(struct block_lock *lock) { - submit_io(b, READ, complete_io); + int i; + + spin_lock_init(&lock->lock); + lock->count = 0; + INIT_LIST_HEAD(&lock->waiters); + for (i = 0; i < MAX_HOLDERS; i++) + lock->holders[i] = NULL; } -static void write_block(struct dm_block *b) +static int __available_for_read(struct block_lock *lock) { - if (b->validator) - b->validator->prepare_for_write(b->validator, b, - b->bm->block_size); - - submit_io(b, WRITE | b->io_flags, complete_io); + return lock->count >= 0 && + lock->count < MAX_HOLDERS && + list_empty(&lock->waiters); } -static void write_dirty(struct dm_block_manager *bm, unsigned count) +static int bl_down_read(struct block_lock *lock) { - struct dm_block *b, *tmp; - struct list_head dirty; - unsigned long flags; + int r; + struct waiter w; - /* - * Grab the first @count entries from the dirty list - */ - INIT_LIST_HEAD(&dirty); - spin_lock_irqsave(&bm->lock, flags); - list_for_each_entry_safe(b, tmp, 
&bm->dirty_list, list) { - if (!count--) - break; - __transition(b, BS_WRITING); - list_add_tail(&b->list, &dirty); + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; } - spin_unlock_irqrestore(&bm->lock, flags); - list_for_each_entry_safe(b, tmp, &dirty, list) { - list_del(&b->list); - write_block(b); - } -} + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } + + get_task_struct(current); + + w.task = current; + w.wants_write = 0; + list_add_tail(&w.list, &lock->waiters); + spin_unlock(&lock->lock); -static void write_all_dirty(struct dm_block_manager *bm) -{ - write_dirty(bm, bm->cache_size); + __wait(&w); + put_task_struct(current); + return 0; } -static void __clear_errors(struct dm_block_manager *bm) +static int bl_down_read_nonblock(struct block_lock *lock) { - struct dm_block *b, *tmp; - list_for_each_entry_safe(b, tmp, &bm->error_list, list) - __transition(b, BS_EMPTY); -} - -/*---------------------------------------------------------------- - * Waiting - *--------------------------------------------------------------*/ -#ifdef __CHECKER__ -# define __retains(x) __attribute__((context(x, 1, 1))) -#else -# define __retains(x) -#endif + int r; -#define __wait_block(wq, lock, flags, sched_fn, condition) \ -do { \ - DEFINE_WAIT(wait); \ - add_wait_queue(wq, &wait); \ - \ - for (;;) { \ - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - \ - spin_unlock_irqrestore(lock, flags); \ - sched_fn(); \ - spin_lock_irqsave(lock, flags); \ - } \ - \ - finish_wait(wq, &wait); \ -} while (0) + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) + goto out; -static void __wait_io(struct dm_block *b, unsigned long *flags) - __retains(&b->bm->lock) -{ - __wait_block(&b->io_q, &b->bm->lock, *flags, io_schedule, - ((b->state != BS_READING) && (b->state != BS_WRITING))); -} + if (__available_for_read(lock)) { + lock->count++; + __add_holder(lock, current); + r = 0; + } else + r = -EWOULDBLOCK; -static void __wait_unlocked(struct dm_block *b, unsigned long *flags) - __retains(&b->bm->lock) -{ - __wait_block(&b->io_q, &b->bm->lock, *flags, schedule, - ((b->state == BS_CLEAN) || (b->state == BS_DIRTY))); +out: + spin_unlock(&lock->lock); + return r; } -static void __wait_read_lockable(struct dm_block *b, unsigned long *flags) - __retains(&b->bm->lock) +static void bl_up_read(struct block_lock *lock) { - __wait_block(&b->io_q, &b->bm->lock, *flags, schedule, - (!b->write_lock_pending && (b->state == BS_CLEAN || - b->state == BS_DIRTY || - b->state == BS_READ_LOCKED))); + spin_lock(&lock->lock); + BUG_ON(lock->count <= 0); + __del_holder(lock, current); + --lock->count; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); } -static void __wait_all_writes(struct dm_block_manager *bm, unsigned long *flags) - __retains(&bm->lock) +static int bl_down_write(struct block_lock *lock) { - __wait_block(&bm->io_q, &bm->lock, *flags, io_schedule, - !bm->writing_count); -} + int r; + struct waiter w; -static void __wait_all_io(struct dm_block_manager *bm, unsigned long *flags) - __retains(&bm->lock) -{ - __wait_block(&bm->io_q, &bm->lock, *flags, io_schedule, - !bm->writing_count && !bm->reading_count); -} + spin_lock(&lock->lock); + r = __check_holder(lock); + if (r) { + spin_unlock(&lock->lock); + return r; + } -static void __wait_clean(struct dm_block_manager *bm, unsigned long *flags) - __retains(&bm->lock) -{ - 
__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule, - (!list_empty(&bm->clean_list) || - (!bm->writing_count))); -} + if (lock->count == 0 && list_empty(&lock->waiters)) { + lock->count = -1; + __add_holder(lock, current); + spin_unlock(&lock->lock); + return 0; + } -/*---------------------------------------------------------------- - * Finding a free block to recycle - *--------------------------------------------------------------*/ -static int __recycle_block(struct dm_block_manager *bm, dm_block_t where, - int need_read, struct dm_block_validator *v, - unsigned long flags, - struct dm_block **result) - __retains(&bm->lock) -{ - int r = 0; - struct dm_block *b; - unsigned long available; + get_task_struct(current); + w.task = current; + w.wants_write = 1; /* - * Wait for a block to appear on the empty or clean lists. + * Writers given priority. We know there's only one mutator in the + * system, so ignoring the ordering reversal. */ -retry: - while (1) { - /* - * The calling thread may hold some locks on blocks, and - * the rest be errored. In which case we're never going to - * succeed here. - */ - if (bm->error_count == bm->cache_size - bm->max_held_per_thread) - return -ENOMEM; - - /* - * Once we can lock and do io concurrently then we should - * probably flush at bm->cache_size / 2 and write _all_ - * dirty blocks. - */ - available = bm->available_count + bm->writing_count; - if (available < bm->cache_size / 4) { - spin_unlock_irqrestore(&bm->lock, flags); - write_dirty(bm, bm->cache_size / 4); - spin_lock_irqsave(&bm->lock, flags); - } - - if (!list_empty(&bm->empty_list)) { - b = list_first_entry(&bm->empty_list, struct dm_block, list); - break; - - } else if (!list_empty(&bm->clean_list)) { - b = list_first_entry(&bm->clean_list, struct dm_block, list); - __transition(b, BS_EMPTY); - break; - } - - __wait_clean(bm, &flags); - } - - b->where = where; - __transition(b, BS_READING); - - if (!need_read) { - memset(b->data, 0, bm->block_size); - b->validator = v; - __transition(b, BS_CLEAN); - } else { - spin_unlock_irqrestore(&bm->lock, flags); - read_block(b); - spin_lock_irqsave(&bm->lock, flags); - __wait_io(b, &flags); - - /* - * Has b been recycled whilst we were unlocked? - */ - if (b->where != where) - goto retry; - - /* - * Did the io succeed? - */ - if (b->state == BS_ERROR) { - /* - * Since this is a read that has failed we can clear the error - * immediately. Failed writes are revealed during a commit. - */ - __transition(b, BS_EMPTY); - r = -EIO; - } else { - /* - * We set the validator late, since there's a - * window while we're waiting for the read where - * someone could have set a different one. 
- */ - b->validator = v; - if (b->validator) { - r = b->validator->check(b->validator, b, bm->block_size); - if (r) { - DMERR("%s validator check failed for block %llu", - b->validator->name, (unsigned long long)b->where); - __transition(b, BS_EMPTY); - } - } - } - } + list_add(&w.list, &lock->waiters); + spin_unlock(&lock->lock); - if (!r) - *result = b; + __wait(&w); + put_task_struct(current); - return r; + return 0; } -/*---------------------------------------------------------------- - * Low level block management - *--------------------------------------------------------------*/ - -static struct kmem_cache *dm_block_cache; /* struct dm_block */ - -static struct dm_block *alloc_block(struct dm_block_manager *bm) +static void bl_up_write(struct block_lock *lock) { - struct dm_block *b = kmem_cache_alloc(dm_block_cache, GFP_KERNEL); - - if (!b) - return NULL; - - INIT_LIST_HEAD(&b->list); - INIT_HLIST_NODE(&b->hlist); + spin_lock(&lock->lock); + __del_holder(lock, current); + lock->count = 0; + if (!list_empty(&lock->waiters)) + __wake_many(lock); + spin_unlock(&lock->lock); +} - b->data = kmem_cache_alloc(bm->buffer_cache, GFP_KERNEL); - if (!b->data) { - kmem_cache_free(dm_block_cache, b); - return NULL; - } +static void report_recursive_bug(dm_block_t b, int r) +{ + if (r == -EINVAL) + DMERR("recursive acquisition of block %llu requested.", + (unsigned long long) b); +} - b->validator = NULL; - b->state = BS_EMPTY; - init_waitqueue_head(&b->io_q); - b->read_lock_count = 0; - b->write_lock_pending = 0; - b->io_flags = 0; - b->bm = bm; +/*----------------------------------------------------------------*/ - return b; +/* + * Block manager is currently implemented using dm-bufio. struct + * dm_block_manager and struct dm_block map directly onto a couple of + * structs in the bufio interface. I want to retain the freedom to move + * away from bufio in the future. So these structs are just cast within + * this .c file, rather than making it through to the public interface. 
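+ *
+ * To make the idiom concrete (an illustrative sketch of what the
+ * to_buffer()/to_bufio() helpers below expand to, not additional
+ * functionality):
+ *
+ *	struct dm_buffer *buf = (struct dm_buffer *) b;
+ *	dm_block_t where = dm_bufio_get_block_number(buf);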
+ */ +static struct dm_buffer *to_buffer(struct dm_block *b) +{ + return (struct dm_buffer *) b; } -static void free_block(struct dm_block *b) +static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm) { - kmem_cache_free(b->bm->buffer_cache, b->data); - kmem_cache_free(dm_block_cache, b); + return (struct dm_bufio_client *) bm; } -static int populate_bm(struct dm_block_manager *bm, unsigned count) +dm_block_t dm_block_location(struct dm_block *b) { - int i; - LIST_HEAD(bs); + return dm_bufio_get_block_number(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_location); - for (i = 0; i < count; i++) { - struct dm_block *b = alloc_block(bm); - if (!b) { - struct dm_block *tmp; - list_for_each_entry_safe(b, tmp, &bs, list) - free_block(b); - return -ENOMEM; - } +void *dm_block_data(struct dm_block *b) +{ + return dm_bufio_get_block_data(to_buffer(b)); +} +EXPORT_SYMBOL_GPL(dm_block_data); - list_add(&b->list, &bs); - } +struct buffer_aux { + struct dm_block_validator *validator; + struct block_lock lock; + int write_locked; +}; - list_replace(&bs, &bm->empty_list); - bm->available_count = count; +static void dm_block_manager_alloc_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + aux->validator = NULL; + bl_init(&aux->lock); +} - return 0; +static void dm_block_manager_write_callback(struct dm_buffer *buf) +{ + struct buffer_aux *aux = dm_bufio_get_aux_data(buf); + if (aux->validator) { + aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf, + dm_bufio_get_block_size(dm_bufio_get_client(buf))); + } } /*---------------------------------------------------------------- * Public interface *--------------------------------------------------------------*/ -static unsigned calc_hash_size(unsigned cache_size) -{ - unsigned r = 32; /* Minimum size is 16 */ - - while (r < cache_size) - r <<= 1; - - return r >> 1; -} - struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, unsigned block_size, unsigned cache_size, unsigned max_held_per_thread) { - unsigned i; - unsigned hash_size = calc_hash_size(cache_size); - size_t len = sizeof(struct dm_block_manager) + - sizeof(struct hlist_head) * hash_size; - struct dm_block_manager *bm; - - bm = kmalloc(len, GFP_KERNEL); - if (!bm) - return NULL; - - bm->bdev = bdev; - bm->cache_size = max(MAX_CACHE_SIZE, cache_size); - bm->max_held_per_thread = max_held_per_thread; - bm->block_size = block_size; - bm->nr_blocks = i_size_read(bdev->bd_inode); - do_div(bm->nr_blocks, block_size); - init_waitqueue_head(&bm->io_q); - spin_lock_init(&bm->lock); - - INIT_LIST_HEAD(&bm->empty_list); - INIT_LIST_HEAD(&bm->clean_list); - INIT_LIST_HEAD(&bm->dirty_list); - INIT_LIST_HEAD(&bm->error_list); - bm->error_count = 0; - bm->available_count = 0; - bm->reading_count = 0; - bm->writing_count = 0; - - sprintf(bm->buffer_cache_name, "dm_block_buffer-%d-%d", - MAJOR(disk_devt(bdev->bd_disk)), - MINOR(disk_devt(bdev->bd_disk))); - - bm->buffer_cache = kmem_cache_create(bm->buffer_cache_name, - block_size, SECTOR_SIZE, - 0, NULL); - if (!bm->buffer_cache) - goto bad_free_bm; - - bm->hash_size = hash_size; - bm->hash_mask = hash_size - 1; - for (i = 0; i < hash_size; i++) - INIT_HLIST_HEAD(bm->buckets + i); - - bm->io = dm_io_client_create(); - if (!bm->io) - goto bad_free_buffer_cache; - - if (populate_bm(bm, cache_size) < 0) - goto bad_free_io_client; - - return bm; - -bad_free_io_client: - dm_io_client_destroy(bm->io); -bad_free_buffer_cache: - kmem_cache_destroy(bm->buffer_cache); 
-bad_free_bm: - kfree(bm); - - return NULL; + return (struct dm_block_manager *) + dm_bufio_client_create(bdev, block_size, max_held_per_thread, + sizeof(struct buffer_aux), + dm_block_manager_alloc_callback, + dm_block_manager_write_callback); } EXPORT_SYMBOL_GPL(dm_block_manager_create); void dm_block_manager_destroy(struct dm_block_manager *bm) { - int i; - struct dm_block *b, *btmp; - struct hlist_node *n, *tmp; - - dm_io_client_destroy(bm->io); - - for (i = 0; i < bm->hash_size; i++) - hlist_for_each_entry_safe(b, n, tmp, bm->buckets + i, hlist) - free_block(b); - - list_for_each_entry_safe(b, btmp, &bm->empty_list, list) - free_block(b); - - kmem_cache_destroy(bm->buffer_cache); - - kfree(bm); + return dm_bufio_client_destroy(to_bufio(bm)); } EXPORT_SYMBOL_GPL(dm_block_manager_destroy); unsigned dm_bm_block_size(struct dm_block_manager *bm) { - return bm->block_size; + return dm_bufio_get_block_size(to_bufio(bm)); } EXPORT_SYMBOL_GPL(dm_bm_block_size); dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) { - return bm->nr_blocks; + return dm_bufio_get_device_size(to_bufio(bm)); } -static int lock_internal(struct dm_block_manager *bm, dm_block_t block, - int how, int need_read, int can_block, - struct dm_block_validator *v, - struct dm_block **result) -{ - int r = 0; - struct dm_block *b; - unsigned long flags; - - spin_lock_irqsave(&bm->lock, flags); -retry: - b = __find_block(bm, block); - if (b) { - /* - * The block may be in state BS_READING at this point. - * Which means we're racing for this block against another - * locking op. This is fine, __wait_read_lockable() below - * will do the right thing. We do need to be careful - * however that the validator isn't set until the lock is - * full granted, otherwise the other thread could get the - * lock, but this one's validator be used. This situation - * only arises if there's a programming error in the code - * driving bm. - */ - - switch (how) { - case READ: - if (b->write_lock_pending || (b->state != BS_CLEAN && - b->state != BS_DIRTY && - b->state != BS_READ_LOCKED)) { - if (!can_block) { - spin_unlock_irqrestore(&bm->lock, flags); - return -EWOULDBLOCK; - } - - __wait_read_lockable(b, &flags); - - if (b->where != block) - goto retry; - } - break; - - case WRITE: - while (b->state != BS_CLEAN && b->state != BS_DIRTY) { - if (!can_block) { - spin_unlock_irqrestore(&bm->lock, flags); - return -EWOULDBLOCK; - } - - b->write_lock_pending++; - __wait_unlocked(b, &flags); - if (b->where != block) - /* - * Recycled blocks have their - * write_lock_pending count reset - * to zero, so no need to undo the - * above increment. - */ - goto retry; - b->write_lock_pending--; - } - break; - } - - if (!need_read) - b->validator = v; - else { - if (b->validator && (v != b->validator)) { - DMERR("validator mismatch (old=%s vs new=%s) for block %llu", - b->validator->name, v ? 
v->name : "NULL", - (unsigned long long)b->where); - spin_unlock_irqrestore(&bm->lock, flags); - return -EINVAL; - } - - if (!b->validator && v) { - b->validator = v; - r = b->validator->check(b->validator, b, bm->block_size); - if (r) { - DMERR("%s validator check failed for block %llu", - b->validator->name, - (unsigned long long)b->where); - spin_unlock_irqrestore(&bm->lock, flags); - return r; - } - } - } - - } else if (!can_block) { - r = -EWOULDBLOCK; - goto out; - - } else - r = __recycle_block(bm, block, need_read, v, flags, &b); - - if (!r) { - switch (how) { - case READ: - b->read_lock_count++; - - if (b->state == BS_DIRTY) - __transition(b, BS_READ_LOCKED_DIRTY); - else if (b->state == BS_CLEAN) - __transition(b, BS_READ_LOCKED); - break; - - case WRITE: - __transition(b, BS_WRITE_LOCKED); - break; +static int dm_bm_validate_buffer(struct dm_block_manager *bm, + struct dm_buffer *buf, + struct buffer_aux *aux, + struct dm_block_validator *v) +{ + if (unlikely(!aux->validator)) { + int r; + if (!v) + return 0; + r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm))); + if (unlikely(r)) + return r; + aux->validator = v; + } else { + if (unlikely(aux->validator != v)) { + DMERR("validator mismatch (old=%s vs new=%s) for block %llu", + aux->validator->name, v ? v->name : "NULL", + (unsigned long long) + dm_bufio_get_block_number(buf)); + return -EINVAL; } - - *result = b; } -out: - spin_unlock_irqrestore(&bm->lock, flags); - - return r; + return 0; } - int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, struct dm_block_validator *v, struct dm_block **result) { - return lock_internal(bm, b, READ, 1, 1, v, result); + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_read(&aux->lock); + if (unlikely(r)) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; } EXPORT_SYMBOL_GPL(dm_bm_read_lock); @@ -849,7 +459,32 @@ int dm_bm_write_lock(struct dm_block_man dm_block_t b, struct dm_block_validator *v, struct dm_block **result) { - return lock_internal(bm, b, WRITE, 1, 1, v, result); + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + + aux->write_locked = 1; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_write(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; } EXPORT_SYMBOL_GPL(dm_bm_write_lock); @@ -857,142 +492,139 @@ int dm_bm_read_try_lock(struct dm_block_ dm_block_t b, struct dm_block_validator *v, struct dm_block **result) { - return lock_internal(bm, b, READ, 1, 0, v, result); + struct buffer_aux *aux; + void *p; + int r; + + p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + if (unlikely(!p)) + return -EWOULDBLOCK; + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = 
bl_down_read_nonblock(&aux->lock); + if (r < 0) { + dm_bufio_release(to_buffer(*result)); + report_recursive_bug(b, r); + return r; + } + aux->write_locked = 0; + + r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); + return r; + } + + return 0; } int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, struct dm_block_validator *v, struct dm_block **result) { - int r = lock_internal(bm, b, WRITE, 0, 1, v, result); + int r; + struct buffer_aux *aux; + void *p; + + p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result); + if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + + memset(p, 0, dm_bm_block_size(bm)); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); + if (r) { + dm_bufio_release(to_buffer(*result)); + return r; + } - if (!r) - memset((*result)->data, 0, bm->block_size); + aux->write_locked = 1; + aux->validator = v; - return r; + return 0; } int dm_bm_unlock(struct dm_block *b) { - int r = 0; - unsigned long flags; - - spin_lock_irqsave(&b->bm->lock, flags); - switch (b->state) { - case BS_WRITE_LOCKED: - __transition(b, BS_DIRTY); - wake_up(&b->io_q); - break; - - case BS_READ_LOCKED: - if (!--b->read_lock_count) { - __transition(b, BS_CLEAN); - wake_up(&b->io_q); - } - break; + struct buffer_aux *aux; + aux = dm_bufio_get_aux_data(to_buffer(b)); - case BS_READ_LOCKED_DIRTY: - if (!--b->read_lock_count) { - __transition(b, BS_DIRTY); - wake_up(&b->io_q); - } - break; + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); - default: - DMERR("block = %llu not locked", - (unsigned long long)b->where); - r = -EINVAL; - break; - } - spin_unlock_irqrestore(&b->bm->lock, flags); + dm_bufio_release(to_buffer(b)); - return r; + return 0; } EXPORT_SYMBOL_GPL(dm_bm_unlock); -static int __wait_flush(struct dm_block_manager *bm) +int dm_bm_unlock_move(struct dm_block *b, dm_block_t n) { - int r = 0; - unsigned long flags; + struct buffer_aux *aux; - spin_lock_irqsave(&bm->lock, flags); - __wait_all_writes(bm, &flags); + aux = dm_bufio_get_aux_data(to_buffer(b)); - if (!list_empty(&bm->error_list)) { - r = -EIO; - __clear_errors(bm); - } - spin_unlock_irqrestore(&bm->lock, flags); + if (aux->write_locked) { + dm_bufio_mark_buffer_dirty(to_buffer(b)); + bl_up_write(&aux->lock); + } else + bl_up_read(&aux->lock); - return r; + dm_bufio_release_move(to_buffer(b), n); + return 0; } int dm_bm_flush_and_unlock(struct dm_block_manager *bm, struct dm_block *superblock) { int r; - unsigned long flags; - write_all_dirty(bm); - r = __wait_flush(bm); - if (r) + r = dm_bufio_write_dirty_buffers(to_bufio(bm)); + if (unlikely(r)) + return r; + r = dm_bufio_issue_flush(to_bufio(bm)); + if (unlikely(r)) return r; - - spin_lock_irqsave(&bm->lock, flags); - superblock->io_flags = REQ_FUA | REQ_FLUSH; - spin_unlock_irqrestore(&bm->lock, flags); dm_bm_unlock(superblock); - write_all_dirty(bm); - return __wait_flush(bm); + r = dm_bufio_write_dirty_buffers(to_bufio(bm)); + if (unlikely(r)) + return r; + r = dm_bufio_issue_flush(to_bufio(bm)); + if (unlikely(r)) + return r; + + return 0; } int dm_bm_rebind_block_device(struct dm_block_manager *bm, struct block_device *bdev) { - unsigned long flags; - dm_block_t nr_blocks = i_size_read(bdev->bd_inode); - - do_div(nr_blocks, bm->block_size); - - spin_lock_irqsave(&bm->lock, flags); - if (nr_blocks < bm->nr_blocks) { - 
spin_unlock_irqrestore(&bm->lock, flags); - return -EINVAL; - } - - bm->bdev = bdev; - bm->nr_blocks = nr_blocks; - /* - * Wait for any in-flight io that may be using the old bdev + * !!! FIXME: remove this. It is supposedly unused. */ - __wait_all_io(bm, &flags); - spin_unlock_irqrestore(&bm->lock, flags); - return 0; } EXPORT_SYMBOL_GPL(dm_bm_rebind_block_device); -/*----------------------------------------------------------------*/ - -static int __init init_persistent_data(void) +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) { - dm_block_cache = KMEM_CACHE(dm_block, SLAB_HWCACHE_ALIGN); - if (!dm_block_cache) - return -ENOMEM; - - return 0; + return crc32c(~(u32) 0, data, len) ^ init_xor; } +EXPORT_SYMBOL_GPL(dm_bm_checksum); -static void __exit exit_persistent_data(void) -{ - kmem_cache_destroy(dm_block_cache); -} +/*----------------------------------------------------------------*/ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joe Thornber "); MODULE_DESCRIPTION("Immutable metadata library for dm"); -module_init(init_persistent_data); -module_exit(exit_persistent_data); + +/*----------------------------------------------------------------*/ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.h =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-block-manager.h +++ linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.h @@ -7,9 +7,8 @@ #ifndef _LINUX_DM_BLOCK_MANAGER_H #define _LINUX_DM_BLOCK_MANAGER_H -#include #include -#include +#include /*----------------------------------------------------------------*/ @@ -17,31 +16,21 @@ * Block number. */ typedef uint64_t dm_block_t; - -/* - * An opaque handle to a block of data. - */ struct dm_block; dm_block_t dm_block_location(struct dm_block *b); void *dm_block_data(struct dm_block *b); -/* - * Use CRC32 checksumming on data blocks. - */ -static inline uint32_t dm_block_csum_data(const void *data_le, unsigned length) -{ - return crc32c(~(u32)0, data_le, length); -} - /*----------------------------------------------------------------*/ -struct dm_block_manager; - /* + * @name should be a unique identifier for the block manager, no longer + * than 32 chars. + * * @max_held_per_thread should be the maximum number of locks, read or * write, that an individual thread holds at any one time. */ +struct dm_block_manager; struct dm_block_manager *dm_block_manager_create( struct block_device *bdev, unsigned block_size, unsigned cache_size, unsigned max_held_per_thread); @@ -108,6 +97,14 @@ int dm_bm_write_lock_zero(struct dm_bloc int dm_bm_unlock(struct dm_block *b); /* + * An optimisation; we often want to copy a block's contents to a new + * block. eg, as part of the shadowing operation. It's far better for + * bufio to do this move behind the scenes than hold 2 locks and memcpy the + * data. + */ +int dm_bm_unlock_move(struct dm_block *b, dm_block_t n); + +/* * It's a common idiom to have a superblock that should be committed last. * * @superblock should be write-locked on entry. 
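 * A typical commit sequence (an illustrative sketch; SUPERBLOCK_LOCATION and
 * sb_validator are caller-supplied names, not part of this interface):
 *
 *	struct dm_block *sb;
 *	int r = dm_bm_write_lock(bm, SUPERBLOCK_LOCATION, &sb_validator, &sb);
 *	if (r)
 *		return r;
 *	... update dm_block_data(sb) ...
 *	r = dm_bm_flush_and_unlock(bm, sb);
 *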
It will be unlocked during @@ -131,4 +128,8 @@ int dm_bm_flush_and_unlock(struct dm_blo int dm_bm_rebind_block_device(struct dm_block_manager *bm, struct block_device *bdev); +u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); + +/*----------------------------------------------------------------*/ + #endif /* _LINUX_DM_BLOCK_MANAGER_H */ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-internal.h =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-internal.h +++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-internal.h @@ -79,7 +79,7 @@ void init_shadow_spine(struct shadow_spi int exit_shadow_spine(struct shadow_spine *s); int shadow_step(struct shadow_spine *s, dm_block_t b, - struct dm_btree_value_type *vt, int *inc); + struct dm_btree_value_type *vt); /* * The spine must have at least one entry before calling this. @@ -108,8 +108,12 @@ static inline void *value_base(struct no return &n->keys[le32_to_cpu(n->header.max_entries)]; } +/* + * FIXME: Now that value size is stored in node we don't need the third parm. + */ static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size) { + BUG_ON(value_size != le32_to_cpu(n->header.value_size)); return value_base(n) + (value_size * index); } Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-remove.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-remove.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-remove.c @@ -56,54 +56,64 @@ static void node_shift(struct node *n, int shift) { uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); + uint32_t value_size = le32_to_cpu(n->header.value_size); if (shift < 0) { shift = -shift; + BUG_ON(shift > nr_entries); + BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); memmove(key_ptr(n, 0), key_ptr(n, shift), (nr_entries - shift) * sizeof(__le64)); - memmove(value_ptr(n, 0, sizeof(__le64)), - value_ptr(n, shift, sizeof(__le64)), - (nr_entries - shift) * sizeof(__le64)); + memmove(value_ptr(n, 0, value_size), + value_ptr(n, shift, value_size), + (nr_entries - shift) * value_size); } else { + BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); memmove(key_ptr(n, shift), key_ptr(n, 0), nr_entries * sizeof(__le64)); - memmove(value_ptr(n, shift, sizeof(__le64)), - value_ptr(n, 0, sizeof(__le64)), - nr_entries * sizeof(__le64)); + memmove(value_ptr(n, shift, value_size), + value_ptr(n, 0, value_size), + nr_entries * value_size); } } static void node_copy(struct node *left, struct node *right, int shift) { uint32_t nr_left = le32_to_cpu(left->header.nr_entries); + uint32_t value_size = le32_to_cpu(left->header.value_size); + BUG_ON(value_size != le32_to_cpu(right->header.value_size)); if (shift < 0) { shift = -shift; + BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); memcpy(key_ptr(left, nr_left), key_ptr(right, 0), shift * sizeof(__le64)); - memcpy(value_ptr(left, nr_left, sizeof(__le64)), - value_ptr(right, 0, sizeof(__le64)), - shift * sizeof(__le64)); + memcpy(value_ptr(left, nr_left, value_size), + value_ptr(right, 0, value_size), + shift * value_size); } else { + BUG_ON(shift > le32_to_cpu(right->header.max_entries)); memcpy(key_ptr(right, 0), key_ptr(left, nr_left - shift), shift * sizeof(__le64)); - memcpy(value_ptr(right, 0, sizeof(__le64)), - value_ptr(left, nr_left - shift, sizeof(__le64)), - shift * sizeof(__le64)); + memcpy(value_ptr(right, 0, 
value_size), + value_ptr(left, nr_left - shift, value_size), + shift * value_size); } } /* * Delete a specific entry from a leaf node. */ -static void delete_at(struct node *n, unsigned index, size_t value_size) +static void delete_at(struct node *n, unsigned index) { unsigned nr_entries = le32_to_cpu(n->header.nr_entries); unsigned nr_to_copy = nr_entries - (index + 1); + uint32_t value_size = le32_to_cpu(n->header.value_size); + BUG_ON(index >= nr_entries); if (nr_to_copy) { memmove(key_ptr(n, index), @@ -165,6 +175,9 @@ static int init_child(struct dm_btree_in if (inc) inc_children(info->tm, result->n, &le64_type); + *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = + cpu_to_le64(dm_block_location(result->block)); + return 0; } @@ -188,9 +201,11 @@ static void shift(struct node *left, str left->header.nr_entries = cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); + BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); right->header.nr_entries = cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); + BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); } static void __rebalance2(struct dm_btree_info *info, struct node *parent, @@ -207,10 +222,7 @@ static void __rebalance2(struct dm_btree */ node_copy(left, right, -nr_right); left->header.nr_entries = cpu_to_le32(nr_left + nr_right); - - *((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(l->block)); - delete_at(parent, r->index, sizeof(__le64)); + delete_at(parent, r->index); /* * We need to decrement the right block, but not it's @@ -222,12 +234,10 @@ static void __rebalance2(struct dm_btree * Rebalance. */ unsigned target_left = (nr_left + nr_right) / 2; - + unsigned shift_ = nr_left - target_left; + BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); + BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); shift(left, right, nr_left - target_left); - *((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(l->block)); - *((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(r->block)); *key_ptr(parent, r->index) = right->keys[0]; } } @@ -259,11 +269,7 @@ static int rebalance2(struct shadow_spin return r; } - r = exit_child(info, &right); - if (r) - return r; - - return 0; + return exit_child(info, &right); } static void __rebalance3(struct dm_btree_info *info, struct node *parent, @@ -280,6 +286,9 @@ static void __rebalance3(struct dm_btree unsigned target; + BUG_ON(left->header.max_entries != center->header.max_entries); + BUG_ON(center->header.max_entries != right->header.max_entries); + if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { /* * Delete center node: @@ -290,23 +299,20 @@ static void __rebalance3(struct dm_btree */ unsigned shift = min(max_entries - nr_left, nr_center); + BUG_ON(nr_left + shift > max_entries); node_copy(left, center, -shift); left->header.nr_entries = cpu_to_le32(nr_left + shift); if (shift != nr_center) { shift = nr_center - shift; + BUG_ON((nr_right + shift) >= max_entries); node_shift(right, shift); node_copy(center, right, shift); right->header.nr_entries = cpu_to_le32(nr_right + shift); } - - *((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(l->block)); - *((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(r->block)); *key_ptr(parent, r->index) = right->keys[0]; - 
delete_at(parent, c->index, sizeof(__le64)); + delete_at(parent, c->index); r->index--; dm_tm_dec(info->tm, dm_block_location(c->block)); @@ -319,7 +325,7 @@ static void __rebalance3(struct dm_btree * Rebalance */ target = (nr_left + nr_center + nr_right) / 3; - BUG_ON(target == nr_center); + BUG_ON(target > max_entries); /* * Adjust the left node @@ -330,14 +336,6 @@ static void __rebalance3(struct dm_btree * Adjust the right node */ shift(center, right, target - nr_right); - - *((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(l->block)); - *((__le64 *) value_ptr(parent, c->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(c->block)); - *((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) = - cpu_to_le64(dm_block_location(r->block)); - *key_ptr(parent, c->index) = center->keys[0]; *key_ptr(parent, r->index) = right->keys[0]; } @@ -428,9 +426,11 @@ static int rebalance_children(struct sha memcpy(n, dm_block_data(child), dm_bm_block_size(dm_tm_get_bm(info->tm))); r = dm_tm_unlock(info->tm, child); - dm_tm_dec(info->tm, dm_block_location(child)); + if (r) + return r; - return r; + dm_tm_dec(info->tm, dm_block_location(child)); + return 0; } i = lower_bound(n, key); @@ -444,9 +444,8 @@ static int rebalance_children(struct sha if (child_entries > del_threshold(n)) return 0; - has_left_sibling = i > 0 ? 1 : 0; - has_right_sibling = - (i >= (le32_to_cpu(n->header.nr_entries) - 1)) ? 0 : 1; + has_left_sibling = i > 0; + has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); if (!has_left_sibling) r = rebalance2(s, info, i); @@ -476,17 +475,17 @@ static int do_leaf(struct node *n, uint6 /* * Prepares for removal from one level of the hierarchy. The caller must - * actually call delete_at() to remove the entry at index. + * call delete_at() to remove the entry at index. 
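+ * (For illustration: dm_btree_remove() below follows exactly this pattern;
+ * remove_raw() walks the shadow spine down to the leaf, and once the value
+ * has been decremented the caller finishes with delete_at(n, index).)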
*/ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, struct dm_btree_value_type *vt, dm_block_t root, uint64_t key, unsigned *index) { - int i = *index, inc, r; + int i = *index, r; struct node *n; for (;;) { - r = shadow_step(s, root, vt, &inc); + r = shadow_step(s, root, vt); if (r < 0) break; @@ -497,13 +496,11 @@ static int remove_raw(struct shadow_spin */ if (shadow_has_parent(s)) { __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); - memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), + memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), &location, sizeof(__le64)); } n = dm_block_data(shadow_current(s)); - if (inc) - inc_children(info->tm, n, vt); if (le32_to_cpu(n->header.flags) & LEAF_NODE) return do_leaf(n, key, index); @@ -558,12 +555,10 @@ int dm_btree_remove(struct dm_btree_info info->value_type.dec(info->value_type.context, value_ptr(n, index, info->value_type.size)); - delete_at(n, index, info->value_type.size); - - r = 0; - *new_root = shadow_root(&spine); + delete_at(n, index); } + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-spine.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-spine.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-spine.c @@ -13,6 +13,12 @@ /*----------------------------------------------------------------*/ +#define BTREE_CSUM_XOR 121107 + +static int node_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size); + static void node_prepare_for_write(struct dm_block_validator *v, struct dm_block *b, size_t block_size) @@ -21,7 +27,11 @@ static void node_prepare_for_write(struc struct node_header *h = &n->header; h->blocknr = cpu_to_le64(dm_block_location(b)); - h->csum = cpu_to_le32(dm_block_csum_data(&h->flags, block_size - sizeof(__le32))); + h->csum = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); + + BUG_ON(node_check(v, b, block_size)); } static int node_check(struct dm_block_validator *v, @@ -32,6 +42,7 @@ static int node_check(struct dm_block_va struct node_header *h = &n->header; size_t value_size; __le32 csum_disk; + uint32_t flags; if (dm_block_location(b) != le64_to_cpu(h->blocknr)) { DMERR("node_check failed blocknr %llu wanted %llu", @@ -39,7 +50,9 @@ static int node_check(struct dm_block_va return -ENOTBLK; } - csum_disk = cpu_to_le32(dm_block_csum_data(&h->flags, block_size - sizeof(__le32))); + csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags, + block_size - sizeof(__le32), + BTREE_CSUM_XOR)); if (csum_disk != h->csum) { DMERR("node_check failed csum %u wanted %u", le32_to_cpu(csum_disk), le32_to_cpu(h->csum)); @@ -59,6 +72,15 @@ static int node_check(struct dm_block_va return -EILSEQ; } + /* + * The node must be either INTERNAL or LEAF. 
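+ * (That is, at least one of the two defined flag bits must be set;
+ * the check below rejects a block with neither as corruption.)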
+ */ + flags = le32_to_cpu(h->flags); + if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) { + DMERR("node_check failed, node is neither INTERNAL nor LEAF"); + return -EILSEQ; + } + return 0; } @@ -78,13 +100,13 @@ static int bn_read_lock(struct dm_btree_ static int bn_shadow(struct dm_btree_info *info, dm_block_t orig, struct dm_btree_value_type *vt, - struct dm_block **result, int *inc) + struct dm_block **result) { - int r; + int r, inc; r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator, - result, inc); - if (!r && *inc) + result, &inc); + if (!r && inc) inc_children(info->tm, dm_block_data(*result), vt); return r; @@ -174,7 +196,7 @@ int exit_shadow_spine(struct shadow_spin } int shadow_step(struct shadow_spine *s, dm_block_t b, - struct dm_btree_value_type *vt, int *inc) + struct dm_btree_value_type *vt) { int r; @@ -186,7 +208,7 @@ int shadow_step(struct shadow_spine *s, s->count--; } - r = bn_shadow(s->info, b, vt, s->nodes + s->count, inc); + r = bn_shadow(s->info, b, vt, s->nodes + s->count); if (!r) { if (!s->count) s->root = dm_block_location(s->nodes[0]); Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree.c @@ -119,7 +119,7 @@ static uint32_t calc_max_entries(size_t return 3 * n; } -int dm_btree_create(struct dm_btree_info *info, dm_block_t *root) +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root) { int r; struct dm_block *b; @@ -142,10 +142,9 @@ int dm_btree_create(struct dm_btree_info n->header.value_size = cpu_to_le32(info->value_type.size); *root = dm_block_location(b); - return unlock_block(info, b); } -EXPORT_SYMBOL_GPL(dm_btree_create); +EXPORT_SYMBOL_GPL(dm_btree_empty); /*----------------------------------------------------------------*/ @@ -201,7 +200,7 @@ static int push_frame(struct del_stack * if (ref_count > 1) /* * This is a shared node, so we can just decrement its * reference counter and leave the children. */ dm_tm_dec(s->tm, b); @@ -232,7 +231,7 @@ static void pop_frame(struct del_stack * dm_tm_unlock(s->tm, f->b); } -int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root) +int dm_btree_del(struct dm_btree_info *info, dm_block_t root) { int r; struct del_stack *s; @@ -240,7 +239,6 @@ int dm_btree_destroy(struct dm_btree_inf s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; - s->tm = info->tm; s->top = -1; @@ -293,16 +291,7 @@ out: kfree(s); return r; } -EXPORT_SYMBOL_GPL(dm_btree_destroy); - -// FIXME Implement or remove this fn before final submission. 
-int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key, - dm_block_t *new_root) -{ - /* FIXME: implement */ - return 0; -} -EXPORT_SYMBOL_GPL(dm_btree_delete_gt); +EXPORT_SYMBOL_GPL(dm_btree_del); /*----------------------------------------------------------------*/ @@ -587,17 +576,15 @@ static int btree_insert_raw(struct shado struct dm_btree_value_type *vt, uint64_t key, unsigned *index) { - int r, i = *index, inc, top = 1; + int r, i = *index, top = 1; struct node *node; for (;;) { - r = shadow_step(s, root, vt, &inc); + r = shadow_step(s, root, vt); if (r < 0) return r; node = dm_block_data(shadow_current(s)); - if (inc) - inc_children(s->info->tm, node, vt); /* * We have to patch up the parent node, ugly, but I don't @@ -644,13 +631,6 @@ static int btree_insert_raw(struct shado if (i < 0 || le64_to_cpu(node->keys[i]) != key) i++; - /* we're about to overwrite this value, so undo the increment for it */ - /* FIXME: shame that inc information is leaking outside the spine. - * Plus inc is just plain wrong in the event of a split */ - if (le64_to_cpu(node->keys[i]) == key && inc) - if (vt->dec) - vt->dec(vt->context, value_ptr(node, i, vt->size)); - *index = i; return 0; } @@ -688,7 +668,7 @@ static int insert(struct dm_btree_info * dm_block_t new_tree; __le64 new_le; - r = dm_btree_create(info, &new_tree); + r = dm_btree_empty(info, &new_tree); if (r < 0) goto bad; @@ -770,42 +750,6 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify /*----------------------------------------------------------------*/ -int dm_btree_clone(struct dm_btree_info *info, dm_block_t root, - dm_block_t *clone) -{ - int r; - struct dm_block *b, *orig_b; - struct node *b_node, *orig_node; - - /* Copy the root node */ - r = new_block(info, &b); - if (r < 0) - return r; - - r = dm_tm_read_lock(info->tm, root, &btree_node_validator, &orig_b); - if (r < 0) { - dm_block_t location = dm_block_location(b); - - unlock_block(info, b); - dm_tm_dec(info->tm, location); - } - - *clone = dm_block_location(b); - b_node = dm_block_data(b); - orig_node = dm_block_data(orig_b); - - memcpy(b_node, orig_node, - dm_bm_block_size(dm_tm_get_bm(info->tm))); - dm_tm_unlock(info->tm, orig_b); - inc_children(info->tm, b_node, &info->value_type); - dm_tm_unlock(info->tm, b); - - return 0; -} -EXPORT_SYMBOL_GPL(dm_btree_clone); - -/*----------------------------------------------------------------*/ - static int find_highest_key(struct ro_spine *s, dm_block_t block, uint64_t *result_key, dm_block_t *next_block) { Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree.h =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree.h +++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree.h @@ -91,21 +91,13 @@ struct dm_btree_info { /* * Set up an empty tree. O(1). */ -int dm_btree_create(struct dm_btree_info *info, dm_block_t *root); +int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); /* - * Destroy a tree. O(n) - this is the slow one! It can also block, so + * Delete a tree. O(n) - this is the slow one! It can also block, so * please don't call it on an IO path. */ -int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root); - -/* - * Delete part of a tree. This is really specific to truncation of - * thin devices. It only removes keys from the bottom level-btree that - * are greater than key[info->levels - 1]. 
- */ -int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key, - dm_block_t *new_root); +int dm_btree_del(struct dm_btree_info *info, dm_block_t root); /* * All the lookup functions return -ENODATA if the key cannot be found. @@ -143,11 +135,6 @@ int dm_btree_remove(struct dm_btree_info uint64_t *keys, dm_block_t *new_root); /* - * Clone a tree. O(1) - */ -int dm_btree_clone(struct dm_btree_info *info, dm_block_t root, dm_block_t *clone); - -/* * Returns < 0 on failure. Otherwise the number of key entries that have * been filled out. Remember trees can have zero entries, and as such have * no highest key. Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.c =================================================================== --- /dev/null +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.c @@ -0,0 +1,437 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#include "dm-space-map-checker.h" + +#include + +#ifdef CONFIG_DM_DEBUG_SPACE_MAPS + +#define DM_MSG_PREFIX "space map checker" + +/*----------------------------------------------------------------*/ + +struct count_array { + dm_block_t nr; + dm_block_t nr_free; + + uint32_t *counts; +}; + +static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) +{ + if (b >= ca->nr) + return -EINVAL; + + *count = ca->counts[b]; + return 0; +} + +static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) +{ + if (b >= ca->nr) + return -EINVAL; + + *r = ca->counts[b] > 1; + return 0; +} + +static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) +{ + uint32_t old_count; + + if (b >= ca->nr) + return -EINVAL; + + old_count = ca->counts[b]; + + if (!count && old_count) + ca->nr_free++; + + else if (count && !old_count) + ca->nr_free--; + + ca->counts[b] = count; + return 0; +} + +static int ca_inc_block(struct count_array *ca, dm_block_t b) +{ + if (b >= ca->nr) + return -EINVAL; + + ca_set_count(ca, b, ca->counts[b] + 1); + return 0; +} + +static int ca_dec_block(struct count_array *ca, dm_block_t b) +{ + if (b >= ca->nr) + return -EINVAL; + + BUG_ON(ca->counts[b] == 0); + ca_set_count(ca, b, ca->counts[b] - 1); + return 0; +} + +static int ca_create(struct count_array *ca, struct dm_space_map *sm) +{ + int r; + dm_block_t nr_blocks; + + r = dm_sm_get_nr_blocks(sm, &nr_blocks); + if (r) + return r; + + ca->nr = nr_blocks; + ca->nr_free = nr_blocks; + ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL); + if (!ca->counts) + return -ENOMEM; + + return 0; +} + +static int ca_load(struct count_array *ca, struct dm_space_map *sm) +{ + int r; + uint32_t count; + dm_block_t nr_blocks, i; + + r = dm_sm_get_nr_blocks(sm, &nr_blocks); + if (r) + return r; + + BUG_ON(ca->nr != nr_blocks); + + DMWARN("Loading debug space map from disk. 
This may take some time"); + for (i = 0; i < nr_blocks; i++) { + r = dm_sm_get_count(sm, i, &count); + if (r) { + DMERR("load failed"); + return r; + } + + ca_set_count(ca, i, count); + } + DMWARN("Load complete"); + + return 0; +} + +static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) +{ + dm_block_t nr_blocks = ca->nr + extra_blocks; + uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL); + if (!counts) + return -ENOMEM; + + memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); + kfree(ca->counts); + ca->nr = nr_blocks; + ca->nr_free += extra_blocks; + ca->counts = counts; + return 0; +} + +static int ca_commit(struct count_array *old, struct count_array *new) +{ + if (old->nr != new->nr) { + BUG_ON(old->nr > new->nr); + ca_extend(old, new->nr - old->nr); + } + + BUG_ON(old->nr != new->nr); + old->nr_free = new->nr_free; + memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); + return 0; +} + +static void ca_destroy(struct count_array *ca) +{ + kfree(ca->counts); +} + +/*----------------------------------------------------------------*/ + +struct sm_checker { + struct dm_space_map sm; + + struct count_array old_counts; + struct count_array counts; + + struct dm_space_map *real_sm; +}; + +static void sm_checker_destroy(struct dm_space_map *sm) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + + dm_sm_destroy(smc->real_sm); + ca_destroy(&smc->old_counts); + ca_destroy(&smc->counts); + kfree(smc); +} + +static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_get_nr_blocks(smc->real_sm, count); + if (!r) + BUG_ON(smc->old_counts.nr != *count); + return r; +} + +static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_get_nr_free(smc->real_sm, count); + if (!r) { + /* + * Slow, but we know it's correct. 
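+		 *
+		 * A block is only considered free if its count is zero in
+		 * both the committed counts and the current counts, since
+		 * blocks freed in this transaction must not be reused
+		 * until after the next commit (see dm-space-map.h).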
+ */ + dm_block_t b, n = 0; + for (b = 0; b < smc->old_counts.nr; b++) + if (smc->old_counts.counts[b] == 0 && + smc->counts.counts[b] == 0) + n++; + + if (n != *count) + DMERR("free block counts differ, checker %u, sm-disk:%u", + (unsigned) n, (unsigned) *count); + } + return r; +} + +static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_new_block(smc->real_sm, b); + + if (!r) { + BUG_ON(*b >= smc->old_counts.nr); + BUG_ON(smc->old_counts.counts[*b] != 0); + BUG_ON(*b >= smc->counts.nr); + BUG_ON(smc->counts.counts[*b] != 0); + ca_set_count(&smc->counts, *b, 1); + } + + return r; +} + +static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_inc_block(smc->real_sm, b); + int r2 = ca_inc_block(&smc->counts, b); + BUG_ON(r != r2); + return r; +} + +static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_dec_block(smc->real_sm, b); + int r2 = ca_dec_block(&smc->counts, b); + BUG_ON(r != r2); + return r; +} + +static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + uint32_t result2 = 0; + int r = dm_sm_get_count(smc->real_sm, b, result); + int r2 = ca_get_count(&smc->counts, b, &result2); + + BUG_ON(r != r2); + if (!r) + BUG_ON(*result != result2); + return r; +} + +static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int result2 = 0; + int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); + int r2 = ca_count_more_than_one(&smc->counts, b, &result2); + + BUG_ON(r != r2); + if (!r) + BUG_ON(!(*result) && result2); + return r; +} + +static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + uint32_t old_rc; + int r = dm_sm_set_count(smc->real_sm, b, count); + int r2; + + BUG_ON(b >= smc->counts.nr); + old_rc = smc->counts.counts[b]; + r2 = ca_set_count(&smc->counts, b, count); + BUG_ON(r != r2); + + return r; +} + +static int sm_checker_commit(struct dm_space_map *sm) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r; + + r = dm_sm_commit(smc->real_sm); + if (r) + return r; + + r = ca_commit(&smc->old_counts, &smc->counts); + if (r) + return r; + + return 0; +} + +static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + int r = dm_sm_extend(smc->real_sm, extra_blocks); + if (r) + return r; + + return ca_extend(&smc->counts, extra_blocks); +} + +static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + return dm_sm_root_size(smc->real_sm, result); +} + +static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) +{ + struct sm_checker *smc = container_of(sm, struct sm_checker, sm); + return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); +} + +/*----------------------------------------------------------------*/ + +static struct dm_space_map ops_ = { + .destroy = sm_checker_destroy, + .get_nr_blocks = sm_checker_get_nr_blocks, 
+ .get_nr_free = sm_checker_get_nr_free, + .inc_block = sm_checker_inc_block, + .dec_block = sm_checker_dec_block, + .new_block = sm_checker_new_block, + .get_count = sm_checker_get_count, + .count_is_more_than_one = sm_checker_count_more_than_one, + .set_count = sm_checker_set_count, + .commit = sm_checker_commit, + .extend = sm_checker_extend, + .root_size = sm_checker_root_size, + .copy_root = sm_checker_copy_root +}; + +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) +{ + int r; + struct sm_checker *smc; + + if (!sm) + return NULL; + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) + return NULL; + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); + return NULL; + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); + return NULL; + } + + smc->real_sm = sm; + + r = ca_load(&smc->counts, sm); + if (r) { + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); + return NULL; + } + + r = ca_commit(&smc->old_counts, &smc->counts); + if (r) { + ca_destroy(&smc->counts); + ca_destroy(&smc->old_counts); + kfree(smc); + return NULL; + } + + return &smc->sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create); + +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) +{ + int r; + struct sm_checker *smc; + + if (!sm) + return NULL; + + smc = kmalloc(sizeof(*smc), GFP_KERNEL); + if (!smc) + return NULL; + + memcpy(&smc->sm, &ops_, sizeof(smc->sm)); + r = ca_create(&smc->old_counts, sm); + if (r) { + kfree(smc); + return NULL; + } + + r = ca_create(&smc->counts, sm); + if (r) { + ca_destroy(&smc->old_counts); + kfree(smc); + return NULL; + } + + smc->real_sm = sm; + return &smc->sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); + +/*----------------------------------------------------------------*/ + +#else + +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) +{ + return sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create); + +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) +{ + return sm; +} +EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); + +/*----------------------------------------------------------------*/ + +#endif Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.h =================================================================== --- /dev/null +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. + */ + +#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H +#define SNAPSHOTS_SPACE_MAP_CHECKER_H + +#include "dm-space-map.h" + +/*----------------------------------------------------------------*/ + +/* + * This space map wraps a real on-disk space map, and verifies all of its + * operations. It uses a lot of memory, so only use if you have a specific + * problem that you're debugging. + * + * Ownership of @sm passes. + */ +struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); +struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); + +/*----------------------------------------------------------------*/ + +#endif Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.c =================================================================== --- /dev/null +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.c @@ -0,0 +1,704 @@ +/* + * Copyright (C) 2011 Red Hat, Inc. + * + * This file is released under the GPL. 
+ */ + +#include "dm-space-map-common.h" +#include "dm-transaction-manager.h" + +#include +#include + +#define DM_MSG_PREFIX "space map common" + +/*----------------------------------------------------------------*/ + +/* + * Index validator. + */ +#define INDEX_CSUM_XOR 160478 + +static void index_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + + mi_le->blocknr = cpu_to_le64(dm_block_location(b)); + mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); +} + +static int index_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_metadata_index *mi_le = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { + DMERR("index_check failed blocknr %llu wanted %llu", + le64_to_cpu(mi_le->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding, + block_size - sizeof(__le32), + INDEX_CSUM_XOR)); + if (csum_disk != mi_le->csum) { + DMERR("index_check failed csum %u wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator index_validator = { + .name = "index", + .prepare_for_write = index_prepare_for_write, + .check = index_check +}; + +/*----------------------------------------------------------------*/ + +/* + * Bitmap validator + */ +#define BITMAP_CSUM_XOR 240779 + +static void bitmap_prepare_for_write(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + + disk_header->blocknr = cpu_to_le64(dm_block_location(b)); + disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); +} + +static int bitmap_check(struct dm_block_validator *v, + struct dm_block *b, + size_t block_size) +{ + struct disk_bitmap_header *disk_header = dm_block_data(b); + __le32 csum_disk; + + if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { + DMERR("bitmap check failed blocknr %llu wanted %llu", + le64_to_cpu(disk_header->blocknr), dm_block_location(b)); + return -ENOTBLK; + } + + csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used, + block_size - sizeof(__le32), + BITMAP_CSUM_XOR)); + if (csum_disk != disk_header->csum) { + DMERR("bitmap check failed csum %u wanted %u", + le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); + return -EILSEQ; + } + + return 0; +} + +static struct dm_block_validator dm_sm_bitmap_validator = { + .name = "sm_bitmap", + .prepare_for_write = bitmap_prepare_for_write, + .check = bitmap_check +}; + +/*----------------------------------------------------------------*/ + +#define ENTRIES_PER_WORD 32 +#define ENTRIES_SHIFT 5 + +static void *dm_bitmap_data(struct dm_block *b) +{ + return dm_block_data(b) + sizeof(struct disk_bitmap_header); +} + +#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL + +static unsigned bitmap_word_used(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + uint64_t bits = le64_to_cpu(*w_le); + uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH; + + return !(~bits & mask); +} + +static unsigned sm_lookup_bitmap(void *addr, unsigned b) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + + 
return (!!test_bit_le(b, (void *) w_le) << 1) | + (!!test_bit_le(b + 1, (void *) w_le)); +} + +static void sm_set_bitmap(void *addr, unsigned b, unsigned val) +{ + __le64 *words_le = addr; + __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); + + b = (b & (ENTRIES_PER_WORD - 1)) << 1; + + if (val & 2) + __set_bit_le(b, (void *) w_le); + else + __clear_bit_le(b, (void *) w_le); + + if (val & 1) + __set_bit_le(b + 1, (void *) w_le); + else + __clear_bit_le(b + 1, (void *) w_le); +} + +static int sm_find_free(void *addr, unsigned begin, unsigned end, + unsigned *result) +{ + while (begin < end) { + if (!(begin & (ENTRIES_PER_WORD - 1)) && + bitmap_word_used(addr, begin)) { + begin += ENTRIES_PER_WORD; + continue; + } + + if (!sm_lookup_bitmap(addr, begin)) { + *result = begin; + return 0; + } + + begin++; + } + + return -ENOSPC; +} + +/*----------------------------------------------------------------*/ + +static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + ll->tm = tm; + + ll->bitmap_info.tm = tm; + ll->bitmap_info.levels = 1; + + /* + * Because the new bitmap blocks are created via a shadow + * operation, the old entry has already had its reference count + * decremented and we don't need the btree to do any bookkeeping. + */ + ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry); + ll->bitmap_info.value_type.inc = NULL; + ll->bitmap_info.value_type.dec = NULL; + ll->bitmap_info.value_type.equal = NULL; + + ll->ref_count_info.tm = tm; + ll->ref_count_info.levels = 1; + ll->ref_count_info.value_type.size = sizeof(uint32_t); + ll->ref_count_info.value_type.inc = NULL; + ll->ref_count_info.value_type.dec = NULL; + ll->ref_count_info.value_type.equal = NULL; + + ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); + + if (ll->block_size > (1 << 30)) { + DMERR("block size too big to hold bitmaps"); + return -EINVAL; + } + + ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * + ENTRIES_PER_BYTE; + ll->nr_blocks = 0; + ll->bitmap_root = 0; + ll->ref_count_root = 0; + + return 0; +} + +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks) +{ + int r; + dm_block_t i, nr_blocks, nr_indexes; + unsigned old_blocks, blocks; + + nr_blocks = ll->nr_blocks + extra_blocks; + old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block); + blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); + + nr_indexes = dm_sector_div_up(nr_blocks, ll->entries_per_block); + if (nr_indexes > ll->max_entries(ll)) { + DMERR("space map too large"); + return -EINVAL; + } + + for (i = old_blocks; i < blocks; i++) { + struct dm_block *b; + struct disk_index_entry idx; + + r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b); + if (r < 0) + return r; + idx.blocknr = cpu_to_le64(dm_block_location(b)); + + r = dm_tm_unlock(ll->tm, b); + if (r < 0) + return r; + + idx.nr_free = cpu_to_le32(ll->entries_per_block); + idx.none_free_before = 0; + + r = ll->save_ie(ll, i, &idx); + if (r < 0) + return r; + } + + ll->nr_blocks = nr_blocks; + return 0; +} + +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + int r; + dm_block_t index = b; + struct disk_index_entry ie_disk; + struct dm_block *blk; + + b = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); + + return dm_tm_unlock(ll->tm, blk); +} + 
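+/*
+ * A note on the reference count encoding used above: each block gets a
+ * 2-bit entry in the bitmap, so counts 0-2 are stored there directly.
+ * The value 3 is a sentinel meaning "look the real count up in the ref
+ * count btree".  For example, a block with a reference count of 5 is
+ * stored as bitmap value 3 plus a btree entry mapping the block number
+ * to cpu_to_le32(5), whereas a count of 2 lives entirely in the bitmap
+ * (see sm_ll_insert() below).
+ */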
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) +{ + __le32 le_rc; + int r = sm_ll_lookup_bitmap(ll, b, result); + + if (r) + return r; + + if (*result != 3) + return r; + + r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); + if (r < 0) + return r; + + *result = le32_to_cpu(le_rc); + + return r; +} + +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result) +{ + int r; + struct disk_index_entry ie_disk; + dm_block_t i, index_begin = begin; + dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); + + /* + * FIXME: Use shifts + */ + begin = do_div(index_begin, ll->entries_per_block); + end = do_div(end, ll->entries_per_block); + + for (i = index_begin; i < index_end; i++, begin = 0) { + struct dm_block *blk; + unsigned position; + uint32_t bit_end; + + r = ll->load_ie(ll, i, &ie_disk); + if (r < 0) + return r; + + if (le32_to_cpu(ie_disk.nr_free) == 0) + continue; + + r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &blk); + if (r < 0) + return r; + + bit_end = (i == index_end - 1) ? end : ll->entries_per_block; + + r = sm_find_free(dm_bitmap_data(blk), + max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)), + bit_end, &position); + if (r == -ENOSPC) { + /* + * This might happen because we started searching + * part way through the bitmap. + */ + dm_tm_unlock(ll->tm, blk); + continue; + + } else if (r < 0) { + dm_tm_unlock(ll->tm, blk); + return r; + } + + r = dm_tm_unlock(ll->tm, blk); + if (r < 0) + return r; + + *result = i * ll->entries_per_block + (dm_block_t) position; + return 0; + } + + return -ENOSPC; +} + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, enum allocation_event *ev) +{ + int r; + uint32_t bit, old; + struct dm_block *nb; + dm_block_t index = b; + struct disk_index_entry ie_disk; + void *bm_le; + int inc; + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ie_disk); + if (r < 0) + return r; + + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr), + &dm_sm_bitmap_validator, &nb, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); + + bm_le = dm_bitmap_data(nb); + old = sm_lookup_bitmap(bm_le, bit); + + if (ref_count <= 2) { + sm_set_bitmap(bm_le, bit, ref_count); + + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + if (old > 2) { +#if 0 + /* FIXME: bug in dm_btree_remove causes corruption */ + r = dm_btree_remove(&ll->ref_count_info, + ll->ref_count_root, + &b, &ll->ref_count_root); + if (r) + return r; +#endif + } + + } else { + __le32 le_rc = cpu_to_le32(ref_count); + + sm_set_bitmap(bm_le, bit, 3); + r = dm_tm_unlock(ll->tm, nb); + if (r < 0) + return r; + + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + } + + if (ref_count && !old) { + *ev = SM_ALLOC; + ll->nr_allocated++; + ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); + if (le32_to_cpu(ie_disk.none_free_before) == bit) + ie_disk.none_free_before = cpu_to_le32(bit + 1); + + } else if (old && !ref_count) { + *ev = SM_FREE; + ll->nr_allocated--; + ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); + ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); + } + + return ll->save_ie(ll, index, &ie_disk); +} + 
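+/*
+ * sm_ll_inc() and sm_ll_dec() below are thin wrappers around
+ * sm_ll_insert(): they look up the current count and re-insert it plus
+ * or minus one.  The allocation_event result tells the caller whether
+ * the count crossed the zero boundary (SM_ALLOC/SM_FREE); sm-disk uses
+ * this to maintain its count of blocks allocated this transaction.
+ */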
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + int r; + uint32_t rc; + + r = sm_ll_lookup(ll, b, &rc); + if (r) + return r; + + return sm_ll_insert(ll, b, rc + 1, ev); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +{ + int r; + uint32_t rc; + + r = sm_ll_lookup(ll, b, &rc); + if (r) + return r; + + if (!rc) + return -EINVAL; + + return sm_ll_insert(ll, b, rc - 1, ev); +} + +int sm_ll_commit(struct ll_disk *ll) +{ + return ll->commit(ll); +} + +/*----------------------------------------------------------------*/ + +static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + memcpy(ie, ll->mi_le.index + index, sizeof(*ie)); + return 0; +} + +static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + memcpy(ll->mi_le.index + index, ie, sizeof(*ie)); + return 0; +} + +static int metadata_ll_init_index(struct ll_disk *ll) +{ + int r; + struct dm_block *b; + + r = dm_tm_new_block(ll->tm, &index_validator, &b); + if (r < 0) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + return dm_tm_unlock(ll->tm, b); +} + +static int metadata_ll_open(struct ll_disk *ll) +{ + int r; + struct dm_block *block; + + r = dm_tm_read_lock(ll->tm, ll->bitmap_root, + &index_validator, &block); + if (r) + return r; + + memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); + return dm_tm_unlock(ll->tm, block); +} + +static dm_block_t metadata_ll_max_entries(struct ll_disk *ll) +{ + return MAX_METADATA_BITMAPS; +} + +static int metadata_ll_commit(struct ll_disk *ll) +{ + int r, inc; + struct dm_block *b; + + r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); + if (r) + return r; + + memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); + ll->bitmap_root = dm_block_location(b); + + return dm_tm_unlock(ll->tm, b); +} + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = metadata_ll_load_ie; + ll->save_ie = metadata_ll_save_ie; + ll->init_index = metadata_ll_init_index; + ll->open_index = metadata_ll_open; + ll->max_entries = metadata_ll_max_entries; + ll->commit = metadata_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ + +static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + return 
dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); +} + +static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, + struct disk_index_entry *ie) +{ + __dm_bless_for_disk(ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &index, ie, &ll->bitmap_root); +} + +static int disk_ll_init_index(struct ll_disk *ll) +{ + return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); +} + +static int disk_ll_open(struct ll_disk *ll) +{ + /* nothing to do */ + return 0; +} + +static dm_block_t disk_ll_max_entries(struct ll_disk *ll) +{ + return -1ULL; +} + +static int disk_ll_commit(struct ll_disk *ll) +{ + return 0; +} + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) +{ + int r; + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = 0; + ll->nr_allocated = 0; + + r = ll->init_index(ll); + if (r < 0) + return r; + + r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root); + if (r < 0) + return r; + + return 0; +} + +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + int r; + struct disk_sm_root *smr = root_le; + + if (len < sizeof(struct disk_sm_root)) { + DMERR("sm_metadata root too small"); + return -ENOMEM; + } + + r = sm_ll_init(ll, tm); + if (r < 0) + return r; + + ll->load_ie = disk_ll_load_ie; + ll->save_ie = disk_ll_save_ie; + ll->init_index = disk_ll_init_index; + ll->open_index = disk_ll_open; + ll->max_entries = disk_ll_max_entries; + ll->commit = disk_ll_commit; + + ll->nr_blocks = le64_to_cpu(smr->nr_blocks); + ll->nr_allocated = le64_to_cpu(smr->nr_allocated); + ll->bitmap_root = le64_to_cpu(smr->bitmap_root); + ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + + return ll->open_index(ll); +} + +/*----------------------------------------------------------------*/ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.h =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-common.h +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.h @@ -9,8 +9,9 @@ #include "dm-btree.h" +/*----------------------------------------------------------------*/ + /* - *-------------------------------------------------------------------- * Low level disk format * * Bitmap btree @@ -26,7 +27,6 @@ * * Any entry that has a ref count higher than 2 gets entered in the ref * count tree. The leaf values for this tree is the 32-bit ref count. 
- *--------------------------------------------------------------------- */ struct disk_index_entry { @@ -45,6 +45,15 @@ struct disk_metadata_index { struct disk_index_entry index[MAX_METADATA_BITMAPS]; } __packed; +struct ll_disk; + +typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result); +typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie); +typedef int (*init_index_fn)(struct ll_disk *ll); +typedef int (*open_index_fn)(struct ll_disk *ll); +typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); +typedef int (*commit_fn)(struct ll_disk *ll); + struct ll_disk { struct dm_transaction_manager *tm; struct dm_btree_info bitmap_info; @@ -63,6 +72,12 @@ struct ll_disk { dm_block_t ref_count_root; struct disk_metadata_index mi_le; + load_ie_fn load_ie; + save_ie_fn save_ie; + init_index_fn init_index; + open_index_fn open_index; + max_index_entries_fn max_entries; + commit_fn commit; }; struct disk_sm_root { @@ -80,15 +95,32 @@ struct disk_bitmap_header { __le64 blocknr; } __packed; -/* - * These bitops work on a block's worth of bits. - */ -unsigned sm_lookup_bitmap(void *addr, unsigned b); -void sm_set_bitmap(void *addr, unsigned b, unsigned val); -int sm_find_free(void *addr, unsigned begin, unsigned end, unsigned *result); +enum allocation_event { + SM_NONE, + SM_ALLOC, + SM_FREE, +}; + +/*----------------------------------------------------------------*/ -void *dm_bitmap_data(struct dm_block *b); +int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); +int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); +int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, + dm_block_t end, dm_block_t *result); +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); +int sm_ll_commit(struct ll_disk *ll); + +int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); + +int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm); +int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm, + void *root_le, size_t len); -extern struct dm_block_validator dm_sm_bitmap_validator; +/*----------------------------------------------------------------*/ #endif /* DM_SPACE_MAP_COMMON_H */ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-disk.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-disk.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-disk.c @@ -4,6 +4,7 @@ * This file is released under the GPL. 
*/ +#include "dm-space-map-checker.h" #include "dm-space-map-common.h" #include "dm-space-map-disk.h" #include "dm-space-map.h" @@ -11,450 +12,13 @@ #include #include -#include #include #include #define DM_MSG_PREFIX "space map disk" -/* - * Bitmap validator - */ -static void bitmap_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_bitmap_header *disk_header = dm_block_data(b); - - disk_header->blocknr = cpu_to_le64(dm_block_location(b)); - disk_header->csum = cpu_to_le32(dm_block_csum_data(&disk_header->not_used, block_size - sizeof(__le32))); -} - -static int bitmap_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_bitmap_header *disk_header = dm_block_data(b); - __le32 csum_disk; - - if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) { - DMERR("bitmap check failed blocknr %llu wanted %llu", - le64_to_cpu(disk_header->blocknr), dm_block_location(b)); - return -ENOTBLK; - } - - csum_disk = cpu_to_le32(dm_block_csum_data(&disk_header->not_used, block_size - sizeof(__le32))); - if (csum_disk != disk_header->csum) { - DMERR("bitmap check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum)); - return -EILSEQ; - } - - return 0; -} - -struct dm_block_validator dm_sm_bitmap_validator = { - .name = "sm_bitmap", - .prepare_for_write = bitmap_prepare_for_write, - .check = bitmap_check -}; - /*----------------------------------------------------------------*/ -#define ENTRIES_PER_WORD 32 -#define ENTRIES_SHIFT 5 - -void *dm_bitmap_data(struct dm_block *b) -{ - return dm_block_data(b) + sizeof(struct disk_bitmap_header); -} - -#define WORD_MASK_LOW 0x5555555555555555ULL -#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL -#define WORD_MASK_ALL 0xFFFFFFFFFFFFFFFFULL - -static unsigned bitmap_word_used(void *addr, unsigned b) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - - uint64_t bits = le64_to_cpu(*w_le); - - return ((bits & WORD_MASK_LOW) == WORD_MASK_LOW || - (bits & WORD_MASK_HIGH) == WORD_MASK_HIGH || - (bits & WORD_MASK_ALL) == WORD_MASK_ALL); -} - -unsigned sm_lookup_bitmap(void *addr, unsigned b) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - - b = (b & (ENTRIES_PER_WORD - 1)) << 1; - - return (!!test_bit_le(b, (void *) w_le) << 1) | - (!!test_bit_le(b + 1, (void *) w_le)); -} - -void sm_set_bitmap(void *addr, unsigned b, unsigned val) -{ - __le64 *words_le = addr; - __le64 *w_le = words_le + (b >> ENTRIES_SHIFT); - - b = (b & (ENTRIES_PER_WORD - 1)) << 1; - - if (val & 2) - __set_bit_le(b, (void *) w_le); - else - __clear_bit_le(b, (void *) w_le); - - if (val & 1) - __set_bit_le(b + 1, (void *) w_le); - else - __clear_bit_le(b + 1, (void *) w_le); -} - -int sm_find_free(void *addr, unsigned begin, unsigned end, - unsigned *result) -{ - while (begin < end) { - if (!(begin & (ENTRIES_PER_WORD - 1)) && - bitmap_word_used(addr, begin)) { - begin += ENTRIES_PER_WORD; - continue; - } - - if (!sm_lookup_bitmap(addr, begin)) { - *result = begin; - return 0; - } - - begin++; - } - - return -ENOSPC; -} - -static int disk_ll_init(struct ll_disk *io, struct dm_transaction_manager *tm) -{ - io->tm = tm; - io->bitmap_info.tm = tm; - io->bitmap_info.levels = 1; - - /* - * Because the new bitmap blocks are created via a shadow - * operation, the old entry has already had its reference count - * decremented and we don't need the btree to do any bookkeeping. 
- */ - io->bitmap_info.value_type.size = sizeof(struct disk_index_entry); - io->bitmap_info.value_type.inc = NULL; - io->bitmap_info.value_type.dec = NULL; - io->bitmap_info.value_type.equal = NULL; - - io->ref_count_info.tm = tm; - io->ref_count_info.levels = 1; - io->ref_count_info.value_type.size = sizeof(uint32_t); - io->ref_count_info.value_type.inc = NULL; - io->ref_count_info.value_type.dec = NULL; - io->ref_count_info.value_type.equal = NULL; - - io->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); - - if (io->block_size > (1 << 30)) { - DMERR("block size too big to hold bitmaps"); - return -EINVAL; - } - - io->entries_per_block = (io->block_size - sizeof(struct disk_bitmap_header)) * - ENTRIES_PER_BYTE; - io->nr_blocks = 0; - io->bitmap_root = 0; - io->ref_count_root = 0; - - return 0; -} - -static int disk_ll_new(struct ll_disk *io, struct dm_transaction_manager *tm) -{ - int r; - - r = disk_ll_init(io, tm); - if (r < 0) - return r; - - io->nr_blocks = 0; - io->nr_allocated = 0; - r = dm_btree_create(&io->bitmap_info, &io->bitmap_root); - if (r < 0) - return r; - - r = dm_btree_create(&io->ref_count_info, &io->ref_count_root); - if (r < 0) { - dm_btree_destroy(&io->bitmap_info, io->bitmap_root); - return r; - } - - return 0; -} - -static int disk_ll_extend(struct ll_disk *io, dm_block_t extra_blocks) -{ - int r; - dm_block_t i, nr_blocks; - unsigned old_blocks, blocks; - - nr_blocks = io->nr_blocks + extra_blocks; - old_blocks = dm_sector_div_up(io->nr_blocks, io->entries_per_block); - blocks = dm_sector_div_up(nr_blocks, io->entries_per_block); - - for (i = old_blocks; i < blocks; i++) { - struct dm_block *b; - struct disk_index_entry idx; - - r = dm_tm_new_block(io->tm, &dm_sm_bitmap_validator, &b); - if (r < 0) - return r; - idx.blocknr = cpu_to_le64(dm_block_location(b)); - - r = dm_tm_unlock(io->tm, b); - if (r < 0) - return r; - - idx.nr_free = cpu_to_le32(io->entries_per_block); - idx.none_free_before = 0; - __dm_bless_for_disk(&idx); - - r = dm_btree_insert(&io->bitmap_info, io->bitmap_root, - &i, &idx, &io->bitmap_root); - if (r < 0) - return r; - } - - io->nr_blocks = nr_blocks; - return 0; -} - -static int disk_ll_open(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct disk_sm_root *smr = root_le; - - if (len < sizeof(struct disk_sm_root)) { - DMERR("sm_disk root too small"); - return -ENOMEM; - } - - r = disk_ll_init(ll, tm); - if (r < 0) - return r; - - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); - - return 0; -} - -static int disk_ll_lookup_bitmap(struct ll_disk *io, dm_block_t b, uint32_t *result) -{ - int r; - dm_block_t index = b; - struct disk_index_entry ie_disk; - struct dm_block *blk; - - do_div(index, io->entries_per_block); - r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &index, &ie_disk); - if (r < 0) - return r; - - r = dm_tm_read_lock(io->tm, le64_to_cpu(ie_disk.blocknr), &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - *result = sm_lookup_bitmap(dm_bitmap_data(blk), do_div(b, io->entries_per_block)); - - return dm_tm_unlock(io->tm, blk); -} - -static int disk_ll_lookup(struct ll_disk *io, dm_block_t b, uint32_t *result) -{ - __le32 rc_le; - int r = disk_ll_lookup_bitmap(io, b, result); - - if (r) - return r; - - if (*result != 3) - return r; - - r = dm_btree_lookup(&io->ref_count_info, io->ref_count_root, &b, &rc_le); - 
if (r < 0) - return r; - - *result = le32_to_cpu(rc_le); - - return r; -} - -static int disk_ll_find_free_block(struct ll_disk *io, dm_block_t begin, - dm_block_t end, dm_block_t *result) -{ - int r; - struct disk_index_entry ie_disk; - dm_block_t i, index_begin = begin; - dm_block_t index_end = dm_sector_div_up(end, io->entries_per_block); - - begin = do_div(index_begin, io->entries_per_block); - - for (i = index_begin; i < index_end; i++, begin = 0) { - struct dm_block *blk; - unsigned position; - uint32_t bit_end; - - r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &i, &ie_disk); - if (r < 0) - return r; - - if (le32_to_cpu(ie_disk.nr_free) <= 0) - continue; - - r = dm_tm_read_lock(io->tm, le64_to_cpu(ie_disk.blocknr), - &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - bit_end = (i == index_end - 1) ? - do_div(end, io->entries_per_block) : io->entries_per_block; - - r = sm_find_free(dm_bitmap_data(blk), - max((unsigned)begin, (unsigned)le32_to_cpu(ie_disk.none_free_before)), - bit_end, &position); - if (r < 0) { - dm_tm_unlock(io->tm, blk); - continue; - } - - r = dm_tm_unlock(io->tm, blk); - if (r < 0) - return r; - - *result = i * io->entries_per_block + (dm_block_t) position; - - return 0; - } - - return -ENOSPC; -} - -static int disk_ll_insert(struct ll_disk *io, dm_block_t b, uint32_t ref_count) -{ - int r; - uint32_t bit, old; - struct dm_block *nb; - dm_block_t index = b; - struct disk_index_entry ie_disk; - void *bm_le; - int inc; - - do_div(index, io->entries_per_block); - r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &index, &ie_disk); - if (r < 0) - return r; - - r = dm_tm_shadow_block(io->tm, le64_to_cpu(ie_disk.blocknr), - &dm_sm_bitmap_validator, &nb, &inc); - if (r < 0) { - DMERR("dm_tm_shadow_block() failed"); - return r; - } - ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); - - bm_le = dm_bitmap_data(nb); - bit = do_div(b, io->entries_per_block); - old = sm_lookup_bitmap(bm_le, bit); - - if (ref_count <= 2) { - sm_set_bitmap(bm_le, bit, ref_count); - - if (old > 2) { - r = dm_btree_remove(&io->ref_count_info, io->ref_count_root, - &b, &io->ref_count_root); - if (r) { - dm_tm_unlock(io->tm, nb); - return r; - } - } - } else { - __le32 rc_le = cpu_to_le32(ref_count); - - __dm_bless_for_disk(&rc_le); - - sm_set_bitmap(bm_le, bit, 3); - r = dm_btree_insert(&io->ref_count_info, io->ref_count_root, - &b, &rc_le, &io->ref_count_root); - if (r < 0) { - dm_tm_unlock(io->tm, nb); - DMERR("ref count insert failed"); - return r; - } - } - - r = dm_tm_unlock(io->tm, nb); - if (r < 0) - return r; - - if (ref_count && !old) { - io->nr_allocated++; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1); - if (le32_to_cpu(ie_disk.none_free_before) == b) - ie_disk.none_free_before = cpu_to_le32(b + 1); - - } else if (old && !ref_count) { - io->nr_allocated--; - ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1); - ie_disk.none_free_before = cpu_to_le32(min((dm_block_t) le32_to_cpu(ie_disk.none_free_before), b)); - } - - __dm_bless_for_disk(&ie_disk); - - r = dm_btree_insert(&io->bitmap_info, io->bitmap_root, &index, &ie_disk, &io->bitmap_root); - if (r < 0) - return r; - - return 0; -} - -static int disk_ll_inc(struct ll_disk *ll, dm_block_t b) -{ - int r; - uint32_t rc; - - r = disk_ll_lookup(ll, b, &rc); - if (r) - return r; - - return disk_ll_insert(ll, b, rc + 1); -} - -static int disk_ll_dec(struct ll_disk *ll, dm_block_t b) -{ - int r; - uint32_t rc; - - r = disk_ll_lookup(ll, b, &rc); - if (r) - return r; - - if (!rc) - 
return -EINVAL; - - return disk_ll_insert(ll, b, rc - 1); -} - -/*--------------------------------------------------------------*/ - /* * Space map interface. */ @@ -462,6 +26,10 @@ struct sm_disk { struct dm_space_map sm; struct ll_disk ll; + struct ll_disk old_ll; + + dm_block_t begin; + dm_block_t nr_allocated_this_transaction; }; static void sm_disk_destroy(struct dm_space_map *sm) @@ -475,14 +43,13 @@ static int sm_disk_extend(struct dm_spac { struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return disk_ll_extend(&smd->ll, extra_blocks); + return sm_ll_extend(&smd->ll, extra_blocks); } static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) { struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - *count = smd->ll.nr_blocks; + *count = smd->old_ll.nr_blocks; return 0; } @@ -490,8 +57,7 @@ static int sm_disk_get_nr_blocks(struct static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count) { struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - *count = smd->ll.nr_blocks - smd->ll.nr_allocated; + *count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction; return 0; } @@ -500,8 +66,7 @@ static int sm_disk_get_count(struct dm_s uint32_t *result) { struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - - return disk_ll_lookup(&smd->ll, b, result); + return sm_ll_lookup(&smd->ll, b, result); } static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, @@ -520,42 +85,127 @@ static int sm_disk_count_is_more_than_on static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { + int r; + uint32_t old_count; + enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return disk_ll_insert(&smd->ll, b, count); + r = sm_ll_insert(&smd->ll, b, count, &ev); + if (!r) { + switch (ev) { + case SM_NONE: + break; + + case SM_ALLOC: + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. + */ + smd->nr_allocated_this_transaction++; + break; + + case SM_FREE: + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (r) + return r; + + if (!old_count) + smd->nr_allocated_this_transaction--; + break; + } + } + + return r; } static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) { + int r; + enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return disk_ll_inc(&smd->ll, b); + r = sm_ll_inc(&smd->ll, b, &ev); + if (!r && (ev == SM_ALLOC)) + /* + * This _must_ be free in the prior transaction + * otherwise we've lost atomicity. + */ + smd->nr_allocated_this_transaction++; + + return r; } static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { + int r; + uint32_t old_count; + enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return disk_ll_dec(&smd->ll, b); + r = sm_ll_dec(&smd->ll, b, &ev); + if (!r && (ev == SM_FREE)) { + /* + * It's only free if it's also free in the last + * transaction. + */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (r) + return r; + + if (!old_count) + smd->nr_allocated_this_transaction--; + } + + return r; } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) { int r; + enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - /* - * FIXME: We should start the search where we left off. 
- */ - r = disk_ll_find_free_block(&smd->ll, 0, smd->ll.nr_blocks, b); + /* FIXME: we should loop round a couple of times */ + r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); if (r) return r; - return disk_ll_inc(&smd->ll, *b); + smd->begin = *b + 1; + r = sm_ll_inc(&smd->ll, *b, &ev); + if (!r) { + BUG_ON(ev != SM_ALLOC); + smd->nr_allocated_this_transaction++; + } + + return r; } static int sm_disk_commit(struct dm_space_map *sm) { + int r; + dm_block_t nr_free; + struct sm_disk *smd = container_of(sm, struct sm_disk, sm); + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + + r = sm_ll_commit(&smd->ll); + if (r) + return r; + + memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; + + r = sm_disk_get_nr_free(sm, &nr_free); + if (r) + return r; + return 0; } @@ -602,8 +252,9 @@ static struct dm_space_map ops = { .copy_root = sm_disk_copy_root }; -struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, - dm_block_t nr_blocks) +static struct dm_space_map *dm_sm_disk_create_real( + struct dm_transaction_manager *tm, + dm_block_t nr_blocks) { int r; struct sm_disk *smd; @@ -612,13 +263,15 @@ struct dm_space_map *dm_sm_disk_create(s if (!smd) return ERR_PTR(-ENOMEM); + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; memcpy(&smd->sm, &ops, sizeof(smd->sm)); - r = disk_ll_new(&smd->ll, tm); + r = sm_ll_new_disk(&smd->ll, tm); if (r) goto bad; - r = disk_ll_extend(&smd->ll, nr_blocks); + r = sm_ll_extend(&smd->ll, nr_blocks); if (r) goto bad; @@ -632,10 +285,18 @@ bad: kfree(smd); return ERR_PTR(r); } + +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) +{ + struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); + return dm_sm_checker_create_fresh(sm); +} EXPORT_SYMBOL_GPL(dm_sm_disk_create); -struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, - void *root_le, size_t len) +static struct dm_space_map *dm_sm_disk_open_real( + struct dm_transaction_manager *tm, + void *root_le, size_t len) { int r; struct sm_disk *smd; @@ -644,9 +305,11 @@ struct dm_space_map *dm_sm_disk_open(str if (!smd) return ERR_PTR(-ENOMEM); + smd->begin = 0; + smd->nr_allocated_this_transaction = 0; memcpy(&smd->sm, &ops, sizeof(smd->sm)); - r = disk_ll_open(&smd->ll, tm, root_le, len); + r = sm_ll_open_disk(&smd->ll, tm, root_le, len); if (r) goto bad; @@ -660,4 +323,13 @@ bad: kfree(smd); return ERR_PTR(r); } + +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) +{ + return dm_sm_checker_create( + dm_sm_disk_open_real(tm, root_le, len)); +} EXPORT_SYMBOL_GPL(dm_sm_disk_open); + +/*----------------------------------------------------------------*/ Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-metadata.c =================================================================== --- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-metadata.c +++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-metadata.c @@ -10,7 +10,6 @@ #include #include -#include #include #define DM_MSG_PREFIX "space map metadata" @@ -18,384 +17,6 @@ /*----------------------------------------------------------------*/ /* - * Index validator. 
- */ -static void index_prepare_for_write(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_metadata_index *mi_le = dm_block_data(b); - - mi_le->blocknr = cpu_to_le64(dm_block_location(b)); - mi_le->csum = cpu_to_le32(dm_block_csum_data(&mi_le->padding, block_size - sizeof(__le32))); -} - -static int index_check(struct dm_block_validator *v, - struct dm_block *b, - size_t block_size) -{ - struct disk_metadata_index *mi_le = dm_block_data(b); - __le32 csum_disk; - - if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) { - DMERR("index_check failed blocknr %llu wanted %llu", - le64_to_cpu(mi_le->blocknr), dm_block_location(b)); - return -ENOTBLK; - } - - csum_disk = cpu_to_le32(dm_block_csum_data(&mi_le->padding, - block_size - sizeof(__le32))); - if (csum_disk != mi_le->csum) { - DMERR("index_check failed csum %u wanted %u", - le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum)); - return -EILSEQ; - } - - return 0; -} - -static struct dm_block_validator index_validator = { - .name = "index", - .prepare_for_write = index_prepare_for_write, - .check = index_check -}; - -/*----------------------------------------------------------------*/ - -/* - * Low-level disk ops. - */ -static int metadata_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) -{ - ll->tm = tm; - - ll->ref_count_info.tm = tm; - ll->ref_count_info.levels = 1; - ll->ref_count_info.value_type.size = sizeof(uint32_t); - ll->ref_count_info.value_type.inc = NULL; - ll->ref_count_info.value_type.dec = NULL; - ll->ref_count_info.value_type.equal = NULL; - - ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm)); - - if (ll->block_size > (1 << 30)) { - DMERR("block size too big to hold bitmaps"); - return -EINVAL; - } - - ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) * - ENTRIES_PER_BYTE; - ll->nr_blocks = 0; - ll->bitmap_root = 0; - ll->ref_count_root = 0; - - return 0; -} - -static int metadata_ll_new(struct ll_disk *ll, struct dm_transaction_manager *tm, - dm_block_t nr_blocks) -{ - int r; - dm_block_t i; - unsigned blocks; - struct dm_block *index_block; - - r = metadata_ll_init(ll, tm); - if (r < 0) - return r; - - ll->nr_blocks = nr_blocks; - ll->nr_allocated = 0; - - blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block); - if (blocks > MAX_METADATA_BITMAPS) { - DMERR("metadata device too large"); - return -EINVAL; - } - - for (i = 0; i < blocks; i++) { - struct dm_block *b; - struct disk_index_entry *idx_le = ll->mi_le.index + i; - - r = dm_tm_new_block(tm, &dm_sm_bitmap_validator, &b); - if (r < 0) - return r; - idx_le->blocknr = cpu_to_le64(dm_block_location(b)); - - r = dm_tm_unlock(tm, b); - if (r < 0) - return r; - - idx_le->nr_free = cpu_to_le32(ll->entries_per_block); - idx_le->none_free_before = 0; - } - - /* - * Write the index. 
- */ - r = dm_tm_new_block(tm, &index_validator, &index_block); - if (r) - return r; - - ll->bitmap_root = dm_block_location(index_block); - memcpy(dm_block_data(index_block), &ll->mi_le, sizeof(ll->mi_le)); - r = dm_tm_unlock(tm, index_block); - if (r) - return r; - - r = dm_btree_create(&ll->ref_count_info, &ll->ref_count_root); - if (r < 0) - return r; - - return 0; -} - -static int metadata_ll_open(struct ll_disk *ll, struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - int r; - struct disk_sm_root *smr = root_le; - struct dm_block *block; - - if (len < sizeof(struct disk_sm_root)) { - DMERR("sm_metadata root too small"); - return -ENOMEM; - } - - r = metadata_ll_init(ll, tm); - if (r < 0) - return r; - - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - - r = dm_tm_read_lock(tm, le64_to_cpu(smr->bitmap_root), - &index_validator, &block); - if (r) - return r; - - memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le)); - r = dm_tm_unlock(tm, block); - if (r) - return r; - - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); - return 0; -} - -static int metadata_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) -{ - int r; - dm_block_t index = b; - struct disk_index_entry *ie_disk; - struct dm_block *blk; - - b = do_div(index, ll->entries_per_block); - ie_disk = ll->mi_le.index + index; - - r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk->blocknr), - &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - *result = sm_lookup_bitmap(dm_bitmap_data(blk), b); - - return dm_tm_unlock(ll->tm, blk); -} - -static int metadata_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result) -{ - __le32 le_rc; - int r = metadata_ll_lookup_bitmap(ll, b, result); - - if (r) - return r; - - if (*result != 3) - return r; - - r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc); - if (r < 0) - return r; - - *result = le32_to_cpu(le_rc); - - return r; -} - -static int metadata_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, - dm_block_t end, dm_block_t *result) -{ - int r; - struct disk_index_entry *ie_disk; - dm_block_t i, index_begin = begin; - dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block); - - /* - * FIXME: Use shifts - */ - begin = do_div(index_begin, ll->entries_per_block); - end = do_div(end, ll->entries_per_block); - - for (i = index_begin; i < index_end; i++, begin = 0) { - struct dm_block *blk; - unsigned position; - uint32_t bit_end; - - ie_disk = ll->mi_le.index + i; - - if (le32_to_cpu(ie_disk->nr_free) <= 0) - continue; - - r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk->blocknr), - &dm_sm_bitmap_validator, &blk); - if (r < 0) - return r; - - bit_end = (i == index_end - 1) ? 
end : ll->entries_per_block; - - r = sm_find_free(dm_bitmap_data(blk), begin, bit_end, &position); - if (r < 0) { - dm_tm_unlock(ll->tm, blk); - /* - * Avoiding retry (FIXME: explain why) - */ - return r; - } - - r = dm_tm_unlock(ll->tm, blk); - if (r < 0) - return r; - - *result = i * ll->entries_per_block + (dm_block_t) position; - - return 0; - } - - return -ENOSPC; -} - -static int metadata_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count) -{ - int r; - uint32_t bit, old; - struct dm_block *nb; - dm_block_t index = b; - struct disk_index_entry *ie_disk; - void *bm_le; - int inc; - - bit = do_div(index, ll->entries_per_block); - ie_disk = ll->mi_le.index + index; - - r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk->blocknr), - &dm_sm_bitmap_validator, &nb, &inc); - if (r < 0) { - DMERR("dm_tm_shadow_block() failed"); - return r; - } - ie_disk->blocknr = cpu_to_le64(dm_block_location(nb)); - - bm_le = dm_bitmap_data(nb); - old = sm_lookup_bitmap(bm_le, bit); - - if (ref_count <= 2) { - sm_set_bitmap(bm_le, bit, ref_count); - - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) - return r; - - if (old > 2) { - r = dm_btree_remove(&ll->ref_count_info, - ll->ref_count_root, - &b, &ll->ref_count_root); - if (r) { - sm_set_bitmap(bm_le, bit, old); - return r; - } - } - } else { - __le32 le_rc = cpu_to_le32(ref_count); - - __dm_bless_for_disk(&le_rc); - - sm_set_bitmap(bm_le, bit, 3); - r = dm_tm_unlock(ll->tm, nb); - if (r < 0) { - __dm_unbless_for_disk(&le_rc); - return r; - } - - r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, - &b, &le_rc, &ll->ref_count_root); - if (r < 0) { - /* FIXME: release shadow? or assume the whole transaction will be ditched */ - DMERR("ref count insert failed"); - return r; - } - } - - if (ref_count && !old) { - ll->nr_allocated++; - ie_disk->nr_free = cpu_to_le32(le32_to_cpu(ie_disk->nr_free) - 1); - if (le32_to_cpu(ie_disk->none_free_before) == b) - ie_disk->none_free_before = cpu_to_le32(b + 1); - } else if (old && !ref_count) { - ll->nr_allocated--; - ie_disk->nr_free = cpu_to_le32(le32_to_cpu(ie_disk->nr_free) + 1); - ie_disk->none_free_before = cpu_to_le32(min((dm_block_t) le32_to_cpu(ie_disk->none_free_before), b)); - } - - return 0; -} - -static int metadata_ll_inc(struct ll_disk *ll, dm_block_t b) -{ - int r; - uint32_t rc; - - r = metadata_ll_lookup(ll, b, &rc); - if (r) - return r; - - return metadata_ll_insert(ll, b, rc + 1); -} - -static int metadata_ll_dec(struct ll_disk *ll, dm_block_t b) -{ - int r; - uint32_t rc; - - r = metadata_ll_lookup(ll, b, &rc); - if (r) - return r; - - if (!rc) - return -EINVAL; - - return metadata_ll_insert(ll, b, rc - 1); -} - -static int metadata_ll_commit(struct ll_disk *ll) -{ - int r, inc; - struct dm_block *b; - - r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc); - if (r) - return r; - - memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); - ll->bitmap_root = dm_block_location(b); - - return dm_tm_unlock(ll->tm, b); -} - -/*----------------------------------------------------------------*/ - -/* * Space map interface. 
  *
  * The low level disk format is written using the standard btree and
@@ -454,14 +75,15 @@ static int add_bop(struct sm_metadata *s
 static int commit_bop(struct sm_metadata *smm, struct block_op *op)
 {
 	int r = 0;
+	enum allocation_event ev;
 
 	switch (op->type) {
 	case BOP_INC:
-		r = metadata_ll_inc(&smm->ll, op->block);
+		r = sm_ll_inc(&smm->ll, op->block, &ev);
 		break;
 
 	case BOP_DEC:
-		r = metadata_ll_dec(&smm->ll, op->block);
+		r = sm_ll_dec(&smm->ll, op->block, &ev);
 		break;
 	}
 
@@ -575,7 +197,7 @@ static int sm_metadata_get_count(struct
 		}
 	}
 
-	r = metadata_ll_lookup(&smm->ll, b, result);
+	r = sm_ll_lookup(&smm->ll, b, result);
 	if (r)
 		return r;
 
@@ -617,7 +239,7 @@ static int sm_metadata_count_is_more_tha
 		return 0;
 	}
 
-	r = metadata_ll_lookup_bitmap(&smm->ll, b, &rc);
+	r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
 	if (r)
 		return r;
 
@@ -636,6 +258,7 @@ static int sm_metadata_set_count(struct
 			       uint32_t count)
 {
 	int r, r2;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (smm->recursion_count) {
@@ -644,7 +267,7 @@ static int sm_metadata_set_count(struct
 	}
 
 	in(smm);
-	r = metadata_ll_insert(&smm->ll, b, count);
+	r = sm_ll_insert(&smm->ll, b, count, &ev);
 	r2 = out(smm);
 
 	return combine_errors(r, r2);
@@ -653,13 +276,14 @@ static int sm_metadata_set_count(struct
 static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (recursing(smm))
 		r = add_bop(smm, BOP_INC, b);
 	else {
 		in(smm);
-		r = metadata_ll_inc(&smm->ll, b);
+		r = sm_ll_inc(&smm->ll, b, &ev);
 		r2 = out(smm);
 	}
 
@@ -669,25 +293,27 @@ static int sm_metadata_inc_block(struct
 static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (recursing(smm))
 		r = add_bop(smm, BOP_DEC, b);
 	else {
 		in(smm);
-		r = metadata_ll_dec(&smm->ll, b);
+		r = sm_ll_dec(&smm->ll, b, &ev);
 		r2 = out(smm);
 	}
 
 	return combine_errors(r, r2);
 }
 
-static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
+static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	r = metadata_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+	r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
 	if (r)
 		return r;
 
@@ -697,7 +323,7 @@ static int sm_metadata_new_block(struct
 		r = add_bop(smm, BOP_INC, *b);
 	else {
 		in(smm);
-		r = metadata_ll_inc(&smm->ll, *b);
+		r = sm_ll_inc(&smm->ll, *b, &ev);
 		r2 = out(smm);
 	}
 
@@ -707,14 +333,20 @@ static int sm_metadata_new_block(struct
 	return combine_errors(r, r2);
 }
 
+static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+	int r = sm_metadata_new_block_(sm, b);
+	if (r)
+		DMERR("out of metadata space");
+	return r;
+}
+
 static int sm_metadata_commit(struct dm_space_map *sm)
 {
 	int r;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
-
-	r = metadata_ll_commit(&smm->ll);
+	r = sm_ll_commit(&smm->ll);
 	if (r)
 		return r;
 
@@ -910,6 +542,7 @@ int dm_sm_metadata_create(struct dm_spac
 {
 	int r;
 	dm_block_t i;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	smm->begin = superblock + 1;
@@ -918,9 +551,15 @@ int dm_sm_metadata_create(struct dm_spac
 	smm->nr_uncommitted = 0;
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
-	r = metadata_ll_new(&smm->ll, tm, nr_blocks);
+
+	r = sm_ll_new_metadata(&smm->ll, tm);
 	if (r)
 		return r;
+
+	r = sm_ll_extend(&smm->ll, nr_blocks);
+	if (r)
+		return r;
+
 	memcpy(&smm->sm, &ops, sizeof(smm->sm));
 
 	/*
@@ -928,7 +567,7 @@ int dm_sm_metadata_create(struct dm_spac
 	 * allocated blocks that they were built from.
 	 */
 	for (i = superblock; !r && i < smm->begin; i++)
-		r = metadata_ll_inc(&smm->ll, i);
+		r = sm_ll_inc(&smm->ll, i, &ev);
 
 	if (r)
 		return r;
@@ -943,7 +582,7 @@ int dm_sm_metadata_open(struct dm_space_
 	int r;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	r = metadata_ll_open(&smm->ll, tm, root_le, len);
+	r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
 	if (r)
 		return r;
 
@@ -952,5 +591,6 @@ int dm_sm_metadata_open(struct dm_space_
 	smm->allocated_this_transaction = 0;
 	smm->nr_uncommitted = 0;
 
-	return sm_metadata_commit(sm);
+	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
+	return 0;
 }
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map.h
@@ -16,9 +16,25 @@ struct dm_space_map {
 	void (*destroy)(struct dm_space_map *sm);
 
+	/*
+	 * You must commit before allocating the newly added space.
+	 */
 	int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
 
+	/*
+	 * Extensions do not appear in this count until after commit has
+	 * been called.
+	 */
 	int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
+
+	/*
+	 * Space maps must never allocate a block from the previous
+	 * transaction, in case we need to roll back.  This complicates the
+	 * semantics of get_nr_free(); it should return the number of blocks
+	 * that are available for allocation _now_.  For instance, you may
+	 * have blocks with a zero reference count that will not be
+	 * available for allocation until after the next commit.
+	 */
 	int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
 
 	int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
 
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-transaction-manager.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-transaction-manager.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-transaction-manager.c
@@ -5,6 +5,7 @@
  */
 #include "dm-transaction-manager.h"
 #include "dm-space-map.h"
+#include "dm-space-map-checker.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map-metadata.h"
 #include "dm-persistent-data-internal.h"
@@ -49,13 +50,11 @@ static int is_shadow(struct dm_transacti
 	struct hlist_node *n;
 
 	spin_lock(&tm->lock);
-
 	hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
 		if (si->where == b) {
 			r = 1;
 			break;
 		}
-
 	spin_unlock(&tm->lock);
 
 	return r;
@@ -74,7 +73,6 @@ static void insert_shadow(struct dm_tran
 	if (si) {
 		si->where = b;
 		bucket = dm_hash_block(b, HASH_MASK);
-
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -96,6 +94,7 @@ static void wipe_shadow_table(struct dm_
 			INIT_HLIST_HEAD(bucket);
 	}
+
 	spin_unlock(&tm->lock);
 }
@@ -200,77 +199,52 @@ int dm_tm_new_block(struct dm_transactio
 
 static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 			  struct dm_block_validator *v,
-			  struct dm_block **result, int *inc_children)
+			  struct dm_block **result)
 {
 	int r;
 	dm_block_t new;
-	uint32_t count;
 	struct dm_block *orig_block;
 
 	r = dm_sm_new_block(tm->sm, &new);
 	if (r < 0)
 		return r;
 
-	r = dm_bm_write_lock_zero(tm->bm, new, v, result);
+	r = dm_sm_dec_block(tm->sm, orig);
 	if (r < 0)
-		goto bad_dec_block;
+		return r;
 
 	r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
 	if (r < 0)
-		goto bad_dec_block;
-
-	memcpy(dm_block_data(*result), dm_block_data(orig_block),
-	       dm_bm_block_size(tm->bm));
-
-	r = dm_bm_unlock(orig_block);
-	if (r < 0)
-		goto bad_dec_block;
-
-	r = dm_sm_get_count(tm->sm, orig, &count);
-	if (r < 0)
-		goto bad;
-
-	r = dm_sm_dec_block(tm->sm, orig);
-	if (r < 0)
-		goto bad;
-
-	*inc_children = count > 1;
-
-	return 0;
+		return r;
 
-bad:
-	dm_bm_unlock(*result);
-bad_dec_block:
-	dm_sm_dec_block(tm->sm, new);
+	r = dm_bm_unlock_move(orig_block, new);
+	if (r < 0) {
+		dm_bm_unlock(orig_block);
+		return r;
+	}
 
-	return r;
+	return dm_bm_write_lock(tm->bm, new, v, result);
 }
 
 int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 		       struct dm_block_validator *v, struct dm_block **result,
 		       int *inc_children)
 {
-	int r, more_than_one;
+	int r;
 
 	if (tm->is_clone)
 		return -EWOULDBLOCK;
 
-	if (is_shadow(tm, orig)) {
-		r = dm_sm_count_is_more_than_one(tm->sm, orig, &more_than_one);
-		if (r < 0)
-			return r;
-
-		if (!more_than_one) {
-			*inc_children = 0;
-			return dm_bm_write_lock(tm->bm, orig, v, result);
-		}
-
-		/* fall through */
-	}
-
-	r = __shadow_block(tm, orig, v, result, inc_children);
+	r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
 	if (r < 0)
 		return r;
 
+	if (is_shadow(tm, orig) && !*inc_children)
+		return dm_bm_write_lock(tm->bm, orig, v, result);
+
+	r = __shadow_block(tm, orig, v, result);
+	if (r < 0)
+		return r;
 	insert_shadow(tm, dm_block_location(*result));
 
 	return r;
@@ -312,6 +286,7 @@ void dm_tm_dec(struct dm_transaction_man
 	dm_sm_dec_block(tm->sm, b);
 }
+EXPORT_SYMBOL_GPL(dm_tm_dec);
 
 int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
 	      uint32_t *result)
@@ -339,14 +314,15 @@ static int dm_tm_create_internal(struct
 				 int create)
 {
 	int r;
+	struct dm_space_map *inner;
 
-	*sm = dm_sm_metadata_init();
-	if (IS_ERR(*sm))
-		return PTR_ERR(*sm);
+	inner = dm_sm_metadata_init();
+	if (IS_ERR(inner))
+		return PTR_ERR(inner);
 
-	*tm = dm_tm_create(bm, *sm);
+	*tm = dm_tm_create(bm, inner);
 	if (IS_ERR(*tm)) {
-		dm_sm_destroy(*sm);
+		dm_sm_destroy(inner);
 		return PTR_ERR(*tm);
 	}
 
@@ -358,13 +334,17 @@ static int dm_tm_create_internal(struct
 			goto bad1;
 		}
 
-		r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
+		r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
 					  sb_location);
 		if (r) {
 			DMERR("couldn't create metadata space map");
 			goto bad2;
 		}
 
+		*sm = dm_sm_checker_create(inner);
+		if (!*sm)
+			goto bad2;
+
 	} else {
 		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
 				     sb_validator, sblock);
@@ -373,13 +353,17 @@ static int dm_tm_create_internal(struct
 			goto bad1;
 		}
 
-		r = dm_sm_metadata_open(*sm, *tm,
+		r = dm_sm_metadata_open(inner, *tm,
 					dm_block_data(*sblock) + root_offset,
 					root_max_len);
-		if (IS_ERR(*sm)) {
+		if (r) {
 			DMERR("couldn't open metadata space map");
 			goto bad2;
 		}
+
+		*sm = dm_sm_checker_create(inner);
+		if (!*sm)
+			goto bad2;
 	}
 
 	return 0;
@@ -388,7 +372,7 @@ bad2:
 	dm_tm_unlock(*tm, *sblock);
 bad1:
 	dm_tm_destroy(*tm);
-	dm_sm_destroy(*sm);
+	dm_sm_destroy(inner);
 
 	return r;
 }
@@ -412,3 +396,5 @@ int dm_tm_open_with_sm(struct dm_block_m
 				      root_max_len, tm, sm, sblock, 0);
 }
 EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
+
+/*----------------------------------------------------------------*/
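
Two short usage sketches follow. They are illustrative only and not part of the patch itself; any helper names they introduce are invented for the illustration.

First, the new extend()/get_nr_blocks()/get_nr_free() comments in dm-space-map.h describe a two-phase discipline: space added with dm_sm_extend() only becomes allocatable after a commit, and dm_sm_get_nr_free() reports what is allocatable right now, not what will be free after the next commit. A minimal caller sketch, assuming the dm_sm_* inline wrappers from dm-space-map.h; grow_then_allocate() is an invented name:

#include "dm-space-map.h"

#include <linux/errno.h>

/*
 * Illustrative sketch only (not part of this patch): grow a space map,
 * then allocate a block from the new area.  Per the comments in
 * dm-space-map.h, blocks added by dm_sm_extend() are not allocatable
 * until after a commit, and dm_sm_get_nr_free() reports only what is
 * allocatable right now.
 */
static int grow_then_allocate(struct dm_space_map *sm,
			      dm_block_t extra_blocks, dm_block_t *result)
{
	int r;
	dm_block_t nr_free;

	r = dm_sm_extend(sm, extra_blocks);
	if (r)
		return r;

	/* The extension is not yet visible to the allocator... */
	r = dm_sm_commit(sm);
	if (r)
		return r;

	/* ...but after the commit it is. */
	r = dm_sm_get_nr_free(sm, &nr_free);
	if (r)
		return r;

	return nr_free ? dm_sm_new_block(sm, result) : -ENOSPC;
}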
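
Second, the reworked dm_tm_shadow_block() now performs the reference-count test before the is_shadow() check, so *inc_children is set exactly when the original block was shared and the caller must bump the reference counts of the blocks it points to. A sketch of the resulting copy-on-write pattern; inc_children_of() is an invented, structure-specific helper, since only the caller knows the block's on-disk layout:

#include "dm-transaction-manager.h"
#include "dm-block-manager.h"

/*
 * Hypothetical helper: walk the block's payload and increment the
 * reference counts of any child blocks it references.
 */
static void inc_children_of(struct dm_transaction_manager *tm, void *data);

/*
 * Illustrative sketch only: typical copy-on-write use of
 * dm_tm_shadow_block().
 */
static int shadow_and_edit(struct dm_transaction_manager *tm,
			   dm_block_t orig, struct dm_block_validator *v,
			   struct dm_block **shadow)
{
	int r, inc_children;

	r = dm_tm_shadow_block(tm, orig, v, shadow, &inc_children);
	if (r)
		return r;

	/*
	 * A shared original means the shadow now references the same
	 * children as the block it was copied from.
	 */
	if (inc_children)
		inc_children_of(tm, dm_block_data(*shadow));

	/* ...modify dm_block_data(*shadow) under the write lock... */

	return 0;	/* caller releases with dm_tm_unlock(tm, *shadow) */
}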