An implementation of a snapshot store. Designed by Daniel Phillips, ported to
kernel space by Fujita Tomonori. This implementation plugs into the
dm-multisnapshot interface as a module. The interface is almost the same as in
Mikulas' exception store; you have to specify "daniel" instead of "mikulas" as
the exception store type.

The only supported chunk size is 16384 bytes (but the code is general enough
that this restriction may be removed in the future).

Signed-off-by: Mikulas Patocka

---
 drivers/md/Kconfig               |   14 
 drivers/md/Makefile              |    2 
 drivers/md/dm-multisnap-daniel.c | 1699 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 1715 insertions(+)

Index: linux-2.6.32-rc8-devel/drivers/md/dm-multisnap-daniel.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.32-rc8-devel/drivers/md/dm-multisnap-daniel.c	2009-11-26 17:07:11.000000000 +0100
@@ -0,0 +1,1699 @@
+/*
+ * dm-exception-store.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ * Copyright (C) 2006 Red Hat GmbH
+ *
+ * The shared exception code is based on Zumastor (http://zumastor.org/)
+ *
+ * By: Daniel Phillips, Nov 2003 to Mar 2007
+ * (c) 2003, Sistina Software Inc.
+ * (c) 2004, Red Hat Software Inc.
+ * (c) 2005 Daniel Phillips
+ * (c) 2006 - 2007, Google Inc
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */
+
+#define MAX_SNAPSHOTS 64
+
+#define MAX_CHUNK_BUFFERS 128
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots, by persistent we mean that the snapshot
+ * will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device. The snapshot code
+ * requires that we copy exception chunks to chunk aligned areas
+ * of the COW store. It makes sense therefore, to store the
+ * metadata in chunk size blocks.
+ *
+ * There is no backward or forward compatibility implemented,
+ * snapshots with different disk versions than the kernel will
+ * not be usable. It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format. The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
+ */
+#define SNAP_MAGIC 0x70416e53
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid. There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version. no backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+
+	/*
+	 * for shared exception
+	 */
+	__le64 root_tree_chunk;
+	__le64 snapmask; /* Mikulas: don't mix bit_test/set w.r.t.
direct reading, it's unportable */ + __le32 tree_level; + __le32 h_nr_journal_chunks; +}; + +struct disk_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct dm_exception_store { + struct dm_multisnap *dm; + + void *area; + + struct dm_io_client *io_client; + + unsigned chunk_size; + unsigned char chunk_shift; + + int version; + + /* + * for shared exception + */ + u64 root_tree_chunk; + u64 snapmask; + u32 tree_level; + + u32 nr_snapshots; + + chunk_t nr_chunks; + chunk_t nr_bitmap_chunks; + chunk_t nr_journal_chunks; + unsigned long *bitmap; + chunk_t cur_bitmap_chunk; + chunk_t cur_bitmap_index; + chunk_t cur_journal_chunk; + + struct list_head chunk_buffer_list; + struct list_head chunk_buffer_dirty_list; + + int header_dirty; + int nr_chunk_buffers; + + chunk_t chunk_to_add; +}; + +static unsigned sectors_to_pages(unsigned sectors) +{ + return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); +} + +static int alloc_area(struct dm_exception_store *ps) +{ + int r = -ENOMEM; + size_t len; + + len = ps->chunk_size; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + return r; + + return 0; +} + +static void free_area(struct dm_exception_store *ps) +{ + vfree(ps->area); + ps->area = NULL; +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct dm_exception_store *ps, chunk_t chunk, int rw, void *data) +{ + struct dm_io_region where = { + .bdev = dm_multisnap_snapshot_bdev(ps->dm), + .sector = (ps->chunk_size >> SECTOR_SHIFT) * chunk, + .count = (ps->chunk_size >> SECTOR_SHIFT), + }; + struct dm_io_request io_req = { + .bi_rw = rw, + .mem.type = DM_IO_VMA, + .mem.ptr.vma = data, + .client = ps->io_client, + .notify.fn = NULL, + }; + + int r = dm_io(&io_req, 1, &where, NULL); + if (r) { + DMERR("io error when %s %llx: %d", rw == WRITE ? "writing" : "reading", (unsigned long long)chunk, r); + dm_multisnap_set_error(ps->dm, r); + } + return r; +} + +static int write_header(struct dm_exception_store *ps) +{ + struct disk_header *dh; + + memset(ps->area, 0, ps->chunk_size); + + dh = (struct disk_header *) ps->area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(dm_multisnap_drop_on_error(ps->dm) && dm_multisnap_has_error(ps->dm)); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->chunk_size >> SECTOR_SHIFT); + + dh->root_tree_chunk = cpu_to_le64(ps->root_tree_chunk); + dh->snapmask = cpu_to_le64(ps->snapmask); + dh->tree_level = cpu_to_le32(ps->tree_level); + dh->h_nr_journal_chunks = cpu_to_le32(ps->nr_journal_chunks); + + ps->header_dirty = 0; + + return chunk_io(ps, 0, WRITE, ps->area); +} + +/* + * shared exception code + */ + +#define SNAP_MAGIC 0x70416e53 +#define FIRST_BITMAP_CHUNK 1 + +struct chunk_buffer { + struct list_head list; + struct list_head dirty_list; + u64 chunk; + void *data; +}; + +struct node { + __le32 count; + __le32 unused; + struct index_entry { + __le64 key; /* note: entries[0].key never accessed */ + __le64 chunk; /* node sector address goes here */ + } entries[]; +}; + +struct leaf { + __le16 magic; + __le16 version; + __le32 count; + /* !!! 
FIXME the code doesn't use the base_chunk properly */ + __le64 base_chunk; + __le64 using_mask; + + struct tree_map { + __le32 offset; + __le32 rchunk; + } map[]; +}; + +struct exception { + __le64 share; + __le64 chunk; +}; + +static inline struct node *buffer2node(struct chunk_buffer *buffer) +{ + return (struct node *)buffer->data; +} + +static inline struct leaf *buffer2leaf(struct chunk_buffer *buffer) +{ + return (struct leaf *)buffer->data; +} + +static struct chunk_buffer *alloc_chunk_buffer(struct dm_exception_store *ps) +{ + struct chunk_buffer *b; + + /* Mikulas: changed to GFP_NOIO */ + b = kzalloc(sizeof(*b), GFP_NOIO); + if (!b) { + DMERR("%s %d: out of memory", __func__, __LINE__); + dm_multisnap_set_error(ps->dm, -ENOMEM); + return NULL; + } + + /* Mikulas: must use GFP_NOIO; vmalloc without it may deadlock */ + b->data = __vmalloc(ps->chunk_size, GFP_NOIO | __GFP_HIGHMEM, PAGE_KERNEL); + if (!b->data) { + DMERR("%s %d: out of memory", __func__, __LINE__); + kfree(b); + dm_multisnap_set_error(ps->dm, -ENOMEM); + return NULL; + } + + memset(b->data, 0, ps->chunk_size); + + list_add(&b->list, &ps->chunk_buffer_list); + INIT_LIST_HEAD(&b->dirty_list); + + ps->nr_chunk_buffers++; + + return b; +} + +static void free_chunk_buffer(struct dm_exception_store *ps, struct chunk_buffer *b) +{ + list_del(&b->list); + vfree(b->data); + kfree(b); + + ps->nr_chunk_buffers--; +} + +static int read_new_bitmap_chunk(struct dm_exception_store *ps) +{ + chunk_io(ps, ps->cur_bitmap_chunk, WRITE, ps->bitmap); + + ps->cur_bitmap_chunk++; + if (ps->cur_bitmap_chunk == ps->nr_bitmap_chunks + FIRST_BITMAP_CHUNK /* --- I couldn't get it working without this. I wonder how it ever worked? --- Mikulas */) + ps->cur_bitmap_chunk = FIRST_BITMAP_CHUNK; + + chunk_io(ps, ps->cur_bitmap_chunk, READ, ps->bitmap); + + return 0; +} + +static chunk_t shared_allocate_chunk(struct dm_exception_store *ps) +{ + /* !!! FIXME: replace multiply/divide/modulo with bit shifts */ + unsigned idx; + unsigned limit; + chunk_t start_chunk; + unsigned nr_bits = ps->chunk_shift + 3; + + start_chunk = ps->cur_bitmap_chunk; +again: + if (ps->cur_bitmap_chunk == ps->nr_bitmap_chunks && ps->nr_chunks & ((1 << nr_bits) - 1)) + limit = ps->nr_chunks & ((1 << nr_bits) - 1); + else + limit = 1 << nr_bits; + + idx = ext2_find_next_zero_bit(ps->bitmap, limit, ps->cur_bitmap_index); + if (idx < limit) { + ext2_set_bit(idx, ps->bitmap); + + if (idx == limit - 1) { + ps->cur_bitmap_index = 0; + + read_new_bitmap_chunk(ps); + } else + ps->cur_bitmap_index++; + } else { + /* chunk_io(ps, ps->cur_bitmap_chunk, WRITE, ps->bitmap); don't write it twice -- Mikulas */ + + read_new_bitmap_chunk(ps); + + /* todo: check # free chunks */ + if (start_chunk == ps->cur_bitmap_chunk) { + DMERR("%s %d: fail to find a new chunk", + __func__, __LINE__); + dm_multisnap_set_error(ps->dm, -ENOSPC); + return 0; + } + + goto again; + } + + return idx + ((ps->cur_bitmap_chunk - FIRST_BITMAP_CHUNK) << nr_bits); +} + +static int shared_free_chunk(struct dm_exception_store *ps, chunk_t chunk) +{ + unsigned bits_per_chunk_shift = ps->chunk_shift + 3; + unsigned idx = chunk & ((1 << bits_per_chunk_shift) - 1); + + /* we don't always need to do this... 
*/ + chunk_io(ps, ps->cur_bitmap_chunk, WRITE, ps->bitmap); + + ps->cur_bitmap_chunk = (chunk >> bits_per_chunk_shift) + FIRST_BITMAP_CHUNK; + + chunk_io(ps, ps->cur_bitmap_chunk, READ, ps->bitmap); + + if (!ext2_test_bit(idx, ps->bitmap)) { + DMERR("%s: trying to free free block %lld %lld %u", __func__, + (unsigned long long)chunk, (unsigned long long)ps->cur_bitmap_chunk, idx); + dm_multisnap_set_error(ps->dm, -EFSERROR); + } + + ext2_clear_bit(idx, ps->bitmap); + + chunk_io(ps, ps->cur_bitmap_chunk, WRITE, ps->bitmap); + + DMINFO("%s %d: free a chunk, %llu", __func__, __LINE__, + (unsigned long long)chunk); + + return 0; +} + +static void init_leaf(struct dm_exception_store *ps, struct leaf *leaf) +{ + leaf->magic = cpu_to_le16(0x1eaf); + leaf->version = 0; + leaf->base_chunk = 0; + leaf->count = 0; + leaf->map[0].offset = cpu_to_le32(ps->chunk_size); +} + +static struct chunk_buffer *new_btree_obj(struct dm_exception_store *ps) +{ + u64 chunk; + struct chunk_buffer *b; + + b = alloc_chunk_buffer(ps); + if (!b) + return NULL; + + chunk = shared_allocate_chunk(ps); + if (!chunk) { + free_chunk_buffer(ps, b); + return NULL; + } + + b->chunk = chunk; + + return b; +} + +static int shared_create_bitmap(struct dm_exception_store *ps) +{ + int i, r, rest, this; + chunk_t chunk; + + /* bitmap + superblock */ + rest = 1 + ps->nr_bitmap_chunks + ps->nr_journal_chunks; + + for (chunk = 0; chunk < ps->nr_bitmap_chunks; chunk++) { + memset(ps->area, 0, ps->chunk_size); + + this = min_t(int, rest, ps->chunk_size * 8); + if (this) { + rest -= this; + + memset(ps->area, 0xff, this / 8); + + for (i = 0; i < this % 8; i++) { + /* Mikulas: this produces unaligned accesses + char *p = ps->area + (this / 8); + ext2_set_bit(i, (unsigned long *)p);*/ + ext2_set_bit(this - this % 8 + i, (unsigned long *)ps->area); + } + } + + r = chunk_io(ps, chunk + FIRST_BITMAP_CHUNK, WRITE, + ps->area); + if (r) + return r; + } + + return 0; +} + +static struct chunk_buffer *new_leaf(struct dm_exception_store *ps) +{ + struct chunk_buffer *cb; + + cb = new_btree_obj(ps); + if (cb) + init_leaf(ps, cb->data); + + return cb; +} + +static struct chunk_buffer *new_node(struct dm_exception_store *ps) +{ + return new_btree_obj(ps); +} + +static int shared_create_btree(struct dm_exception_store *ps) +{ + struct chunk_buffer *l, *n; + int r; + + r = chunk_io(ps, ps->cur_bitmap_chunk, READ, ps->bitmap); + if (r) + return r; + + l = new_btree_obj(ps); + if (!l) + return -ENOMEM; + init_leaf(ps, l->data); + + n = new_btree_obj(ps); + if (!n) + return -ENOMEM; + + buffer2node(n)->count = cpu_to_le32(1); + buffer2node(n)->entries[0].chunk = cpu_to_le64(l->chunk); + + chunk_io(ps, l->chunk, WRITE, l->data); + chunk_io(ps, n->chunk, WRITE, n->data); + + ps->root_tree_chunk = n->chunk; + ps->snapmask = 0; + ps->tree_level = 1; + + return 0; +} + +static int shared_create_header(struct dm_exception_store *ps) +{ + int r; + + /* 128MB by default, should be configurable. 
*/ + ps->nr_journal_chunks = (128 * 1024 * 1024) / ps->chunk_size; + + /* Mikulas: if the device is smaller than 128MB, we need something smaller */ + if (ps->nr_journal_chunks > ps->nr_chunks / 8) + ps->nr_journal_chunks = ps->nr_chunks / 8; + + r = shared_create_bitmap(ps); + if (r) + return r; + + r = shared_create_btree(ps); + if (r) + return r; + + r = write_header(ps); + if (r) + return r; + + return r; +} + +static int shared_read_header(struct dm_exception_store *ps, int *new_snapshot) +{ + struct disk_header *dh; + int r; + + ps->io_client = dm_io_client_create(sectors_to_pages(ps->chunk_size >> SECTOR_SHIFT)); + if (IS_ERR(ps->io_client)) { + r = PTR_ERR(ps->io_client); + goto fail_io_client; + } + + ps->bitmap = vmalloc(ps->chunk_size); + if (!ps->bitmap) { + r = -ENOMEM; + goto fail_bitmap; + } + + r = alloc_area(ps); + if (r) + goto fail_alloc_area; + + + r = chunk_io(ps, 0, READ, ps->area); + if (r) + goto fail_to_read_header; + + dh = (struct disk_header *) ps->area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + return 0; + } + + if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { + DMWARN("Invalid or corrupt snapshot"); + r = -ENXIO; + goto fail_to_read_header; + } + + *new_snapshot = 0; + if (le32_to_cpu(dh->valid)) + dm_multisnap_set_error(ps->dm, -EINVAL); + ps->version = le32_to_cpu(dh->version); + + ps->root_tree_chunk = le64_to_cpu(dh->root_tree_chunk); + ps->snapmask = le64_to_cpu(dh->snapmask); + ps->tree_level = le32_to_cpu(dh->tree_level); + ps->nr_journal_chunks = le32_to_cpu(dh->h_nr_journal_chunks); + + if (ps->chunk_size >> SECTOR_SHIFT != le32_to_cpu(dh->chunk_size)) { + DMWARN("Invalid chunk size"); + r = -ENXIO; + goto fail_to_read_header; + } + + return 0; + +fail_to_read_header: + free_area(ps); +fail_alloc_area: + vfree(ps->bitmap); + ps->bitmap = NULL; +fail_bitmap: + dm_io_client_destroy(ps->io_client); +fail_io_client: + return r; +} + +static void shared_drop_header(struct dm_exception_store *ps) +{ + free_area(ps); + vfree(ps->bitmap); + ps->bitmap = NULL; + dm_io_client_destroy(ps->io_client); +} + +static int shared_read_metadata(struct dm_exception_store *ps, char **error) +{ + int i, r, uninitialized_var(new_snapshot); + size_t size = i_size_read(dm_multisnap_snapshot_bdev(ps->dm)->bd_inode); + chunk_t bitmap_chunk_bytes; + unsigned chunk_bytes = ps->chunk_size; + + ps->cur_bitmap_chunk = FIRST_BITMAP_CHUNK; + ps->cur_bitmap_index = 0; + ps->nr_chunks = size >> ps->chunk_shift; + + INIT_LIST_HEAD(&ps->chunk_buffer_list); + INIT_LIST_HEAD(&ps->chunk_buffer_dirty_list); + ps->nr_chunk_buffers = 0; + + bitmap_chunk_bytes = DIV_ROUND_UP(ps->nr_chunks, 8); + ps->nr_bitmap_chunks = (bitmap_chunk_bytes + chunk_bytes - 1) >> ps->chunk_shift; + + r = shared_read_header(ps, &new_snapshot); + if (r) { + *error = "Failed to read header"; + return r; + } + + if (new_snapshot) + DMINFO("%s %d: creates a new cow device", __func__, __LINE__); + else + DMINFO("%s %d: loads the cow device", __func__, __LINE__); + + if (new_snapshot) { + r = shared_create_header(ps); + if (r) { + *error = "Failed to create header"; + goto bad_header; + } + } else { + r = chunk_io(ps, ps->cur_bitmap_chunk, READ, ps->bitmap); + if (r) { + *error = "Failed to read bitmap"; + goto bad_header; + } + } + + for (i = 0; i < MAX_SNAPSHOTS; i++) + if (ps->snapmask & 1LL << i) + ps->nr_snapshots++; + + return 0; + +bad_header: + shared_drop_header(ps); + return r; +} + +struct etree_path { + struct chunk_buffer *buffer; + struct index_entry *pnext; +}; + +static struct chunk_buffer 
*btbread(struct dm_exception_store *ps, u64 chunk) +{ + struct chunk_buffer *b; + + list_for_each_entry(b, &ps->chunk_buffer_list, list) { + if (b->chunk == chunk) + return b; + } + + b = alloc_chunk_buffer(ps); + if (!b) + return NULL; + + b->chunk = chunk; + + chunk_io(ps, chunk, READ, b->data); + + return b; +} + +static void brelse(struct chunk_buffer *buffer) +{ +} + +static void brelse_dirty(struct dm_exception_store *ps, struct chunk_buffer *b) +{ + if (list_empty(&b->dirty_list)) + list_add(&b->dirty_list, &ps->chunk_buffer_dirty_list); +} + +static void set_buffer_dirty(struct dm_exception_store *ps, struct chunk_buffer *b) +{ + if (list_empty(&b->dirty_list)) + list_add(&b->dirty_list, &ps->chunk_buffer_dirty_list); +} + +static inline struct exception *emap(struct leaf *leaf, unsigned i) +{ + return (struct exception *) + ((char *)leaf + le32_to_cpu(leaf->map[i].offset)); +} + +static int add_exception_to_leaf(struct leaf *leaf, u64 chunk, u64 exception, + int snapshot, u64 active) +{ + unsigned target = chunk - le64_to_cpu(leaf->base_chunk); + u64 mask = 1ULL << snapshot, sharemap; + struct exception *ins, *exceptions = emap(leaf, 0); + char *maptop = (char *)(&leaf->map[le32_to_cpu(leaf->count) + 1]); + unsigned i, j, free = (char *)exceptions - maptop; + + /* + * Find the chunk for which we're adding an exception entry. + */ + for (i = 0; i < le32_to_cpu(leaf->count); i++) /* !!! binsearch goes here */ + if (le32_to_cpu(leaf->map[i].rchunk) >= target) + break; + + /* + * If we didn't find the chunk, insert a new one at map[i]. + */ + if (i == le32_to_cpu(leaf->count) || + le32_to_cpu(leaf->map[i].rchunk) > target) { + if (free < sizeof(struct exception) + sizeof(struct tree_map)) + return -1; + ins = emap(leaf, i); + memmove(&leaf->map[i+1], &leaf->map[i], maptop - (char *)&leaf->map[i]); + leaf->map[i].offset = cpu_to_le32((char *)ins - (char *)leaf); + leaf->map[i].rchunk = cpu_to_le32(target); + leaf->count = cpu_to_le32(le32_to_cpu(leaf->count) + 1); + sharemap = snapshot == -1 ? active : mask; + goto insert; + } + + if (free < sizeof(struct exception)) + return -1; + /* + * Compute the share map from that of each existing exception entry + * for this chunk. If we're doing this for a chunk on the origin, + * the new exception is shared between those snapshots that weren't + * already sharing exceptions for this chunk. (We combine the sharing + * that already exists, invert it, then mask off everything but the + * active snapshots.) + * + * If this is a chunk on a snapshot we go through the existing + * exception list to turn off sharing with this snapshot (with the + * side effect that if the chunk was only shared by this snapshot it + * becomes unshared). We then set sharing for this snapshot in the + * new exception entry. + */ + if (snapshot == -1) { + for (sharemap = 0, ins = emap(leaf, i); ins < emap(leaf, i+1); ins++) + sharemap |= le64_to_cpu(ins->share); + sharemap = (~sharemap) & active; + } else { + for (ins = emap(leaf, i); ins < emap(leaf, i+1); ins++) { + u64 val = le64_to_cpu(ins->share); + if (val & mask) { + ins->share = cpu_to_le64(val & ~mask); + break; + } + } + sharemap = mask; + } + ins = emap(leaf, i); +insert: + /* + * Insert the new exception entry. These grow from the end of the + * block toward the beginning. First move any earlier exceptions up + * to make room for the new one, then insert the new entry in the + * space freed. Adjust the offsets for all earlier chunks. 
+ */ + memmove(exceptions - 1, exceptions, (char *)ins - (char *)exceptions); + ins--; + ins->share = cpu_to_le64(sharemap); + ins->chunk = cpu_to_le64(exception); + + for (j = 0; j <= i; j++) { + u32 val = le32_to_cpu(leaf->map[j].offset); + leaf->map[j].offset = cpu_to_le32(val - sizeof(struct exception)); + } + + return 0; +} + +static void insert_child(struct node *node, struct index_entry *p, u64 child, + u64 childkey) +{ + size_t count = (char *)(&node->entries[0] + le32_to_cpu(node->count)) - + (char *)p; + memmove(p + 1, p, count); + p->chunk = cpu_to_le64(child); + p->key = cpu_to_le64(childkey); + node->count = cpu_to_le32(le32_to_cpu(node->count) + 1); +} + +static u64 split_leaf(struct leaf *leaf, struct leaf *leaf2) +{ + unsigned i, nhead, ntail, tailsize; + u64 splitpoint; + char *phead, *ptail; + + nhead = (le32_to_cpu(leaf->count) + 1) / 2; + ntail = le32_to_cpu(leaf->count) - nhead; + + /* Should split at middle of data instead of median exception */ + splitpoint = le32_to_cpu(leaf->map[nhead].rchunk) + + le64_to_cpu(leaf->base_chunk); + + phead = (char *)emap(leaf, 0); + ptail = (char *)emap(leaf, nhead); + tailsize = (char *)emap(leaf, le32_to_cpu(leaf->count)) - ptail; + + /* Copy upper half to new leaf */ + memcpy(leaf2, leaf, offsetof(struct leaf, map)); + memcpy(&leaf2->map[0], &leaf->map[nhead], (ntail + 1) * sizeof(struct tree_map)); + memcpy(ptail - (char *)leaf + (char *)leaf2, ptail, tailsize); + leaf2->count = cpu_to_le32(ntail); + + /* Move lower half to top of block */ + memmove(phead + tailsize, phead, ptail - phead); + leaf->count = cpu_to_le32(nhead); + for (i = 0; i <= nhead; i++) + leaf->map[i].offset = + cpu_to_le32(le32_to_cpu(leaf->map[i].offset) + tailsize); + leaf->map[nhead].rchunk = 0; + + return splitpoint; +} + +static int add_exception_to_tree(struct dm_exception_store *ps, + struct chunk_buffer *leafbuf, + u64 target, u64 exception, int snapbit, + struct etree_path path[], unsigned levels) +{ + struct node *newroot; + struct chunk_buffer *newrootbuf, *childbuf; + struct leaf *leaf; + u64 childkey, childsector; + int ret; + + ret = add_exception_to_leaf(buffer2leaf(leafbuf), target, + exception, snapbit, ps->snapmask); + if (!ret) { + brelse_dirty(ps, leafbuf); + return 0; + } + + /* + * There wasn't room to add a new exception to the leaf. Split it. + */ + + childbuf = new_leaf(ps); + if (!childbuf) + return -ENOMEM; /* this is the right thing to do? */ + + set_buffer_dirty(ps, childbuf); + + childkey = split_leaf(buffer2leaf(leafbuf), buffer2leaf(childbuf)); + childsector = childbuf->chunk; + + /* + * Now add the exception to the appropriate leaf. Childkey has the + * first chunk in the new leaf we just created. + */ + if (target < childkey) + leaf = buffer2leaf(leafbuf); + else + leaf = buffer2leaf(childbuf); + + ret = add_exception_to_leaf(leaf, target, exception, snapbit, + ps->snapmask); + if (ret) + return -ENOMEM; + + brelse_dirty(ps, leafbuf); + brelse_dirty(ps, childbuf); + + while (levels--) { + unsigned half; + u64 newkey; + struct index_entry *pnext = path[levels].pnext; + struct chunk_buffer *parentbuf = path[levels].buffer; + struct node *parent = buffer2node(parentbuf); + struct chunk_buffer *newbuf; + struct node *newnode; + int csize = ps->chunk_size; + int alloc_per_node = (csize - offsetof(struct node, entries)) + / sizeof(struct index_entry); + + if (le32_to_cpu(parent->count) < alloc_per_node) { + insert_child(parent, pnext, childsector, childkey); + set_buffer_dirty(ps, parentbuf); + return 0; + } + /* + * Split the node. 
+ */ + half = le32_to_cpu(parent->count) / 2; + newkey = le64_to_cpu(parent->entries[half].key); + newbuf = new_node(ps); + if (!newbuf) + return -ENOMEM; + set_buffer_dirty(ps, newbuf); + newnode = buffer2node(newbuf); + + newnode->count = cpu_to_le32(le32_to_cpu(parent->count) - half); + memcpy(&newnode->entries[0], &parent->entries[half], + le32_to_cpu(newnode->count) * sizeof(struct index_entry)); + parent->count = cpu_to_le32(half); + /* + * If the path entry is in the new node, use that as the + * parent. + */ + if (pnext > &parent->entries[half]) { + pnext = pnext - &parent->entries[half] + newnode->entries; + set_buffer_dirty(ps, parentbuf); + parentbuf = newbuf; + parent = newnode; + } else + set_buffer_dirty(ps, newbuf); + /* + * Insert the child now that we have room in the parent, then + * climb the path and insert the new child there. + */ + insert_child(parent, pnext, childsector, childkey); + set_buffer_dirty(ps, parentbuf); + childkey = newkey; + childsector = newbuf->chunk; + brelse(newbuf); + } + + newrootbuf = new_node(ps); + if (!newrootbuf) + return -ENOMEM; + + newroot = buffer2node(newrootbuf); + + newroot->count = cpu_to_le32(2); + newroot->entries[0].chunk = cpu_to_le64(ps->root_tree_chunk); + newroot->entries[1].key = cpu_to_le64(childkey); + newroot->entries[1].chunk = cpu_to_le64(childsector); + ps->root_tree_chunk = newrootbuf->chunk; + ps->tree_level++; + ps->header_dirty = 1; + brelse_dirty(ps, newrootbuf); + return 0; +} + +static struct chunk_buffer *probe(struct dm_exception_store *ps, u64 chunk, + struct etree_path *path) +{ + unsigned i, levels = ps->tree_level; + struct node *node; + struct chunk_buffer *nodebuf = btbread(ps, ps->root_tree_chunk); + + if (!nodebuf) + return NULL; + node = buffer2node(nodebuf); + + for (i = 0; i < levels; i++) { + struct index_entry *pnext = node->entries, + *top = pnext + le32_to_cpu(node->count); + + while (++pnext < top) + if (le64_to_cpu(pnext->key) > chunk) + break; + + path[i].buffer = nodebuf; + path[i].pnext = pnext; + nodebuf = btbread(ps, le64_to_cpu((pnext - 1)->chunk)); + if (!nodebuf) + return NULL; + + node = (struct node *)nodebuf->data; + } + BUG_ON(le16_to_cpu(((struct leaf *)nodebuf->data)->magic) != 0x1eaf); + return nodebuf; +} + +static inline struct node *path_node(struct etree_path path[], int level) +{ + return buffer2node(path[level].buffer); +} + +/* + * Release each buffer in the given path array. + */ +static void brelse_path(struct etree_path *path, unsigned levels) +{ + unsigned i; + for (i = 0; i < levels; i++) + brelse(path[i].buffer); +} + +/* + * Merge the contents of 'leaf2' into 'leaf.' The leaves are contiguous and + * 'leaf2' follows 'leaf.' Move the exception lists in 'leaf' up to make room + * for those of 'leaf2,' adjusting the offsets in the map entries, then copy + * the map entries and exception lists straight from 'leaf2.' 
+ */ +static void merge_leaves(struct leaf *leaf, struct leaf *leaf2) +{ + unsigned nhead, ntail, i; + unsigned tailsize; + char *phead, *ptail; + + nhead = le32_to_cpu(leaf->count); + ntail = le32_to_cpu(leaf2->count); + tailsize = (char *)emap(leaf2, ntail) - (char *)emap(leaf2, 0); + phead = (char *)emap(leaf, 0); + ptail = (char *)emap(leaf, nhead); + + /* move data down */ + memmove(phead - tailsize, phead, ptail - phead); + + /* adjust pointers */ + for (i = 0; i <= nhead; i++) { + u32 val = le32_to_cpu(leaf->map[i].offset); + /* also adjust sentinel */ + leaf->map[i].offset = cpu_to_le32(val - tailsize); + } + + /* move data from leaf2 to top */ + /* data */ + memcpy(ptail - tailsize, (char *)emap(leaf2, 0), tailsize); + /* map */ + memcpy(&leaf->map[nhead], &leaf2->map[0], + (ntail + 1) * sizeof(struct tree_map)); + leaf->count = cpu_to_le32(le32_to_cpu(leaf->count) + ntail); +} + +/* + * Remove the index entry at path[level].pnext-1 by moving entries below it up + * into its place. If it wasn't the last entry in the node but it _was_ the + * first entry (and we're not at the root), preserve the key by inserting it + * into the index entry of the parent node that refers to this node. + */ +static void remove_index(struct dm_exception_store *ps, struct etree_path path[], int level) +{ + struct node *node = path_node(path, level); + /* !!! out of bounds for delete of last from full index */ + chunk_t pivot = le64_to_cpu((path[level].pnext)->key); + int i, count = le32_to_cpu(node->count); + + /* stomps the node count (if 0th key holds count) */ + memmove(path[level].pnext - 1, path[level].pnext, + (char *)&node->entries[count] - (char *)path[level].pnext); + node->count = cpu_to_le32(count - 1); + --(path[level].pnext); + set_buffer_dirty(ps, path[level].buffer); + + /* no pivot for last entry */ + if (path[level].pnext == node->entries + le32_to_cpu(node->count)) + return; + + /* + * climb up to common parent and set pivot to deleted key + * what if index is now empty? (no deleted key) + * then some key above is going to be deleted and used to set pivot + */ + if (path[level].pnext == node->entries && level) { + /* Keep going up the path if we're at the first index entry. */ + for (i = level - 1; path[i].pnext - 1 == path_node(path, i)->entries; i--) + if (!i) /* If we hit the root, we're done. */ + return; + /* + * Found a node where we're not at the first entry. + * Set the key here to that of the deleted index + * entry. + */ + (path[i].pnext - 1)->key = cpu_to_le64(pivot); + set_buffer_dirty(ps, path[i].buffer); + } +} + +/* + * Returns the number of bytes of free space in the given leaf by computing + * the difference between the end of the map entry list and the beginning + * of the first set of exceptions. + */ +static unsigned leaf_freespace(struct leaf *leaf) +{ + /* include sentinel */ + char *maptop = (char *)(&leaf->map[le32_to_cpu(leaf->count) + 1]); + return (char *)emap(leaf, 0) - maptop; +} + +/* + * Returns the number of bytes used in the given leaf by computing the number + * of bytes used by the map entry list and all sets of exceptions. 
+ */ +static unsigned leaf_payload(struct leaf *leaf) +{ + int count = le32_to_cpu(leaf->count); + int lower = (char *)(&leaf->map[count]) - (char *)leaf->map; + int upper = (char *)emap(leaf, count) - (char *)emap(leaf, 0); + return lower + upper; +} + +static void check_leaf(struct dm_exception_store *ps, struct leaf *leaf, u64 snapmask) +{ + struct exception *p; + int i; + + for (i = 0; i < le32_to_cpu(leaf->count); i++) { + for (p = emap(leaf, i); p < emap(leaf, i+1); p++) { + /* !!! should also check for any zero sharemaps here */ + if (le64_to_cpu(p->share) & snapmask) { + DMERR("nonzero bits %016llx outside snapmask %016llx", + (unsigned long long)p->share, + (unsigned long long)snapmask); + dm_multisnap_set_error(ps->dm, -EFSERROR); + } + } + } +} + +/* + * Remove all exceptions belonging to a given snapshot from the passed leaf. + * + * This clears the "share" bits on each chunk for the snapshot mask passed + * in the delete_info structure. In the process, it compresses out any + * exception entries that are entirely unshared and/or unused. In a second + * pass, it compresses out any map entries for which there are no exception + * entries remaining. + */ +static int delete_snapshots_from_leaf(struct dm_exception_store *ps, + struct leaf *leaf, u64 snapmask) +{ + /* p points just past the last map[] entry in the leaf. */ + struct exception *p = emap(leaf, le32_to_cpu(leaf->count)), *dest = p; + struct tree_map *pmap, *dmap; + unsigned i; + int ret = 0; + + /* Scan top to bottom clearing snapshot bit and moving + * non-zero entries to top of block */ + /* + * p points at each original exception; dest points at the location + * to receive an exception that is being moved down in the leaf. + * Exceptions that are unshared after clearing the share bit for + * the passed snapshot mask are skipped and the associated "exception" + * chunk is freed. This operates on the exceptions for one map entry + * at a time; when the beginning of a list of exceptions is reached, + * the associated map entry offset is adjusted. + */ + for (i = le32_to_cpu(leaf->count); i--;) { + /* + * False the first time through, since i is leaf->count and p + * was set to emap(leaf, leaf->count) above. + */ + while (p != emap(leaf, i)) { + u64 share = le64_to_cpu((--p)->share); + ret |= share & snapmask; + /* Unshare with given snapshot(s). */ + p->share = cpu_to_le64(le64_to_cpu(p->share) & ~snapmask); + if (le64_to_cpu(p->share)) /* If still used, keep chunk. */ + *--dest = *p; + else + shared_free_chunk(ps, le64_to_cpu(p->chunk)); + /* dirty_buffer_count_check(sb); */ + } + leaf->map[i].offset = cpu_to_le32((char *)dest - (char *)leaf); + } + /* Remove empties from map */ + /* + * This runs through the map entries themselves, skipping entries + * with matching offsets. If all the exceptions for a given map + * entry are skipped, its offset will be set to that of the following + * map entry (since the dest pointer will not have moved). + */ + dmap = pmap = &leaf->map[0]; + for (i = 0; i < le32_to_cpu(leaf->count); i++, pmap++) + if (le32_to_cpu(pmap->offset) != le32_to_cpu((pmap + 1)->offset)) + *dmap++ = *pmap; + /* + * There is always a phantom map entry after the last, that has the + * offset of the end of the leaf and, of course, no chunk number. 
+ */ + dmap->offset = pmap->offset; + dmap->rchunk = 0; /* tidy up */ + leaf->count = cpu_to_le32(dmap - &leaf->map[0]); + check_leaf(ps, leaf, snapmask); + + return ret; +} + +/* + * Return true if path[level].pnext points at the end of the list of index + * entries. + */ +static inline int finished_level(struct etree_path path[], int level) +{ + struct node *node = path_node(path, level); + return path[level].pnext == node->entries + le32_to_cpu(node->count); +} + +/* + * Copy the index entries in 'node2' into 'node.' + */ +static void merge_nodes(struct node *node, struct node *node2) +{ + memcpy(&node->entries[le32_to_cpu(node->count)], + &node2->entries[0], + le32_to_cpu(node2->count) * sizeof(struct index_entry)); + node->count = cpu_to_le32(le32_to_cpu(node->count) + le32_to_cpu(node2->count)); +} + +static void brelse_free(struct dm_exception_store *ps, struct chunk_buffer *buffer) +{ + shared_free_chunk(ps, buffer->chunk); + free_chunk_buffer(ps, buffer); +} + +/* + * Delete all chunks in the B-tree for the snapshot(s) indicated by the + * passed snapshot mask, beginning at the passed chunk. + * + * Walk the tree (a stack-based inorder traversal) starting with the passed + * chunk, calling delete_snapshots_from_leaf() on each leaf to remove chunks + * associated with the snapshot(s) we're removing. As leaves and nodes become + * sparsely filled, merge them with their neighbors. When we reach the root + * we've finished the traversal; if there are empty levels (that is, level(s) + * directly below the root that only contain a single node), remove those + * empty levels until either the second level is no longer empty or we only + * have one level remaining. + */ +static int delete_tree_range(struct dm_exception_store *ps, u64 snapmask, chunk_t resume) +{ + int levels = ps->tree_level, level = levels - 1; + struct etree_path path[levels], hold[levels]; + struct chunk_buffer *leafbuf, *prevleaf = NULL; + unsigned i; + + /* can be initializer if not dynamic array (change it?) */ + for (i = 0; i < levels; i++) + hold[i] = (struct etree_path){ }; + /* + * Find the B-tree leaf with the chunk we were passed. Often + * this will be chunk 0. + */ + leafbuf = probe(ps, resume, path); + if (!leafbuf) + return -ENOMEM; + + while (1) { /* in-order leaf walk */ + if (delete_snapshots_from_leaf(ps, buffer2leaf(leafbuf), snapmask)) + set_buffer_dirty(ps, leafbuf); + /* + * If we have a previous leaf (i.e. we're past the first), + * try to merge the current leaf with it. + */ + if (prevleaf) { /* try to merge this leaf with prev */ + struct leaf *this = buffer2leaf(leafbuf); + struct leaf *prev = buffer2leaf(prevleaf); + + /* + * If there's room in the previous leaf for this leaf, + * merge this leaf into the previous leaf and remove + * the index entry that points to this leaf. + */ + if (leaf_payload(this) <= leaf_freespace(prev)) { + merge_leaves(prev, this); + remove_index(ps, path, level); + set_buffer_dirty(ps, prevleaf); + brelse_free(ps, leafbuf); + /* dirty_buffer_count_check(sb); */ + goto keep_prev_leaf; + } + brelse(prevleaf); + } + prevleaf = leafbuf; /* Save leaf for next time through. */ +keep_prev_leaf: + /* + * If we've reached the end of the index entries in the B-tree + * node at the current level, try to merge the node referred + * to at this level of the path with a prior node. Repeat + * this process at successively higher levels up the path; if + * we reach the root, clean up and exit. 
If we don't reach + * the root, we've reached a node with multiple entries; + * rebuild the path from the next index entry to the next + * leaf. + */ + if (finished_level(path, level)) { + do { /* pop and try to merge finished nodes */ + /* + * If we have a previous node at this level + * (again, we're past the first node), try to + * merge the current node with it. + */ + if (hold[level].buffer) { + struct node *this = path_node(path, level); + struct node *prev = path_node(hold, level); + int csize = ps->chunk_size; + int alloc_per_node = + (csize - offsetof(struct node, entries)) + / sizeof(struct index_entry); + + /* + * If there's room in the previous node + * for the entries in this node, merge + * this node into the previous node and + * remove the index entry that points + * to this node. + */ + if (le32_to_cpu(this->count) <= + alloc_per_node - le32_to_cpu(prev->count)) { + merge_nodes(prev, this); + remove_index(ps, path, level - 1); + set_buffer_dirty(ps, hold[level].buffer); + brelse_free(ps, path[level].buffer); +/* dirty_buffer_count_check(sb); */ + goto keep_prev_node; + } + brelse(hold[level].buffer); + } + /* Save the node for the next time through. */ + hold[level].buffer = path[level].buffer; +keep_prev_node: + /* + * If we're at the root, we need to clean up + * and return. First, though, try to reduce + * the number of levels. If the tree at the + * root has been reduced to only the nodes in + * our path, eliminate nodes with only one + * entry until we either have a new root node + * with multiple entries or we have only one + * level remaining in the B-tree. + */ + if (!level) { /* remove levels if possible */ + /* + * While above the first level and the + * root only has one entry, point the + * root at the (only) first-level node, + * reduce the number of levels and + * shift the path up one level. + */ + while (levels > 1 && + le32_to_cpu(path_node(hold, 0)->count) == 1) { + /* Point root at the first level. */ + ps->root_tree_chunk = hold[1].buffer->chunk; + brelse_free(ps, hold[0].buffer); +/* dirty_buffer_count_check(sb); */ + levels = --ps->tree_level; + memcpy(hold, hold + 1, levels * sizeof(hold[0])); + ps->header_dirty = 1; + } + brelse(prevleaf); + brelse_path(hold, levels); + +/* if (dirty_buffer_count) */ +/* commit_transaction(sb, 0); */ + ps->snapmask &= ~snapmask; + ps->header_dirty = 1; + return 0; + } + + level--; + } while (finished_level(path, level)); + /* + * Now rebuild the path from where we are (one entry + * past the last leaf we processed, which may have + * been adjusted in operations above) down to the node + * above the next leaf. + */ + do { /* push back down to leaf level */ + struct chunk_buffer *nodebuf; + + nodebuf = btbread(ps, + le64_to_cpu(path[level++].pnext++->chunk)); + if (!nodebuf) { + brelse_path(path, level - 1); /* anything else needs to be freed? */ + return -ENOMEM; + } + path[level].buffer = nodebuf; + path[level].pnext = buffer2node(nodebuf)->entries; + } while (level < levels - 1); + } + +/* dirty_buffer_count_check(sb); */ + /* + * Get the leaf indicated in the next index entry in the node + * at this level. 
+ */ + leafbuf = btbread(ps, le64_to_cpu(path[level].pnext++->chunk)); + if (!leafbuf) { + brelse_path(path, level); + return -ENOMEM; + } + } +} + +static int origin_chunk_unique(struct leaf *leaf, u64 chunk, u64 snapmask) +{ + u64 using = 0; + u64 i, target = chunk - le64_to_cpu(leaf->base_chunk); + struct exception const *p; + + for (i = 0; i < le32_to_cpu(leaf->count); i++) + if (le32_to_cpu(leaf->map[i].rchunk) == target) + goto found; + return !snapmask; +found: + for (p = emap(leaf, i); p < emap(leaf, i+1); p++) + using |= le64_to_cpu(p->share); + + return !(~using & snapmask); +} + +static int snapshot_chunk_unique(struct leaf *leaf, u64 chunk, int snapbit, + chunk_t *exception) +{ + u64 mask = 1LL << snapbit; + unsigned i, target = chunk - le64_to_cpu(leaf->base_chunk); + struct exception const *p; + + for (i = 0; i < le32_to_cpu(leaf->count); i++) + if (le32_to_cpu(leaf->map[i].rchunk) == target) + goto found; + return 0; +found: + for (p = emap(leaf, i); p < emap(leaf, i+1); p++) + /* shared if more than one bit set including this one */ + if ((le64_to_cpu(p->share) & mask)) { + *exception = le64_to_cpu(p->chunk); + return !(le64_to_cpu(p->share) & ~mask); + } + return 0; +} + +static int shared_init(struct dm_multisnap *dm, struct dm_exception_store **sp, unsigned argc, char **argv, char **error) +{ + int r; + struct dm_exception_store *ps; + + ps = kzalloc(sizeof(struct dm_exception_store), GFP_KERNEL); + if (!ps) { + *error = "Could not allocate private area"; + r = -ENOMEM; + goto bad_private; + } + *sp = ps; + + ps->dm = dm; + ps->chunk_size = dm_multisnap_chunk_size(dm); + ps->chunk_shift = ffs(ps->chunk_size) - 1; + + if (argc != 0) { + *error = "Bad number of arguments"; + r = -EINVAL; + goto bad_arguments; + } + + if (ps->chunk_size != DM_CHUNK_SIZE_DEFAULT_SECTORS << SECTOR_SHIFT) { + *error = "Unsupported chunk size"; + r = -EINVAL; + goto bad_arguments; + } + + ps->area = NULL; + + r = shared_read_metadata(ps, error); + if (r) + goto bad_metadata; + + return 0; + +bad_metadata: +bad_arguments: + kfree(ps); +bad_private: + return r; +} + +static void shared_destroy(struct dm_exception_store *ps) +{ + struct chunk_buffer *bb, *n; + + list_for_each_entry_safe(bb, n, &ps->chunk_buffer_list, list) + free_chunk_buffer(ps, bb); + + shared_drop_header(ps); + + kfree(ps); +} + +static int shared_allocate_snapid(struct dm_exception_store *ps, snapid_t *snapid) +{ + int i; + + for (i = 0; i < MAX_SNAPSHOTS; i++) + if (!(ps->snapmask & 1LL << i)) { + *snapid = i; + return 0; + } + + return -ENOSPC; +} + +static int shared_create_snapshot(struct dm_exception_store *ps, snapid_t snapid) +{ + if (snapid >= MAX_SNAPSHOTS) { + DMERR("shared_create_snapshot: invalid snapshot id %llx", (unsigned long long)snapid); + return -EINVAL; + } + if (ps->snapmask & 1LL << snapid) { + DMERR("shared_create_snapshot: snapshot with id %llx already exists", (unsigned long long)snapid); + return -EINVAL; + } + ps->snapmask |= 1LL << snapid; + ps->nr_snapshots++; + write_header(ps); + return 0; +} + +static void shared_commit(struct dm_exception_store *ps); + +static int shared_delete_snapshot(struct dm_exception_store *ps, snapid_t idx) +{ + int r; + + if (ps->snapmask & 1LL << idx) { + u64 mask; + DMINFO("%s %d: delete %uth snapshot.", + __func__, __LINE__, (int)idx); + + mask = 1ULL << idx; + + r = delete_tree_range(ps, mask, 0); + if (!r) { + ps->snapmask &= ~(1LL << idx); + ps->nr_snapshots--; + } + + write_header(ps); + + shared_commit(ps); + } else { + BUG(); /* checked in the caller */ + } + + 
return r; +} + +static snapid_t shared_get_next_snapid(struct dm_exception_store *ps, snapid_t snapid) +{ + for (; snapid < MAX_SNAPSHOTS; snapid++) + if ((ps->snapmask >> snapid) & 1) + return snapid; + return SNAPID_T_ORIGIN; +} + +static int shared_find_snapshot_chunk(struct dm_exception_store *ps, snapid_t snapid, chunk_t chunk, int write, chunk_t *result) +{ + unsigned levels = ps->tree_level; + struct etree_path path[levels + 1]; + struct chunk_buffer *leafbuf; + + leafbuf = probe(ps, (u64)chunk, path); + if (!leafbuf) /* should make the snapshot invalid */ + return -1; + + return snapshot_chunk_unique(buffer2leaf(leafbuf), chunk, snapid, result); +} + +static void shared_reset_query(struct dm_exception_store *ps) +{ +} + +static int shared_query_next_remap(struct dm_exception_store *ps, chunk_t chunk) +{ + unsigned levels = ps->tree_level; + struct etree_path path[levels + 1]; + struct chunk_buffer *leafbuf; + + leafbuf = probe(ps, (u64)chunk, path); + if (!leafbuf) /* should make the snapshot invalid */ + return -1; + + ps->chunk_to_add = chunk; + + return !origin_chunk_unique(buffer2leaf(leafbuf), chunk, ps->snapmask); +} + +static void shared_add_next_remap(struct dm_exception_store *ps, union chunk_descriptor *cd, chunk_t *new_chunk) +{ + struct chunk_buffer *cb; + struct etree_path path[ps->tree_level + 1]; + chunk_t chunk = ps->chunk_to_add; + int ret; + + /* + * Mikulas: TODO: we could set just bits that we are really relocating. + * But setting more bits won't cause incorrect behavior, it'll just + * temporarily block access to already remapped snapshots. + */ + cd->bitmask = ps->snapmask; + + cb = probe(ps, chunk, path); + if (!cb) + return; + + ret = origin_chunk_unique(buffer2leaf(cb), chunk, ps->snapmask); + if (ret) { + DMERR("%s %d: bug %llu %d", __func__, __LINE__, + (unsigned long long)chunk, ret); + dm_multisnap_set_error(ps->dm, -EFSERROR); + return; + } + + *new_chunk = shared_allocate_chunk(ps); + if (!*new_chunk) + return; + + add_exception_to_tree(ps, cb, chunk, *new_chunk, -1, path, + ps->tree_level); + + /*DMINFO("%s %d: allocated new chunk, %llu", __func__, __LINE__, + (unsigned long long)*new_chunk);*/ +} + +static int shared_check_conflict(struct dm_exception_store *ps, union chunk_descriptor *cd, snapid_t snapid) +{ + return !!(cd->bitmask & (1LL << snapid)); +} + +static void shared_commit(struct dm_exception_store *ps) +{ + struct chunk_buffer *b, *n; + + /* Write bitmap */ + chunk_io(ps, ps->cur_bitmap_chunk, WRITE, ps->bitmap); + + list_for_each_entry_safe(b, n, &ps->chunk_buffer_dirty_list, + dirty_list) { + + list_del_init(&b->dirty_list); + list_move_tail(&b->list, &ps->chunk_buffer_list); + + /* todo: can be async */ + chunk_io(ps, b->chunk, WRITE, b->data); + } + + if (ps->header_dirty) + write_header(ps); + + list_for_each_entry_safe(b, n, &ps->chunk_buffer_list, list) { + if (ps->nr_chunk_buffers < MAX_CHUNK_BUFFERS) + break; + + free_chunk_buffer(ps, b); + } +} + +struct dm_multisnap_exception_store dm_multisnap_daniel_store = { + .name = "daniel", + .module = THIS_MODULE, + .init_exception_store = shared_init, + .exit_exception_store = shared_destroy, + .allocate_snapid = shared_allocate_snapid, + .create_snapshot = shared_create_snapshot, + .delete_snapshot = shared_delete_snapshot, + .get_next_snapid = shared_get_next_snapid, + .find_snapshot_chunk = shared_find_snapshot_chunk, + .reset_query = shared_reset_query, + .query_next_remap = shared_query_next_remap, + .add_next_remap = shared_add_next_remap, + .check_conflict = 
shared_check_conflict,
+	.commit = shared_commit,
+};
+
+static int __init dm_multisnapshot_daniel_module_init(void)
+{
+	return dm_multisnap_register_exception_store(&dm_multisnap_daniel_store);
+}
+
+static void __exit dm_multisnapshot_daniel_module_exit(void)
+{
+	dm_multisnap_unregister_exception_store(&dm_multisnap_daniel_store);
+}
+
+module_init(dm_multisnapshot_daniel_module_init);
+module_exit(dm_multisnapshot_daniel_module_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multisnapshot Fujita/Daniel's exception store");
+MODULE_AUTHOR("Fujita Tomonori, Daniel Phillips");
+MODULE_LICENSE("GPL");
+
Index: linux-2.6.32-rc8-devel/drivers/md/Kconfig
===================================================================
--- linux-2.6.32-rc8-devel.orig/drivers/md/Kconfig	2009-11-26 17:07:04.000000000 +0100
+++ linux-2.6.32-rc8-devel/drivers/md/Kconfig	2009-11-26 17:07:11.000000000 +0100
@@ -278,6 +278,20 @@ config DM_MULTISNAPSHOT_MIKULAS
 	  A B+-tree-based log-structured storage allowing unlimited
 	  number of snapshots.
 
+config DM_MULTISNAPSHOT_DANIEL
+	tristate "Daniel's snapshot store"
+	depends on DM_MULTISNAPSHOT
+	---help---
+	  Daniel Phillips' exception store. The data structures were
+	  designed by Daniel Phillips for the Zumastor project; the port
+	  to kernel space was done by Fujita Tomonori.
+
+	  This store has a limit of at most 64 snapshots and supports
+	  snapshot deletion.
+
+	  So far it doesn't support maintaining consistency across
+	  crashes; journaling is under development.
+
 config DM_MIRROR
 	tristate "Mirror target"
 	depends on BLK_DEV_DM
Index: linux-2.6.32-rc8-devel/drivers/md/Makefile
===================================================================
--- linux-2.6.32-rc8-devel.orig/drivers/md/Makefile	2009-11-26 17:07:04.000000000 +0100
+++ linux-2.6.32-rc8-devel/drivers/md/Makefile	2009-11-26 17:07:11.000000000 +0100
@@ -13,6 +13,7 @@ dm-store-mikulas-y += dm-multisnap-mikul
 		   dm-multisnap-commit.o dm-multisnap-delete.o \
 		   dm-multisnap-freelist.o dm-multisnap-io.o \
 		   dm-multisnap-snaps.o dm-bufio.o
+dm-store-daniel-y += dm-multisnap-daniel.o
 dm-mirror-y	+= dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
@@ -52,6 +53,7 @@ obj-$(CONFIG_DM_MULTIPATH_ST) += dm-serv
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MULTISNAPSHOT)	+= dm-multisnapshot.o
 obj-$(CONFIG_DM_MULTISNAPSHOT_MIKULAS)	+= dm-store-mikulas.o
+obj-$(CONFIG_DM_MULTISNAPSHOT_DANIEL)	+= dm-store-daniel.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
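
For reference, a small userspace sketch of the free-space bitmap arithmetic used
by shared_allocate_chunk() and shared_free_chunk() in the patch above. It is not
part of the patch; the constants assume the 16384-byte chunk size this store
requires (so chunk_shift = 14), and the example chunk number is arbitrary.

/*
 * Illustrative only (not part of the patch): the mapping between a chunk
 * number and its (bitmap chunk, bit index) pair, as computed in
 * shared_allocate_chunk() and shared_free_chunk(). Constants assume the
 * 16384-byte chunk size required by this store; the example chunk number
 * is arbitrary.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FIRST_BITMAP_CHUNK	1
#define CHUNK_SHIFT		14			/* 16384-byte chunks */
#define BITS_PER_CHUNK_SHIFT	(CHUNK_SHIFT + 3)	/* 8 bits per byte */

int main(void)
{
	uint64_t chunk = 200000;	/* arbitrary example chunk number */

	/* Split the chunk number the way shared_free_chunk() does ... */
	uint64_t bitmap_chunk = (chunk >> BITS_PER_CHUNK_SHIFT) + FIRST_BITMAP_CHUNK;
	unsigned idx = chunk & ((1 << BITS_PER_CHUNK_SHIFT) - 1);

	/* ... and recombine it the way shared_allocate_chunk() returns it. */
	uint64_t again = idx + ((bitmap_chunk - FIRST_BITMAP_CHUNK) << BITS_PER_CHUNK_SHIFT);

	printf("chunk %llu -> bitmap chunk %llu, bit %u -> chunk %llu\n",
	       (unsigned long long)chunk, (unsigned long long)bitmap_chunk,
	       idx, (unsigned long long)again);
	assert(again == chunk);
	return 0;
}

With 16 KiB chunks each bitmap chunk tracks 131072 data chunks (2 GiB of the COW
device), which is why cur_bitmap_chunk wraps back to FIRST_BITMAP_CHUNK only
after nr_bitmap_chunks steps in read_new_bitmap_chunk().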