--- linux/drivers/md/lvm.c.~1~ Mon Nov 19 17:56:04 2001 +++ linux/drivers/md/lvm.c Thu Feb 21 22:25:07 2002 @@ -198,6 +198,7 @@ #include #include #include +#include #include #include @@ -221,6 +222,7 @@ #include #include +#include #include "lvm-internal.h" @@ -249,7 +251,6 @@ static int lvm_chr_close(struct inode *, struct file *); static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong); - /* End external function prototypes */ @@ -277,6 +278,7 @@ static int lvm_do_lv_status_bydev(vg_t *, void *); static int lvm_do_pe_lock_unlock(vg_t *r, void *); +static int lvm_do_pe_locked_copy(vg_t *r, void *); static int lvm_do_pv_change(vg_t*, void*); static int lvm_do_pv_status(vg_t *, void *); @@ -287,8 +289,11 @@ static int lvm_do_vg_reduce(vg_t *, void *); static int lvm_do_vg_rename(vg_t *, void *); static int lvm_do_vg_remove(int); +static int lvm_push_callback(lv_t *, int, struct buffer_head *); +static void lvm_bh_callback(struct buffer_head *, int); static void lvm_geninit(struct gendisk *); static void __update_hardsectsize(lv_t *lv); +static int __do_le_remap(vg_t *, lv_t *, kdev_t, kdev_t, uint, uint); static void _queue_io(struct buffer_head *bh, int rw); @@ -312,7 +317,6 @@ int loadtime = 0; const char *const lvm_name = LVM_NAME; - /* volume group descriptor area pointers */ vg_t *vg[ABS_MAX_VG]; @@ -323,6 +327,12 @@ } vg_lv_map_t; static vg_lv_map_t vg_lv_map[ABS_MAX_LV]; +/* cache a buffer_head end_io callback state */ +typedef struct { + struct buffer_head bh_io; + lv_t *lv; + struct buffer_head *bh_orig; +} callback_t; /* Request structures (lvm_chr_ioctl()) */ static pv_change_req_t pv_change_req; @@ -372,6 +382,8 @@ static int lvm_hardsectsizes[MAX_LV]; static int lvm_size[MAX_LV]; +static mempool_t *lvm_callback_mempool; + static struct gendisk lvm_gendisk = { major: MAJOR_NR, @@ -383,26 +395,47 @@ nr_real: MAX_LV, }; + +static void * lvm_callback_alloc(int gfp_flags, void *data) /* mempool allocation hook: kmalloc one callback_t; data is unused */ +{ + callback_t *callback; + + callback = kmalloc(sizeof 
*callback, gfp_flags); + return callback; +} + +static void lvm_callback_free(void *callback, void *data) /* mempool free hook: kfree the element; data is unused */ +{ + kfree(callback); +} + /* * Driver initialization... */ int lvm_init(void) { + int err = -EIO; + if (devfs_register_chrdev(LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { printk(KERN_ERR "%s -- devfs_register_chrdev failed\n", lvm_name); - return -EIO; + goto out_err; } if (devfs_register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) { printk("%s -- devfs_register_blkdev failed\n", lvm_name); - if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) - printk(KERN_ERR - "%s -- devfs_unregister_chrdev failed\n", - lvm_name); - return -EIO; + goto out_unreg_char; + } + + err = -ENOMEM; + lvm_callback_mempool = mempool_create(NR_LVM_CALLBACK, + lvm_callback_alloc, + lvm_callback_free, NULL); + if (!lvm_callback_mempool) { + printk("%s -- out of memory for callback pool\n", lvm_name); + goto out_unreg_block; } lvm_init_fs(); @@ -434,6 +467,18 @@ #endif return 0; + +out_unreg_block: + if (devfs_unregister_blkdev(MAJOR_NR, lvm_name) < 0) + printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n", + lvm_name); +out_unreg_char: + if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) + printk(KERN_ERR + "%s -- devfs_unregister_chrdev failed\n", + lvm_name); +out_err: + return err; } /* lvm_init() */ @@ -448,7 +493,8 @@ if (devfs_unregister_blkdev(MAJOR_NR, lvm_name) < 0) printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n", lvm_name); - + mempool_destroy(lvm_callback_mempool); + del_gendisk(&lvm_gendisk); blk_size[MAJOR_NR] = NULL; @@ -606,6 +652,11 @@ physical volume (move's done in user space's pvmove) */ return lvm_do_pe_lock_unlock(vg_ptr,arg); + case PE_LOCKED_COPY: + /* copy a physical extent to another physical volume while i/o + to it is locked out (the copy is done in the kernel) */ + return lvm_do_pe_locked_copy(vg_ptr,arg); + case VG_CREATE_OLD: /* create a VGDA */ return lvm_do_vg_create(arg, minor); @@ -1133,7 +1184,7 @@ kdev_t rdev_map; vg_t 
*vg_this = vg[VG_BLK(minor)]; lv_t *lv = vg_this->lv[LV_BLK(minor)]; - + int ret; down_read(&lv->lv_lock); if (!(lv->lv_status & LV_ACTIVE)) { @@ -1250,8 +1301,9 @@ out: bh->b_rdev = rdev_map; bh->b_rsector = rsector_map; + ret = lvm_push_callback(lv, rw, bh); up_read(&lv->lv_lock); - return 1; + return ret; bad: buffer_IO_error(bh); @@ -1264,6 +1316,65 @@ * internal support functions */ +/* + * Handle LVM callbacks on buffer_head IO completion: push an IO + * completion onto an existing buffer_head. Preserve b_private by + * creating a new buffer_head for the mapped IO. lv_io_sem is read-locked here and released by lvm_bh_callback. + */ +static int lvm_push_callback(lv_t *lv, int rw, struct buffer_head *bh) +{ + callback_t *callback; + struct buffer_head *nbh; + + callback = mempool_alloc(lvm_callback_mempool, GFP_NOIO); + + callback->lv = lv; + callback->bh_orig = bh; + + nbh = &callback->bh_io; + + nbh->b_blocknr = bh->b_blocknr; + nbh->b_dev = bh->b_dev; + nbh->b_rdev = bh->b_rdev; + nbh->b_rsector = bh->b_rsector; + nbh->b_state = (1<b_count, 1); + nbh->b_size = bh->b_size; + nbh->b_page = bh->b_page; + nbh->b_data = bh->b_data; + nbh->b_list = 0; + nbh->b_reqnext = NULL; + + nbh->b_end_io = lvm_bh_callback; + nbh->b_private = callback; + + down_read(&lv->lv_io_sem); + generic_make_request(rw, nbh); + + return 0; /* Tell generic_make_request not to pursue the + original buffer_head any further now that we've + submitted a new one. 
*/ +} + +static void lvm_bh_callback(struct buffer_head *bh, int uptodate) /* b_end_io of the bh built by lvm_push_callback: drop lv_io_sem, free the callback, then complete the original bh */ +{ + callback_t *callback; + struct buffer_head *obh; + lv_t *lv; + + callback = bh->b_private; + lv = callback->lv; + obh = callback->bh_orig; + + up_read(&lv->lv_io_sem); + + mempool_free(callback, lvm_callback_mempool); /* lv and obh were read out above, so freeing first is safe */ + if (obh->b_end_io) + obh->b_end_io(obh, uptodate); +} + + #ifdef LVM_HD_NAME /* * generate "hard disk" name @@ -1325,13 +1436,57 @@ } /* lvm_do_lock_lvm */ +static int do_pe_lock(kdev_t lv, kdev_t pv, uint32_t offset) /* record a PE lock request under _pe_lock; returns -EBUSY if a lock is already recorded, else 0 */ +{ + down_write(&_pe_lock); + if (pe_lock_req.lock == LOCK_PE) { + up_write(&_pe_lock); + return -EBUSY; + } + + /* DEBUG ONLY */ + printk(KERN_DEBUG __FUNCTION__ ": Locking offset %u on %s\n", + offset, kdevname(pv)); + + /* Should we do to_kdev_t() on the pv_dev and lv_dev??? */ + pe_lock_req.lock = LOCK_PE; + pe_lock_req.data.lv_dev = lv; + pe_lock_req.data.pv_dev = pv; + pe_lock_req.data.pv_offset = offset; + up_write(&_pe_lock); + return 0; +} + +static void do_pe_unlock(void) /* clear the recorded PE lock request under _pe_lock, then flush the dequeued deferred io */ +{ + struct buffer_head *bh; + + down_write(&_pe_lock); + /* DEBUG ONLY */ + printk(KERN_DEBUG __FUNCTION__ ": Unlocking offset %u on %s\n", + pe_lock_req.data.pv_offset, kdevname(pe_lock_req.data.pv_dev)); + + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = 0; + pe_lock_req.data.pv_dev = 0; + pe_lock_req.data.pv_offset = 0; + bh = _dequeue_io(); + up_write(&_pe_lock); + + /* handle all deferred io for this PE */ + /* TODO: Eek, what about attaching callbacks to _flush_io() + deferred requests? 
--sct */ + _flush_io(bh); +} + + /* * character device support function lock/unlock physical extend */ static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) { pe_lock_req_t new_lock; - struct buffer_head *bh; + int err; uint p; if (vg_ptr == NULL) return -ENXIO; @@ -1348,42 +1503,30 @@ if (p == vg_ptr->pv_max) return -ENXIO; /* - * this sync releaves memory pressure to lessen the - * likelyhood of pvmove being paged out - resulting in + * this sync relieves memory pressure to lessen the + * likelihood of pvmove being paged out - resulting in * deadlock. * - * This method of doing a pvmove is broken + * This method of doing a pvmove is *highly* broken for + * several reasons. It deadlocks, it does not + * synchronise correctly with outstanding write IO, and + * it defers the actual copy to a user mode app which + * has no cache coherency with the LV devices. */ fsync_dev(pe_lock_req.data.lv_dev); - down_write(&_pe_lock); - if (pe_lock_req.lock == LOCK_PE) { - up_write(&_pe_lock); - return -EBUSY; - } - - /* Should we do to_kdev_t() on the pv_dev and lv_dev??? 
*/ - pe_lock_req.lock = LOCK_PE; - pe_lock_req.data.lv_dev = new_lock.data.lv_dev; - pe_lock_req.data.pv_dev = new_lock.data.pv_dev; - pe_lock_req.data.pv_offset = new_lock.data.pv_offset; - up_write(&_pe_lock); + err = do_pe_lock(new_lock.data.lv_dev, + new_lock.data.pv_dev, + new_lock.data.pv_offset); + if (err) + return err; /* some requests may have got through since the fsync */ fsync_dev(pe_lock_req.data.pv_dev); break; case UNLOCK_PE: - down_write(&_pe_lock); - pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = 0; - pe_lock_req.data.pv_dev = 0; - pe_lock_req.data.pv_offset = 0; - bh = _dequeue_io(); - up_write(&_pe_lock); - - /* handle all deferred io for this PE */ - _flush_io(bh); + do_pe_unlock(); break; default: @@ -1394,6 +1537,113 @@ /* + * character device support function: safe, locked PE copy (lock the source PE, bulk copy it, remap the LE, unlock); returns 0 or a negative errno + */ +static int lvm_do_pe_locked_copy(vg_t *vg_ptr, void *arg) +{ + pe_copy_req_t pe_copy_req; + int err; + lv_t *lv_ptr = NULL; + pv_t *pv_ptr = NULL; + int i; + unsigned long old_offset, new_offset; + + if (vg_ptr == NULL) return -ENXIO; + if (copy_from_user(&pe_copy_req, arg, + sizeof(pe_copy_req_t)) != 0) + return -EFAULT; + pe_copy_req.lv_name[sizeof(pe_copy_req.lv_name) - 1] = '\0'; /* name came from user space: force termination before %s and strcmp */ + printk(KERN_DEBUG __FUNCTION__ ": %d::%s %04x %d, %04x %d\n", + vg_ptr->vg_number, pe_copy_req.lv_name, + pe_copy_req.old_dev, pe_copy_req.old_pe, + pe_copy_req.new_dev, pe_copy_req.new_pe); + + /* First find the logical volume for the request... */ + + for (i = 0; i < vg_ptr->lv_max; i++) { + lv_ptr = vg_ptr->lv[i]; + if (lv_ptr != NULL && + strcmp(lv_ptr->lv_name, pe_copy_req.lv_name) == 0) + break; + } + + if (i == vg_ptr->lv_max) + return -EINVAL; + + printk(KERN_DEBUG __FUNCTION__ ": found lv %s.\n", + lv_ptr->lv_name); + + /* ... and the physical volume. 
*/ + + for (i = 0; i < vg_ptr->pv_max; i++) { + pv_ptr = vg_ptr->pv[i]; + if (pv_ptr != NULL && pv_ptr->pv_dev == pe_copy_req.old_dev) /* skip empty slots, as the lv[] scan above does */ + break; + } + + if (i == vg_ptr->pv_max) + return -EINVAL; + + printk(KERN_DEBUG __FUNCTION__ ": found pv %s(%s).\n", + kdevname(pv_ptr->pv_dev), pv_ptr->pv_name); + + /* We'll take the lock on the source extent in the LV first. We + mutex out ALL IO to the entire logical volume before doing + this, so we can be absolutely certain that there is no + outstanding IO to this PE once the lock is in place. (We + can't mutex just one PE without tracking outstanding IO on a + per-extent basis.) */ + + down_write(&lv_ptr->lv_io_sem); + err = do_pe_lock(lv_ptr->lv_dev, + pe_copy_req.old_dev, + pe_copy_req.old_pe); + up_write(&lv_ptr->lv_io_sem); + + if (err) + return err; + + /* All prep done, we can copy the bits now */ + + err = lvm_do_bulk_copy(pe_copy_req.old_dev, pe_copy_req.new_dev, + pe_copy_req.old_pe, pe_copy_req.new_pe, + vg_ptr->pe_size); + + printk(KERN_DEBUG "lvm_do_bulk_copy returned %d.\n", err); + + if (err) + goto out; /* copy failed: leave the mapping alone, just unlock */ + err = __do_le_remap(vg_ptr, lv_ptr, + pe_copy_req.old_dev, pe_copy_req.new_dev, + pe_copy_req.old_pe, pe_copy_req.new_pe); + printk(KERN_DEBUG "__do_le_remap returned %d.\n", err); + +out: + do_pe_unlock(); + return err; +} + +static int __do_le_remap(vg_t *vg_ptr, lv_t *lv_ptr, + kdev_t old_dev, kdev_t new_dev, + uint old_pe, uint new_pe) /* repoint the first LE of lv_ptr mapped to (old_dev, old_pe) at (new_dev, new_pe); returns 0, or -EINVAL if no LE matches */ +{ + uint le; + + for (le = 0; le < lv_ptr->lv_allocated_le; le++) { + if (lv_ptr->lv_current_pe[le].dev == old_dev && + lv_ptr->lv_current_pe[le].pe == old_pe) { + lv_ptr->lv_current_pe[le].dev = new_dev; + lv_ptr->lv_current_pe[le].pe = new_pe; + + __update_hardsectsize(lv_ptr); + return 0; + } + } + return -EINVAL; +} + + +/* * character device support function logical extend remap */ static int lvm_do_le_remap(vg_t *vg_ptr, void *arg) @@ -1411,21 +1661,12 @@ if (lv_ptr != NULL && strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) { - for (le = 0; le < lv_ptr->lv_allocated_le; le++) { - if 
(lv_ptr->lv_current_pe[le].dev == - le_remap_req.old_dev && - lv_ptr->lv_current_pe[le].pe == - le_remap_req.old_pe) { - lv_ptr->lv_current_pe[le].dev = - le_remap_req.new_dev; - lv_ptr->lv_current_pe[le].pe = - le_remap_req.new_pe; - - __update_hardsectsize(lv_ptr); - return 0; - } - } - return -EINVAL; + + return __do_le_remap(vg_ptr, lv_ptr, + le_remap_req.old_dev, + le_remap_req.new_dev, + le_remap_req.old_pe, + le_remap_req.new_pe); } } return -ENXIO; @@ -1881,7 +2122,8 @@ lv_ptr->lv_snapshot_hash_table_size = 0; lv_ptr->lv_snapshot_hash_mask = 0; init_rwsem(&lv_ptr->lv_lock); - + init_rwsem(&lv_ptr->lv_io_sem); + lv_ptr->lv_snapshot_use_rate = 0; vg_ptr->lv[l] = lv_ptr; --- linux/include/linux/lvm.h.~1~ Sun Nov 11 18:09:32 2001 +++ linux/include/linux/lvm.h Thu Feb 21 22:25:07 2002 @@ -135,6 +135,11 @@ #define SECTOR_SIZE 512 #endif +/* + * Number of guaranteed callback structs in case of extreme VM load: + */ +#define NR_LVM_CALLBACK 256 + /* structure version */ #define LVM_STRUCT_VERSION 1 @@ -288,6 +293,7 @@ /* physical extent */ #define PE_LOCK_UNLOCK _IOW ( 0xfe, 0x50, 1) +#define PE_LOCKED_COPY _IOW ( 0xfe, 0x51, 1) /* i/o protocol version */ #define LVM_GET_IOP_VERSION _IOR ( 0xfe, 0x98, 1) @@ -515,6 +521,8 @@ struct vg_v3 *vg; uint lv_allocated_snapshot_le; + + struct rw_semaphore lv_io_sem; /* read-held by lvm_push_callback until lvm_bh_callback; write-held around do_pe_lock in the locked PE copy */ #else char dummy[200]; #endif @@ -633,6 +641,16 @@ } pe_lock_req_t; +/* Request structure PE_LOCKED_COPY */ +typedef struct { + char lv_name[NAME_LEN]; + kdev_t old_dev; + kdev_t new_dev; + uint32_t old_pe; + uint32_t new_pe; +} pe_copy_req_t; + + /* Request structure LV_STATUS_BYNAME */ typedef struct { char lv_name[NAME_LEN];