--- linux/fs/partitions/check.c.orig Mon Jun 12 03:27:25 2000 +++ linux/fs/partitions/check.c Mon Jun 12 03:27:30 2000 @@ -33,7 +33,7 @@ #include "ibm.h" extern void device_init(void); -extern void md_setup_drive(void); +extern void md_run_setup(void); extern int *blk_size[]; extern void rd_load(void); extern void initrd_load(void); @@ -437,7 +437,7 @@ rd_load(); #endif #ifdef CONFIG_BLK_DEV_MD - autodetect_raid(); + md_run_setup(); #endif return 0; } --- linux/fs/partitions/msdos.c.orig Mon Jun 12 03:27:25 2000 +++ linux/fs/partitions/msdos.c Mon Jun 12 03:27:30 2000 @@ -438,6 +438,11 @@ continue; add_gd_partition(hd, minor, first_sector+START_SECT(p)*sector_size, NR_SECTS(p)*sector_size); +#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID + if (SYS_IND(p) == LINUX_RAID_PARTITION) { + md_autodetect_dev(MKDEV(hd->major,minor)); + } +#endif if (is_extended_partition(p)) { printk(" <"); /* --- linux/init/main.c.orig Mon Jun 12 03:27:28 2000 +++ linux/init/main.c Mon Jun 12 03:27:30 2000 @@ -750,7 +750,7 @@ if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { #ifdef CONFIG_BLK_DEV_MD - autodetect_raid(); + md_run_setup(); #endif error = change_root(real_root_dev,"/initrd"); if (error) --- linux/include/linux/raid/md.h.orig Mon Jun 12 03:27:22 2000 +++ linux/include/linux/raid/md.h Mon Jun 12 04:14:17 2000 @@ -78,7 +78,6 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_recover_arrays (void); extern int md_check_ordering (mddev_t *mddev); -extern void autodetect_raid(void); extern struct gendisk * find_gendisk (kdev_t dev); extern int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x); --- linux/include/linux/raid/md_k.h.orig Mon Jun 12 03:27:28 2000 +++ linux/include/linux/raid/md_k.h Mon Jun 12 03:27:30 2000 @@ -199,15 +199,16 @@ int sb_dirty; mdu_param_t param; int ro; - unsigned long curr_resync; - unsigned long resync_start; + unsigned long curr_resync; /* blocks scheduled */ + unsigned long 
resync_mark; /* a recent timestamp */ + unsigned long resync_mark_cnt;/* blocks written at resync_mark */ char *name; int recovery_running; struct semaphore reconfig_sem; struct semaphore recovery_sem; struct semaphore resync_sem; - atomic_t recovery_active; + atomic_t recovery_active; /* blocks scheduled, but not written */ md_wait_queue_head_t recovery_wait; struct md_list_head all_mddevs; @@ -218,11 +219,9 @@ { char *name; int (*make_request)(request_queue_t *q, mddev_t *mddev, int rw, struct buffer_head * bh); - void (*end_request)(struct buffer_head * bh, int uptodate); int (*run)(mddev_t *mddev); int (*stop)(mddev_t *mddev); int (*status)(char *page, mddev_t *mddev); - int max_invalid_dev; int (*error_handler)(mddev_t *mddev, kdev_t dev); /* --- linux/include/linux/raid/raid1.h.orig Mon Jun 12 03:27:22 2000 +++ linux/include/linux/raid/raid1.h Mon Jun 12 04:14:17 2000 @@ -9,6 +9,7 @@ kdev_t dev; int next; int sect_limit; + int head_position; /* * State bits: @@ -34,6 +35,18 @@ struct mirror_info *spare; md_spinlock_t device_lock; + /* buffer pool */ + /* buffer_heads that we have pre-allocated have b_pprev -> &freebh + * and are linked into a stack using b_next + * raid1_bh that are pre-allocated have R1BH_PreAlloc set. 
+ * All these variable are protected by device_lock + */ + struct buffer_head *freebh; + int freebh_cnt; /* how many are on the list */ + struct raid1_bh *freer1; + struct raid1_bh *freebuf; /* each bh_req has a page allocated */ + md_wait_queue_head_t wait_buffer; + /* for use when syncing mirrors: */ int start_active, start_ready, start_pending, start_future; @@ -68,12 +81,12 @@ unsigned long state; mddev_t *mddev; struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head *mirror_bh_list; struct buffer_head bh_req; - struct buffer_head *next_retry; + struct raid1_bh *next_r1; /* next for retry or in free list */ }; /* bits for raid1_bh.state */ #define R1BH_Uptodate 1 #define R1BH_SyncPhase 2 - +#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */ #endif --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/raid1.h Mon Jun 12 03:27:30 2000 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. 
- * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif --- linux/include/linux/raid5.h.orig Wed May 12 17:41:15 1999 +++ linux/include/linux/raid5.h Mon Jun 12 03:27:30 2000 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - wait_queue_head_t wait; /* processes waiting for this stripe */ -}; - -/* - * 
Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - wait_queue_head_t wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif --- linux/include/linux/genhd.h.orig Mon Jun 12 03:27:22 2000 +++ linux/include/linux/genhd.h Mon Jun 12 03:56:05 2000 @@ -50,7 +50,6 @@ struct hd_struct { long start_sect; long nr_sects; - int type; /* currently RAID or normal */ devfs_handle_t de; /* primary (master) devfs entry */ }; --- linux/drivers/block/Config.in.orig Mon Jun 12 03:27:26 2000 +++ linux/drivers/block/Config.in Mon Jun 12 03:31:11 2000 @@ -47,12 +47,11 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD dep_tristate ' Linear (append) mode' CONFIG_MD_LINEAR $CONFIG_BLK_DEV_MD dep_tristate ' RAID-0 (striping) mode' CONFIG_MD_RAID0 $CONFIG_BLK_DEV_MD -if [ 
"$CONFIG_EXPERIMENTAL" = "y" ]; then - bool ' RAID-1/RAID-5 code (DANGEROUS)' CONFIG_RAID15_DANGEROUS - if [ "$CONFIG_RAID15_DANGEROUS" = "y" ]; then - dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD - dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD - fi +dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD +if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_RAID0" = "y" -o "$CONFIG_MD_RAID1" = "y" -o "$CONFIG_MD_RAID5" = "y" ]; then + bool ' Boot support' CONFIG_MD_BOOT + bool ' Auto Detect support' CONFIG_AUTODETECT_RAID fi tristate 'RAM disk support' CONFIG_BLK_DEV_RAM dep_bool ' Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM --- linux/drivers/block/linear.c.orig Mon Jun 12 03:27:21 2000 +++ linux/drivers/block/linear.c Mon Jun 12 04:02:15 2000 @@ -148,7 +148,7 @@ return -1; } bh->b_rdev = tmp_dev->dev; - bh->b_rsector = (block - tmp_dev->offset) << 1; + bh->b_rsector = ((block - tmp_dev->offset) << 1) + (bh->b_rsector & 1); return 1; } @@ -183,17 +183,11 @@ static mdk_personality_t linear_personality= { - "linear", - linear_make_request, - NULL, - linear_run, - linear_stop, - linear_status, - 0, - NULL, - NULL, - NULL, - NULL + name: "linear", + make_request: linear_make_request, + run: linear_run, + stop: linear_stop, + status: linear_status, }; #ifndef MODULE --- linux/drivers/block/md.c.orig Mon Jun 12 03:27:26 2000 +++ linux/drivers/block/md.c Mon Jun 12 04:00:09 2000 @@ -22,10 +22,10 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - + You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include @@ -43,7 +43,7 @@ #include extern asmlinkage int sys_sched_yield(void); -extern asmlinkage long sys_setsid(void); +extern asmlinkage int sys_setsid(void); #define MAJOR_NR MD_MAJOR #define MD_DRIVER @@ -258,7 +258,7 @@ /* * The 'base' mddev is the one with data NULL. - * personalities can create additional mddevs + * personalities can create additional mddevs * if necessary. */ add_mddev_mapping(mddev, dev, 0); @@ -437,7 +437,7 @@ if (rdev->desc_nr == i) c++; } - if (c == 0) { + if (!c) { printk("md: md%d, missing disk #%d, aborting.\n", mdidx(mddev), i); goto abort; @@ -1012,7 +1012,7 @@ skip: return 0; } -#undef GETBLK_FAILED +#undef GETBLK_FAILED KERN_ERR static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) { @@ -1427,7 +1427,7 @@ break; } } - if (found) + if (found) continue; printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); @@ -1527,7 +1527,7 @@ * Do device size calculation. Bail out if too small. 
* (we have to do this after having validated chunk_size, * because device size has to be modulo chunk_size) - */ + */ persistent = !mddev->sb->not_persistent; ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) @@ -1578,7 +1578,7 @@ readahead = MD_READAHEAD; if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) readahead = mddev->sb->chunk_size * 4 * data_disks; - if (readahead < data_disks * MAX_SECTORS*512*2) + if (readahead < data_disks * MAX_SECTORS*512*2) readahead = data_disks * MAX_SECTORS*512*2; else { if (sb->level == -3) @@ -1619,7 +1619,7 @@ MD_BUG(); return -EINVAL; } - + if (mddev->pers) return -EBUSY; @@ -1631,7 +1631,7 @@ /* * Analyze all RAID superblock(s) - */ + */ if (analyze_sbs(mddev)) { MD_BUG(); return -EINVAL; @@ -1685,7 +1685,7 @@ #endif return -EINVAL; } - + if (device_size_calculation(mddev)) return -EINVAL; @@ -1700,9 +1700,9 @@ fsync_dev(rdev->dev); invalidate_buffers(rdev->dev); } - + mddev->pers = pers[pnum]; - + err = mddev->pers->run(mddev); if (err) { printk("pers->run() failed ...\n"); @@ -1719,7 +1719,7 @@ */ md_hd_struct[mdidx(mddev)].start_sect = 0; md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; - + read_ahead[MD_MAJOR] = 1024; return (0); } @@ -1732,7 +1732,7 @@ static int restart_array (mddev_t *mddev) { int err = 0; - + /* * Complain if it has no devices */ @@ -1756,7 +1756,7 @@ mddev->pers->restart_resync(mddev); } else err = -EINVAL; - + out: return err; } @@ -1768,12 +1768,12 @@ { int err = 0, resync_interrupted = 0; kdev_t dev = mddev_to_kdev(mddev); - + if (!ro && get_super(dev)) { printk (STILL_MOUNTED, mdidx(mddev)); OUT(-EBUSY); } - + if (mddev->pers) { /* * It is safe to call stop here, it only frees private @@ -1835,7 +1835,7 @@ if (ro) set_device_ro(dev, 1); } - + /* * Free resources if final stop */ @@ -2071,14 +2071,25 @@ } raid_setup_args md__initdata = { 0, 0 }; +void md_setup_drive(void) md__init; + /* * Searches all registered partitions for autorun RAID arrays * at boot time. 
*/ -void md__init autodetect_raid(void) +#ifdef CONFIG_AUTODETECT_RAID +static int detected_devices[128] md__initdata; +static int dev_cnt md__initdata=0; +void md__init md_autodetect_dev(kdev_t dev) +{ + if (dev_cnt < 127) + detected_devices[dev_cnt++] = dev; +} +#endif + +void md__init md_run_setup(void) { #ifdef CONFIG_AUTODETECT_RAID - struct gendisk *disk; mdk_rdev_t *rdev; int i; @@ -2088,36 +2099,35 @@ } printk(KERN_INFO "autodetecting RAID arrays\n"); - for (disk = gendisk_head ; disk ; disk = disk->next) { - for (i = 0; i < disk->max_p*disk->nr_real; i++) { - kdev_t dev = MKDEV(disk->major,i); - - if (disk->part[i].type != LINUX_RAID_PARTITION) - continue; + for (i=0; ifaulty) { - MD_BUG(); - continue; - } - md_list_add(&rdev->pending, &pending_raid_disks); + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; } + md_list_add(&rdev->pending, &pending_raid_disks); } autorun_devices(); #endif +#ifdef CONFIG_MD_BOOT + md_setup_drive(); +#endif + } static int get_version (void * arg) @@ -2198,41 +2208,62 @@ } #undef SET_FROM_SB -#define SET_SB(x) mddev->sb->disks[nr].x = info.x +#define SET_SB(x) mddev->sb->disks[nr].x = info->x -static int add_new_disk (mddev_t * mddev, void * arg) +static int add_new_disk (mddev_t * mddev, mdu_disk_info_t *info) { int err, size, persistent; - mdu_disk_info_t info; mdk_rdev_t *rdev; unsigned int nr; kdev_t dev; - - if (!mddev->sb) - return -EINVAL; - - if (md_copy_from_user(&info, arg, sizeof(info))) - return -EFAULT; - - nr = info.number; - if (nr >= mddev->sb->nr_disks) - return -EINVAL; - - dev = MKDEV(info.major,info.minor); + dev = MKDEV(info->major,info->minor); if (find_rdev_all(dev)) { - printk("device %s already used in a RAID array!\n", + printk("device %s already used in a RAID array!\n", 
partition_name(dev)); return -EBUSY; } + if (!mddev->sb) { + /* expecting a device which has a superblock */ + err = md_import_device(dev, 1); + if (err) { + printk("md error, md_import_device returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (mddev->nb_dev) { + mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next, + mdk_rdev_t, same_set); + if (!uuid_equal(rdev0, rdev)) { + printk("md: %s has different UUID to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("md: %s has same UUID but different superblock to %s\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + export_rdev(rdev); + return -EINVAL; + } + } + bind_rdev_to_array(rdev, mddev); + return 0; + } + + nr = info->number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; SET_SB(number); SET_SB(major); SET_SB(minor); SET_SB(raid_disk); SET_SB(state); - - if ((info.state & (1<state & (1<old_dev = dev; - rdev->desc_nr = info.number; - + rdev->desc_nr = info->number; + bind_rdev_to_array(rdev, mddev); - + persistent = !mddev->sb->not_persistent; if (!persistent) printk("nonpersistent superblock ...\n"); if (!mddev->sb->chunk_size) printk("no chunksize?\n"); - + size = calc_dev_size(dev, mddev, persistent); rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); - + if (!mddev->sb->size || (mddev->sb->size > size)) mddev->sb->size = size; } - + /* * sync all other superblocks with the main superblock */ @@ -2446,19 +2477,9 @@ return err; } -#define SET_SB(x) mddev->sb->x = info.x -static int set_array_info (mddev_t * mddev, void * arg) +#define SET_SB(x) mddev->sb->x = info->x +static int set_array_info (mddev_t * mddev, mdu_array_info_t *info) { - mdu_array_info_t info; - - if (mddev->sb) { - printk("array md%d already has a superblock!\n", - mdidx(mddev)); - return -EBUSY; - } - - if (md_copy_from_user(&info, arg, 
sizeof(info))) - return -EFAULT; if (alloc_array_sb(mddev)) return -ENOMEM; @@ -2636,11 +2657,25 @@ printk("ioctl, reason %d, cmd %d\n", err, cmd); goto abort; } - err = set_array_info(mddev, (void *)arg); - if (err) { - printk("couldnt set array info. %d\n", err); + + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + err = -EBUSY; goto abort_unlock; } + if (arg) { + mdu_array_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) { + err = -EFAULT; + goto abort_unlock; + } + err = set_array_info(mddev, &info); + if (err) { + printk("couldnt set array info. %d\n", err); + goto abort_unlock; + } + } goto done_unlock; case START_ARRAY: @@ -2671,6 +2706,11 @@ printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); goto abort; } + /* if we don't have a superblock yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */ + if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + err = -ENODEV; + goto abort_unlock; + } /* * Commands even a read-only array can execute: @@ -2691,7 +2731,10 @@ case STOP_ARRAY: err = do_md_stop (mddev, 0); - goto done; + if (err) + goto done_unlock; + else + goto done; case STOP_ARRAY_RO: err = do_md_stop (mddev, 1); @@ -2740,9 +2783,14 @@ goto done_unlock; case ADD_NEW_DISK: - err = add_new_disk(mddev, (void *)arg); + { + mdu_disk_info_t info; + if (md_copy_from_user(&info, (void*)arg, sizeof(info))) + err = -EFAULT; + else + err = add_new_disk(mddev, &info); /* pass the kernel copy: add_new_disk() now dereferences its argument directly */ goto done_unlock; - + } case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, (kdev_t)arg); goto done_unlock; @@ -2773,13 +2821,13 @@ case RUN_ARRAY: { +/* The data is never used.... 
mdu_param_t param; - err = md_copy_from_user(&param, (mdu_param_t *)arg, sizeof(param)); if (err) goto abort_unlock; - +*/ err = do_md_run (mddev); /* * we have to clean up the mess if @@ -2827,7 +2875,7 @@ open: md_open, ioctl: md_ioctl, }; - + int md_thread(void * arg) { @@ -3022,9 +3070,9 @@ static int status_resync (char * page, mddev_t * mddev) { int sz = 0; - unsigned int max_blocks, resync, res, dt, tt, et; + unsigned long max_blocks, resync, res, dt, db, rt; - resync = mddev->curr_resync; + resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); max_blocks = mddev->sb->size; /* @@ -3049,13 +3097,13 @@ /* * true resync */ - sz += sprintf(page + sz, " resync =%3u.%u%% (%u/%u)", + sz += sprintf(page + sz, " resync =%3lu.%lu%% (%lu/%lu)", res/10, res % 10, resync, max_blocks); else /* * recovery ... */ - sz += sprintf(page + sz, " recovery =%3u.%u%% (%u/%u)", + sz += sprintf(page + sz, " recovery =%3lu.%lu%% (%lu/%lu)", res/10, res % 10, resync, max_blocks); /* @@ -3063,21 +3111,18 @@ * the * 100 / 100 trick are important. We do a +1 to be * safe against division by zero. We only estimate anyway. 
* - * dt: time until now - * tt: total time - * et: estimated finish time - */ - dt = ((jiffies - mddev->resync_start) / HZ); - tt = (dt * (max_blocks / (resync/100+1)))/100; - if (tt > dt) - et = tt - dt; - else - /* - * ignore rounding effects near finish time - */ - et = 0; + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - mddev->resync_mark_cnt; + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; - sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6); + sz += sprintf(page + sz, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + sz += sprintf(page + sz, " speed=%ldK/sec", db/dt); return sz; } @@ -3103,7 +3148,7 @@ sz += sprintf(page+sz, "not set\n"); else sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); - + ITERATE_MDDEV(mddev,tmp) { sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), mddev->pers ? "" : "in"); @@ -3160,7 +3205,7 @@ if (pers[pnum]) return -EBUSY; - + pers[pnum] = p; printk(KERN_INFO "%s personality registered\n", p->name); return 0; @@ -3174,7 +3219,7 @@ printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); pers[pnum] = NULL; return 0; -} +} static mdp_disk_t *get_spare(mddev_t *mddev) { @@ -3238,13 +3283,17 @@ } } +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) int md_do_sync(mddev_t *mddev, mdp_disk_t *spare) { mddev_t *mddev2; unsigned int max_blocks, currspeed, j, window, err, serialize; kdev_t read_disk = mddev_to_kdev(mddev); - unsigned long starttime; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; struct md_list_head *tmp; unsigned long last_check; @@ -3289,8 +3338,13 @@ current->priority = 1; is_mddev_idle(mddev); /* this also initializes IO event counters */ - starttime = jiffies; - mddev->resync_start = starttime; + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = 0; + } + last_mark = 0; 
+ mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; /* * Tune reconstruction: @@ -3303,12 +3357,7 @@ last_check = 0; for (j = 0; j < max_blocks;) { int blocks; - if (j) - mddev->curr_resync = j; -/* wait_event(mddev->recovery_wait, - atomic_read(&mddev->recovery_active) < window); -*/ blocks = mddev->pers->sync_request(mddev, j); if (blocks < 0) { @@ -3317,12 +3366,24 @@ } atomic_add(blocks, &mddev->recovery_active); j += blocks; + mddev->curr_resync = j; if (last_check + window > j) continue; run_task_queue(&tq_disk); //?? + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + if (md_signal_pending(current)) { /* @@ -3347,7 +3408,8 @@ if (md_need_resched(current)) schedule(); - currspeed = j/((jiffies-starttime)/HZ + 1) + 1; + currspeed = (j-mddev->resync_mark_cnt)/((jiffies-mddev->resync_mark)/HZ +1) +1; + if (currspeed > sysctl_speed_limit_min) { current->priority = 1; @@ -3435,9 +3497,9 @@ if (disk_faulty(spare)) mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_INACTIVE); - if (err == -EINTR) { + if (err == -EINTR || err == -ENOMEM) { /* - * Recovery got interrupted ... + * Recovery got interrupted, or ran out of mem ... * signal back that we have finished using the array. 
*/ mddev->pers->diskop(mddev, &spare, @@ -3605,6 +3667,173 @@ md_geninit(); return (0); } + +#ifdef CONFIG_MD_BOOT +#define MAX_MD_BOOT_DEVS 8 +struct { + unsigned long set; + int pers[MAX_MD_BOOT_DEVS]; + int chunk[MAX_MD_BOOT_DEVS]; + kdev_t devices[MAX_MD_BOOT_DEVS][MAX_REAL]; +} md_setup_args md__initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistent-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + */ +extern kdev_t name_to_kdev_t(char *line) md__init; +static int md__init md_setup(char *str) +{ + int minor, level, factor, fault, i=0; + kdev_t device; + char *devnames, *pername = ""; + + if(get_option(&str, &minor) != 2) { /* MD Number */ + printk("md: Too few arguments supplied to md=.\n"); + return 0; + } + if (minor < 0 || minor >= MAX_MD_BOOT_DEVS) { /* get_option() accepts negative numbers (cf. level -1), so bound both ends before indexing */ + printk ("md: Minor device number too high.\n"); + return 0; + } else if (md_setup_args.set & (1 << minor)) { + printk ("md: Warning - md=%d,... has been specified twice;\n" + " will discard the first definition.\n", minor); + } + switch(get_option(&str, &level)) { /* RAID Personality */ + case 2: /* could be 0 or -1.. 
*/ + if (level == 0 || level == -1) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk("md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args.pers[minor] = level; + md_setup_args.chunk[minor] = 1 << (factor+12); + switch(level) { + case -1: + level = LINEAR; + pername = "linear"; + break; + case 0: + level = RAID0; + pername = "raid0"; + break; + default: + printk ("md: The kernel has not been configured for raid%d" + " support!\n", level); + return 0; + } + md_setup_args.pers[minor] = level; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + md_setup_args.devices[minor][i++] = level; + /* FALL THROUGH */ + case 0: + md_setup_args.pers[minor] = 0; + pername="super-block"; + } + devnames = str; + for (; isb->nr_disks++; + mddev->sb->raid_disks++; + mddev->sb->active_disks++; + mddev->sb->working_disks++; + err = add_new_disk (mddev, &dinfo); + } + } else { + /* persistent */ + for (i = 0; (dev = md_setup_args.devices[minor][i]); i++) { + dinfo.major = MAJOR(dev); + dinfo.minor = MINOR(dev); + add_new_disk (mddev, &dinfo); + } + } + if (!err) + err = do_md_run(mddev); + if (err) { + mddev->sb_dirty = 0; + do_md_stop(mddev, 0); + printk("md: starting md%d failed\n", minor); + } + } +} + +__setup("md=", md_setup); +#endif + MD_EXPORT_SYMBOL(md_size); MD_EXPORT_SYMBOL(register_md_personality); --- linux/drivers/block/raid0.c.orig Mon Jun 12 03:27:21 2000 +++ linux/drivers/block/raid0.c Mon Jun 12 03:27:30 2000 @@ -325,17 +325,11 @@ static mdk_personality_t raid0_personality= { - "raid0", - raid0_make_request, - NULL, /* no special end_request */ - raid0_run, - raid0_stop, - raid0_status, - 0, - NULL, /* no error_handler */ - NULL, /* no diskop */ - NULL, /* no stop resync */ - NULL /* no restart resync */ + name: "raid0", + make_request: raid0_make_request, + run: raid0_run, + stop: raid0_stop, + status: raid0_status, }; #ifndef MODULE --- 
linux/drivers/block/raid1.c.orig Mon Jun 12 03:27:26 2000 +++ linux/drivers/block/raid1.c Mon Jun 12 04:24:26 2000 @@ -7,6 +7,11 @@ * * RAID-1 management functions. * + * Better read-balancing code written by Mika Kuoppala , 2000 + * + * Fixes to reconstruction by Jakob Østergaard" + * Various fixes by Neil Brown + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) @@ -26,10 +31,7 @@ #define MD_DRIVER #define MD_PERSONALITY -#define MAX_LINEAR_SECTORS 128 - -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX_WORK_PER_DISK 128 /* * The following can be used to debug the driver @@ -41,48 +43,256 @@ #define inline #define __inline__ #else -#define inline -#define __inline__ #define PRINTK(x...) do { } while (0) #endif static mdk_personality_t raid1_personality; static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; -struct buffer_head *raid1_retry_list = NULL, **raid1_retry_tail; +struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail; -static void * raid1_kmalloc (int size) +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) { - void * ptr; - /* - * now we are rather fault tolerant than nice, but - * there are a couple of places in the RAID code where we - * simply can not afford to fail an allocation because - * there is no failure return path (eg. make_request()) - */ - while (!(ptr = kmalloc (size, GFP_KERNEL))) - printk ("raid1: out of memory, retrying...\n"); + /* return a linked list of "cnt" struct buffer_heads. 
+ * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = (struct buffer_head *)kmalloc(sizeof(struct buffer_head), GFP_KERNEL); + if (t) { + memset(t, 0, sizeof(*t)); + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("waiting for %d bh\n", cnt); + wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt); + } + } + return bh; +} - memset(ptr, 0, size); - return ptr; +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + md_spin_lock_irq(&conf->device_lock); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kfree(t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + md_spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_buffer); } -static struct page * raid1_gfp (void) +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) { - struct page *page; - /* - * now we are rather fault tolerant than nice, but - * there are a couple of places in the RAID code where we - * simply can not afford to fail an allocation because - * there is no failure return path (eg. make_request()) - * FIXME: be nicer here. 
- */ - while (!(page = (void*)alloc_page(GFP_KERNEL))) { - printk ("raid1: GFP out of memory, retrying...\n"); - schedule_timeout(2); + /* allocate cnt buffer_heads, possibly less if kmalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmalloc(sizeof(*bh), GFP_KERNEL); + if (!bh) break; + memset(bh, 0, sizeof(*bh)); + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static int raid1_shrink_bh(raid1_conf_t *conf, int cnt) +{ + /* discard cnt buffer_heads, if we can find them */ + int i = 0; + + md_spin_lock_irq(&conf->device_lock); + while ((i < cnt) && conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kfree(bh); + i++; + conf->freebh_cnt--; } + md_spin_unlock_irq(&conf->device_lock); + return i; +} + - return page; +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + r1_bh->next_r1 = NULL; + r1_bh->state = (1 << R1BH_PreAlloc); /* reset state but keep the pool marker, else raid1_free_r1bh() kfree()s this entry and the reserve drains */ + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), + GFP_KERNEL); + if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + wait_event(conf->wait_buffer, conf->freer1); + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + md_spin_lock_irq(&conf->device_lock); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); 
+} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + while (i < cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + + md_spin_lock_irq(&conf->device_lock); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + md_spin_lock_irq(&conf->device_lock); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + md_spin_unlock_irq(&conf->device_lock); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + /* GFP_KERNEL allocations may sleep: allocate unlocked, take device_lock only to link into freebuf */ + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = (char *) page_address(page); + md_spin_lock_irq(&conf->device_lock); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + 
md_spin_unlock_irq(&conf->device_lock); + i++; + } + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freebuf) { + struct raid1_bh *r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); } static int raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) @@ -106,19 +316,18 @@ return (-1); } -static void raid1_reschedule_retry (struct buffer_head *bh) +static void raid1_reschedule_retry (struct raid1_bh *r1_bh) { unsigned long flags; - struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); mddev_t *mddev = r1_bh->mddev; raid1_conf_t *conf = mddev_to_conf(mddev); md_spin_lock_irqsave(&retry_list_lock, flags); if (raid1_retry_list == NULL) raid1_retry_tail = &raid1_retry_list; - *raid1_retry_tail = bh; - raid1_retry_tail = &r1_bh->next_retry; - r1_bh->next_retry = NULL; + *raid1_retry_tail = r1_bh; + raid1_retry_tail = &r1_bh->next_r1; + r1_bh->next_r1 = NULL; md_spin_unlock_irqrestore(&retry_list_lock, flags); md_wakeup_thread(conf->thread); } @@ -166,7 +375,7 @@ test_bit(R1BH_SyncPhase, &r1_bh->state)); bh->b_end_io(bh, uptodate); - kfree(r1_bh); + raid1_free_r1bh(r1_bh); } void raid1_end_request (struct buffer_head *bh, int uptodate) { @@ -176,7 +385,7 @@ * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) - md_error (bh->b_dev, bh->b_rdev); + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); else /* * Set R1BH_Uptodate in our master buffer_head, so that @@ -208,7 +417,7 @@ */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry(bh); + raid1_reschedule_retry(r1_bh); return; } @@ -219,29 +428,128 @@ * already. 
*/ - if (atomic_dec_and_test(&r1_bh->remaining)) { - int i, disks = MD_SB_DISKS; + if (atomic_dec_and_test(&r1_bh->remaining)) + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); +} - for ( i = 0; i < disks; i++) { - struct buffer_head *bh = r1_bh->mirror_bh[i]; - if (bh) { - // FIXME: make us a regular bcache member - kfree(bh); - } +/* + * This routine returns the disk from which the requested read should + * be done. It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const long this_sector = bh->b_blocknr * sectors; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (conf->resync_mirrors) + goto rb_out; + + if (conf->working_disks < 2) { + int i = 0; + + while( !conf->mirrors[new_disk].operational && + (i < MD_SB_DISKS) ) { + new_disk = conf->mirrors[new_disk].next; + i++; } + + if (i >= MD_SB_DISKS) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + } + + goto rb_out; + } - raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a chance. 
+ * This is for kicking those idling disks so that + * they would find work near some hotspot. + */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + + /* Step to the next readable mirror. Advance before testing and + * stop after one lap of the ring so the search always terminates. */ + do { + new_disk = conf->mirrors[new_disk].next; + if (new_disk == disk) + break; + } while (conf->mirrors[new_disk].write_only || + !conf->mirrors[new_disk].operational); + + goto rb_out; } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + while( conf->mirrors[disk].next != conf->last_used ) { + disk = conf->mirrors[disk].next; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; } static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw, struct buffer_head * bh) { raid1_conf_t *conf = mddev_to_conf(mddev); - struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; + struct buffer_head *bh_req, *bhl; struct raid1_bh * r1_bh; int disks = MD_SB_DISKS; - int i, sum_bhs = 0, switch_disks = 0, sectors; + int i, sum_bhs = 0, sectors; struct mirror_info *mirror; DECLARE_WAITQUEUE(wait, current); @@ -279,8 +587,7 @@ return 0; } } - r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); - + r1_bh = raid1_alloc_r1bh (conf); spin_lock_irq(&conf->segment_lock); wait_event_lock_irq(conf->wait_done, @@ -305,41 +612,20 @@ r1_bh->master_bh = bh; r1_bh->mddev = mddev; r1_bh->cmd = rw; - bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); + sectors = bh->b_size >> 9; if (rw == READ) { - int last_used = conf->last_used; - /* * read balancing logic: */ - mirror = conf->mirrors + 
last_used; - bh->b_rdev = mirror->dev; - sectors = bh->b_size >> 9; - - switch_disks = 0; - if (bh->b_blocknr * sectors == conf->next_sect) { - conf->sect_count += sectors; - if (conf->sect_count >= mirror->sect_limit) - switch_disks = 1; - } else - switch_disks = 1; - conf->next_sect = (bh->b_blocknr + 1) * sectors; - /* - * Do not switch disks if full resync is in progress ... - */ - if (switch_disks && !conf->resync_mirrors) { - conf->sect_count = 0; - last_used = conf->last_used = mirror->next; - /* - * Do not switch to write-only disks ... - * reconstruction is in progress - */ - while (conf->mirrors[last_used].write_only) - conf->last_used = conf->mirrors[last_used].next; - } + mirror = conf->mirrors + raid1_read_balance(conf, bh); + bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); + bh_req->b_blocknr = bh->b_rsector * sectors; + bh_req->b_dev = mirror->dev; + bh_req->b_rdev = mirror->dev; + /* bh_req->b_rsector = bh->n_rsector; */ bh_req->b_end_io = raid1_end_request; bh_req->b_dev_id = r1_bh; q = blk_get_queue(bh_req->b_rdev); @@ -351,15 +637,11 @@ * WRITE: */ + bhl = raid1_alloc_bh(conf, conf->raid_disks); for (i = 0; i < disks; i++) { - - if (!conf->mirrors[i].operational) { - /* - * the r1_bh->mirror_bh[i] pointer remains NULL - */ - mirror_bh[i] = NULL; + struct buffer_head *mbh; + if (!conf->mirrors[i].operational) continue; - } /* * We should use a private pool (size depending on NR_REQUEST), @@ -373,30 +655,38 @@ * manner in the write branch. 
Look how we lock the buffer at the * beginning of this function to grok the difference ;) */ - mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); - mirror_bh[i]->b_this_page = (struct buffer_head *)1; + mbh = bhl; + if (mbh == NULL) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_next = NULL; + mbh->b_this_page = (struct buffer_head *)1; /* - * prepare mirrored bh (fields ordered for max mem throughput): + * prepare mirrored mbh (fields ordered for max mem throughput): */ - mirror_bh[i]->b_blocknr = bh->b_blocknr; - mirror_bh[i]->b_dev = bh->b_dev; - mirror_bh[i]->b_rdev = conf->mirrors[i].dev; - mirror_bh[i]->b_rsector = bh->b_rsector; - mirror_bh[i]->b_state = (1<b_blocknr = bh->b_rsector * sectors; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_rsector; + mbh->b_state = (1<b_count, 1); - mirror_bh[i]->b_size = bh->b_size; - mirror_bh[i]->b_data = bh->b_data; - mirror_bh[i]->b_list = BUF_LOCKED; - mirror_bh[i]->b_end_io = raid1_end_request; - mirror_bh[i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; + atomic_set(&mbh->b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = raid1_end_request; + mbh->b_dev_id = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; sum_bhs++; } - + if (bhl) raid1_free_bh(conf,bhl); md_atomic_set(&r1_bh->remaining, sum_bhs); /* @@ -410,12 +700,12 @@ * all requests finish until we had a chance to set up the * semaphore correctly ... lots of races). 
*/ - for (i = 0; i < disks; i++) { - struct buffer_head *mbh = mirror_bh[i]; - if (mbh) { - q = blk_get_queue(mbh->b_rdev); - generic_make_request(q, rw, mbh); - } + bh = r1_bh->mirror_bh_list; + while(bh) { + struct buffer_head *bh2 = bh; + bh = bh->b_next; + q = blk_get_queue(bh2->b_rdev); + generic_make_request(q, rw, bh2); } return (0); } @@ -792,6 +1082,7 @@ adisk->write_only = 0; adisk->spare = 1; adisk->used_slot = 1; + adisk->head_position = 0; conf->nr_disks++; break; @@ -803,6 +1094,10 @@ } abort: md_spin_unlock_irq(&conf->device_lock); + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + print_raid1_conf(conf); return err; } @@ -836,19 +1131,19 @@ for (;;) { md_spin_lock_irqsave(&retry_list_lock, flags); - bh = raid1_retry_list; - if (!bh) + r1_bh = raid1_retry_list; + if (!r1_bh) break; - r1_bh = (struct raid1_bh *)(bh->b_dev_id); - raid1_retry_list = r1_bh->next_retry; + raid1_retry_list = r1_bh->next_r1; md_spin_unlock_irqrestore(&retry_list_lock, flags); - mddev = kdev_to_mddev(bh->b_dev); + mddev = r1_bh->mddev; if (mddev->sb_dirty) { printk(KERN_INFO "dirty sb detected, updating.\n"); mddev->sb_dirty = 0; md_update_sb(mddev); } + bh = &r1_bh->bh_req; switch(r1_bh->cmd) { case SPECIAL: /* have to allocate lots of bh structures and @@ -857,69 +1152,74 @@ if (test_bit(R1BH_Uptodate, &r1_bh->state)) { int i, sum_bhs = 0; int disks = MD_SB_DISKS; - struct buffer_head *mirror_bh[MD_SB_DISKS]; + struct buffer_head *bhl, *mbh; raid1_conf_t *conf; + int sectors = bh->b_size >> 9; conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ for (i = 0; i < disks ; i++) { - if (!conf->mirrors[i].operational) { - mirror_bh[i] = NULL; + if (!conf->mirrors[i].operational) continue; - } - if (i==conf->last_used) { + if (i==conf->last_used) /* we read from here, no need to write */ - mirror_bh[i] = NULL; continue; 
- } if (i < conf->raid_disks - && !conf->resync_mirrors) { + && !conf->resync_mirrors) /* don't need to write this, * we are just rebuilding */ - mirror_bh[i] = NULL; continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; - mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); - mirror_bh[i]->b_this_page = (struct buffer_head *)1; /* * prepare mirrored bh (fields ordered for max mem throughput): */ - mirror_bh[i]->b_blocknr = bh->b_blocknr; - mirror_bh[i]->b_dev = bh->b_dev; - mirror_bh[i]->b_rdev = conf->mirrors[i].dev; - mirror_bh[i]->b_rsector = bh->b_rsector; - mirror_bh[i]->b_state = (1<b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rdev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr * sectors; + mbh->b_state = (1<b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_dev_id = r1_bh; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; - atomic_set(&mirror_bh[i]->b_count, 1); - mirror_bh[i]->b_size = bh->b_size; - mirror_bh[i]->b_data = bh->b_data; - mirror_bh[i]->b_list = BUF_LOCKED; - mirror_bh[i]->b_end_io = end_sync_write; - mirror_bh[i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; sum_bhs++; } md_atomic_set(&r1_bh->remaining, sum_bhs); - for ( i = 0; i < disks ; i++) { - struct buffer_head *mbh = mirror_bh[i]; - if (mbh) { - q = blk_get_queue(mbh->b_rdev); - generic_make_request(q, WRITE, mbh); - drive_stat_acct(mbh->b_rdev, WRITE, -bh->b_size/512, 0); - } + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + while (mbh) { + struct buffer_head *bh1 = mbh; + mbh = mbh->b_next; + q = blk_get_queue(bh1->b_rdev); + generic_make_request(q, WRITE, bh1); + drive_stat_acct(bh1->b_rdev, WRITE, -bh1->b_size/512, 0); } } else { - dev = bh->b_rdev; - raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); 
- if (bh->b_rdev == dev) { + dev = bh->b_dev; + raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); + if (bh->b_dev == dev) { printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); md_done_sync(mddev, bh->b_size>>10, 0); } else { printk (REDIRECT_SECTOR, partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; q = blk_get_queue(bh->b_rdev); generic_make_request (q, READ, bh); } @@ -928,15 +1228,16 @@ break; case READ: case READA: - dev = bh->b_rdev; + dev = bh->b_dev; - raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); - if (bh->b_rdev == dev) { + raid1_map (mddev, &bh->b_dev, bh->b_size >> 9); + if (bh->b_dev == dev) { printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); raid1_end_bh_io(r1_bh, 0); } else { printk (REDIRECT_SECTOR, partition_name(bh->b_dev), bh->b_blocknr); + bh->b_rdev = bh->b_dev; q = blk_get_queue(bh->b_rdev); generic_make_request (q, r1_bh->cmd, bh); } @@ -962,15 +1263,37 @@ if (conf->resync_mirrors == 2) return; down(&mddev->recovery_sem); - if (md_do_sync(mddev, NULL)) { - up(&mddev->recovery_sem); - return; + if (!md_do_sync(mddev, NULL)) { + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; } - /* - * Only if everything went Ok. + + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 */ - conf->resync_mirrors = 0; + /* this is really needed when recovery stops too... 
*/ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = mddev->sb->size+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); + up(&mddev->recovery_sem); + raid1_shrink_buffers(conf); } /* @@ -1033,12 +1356,19 @@ spin_lock_irq(&conf->segment_lock); if (!block_nr) { /* initialize ...*/ + int buffs; conf->start_active = 0; conf->start_ready = 0; conf->start_pending = 0; conf->start_future = 0; conf->phase = 0; - conf->window = 128; + /* we want enough buffers to hold twice the window of 128*/ + buffs = 128 *2 / (PAGE_SIZE>>9); + buffs = raid1_grow_buffers(conf, buffs); + if (buffs < 2) + goto nomem; + + conf->window = buffs*(PAGE_SIZE>>9)/2; conf->cnt_future += conf->cnt_done+conf->cnt_pending; conf->cnt_done = conf->cnt_pending = 0; if (conf->cnt_ready || conf->cnt_active) @@ -1058,7 +1388,7 @@ conf->start_ready = conf->start_pending; conf->start_pending = conf->start_future; conf->start_future = conf->start_future+conf->window; - // Note: falling of the end is not a problem + // Note: falling off the end is not a problem conf->phase = conf->phase ^1; conf->cnt_active = conf->cnt_ready; conf->cnt_ready = 0; @@ -1076,12 +1406,11 @@ */ mirror = conf->mirrors+conf->last_used; - r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); + r1_bh = raid1_alloc_buf (conf); r1_bh->master_bh = NULL; r1_bh->mddev = mddev; r1_bh->cmd = SPECIAL; bh = &r1_bh->bh_req; - 
memset(bh, 0, sizeof(*bh)); bh->b_blocknr = block_nr; bsize = 1024; @@ -1092,11 +1421,15 @@ } bh->b_size = bsize; bh->b_list = BUF_LOCKED; - bh->b_dev = mddev_to_kdev(mddev); + bh->b_dev = mirror->dev; bh->b_rdev = mirror->dev; bh->b_state = (1<b_page = raid1_gfp(); - bh->b_data = (char *) page_address(bh->b_page); + if (!bh->b_page) + BUG(); + if (!bh->b_data) + BUG(); + if (bh->b_data != (char *) page_address(bh->b_page)) + BUG(); bh->b_end_io = end_sync_read; bh->b_dev_id = (void *) r1_bh; bh->b_rsector = block_nr<<1; @@ -1107,6 +1440,11 @@ drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0); return (bsize >> 10); + +nomem: + raid1_shrink_buffers(conf); + spin_unlock_irq(&conf->segment_lock); + return -ENOMEM; } static void end_sync_read(struct buffer_head *bh, int uptodate) @@ -1118,10 +1456,10 @@ * We don't do much here, just schedule handling by raid1d */ if (!uptodate) - md_error (bh->b_dev, bh->b_rdev); + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); else set_bit(R1BH_Uptodate, &r1_bh->state); - raid1_reschedule_retry(bh); + raid1_reschedule_retry(r1_bh); } static void end_sync_write(struct buffer_head *bh, int uptodate) @@ -1129,22 +1467,12 @@ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); if (!uptodate) - md_error (bh->b_dev, bh->b_rdev); + md_error (mddev_to_kdev(r1_bh->mddev), bh->b_dev); if (atomic_dec_and_test(&r1_bh->remaining)) { - int i, disks = MD_SB_DISKS; mddev_t *mddev = r1_bh->mddev; - unsigned long sect = bh->b_rsector; + unsigned long sect = bh->b_blocknr * (bh->b_size>>9); int size = bh->b_size; - - free_page((unsigned long)bh->b_data); - for ( i = 0; i < disks; i++) { - struct buffer_head *bh = r1_bh->mirror_bh[i]; - if (bh) { - // FIXME: make us a regular bcache member - kfree(bh); - } - } - kfree(r1_bh); + raid1_free_buf(r1_bh); sync_request_done(sect, mddev_to_conf(mddev)); md_done_sync(mddev,size>>10, uptodate); } @@ -1278,12 +1606,13 @@ * should be freed in raid1_stop()] */ - conf = 
raid1_kmalloc(sizeof(raid1_conf_t)); + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(MEM_ERROR, mdidx(mddev)); goto out; } + memset(conf, 0, sizeof(*conf)); ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) { @@ -1306,11 +1635,12 @@ disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; - disk->sect_limit = MAX_LINEAR_SECTORS; + disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 0; disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; + disk->head_position = 0; continue; } if (disk_active(descriptor)) { @@ -1337,11 +1667,12 @@ disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; - disk->sect_limit = MAX_LINEAR_SECTORS; + disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 1; disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; + disk->head_position = 0; conf->working_disks++; } else { /* @@ -1351,27 +1682,43 @@ disk->number = descriptor->number; disk->raid_disk = disk_idx; disk->dev = rdev->dev; - disk->sect_limit = MAX_LINEAR_SECTORS; + disk->sect_limit = MAX_WORK_PER_DISK; disk->operational = 0; disk->write_only = 0; disk->spare = 1; disk->used_slot = 1; + disk->head_position = 0; } } - if (!conf->working_disks) { - printk(NONE_OPERATIONAL, mdidx(mddev)); - goto out_free_conf; - } - conf->raid_disks = sb->raid_disks; conf->nr_disks = sb->nr_disks; conf->mddev = mddev; conf->device_lock = MD_SPIN_LOCK_UNLOCKED; conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); init_waitqueue_head(&conf->wait_done); init_waitqueue_head(&conf->wait_ready); + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. 
+ * For now, try 16 r1bh and 16*raid_disks bufferheads + * This will allow at least 16 concurrent reads or writes + * even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, 16) < 16 || + raid1_grow_bh(conf, 16*conf->raid_disks)< 16*conf->raid_disks) { + printk(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } for (i = 0; i < MD_SB_DISKS; i++) { @@ -1390,6 +1737,7 @@ disk->write_only = 0; disk->spare = 0; disk->used_slot = 1; + disk->head_position = 0; } } @@ -1477,6 +1825,9 @@ return 0; out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf, conf->freebh_cnt); + raid1_shrink_buffers(conf); kfree(conf); mddev->private = NULL; out: @@ -1505,29 +1856,6 @@ conf->resync_mirrors = 2; md_interrupt_thread(conf->resync_thread); - /* this is really needed when recovery stops too... */ - spin_lock_irq(&conf->segment_lock); - wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock); - conf->start_active = conf->start_ready; - conf->start_ready = conf->start_pending; - conf->cnt_active = conf->cnt_ready; - conf->cnt_ready = 0; - wait_event_lock_irq(conf->wait_done, !conf->cnt_active, conf->segment_lock); - conf->start_active = conf->start_ready; - conf->cnt_ready = 0; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; - conf->start_future = mddev->sb->size+1; - conf->cnt_pending = conf->cnt_future; - conf->cnt_future = 0; - conf->phase = conf->phase ^1; - wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); - conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; - conf->phase = 0; - conf->cnt_done = conf->cnt_future; - conf->cnt_future = 0; - wake_up(&conf->wait_done); - printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); return 1; } @@ -1559,6 +1887,9 @@ md_unregister_thread(conf->thread); if (conf->resync_thread) 
md_unregister_thread(conf->resync_thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf, conf->freebh_cnt); + raid1_shrink_buffers(conf); kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; @@ -1567,18 +1898,16 @@ static mdk_personality_t raid1_personality= { - "raid1", - raid1_make_request, - raid1_end_request, - raid1_run, - raid1_stop, - raid1_status, - 0, - raid1_error, - raid1_diskop, - raid1_stop_resync, - raid1_restart_resync, - raid1_sync_request + name: "raid1", + make_request: raid1_make_request, + run: raid1_run, + stop: raid1_stop, + status: raid1_status, + error_handler: raid1_error, + diskop: raid1_diskop, + stop_resync: raid1_stop_resync, + restart_resync: raid1_restart_resync, + sync_request: raid1_sync_request }; int raid1_init (void) --- linux/drivers/block/raid5.c.orig Mon Jun 12 03:27:26 2000 +++ linux/drivers/block/raid5.c Mon Jun 12 05:09:31 2000 @@ -15,7 +15,7 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include + #include #include #include @@ -40,23 +40,21 @@ * The following can be used to debug the driver */ #define RAID5_DEBUG 0 -#define RAID5_PARANOIA 1 -#if defined(CONFIG_SMP) || defined(DEBUG_SPINLOCKS) -# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() -# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() +#define RAID5_PARANOIA 1 +#if RAID5_PARANOIA && CONFIG_SMP +# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() +# define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() #else -# define CHECK_DEVLOCK() 0 -# define CHECK_SHLOCK(sh) 0 +# define CHECK_DEVLOCK() do { } while (0) +# define CHECK_SHLOCK(sh) do { } while (0) #endif #if RAID5_DEBUG -#define PRINTK(x...) printk(x) +#define PRINTK(x...) printk(x) #define inline #define __inline__ #else -#define inline -#define __inline__ -#define PRINTK(x...) do { } while (0) +#define PRINTK(x...) 
do { } while (0) #endif static void print_raid5_conf (raid5_conf_t *conf); @@ -370,8 +368,8 @@ out: md_spin_unlock_irq(&conf->device_lock); PRINTK("shrink completed, nr_hashed_stripes %d, nr_pending_strips %d\n", - atomic_read(&conf->nr_hashed_stripes), - atomic_read(&conf->nr_pending_stripes)); + atomic_read(&conf->nr_hashed_stripes), + atomic_read(&conf->nr_pending_stripes)); return count; } @@ -427,7 +425,7 @@ sh = get_free_stripe(conf); if (!sh && cnt < (conf->max_nr_stripes/8)) { md_wakeup_thread(conf->thread); - PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); + PRINTK("waiting for some stripes to complete - %d %d\n", cnt, conf->max_nr_stripes/8); schedule(); } remove_wait_queue(&conf->wait_for_stripe, &wait); @@ -575,7 +573,7 @@ { struct buffer_head *bh = sh->bh_new[i]; - PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_rsector, uptodate); + PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_blocknr, uptodate); sh->bh_new[i] = NULL; raid5_free_bh(sh, sh->bh_req[i]); sh->bh_req[i] = NULL; @@ -583,7 +581,9 @@ bh->b_end_io(bh, uptodate); if (!uptodate) printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); + "block %lu\n", + partition_name(mddev_to_kdev(sh->raid_conf->mddev)), + bh->b_blocknr); } static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) @@ -605,14 +605,14 @@ md_spin_lock_irqsave(&sh->stripe_lock, flags); raid5_mark_buffer_uptodate(bh, uptodate); if (!uptodate) - md_error(bh->b_dev, bh->b_rdev); + md_error(mddev_to_kdev(conf->mddev), bh->b_dev); if (conf->failed_disks) { for (i = 0; i < disks; i++) { if (conf->disks[i].operational) continue; if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) continue; - if (bh->b_rdev != conf->disks[i].dev) + if (bh->b_dev != conf->disks[i].dev) continue; set_bit(STRIPE_ERROR, &sh->state); } @@ -628,10 +628,8 @@ static void raid5_build_block (struct 
stripe_head *sh, struct buffer_head *bh, int i) { raid5_conf_t *conf = sh->raid_conf; - mddev_t *mddev = conf->mddev; char *b_data; struct page *b_page; - kdev_t dev = mddev_to_kdev(mddev); int block = sh->sector / (sh->size >> 9); b_data = bh->b_data; @@ -639,14 +637,14 @@ memset (bh, 0, sizeof (struct buffer_head)); init_waitqueue_head(&bh->b_wait); init_buffer(bh, raid5_end_request, sh); - bh->b_dev = dev; + bh->b_dev = conf->disks[i].dev; bh->b_blocknr = block; bh->b_data = b_data; bh->b_page = b_page; bh->b_rdev = conf->disks[i].dev; - bh->b_rsector = sh->sector; + bh->b_rsector = sh->sector; bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); bh->b_size = sh->size; @@ -713,14 +711,14 @@ } /* - * Input: a 'big' sector number, + * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ static unsigned long raid5_compute_sector(int r_sector, unsigned int raid_disks, unsigned int data_disks, unsigned int * dd_idx, unsigned int * pd_idx, raid5_conf_t *conf) { - unsigned int stripe; + unsigned int stripe; int chunk_number, chunk_offset; unsigned long new_sector; int sectors_per_chunk = conf->chunk_size >> 9; @@ -775,15 +773,6 @@ * Finally, compute the new sector number */ new_sector = stripe * sectors_per_chunk + chunk_offset; - -#if 0 - if ( *dd_idx > data_disks || *pd_idx > data_disks || - chunk_offset + bh->b_size / 512 > sectors_per_chunk ) - - printk ("raid5: bug: dd_idx == %d, pd_idx == %d, chunk_offset == %d\n", - *dd_idx, *pd_idx, chunk_offset); -#endif - return new_sector; } @@ -854,9 +843,8 @@ count = 1; } } - if(count != 1) { + if (count != 1) xor_block(count, &bh_ptr[0]); - } raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); } @@ -1097,20 +1085,20 @@ if (!operational[i] && !conf->resync_parity) { PRINTK("writing spare %d\n", i); atomic_inc(&sh->nr_pending); - bh->b_rdev = conf->spare->dev; + bh->b_dev = bh->b_rdev = conf->spare->dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, WRITERAW, bh); } else 
{ #if 0 atomic_inc(&sh->nr_pending); - bh->b_rdev = conf->disks[i].dev; + bh->b_dev = bh->b_rdev = conf->disks[i].dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, WRITERAW, bh); #else if (!allclean || (i==sh->pd_idx)) { PRINTK("writing dirty %d\n", i); atomic_inc(&sh->nr_pending); - bh->b_rdev = conf->disks[i].dev; + bh->b_dev = bh->b_rdev = conf->disks[i].dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, WRITERAW, bh); } else { @@ -1156,7 +1144,7 @@ continue; lock_get_bh(sh->bh_old[i]); atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_rdev = conf->disks[i].dev; + sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; q = blk_get_queue(sh->bh_old[i]->b_rdev); generic_make_request(q, READ, sh->bh_old[i]); atomic_dec(&sh->bh_old[i]->b_count); @@ -1203,7 +1191,7 @@ raid5_build_block(sh, sh->bh_old[i], i); lock_get_bh(sh->bh_old[i]); atomic_inc(&sh->nr_pending); - sh->bh_old[i]->b_rdev = conf->disks[i].dev; + sh->bh_old[i]->b_dev = sh->bh_old[i]->b_rdev = conf->disks[i].dev; q = blk_get_queue(sh->bh_old[i]->b_rdev); generic_make_request(q, READ, sh->bh_old[i]); atomic_dec(&sh->bh_old[i]->b_count); @@ -1233,7 +1221,7 @@ #endif lock_get_bh(sh->bh_req[i]); atomic_inc(&sh->nr_pending); - sh->bh_req[i]->b_rdev = conf->disks[i].dev; + sh->bh_req[i]->b_dev = sh->bh_req[i]->b_rdev = conf->disks[i].dev; q = blk_get_queue(sh->bh_req[i]->b_rdev); generic_make_request(q, READ, sh->bh_req[i]); atomic_dec(&sh->bh_req[i]->b_count); @@ -1257,8 +1245,7 @@ * in bh_old */ PRINTK("handle_stripe_sync: sec=%lu disks=%d nr_cache=%d\n", sh->sector, disks, nr_cache); - if (nr_cache < disks-1 - || (nr_cache==disks-1 && !(parity_failed+nr_failed_other+nr_failed_overwrite)) + if ((nr_cache < disks-1) || ((nr_cache == disks-1) && !(parity_failed+nr_failed_other+nr_failed_overwrite)) ) { sh->phase = PHASE_READ_OLD; for (i = 0; i < disks; i++) { @@ -1272,7 +1259,7 @@ raid5_build_block(sh, bh, i); lock_get_bh(bh); atomic_inc(&sh->nr_pending); - bh->b_rdev = 
conf->disks[i].dev; + bh->b_dev = bh->b_rdev = conf->disks[i].dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, READ, bh); drive_stat_acct(bh->b_rdev, READ, -bh->b_size/512, 0); @@ -1302,7 +1289,7 @@ } atomic_inc(&sh->nr_pending); lock_get_bh(bh); - bh->b_rdev = conf->spare->dev; + bh->b_dev = bh->b_rdev = conf->spare->dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, WRITERAW, bh); drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0); @@ -1315,7 +1302,7 @@ } /* nr_cache == disks: - * check parity and compute/write if needed + * check parity and compute/write if needed */ compute_parity(sh, RECONSTRUCT_WRITE); @@ -1329,13 +1316,13 @@ atomic_set_buffer_dirty(bh); lock_get_bh(bh); atomic_inc(&sh->nr_pending); - bh->b_rdev = conf->disks[pd_idx].dev; + bh->b_dev = bh->b_rdev = conf->disks[pd_idx].dev; q = blk_get_queue(bh->b_rdev); generic_make_request(q, WRITERAW, bh); drive_stat_acct(bh->b_rdev, WRITE, -bh->b_size/512, 0); atomic_dec(&bh->b_count); PRINTK("handle_stripe_sync() %lu phase WRITE, pending %d buffers\n", - sh->sector, md_atomic_read(&sh->nr_pending)); + sh->sector, md_atomic_read(&sh->nr_pending)); } } @@ -1379,8 +1366,8 @@ } if ((sh->cmd == STRIPE_WRITE && sh->phase == PHASE_WRITE) || - (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) || - (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE) + (sh->cmd == STRIPE_READ && sh->phase == PHASE_READ) || + (sh->cmd == STRIPE_SYNC && sh->phase == PHASE_WRITE) ) { /* * Completed @@ -1505,7 +1492,7 @@ raid5_conf_t *conf = (raid5_conf_t *) mddev->private; const unsigned int raid_disks = conf->raid_disks; const unsigned int data_disks = raid_disks - 1; - unsigned int dd_idx, pd_idx; + unsigned int dd_idx, pd_idx; unsigned long new_sector; struct stripe_head *sh; @@ -1513,7 +1500,7 @@ if (rw == READA) rw = READ; - new_sector = raid5_compute_sector(bh->b_blocknr*(bh->b_size>>9), + new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks, &dd_idx, &pd_idx, conf); 
PRINTK("raid5_make_request, sector %lu\n", new_sector); @@ -1572,13 +1559,12 @@ if (!conf->buffer_size) conf->buffer_size = /* device_bsize(mddev_to_kdev(mddev))*/ PAGE_SIZE; bufsize = conf->buffer_size; - /* Hmm... race on buffer_size ?? */ + /* Hmm... race on buffer_size ?? */ redone = block_nr% (bufsize>>10); block_nr -= redone; sh = get_lock_stripe(conf, block_nr<<1, bufsize); - first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk+chunk_offset, - raid_disks, data_disks, - &dd_idx, &pd_idx, conf); + first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk + + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); sh->pd_idx = pd_idx; sh->cmd = STRIPE_SYNC; sh->phase = PHASE_BEGIN; @@ -2152,7 +2138,7 @@ } /* * When we activate a spare disk we _must_ have a disk in - * the lower (active) part of the array to replace. + * the lower (active) part of the array to replace. */ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { MD_BUG(); @@ -2385,18 +2371,16 @@ static mdk_personality_t raid5_personality= { - "raid5", - raid5_make_request, - raid5_end_request, - raid5_run, - raid5_stop, - raid5_status, - 0, - raid5_error, - raid5_diskop, - raid5_stop_resync, - raid5_restart_resync, - raid5_sync_request + name: "raid5", + make_request: raid5_make_request, + run: raid5_run, + stop: raid5_stop, + status: raid5_status, + error_handler: raid5_error, + diskop: raid5_diskop, + stop_resync: raid5_stop_resync, + restart_resync: raid5_restart_resync, + sync_request: raid5_sync_request }; int raid5_init (void) --- linux/Documentation/md.txt.orig Mon Feb 2 22:07:47 1998 +++ linux/Documentation/md.txt Mon Jun 12 03:27:30 2000 @@ -1,15 +1,17 @@ -Tools that manage md devices can be found at sweet-smoke.ufr-info-p7.ibp.fr -in public/Linux/md035.tar.gz. +Tools that manage md devices can be found at + http://www..kernel.org/pub/linux/daemons/raid/.... 
- Marc ZYNGIER --- You can boot (if you selected boot support in the configuration) with your md -device with the following kernel command line: +device with the following kernel command lines: -md=<md device no.>,<raid level>,<chunk size factor>,<fault level>,dev0,dev1,...,devn +for old raid arrays without persistent superblocks: + md=<md device no.>,<raid level>,<chunk size factor>,<fault level>,dev0,dev1,...,devn +for raid arrays with persistent superblocks: + md=<md device no.>,dev0,dev1,...,devn + md device no. = the number of the md device ... 0 means md0, 1 md1, @@ -19,19 +21,16 @@ raid level = -1 linear mode 0 striped mode - other modes are currently unsupported. + other modes are only supported with persistent superblocks chunk size factor = (raid-0 and raid-1 only) - Set the chunk size as PAGE_SIZE << n. + Set the chunk size as 4k << n. -fault level = (raid-1 only) - Set the maximum fault number as n. - Currently unsupported due to lack of boot support for raid1. +fault level = totally ignored dev0-devn: e.g. /dev/hda1,/dev/hdc1,/dev/sda1,/dev/sdb1 -my loadlin line looks like this: +A possible loadlin line (Harald Hoyer) looks like this: e:\loadlin\loadlin e:\zimage root=/dev/md0 md=0,0,4,0,/dev/hdb2,/dev/hdc3 ro - Harald Hoyer --- linux/Documentation/Configure.help.orig Mon Jun 12 03:27:26 2000 +++ linux/Documentation/Configure.help Mon Jun 12 03:27:30 2000 @@ -1536,11 +1536,19 @@ If unsure, say Y. -Boot support (linear, striped) +RAID Boot support CONFIG_MD_BOOT - To boot with an initial linear or striped md device you have to - answer Y here. For lilo and loadlin options see the file - Documentation/md.txt. + To boot with an initial raid volume (any type) you can select + autodetect, or answer Y here and pass appropriate options to the kernel + at boot time. + For lilo and loadlin options see the file Documentation/md.txt. + +RAID AutoDetect support +CONFIG_AUTODETECT_RAID + An alternative to "Raid Boot support" is autodetect support. + With this selected, any partitions of type 0xFD will be considered for + inclusion in a RAID array.
Information in the RAID-superblock on + the partition will determine how it is included. Support for Acer PICA 1 chipset CONFIG_ACER_PICA_61