--- linux/drivers/block/Makefile.orig Wed Apr 12 10:43:15 2000 +++ linux/drivers/block/Makefile Wed Apr 12 11:49:31 2000 @@ -166,26 +166,12 @@ endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o M_OBJS += raid5.o - endif -endif - -ifeq ($(CONFIG_MD_TRANSLUCENT),y) -L_OBJS += translucent.o -else - ifeq ($(CONFIG_MD_TRANSLUCENT),m) - M_OBJS += translucent.o - endif -endif - -ifeq ($(CONFIG_MD_HSM),y) -L_OBJS += hsm.o -else - ifeq ($(CONFIG_MD_HSM),m) - M_OBJS += hsm.o endif endif --- linux/drivers/block/raid1.c.orig Wed Apr 12 10:43:15 2000 +++ linux/drivers/block/raid1.c Wed Apr 12 11:39:33 2000 @@ -1,6 +1,9 @@ -/************************************************************************ +/* * raid1.c : Multiple Devices driver for Linux - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman * * RAID-1 management functions. * @@ -15,50 +18,52 @@ */ #include -#include #include -#include -#include -#include +#include #include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -/* - * The following can be used to debug the driver - */ -/*#define RAID1_DEBUG*/ -#ifdef RAID1_DEBUG -#define PRINTK(x) do { printk x; } while (0); -#else -#define PRINTK(x) do { ; } while (0); -#endif +#define MAX_LINEAR_SECTORS 128 #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? (a) : (b)) -static struct md_personality raid1_personality; -static struct md_thread *raid1_thread = NULL; +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock; struct buffer_head *raid1_retry_list = NULL; -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static void * raid1_kmalloc (int size) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + void * ptr; + /* + * now we are rather fault tolerant than nice, but + * there are a couple of places in the RAID code where we + * simply can not afford to fail an allocation because + * there is no failure return path (eg. make_request()) + */ + while (!(ptr = kmalloc (size, GFP_KERNEL))) + printk ("raid1: out of memory, retrying...\n"); + + memset(ptr, 0, size); + return ptr; +} + +static int __raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. */ - PRINTK(("raid1_map().\n")); - - for (i=0; imirrors[i].operational) { - *rdev = raid_conf->mirrors[i].dev; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; return (0); } } @@ -67,29 +72,26 @@ return (-1); } -static int raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) -{ - return 0; -} - -void raid1_reschedule_retry (struct buffer_head *bh) +static void raid1_reschedule_retry (struct buffer_head *bh) { + unsigned long flags; struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); - PRINTK(("raid1_reschedule_retry().\n")); - + md_spin_lock_irqsave(&retry_list_lock, flags); r1_bh->next_retry = raid1_retry_list; raid1_retry_list = bh; - md_wakeup_thread(raid1_thread); + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); } /* - * raid1_end_buffer_io() is called when we have finished servicing a mirrored + * raid1_end_bh_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer. */ -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate) +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) { struct buffer_head *bh = r1_bh->master_bh; @@ -97,27 +99,16 @@ kfree(r1_bh); } -int raid1_one_error=0; - void raid1_end_request (struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); - unsigned long flags; - save_flags(flags); - cli(); - PRINTK(("raid1_end_request().\n")); - - if (raid1_one_error) { - raid1_one_error=0; - uptodate=0; - } /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) md_error (bh->b_dev, bh->b_rdev); - else { + else /* * Set BH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher @@ -128,7 +119,6 @@ * wait for the 'master' buffer_head. */ set_bit (BH_Uptodate, &r1_bh->state); - } /* * We split up the read and write side, imho they are @@ -136,87 +126,58 @@ */ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - - PRINTK(("raid1_end_request(), read branch.\n")); - /* * we have only one buffer_head on the read side */ if (uptodate) { - PRINTK(("raid1_end_request(), read branch, uptodate.\n")); - raid1_end_buffer_io(r1_bh, uptodate); - restore_flags(flags); + raid1_end_bh_io(r1_bh, uptodate); return; } /* * oops, read error: */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry (bh); - restore_flags(flags); + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(bh); return; } /* - * WRITE. - */ - PRINTK(("raid1_end_request(), write branch.\n")); - - /* + * WRITE: + * * Let's see if all mirrored write operations have finished - * already [we have irqs off, so we can decrease]: + * already. */ - if (!--r1_bh->remaining) { - struct md_dev *mddev = r1_bh->mddev; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + if (atomic_dec_and_test(&r1_bh->remaining)) { + int i, disks = MD_SB_DISKS; - PRINTK(("raid1_end_request(), remaining == 0.\n")); - - for ( i=0; imirror_bh[i]) kfree(r1_bh->mirror_bh[i]); + for ( i = 0; i < disks; i++) { + struct buffer_head *bh = r1_bh->mirror_bh[i]; + if (bh) { + // FIXME: make us a regular bcache member + kfree(bh); + } + } - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); } - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining)); - restore_flags(flags); } -/* This routine checks if the undelying device is an md device and in that - * case it maps the blocks before putting the request on the queue - */ -static inline void -map_and_make_request (int rw, struct buffer_head *bh) +static int raid1_make_request (request_queue_t *q, mddev_t *mddev, int rw, + struct buffer_head * bh) { - if (MAJOR (bh->b_rdev) == MD_MAJOR) - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); - clear_bit(BH_Lock, &bh->b_state); - make_request (MAJOR (bh->b_rdev), rw, bh); -} - -static int -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) -{ - - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; struct raid1_bh * r1_bh; - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0, switch_disks = 0, sectors, skip_writeback = 0; struct mirror_info *mirror; - PRINTK(("raid1_make_request().\n")); - while (!( /* FIXME: now we are rather fault tolerant than nice */ - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#1): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (r1_bh, 0, sizeof (struct raid1_bh)); + if (!buffer_locked(bh)) + BUG(); + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); /* * make_request() can abort the operation when READA is being @@ -224,61 +185,95 @@ * * Currently, just replace the command with READ/WRITE. */ - if (rw == READA) rw = READ; + if (rw == READA) + rw = READ; - if (rw == WRITE) - mark_buffer_clean(bh); /* Too early ? */ + if (rw == WRITE) { + rw = WRITERAW; + /* + * we first clean the bh, then we start the IO, then + * when the IO has finished, we end_io the bh and + * mark it uptodate. This way we do not miss the + * case when the bh got dirty again during the IO. + * + * We do an important optimization here - if the + * buffer was not dirty and we are during resync or + * reconstruction, then we can skip writing it back + * to the master disk! (we still have to write it + * back to the other disks, because we are not sync + * yet.) + */ + skip_writeback = 0; + if (atomic_set_buffer_clean(bh)) + __mark_buffer_clean(bh); + else { + if (conf->resync_mirrors) + skip_writeback = 1; + else { + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; + } + } + } -/* - * i think the read and write branch should be separated completely, since we want - * to do read balancing on the read side for example. Comments? :) --mingo - */ + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ - r1_bh->master_bh=bh; - r1_bh->mddev=mddev; + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; r1_bh->cmd = rw; + bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); - if (rw==READ || rw==READA) { - int last_used = raid_conf->last_used; - PRINTK(("raid1_make_request(), read branch.\n")); - mirror = raid_conf->mirrors + last_used; + if (rw == READ) { + int last_used = conf->last_used; + + /* + * read balancing logic: + */ + mirror = conf->mirrors + last_used; bh->b_rdev = mirror->dev; sectors = bh->b_size >> 9; - if (bh->b_blocknr * sectors == raid_conf->next_sect) { - raid_conf->sect_count += sectors; - if (raid_conf->sect_count >= mirror->sect_limit) + + switch_disks = 0; + if (bh->b_blocknr * sectors == conf->next_sect) { + conf->sect_count += sectors; + if (conf->sect_count >= mirror->sect_limit) switch_disks = 1; } else switch_disks = 1; - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors; - if (switch_disks) { - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count)); - raid_conf->sect_count = 0; - last_used = raid_conf->last_used = mirror->next; + conf->next_sect = (bh->b_blocknr + 1) * sectors; + /* + * Do not switch disks if full resync is in progress ... + */ + if (switch_disks && !conf->resync_mirrors) { + conf->sect_count = 0; + last_used = conf->last_used = mirror->next; /* - * Do not switch to write-only disks ... resyncing - * is in progress + * Do not switch to write-only disks ... + * reconstruction is in progress */ - while (raid_conf->mirrors[last_used].write_only) - raid_conf->last_used = raid_conf->mirrors[last_used].next; + while (conf->mirrors[last_used].write_only) + conf->last_used = conf->mirrors[last_used].next; } - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev))); bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; bh_req->b_dev_id = r1_bh; - map_and_make_request (rw, bh_req); + q = blk_get_queue(bh_req->b_dev); + generic_make_request (q, rw, bh_req); return 0; } /* - * WRITE. + * WRITE: */ - PRINTK(("raid1_make_request(n=%d), write branch.\n",n)); - for (i = 0; i < n; i++) { + for (i = 0; i < disks; i++) { - if (!raid_conf->mirrors [i].operational) { + if (!conf->mirrors[i].operational) { /* * the r1_bh->mirror_bh[i] pointer remains NULL */ @@ -286,89 +281,95 @@ continue; } - /* - * We should use a private pool (size depending on NR_REQUEST), - * to avoid writes filling up the memory with bhs - * - * Such pools are much faster than kmalloc anyways (so we waste almost - * nothing by not using the master bh when writing and win alot of cleanness) - * - * but for now we are cool enough. --mingo - * - * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. Look how we lock the buffer at the beginning - * of this function to grok the difference ;) - */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL) - ) ) - { - printk ("raid1_make_request(#2): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - memset (mirror_bh[i], 0, sizeof (struct buffer_head)); - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mirror_bh [i]->b_blocknr = bh->b_blocknr; - mirror_bh [i]->b_dev = bh->b_dev; - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev; - mirror_bh [i]->b_rsector = bh->b_rsector; - mirror_bh [i]->b_state = (1<b_count, 1); - mirror_bh [i]->b_size = bh->b_size; - mirror_bh [i]->b_data = bh->b_data; - mirror_bh [i]->b_list = BUF_LOCKED; - mirror_bh [i]->b_end_io = raid1_end_request; - mirror_bh [i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; - sum_bhs++; - } - - r1_bh->remaining = sum_bhs; - - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs)); - - /* - * We have to be a bit careful about the semaphore above, thats why we - * start the requests separately. Since kmalloc() could fail, sleep and - * make_request() can sleep too, this is the safer solution. Imagine, - * end_request decreasing the semaphore before we could have set it up ... - * We could play tricks with the semaphore (presetting it and correcting - * at the end if sum_bhs is not 'n' but we have to do end_request by hand - * if all requests finish until we had a chance to set up the semaphore - * correctly ... lots of races). - */ - for (i = 0; i < n; i++) - if (mirror_bh [i] != NULL) - map_and_make_request (rw, mirror_bh [i]); + /* + * special case for reconstruction ... + */ + if (skip_writeback && (i == conf->last_used)) { + mirror_bh[i] = NULL; + continue; + } + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); + mirror_bh[i]->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mirror_bh[i]->b_blocknr = bh->b_blocknr; + mirror_bh[i]->b_dev = bh->b_dev; + mirror_bh[i]->b_rdev = conf->mirrors[i].dev; + mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_state = (1<b_count, 1); + mirror_bh[i]->b_size = bh->b_size; + mirror_bh[i]->b_data = bh->b_data; + mirror_bh[i]->b_list = BUF_LOCKED; + mirror_bh[i]->b_end_io = raid1_end_request; + mirror_bh[i]->b_dev_id = r1_bh; + + r1_bh->mirror_bh[i] = mirror_bh[i]; + sum_bhs++; + } + + md_atomic_set(&r1_bh->remaining, sum_bhs); + /* + * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). + */ + for (i = 0; i < disks; i++) { + struct buffer_head *mbh = mirror_bh[i]; + if (mbh) { + q = blk_get_queue(mbh->b_dev); + generic_make_request(q, rw, mbh); + } + } return (0); } -static int raid1_status (char *page, int minor, struct md_dev *mddev) +static int raid1_status (char *page, mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); int sz = 0, i; - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_"); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) +static void unlink_disk (raid1_conf_t *conf, int target) { - int disks = raid_conf->raid_disks; - int j; + int disks = MD_SB_DISKS; + int i; - for (j = 0; j < disks; j++) - if (raid_conf->mirrors [j].next == failed_index) - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next; + for (i = 0; i < disks; i++) + if (conf->mirrors[i].next == target) + conf->mirrors[i].next = conf->mirrors[target].next; } #define LAST_DISK KERN_ALERT \ @@ -387,48 +388,53 @@ #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" -static int raid1_error (struct md_dev *mddev, kdev_t dev) +static void mark_disk_bad (mddev_t *mddev, int failed) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - int disks = raid_conf->raid_disks; - int i; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + unlink_disk(conf, failed); + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} - PRINTK(("raid1_error called\n")); +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; - if (raid_conf->working_disks == 1) { + if (conf->working_disks == 1) { /* * Uh oh, we can do nothing if this is our last disk, but * first check if this is a queued request for a device * which has just failed. */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) - if (mirror->dev == dev && !mirror->operational) + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && !mirrors[i].operational) return 0; + } printk (LAST_DISK); } else { - /* Mark disk as unusable */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) { - if (mirror->dev == dev && mirror->operational){ - mirror->operational = 0; - raid1_fix_links (raid_conf, i); - sb->disks[mirror->number].state |= - (1 << MD_FAULTY_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_SYNC_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_ACTIVE_DEVICE); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(raid1_thread); - raid_conf->working_disks--; - printk (DISK_FAILED, kdevname (dev), - raid_conf->working_disks); + /* + * Mark disk as unusable + */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && mirrors[i].operational) { + mark_disk_bad(mddev, i); + break; } } } @@ -441,156 +447,300 @@ #undef START_SYNCING /* - * This is the personality-specific hot-addition routine + * Insert the spare disk into the drive-ring */ +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) +{ + int j, next; + int disks = MD_SB_DISKS; + struct mirror_info *p = conf->mirrors; -#define NO_SUPERBLOCK KERN_ERR \ -"raid1: cannot hot-add disk to the array with no RAID superblock\n" + for (j = 0; j < disks; j++, p++) + if (p->operational && !p->write_only) { + next = p->next; + p->next = mirror->raid_disk; + mirror->next = next; + return; + } -#define WRONG_LEVEL KERN_ERR \ -"raid1: hot-add: level of disk is not RAID-1\n" + printk("raid1: bug: no read-operational devices\n"); +} + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); -#define HOT_ADD_SUCCEEDED KERN_INFO \ -"raid1: device %s hot-added\n" + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev) +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { - unsigned long flags; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - struct real_dev * realdev; - int n; + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); /* - * The device has its superblock already read and it was found - * to be consistent for generic RAID usage. Now we check whether - * it's usable for RAID-1 hot addition. + * find the disk ... */ + switch (state) { - n = mddev->nb_dev++; - realdev = &mddev->devices[n]; - if (!realdev->sb) { - printk (NO_SUPERBLOCK); - return -EINVAL; - } - if (realdev->sb->level != 1) { - printk (WRONG_LEVEL); - return -EINVAL; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - /* FIXME: are there other things left we could sanity-check? */ + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; /* - * We have to disable interrupts, as our RAID-1 state is used - * from irq handlers as well. + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) */ - save_flags(flags); - cli(); + case DISKOP_SPARE_ACTIVE: + + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; - raid_conf->raid_disks++; - mirror = raid_conf->mirrors+n; + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; - mirror->number=n; - mirror->raid_disk=n; - mirror->dev=dev; - mirror->next=0; /* FIXME */ - mirror->sect_limit=128; + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } - mirror->operational=0; - mirror->spare=1; - mirror->write_only=0; + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE); - sb->nr_disks++; - sb->spare_disks++; + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } - restore_flags(flags); + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } - md_update_sb(MINOR(dev)); + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev)); + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); - return 0; -} + *d = failed_desc; -#undef NO_SUPERBLOCK -#undef WRONG_LEVEL -#undef HOT_ADD_SUCCEEDED + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); -/* - * Insert the spare disk into the drive-ring - */ -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror) -{ - int j, next; - struct mirror_info *p = raid_conf->mirrors; + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ - for (j = 0; j < raid_conf->raid_disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; } - printk("raid1: bug: no read-operational devices\n"); + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid1_conf(conf); + return err; } -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, - int state) -{ - int i = 0, failed_disk = -1; - struct raid1_data *raid_conf = mddev->private; - struct mirror_info *mirror = raid_conf->mirrors; - md_descriptor_t *descriptor; - unsigned long flags; - for (i = 0; i < MD_SB_DISKS; i++, mirror++) { - if (mirror->spare && mirror->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks; - i++, mirror++) - if (!mirror->operational) - failed_disk = i; +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" - save_flags(flags); - cli(); - switch (state) { - case SPARE_WRITE: - mirror->operational = 1; - mirror->write_only = 1; - raid_conf->raid_disks = MAX(raid_conf->raid_disks, - mirror->raid_disk + 1); - break; - case SPARE_INACTIVE: - mirror->operational = 0; - mirror->write_only = 0; - break; - case SPARE_ACTIVE: - mirror->spare = 0; - mirror->write_only = 0; - raid_conf->working_disks++; - add_ring(raid_conf, mirror); - - if (failed_disk != -1) { - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number]; - i = spare->raid_disk; - spare->raid_disk = descriptor->raid_disk; - descriptor->raid_disk = i; - } - break; - default: - printk("raid1_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; - } - restore_flags(flags); - return 0; -} +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" /* * This is a kernel thread which: @@ -598,62 +748,96 @@ * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. */ -void raid1d (void *data) +static void raid1d (void *data) { + struct raid1_bh *r1_bh; struct buffer_head *bh; - kdev_t dev; unsigned long flags; - struct raid1_bh * r1_bh; - struct md_dev *mddev; + request_queue_t *q; + mddev_t *mddev; + kdev_t dev; - PRINTK(("raid1d() active\n")); - save_flags(flags); - cli(); - while (raid1_retry_list) { + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); bh = raid1_retry_list; + if (!bh) + break; r1_bh = (struct raid1_bh *)(bh->b_dev_id); raid1_retry_list = r1_bh->next_retry; - restore_flags(flags); + md_spin_unlock_irqrestore(&retry_list_lock, flags); - mddev = md_dev + MINOR(bh->b_dev); + mddev = kdev_to_mddev(bh->b_dev); if (mddev->sb_dirty) { - printk("dirty sb detected, updating.\n"); + printk(KERN_INFO "dirty sb detected, updating.\n"); mddev->sb_dirty = 0; - md_update_sb(MINOR(bh->b_dev)); + md_update_sb(mddev); } dev = bh->b_rdev; - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); + + __raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); if (bh->b_rdev == dev) { - printk (KERN_ALERT - "raid1: %s: unrecoverable I/O read error for block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_end_buffer_io(r1_bh, 0); + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); } else { - printk (KERN_ERR "raid1: %s: redirecting sector %lu to another mirror\n", - kdevname(bh->b_dev), bh->b_blocknr); - map_and_make_request (r1_bh->cmd, bh); + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + q = blk_get_queue(bh->b_dev); + generic_make_request (q, r1_bh->cmd, bh); } - cli(); } - restore_flags(flags); + md_spin_unlock_irqrestore(&retry_list_lock, flags); } +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + return; + } + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + up(&mddev->recovery_sem); +} + /* * This will catch the scenario in which one of the mirrors was * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method. */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid1_data *raid_conf = mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; kdev_t dev; struct buffer_head *bh = NULL; int i, rc = 0; char *buffer = NULL; - for (i = 0; i < raid_conf->raid_disks; i++) { - if (!raid_conf->mirrors[i].operational) + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) continue; - dev = raid_conf->mirrors[i].dev; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; set_blocksize(dev, 4096); if ((bh = bread(dev, row / 4, 4096)) == NULL) break; @@ -682,170 +866,346 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid1_run (int minor, struct md_dev *mddev) +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) { - struct raid1_data *raid_conf; - int i, j, raid_disk; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - kdevname(MKDEV(MD_MAJOR, minor)), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - /**** - * copy the now verified devices into our private RAID1 bookkeeping - * area. [whatever we allocate in raid1_run(), should be freed in - * raid1_stop()] + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL) - ) ) - { - printk ("raid1_run(): out of memory\n"); - current->policy |= SCHED_YIELD; - schedule(); - } - raid_conf = mddev->private; - memset(raid_conf, 0, sizeof(*raid_conf)); - - PRINTK(("raid1_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); + conf = raid1_kmalloc(sizeof(raid1_conf_t)); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); continue; } - - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. - */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev)); + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev)); + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); continue; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); continue; } - if (raid_conf->mirrors[raid_disk].operational) { - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); continue; } - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk); - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].operational = 1; - raid_conf->mirrors[raid_disk].sect_limit = 128; - raid_conf->working_disks++; + printk(OPERATIONAL, partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].sect_limit = 128; - - raid_conf->mirrors[raid_disk].operational = 0; - raid_conf->mirrors[raid_disk].write_only = 0; - raid_conf->mirrors[raid_disk].spare = 1; - } - } - if (!raid_conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; - } - - raid_conf->raid_disks = sb->raid_disks; - raid_conf->mddev = mddev; - - for (j = 0; !raid_conf->mirrors[j].operational; j++); - raid_conf->last_used = j; - for (i = raid_conf->raid_disks - 1; i >= 0; i--) { - if (raid_conf->mirrors[i].operational) { - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j)); - raid_conf->mirrors[i].next = j; + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; j = i; } } - if (check_consistency(mddev)) { - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... + */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->mirrors[j].operational) + if (!conf->mirrors[j].operational) continue; - if (sb->disks[i].number == raid_conf->mirrors[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks); - /* Ok, everything is just fine now */ - return (0); + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; } -static int raid1_stop (int minor, struct md_dev *mddev) +static int raid1_stop (mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); - kfree (raid_conf); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static struct md_personality raid1_personality= +static mdk_personality_t raid1_personality= { "raid1", - raid1_map, + NULL, raid1_make_request, raid1_end_request, raid1_run, @@ -854,15 +1214,13 @@ NULL, /* no ioctls */ 0, raid1_error, - raid1_hot_add_disk, - /* raid1_hot_remove_drive */ NULL, - raid1_mark_spare + raid1_diskop, + raid1_stop_resync, + raid1_restart_resync }; int raid1_init (void) { - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL) - return -EBUSY; return register_md_personality (RAID1, &raid1_personality); } @@ -874,7 +1232,6 @@ void cleanup_module (void) { - md_unregister_thread (raid1_thread); unregister_md_personality (RAID1); } #endif --- linux/drivers/block/md.c.orig Wed Apr 12 10:43:15 2000 +++ linux/drivers/block/md.c Wed Apr 12 12:03:37 2000 @@ -1,6 +1,6 @@ /* md.c : Multiple Devices driver for Linux - Copyright (C) 1998, 1999, 2000 Ingo Molnar + Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier @@ -25,6 +25,7 @@ #include #include +#include #include #ifdef CONFIG_KMOD @@ -56,6 +57,21 @@ static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; /* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +/* * these have to be allocated separately because external * subsystems want to have a pre-defined structure */ @@ -215,8 +231,8 @@ blk_queue_make_request(q, md_make_request); q->plug_tq.sync = 0; - q->plug_tq.routine = &md_unplug_device; - q->plug_tq.data = mddev; + q->plug_tq.routine = &md_unplug_device; + q->plug_tq.data = mddev; /* * The 'base' mddev is the one with data NULL. @@ -606,6 +622,18 @@ return NULL; } +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + static MD_LIST_HEAD(all_raid_disks); static MD_LIST_HEAD(pending_raid_disks); @@ -3125,6 +3153,522 @@ return 0; } +static mdp_disk_t *get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} +/* + * Determine correct block size for this device. + */ +unsigned int device_bsize (kdev_t dev) +{ + unsigned int i, correct_size; + + correct_size = BLOCK_SIZE; + if (blksize_size[MAJOR(dev)]) { + i = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (i) + correct_size = i; + } + + return correct_size; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +#define is_mddev_idle(x) 1 +/* + * during resync we keep the buffer locked, this is how we implement + * 'exclusive ownership' of a buffer-cache element. This also protects + * against addition/removal of the cache element. (we are not the + * primary cache manager) + */ +static void end_buffer_io_mdresync(struct buffer_head *bh, int uptodate) +{ + mark_buffer_uptodate(bh, uptodate); + bh->b_dev_id = (void *)1; + wake_up(&bh->b_wait); +} + +static void release_bh (struct buffer_head **bhp) +{ + struct buffer_head *bh = *bhp; + if (!bh) + return; + unlock_buffer(bh); + *bhp = NULL; +} + +void wait_on_mdresync_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&bh->b_wait, &wait); +repeat: + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (!bh->b_dev_id) { + schedule(); + goto repeat; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); +} +#define RA_ORDER (1) +#define RA_PAGE_SIZE (PAGE_SIZE*(1<resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units with md%d!\n", mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; + + blocksize = device_bsize(read_disk); + max_blocks = blk_size[major][minor] / (blocksize >> 10); + + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n", + sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ + current->priority = 1; + +// is_mddev_idle(mddev); /* this also initializes IO event counters */ + starttime = jiffies; + mddev->resync_start = starttime; + + /* + * Tune reconstruction: + */ + window = md_maxreadahead[mdidx(mddev)]/1024; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; + printk(KERN_INFO "md: using %dk window, %d blocks.\n",window,nr_blocks); + + for (j = 0; j < max_blocks; j += nr_blocks) { + + if (j) + mddev->curr_resync = j; + /* + * B careful. When some1 mounts a non-'blocksize' filesystem + * then we get the blocksize changed right under us. Go deal + * with it transparently, recalculate 'blocksize', 'j' and + * 'max_blocks': + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed\n", + mdidx(mddev)); +retry_read: + if (curr_bsize > blocksize) + /* + * this is safe, rounds downwards. + */ + j /= curr_bsize/blocksize; + else + j *= blocksize/curr_bsize; + + blocksize = curr_bsize; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; + max_blocks = blk_size[major][minor] / (blocksize >> 10); + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n", + nr_blocks, blocksize, j, max_blocks); + /* + * We will retry the current block-group + */ + } + + /* + * Cleanup routines expect this + */ + for (k = 0; k < nr_blocks; k++) + if (bh[k]) + BUG(); + + chunk = nr_blocks; + if (chunk > max_blocks-j) + chunk = max_blocks-j; + + /* + * request buffer heads ... + */ + for (i = 0; i < chunk; i++) { + /* + * We get an exclusive lock to the bh. + */ +repeat_getblk: +// bh[i] = getblk_lock (read_disk, j+i, blocksize); + bh[i] = NULL; + // FIXME: do this gracefully. + if (!bh[i]) + goto repeat_getblk; + } + + /* + * Read all (already locked) buffer heads ... + */ + for (i = 0; i < chunk; i++) { + bh[i]->b_end_io = end_buffer_io_mdresync; + bh[i]->b_dev_id = NULL; + set_bit(BH_Req, &bh[i]->b_state); + q = blk_get_queue(bh[i]->b_dev); + md_make_request(q, READ, bh[i]); + } + + run_task_queue(&tq_disk); + + /* + * Wait for them to complete and verify that + * all of them are OK ... + */ + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + wait_on_mdresync_buffer(bh[ii]); + if (!buffer_uptodate(bh[ii])) + goto read_error; + if (bh[ii]->b_end_io != end_buffer_io_mdresync) + BUG(); + /* + * We'll do the writeback here in the + * near future, to get better overlapping. + */ + } + + /* + * Write them out without marking them dirty! + * This enables us to optimize away IO at + * the personality level. + */ + for (i = 0; i < chunk; i++) { + bh[i]->b_dev_id = NULL; + q = blk_get_queue(bh[i]->b_dev); + md_make_request(q, WRITE, bh[i]); + } + run_task_queue(&tq_disk); + + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + + wait_on_mdresync_buffer(bh[ii]); + if (bh[ii]->b_end_io != end_buffer_io_mdresync) + BUG(); + bh[ii]->b_dev_id = NULL; + if (spare && disk_faulty(spare)) { + for (k = 0; k < chunk; k++) + release_bh(bh+k); + printk(" \n "); + err = -EIO; + goto out; + } + if (!buffer_uptodate(bh[ii])) + goto write_error; + } + + /* + * This is the normal 'everything went OK' case + * do a 'free-behind' logic, we sure dont need + * this buffer if it was the only user. + */ + for (i = 0; i < chunk; i++) + release_bh(bh+i); + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk("md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ +repeat: + if (md_need_resched(current)) + schedule(); + + currspeed = (blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1; + if (currspeed > sysctl_speed_limit_min) { + current->priority = 1; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + if (!md_signal_pending(current)) + goto repeat; + } + } else + current->priority = 40; + } + fsync_dev(read_disk); + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + up(&mddev->resync_sem); +out_nolock: + free_pages((unsigned long)bh, RA_ORDER); + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; + +write_error: + /* + * set_blocksize() might change the blocksize. This + * should not happen often, but it happens when eg. + * someone mounts a filesystem that has non-1k + * blocksize. set_blocksize() doesnt touch our + * buffer, but to avoid aliasing problems we change + * our internal blocksize too and retry the write. + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed during write\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; // we retry the read too. + } + + /* + * We were invalidated by the primary + * cache manager - hm, shouldnt happen, + * all invalidation synchronizes with + * the bh lock first. + */ + if (!test_bit(BH_Req, &bh[ii]->b_state)) + BUG(); + /* + * It's a real write problem. We retry and bail out + * only if it's excessive. + */ + if (max_write_errors) { + max_write_errors--; + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; // we retry the read too. + } + printk (KERN_ALERT "too many write errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + err = -EIO; + goto out; + +read_error: + /* + * set_blocksize() might change the blocksize. This + * should not happen often, but it happens when eg. + * someone mounts a filesystem that has non-1k + * blocksize. set_blocksize() doesnt touch our + * buffer, but to avoid aliasing problems we change + * our internal blocksize too and retry the read. + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed during read\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; + } + + /* + * It's a real read problem. We retry and bail out + * only if it's excessive. + */ + if (max_read_errors) { + max_read_errors--; + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; + } + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + err = -EIO; + goto out; +} + +#undef MAX_NR_BLOCKS + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void md_do_recovery (void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + /* + * now here we get the spare and resync it. + */ + if ((spare = get_spare(mddev)) == NULL) + continue; + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR) { + /* + * Recovery got interrupted ... + * signal back that we have finished using the array. + */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); + goto restart; + } + printk(KERN_INFO "md: recovery thread finished ...\n"); + +} + int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { @@ -3247,9 +3791,9 @@ raid5_init (); #endif #if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) - /* - * pick a XOR routine, runtime. - */ + /* + * pick a XOR routine, runtime. + */ calibrate_xor_block(); #endif md_geninit(); --- linux/drivers/block/Config.in.orig Wed Apr 12 10:43:15 2000 +++ linux/drivers/block/Config.in Wed Apr 12 11:33:03 2000 @@ -44,8 +44,8 @@ bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD dep_tristate ' Linear (append) mode' CONFIG_MD_LINEAR $CONFIG_BLK_DEV_MD dep_tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED $CONFIG_BLK_DEV_MD -#dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING $CONFIG_BLK_DEV_MD -#dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING $CONFIG_BLK_DEV_MD +dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD tristate 'RAM disk support' CONFIG_BLK_DEV_RAM dep_bool ' Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM --- linux/drivers/block/raid5.c.orig Wed Apr 12 10:43:15 2000 +++ linux/drivers/block/raid5.c Wed Apr 12 11:47:06 2000 @@ -1,6 +1,7 @@ -/***************************************************************************** +/* * raid5.c : Multiple Devices driver for Linux - * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar * * RAID-5 management functions. * @@ -14,130 +15,107 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + #include #include #include -#include -#include +#include #include #include -#include -static struct md_personality raid5_personality; +static mdk_personality_t raid5_personality; /* * Stripe cache */ + #define NR_STRIPES 128 #define HASH_PAGES 1 #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver */ #define RAID5_DEBUG 0 +#define CHECK_DEVLOCK() if (!conf->device_lock.lock) BUG() +#define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() #if RAID5_DEBUG -#define PRINTK(x) do { printk x; } while (0); +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ #else -#define PRINTK(x) do { ; } while (0) +#define inline +#define __inline__ +#define PRINTK(x...) do { } while (0) #endif +static void print_raid5_conf (raid5_conf_t *conf); + static inline int stripe_locked(struct stripe_head *sh) { return test_bit(STRIPE_LOCKED, &sh->state); } -static inline int stripe_error(struct stripe_head *sh) -{ - return test_bit(STRIPE_ERROR, &sh->state); -} - -/* - * Stripes are locked whenever new buffers can't be added to them. - */ -static inline void lock_stripe(struct stripe_head *sh) -{ - struct raid5_data *raid_conf = sh->raid_conf; - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) { - PRINTK(("locking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes++; - } -} - -static inline void unlock_stripe(struct stripe_head *sh) +static void __unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { - PRINTK(("unlocking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes--; - wake_up(&sh->wait); - } + if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + PRINTK("unlocking stripe %lu\n", sh->sector); + wake_up(&sh->wait); } -static inline void finish_stripe(struct stripe_head *sh) +static void finish_unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - unlock_stripe(sh); + raid5_conf_t *conf = sh->raid_conf; sh->cmd = STRIPE_NONE; sh->phase = PHASE_COMPLETE; - raid_conf->nr_pending_stripes--; - raid_conf->nr_cached_stripes++; - wake_up(&raid_conf->wait_for_stripe); -} - -void __wait_on_stripe(struct stripe_head *sh) -{ - DECLARE_WAITQUEUE(wait, current); - - PRINTK(("wait_on_stripe %lu\n", sh->sector)); - sh->count++; - add_wait_queue(&sh->wait, &wait); -repeat: - set_current_state(TASK_UNINTERRUPTIBLE); - if (stripe_locked(sh)) { - schedule(); - goto repeat; - } - PRINTK(("wait_on_stripe %lu done\n", sh->sector)); - remove_wait_queue(&sh->wait, &wait); - sh->count--; - current->state = TASK_RUNNING; -} - -static inline void wait_on_stripe(struct stripe_head *sh) -{ - if (stripe_locked(sh)) - __wait_on_stripe(sh); + atomic_dec(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_cached_stripes); + __unlock_stripe(sh); + atomic_dec(&sh->count); + wake_up(&conf->wait_for_stripe); } -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) { - PRINTK(("remove_hash(), stripe %lu\n", sh->sector)); + PRINTK("remove_hash(), stripe %lu\n", sh->sector); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); if (sh->hash_pprev) { if (sh->hash_next) sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - raid_conf->nr_hashed_stripes--; + atomic_dec(&conf->nr_hashed_stripes); } } -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static void lock_get_bh (struct buffer_head *bh) +{ + while (md_test_and_set_bit(BH_Lock, &bh->b_state)) + __wait_on_buffer(bh); + atomic_inc(&bh->b_count); +} + +static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes)); + PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", + sh->sector, atomic_read(&conf->nr_hashed_stripes)); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = &sh->hash_next; *shp = sh; sh->hash_pprev = shp; - raid_conf->nr_hashed_stripes++; + atomic_inc(&conf->nr_hashed_stripes); } static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) @@ -145,13 +123,18 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->buffer_pool) == NULL) - return NULL; + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->buffer_pool; + if (!bh) + goto out_unlock; sh->buffer_pool = bh->b_next; bh->b_size = b_size; - restore_flags(flags); + if (atomic_read(&bh->b_count)) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -160,12 +143,17 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->bh_pool) == NULL) - return NULL; + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->bh_pool; + if (!bh) + goto out_unlock; sh->bh_pool = bh->b_next; - restore_flags(flags); + if (atomic_read(&bh->b_count)) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -173,55 +161,58 @@ { unsigned long flags; - save_flags(flags); - cli(); + if (atomic_read(&bh->b_count)) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->buffer_pool; sh->buffer_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) { unsigned long flags; - save_flags(flags); - cli(); + if (atomic_read(&bh->b_count)) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->bh_pool; sh->bh_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf) +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - unsigned long flags; - save_flags(flags); - cli(); - if ((sh = raid_conf->free_sh_list) == NULL) { - restore_flags(flags); - return NULL; - } - raid_conf->free_sh_list = sh->free_next; - raid_conf->nr_free_sh--; - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list) - printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n"); - restore_flags(flags); - if (sh->hash_pprev || sh->nr_pending || sh->count) - printk("get_free_stripe(): bug\n"); + md_spin_lock_irq(&conf->device_lock); + sh = conf->free_sh_list; + if (!sh) + goto out; + conf->free_sh_list = sh->free_next; + atomic_dec(&conf->nr_free_sh); + if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) + BUG(); + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || + atomic_read(&sh->count)) + BUG(); +out: + md_spin_unlock_irq(&conf->device_lock); return sh; } -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head *sh) +static void __put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh) { - unsigned long flags; - - save_flags(flags); - cli(); - sh->free_next = raid_conf->free_sh_list; - raid_conf->free_sh_list = sh; - raid_conf->nr_free_sh++; - restore_flags(flags); + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + clear_bit(STRIPE_LOCKED, &sh->state); + sh->free_next = conf->free_sh_list; + conf->free_sh_list = sh; + atomic_inc(&conf->nr_free_sh); } static void shrink_buffers(struct stripe_head *sh, int num) @@ -229,7 +220,8 @@ struct buffer_head *bh; while (num--) { - if ((bh = get_free_buffer(sh, -1)) == NULL) + bh = get_free_buffer(sh, -1); + if (!bh) return; free_page((unsigned long) bh->b_data); kfree(bh); @@ -241,26 +233,30 @@ struct buffer_head *bh; while (num--) { - if ((bh = get_free_bh(sh)) == NULL) + bh = get_free_bh(sh); + if (!bh) return; kfree(bh); } } -static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) +static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) { struct buffer_head *bh; while (num--) { - if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL) + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) return 1; memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); bh->b_data = (char *) __get_free_page(priority); if (!bh->b_data) { kfree(bh); return 1; } bh->b_size = b_size; + atomic_set(&bh->b_count, 0); put_free_buffer(sh, bh); } return 0; @@ -271,259 +267,303 @@ struct buffer_head *bh; while (num--) { - if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL) + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) return 1; memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); put_free_bh(sh, bh); } return 0; } -static void raid5_kfree_buffer(struct stripe_head *sh, struct buffer_head *bh) +static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) { - unsigned long flags; - - save_flags(flags); - cli(); put_free_buffer(sh, bh); - restore_flags(flags); } -static void raid5_kfree_bh(struct stripe_head *sh, struct buffer_head *bh) +static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) { - unsigned long flags; - - save_flags(flags); - cli(); put_free_bh(sh, bh); - restore_flags(flags); } -static void raid5_kfree_old_bh(struct stripe_head *sh, int i) +static void raid5_free_old_bh(struct stripe_head *sh, int i) { - if (!sh->bh_old[i]) { - printk("raid5_kfree_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i); - return; - } - raid5_kfree_buffer(sh, sh->bh_old[i]); + CHECK_SHLOCK(sh); + if (!sh->bh_old[i]) + BUG(); + raid5_free_buffer(sh, sh->bh_old[i]); sh->bh_old[i] = NULL; } static void raid5_update_old_bh(struct stripe_head *sh, int i) { - PRINTK(("stripe %lu, idx %d, updating cache copy\n", sh->sector, i)); - if (!sh->bh_copy[i]) { - printk("raid5_update_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i); - return; - } + CHECK_SHLOCK(sh); + PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); + if (!sh->bh_copy[i]) + BUG(); if (sh->bh_old[i]) - raid5_kfree_old_bh(sh, i); + raid5_free_old_bh(sh, i); sh->bh_old[i] = sh->bh_copy[i]; sh->bh_copy[i] = NULL; } -static void kfree_stripe(struct stripe_head *sh) +static void free_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, j; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, j; - PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector)); - if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) { - printk("raid5: kfree_stripe(), sector %lu, phase %d, locked %d, count %d\n", sh->sector, sh->phase, stripe_locked(sh), sh->count); + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + PRINTK("free_stripe called, stripe %lu\n", sh->sector); + if (sh->phase != PHASE_COMPLETE || atomic_read(&sh->count)) { + PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); return; } for (j = 0; j < disks; j++) { if (sh->bh_old[j]) - raid5_kfree_old_bh(sh, j); + raid5_free_old_bh(sh, j); if (sh->bh_new[j] || sh->bh_copy[j]) - printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]); + BUG(); } - remove_hash(raid_conf, sh); - put_free_stripe(raid_conf, sh); + remove_hash(conf, sh); + __put_free_stripe(conf, sh); } -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr) +static int shrink_stripe_cache(raid5_conf_t *conf, int nr) { struct stripe_head *sh; int i, count = 0; - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock)); + PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); + md_spin_lock_irq(&conf->device_lock); for (i = 0; i < NR_HASH; i++) { -repeat: - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK]; + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; for (; sh; sh = sh->hash_next) { if (sh->phase != PHASE_COMPLETE) continue; - if (stripe_locked(sh)) + if (atomic_read(&sh->count)) continue; - if (sh->count) + /* + * Try to lock this stripe: + */ + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) continue; - kfree_stripe(sh); + free_stripe(sh); if (++count == nr) { - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK; - return nr; + conf->clock = (i + conf->clock) & HASH_MASK; + goto out; } - goto repeat; } } - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); +out: + md_spin_unlock_irq(&conf->device_lock); + PRINTK("shrink completed, nr_hashed_stripes %d\n", atomic_read(&conf->nr_hashed_stripes)); return count; } -static struct stripe_head *find_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) +void __wait_lock_stripe(struct stripe_head *sh) { - struct stripe_head *sh; + MD_DECLARE_WAITQUEUE(wait, current); - if (raid_conf->buffer_size != size) { - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size)); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - raid_conf->buffer_size = size; + PRINTK("wait_lock_stripe %lu\n", sh->sector); + if (!atomic_read(&sh->count)) + BUG(); + add_wait_queue(&sh->wait, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { + schedule(); + goto repeat; } + PRINTK("wait_lock_stripe %lu done\n", sh->sector); + remove_wait_queue(&sh->wait, &wait); + current->state = TASK_RUNNING; +} - PRINTK(("find_stripe, sector %lu\n", sector)); - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next) - if (sh->sector == sector && sh->raid_conf == raid_conf) { - if (sh->size == size) { - PRINTK(("found stripe %lu\n", sector)); - return sh; - } else { - PRINTK(("switching size for %lu, %d --> %d\n", sector, sh->size, size)); - kfree_stripe(sh); - break; - } +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + + PRINTK("__find_stripe, sector %lu\n", sector); + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { + if (sh->sector == sector && sh->raid_conf == conf) { + if (sh->size != size) + BUG(); + return sh; } - PRINTK(("stripe %lu not in cache\n", sector)); + } + PRINTK("__stripe %lu not in cache\n", sector); return NULL; } -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority) +static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + struct buffer_head *buffer_pool, *bh_pool; + + PRINTK("alloc_stripe called\n"); + + while ((sh = get_free_stripe(conf)) == NULL) { + shrink_stripe_cache(conf, conf->max_nr_stripes / 8); + sh = get_free_stripe(conf); + if (sh) + break; + md_wakeup_thread(conf->thread); + PRINTK("waiting for some stripes to complete\n"); + sleep_on(&conf->wait_for_stripe); + } + + buffer_pool = sh->buffer_pool; + bh_pool = sh->bh_pool; + memset(sh, 0, sizeof(*sh)); + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + sh->buffer_pool = buffer_pool; + sh->bh_pool = bh_pool; + sh->phase = PHASE_COMPLETE; + sh->cmd = STRIPE_NONE; + sh->raid_conf = conf; + sh->sector = sector; + sh->size = size; + atomic_inc(&conf->nr_cached_stripes); + + return sh; +} + +static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh, *new = NULL; + + PRINTK("get_stripe, sector %lu\n", sector); + + /* + * Do this in set_blocksize()! + */ + if (conf->buffer_size != size) { + PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); + shrink_stripe_cache(conf, conf->max_nr_stripes); + conf->buffer_size = size; + } + +repeat: + md_spin_lock_irq(&conf->device_lock); + sh = __find_stripe(conf, sector, size); + if (!sh) { + if (!new) { + md_spin_unlock_irq(&conf->device_lock); + new = alloc_stripe(conf, sector, size); + goto repeat; + } + sh = new; + new = NULL; + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + insert_hash(conf, sh); + atomic_inc(&sh->count); + md_spin_unlock_irq(&conf->device_lock); + } else { + atomic_inc(&sh->count); + if (new) { + if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) + BUG(); + __put_free_stripe(conf, new); + } + md_spin_unlock_irq(&conf->device_lock); + PRINTK("get_stripe, waiting, sector %lu\n", sector); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + __wait_lock_stripe(sh); + } + return sh; +} + +static int grow_stripes(raid5_conf_t *conf, int num, int priority) { struct stripe_head *sh; while (num--) { - if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL) + sh = kmalloc(sizeof(struct stripe_head), priority); + if (!sh) return 1; memset(sh, 0, sizeof(*sh)); - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); + sh->raid_conf = conf; + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); kfree(sh); return 1; } - if (grow_bh(sh, raid_conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); - shrink_bh(sh, raid_conf->raid_disks); + if (grow_bh(sh, conf->raid_disks, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + shrink_bh(sh, conf->raid_disks); kfree(sh); return 1; } - put_free_stripe(raid_conf, sh); - raid_conf->nr_stripes++; + md_spin_lock_irq(&conf->device_lock); + __put_free_stripe(conf, sh); + atomic_inc(&conf->nr_stripes); + md_spin_unlock_irq(&conf->device_lock); } return 0; } -static void shrink_stripes(struct raid5_data *raid_conf, int num) +static void shrink_stripes(raid5_conf_t *conf, int num) { struct stripe_head *sh; while (num--) { - sh = get_free_stripe(raid_conf); + sh = get_free_stripe(conf); if (!sh) break; - shrink_buffers(sh, raid_conf->raid_disks * 2); - shrink_bh(sh, raid_conf->raid_disks); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + shrink_buffers(sh, conf->raid_disks * 2); + shrink_bh(sh, conf->raid_disks); kfree(sh); - raid_conf->nr_stripes--; - } -} - -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) -{ - struct stripe_head *sh = NULL, *tmp; - struct buffer_head *buffer_pool, *bh_pool; - - PRINTK(("kmalloc_stripe called\n")); - - while ((sh = get_free_stripe(raid_conf)) == NULL) { - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8); - if ((sh = get_free_stripe(raid_conf)) != NULL) - break; - if (!raid_conf->nr_pending_stripes) - printk("raid5: bug: nr_free_sh == 0, nr_pending_stripes == 0\n"); - md_wakeup_thread(raid_conf->thread); - PRINTK(("waiting for some stripes to complete\n")); - sleep_on(&raid_conf->wait_for_stripe); + atomic_dec(&conf->nr_stripes); } - - /* - * The above might have slept, so perhaps another process - * already created the stripe for us.. - */ - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) { - put_free_stripe(raid_conf, sh); - wait_on_stripe(tmp); - return tmp; - } - if (sh) { - buffer_pool = sh->buffer_pool; - bh_pool = sh->bh_pool; - memset(sh, 0, sizeof(*sh)); - sh->buffer_pool = buffer_pool; - sh->bh_pool = bh_pool; - sh->phase = PHASE_COMPLETE; - sh->cmd = STRIPE_NONE; - sh->raid_conf = raid_conf; - sh->sector = sector; - sh->size = size; - raid_conf->nr_cached_stripes++; - insert_hash(raid_conf, sh); - } else printk("raid5: bug: kmalloc_stripe() == NULL\n"); - return sh; -} - -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) -{ - struct stripe_head *sh; - - PRINTK(("get_stripe, sector %lu\n", sector)); - sh = find_stripe(raid_conf, sector, size); - if (sh) - wait_on_stripe(sh); - else - sh = kmalloc_stripe(raid_conf, sector, size); - return sh; } -static struct buffer_head *raid5_kmalloc_buffer(struct stripe_head *sh, int b_size) +static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) { struct buffer_head *bh; - if ((bh = get_free_buffer(sh, b_size)) == NULL) - printk("raid5: bug: raid5_kmalloc_buffer() == NULL\n"); + bh = get_free_buffer(sh, b_size); + if (!bh) + BUG(); return bh; } -static struct buffer_head *raid5_kmalloc_bh(struct stripe_head *sh) +static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) { struct buffer_head *bh; - if ((bh = get_free_bh(sh)) == NULL) - printk("raid5: bug: raid5_kmalloc_bh() == NULL\n"); + bh = get_free_bh(sh); + if (!bh) + BUG(); return bh; } -static inline void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) +static void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) { struct buffer_head *bh = sh->bh_new[i]; + PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_rsector, uptodate); sh->bh_new[i] = NULL; - raid5_kfree_bh(sh, sh->bh_req[i]); + raid5_free_bh(sh, sh->bh_req[i]); sh->bh_req[i] = NULL; + PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); bh->b_end_io(bh, uptodate); if (!uptodate) printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr); + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); } static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) @@ -537,36 +577,35 @@ static void raid5_end_request (struct buffer_head * bh, int uptodate) { struct stripe_head *sh = bh->b_dev_id; - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, i; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; unsigned long flags; - PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending)); - save_flags(flags); - cli(); + PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); + md_spin_lock_irqsave(&sh->stripe_lock, flags); raid5_mark_buffer_uptodate(bh, uptodate); - --sh->nr_pending; - if (!sh->nr_pending) { - md_wakeup_thread(raid_conf->thread); - atomic_inc(&raid_conf->nr_handle); - } if (!uptodate) md_error(bh->b_dev, bh->b_rdev); - if (raid_conf->failed_disks) { + if (conf->failed_disks) { for (i = 0; i < disks; i++) { - if (raid_conf->disks[i].operational) + if (conf->disks[i].operational) continue; if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) continue; - if (bh->b_rdev != raid_conf->disks[i].dev) + if (bh->b_rdev != conf->disks[i].dev) continue; set_bit(STRIPE_ERROR, &sh->state); } } - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + + if (atomic_dec_and_test(&sh->nr_pending)) { + atomic_inc(&conf->nr_handle); + md_wakeup_thread(conf->thread); + } } -static int raid5_map (struct md_dev *mddev, kdev_t *rdev, +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { /* No complex mapping used: the core of the work is done in the @@ -577,21 +616,22 @@ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; char *b_data; - kdev_t dev = MKDEV(MD_MAJOR, minor); + kdev_t dev = mddev_to_kdev(mddev); int block = sh->sector / (sh->size >> 9); - b_data = ((volatile struct buffer_head *) bh)->b_data; + b_data = bh->b_data; memset (bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); init_buffer(bh, raid5_end_request, sh); bh->b_dev = dev; bh->b_blocknr = block; - ((volatile struct buffer_head *) bh)->b_data = b_data; - bh->b_rdev = raid_conf->disks[i].dev; + bh->b_data = b_data; + + bh->b_rdev = conf->disks[i].dev; bh->b_rsector = sh->sector; bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); @@ -599,33 +639,62 @@ bh->b_list = BUF_LOCKED; } -static int raid5_error (struct md_dev *mddev, kdev_t dev) +static int raid5_error (mddev_t *mddev, kdev_t dev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; struct disk_info *disk; int i; - PRINTK(("raid5_error called\n")); - raid_conf->resync_parity = 0; - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) + PRINTK("raid5_error called\n"); + conf->resync_parity = 0; + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { if (disk->dev == dev && disk->operational) { disk->operational = 0; - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE); + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; - raid_conf->working_disks--; - raid_conf->failed_disks++; - md_wakeup_thread(raid_conf->thread); + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); + printk (KERN_ALERT + "raid5: Disk failure on %s, disabling device." + " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + return -EIO; + } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { printk (KERN_ALERT - "RAID5: Disk failure on %s, disabling device." - "Operation continuing on %d devices\n", - kdevname (dev), raid_conf->working_disks); + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + MD_BUG(); + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + return -EIO; } + } + MD_BUG(); return 0; } @@ -633,15 +702,14 @@ * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static inline unsigned long -raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks, - unsigned int * dd_idx, unsigned int * pd_idx, - struct raid5_data *raid_conf) +static unsigned long raid5_compute_sector(int r_sector, unsigned int raid_disks, + unsigned int data_disks, unsigned int * dd_idx, + unsigned int * pd_idx, raid5_conf_t *conf) { unsigned int stripe; int chunk_number, chunk_offset; unsigned long new_sector; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; /* First compute the information on this sector */ @@ -664,9 +732,9 @@ /* * Select the parity disk based on the user selected algorithm. */ - if (raid_conf->level == 4) + if (conf->level == 4) *pd_idx = data_disks; - else switch (raid_conf->algorithm) { + else switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= *pd_idx) @@ -686,7 +754,7 @@ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } /* @@ -707,16 +775,16 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1; + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; unsigned long new_sector = sh->sector, check; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; unsigned long stripe = new_sector / sectors_per_chunk; int chunk_offset = new_sector % sectors_per_chunk; int chunk_number, dummy1, dummy2, dd_idx = i; unsigned long r_sector, blocknr; - switch (raid_conf->algorithm) { + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -729,14 +797,14 @@ i -= (sh->pd_idx + 1); break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } chunk_number = stripe * data_disks + i; r_sector = chunk_number * sectors_per_chunk + chunk_offset; blocknr = r_sector / (sh->size >> 9); - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf); + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { printk("compute_blocknr: map not correct\n"); return 0; @@ -744,144 +812,148 @@ return blocknr; } -#ifdef HAVE_ARCH_XORBLOCK -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size); -} -#else -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - long lines = dest->b_size / (sizeof (long)) / 8, i; - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data; - - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(sourcep + 0); - *(destp + 1) ^= *(sourcep + 1); - *(destp + 2) ^= *(sourcep + 2); - *(destp + 3) ^= *(sourcep + 3); - *(destp + 4) ^= *(sourcep + 4); - *(destp + 5) ^= *(sourcep + 5); - *(destp + 6) ^= *(sourcep + 6); - *(destp + 7) ^= *(sourcep + 7); - destp += 8; - sourcep += 8; - } -} -#endif - static void compute_block(struct stripe_head *sh, int dd_idx) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx)); + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); if (sh->bh_old[dd_idx] == NULL) - sh->bh_old[dd_idx] = raid5_kmalloc_buffer(sh, sh->size); + sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_old[dd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == dd_idx) continue; if (sh->bh_old[i]) { - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]); - continue; - } else + bh_ptr[count++] = sh->bh_old[i]; + } else { printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if(count != 1) { + xor_block(count, &bh_ptr[0]); } raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); } static void compute_parity(struct stripe_head *sh, int method) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method)); + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); for (i = 0; i < disks; i++) { if (i == pd_idx || !sh->bh_new[i]) continue; if (!sh->bh_copy[i]) - sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size); + sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[i], i); - mark_buffer_clean(sh->bh_new[i]); + if (atomic_set_buffer_clean(sh->bh_new[i])) + atomic_set_buffer_dirty(sh->bh_copy[i]); memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); } - if (sh->bh_copy[pd_idx] == NULL) - sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size); + if (sh->bh_copy[pd_idx] == NULL) { + sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); + atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); + } raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); if (method == RECONSTRUCT_WRITE) { memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + } else if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; } - if (sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } else if (method == READ_MODIFY_WRITE) { memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i] && sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + bh_ptr[count++] = sh->bh_old[i]; + } + if (count >= (MAX_XOR_BLOCKS - 1)) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; struct buffer_head *bh_req; - if (sh->bh_new[dd_idx]) { - printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); - printk("forcing oops.\n"); - *(int*)0=0; - } - - set_bit(BH_Lock, &bh->b_state); + PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); + CHECK_SHLOCK(sh); + if (sh->bh_new[dd_idx]) + BUG(); - bh_req = raid5_kmalloc_bh(sh); + bh_req = raid5_alloc_bh(sh); raid5_build_block(sh, bh_req, dd_idx); bh_req->b_data = bh->b_data; if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { + PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); sh->phase = PHASE_BEGIN; sh->cmd = (rw == READ) ? STRIPE_READ : STRIPE_WRITE; - raid_conf->nr_pending_stripes++; - atomic_inc(&raid_conf->nr_handle); + atomic_inc(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_handle); + PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); } sh->bh_new[dd_idx] = bh; sh->bh_req[dd_idx] = bh_req; sh->cmd_new[dd_idx] = rw; sh->new[dd_idx] = 1; + + PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); } static void complete_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; int i, new = 0; - PRINTK(("complete_stripe %lu\n", sh->sector)); + PRINTK("complete_stripe %lu\n", sh->sector); for (i = 0; i < disks; i++) { if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) raid5_update_old_bh(sh, i); if (sh->bh_new[i]) { + PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); if (!sh->new[i]) { #if 0 if (sh->cmd == STRIPE_WRITE) { @@ -904,13 +976,238 @@ printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); } if (!new) - finish_stripe(sh); + finish_unlock_stripe(sh); else { - PRINTK(("stripe %lu, new == %d\n", sh->sector, new)); + PRINTK("stripe %lu, new == %d\n", sh->sector, new); sh->phase = PHASE_BEGIN; } } + +static int is_stripe_allclean(struct stripe_head *sh, int disks) +{ + int i; + + return 0; + for (i = 0; i < disks; i++) { + if (sh->bh_new[i]) + if (test_bit(BH_Dirty, &sh->bh_new[i])) + return 0; + if (sh->bh_old[i]) + if (test_bit(BH_Dirty, &sh->bh_old[i])) + return 0; + } + return 1; +} + +static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_write, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i, allclean; + request_queue_t *q; + unsigned int block; + struct buffer_head *bh; + int method1 = INT_MAX, method2 = INT_MAX; + + /* + * Attempt to add entries :-) + */ + if (nr_write != disks - 1) { + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i]) + continue; + block = (int) compute_blocknr(sh, i); + bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); + if (!bh) + continue; + if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { + PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); + add_stripe_bh(sh, bh, i, WRITE); + sh->new[i] = 0; + nr_write++; + if (sh->bh_old[i]) { + nr_cache_overwrite++; + nr_cache_other--; + } else + if (!operational[i]) { + nr_failed_overwrite++; + nr_failed_other--; + } + } + atomic_dec(&bh->b_count); + } + } + PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); + /* + * Writing, need to update parity buffer. + * + * Compute the number of I/O requests in the "reconstruct + * write" and "read modify write" methods. + */ + if (!nr_failed_other) + method1 = (disks - 1) - (nr_write + nr_cache_other); + if (!nr_failed_overwrite && !parity_failed) + method2 = nr_write - nr_cache_overwrite + (1 - parity); + + if (method1 == INT_MAX && method2 == INT_MAX) + BUG(); + PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); + + if (!method1 || !method2) { + allclean = is_stripe_allclean(sh, disks); + sh->phase = PHASE_WRITE; + compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); + + for (i = 0; i < disks; i++) { + if (!operational[i] && !conf->spare && !conf->resync_parity) + continue; + bh = sh->bh_copy[i]; + if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) + printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); + if (i == sh->pd_idx && !bh) + printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); + if (bh) { + PRINTK("making request for buffer %d\n", i); + lock_get_bh(bh); + if (!operational[i] && !conf->resync_parity) { + PRINTK("writing spare %d\n", i); + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->spare->dev; + q = blk_get_queue(bh->b_dev); + generic_make_request(q, WRITERAW, bh); + } else { +#if 0 + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->disks[i].dev; + q = blk_get_queue(bh->b_dev); + generic_make_request(q, WRITERAW, bh); +#else + if (!allclean || (i==sh->pd_idx)) { + PRINTK("writing dirty %d\n", i); + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->disks[i].dev; + q = blk_get_queue(bh->b_dev); + generic_make_request(q, WRITERAW, bh); + } else { + PRINTK("not writing clean %d\n", i); + raid5_end_request(bh, 1); + sh->new[i] = 0; + } +#endif + } + atomic_dec(&bh->b_count); + } + } + PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + + if (method1 < method2) { + sh->write_method = RECONSTRUCT_WRITE; + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i] || sh->bh_old[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } else { + sh->write_method = READ_MODIFY_WRITE; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!sh->bh_new[i] && i != sh->pd_idx) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (!sh->bh_old[i]) + continue; + if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) + continue; + lock_get_bh(sh->bh_old[i]); + atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_rdev = conf->disks[i].dev; + q = blk_get_queue(sh->bh_old[i]->b_dev); + generic_make_request(q, READ, sh->bh_old[i]); + atomic_dec(&sh->bh_old[i]->b_count); + } + PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + +/* + * Reading + */ +static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_read, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i; + request_queue_t *q; + int method1 = INT_MAX; + + method1 = nr_read - nr_cache_overwrite; + + PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1); + + if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { + PRINTK("read %lu completed from cache\n", sh->sector); + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (!sh->bh_old[i]) + compute_block(sh, i); + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + } + complete_stripe(sh); + return; + } + if (nr_failed_overwrite) { + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!operational[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + lock_get_bh(sh->bh_old[i]); + atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_rdev = conf->disks[i].dev; + q = blk_get_queue(sh->bh_old[i]->b_dev); + generic_make_request(q, READ, sh->bh_old[i]); + atomic_dec(&sh->bh_old[i]->b_count); + } + PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + sh->phase = PHASE_READ; + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (sh->bh_old[i]) { + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + continue; + } + lock_get_bh(sh->bh_req[i]); + atomic_inc(&sh->nr_pending); + sh->bh_req[i]->b_rdev = conf->disks[i].dev; + q = blk_get_queue(sh->bh_req[i]->b_dev); + generic_make_request(q, READ, sh->bh_req[i]); + atomic_dec(&sh->bh_req[i]->b_count); + } + PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + /* * handle_stripe() is our main logic routine. Note that: * @@ -921,39 +1218,31 @@ * 2. We should be careful to set sh->nr_pending whenever we sleep, * to prevent re-entry of handle_stripe() for the same sh. * - * 3. raid_conf->failed_disks and disk->operational can be changed + * 3. conf->failed_disks and disk->operational can be changed * from an interrupt. This complicates things a bit, but it allows * us to stop issuing requests for a failed drive as soon as possible. */ static void handle_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); - struct buffer_head *bh; - int disks = raid_conf->raid_disks; - int i, nr = 0, nr_read = 0, nr_write = 0; - int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0; + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; + int disks = conf->raid_disks; + int i, nr_read = 0, nr_write = 0, parity = 0; + int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0; int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0; - int reading = 0, nr_writing = 0; - int method1 = INT_MAX, method2 = INT_MAX; - int block; - unsigned long flags; - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks; + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks; - PRINTK(("handle_stripe(), stripe %lu\n", sh->sector)); - if (sh->nr_pending) { - printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector); - return; - } - if (sh->phase == PHASE_COMPLETE) { - printk("handle_stripe(), stripe %lu, already complete\n", sh->sector); - return; - } + PRINTK("handle_stripe(), stripe %lu\n", sh->sector); + if (!stripe_locked(sh)) + BUG(); + if (md_atomic_read(&sh->nr_pending)) + BUG(); + if (sh->phase == PHASE_COMPLETE) + BUG(); - atomic_dec(&raid_conf->nr_handle); + atomic_dec(&conf->nr_handle); - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) { + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) { printk("raid5: restarting stripe %lu\n", sh->sector); sh->phase = PHASE_BEGIN; } @@ -968,16 +1257,18 @@ return; } - save_flags(flags); - cli(); + md_spin_lock_irq(&conf->device_lock); for (i = 0; i < disks; i++) { - operational[i] = raid_conf->disks[i].operational; - if (i == sh->pd_idx && raid_conf->resync_parity) + operational[i] = conf->disks[i].operational; + if (i == sh->pd_idx && conf->resync_parity) operational[i] = 0; } - failed_disks = raid_conf->failed_disks; - restore_flags(flags); + failed_disks = conf->failed_disks; + md_spin_unlock_irq(&conf->device_lock); + /* + * Make this one more graceful? + */ if (failed_disks > 1) { for (i = 0; i < disks; i++) { if (sh->bh_new[i]) { @@ -985,269 +1276,125 @@ continue; } } - finish_stripe(sh); + finish_unlock_stripe(sh); return; } + PRINTK("=== stripe index START ===\n"); for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) + PRINTK("disk %d, ", i); + if (sh->bh_old[i]) { nr_cache++; + PRINTK(" (old cached, %d)", nr_cache); + } if (i == sh->pd_idx) { - if (sh->bh_old[i]) + PRINTK(" PARITY."); + if (sh->bh_old[i]) { + PRINTK(" CACHED."); parity = 1; - else if(!operational[i]) - parity_failed = 1; + } else { + PRINTK(" UNCACHED."); + if (!operational[i]) { + PRINTK(" FAILED."); + parity_failed = 1; + } + } + PRINTK("\n"); continue; } if (!sh->bh_new[i]) { - if (sh->bh_old[i]) + PRINTK(" (no new data block) "); + if (sh->bh_old[i]) { + PRINTK(" (but old block cached) "); nr_cache_other++; - else if (!operational[i]) - nr_failed_other++; + } else { + if (!operational[i]) { + PRINTK(" (because failed disk) "); + nr_failed_other++; + } else + PRINTK(" (no old block either) "); + } + PRINTK("\n"); continue; } sh->new[i] = 0; - nr++; - if (sh->cmd_new[i] == READ) + if (sh->cmd_new[i] == READ) { nr_read++; - if (sh->cmd_new[i] == WRITE) - nr_write++; - if (sh->bh_old[i]) - nr_cache_overwrite++; - else if (!operational[i]) - nr_failed_overwrite++; - } - - if (nr_write && nr_read) - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd); - - if (nr_write) { - /* - * Attempt to add entries :-) - */ - if (nr_write != disks - 1) { - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) - continue; - block = (int) compute_blocknr(sh, i); - bh = get_hash_table(MKDEV(MD_MAJOR, minor), block, sh->size); - if (bh) { - if (atomic_read(&bh->b_count) == 1 && - buffer_dirty(bh) && - !buffer_locked(bh)) { - PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block)); - add_stripe_bh(sh, bh, i, WRITE); - sh->new[i] = 0; - nr++; nr_write++; - if (sh->bh_old[i]) { - nr_cache_overwrite++; - nr_cache_other--; - } else if (!operational[i]) { - nr_failed_overwrite++; - nr_failed_other--; - } - } - atomic_dec(&bh->b_count); - } - } + PRINTK(" (new READ %d)", nr_read); } - PRINTK(("handle_stripe() -- begin writing, stripe %lu\n", sh->sector)); - /* - * Writing, need to update parity buffer. - * - * Compute the number of I/O requests in the "reconstruct - * write" and "read modify write" methods. - */ - if (!nr_failed_other) - method1 = (disks - 1) - (nr_write + nr_cache_other); - if (!nr_failed_overwrite && !parity_failed) - method2 = nr_write - nr_cache_overwrite + (1 - parity); - - if (method1 == INT_MAX && method2 == INT_MAX) - printk("raid5: bug: method1 == method2 == INT_MAX\n"); - PRINTK(("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2)); - - if (!method1 || !method2) { - lock_stripe(sh); - sh->nr_pending++; - sh->phase = PHASE_WRITE; - compute_parity(sh, method1 <= method2 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) - continue; - if (i == sh->pd_idx || sh->bh_new[i]) - nr_writing++; - } - - sh->nr_pending = nr_writing; - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending)); - - for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) - continue; - bh = sh->bh_copy[i]; - if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) - printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); - if (i == sh->pd_idx && !bh) - printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); - if (bh) { - bh->b_state |= (1<b_state); - if (!operational[i] && !raid_conf->resync_parity) { - bh->b_rdev = raid_conf->spare->dev; - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh); - } else - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh); - } - } - return; - } - - lock_stripe(sh); - sh->nr_pending++; - if (method1 < method2) { - sh->write_method = RECONSTRUCT_WRITE; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] || sh->bh_old[i]) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - reading++; - } - } else { - sh->write_method = READ_MODIFY_WRITE; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!sh->bh_new[i] && i != sh->pd_idx) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - reading++; - } - } - sh->phase = PHASE_READ_OLD; - sh->nr_pending = reading; - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (!sh->bh_old[i]) - continue; - if (buffer_uptodate(sh->bh_old[i])) - continue; - clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); - } - } else { - /* - * Reading - */ - method1 = nr_read - nr_cache_overwrite; - lock_stripe(sh); - sh->nr_pending++; - - PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1)); - if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { - PRINTK(("read %lu completed from cache\n", sh->sector)); - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (!sh->bh_old[i]) - compute_block(sh, i); - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - } - sh->nr_pending--; - complete_stripe(sh); - return; + if (sh->cmd_new[i] == WRITE) { + nr_write++; + PRINTK(" (new WRITE %d)", nr_write); } - if (nr_failed_overwrite) { - sh->phase = PHASE_READ_OLD; - sh->nr_pending = (disks - 1) - nr_cache; - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!operational[i]) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); - } + if (sh->bh_old[i]) { + nr_cache_overwrite++; + PRINTK(" (overwriting old %d)", nr_cache_overwrite); } else { - sh->phase = PHASE_READ; - sh->nr_pending = nr_read - nr_cache_overwrite; - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (sh->bh_old[i]) { - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - continue; - } - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]); + if (!operational[i]) { + nr_failed_overwrite++; + PRINTK(" (overwriting failed %d)", nr_failed_overwrite); } } + PRINTK("\n"); } + PRINTK("=== stripe index END ===\n"); + + if (nr_write && nr_read) + BUG(); + + if (nr_write) + handle_stripe_write( + mddev, conf, sh, nr_write, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, nr_failed_other, + nr_cache_overwrite, nr_failed_overwrite + ); + else + handle_stripe_read( + mddev, conf, sh, nr_read, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, + nr_failed_other, nr_cache_overwrite, nr_failed_overwrite + ); } -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) + +static int raid5_make_request (request_queue_t *q, mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - const unsigned int raid_disks = raid_conf->raid_disks; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; unsigned long new_sector; struct stripe_head *sh; - if (rw == READA) rw = READ; + if (rw == READA) + rw = READ; - new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks, - &dd_idx, &pd_idx, raid_conf); + new_sector = raid5_compute_sector(bh->b_blocknr*(bh->b_size>>9), + raid_disks, data_disks, &dd_idx, &pd_idx, conf); - PRINTK(("raid5_make_request, sector %lu\n", new_sector)); -repeat: - sh = get_stripe(raid_conf, new_sector, bh->b_size); + PRINTK("raid5_make_request, sector %lu\n", new_sector); + sh = get_lock_stripe(conf, new_sector, bh->b_size); +#if 0 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { - PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd)); + PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd); lock_stripe(sh); - if (!sh->nr_pending) + if (!md_atomic_read(&sh->nr_pending)) handle_stripe(sh); goto repeat; } +#endif sh->pd_idx = pd_idx; if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN) - PRINTK(("stripe %lu catching the bus!\n", sh->sector)); - if (sh->bh_new[dd_idx]) { - printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); - printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]); - lock_stripe(sh); - md_wakeup_thread(raid_conf->thread); - wait_on_stripe(sh); - goto repeat; - } + PRINTK("stripe %lu catching the bus!\n", sh->sector); + if (sh->bh_new[dd_idx]) + BUG(); add_stripe_bh(sh, bh, dd_idx, rw); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); return 0; } -static void unplug_devices(struct stripe_head *sh) -{ -#if 0 - struct raid5_data *raid_conf = sh->raid_conf; - int i; - - for (i = 0; i < raid_conf->raid_disks; i++) - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev)); -#endif -} - /* * This is our raid5 kernel thread. * @@ -1258,56 +1405,53 @@ static void raid5d (void *data) { struct stripe_head *sh; - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; - int i, handled = 0, unplug = 0; - unsigned long flags; - - PRINTK(("+++ raid5d active\n")); - + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + int i, handled; + + PRINTK("+++ raid5d active\n"); + + handled = 0; + md_spin_lock_irq(&conf->device_lock); + clear_bit(THREAD_WAKEUP, &conf->thread->flags); +repeat_pass: if (mddev->sb_dirty) { + md_spin_unlock_irq(&conf->device_lock); mddev->sb_dirty = 0; - md_update_sb((int) (mddev - md_dev)); + md_update_sb(mddev); + md_spin_lock_irq(&conf->device_lock); } for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[i]; + sh = conf->stripe_hashtbl[i]; for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != raid_conf) + if (sh->raid_conf != conf) continue; if (sh->phase == PHASE_COMPLETE) continue; - if (sh->nr_pending) + if (md_atomic_read(&sh->nr_pending)) continue; - if (sh->sector == raid_conf->next_sector) { - raid_conf->sector_count += (sh->size >> 9); - if (raid_conf->sector_count >= 128) - unplug = 1; - } else - unplug = 1; - if (unplug) { - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count)); - unplug_devices(sh); - unplug = 0; - raid_conf->sector_count = 0; - } - raid_conf->next_sector = sh->sector + (sh->size >> 9); + md_spin_unlock_irq(&conf->device_lock); + if (!atomic_read(&sh->count)) + BUG(); + handled++; handle_stripe(sh); + md_spin_lock_irq(&conf->device_lock); goto repeat; } } - if (raid_conf) { - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle))); - save_flags(flags); - cli(); - if (!atomic_read(&raid_conf->nr_handle)) - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags); + if (conf) { + PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)); + if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) && + md_atomic_read(&conf->nr_handle)) + goto repeat_pass; } - PRINTK(("--- raid5d inactive\n")); + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); } -#if SUPPORT_RECONSTRUCTION /* * Private kernel thread for parity reconstruction after an unclean * shutdown. Reconstruction on spare drives in case of a failed drive @@ -1315,44 +1459,67 @@ */ static void raid5syncd (void *data) { - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; - if (!raid_conf->resync_parity) + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) return; - md_do_sync(mddev); - raid_conf->resync_parity = 0; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); + return; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); } -#endif /* SUPPORT_RECONSTRUCTION */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid5_data *raid_conf = mddev->private; + raid5_conf_t *conf = mddev->private; kdev_t dev; - struct buffer_head *bh[MD_SB_DISKS], tmp; - int i, rc = 0, nr = 0; - - if (raid_conf->working_disks != raid_conf->raid_disks) - return 0; - tmp.b_size = 4096; - if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL) - return 0; + struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL; + int i, ret = 0, nr = 0, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + + if (conf->working_disks != conf->raid_disks) + goto out; + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + tmp->b_size = 4096; + tmp->b_data = (char *) get_free_page(GFP_KERNEL); + if (!tmp->b_data) + goto out; + md_clear_page((unsigned long)tmp->b_data); memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; set_blocksize(dev, 4096); - if ((bh[i] = bread(dev, row / 4, 4096)) == NULL) + bh[i] = bread(dev, row / 4, 4096); + if (!bh[i]) break; nr++; } - if (nr == raid_conf->raid_disks) { - for (i = 1; i < nr; i++) - xor_block(&tmp, bh[i]); - if (memcmp(tmp.b_data, bh[0]->b_data, 4096)) - rc = 1; + if (nr == conf->raid_disks) { + bh_ptr[0] = tmp; + count = 1; + for (i = 1; i < nr; i++) { + bh_ptr[count++] = bh[i]; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } + if (memcmp(tmp->b_data, bh[0]->b_data, 4096)) + ret = 1; } - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; if (bh[i]) { bforget(bh[i]); bh[i] = NULL; @@ -1360,289 +1527,668 @@ fsync_dev(dev); invalidate_buffers(dev); } - free_page((unsigned long) tmp.b_data); - return rc; + free_page((unsigned long) tmp->b_data); +out: + if (tmp) + kfree(tmp); + return ret; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * We are not checking this currently, as it's legitimate to have + * an inconsistent array, at creation time. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid5_run (int minor, struct md_dev *mddev) +static int raid5_run (mddev_t *mddev) { - struct raid5_data *raid_conf; + raid5_conf_t *conf; int i, j, raid_disk, memory; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 5 && sb->level != 4) { - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level); + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); MOD_DEC_USE_COUNT; return -EIO; } - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL); - if ((raid_conf = mddev->private) == NULL) + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) goto abort; - memset (raid_conf, 0, sizeof (*raid_conf)); - raid_conf->mddev = mddev; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) goto abort; - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - init_waitqueue_head(&raid_conf->wait_for_stripe); - PRINTK(("raid5_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); - continue; - } + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&conf->wait_for_stripe); + PRINTK("raid5_run(md%d) called.\n", mdidx(mddev)); + ITERATE_RDEV(mddev,rdev,tmp) { /* * This is important -- we are using the descriptor on * the disk only to get a pointer to the descriptor on * the main superblock, which might be more recent. */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev)); + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev)); - continue; + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); continue; } - if (raid_conf->disks[raid_disk].operational) { - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk); continue; } - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk); + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev; - raid_conf->disks[raid_disk].operational = 1; + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->operational = 1; + disk->used_slot = 1; - raid_conf->working_disks++; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev; - - raid_conf->disks[raid_disk].operational = 0; - raid_conf->disks[raid_disk].write_only = 0; - raid_conf->disks[raid_disk].spare = 1; - } - } - raid_conf->raid_disks = sb->raid_disks; - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks; - raid_conf->mddev = mddev; - raid_conf->chunk_size = sb->chunk_size; - raid_conf->level = sb->level; - raid_conf->algorithm = sb->parity_algorithm; - raid_conf->max_nr_stripes = NR_STRIPES; + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) { - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } } - if (!raid_conf->chunk_size || raid_conf->chunk_size % 4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor))); + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. + */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); goto abort; } - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor))); + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); goto abort; } - if (raid_conf->failed_disks > 1) { - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks); + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); goto abort; } - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) { - printk(KERN_ERR "raid5: detected raid-5 xor inconsistenty -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - goto abort; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && + check_consistency(mddev)) { + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); + sb->state &= ~(1 << MD_SB_CLEAN); } -#if SUPPORT_RECONSTRUCTION - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } } -#endif /* SUPPORT_RECONSTRUCTION */ - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) + - raid_conf->raid_disks * (sizeof(struct buffer_head) + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * (sizeof(struct buffer_head) + 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) { + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->disks[j].operational) + if (!conf->disks[j].operational) continue; - if (sb->disks[i].number == raid_conf->disks[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; if (sb->active_disks == sb->raid_disks) - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) { - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor))); - raid_conf->resync_parity = 1; -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); } + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + /* Ok, everything is just fine now */ return (0); abort: - if (raid_conf) { - if (raid_conf->stripe_hashtbl) - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); MOD_DEC_USE_COUNT; return -EIO; } -static int raid5_stop (int minor, struct md_dev *mddev) +static int raid5_stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid5_restart_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int raid5_stop (mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); - md_unregister_thread(raid_conf->thread); -#if SUPPORT_RECONSTRUCTION - md_unregister_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + shrink_stripe_cache(conf, conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static int raid5_status (char *page, int minor, struct md_dev *mddev) +#if RAID5_DEBUG +static void print_sh (struct stripe_head *sh) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + int i; + + printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd); + printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count)); + printk("sh %lu, ", sh->sector); + for (i = 0; i < MD_SB_DISKS; i++) { + if (sh->bh_old[i]) + printk("(old%d: %p) ", i, sh->bh_old[i]); + if (sh->bh_new[i]) + printk("(new%d: %p) ", i, sh->bh_new[i]); + if (sh->bh_copy[i]) + printk("(copy%d: %p) ", i, sh->bh_copy[i]); + if (sh->bh_req[i]) + printk("(req%d: %p) ", i, sh->bh_req[i]); + } + printk("\n"); + for (i = 0; i < MD_SB_DISKS; i++) + printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]); + printk("\n"); +} + +static void printall (raid5_conf_t *conf) +{ + struct stripe_head *sh; + int i; + + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + print_sh(sh); + } + } + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} +#endif + +static int raid5_status (char *page, mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; int sz = 0, i; - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm); - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_"); + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); +#if RAID5_DEBUG +#define D(x) \ + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) + D(nr_handle); + D(nr_stripes); + D(nr_hashed_stripes); + D(nr_locked_stripes); + D(nr_pending_stripes); + D(nr_cached_stripes); + D(nr_free_sh); + printall(conf); +#endif return sz; } -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state) +static void print_raid5_conf (raid5_conf_t *conf) { - int i = 0, failed_disk = -1; - struct raid5_data *raid_conf = mddev->private; - struct disk_info *disk = raid_conf->disks; - unsigned long flags; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; + int i; + struct disk_info *tmp; - for (i = 0; i < MD_SB_DISKS; i++, disk++) { - if (disk->spare && disk->number == spare->number) - goto found; + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; } - return 1; -found: - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) - if (!disk->operational) - failed_disk = i; - if (failed_disk == -1) - return 1; - save_flags(flags); - cli(); + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + print_raid5_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ switch (state) { - case SPARE_WRITE: - disk->operational = 1; - disk->write_only = 1; - raid_conf->spare = disk; - break; - case SPARE_INACTIVE: - disk->operational = 0; - disk->write_only = 0; - raid_conf->spare = NULL; - break; - case SPARE_ACTIVE: - disk->spare = 0; - disk->write_only = 0; - descriptor = &sb->disks[raid_conf->disks[failed_disk].number]; - i = spare->raid_disk; - disk->raid_disk = spare->raid_disk = descriptor->raid_disk; - if (disk->raid_disk != failed_disk) - printk("raid5: disk->raid_disk != failed_disk"); - descriptor->raid_disk = i; - - raid_conf->spare = NULL; - raid_conf->working_disks++; - raid_conf->failed_disks--; - raid_conf->disks[failed_disk] = *disk; - break; - default: - printk("raid5_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - restore_flags(flags); - return 0; + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid5_conf(conf); + return err; } -static struct md_personality raid5_personality= +static mdk_personality_t raid5_personality= { "raid5", raid5_map, @@ -1654,14 +2200,19 @@ NULL, /* no ioctls */ 0, raid5_error, - /* raid5_hot_add_disk, */ NULL, - /* raid1_hot_remove_drive */ NULL, - raid5_mark_spare + raid5_diskop, + raid5_stop_resync, + raid5_restart_resync }; int raid5_init (void) { - return register_md_personality (RAID5, &raid5_personality); + int err; + + err = register_md_personality (RAID5, &raid5_personality); + if (err) + return err; + return 0; } #ifdef MODULE --- linux/drivers/block/xor.c.orig Wed Apr 12 10:43:39 2000 +++ linux/drivers/block/xor.c Wed Apr 12 10:43:33 2000 @@ -0,0 +1,1894 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * + * + * optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#define BH_TRACE 0 +#include +#include +#ifdef __sparc_v9__ +#include +#include +#include +#endif + +/* + * we use the 'XOR function template' to register multiple xor + * functions runtime. The kernel measures their speed upon bootup + * and decides which one to use. (compile-time registration is + * not enough as certain CPU features like MMX can only be detected + * runtime) + * + * this architecture makes it pretty easy to add new routines + * that are faster on certain CPUs, without killing other CPU's + * 'native' routine. Although the current routines are belived + * to be the physically fastest ones on all CPUs tested, but + * feel free to prove me wrong and add yet another routine =B-) + * --mingo + */ + +#define MAX_XOR_BLOCKS 5 + +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) + +typedef void (*xor_block_t) XOR_ARGS; +xor_block_t xor_block = NULL; + +#ifndef __sparc_v9__ + +struct xor_block_template; + +struct xor_block_template { + char * name; + xor_block_t xor_block; + int speed; + struct xor_block_template * next; +}; + +struct xor_block_template * xor_functions = NULL; + +#define XORBLOCK_TEMPLATE(x) \ +static void xor_block_##x XOR_ARGS; \ +static struct xor_block_template t_xor_block_##x = \ + { #x, xor_block_##x, 0, NULL }; \ +static void xor_block_##x XOR_ARGS + +#ifdef __i386__ + +#ifdef CONFIG_X86_XMM +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +XORBLOCK_TEMPLATE(pIII_kni) +{ + char xmm_save[16*4]; + int cr0; + int lines = (bh_ptr[0]->b_size>>8); + + __asm__ __volatile__ ( + "movl %%cr0,%0 ;\n\t" + "clts ;\n\t" + "movups %%xmm0,(%1) ;\n\t" + "movups %%xmm1,0x10(%1) ;\n\t" + "movups %%xmm2,0x20(%1) ;\n\t" + "movups %%xmm3,0x30(%1) ;\n\t" + : "=r" (cr0) + : "r" (xmm_save) + : "memory" ); + +#define OFFS(x) "8*("#x"*2)" +#define PF0(x) \ + " prefetcht0 "OFFS(x)"(%1) ;\n" +#define LD(x,y) \ + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x,y) \ + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) \ + " prefetchnta "OFFS(x)"(%2) ;\n" +#define PF2(x) \ + " prefetchnta "OFFS(x)"(%3) ;\n" +#define PF3(x) \ + " prefetchnta "OFFS(x)"(%4) ;\n" +#define PF4(x) \ + " prefetchnta "OFFS(x)"(%5) ;\n" +#define PF5(x) \ + " prefetchnta "OFFS(x)"(%6) ;\n" +#define XO1(x,y) \ + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x,y) \ + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x,y) \ + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x,y) \ + " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x,y) \ + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. + * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. + */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + " : : + "i" (&((struct buffer_head *)0)->b_data), + "i" (&((struct buffer_head *)0)->b_data), + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), + "i" (FPRS_FEF), "i" (VISenter)); +} +#endif /* __sparc_v9__ */ + +#if defined(__sparc__) && !defined(__sparc_v9__) +/* + * High speed xor_block operation for RAID4/5 utilizing the + * ldd/std SPARC instructions. + * + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + */ + +XORBLOCK_TEMPLATE(SPARC) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1 = (long *) bh_ptr[1]->b_data; + long *source2, *source3, *source4; + + switch (count) { + case 2: + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + } + break; + case 4: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. + */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + *(destp + 7) ^= *(source4 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * platform independent RAID5 checksum calculation, this should + * be very fast on any platform that has a decent amount of + * registers. (32 or more) + */ +XORBLOCK_TEMPLATE(32regs) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + + /* LOTS of registers available... + We do explicite loop-unrolling here for code which + favours RISC machines. In fact this is almoast direct + RISC assembly on Alpha and SPARC :-) */ + + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + d0 ^= source4[0]; + d1 ^= source4[1]; + d2 ^= source4[2]; + d3 ^= source4[3]; + d4 ^= source4[4]; + d5 ^= source4[5]; + d6 ^= source4[6]; + d7 ^= source4[7]; + destp[0] = d0; /* Store the result (in burts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in burts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * (the -6*32 shift factor colors the cache) + */ +#define SIZE (PAGE_SIZE-6*32) + +static void xor_speed ( struct xor_block_template * func, + struct buffer_head *b1, struct buffer_head *b2) +{ + int speed; + unsigned long now; + int i, count, max; + struct buffer_head *bh_ptr[6]; + + func->next = xor_functions; + xor_functions = func; + bh_ptr[0] = b1; + bh_ptr[1] = b2; + + /* + * count the number of XORs done during a whole jiffy. + * calculate the speed of checksumming from this. + * (we use a 2-page allocation to have guaranteed + * color L1-cache layout) + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + func->xor_block(2,bh_ptr); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ*SIZE/1024); + func->speed = speed; + + printk( " %-10s: %5d.%03d MB/sec\n", func->name, + speed / 1000, speed % 1000); +} + +static inline void pick_fastest_function(void) +{ + struct xor_block_template *f, *fastest; + + fastest = xor_functions; + for (f = fastest; f; f = f->next) { + if (f->speed > fastest->speed) + fastest = f; + } +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + fastest = &t_xor_block_pIII_kni; + } +#endif + xor_block = fastest->xor_block; + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, + fastest->speed / 1000, fastest->speed % 1000); +} + +static struct buffer_head b1, b2; + +void calibrate_xor_block(void) +{ + memset(&b1,0,sizeof(b1)); + b2 = b1; + + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); + if (!b1.b_data) { + pick_fastest_function(); + return; + } + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; + + b1.b_size = SIZE; + + printk(KERN_INFO "raid5: measuring checksumming speed\n"); + + sti(); /* should be safe */ + +#if defined(__sparc__) && !defined(__sparc_v9__) + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); + xor_speed(&t_xor_block_SPARC,&b1,&b2); +#endif + +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + printk(KERN_INFO + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); + /* we force the use of the KNI xor block because it + can write around l2. we may also be able + to load into the l1 only depending on how + the cpu deals with a load to a line that is + being prefetched. + */ + xor_speed(&t_xor_block_pIII_kni,&b1,&b2); + } +#endif /* CONFIG_X86_XMM */ + +#ifdef __i386__ + + if (md_cpu_has_mmx()) { + printk(KERN_INFO + "raid5: MMX detected, trying high-speed MMX checksum routines\n"); + xor_speed(&t_xor_block_pII_mmx,&b1,&b2); + xor_speed(&t_xor_block_p5_mmx,&b1,&b2); + } + +#endif /* __i386__ */ + + + xor_speed(&t_xor_block_8regs,&b1,&b2); + xor_speed(&t_xor_block_32regs,&b1,&b2); + + free_pages((unsigned long)b1.b_data,2); + pick_fastest_function(); +} + +#else /* __sparc_v9__ */ + +void calibrate_xor_block(void) +{ + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); + xor_block = xor_block_VIS; +} + +#endif /* __sparc_v9__ */ + +MD_EXPORT_SYMBOL(xor_block); +