--- linux/fs/ext2/fsync.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/ext2/fsync.c Sun Jan 16 17:45:52 2000 @@ -60,7 +60,7 @@ return 0; } ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); + bput(bh); return 0; } --- linux/fs/ext2/truncate.c.orig Sun Jan 16 06:38:14 2000 +++ linux/fs/ext2/truncate.c Sun Jan 16 17:45:52 2000 @@ -104,26 +104,18 @@ { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); u32 * ind = (u32 *) bh->b_data; - int i, retry; - - /* Make sure both buffers are unlocked */ - do { - retry = 0; - if (buffer_locked(bh)) { - __wait_on_buffer(bh); - retry = 1; - } - if (ind_bh && buffer_locked(ind_bh)) { - __wait_on_buffer(ind_bh); - retry = 1; - } - } while (retry); + int i, retry = 0; + /* + * We do not have to wait for IO completion, proper handling + * of bforgotten buffers with IO on them is done by the + * buffer-cache and IO layer. + */ for (i = 0; i < addr_per_block; i++) if (*(ind++)) goto in_use; - if (atomic_read(&bh->b_count) == 1) { + if (bcount(bh) == 1) { int tmp; tmp = le32_to_cpu(*p); *p = 0; @@ -151,9 +143,6 @@ return retry; } -#define DATA_BUFFER_USED(bh) \ - (atomic_read(&bh->b_count) || buffer_locked(bh)) - static int trunc_direct (struct inode * inode) { int i, retry = 0; @@ -225,7 +214,8 @@ for (i = indirect_block ; i < addr_per_block ; i++) { u32 * ind = i + (u32 *) ind_bh->b_data; - wait_on_buffer(ind_bh); + if (!buffer_uptodate(ind_bh)) + wait_on_buffer(ind_bh); tmp = le32_to_cpu(*ind); if (!tmp) continue; --- linux/fs/ext2/inode.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/ext2/inode.c Sun Jan 16 17:45:52 2000 @@ -204,6 +204,11 @@ if (tmp) { if (metadata) { result = getblk (inode->i_dev, tmp, blocksize); + if (!buffer_uptodate(result)) { + ll_rw_block (READ, 1, &result); + if (!buffer_uptodate(result)) + wait_on_buffer(result); + } if (tmp == le32_to_cpu(*p)) return result; brelse (result); @@ -255,8 +260,6 @@ } if (metadata) { result = getblk (inode->i_dev, tmp, blocksize); - if (!buffer_uptodate(result)) - 
wait_on_buffer(result); memset(result->b_data, 0, blocksize); mark_buffer_uptodate(result, 1); mark_buffer_dirty(result, 1); @@ -318,7 +321,8 @@ goto out; if (!buffer_uptodate(bh)) { ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (!buffer_uptodate(bh)) goto out; } @@ -328,6 +332,11 @@ if (tmp) { if (metadata) { result = getblk (bh->b_dev, tmp, blocksize); + if (!buffer_uptodate(result)) { + ll_rw_block (READ, 1, &result); + if (!buffer_uptodate(result)) + wait_on_buffer(result); + } if (tmp == le32_to_cpu(*p)) goto out; brelse (result); @@ -366,8 +375,6 @@ goto out; if (metadata) { result = getblk (bh->b_dev, tmp, blocksize); - if (!buffer_uptodate(result)) - wait_on_buffer(result); memset(result->b_data, 0, inode->i_sb->s_blocksize); mark_buffer_uptodate(result, 1); mark_buffer_dirty(result, 1); @@ -602,7 +609,8 @@ if (buffer_uptodate(bh)) return bh; ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (buffer_uptodate(bh)) return bh; brelse (bh); --- linux/fs/ext2/namei.c.orig Sun Jan 16 11:32:27 2000 +++ linux/fs/ext2/namei.c Sun Jan 16 17:45:52 2000 @@ -104,7 +104,8 @@ offset += sb->s_blocksize; continue; } - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (!buffer_uptodate(bh)) { /* * read error: all bets are off --- linux/fs/partitions/check.c.orig Mon Aug 30 19:24:14 1999 +++ linux/fs/partitions/check.c Sun Jan 16 17:45:52 2000 @@ -18,6 +18,7 @@ #include #include #include +#include #include "check.h" @@ -323,6 +324,12 @@ else #endif rd_load(); +#endif +#ifdef CONFIG_BLK_DEV_MD + { + extern void autodetect_raid(void); + autodetect_raid(); + } #endif #ifdef CONFIG_MD_BOOT md_setup_drive(); --- linux/fs/buffer.c.orig Sun Jan 16 06:38:25 2000 +++ linux/fs/buffer.c Sun Jan 16 17:45:52 2000 @@ -28,6 +28,12 @@ /* async buffer flushing, 1999 Andrea Arcangeli */ +/* + * Integrated buffer and page cache, improved buffer freeing, + * 
+ * 2000 Ingo Molnar + */ + #include #include #include @@ -60,7 +66,7 @@ #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) #define NR_RESERVED (2*MAX_BUF_PER_PAGE) -#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this number of unused buffer heads */ /* Anti-deadlock ordering: @@ -77,12 +83,14 @@ static struct buffer_head *lru_list[NR_LIST]; static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; + static int nr_buffers_type[NR_LIST] = {0,}; static unsigned long size_buffers_type[NR_LIST] = {0,}; static struct buffer_head * unused_list = NULL; static int nr_unused_buffer_heads = 0; static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; + static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); struct bh_free_head { @@ -94,6 +102,7 @@ kmem_cache_t *bh_cachep; static int grow_buffers(int size); +static int __try_to_free_buffers(struct page * page, int priority); /* This is used by some architectures to estimate available memory. 
*/ atomic_t buffermem_pages = ATOMIC_INIT(0); @@ -109,7 +118,7 @@ */ union bdflush_param { struct { - int nfract; /* Percentage of buffer cache dirty to + int nfract; /* Percentage of buffer cache dirty to activate bdflush */ int ndirty; /* Maximum number of dirty blocks to write out per wake-cycle */ @@ -141,21 +150,20 @@ */ void __wait_on_buffer(struct buffer_head * bh) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(wait, current); - atomic_inc(&bh->b_count); + bget(bh); add_wait_queue(&bh->b_wait, &wait); repeat: run_task_queue(&tq_disk); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { schedule(); goto repeat; } - tsk->state = TASK_RUNNING; + current->state = TASK_RUNNING; remove_wait_queue(&bh->b_wait, &wait); - atomic_dec(&bh->b_count); + bput(bh); } /* Call sync_buffers with wait!=0 to ensure that the call does not @@ -193,22 +201,27 @@ for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { next = bh->b_next_free; - if (!lru_list[BUF_DIRTY]) + bget(bh); + if (!lru_list[BUF_DIRTY]) { + bput(bh); break; - if (dev && bh->b_dev != dev) + } + if (dev && bh->b_dev != dev) { + bput(bh); continue; + } if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1; + bput(bh); continue; } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); - atomic_dec(&bh->b_count); + bput(bh); goto repeat; } @@ -218,20 +231,22 @@ if (wait && buffer_req(bh) && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_uptodate(bh)) { err = -EIO; + bput(bh); continue; } /* Don't write clean buffers. Don't write ANY buffers * on the third pass. 
*/ - if (!buffer_dirty(bh) || pass >= 2) + if (!buffer_dirty(bh) || pass >= 2) { + bput(bh); continue; + } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); retry = 1; + bput(bh); goto repeat; } @@ -248,21 +263,23 @@ break; if (dev && bh->b_dev != dev) continue; + bget(bh); if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1; + bput(bh); continue; } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); spin_lock(&lru_list_lock); - atomic_dec(&bh->b_count); + bput(bh); goto repeat2; } + bput(bh); } spin_unlock(&lru_list_lock); @@ -317,7 +334,7 @@ /* * filp may be NULL if called via the msync of a vma. */ - + int file_fsync(struct file *filp, struct dentry *dentry) { struct inode * inode = dentry->d_inode; @@ -412,39 +429,6 @@ return err; } -void invalidate_buffers(kdev_t dev) -{ - int nlist; - - spin_lock(&lru_list_lock); - for(nlist = 0; nlist < NR_LIST; nlist++) { - struct buffer_head * bh; - int i; - retry: - bh = lru_list[nlist]; - if (!bh) - continue; - for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) { - if (bh->b_dev != dev) - continue; - if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); - spin_unlock(&lru_list_lock); - wait_on_buffer(bh); - spin_lock(&lru_list_lock); - atomic_dec(&bh->b_count); - goto retry; - } - if (atomic_read(&bh->b_count)) - continue; - clear_bit(BH_Protected, &bh->b_state); - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Dirty, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - } - } - spin_unlock(&lru_list_lock); -} /* After several hours of tedious analysis, the following hash * function won. Do not mess with it... 
-DaveM @@ -456,6 +440,10 @@ static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) { + if (bh->b_dev == B_FREE) + BUG(); + if (bh->b_pprev) + BUG(); if ((bh->b_next = *head) != NULL) bh->b_next->b_pprev = &bh->b_next; *head = bh; @@ -464,12 +452,35 @@ static __inline__ void __hash_unlink(struct buffer_head *bh) { + if (!bh->b_pprev) { + if (test_bit(BH_Mapped, &bh->b_state)) + BH_BUG(bh); + return; + } if (bh->b_next) bh->b_next->b_pprev = bh->b_pprev; *(bh->b_pprev) = bh->b_next; bh->b_pprev = NULL; + clear_bit(BH_Mapped, &bh->b_state); } +static inline struct buffer_head * __get_hash_table(struct buffer_head **head, kdev_t dev, int block, int size) +{ + struct buffer_head *bh; + + for (bh = *head; bh; bh = bh->b_next) + if (bh->b_blocknr == block && + bh->b_size == size && + bh->b_dev == dev) + break; + if (bh) { + bget(bh); + if (!buffer_mapped(bh)) + BH_BUG(bh); + } + + return bh; +} static void __insert_into_lru_list(struct buffer_head * bh, int blist) { struct buffer_head **bhp = &lru_list[blist]; @@ -488,9 +499,17 @@ static void __remove_from_lru_list(struct buffer_head * bh, int blist) { + if (bh->b_dev == B_FREE) + BUG(); if (bh->b_prev_free || bh->b_next_free) { - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; + if (!bh->b_prev_free) + BH_BUG(bh); + else + bh->b_prev_free->b_next_free = bh->b_next_free; + if (!bh->b_next_free) + BH_BUG(bh); + else + bh->b_next_free->b_prev_free = bh->b_prev_free; if (lru_list[blist] == bh) lru_list[blist] = bh->b_next_free; if (lru_list[blist] == bh) @@ -503,11 +522,19 @@ static void __remove_from_free_list(struct buffer_head * bh, int index) { + if (bh->b_dev != B_FREE) + BUG(); if(bh->b_next_free == bh) free_list[index].list = NULL; else { - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; + if (!bh->b_prev_free) + BH_BUG(bh); + else + bh->b_prev_free->b_next_free = bh->b_next_free; + if 
(!bh->b_next_free) + BH_BUG(bh); + else + bh->b_next_free->b_prev_free = bh->b_prev_free; if (free_list[index].list == bh) free_list[index].list = bh->b_next_free; } @@ -518,49 +545,192 @@ * because they control the visibility of a buffer head * to the rest of the kernel. */ -static __inline__ void __remove_from_queues(struct buffer_head *bh) +static inline void __remove_from_queues(struct buffer_head *bh) { - write_lock(&hash_table_lock); - if (bh->b_pprev) - __hash_unlink(bh); + __hash_unlink(bh); __remove_from_lru_list(bh, bh->b_list); +} + +static inline struct buffer_head * insert_into_queues_atomic (struct buffer_head *bh, int lock) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr), *alias; + + if (!buffer_mapped(bh)) + BUG(); + +repeat: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + alias = __get_hash_table(head, bh->b_dev, bh->b_blocknr, bh->b_size); + if (alias) { + if (lock) { + if (test_and_set_bit(BH_Lock, &alias->b_state)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + wait_on_buffer(alias); + brelse(alias); + goto repeat; + } + } + bh = alias; + } else { + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); + } write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + + return bh; } -static void insert_into_queues(struct buffer_head *bh) +void insert_into_queues_exclusive (struct buffer_head *bh) { - struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr), *alias; + + if (!buffer_mapped(bh)) + BUG(); + if (!buffer_locked(bh)) + BUG(); spin_lock(&lru_list_lock); write_lock(&hash_table_lock); + alias = __get_hash_table(head, bh->b_dev, bh->b_blocknr, bh->b_size); + if (alias) { + int gotlock = 1; + /* + * If IO is going on for this bh we have to + * synchronize with it, but only if it's not + * an invalidated buffer. 
+ * + * SUBTLE: the fact that we are atomically testing BH_Req + * in ll_rw_block ensures that we cannot accidentally + * write/read this new bh before the old IO finishes. + */ + clear_bit(BH_Req, &alias->b_state); + clear_bit(BH_Uptodate, &alias->b_state); + clear_bit(BH_Dirty, &alias->b_state); + if (test_and_set_bit(BH_Lock, &alias->b_state)) + gotlock = 0; + if (!alias->b_pprev) + BH_BUG(alias); + __remove_from_queues(alias); + if (gotlock) + clear_bit(BH_Lock, &alias->b_state); + bput(alias); + } __hash_link(bh, head); __insert_into_lru_list(bh, bh->b_list); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); } -/* This function must only run if there are no other - * references _anywhere_ to this buffer head. +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). */ -static void put_last_free(struct buffer_head * bh) +static struct buffer_head * __get_unused_bh(int async) { - struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; - struct buffer_head **bhp = &head->list; + struct buffer_head * bh; - spin_lock(&head->lock); - bh->b_dev = B_FREE; - if(!*bhp) { - *bhp = bh; - bh->b_prev_free = bh; + /* + * It's a common case that the unused list is empty, + * thus this 'unsafe' optimization. (we read the counter + * without the spinlock held) If we are really low on + * buffer heads, then we'll re-check the counter with the + * spinlock held anyway, in the async case. + */ + if (nr_unused_buffer_heads > NR_RESERVED) { + spin_lock(&unused_list_lock); +reserve_async: + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; } - bh->b_next_free = *bhp; - bh->b_prev_free = (*bhp)->b_prev_free; - (*bhp)->b_prev_free->b_next_free = bh; - (*bhp)->b_prev_free = bh; - spin_unlock(&head->lock); + + /* This is critical. 
We can't swap out pages to get + * more buffer heads, because the swap-out may need + * more buffer-heads itself. Thus SLAB_BUFFER. + */ + bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + if (bh) { + memset(bh, 0, sizeof(*bh)); + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) + goto reserve_async; + spin_unlock(&unused_list_lock); + } +#if 0 + /* + * (Pending further analysis ...) + * Ordinary (non-async) requests can use a different memory priority + * to free up pages. Any swapping thus generated will use async + * buffer heads. + */ + if(!async && + (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } +#endif + return NULL; } /* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static void __put_unused_bh(struct buffer_head * bh) +{ + if (!PageLocked(bh->b_page)) + BUG(); + if (bh->b_pprev) + BH_BUG(bh); + if (bh->b_prev_free || bh->b_next_free) + BH_BUG(bh); + if (bcount(bh) || test_bit(BH_Lock, &bh->b_state)) + BH_BUG(bh); + if (waitqueue_active(&bh->b_wait)) + BH_BUG(bh); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + memset(bh, 0x77, sizeof(*bh)); + kmem_cache_free(bh_cachep, bh); + } else { + memset(bh, 0x55, sizeof(*bh)); + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + bh->b_this_page = NULL; + unused_list = bh; + } +} + +struct buffer_head * get_unused_bh (void) +{ + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + memset(bh, 0, sizeof(*bh)); + return bh; +} + +void put_unused_bh(struct buffer_head * bh) +{ + kmem_cache_free(bh_cachep, bh); +} +/* * Why like this, I hear you say... The reason is race-conditions. 
* As we don't lock buffers (unless we are reading them, that is), * something might happen to it while we sleep (ie a read-error @@ -573,13 +743,7 @@ struct buffer_head *bh; read_lock(&hash_table_lock); - for(bh = *head; bh; bh = bh->b_next) - if (bh->b_blocknr == block && - bh->b_size == size && - bh->b_dev == dev) - break; - if (bh) - atomic_inc(&bh->b_count); + bh = __get_hash_table(head, dev, block, size); read_unlock(&hash_table_lock); return bh; @@ -604,6 +768,43 @@ return 0; } +static int __unlink_drop_bh (struct buffer_head *bh) +{ + struct page *page = bh_page(bh); + int freed = 0; + + + if (test_bit(BH_Lock, &bh->b_state)) + BH_BUG(bh); + + __remove_from_queues(bh); + if (!TryLockPage(page)) { + /* + * Safe because the page must have buffers and we just + * managed to lock it. + */ + get_page(page); + if (__try_to_free_buffers(page, 0)) { + /* + * We never remove the mapping prior to removing + * page->buffers. This means that !page->mappings + * are pure buffer-cache pages. + */ + if (!page->mapping) { + if (!page->lru.next && !page->lru.prev) + BUG(); + else + lru_cache_del(page); + atomic_dec(&buffermem_pages); + } + freed = 1; + } + UnlockPage(page); + put_page(page); + } + return freed; +} + void set_blocksize(kdev_t dev, int size) { extern int *blksize_size[]; @@ -630,39 +831,97 @@ * around on the free list, and we can get in a loop if we are not careful. */ for(nlist = 0; nlist < NR_LIST; nlist++) { - repeat: + /* + * Contrary to sync() here we also have to lock the + * hash table, because we might unhash entries. + * setblocksize() is a rare operation so this is not + * a real performance problem. 
+ */ spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); +repeat: bh = lru_list[nlist]; for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { if(!bh) break; - bhnext = bh->b_next_free; + bhnext = bh->b_next_free; if (bh->b_dev != dev) continue; if (bh->b_size == size) continue; - if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + bget(bh); + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); wait_on_buffer(bh); - atomic_dec(&bh->b_count); + bput(bh); goto repeat; } + bput(bh); if (bh->b_dev == dev && bh->b_size != size) { clear_bit(BH_Dirty, &bh->b_state); clear_bit(BH_Uptodate, &bh->b_state); clear_bit(BH_Req, &bh->b_state); - } - if (atomic_read(&bh->b_count) == 0) { - __remove_from_queues(bh); - put_last_free(bh); + clear_bit(BH_Lock, &bh->b_state); + if (!bcount(bh)) + /* + * Careful, if we freed a page then + * we might have freed bhnext as + * well. + */ + if (__unlink_drop_bh(bh)) + goto repeat; } } + write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); } } +void invalidate_buffers (kdev_t dev) +{ + int nlist; + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + + for (nlist = 0; nlist < NR_LIST; nlist++) { + struct buffer_head *bh, *bhnext; + int i; + retry: + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { + bhnext = bh->b_next_free; + if (bh->b_dev != dev) + continue; + bget(bh); + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + bput(bh); + goto retry; + } + bput(bh); + clear_bit(BH_Lock, &bh->b_state); + if (bcount(bh)) + continue; + clear_bit(BH_Protected, &bh->b_state); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + if (__unlink_drop_bh(bh)) + goto retry; + } + } + 
write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} /* * We used to try various strange things. Let's not. */ @@ -680,22 +939,23 @@ bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_dev_id = dev_id; + bh->b_rdev = MKDEV(0,0); } -static void end_buffer_io_sync(struct buffer_head *bh, int uptodate) +void end_buffer_io_sync(struct buffer_head *bh, int uptodate) { mark_buffer_uptodate(bh, uptodate); unlock_buffer(bh); } -static void end_buffer_io_bad(struct buffer_head *bh, int uptodate) +void end_buffer_io_bad(struct buffer_head *bh, int uptodate) { mark_buffer_uptodate(bh, uptodate); unlock_buffer(bh); BUG(); } -static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +static void end_buffer_io_page(struct buffer_head * bh, int uptodate) { static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; unsigned long flags; @@ -724,10 +984,10 @@ */ spin_lock_irqsave(&page_uptodate_lock, flags); unlock_buffer(bh); - atomic_dec(&bh->b_count); + bh->b_end_io = end_buffer_io_sync; tmp = bh->b_this_page; while (tmp != bh) { - if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) + if (tmp->b_end_io == end_buffer_io_page && test_bit(BH_Lock, &tmp->b_state)) goto still_busy; tmp = tmp->b_this_page; } @@ -767,13 +1027,24 @@ * 14.02.92: changed it to sync dirty buffers a bit: better performance * when the filesystem starts to get full of dirty blocks (I hope). 
*/ -struct buffer_head * getblk(kdev_t dev, int block, int size) +static inline struct buffer_head * __getblk (kdev_t dev, int block, int size, int lock) { - struct buffer_head * bh; + struct buffer_head **head = &hash(dev, block); + struct buffer_head *bh, *tmp; int isize; repeat: - bh = get_hash_table(dev, block, size); + write_lock(&hash_table_lock); + bh = __get_hash_table(head, dev, block, size); + if (bh && lock) { + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); + wait_on_buffer(bh); + brelse(bh); + goto repeat; + } + } + write_unlock(&hash_table_lock); if (bh) goto out; @@ -782,7 +1053,9 @@ bh = free_list[isize].list; if (bh) { __remove_from_free_list(bh, isize); - atomic_set(&bh->b_count, 1); + if (bcount(bh)) + BH_BUG(bh); + bget(bh); } spin_unlock(&free_list[isize].lock); if (!bh) @@ -794,10 +1067,23 @@ init_buffer(bh, end_buffer_io_sync, NULL); bh->b_dev = dev; bh->b_blocknr = block; - bh->b_state = 1 << BH_Mapped; + set_bit(BH_Mapped, &bh->b_state); + if (lock) { + set_bit(BH_Lock, &bh->b_state); + } - /* Insert the buffer into the regular lists */ - insert_into_queues(bh); + /* + * Insert the buffer into the regular lists, handle + * the case where someone else added a bh while we + * were allocating. 
+ */ + tmp = insert_into_queues_atomic(bh, lock); + if (tmp != bh) { + clear_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + bforget(bh); + bh = tmp; + } goto out; /* @@ -811,9 +1097,21 @@ return bh; } -/* -1 -> no need to flush - 0 -> async flush - 1 -> sync flush (wait for I/O completation) */ +struct buffer_head * getblk (kdev_t dev, int block, int size) +{ + return __getblk(dev, block, size, 0); +} + +struct buffer_head * getblk_lock (kdev_t dev, int block, int size) +{ + return __getblk(dev, block, size, 1); +} + +/* + * -1 -> no need to flush + * 0 -> async flush + * 1 -> sync flush (wait for I/O completation) + */ static int balance_dirty_state(kdev_t dev) { unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; @@ -851,7 +1149,6 @@ static inline void __mark_dirty(struct buffer_head *bh, int flag) { bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer); - clear_bit(BH_New, &bh->b_state); refile_buffer(bh); } @@ -867,6 +1164,7 @@ static __inline__ void __refile_buffer(struct buffer_head *bh) { int dispose = BUF_CLEAN; + if (buffer_locked(bh)) dispose = BUF_LOCKED; if (buffer_dirty(bh)) @@ -892,11 +1190,35 @@ { touch_buffer(buf); - if (atomic_read(&buf->b_count)) { - atomic_dec(&buf->b_count); + if (bcount(buf)) { + bput(buf); return; } - printk("VFS: brelse: Trying to free free buffer\n"); + printk("brelse: Trying to free free buffer\n"); + BUG(); +} + +static inline void __bforget_generic (struct buffer_head * buf, + int destroy_dirty) +{ + /* + * Grab the lru lock here to block bdflush. 
+ */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!bput_and_test(buf) || test_bit(BH_Lock, &buf->b_state)) { + goto in_use; + } + if (!destroy_dirty && test_bit(BH_Dirty, &buf->b_state)) { + goto in_use; + } + clear_bit(BH_Uptodate, &buf->b_state); + clear_bit(BH_Req, &buf->b_state); + clear_bit(BH_Dirty, &buf->b_state); + __unlink_drop_bh(buf); +in_use: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } /* @@ -907,23 +1229,17 @@ */ void __bforget(struct buffer_head * buf) { - /* grab the lru lock here to block bdflush. */ - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); - if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) - goto in_use; - if (buf->b_pprev) - __hash_unlink(buf); - write_unlock(&hash_table_lock); - __remove_from_lru_list(buf, buf->b_list); - spin_unlock(&lru_list_lock); - buf->b_state = 0; - put_last_free(buf); - return; + __bforget_generic(buf, 1); +} - in_use: - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); +/* + * bdrop() frees the buffer if it does not result in information + * loss (ie. the buffer is not dirty). bdrop() is basically a + * 'free behind' brelse(). + */ +void __bdrop(struct buffer_head * buf) +{ + __bforget_generic(buf, 0); } /* @@ -938,7 +1254,13 @@ if (buffer_uptodate(bh)) return bh; ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + /* + * ll_rw_block might have slept and someone else might have + * requested the buffer meanwhile - thus re-check the uptodate + * flag. 
+ */ + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); @@ -972,22 +1294,22 @@ index = BUFSIZE_INDEX(bh->b_size); if (buffer_uptodate(bh)) - return(bh); + return(bh); else ll_rw_block(READ, 1, &bh); blocks = (filesize - pos) >> (9+index); if (blocks < (read_ahead[MAJOR(dev)] >> index)) blocks = read_ahead[MAJOR(dev)] >> index; - if (blocks > NBUF) + if (blocks > NBUF) blocks = NBUF; /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ bhlist[0] = bh; j = 1; - for(i=1; i1) - ll_rw_block(READA, (j-1), bhlist+1); - for(i=1; i 1) + ll_rw_block(READA, (j-1), bhlist+1); + for (i = 1; i < j; i++) brelse(bhlist[i]); /* Wait for this buffer, and then continue on. */ bh = bhlist[0]; - wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); return NULL; } -/* - * Note: the caller should wake up the buffer_wait list if needed. - */ -static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) -{ - if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { - kmem_cache_free(bh_cachep, bh); - } else { - bh->b_blocknr = -1; - init_waitqueue_head(&bh->b_wait); - nr_unused_buffer_heads++; - bh->b_next_free = unused_list; - bh->b_this_page = NULL; - unused_list = bh; - } -} - -/* - * Reserve NR_RESERVED buffer heads for async IO requests to avoid - * no-buffer-head deadlock. Return NULL on failure; waiting for - * buffer heads is now handled in create_buffers(). - */ -static struct buffer_head * get_unused_buffer_head(int async) -{ - struct buffer_head * bh; - - spin_lock(&unused_list_lock); - if (nr_unused_buffer_heads > NR_RESERVED) { - bh = unused_list; - unused_list = bh->b_next_free; - nr_unused_buffer_heads--; - spin_unlock(&unused_list_lock); - return bh; - } - spin_unlock(&unused_list_lock); - - /* This is critical. We can't swap out pages to get - * more buffer heads, because the swap-out may need - * more buffer-heads itself. Thus SLAB_BUFFER. 
- */ - if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); - return bh; - } - - /* - * If we need an async buffer, use the reserved buffer heads. - */ - if (async) { - spin_lock(&unused_list_lock); - if (unused_list) { - bh = unused_list; - unused_list = bh->b_next_free; - nr_unused_buffer_heads--; - spin_unlock(&unused_list_lock); - return bh; - } - spin_unlock(&unused_list_lock); - } -#if 0 - /* - * (Pending further analysis ...) - * Ordinary (non-async) requests can use a different memory priority - * to free up pages. Any swapping thus generated will use async - * buffer heads. - */ - if(!async && - (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); - return bh; - } -#endif - - return NULL; -} - void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) { bh->b_page = page; @@ -1109,49 +1354,66 @@ * buffers. * The async flag is used to differentiate async IO (paging, swapping) * from ordinary buffer allocations, and only async requests are allowed - * to sleep waiting for buffer heads. + * to sleep waiting for buffer heads. 
*/ static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) { - struct buffer_head *bh, *head; + struct buffer_head *bh, *head, *tail; long offset; try_again: - head = NULL; + head = tail = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = get_unused_buffer_head(async); + bh = __get_unused_bh(async); if (!bh) goto no_grow; - bh->b_dev = B_FREE; /* Flag as unused */ - bh->b_this_page = head; - head = bh; - + bh->b_next = NULL; + bh->b_blocknr = -1; + bh->b_size = size; + bh->b_list = BUF_CLEAN; + bh->b_dev = MKDEV(0,0); /* Flag as unused */ + bh->b_rdev = MKDEV(0,0); /* Flag as unused */ bh->b_state = 0; - bh->b_next_free = NULL; + bh->b_flushtime = 0; + + bh->b_next_free = bh->b_prev_free = NULL; + bh->b_this_page = head; + bh->b_reqnext = NULL; bh->b_pprev = NULL; - atomic_set(&bh->b_count, 0); - bh->b_size = size; set_bh_page(bh, page, offset); - bh->b_list = BUF_CLEAN; bh->b_end_io = end_buffer_io_bad; + bh->b_dev_id = NULL; + bh->b_rsector = -1; + init_waitqueue_head(&bh->b_wait); + + bh_set(bh, 0); + + if (!tail) + tail = bh; + head = bh; } + tail->b_this_page = head; return head; /* * In case anything failed, we just free everything we got. */ no_grow: if (head) { + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); do { bh = head; head = head->b_this_page; - __put_unused_buffer_head(bh); + __put_unused_bh(bh); } while (head); spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); /* Wake up any waiters ... */ wake_up(&buffer_wait); @@ -1160,7 +1422,7 @@ /* * Return failure for non-async IO requests. Async IO requests * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with + * become available. But we don't want tasks sleeping with * partially complete buffers, so all were released above. */ if (!async) @@ -1169,12 +1431,12 @@ /* We're _really_ low on memory. 
Now we just * wait for old buffer heads to become free due to * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are + * the reserve list is empty, we're sure there are * async buffer heads in use. */ run_task_queue(&tq_disk); - /* + /* * Set our state for sleeping, then check again for buffer heads. * This ensures we won't miss a wake_up from an interrupt. */ @@ -1182,7 +1444,19 @@ goto try_again; } -static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) + +static void __set_page_buffers(struct page *page, struct buffer_head *head) +{ + if (head && page->buffers) + BUG(); + if (!head && !page->buffers) + BUG(); + if (!PageLocked(page)) + BUG(); + page->buffers = head; +} + +static __inline__ int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) { struct buffer_head *head, *bh, *tail; int block; @@ -1199,32 +1473,38 @@ BUG(); if (!head) BUG(); - tail = head; - for (bh = head; bh; bh = bh->b_this_page) { + bh = head; + do { block = *(b++); tail = bh; - init_buffer(bh, end_buffer_io_async, NULL); + init_buffer(bh, end_buffer_io_page, NULL); bh->b_dev = dev; bh->b_blocknr = block; - + if (!block) + BUG(); set_bit(BH_Mapped, &bh->b_state); - } - tail->b_this_page = head; + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + bh = bh->b_this_page; + } while (bh != head); + + if (tail->b_this_page != head) + BUG(); get_page(page); - page->buffers = head; + __set_page_buffers(page, head); + return 0; } -static void unmap_buffer(struct buffer_head * bh) +static __inline__ void unmap_buffer (struct buffer_head * bh) { - if (buffer_mapped(bh)) - { - mark_buffer_clean(bh); - wait_on_buffer(bh); + if (!PageLocked(bh->b_page)) + BUG(); + if (buffer_mapped(bh)) { clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Mapped, &bh->b_state); clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); } } @@ -1271,10 +1551,19 @@ * instead. 
*/ if (!offset) { - if (!try_to_free_buffers(page)) { - atomic_inc(&buffermem_pages); + /* + * Dont be too agressive dropping cached bhs on + * the same page. + */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!__try_to_free_buffers(page, 0)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); return 0; } + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } return 1; @@ -1288,37 +1577,21 @@ if (page->buffers) BUG(); - bh = head; + bh = tail = head; do { bh->b_dev = inode->i_dev; bh->b_blocknr = 0; bh->b_end_io = end_buffer_io_bad; + bh_set(bh, 0); tail = bh; bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - page->buffers = head; + } while (bh != head); + if (tail->b_this_page != head) + BUG(); + __set_page_buffers(page, head); get_page(page); } -static void unmap_underlying_metadata(struct buffer_head * bh) -{ -#if 0 - if (buffer_new(bh)) { - struct buffer_head *old_bh; - - old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); - if (old_bh) { - unmap_buffer(old_bh); - /* Here we could run brelse or bforget. We use - bforget because it will try to put the buffer - in the freelist. */ - __bforget(old_bh); - } - } -#endif -} - /* * block_write_full_page() is SMP-safe - currently it's still * being called with the kernel lock held, but the code is ready. @@ -1347,6 +1620,7 @@ do { if (!bh) BUG(); + bget(bh); /* * If the buffer isn't up-to-date, we can't be sure @@ -1356,15 +1630,24 @@ * Leave it to the low-level FS to make all those * decisions (block #0 may actually be a valid block) */ - bh->b_end_io = end_buffer_io_sync; if (!buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_sync; err = inode->i_op->get_block(inode, block, bh, 1); - if (err) + clear_bit(BH_New, &bh->b_state); + if (err) { + bput(bh); goto out; - unmap_underlying_metadata(bh); + } + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + /* + * bdflush will take care of it. 
+ */ + clear_bit(BH_Lock, &bh->b_state); } set_bit(BH_Uptodate, &bh->b_state); mark_buffer_dirty(bh,0); + bput(bh); bh = bh->b_this_page; block++; @@ -1385,7 +1668,7 @@ unsigned long block; int err = 0, partial = 0, need_balance_dirty = 0; unsigned blocksize, bbits; - struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait, *tmp; char *kaddr = (char *)kmap(page); blocksize = inode->i_sb->s_blocksize; @@ -1403,41 +1686,68 @@ */ for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { + int newblock; if (!bh) BUG(); - block_end = block_start+blocksize; + block_end = block_start + blocksize; if (block_end <= zerofrom) continue; if (block_start >= to) break; - bh->b_end_io = end_buffer_io_sync; + bget(bh); + newblock = 0; if (!buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_sync; err = inode->i_op->get_block(inode, block, bh, 1); - if (err) + /* + * We have to clear the New bit before inserting + * into the hash, otherwise bdflush and other + * external cache managers might see and do IO to it. + */ + newblock = test_and_clear_bit(BH_New, &bh->b_state); + if (err) { + while(wait_bh > wait) { + tmp = *--wait_bh; + bput(tmp); + } + bput(bh); goto out; - unmap_underlying_metadata(bh); + } + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + /* + * bdflush will write it out. + */ + clear_bit(BH_Lock, &bh->b_state); } - if (buffer_new(bh)) { + if (newblock) { zeroto = block_end; if (block_start < zerofrom) zerofrom = block_start; + bput(bh); continue; } if (!buffer_uptodate(bh) && (block_start < zerofrom || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; - } + } else + bput(bh); } /* * If we issued read requests - let them complete. 
*/ + err = 0; while(wait_bh > wait) { - wait_on_buffer(*--wait_bh); - err = -EIO; - if (!buffer_uptodate(*wait_bh)) - goto out; + tmp = *--wait_bh; + if (!buffer_uptodate(tmp)) + wait_on_buffer(tmp); + if (!buffer_uptodate(tmp)) + err = -EIO; + bput(tmp); } + if (err) + goto out; /* * Now we can copy the data. */ @@ -1470,6 +1780,7 @@ bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; + bget(bh); if (block_end <= zerofrom || block_start >= zeroto) { if (!buffer_uptodate(bh)) partial = 1; @@ -1480,6 +1791,7 @@ need_balance_dirty = 1; } } + bput(bh); } if (need_balance_dirty) @@ -1537,7 +1849,6 @@ return err ? err : bytes; } - /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. @@ -1580,6 +1891,8 @@ kiobuf_wait_for_io(kiobuf); + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); iosize = 0; @@ -1592,10 +1905,12 @@ amount of IO before the first error. */ iosize = 0; } - __put_unused_buffer_head(tmp); + __put_unused_bh(tmp); } spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); if (iosize) return iosize; @@ -1663,7 +1978,7 @@ while (length > 0) { blocknr = b[bufind++]; - tmp = get_unused_buffer_head(0); + tmp = get_unused_bh(); if (!tmp) { err = -ENOMEM; goto error; @@ -1726,11 +2041,14 @@ error: /* We got an error allocating the bh'es. Just free the current buffer_heads and exit. */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); - for (i = bhind; --i >= 0; ) { - __put_unused_buffer_head(bh[bhind]); - } + for (i = bhind; --i >= 0; ) + __put_unused_bh(bh[i]); spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); goto finished; } @@ -1744,7 +2062,7 @@ * kernel lock held - but the code is ready.
* * FIXME: we need a swapper_inode->get_block function to remove - * some of the bmap kludges and interface ugliness here. + * some of the bmap kludges and interface ugliness here. */ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) { @@ -1753,7 +2071,6 @@ if (!PageLocked(page)) panic("brw_page: page not locked for I/O"); -// clear_bit(PG_error, &page->flags); /* * We pretty much rely on the page lock for this, because * create_page_buffers() might sleep. @@ -1772,20 +2089,24 @@ do { block = *(b++); - if (fresh && (atomic_read(&bh->b_count) != 0)) + if (fresh && bcount(bh)) BUG(); + bget(bh); if (rw == READ) { if (!fresh) BUG(); if (!buffer_uptodate(bh)) { arr[nr++] = bh; - atomic_inc(&bh->b_count); } } else { /* WRITE */ if (!bh->b_blocknr) { if (!block) BUG(); + if (buffer_mapped(bh)) + BUG(); bh->b_blocknr = block; + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); } else { if (!block) BUG(); @@ -1793,8 +2114,8 @@ set_bit(BH_Uptodate, &bh->b_state); set_bit(BH_Dirty, &bh->b_state); arr[nr++] = bh; - atomic_inc(&bh->b_count); } + bput(bh); bh = bh->b_this_page; } while (bh != head); if (rw == READ) @@ -1802,14 +2123,14 @@ if ((rw == READ) && nr) { if (Page_Uptodate(page)) BUG(); - ll_rw_block(rw, nr, arr); + ll_rw_block_locked(rw, nr, arr); } else { if (!nr && rw == READ) { SetPageUptodate(page); UnlockPage(page); } if (nr && (rw == WRITE)) - ll_rw_block(rw, nr, arr); + ll_rw_block_locked(rw, nr, arr); } return 0; } @@ -1843,8 +2164,11 @@ i = 0; do { - if (buffer_uptodate(bh)) + bget(bh); + if (buffer_uptodate(bh)) { + bput(bh); continue; + } if (!buffer_mapped(bh)) { inode->i_op->get_block(inode, iblock, bh, 0); @@ -1853,21 +2177,24 @@ kaddr = kmap(page); memset((char *)(kaddr + i*blocksize), 0, blocksize); set_bit(BH_Uptodate, &bh->b_state); + bput(bh); continue; } + init_buffer(bh, end_buffer_io_page, NULL); + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + arr[nr] = bh; + nr++; } - - 
init_buffer(bh, end_buffer_io_async, NULL); - atomic_inc(&bh->b_count); - arr[nr] = bh; - nr++; } while (i++, iblock++, (bh = bh->b_this_page) != head); ++current->maj_flt; if (nr) { if (Page_Uptodate(page)) BUG(); - ll_rw_block(READ, nr, arr); + ll_rw_block_locked(READ, nr, arr); + for (i = 0; i < nr; i++) + bput(arr[i]); } else { /* * all buffers are uptodate - we can set the page @@ -1929,7 +2256,7 @@ static int grow_buffers(int size) { struct page * page; - struct buffer_head *bh, *tmp; + struct buffer_head *bh, *head, *tail; struct buffer_head * insert_point; int isize; @@ -1941,37 +2268,44 @@ page = alloc_page(GFP_BUFFER); if (!page) goto out; - bh = create_buffers(page, size, 0); - if (!bh) + if (page->mapping) + BUG(); + if (PageLocked(page)) + BUG(); + lock_page(page); + head = create_buffers(page, size, 0); + if (!head) goto no_buffer_head; isize = BUFSIZE_INDEX(size); spin_lock(&free_list[isize].lock); insert_point = free_list[isize].list; - tmp = bh; - while (1) { + tail = bh = head; + do { + bh->b_dev = B_FREE; if (insert_point) { - tmp->b_next_free = insert_point->b_next_free; - tmp->b_prev_free = insert_point; - insert_point->b_next_free->b_prev_free = tmp; - insert_point->b_next_free = tmp; + bh->b_next_free = insert_point->b_next_free; + bh->b_prev_free = insert_point; + insert_point->b_next_free->b_prev_free = bh; + insert_point->b_next_free = bh; } else { - tmp->b_prev_free = tmp; - tmp->b_next_free = tmp; + bh->b_prev_free = bh; + bh->b_next_free = bh; } - insert_point = tmp; - if (tmp->b_this_page) - tmp = tmp->b_this_page; - else - break; - } - tmp->b_this_page = bh; + insert_point = bh; + tail = bh; + bh = bh->b_this_page; + } while (bh != head); + + if (tail->b_this_page != bh) + BUG(); free_list[isize].list = bh; + __set_page_buffers(page, bh); + lru_cache_add(page); spin_unlock(&free_list[isize].lock); + UnlockPage(page); - page->buffers = bh; - lru_cache_add(page); atomic_inc(&buffermem_pages); return 1; @@ -1984,36 +2318,30 @@ /* * Can 
the buffer be thrown out? */ -#define BUFFER_BUSY_BITS ((1<b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) +#define BUFFER_BUSY_BITS ((1<b_state & BUFFER_BUSY_BITS)) -/* - * try_to_free_buffers() checks if all the buffers on this particular page - * are unused, and free's the page if so. - * - * Wake up bdflush() if this fails - if we're running low on memory due - * to dirty buffers, we need to flush them out as quickly as possible. - * - * NOTE: There are quite a number of ways that threads of control can - * obtain a reference to a buffer head within a page. So we must - * lock out all of these paths to cleanly toss the page. - */ -int try_to_free_buffers(struct page * page) +static int __try_to_free_buffers(struct page * page, int priority) { - struct buffer_head * tmp, * bh = page->buffers; + struct buffer_head *tmp, *p, *bh = page->buffers; int index = BUFSIZE_INDEX(bh->b_size); int ret; - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); + if (!PageLocked(page)) + BUG(); spin_lock(&free_list[index].lock); tmp = bh; do { - struct buffer_head * p = tmp; - + p = tmp; tmp = tmp->b_this_page; if (buffer_busy(p)) goto busy_buffer_page; + /* + * Remove the page only if none of its bhs is uptodate. + * (ie. all bhs got either dropped or invalidated) + */ + if (!priority && test_bit(BH_Uptodate, &p->b_state)) + goto busy_buffer_page; } while (tmp != bh); spin_lock(&unused_list_lock); @@ -2025,28 +2353,31 @@ /* The buffer can be either on the regular * queues or on the free list..
*/ + bget(p); if (p->b_dev == B_FREE) { + if (buffer_mapped(p)) + BH_BUG(p); __remove_from_free_list(p, index); - } else { - if (p->b_pprev) - __hash_unlink(p); - __remove_from_lru_list(p, p->b_list); } - __put_unused_buffer_head(p); + if (buffer_mapped(p)) { + if (p->b_dev == B_FREE) + BH_BUG(p); + __remove_from_queues(p); + } + bput(p); + __put_unused_bh(p); } while (tmp != bh); + __set_page_buffers(page, NULL); + /* And free the page */ spin_unlock(&unused_list_lock); /* Wake up anyone waiting for buffer heads */ wake_up(&buffer_wait); - /* And free the page */ - page->buffers = NULL; __free_page(page); ret = 1; out: spin_unlock(&free_list[index].lock); - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); return ret; busy_buffer_page: @@ -2056,6 +2387,29 @@ goto out; } +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and free's the page if so. + * + * Wake up bdflush() if this fails - if we're running low on memory due + * to dirty buffers, we need to flush them out as quickly as possible. + * + * NOTE: There are quite a number of ways that threads of control can + * obtain a reference to a buffer head within a page. So we must + * lock out all of these paths to cleanly toss the page. 
+ */ +int try_to_free_buffers(struct page * page) +{ + int ret; + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + ret = __try_to_free_buffers(page, 1); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return ret; +} + /* ================== Debugging =================== */ void show_buffers(void) @@ -2087,7 +2441,7 @@ protected++; if (buffer_dirty(bh)) dirty++; - if (atomic_read(&bh->b_count)) + if (bcount(bh)) used++, lastused = found; bh = bh->b_next_free; } while (bh != lru_list[nlist]); @@ -2224,34 +2578,37 @@ { next = bh->b_next_free; - if (!buffer_dirty(bh)) - { + bget(bh); + if (!buffer_dirty(bh)) { __refile_buffer(bh); + bput(bh); continue; } - if (buffer_locked(bh)) + if (buffer_locked(bh)) { + bput(bh); continue; + } - if (check_flushtime) - { + if (check_flushtime) { /* The dirty lru list is chronogical ordered so if the current bh is not yet timed out, then also all the following bhs will be too young. */ - if (time_before(jiffies, bh->b_flushtime)) + if (time_before(jiffies, bh->b_flushtime)) { + bput(bh); goto out_unlock; - } - else - { - if (++flushed > bdf_prm.b_un.ndirty) + } + } else { + if (++flushed > bdf_prm.b_un.ndirty) { + bput(bh); goto out_unlock; + } } /* OK, now we are committed to write it out. */ - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); + bput(bh); if (current->need_resched) schedule(); @@ -2261,10 +2618,10 @@ spin_unlock(&lru_list_lock); } -/* - * Here we attempt to write back old buffers. We also try to flush inodes - * and supers as well, since this function is essentially "update", and - * otherwise there would be no way of ensuring that these quantities ever +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever * get written back. 
Ideally, we would have a timestamp on the inodes * and superblocks so that we could write back only the old ones as well */ @@ -2283,8 +2640,8 @@ } /* This is the interface to bdflush. As we get more sophisticated, we can - * pass tuning parameters to this "process", to adjust how it behaves. - * We would want to verify each parameter, however, to make sure that it + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it * is reasonable. */ asmlinkage long sys_bdflush(int func, long data) @@ -2329,7 +2686,7 @@ } /* Having func 0 used to launch the actual bdflush and then never - * return (unless explicitly killed). We return zero here to + * return (unless explicitly killed). We return zero here to * remain semi-compatible with present update(8) programs. */ return 0; @@ -2340,12 +2697,12 @@ * the syscall above, but now we launch it ourselves internally with * kernel_thread(...) directly after the first thread in init/main.c */ -int bdflush(void * unused) +int bdflush(void * unused) { /* * We have a bare-bones task_struct, and really should fill * in a few more things so "top" and /proc/2/{exe,root,cwd} - * display semi-sane things. Not real crucial though... + * display semi-sane things. Not real crucial though... */ current->session = 1; @@ -2394,7 +2751,7 @@ * You don't need to change your userspace configuration since * the userspace `update` will do_exit(0) at the first sys_bdflush(). 
*/ -int kupdate(void * unused) +int kupdate(void * unused) { struct task_struct * tsk = current; int interval; @@ -2452,6 +2809,7 @@ kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); return 0; } + module_init(bdflush_init) --- linux/fs/inode.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/inode.c Sun Jan 16 17:45:52 2000 @@ -494,6 +494,7 @@ if (inode) { spin_lock(&inode_lock); + INIT_LIST_HEAD(&inode->i_list); list_add(&inode->i_list, &inode_in_use); inode->i_sb = NULL; inode->i_dev = 0; @@ -525,7 +526,9 @@ /* We released the lock, so.. */ old = find_inode(sb, ino, head, find_actor, opaque); if (!old) { + INIT_LIST_HEAD(&inode->i_list); list_add(&inode->i_list, &inode_in_use); + INIT_LIST_HEAD(&inode->i_hash); list_add(&inode->i_hash, head); inode->i_sb = sb; inode->i_dev = sb->s_dev; @@ -722,15 +725,19 @@ int bmap(struct inode * inode, int block) { - struct buffer_head tmp; + struct buffer_head *tmp; + int ret = 0; if (inode->i_op && inode->i_op->get_block) { - tmp.b_state = 0; - tmp.b_blocknr = 0; - inode->i_op->get_block(inode, block, &tmp, 0); - return tmp.b_blocknr; + tmp = get_unused_bh(); + tmp->b_state = 0; + tmp->b_blocknr = 0; + bh_set(tmp, 0); + inode->i_op->get_block(inode, block, tmp, 0); + ret = tmp->b_blocknr; + put_unused_bh(tmp); } - return 0; + return ret; } /* --- linux/fs/ioctl.c.orig Mon Aug 23 20:15:53 1999 +++ linux/fs/ioctl.c Sun Jan 16 17:45:52 2000 @@ -19,7 +19,8 @@ switch (cmd) { case FIBMAP: { - struct buffer_head tmp; + int ret; + struct buffer_head *tmp; if (inode->i_op == NULL) return -EBADF; @@ -30,10 +31,15 @@ if ((error = get_user(block, (int *) arg)) != 0) return error; - tmp.b_state = 0; - tmp.b_blocknr = 0; - inode->i_op->get_block(inode, block, &tmp, 0); - return put_user(tmp.b_blocknr, (int *) arg); + tmp = get_unused_bh(); + tmp->b_state = 0; + tmp->b_blocknr = 0; + bh_set(tmp, 1); + + inode->i_op->get_block(inode, block, tmp, 0); + ret = put_user(tmp->b_blocknr, (int *) arg); + put_unused_bh(tmp); + return 
ret; } case FIGETBSZ: if (inode->i_sb == NULL) --- linux/fs/dcache.c.orig Sun Jan 16 06:38:25 2000 +++ linux/fs/dcache.c Sun Jan 16 17:45:52 2000 @@ -458,12 +458,12 @@ dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; + INIT_LIST_HEAD(&dentry->d_child); if (parent) { dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; list_add(&dentry->d_child, &parent->d_subdirs); - } else - INIT_LIST_HEAD(&dentry->d_child); + } dentry->d_mounts = dentry; dentry->d_covers = dentry; --- linux/init/main.c.orig Sun Jan 16 06:38:25 2000 +++ linux/init/main.c Sun Jan 16 17:45:52 2000 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -674,6 +675,9 @@ while (pid != wait(&i)); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); +#endif error = change_root(real_root_dev,"/initrd"); if (error) printk(KERN_ERR "Change root to /initrd: " --- linux/kernel/ksyms.c.orig Sun Jan 16 06:38:28 2000 +++ linux/kernel/ksyms.c Sun Jan 16 17:45:52 2000 @@ -263,8 +263,8 @@ EXPORT_SYMBOL(ioctl_by_bdev); EXPORT_SYMBOL(gendisk_head); EXPORT_SYMBOL(resetup_one_dev); -EXPORT_SYMBOL(unplug_device); -EXPORT_SYMBOL(make_request); +EXPORT_SYMBOL(generic_unplug_device); +EXPORT_SYMBOL(generic_make_request); EXPORT_SYMBOL(tq_disk); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(refile_buffer); @@ -381,7 +381,6 @@ EXPORT_SYMBOL(kdevname); EXPORT_SYMBOL(bdevname); EXPORT_SYMBOL(cdevname); -EXPORT_SYMBOL(partition_name); /* md.c only */ EXPORT_SYMBOL(simple_strtoul); EXPORT_SYMBOL(system_utsname); /* UTS data */ EXPORT_SYMBOL(uts_sem); /* UTS semaphore */ --- linux/mm/slab.c.orig Sun Jan 16 06:38:28 2000 +++ linux/mm/slab.c Sun Jan 16 17:45:52 2000 @@ -535,6 +535,7 @@ */ while (i--) { PageClearSlab(page); + INIT_LIST_HEAD(&page->list); page++; } free_pages((unsigned long)addr, cachep->c_gfporder); --- linux/mm/filemap.c.orig Sun Jan 16 06:38:28 2000 +++ linux/mm/filemap.c Sun Jan 16 17:45:52 
2000 @@ -224,40 +224,48 @@ spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + while (count-- > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + page = list_entry(page_lru, struct page, lru); - list_del(page_lru); + + /* avoid unscalable SMP locking */ + if (!page->buffers && page_count(page) > 1) + continue; + /* + * We do this first because this synchronizes the page + * lock with LRU-removal, needed by try_to_free_buffers(). + */ + if (TryLockPage(page)) + continue; dispose = &lru_cache; + list_del(page_lru); + if (test_and_clear_bit(PG_referenced, &page->flags)) /* Roll the page at the top of the lru list, * we could also be more aggressive putting * the page in the young-dispose-list, so * avoiding to free young pages in each pass. */ - goto dispose_continue; + goto dispose_unlock_noput_continue; dispose = &old; /* don't account passes over not DMA pages */ if (zone && (!memclass(page->zone, zone))) - goto dispose_continue; + goto dispose_unlock_noput_continue; - count--; + + /* avoid freeing the page while it's locked */ + get_page(page); + spin_unlock(&pagemap_lru_lock); dispose = &young; - if (TryLockPage(page)) - goto dispose_continue; /* Release the pagemap_lru lock even if the page is not yet queued in any lru queue since we have just locked down the page so nobody else may SMP race with us running a lru_cache_del() (lru_cache_del() always run with the page locked down ;). */ - spin_unlock(&pagemap_lru_lock); - - /* avoid unscalable SMP locking */ - if (!page->buffers && page_count(page) > 1) - goto unlock_noput_continue; /* Take the pagecache_lock spinlock held to avoid other tasks to notice the page while we are looking at its @@ -265,8 +273,8 @@ in one atomic transaction after checking its page count. */ spin_lock(&pagecache_lock); - /* avoid freeing the page while it's locked */ - get_page(page); + if (page_count(page) == 1) + BUG(); /* Is it a buffer page? 
*/ if (page->buffers) { @@ -274,10 +282,8 @@ if (!try_to_free_buffers(page)) goto unlock_continue; /* page was locked, inode can't go away under us */ - if (!page->mapping) { - atomic_dec(&buffermem_pages); + if (!page->mapping) goto made_buffer_progress; - } spin_lock(&pagecache_lock); } @@ -312,6 +318,8 @@ goto cache_unlock_continue; } + if (page->buffers || page->mapping) + printk("huh?\n"); dispose = &forget; printk(KERN_ERR "shrink_mmap: unknown LRU page!\n"); @@ -320,21 +328,18 @@ unlock_continue: UnlockPage(page); put_page(page); -dispose_relock_continue: /* even if the dispose list is local, a truncate_inode_page() may remove a page from its queue so always synchronize with the lru lock while accesing the page->lru field */ spin_lock(&pagemap_lru_lock); +dispose_continue: list_add(page_lru, dispose); continue; - -unlock_noput_continue: +dispose_unlock_noput_continue: + count++; UnlockPage(page); - goto dispose_relock_continue; - -dispose_continue: - list_add(page_lru, dispose); + goto dispose_continue; } goto out; @@ -386,8 +391,11 @@ { struct buffer_head *bh, *head = page->buffers; + if (!PageLocked(page)) + BUG(); bh = head; do { + bget(bh); if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) continue; @@ -402,11 +410,14 @@ int error = 0; struct buffer_head *bh, *head = page->buffers; + if (!PageLocked(page)) + BUG(); bh = head; do { wait_on_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) error = -EIO; + bput(bh); } while ((bh = bh->b_this_page) != head); return error; } @@ -470,7 +481,6 @@ struct address_space *mapping, unsigned long offset, struct page **hash) { - struct page *alias; unsigned long flags; flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); @@ -480,9 +490,6 @@ add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, hash); lru_cache_add(page); - alias = __find_page_nolock(mapping, offset, *hash); - if (alias != page) - BUG(); } void add_to_page_cache(struct page * 
page, struct address_space * mapping, unsigned long offset) --- linux/mm/page_alloc.c.orig Sun Jan 16 06:38:25 2000 +++ linux/mm/page_alloc.c Sun Jan 16 17:45:52 2000 @@ -78,11 +78,11 @@ return; if (page-mem_map >= max_mapnr) - BUG(); + PAGE_BUG(page); if (PageSwapCache(page)) - BUG(); + PAGE_BUG(page); if (PageLocked(page)) - BUG(); + PAGE_BUG(page); zone = page->zone; @@ -562,7 +562,7 @@ memlist_init(&p->list); } - offset = lmem_map - mem_map; + offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask = -1; @@ -570,7 +570,7 @@ size = zones_size[j]; - printk("zone(%ld): %ld pages.\n", j, size); + printk("zone(%ld): %ld pages, offset %d.\n", j, size, offset); zone->size = size; zone->name = zone_names[j]; zone->lock = SPIN_LOCK_UNLOCKED; --- linux/include/linux/genhd.h.orig Fri Oct 15 18:29:44 1999 +++ linux/include/linux/genhd.h Sun Jan 16 17:45:52 2000 @@ -19,6 +19,7 @@ #define WIN98_EXTENDED_PARTITION 0x0f #define LINUX_SWAP_PARTITION 0x82 +#define LINUX_RAID_PARTITION 0xfd /* autodetect RAID partition */ #ifdef CONFIG_SOLARIS_X86_PARTITION #define SOLARIS_X86_PARTITION LINUX_SWAP_PARTITION @@ -45,6 +46,7 @@ struct hd_struct { long start_sect; long nr_sects; + int type; /* currently RAID or normal */ }; struct gendisk { --- linux/include/linux/raid/linear.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/linear.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,32 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + kdev_t dev; + int size; + unsigned int offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/hsm_p.h.orig Sun 
Jan 16 11:26:02 2000 +++ linux/include/linux/raid/hsm_p.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,237 @@ +#ifndef _HSM_P_H +#define _HSM_P_H + +#define HSM_BLOCKSIZE 4096 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4) +#define PACKED __attribute__ ((packed)) + +/* + * Identifies a block in physical space + */ +typedef struct phys_idx_s { + __u16 phys_nr; + __u32 phys_block; + +} PACKED phys_idx_t; + +/* + * Identifies a block in logical space + */ +typedef struct log_idx_s { + __u16 log_id; + __u32 log_index; + +} PACKED log_idx_t; + +/* + * Describes one PV + */ +#define HSM_PV_SB_MAGIC 0xf091ae9fU + +#define HSM_PV_SB_GENERIC_WORDS 32 +#define HSM_PV_SB_RESERVED_WORDS \ + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS) + +/* + * On-disk PV identification data, on block 0 in any PV. + */ +typedef struct pv_sb_s +{ + __u32 pv_magic; /* 0 */ + + __u32 pv_uuid0; /* 1 */ + __u32 pv_uuid1; /* 2 */ + __u32 pv_uuid2; /* 3 */ + __u32 pv_uuid3; /* 4 */ + + __u32 pv_major; /* 5 */ + __u32 pv_minor; /* 6 */ + __u32 pv_patch; /* 7 */ + + __u32 pv_ctime; /* 8 Creation time */ + + __u32 pv_total_size; /* 9 size of this PV, in blocks */ + __u32 pv_first_free; /* 10 first free block */ + __u32 pv_first_used; /* 11 first used block */ + __u32 pv_blocks_left; /* 12 unallocated blocks */ + __u32 pv_bg_size; /* 13 size of a block group, in blocks */ + __u32 pv_block_size; /* 14 size of blocks, in bytes */ + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */ + __u32 pv_block_groups; /* 16 number of block groups */ + + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17]; + + /* + * Reserved + */ + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS]; + +} PACKED pv_sb_t; + +/* + * this is pretty much arbitrary, but has to be less than ~64 + */ +#define HSM_MAX_LVS_PER_VG 32 + +#define HSM_VG_SB_GENERIC_WORDS 32 + +#define LV_DESCRIPTOR_WORDS 8 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \ + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS) + +#if 
(HSM_PV_SB_RESERVED_WORDS < 0) || (HSM_VG_SB_RESERVED_WORDS < 0) +#error you messed this one up dude ... +#endif + +typedef struct lv_descriptor_s +{ + __u32 lv_id; /* 0 */ + phys_idx_t lv_root_idx; /* 1 */ + __u16 __reserved; /* 2 */ + __u32 lv_max_indices; /* 3 */ + __u32 lv_free_indices; /* 4 */ + __u32 md_id; /* 5 */ + + __u32 reserved[LV_DESCRIPTOR_WORDS - 6]; + +} PACKED lv_descriptor_t; + +#define HSM_VG_SB_MAGIC 0x98320d7aU +/* + * On-disk VG identification data, in block 1 on all PVs + */ +typedef struct vg_sb_s +{ + __u32 vg_magic; /* 0 */ + __u32 nr_lvs; /* 1 */ + + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2]; + + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG]; + /* + * Reserved + */ + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS]; + +} PACKED vg_sb_t; + +/* + * Describes one LV + */ + +#define HSM_LV_SB_MAGIC 0xe182bd8aU + +/* do we need lv_sb_t? */ + +typedef struct lv_sb_s +{ + /* + * On-disk LV identifier + */ + __u32 lv_magic; /* 0 LV identifier */ + __u32 lv_uuid0; /* 1 */ + __u32 lv_uuid1; /* 2 */ + __u32 lv_uuid2; /* 3 */ + __u32 lv_uuid3; /* 4 */ + + __u32 lv_major; /* 5 PV identifier */ + __u32 lv_minor; /* 6 PV identifier */ + __u32 lv_patch; /* 7 PV identifier */ + + __u32 ctime; /* 8 Creation time */ + __u32 size; /* 9 size of this LV, in blocks */ + phys_idx_t start; /* 10 position of root index block */ + log_idx_t first_free; /* 11-12 first free index */ + + /* + * Reserved + */ + __u32 reserved[HSM_BLOCKSIZE_WORDS-13]; + +} PACKED lv_sb_t; + +/* + * Pointer pointing from the physical space, points to + * the LV owning this block. It also contains various + * statistics about the physical block.
+ */ +typedef struct pv_pptr_s +{ + union { + /* case 1 */ + struct { + log_idx_t owner; + log_idx_t predicted; + __u32 last_referenced; + } used; + /* case 2 */ + struct { + __u16 log_id; + __u16 __unused1; + __u32 next_free; + __u32 __unused2; + __u32 __unused3; + } free; + } u; +} PACKED pv_pptr_t; + +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr) +{ + return !pptr->u.free.log_id; +} + + +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1)) + +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1) +/* + * A table of pointers filling up a single block, managing + * the next DATA_BLOCKS_PER_BG physical blocks. Such block + * groups form the physical space of blocks. + */ +typedef struct pv_block_group_s +{ + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8]; + + pv_pptr_t blocks[DATA_BLOCKS_PER_BG]; + +} PACKED pv_block_group_t; + +/* + * Pointer from the logical space, points to + * the (PV,block) containing this logical block + */ +typedef struct lv_lptr_s +{ + phys_idx_t data; + __u16 __reserved; + __u32 cpu_addr; + __u32 __reserved2; + +} PACKED lv_lptr_t; + +static __inline__ int index_free (const lv_lptr_t * index) +{ + return !index->data.phys_block; +} + +static __inline__ int index_present (const lv_lptr_t * index) +{ + return index->cpu_addr; +} + + +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t)) +/* + * A table of pointers filling up a single block, managing + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form + * the logical space of blocks. 
+ */ +typedef struct lv_index_block_s +{ + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK]; + +} PACKED lv_index_block_t; + +#endif + --- linux/include/linux/raid/md.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,95 @@ +/* + md.h : Multiple Devices driver for Linux + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + Copyright (C) 1994-96 Marc ZYNGIER + or + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_H +#define _MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. 
+ */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +extern int md_size[MAX_MD_DEVS]; +extern struct hd_struct md_hd_struct[MAX_MD_DEVS]; + +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * partition_name (kdev_t dev); +extern int register_md_personality (int p_num, mdk_personality_t *p); +extern int unregister_md_personality (int p_num); +extern mdk_thread_t * md_register_thread (void (*run) (void *data), + void *data, const char *name); +extern void md_unregister_thread (mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_interrupt_thread (mdk_thread_t *thread); +extern int md_update_sb (mddev_t *mddev); +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void md_recover_arrays (void); +extern int md_check_ordering (mddev_t *mddev); +extern void autodetect_raid(void); +extern struct gendisk * find_gendisk (kdev_t dev); +extern int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x); +extern int md_error (kdev_t mddev, kdev_t rdev); + +#if CONFIG_BLK_DEV_MD +extern void raid_setup(char *str,int *ints) md__init; +#endif + +extern void md_print_devices (void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +#endif _MD_H + --- linux/include/linux/raid/md_compatible.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_compatible.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,551 @@ + +/* + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2 + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#ifndef _MD_COMPATIBLE_H +#define _MD_COMPATIBLE_H + +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s)) + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return x86_capability & 0x00800000; +} +#endif + +/* 002 */ +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE) + +/* 003 */ +/* + * someone please suggest a sane compatibility layer for modules + */ +#define MD_EXPORT_SYMBOL(x) + +/* 004 */ +static inline unsigned long +md_copy_from_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_READ,from,n); + if (!err) + memcpy_fromfs(to, from, n); + return err; +} + +/* 005 */ +extern inline unsigned long +md_copy_to_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_WRITE,to,n); + if (!err) + memcpy_tofs(to, from, n); + return err; +} + +/* 006 */ +#define md_put_user(x,ptr) \ +({ \ + int __err; \ + \ + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \ + if (!__err) \ + put_user(x,ptr); \ + __err; \ +}) + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return suser(); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + current->signal = 0; +} + +/* 010 */ +#define __S(nr) (1<<((nr)-1)) +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + current->blocked = ~(__S(SIGKILL)); +} +#undef __S + +/* 011 */ +extern inline unsigned long md_signal_pending (struct task_struct * tsk) +{ + return (tsk->signal & ~tsk->blocked); +} + +/* 012 */ +#define md_set_global_readahead(x) 
read_ahead[MD_MAJOR] = MD_READAHEAD + +/* 013 */ +#define md_mdelay(n) (\ + {unsigned long msec=(n); while (msec--) udelay(1000);}) + +/* 014 */ +#define MD_SYS_DOWN 0 +#define MD_SYS_HALT 0 +#define MD_SYS_POWER_OFF 0 + +/* 015 */ +#define md_register_reboot_notifier(x) + +/* 016 */ +extern __inline__ unsigned long +md_test_and_set_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + set_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 017 */ +extern __inline__ unsigned long +md_test_and_clear_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + clear_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 018 */ +#define md_atomic_read(x) (*(volatile int *)(x)) +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y)) + +/* 019 */ +extern __inline__ void md_lock_kernel (void) +{ +#if __SMP__ + lock_kernel(); + syscall_count++; +#endif +} + +extern __inline__ void md_unlock_kernel (void) +{ +#if __SMP__ + syscall_count--; + unlock_kernel(); +#endif +} +/* 020 */ + +#define md__init +#define md__initdata +#define md__initfunc(__arginit) __arginit + +/* 021 */ + +/* 022 */ + +struct md_list_head { + struct md_list_head *next, *prev; +}; + +#define MD_LIST_HEAD(name) \ + struct md_list_head name = { &name, &name } + +#define MD_INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +static __inline__ void md__list_add(struct md_list_head * new, + struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static __inline__ void md_list_add(struct md_list_head *new, + struct md_list_head *head) +{ + md__list_add(new, head, head->next); +} + +static __inline__ void md__list_del(struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = prev; + 
prev->next = next; +} + +static __inline__ void md_list_del(struct md_list_head *entry) +{ + md__list_del(entry->prev, entry->next); +} + +static __inline__ int md_list_empty(struct md_list_head *head) +{ + return head->next == head; +} + +#define md_list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/* 023 */ + +static __inline__ signed long md_schedule_timeout(signed long timeout) +{ + current->timeout = jiffies + timeout; + schedule(); + return 0; +} + +/* 024 */ +#define md_need_resched(tsk) (need_resched) + +/* 025 */ +typedef struct { int gcc_is_buggy; } md_spinlock_t; +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 } + +#define md_spin_lock() do { } while (0) +#define md_spin_unlock() do { } while (0) +#define md_spin_lock_irq cli +#define md_spin_unlock_irq sti +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags) +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0) + +/* 026 */ +typedef struct wait_queue * md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) \ + struct wait_queue (w) = { (t), NULL } +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) \ + static struct wait_queue *x = (struct wait_queue *)NULL +#define md_init_waitqueue_head init_waitqueue + +/* END */ + +#else + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,3,0) + +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define 
MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(&current->sigmask_lock); + flush_signals(current); + spin_unlock(&current->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(&current->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define md_spin_lock spin_lock +#define md_spin_unlock spin_unlock +#define md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* 026 
*/ +typedef struct wait_queue * md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) \ + struct wait_queue (w) = { (t), NULL } +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) \ + static struct wait_queue *x = (struct wait_queue *)NULL +#define md_init_waitqueue_head init_waitqueue + +/* END */ + +#else + +/** 2.3/2.4 stuff: **/ + +#include +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(&current->sigmask_lock); + flush_signals(current); + spin_unlock(&current->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(&current->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define 
md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define md_spin_lock spin_lock +#define md_spin_unlock spin_unlock +#define md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* 026 */ +typedef wait_queue_head_t md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) DECLARE_WAITQUEUE((w),(t)) +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) DECLARE_WAIT_QUEUE_HEAD(x) +#define md_init_waitqueue_head init_waitqueue_head + +/* END */ + +#endif + +#endif + +#endif _MD_COMPATIBLE_H + --- linux/include/linux/raid/md_k.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_k.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,341 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#define MD_RESERVED 0UL +#define LINEAR 1UL +#define STRIPED 2UL +#define RAID0 STRIPED +#define RAID1 3UL +#define RAID5 4UL +#define TRANSLUCENT 5UL +#define HSM 6UL +#define MAX_PERSONALITY 7UL + +extern inline int pers_to_level (int pers) +{ + switch (pers) { + case HSM: return -3; + case TRANSLUCENT: return -2; + case LINEAR: return -1; + case RAID0: return 0; + case RAID1: return 1; + case RAID5: return 5; + } + panic("pers_to_level()"); +} + +extern inline int level_to_pers (int level) +{ + switch (level) { + case -3: return HSM; + case -2: return TRANSLUCENT; + case -1: return LINEAR; + case 0: return RAID0; + case 1: return RAID1; + case 4: + case 5: return RAID5; + } + return MD_RESERVED; +} + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +#if (MINORBITS != 8) +#error MD doesnt handle bigger kdev yet +#endif + +#define MAX_REAL 12 /* Max number of disks per md dev */ +#define MAX_MD_DEVS (1<state & (1 << MD_DISK_FAULTY); +} + +extern inline int disk_active(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_ACTIVE); +} + +extern inline int disk_sync(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_SYNC); +} + +extern inline int disk_spare(mdp_disk_t * d) +{ + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); +} + +extern inline int disk_removed(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_faulty(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_FAULTY); +} + +extern inline void mark_disk_active(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_sync(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_SYNC); +} + +extern inline void mark_disk_spare(mdp_disk_t * d) +{ + d->state = 0; +} + +extern inline void mark_disk_removed(mdp_disk_t * d) +{ + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_inactive(mdp_disk_t * d) +{ + d->state &= ~(1 << 
MD_DISK_ACTIVE); +} + +extern inline void mark_disk_nonsync(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_SYNC); +} + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct md_list_head same_set; /* RAID devices within the same set */ + struct md_list_head all; /* all RAID devices */ + struct md_list_head pending; /* undetected RAID devices */ + + kdev_t dev; /* Device number */ + kdev_t old_dev; /* "" when it was last imported */ + int size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + unsigned long last_events; /* IO event timestamp */ + + struct inode *inode; /* Lock inode */ + struct file filp; /* Lock file */ + + mdp_super_t *sb; + int sb_offset; + + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_REMOVE_DISK 3 +#define DISKOP_HOT_ADD_DISK 4 + +typedef struct mdk_personality_s mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + int __minor; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + mdu_param_t param; + int ro; + unsigned int curr_resync; + unsigned long resync_start; + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + struct md_list_head all_mddevs; + request_queue_t queue; +}; + +struct mdk_personality_s +{ + char *name; + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); + void (*end_request)(struct buffer_head * bh, int uptodate); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*ioctl)(struct inode *inode, struct file *file, + unsigned int cmd, 
unsigned long arg); + int max_invalid_dev; + int (*error_handler)(mddev_t *mddev, kdev_t dev); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); +}; + + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. + */ +extern inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +extern inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. 
+ */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +extern inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +extern inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ + x = y; y = __tmp; } while (0) + +typedef struct mdk_thread_s { + void (*run) (void *data); + void *data; + md_wait_queue_head_t wqueue; + unsigned long flags; + struct semaphore *sem; + struct task_struct *tsk; + const char *name; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define MAX_DISKNAME_LEN 32 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char name [MAX_DISKNAME_LEN]; +} dev_name_t; + +#endif _MD_K_H + --- linux/include/linux/raid/md_p.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_p.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,161 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 991 27 32-words descriptors of the disks in the raid set. + * (none) Reserved (MD_SB_RESERVED_WORDS is currently 0). + * 992 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. 
+ */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 
+#define MD_SB_ERRORS 1 + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ + __u64 events; /* 7 number of superblock updates (64-bit!) 
*/ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#endif _MD_P_H + --- linux/include/linux/raid/md_u.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_u.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif _MD_U_H + --- linux/include/linux/raid/raid0.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid0.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,33 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + int zone_offset; /* Zone offset in md_dev */ + int dev_offset; /* Zone offset in real dev */ + int size; /* Zone size */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/raid1.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid1.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,65 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +struct mirror_info { 
+ int number; + int raid_disk; + kdev_t dev; + int next; + int sect_limit; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + mdk_thread_t *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; + md_spinlock_t device_lock; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head bh_req; + struct buffer_head *next_retry; +}; + +#endif --- linux/include/linux/raid/raid5.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid5.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,115 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +struct disk_info { + kdev_t dev; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct stripe_head { + md_spinlock_t stripe_lock; + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct stripe_head *free_next; /* pool of free sh's */ + struct buffer_head *buffer_pool; /* pool of free buffers */ + struct buffer_head *bh_pool; /* pool of free bh's */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image 
*/ + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + atomic_t nr_pending; /* nr of pending cmds */ + unsigned long state; /* state flags */ + int cmd; /* stripe cmd */ + atomic_t count; /* nr of waiters */ + int write_method; /* reconstruct-write / read-modify-write */ + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ + md_wait_queue_head_t wait; /* processes waiting for this stripe */ +}; + +/* + * Phase + */ +#define PHASE_BEGIN 0 +#define PHASE_READ_OLD 1 +#define PHASE_WRITE 2 +#define PHASE_READ 3 +#define PHASE_COMPLETE 4 + +/* + * Write method + */ +#define METHOD_NONE 0 +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 + +/* + * Stripe state + */ +#define STRIPE_LOCKED 0 +#define STRIPE_ERROR 1 + +/* + * Stripe commands + */ +#define STRIPE_NONE 0 +#define STRIPE_WRITE 1 +#define STRIPE_READ 2 + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + mdk_thread_t *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int sector_count; + unsigned long next_sector; + atomic_t nr_handle; + struct stripe_head *next_free_stripe; + atomic_t nr_stripes; + int resync_parity; + int max_nr_stripes; + int clock; + atomic_t nr_hashed_stripes; + atomic_t nr_locked_stripes; + atomic_t nr_pending_stripes; + atomic_t nr_cached_stripes; + + /* + * Free stripes pool + */ + atomic_t 
nr_free_sh; + struct stripe_head *free_sh_list; + md_wait_queue_head_t wait_for_stripe; + + md_spinlock_t device_lock; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif --- linux/include/linux/raid/translucent.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/translucent.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,23 @@ +#ifndef _TRANSLUCENT_H +#define _TRANSLUCENT_H + +#include + +typedef struct dev_info dev_info_t; + +struct dev_info { + kdev_t dev; + int size; +}; + +struct translucent_private_data +{ + dev_info_t disks[MD_SB_DISKS]; +}; + + +typedef struct translucent_private_data translucent_conf_t; + +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/xor.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/xor.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,12 @@ +#ifndef _XOR_H +#define _XOR_H + +#include + +#define MAX_XOR_BLOCKS 5 + +extern void calibrate_xor_block(void); +extern void (*xor_block)(unsigned int count, + struct buffer_head **bh_ptr); + +#endif --- linux/include/linux/raid/hsm.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/hsm.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,65 @@ +#ifndef _HSM_H +#define _HSM_H + +#include + +#if __alpha__ +#error fix cpu_addr on Alpha first +#endif + +#include + +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr) +#define index_dev(lv,index) index_pv((lv),(index))->dev +#define index_block(lv,index) (index)->data.phys_block +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr)) + +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr)) + + +typedef struct pv_bg_desc_s { + unsigned int free_blocks; + pv_block_group_t *bg; +} pv_bg_desc_t; + +typedef struct pv_s pv_t; 
+typedef struct vg_s vg_t; +typedef struct lv_s lv_t; + +struct pv_s +{ + int phys_nr; + kdev_t dev; + pv_sb_t *pv_sb; + pv_bg_desc_t *bg_array; +}; + +struct lv_s +{ + int log_id; + vg_t *vg; + + unsigned int max_indices; + unsigned int free_indices; + lv_lptr_t root_index; + + kdev_t dev; +}; + +struct vg_s +{ + int nr_pv; + pv_t pv_array [MD_SB_DISKS]; + + int nr_lv; + lv_t lv_array [HSM_MAX_LVS_PER_VG]; + + vg_sb_t *vg_sb; + mddev_t *mddev; +}; + +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data) +#define mddev_to_vg(mddev) ((vg_t *) mddev->private) + +#endif + --- linux/include/linux/sysctl.h.orig Sun Jan 16 06:38:26 2000 +++ linux/include/linux/sysctl.h Sun Jan 16 17:45:53 2000 @@ -504,7 +504,8 @@ enum { DEV_CDROM=1, DEV_HWMON=2, - DEV_PARPORT=3 + DEV_PARPORT=3, + DEV_MD=4 }; /* /proc/sys/dev/cdrom */ @@ -544,6 +545,11 @@ /* /proc/sys/dev/parport/parport n/devices/device n */ enum { DEV_PARPORT_DEVICE_TIMESLICE=1, +}; + +/* /proc/sys/dev/md */ +enum { + DEV_MD_SPEED_LIMIT=1 }; #ifdef __KERNEL__ --- linux/include/linux/blkdev.h.orig Sun Jan 16 06:38:16 2000 +++ linux/include/linux/blkdev.h Sun Jan 16 17:45:53 2000 @@ -36,14 +36,17 @@ }; typedef struct request_queue request_queue_t; -typedef int (merge_request_fn) (request_queue_t *, - struct request * req, - struct buffer_head *); -typedef int (merge_requests_fn) (request_queue_t *, - struct request * req, - struct request * req2); -typedef void (request_fn_proc) (request_queue_t *); +typedef int (merge_request_fn) (request_queue_t *q, + struct request *req, + struct buffer_head *bh); +typedef int (merge_requests_fn) (request_queue_t *q, + struct request *req, + struct request *req2); +typedef void (request_fn_proc) (request_queue_t *q); typedef request_queue_t * (queue_proc) (kdev_t dev); +typedef void (make_request_fn) (int rw, struct buffer_head *bh); +typedef void (plug_device_fn) (request_queue_t *q, kdev_t device); +typedef void (unplug_device_fn) (void *q); struct request_queue { @@ -51,6 
+54,8 @@ request_fn_proc * request_fn; merge_request_fn * merge_fn; merge_requests_fn * merge_requests_fn; + make_request_fn * make_request_fn; + plug_device_fn * plug_device_fn; /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. @@ -71,12 +76,6 @@ * not. */ char head_active; - - /* - * Boolean that indicates whether we should use plugging on - * this queue or not. - */ - char use_plug; }; struct blk_dev_struct { @@ -105,8 +104,10 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern wait_queue_head_t wait_for_request; extern void resetup_one_dev(struct gendisk *dev, int drive); -extern void unplug_device(void * data); -extern void make_request(int major,int rw, struct buffer_head * bh); +extern void generic_unplug_device(void * data); +extern void generic_plug_device (request_queue_t *q, kdev_t dev); +extern void generic_make_request(int rw, struct buffer_head * bh); +extern request_queue_t * blk_get_queue(kdev_t dev); /* * Access functions for manipulating queue properties @@ -114,12 +115,8 @@ extern void blk_init_queue(request_queue_t *, request_fn_proc *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_headactive(request_queue_t *, int); -extern void blk_queue_pluggable(request_queue_t *, int); - -/* md needs this function to remap requests */ -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size); -extern int md_make_request (int minor, int rw, struct buffer_head * bh); -extern int md_error (kdev_t mddev, kdev_t rdev); +extern void blk_queue_pluggable(request_queue_t *, plug_device_fn *); +extern void blk_queue_make_request(request_queue_t *, make_request_fn *); extern int * blk_size[MAX_BLKDEV]; --- linux/include/linux/fs.h.orig Sun Jan 16 06:38:28 2000 +++ linux/include/linux/fs.h Sun Jan 16 17:45:53 2000 @@ -22,6 +22,7 @@ #include #include +#include struct poll_table_struct; @@ -210,8 +211,8 @@ /* First cache line: */ struct buffer_head *b_next; /* Hash 
queue list */ unsigned long b_blocknr; /* block number */ - unsigned short b_size; /* block size */ - unsigned short b_list; /* List that this buffer appears */ + int b_size; /* block size */ + int b_list; /* List that this buffer appears */ kdev_t b_dev; /* device (B_FREE = free) */ atomic_t b_count; /* users using this block */ @@ -225,7 +226,7 @@ struct buffer_head *b_reqnext; /* request queue */ struct buffer_head **b_pprev; /* doubly linked list of hash-queue */ - char * b_data; /* pointer to data block (512 byte) */ + char * b_data; /* pointer to data block */ struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_dev_id; @@ -237,8 +238,17 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); +extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate); +extern void end_buffer_io_bad(struct buffer_head *bh, int uptodate); -#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) + +#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) + +#define bcount(bh) (atomic_read(&((bh)->b_count))) +#define bh_set(bh,v) do { atomic_set(&(bh)->b_count,v); } while (0) +#define bget(bh) do { atomic_inc(&(bh)->b_count); } while (0) +#define bput(bh) do { atomic_dec(&(bh)->b_count); } while (0) +#define bput_and_test(bh) (atomic_dec_and_test(&(bh)->b_count)) #define buffer_uptodate(bh) __buffer_state(bh,Uptodate) #define buffer_dirty(bh) __buffer_state(bh,Dirty) @@ -248,11 +258,12 @@ #define buffer_new(bh) __buffer_state(bh,New) #define buffer_protected(bh) __buffer_state(bh,Protected) -#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) +#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) +#define bh_page(bh) page_cache_entry((bh)->b_data) extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
-#define touch_buffer(bh) set_bit(PG_referenced, &bh->b_page->flags) +#define touch_buffer(bh) set_bit(PG_referenced, &(bh)->b_page->flags) #include #include @@ -811,6 +822,7 @@ extern int try_to_free_buffers(struct page *); extern void refile_buffer(struct buffer_head * buf); + #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* Buffers scheduled for write */ #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ @@ -822,6 +834,8 @@ */ extern inline void mark_buffer_uptodate(struct buffer_head * bh, int on) { + if (0 && !bcount(bh)) + BH_BUG(bh); if (on) set_bit(BH_Uptodate, &bh->b_state); else @@ -832,11 +846,15 @@ extern inline void __mark_buffer_clean(struct buffer_head *bh) { + if (0 && !bcount(bh)) + BH_BUG(bh); refile_buffer(bh); } extern inline void mark_buffer_clean(struct buffer_head * bh) { + if (0 && !bcount(bh)) + BH_BUG(bh); if (atomic_set_buffer_clean(bh)) __mark_buffer_clean(bh); } @@ -860,6 +878,11 @@ extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag) { + /* + * We must not touch buffers we do not own. 
+ */ + if (0 && !bcount(bh)) + BH_BUG(bh); if (!atomic_set_buffer_dirty(bh)) __mark_buffer_dirty(bh, flag); } @@ -958,7 +981,12 @@ extern void file_moveto(struct file *new, struct file *old); extern struct buffer_head * get_hash_table(kdev_t, int, int); extern struct buffer_head * getblk(kdev_t, int, int); +extern struct buffer_head * getblk_lock(kdev_t, int, int); +extern struct buffer_head * get_unused_bh(void); +extern void put_unused_bh(struct buffer_head * bh); +extern void insert_into_queues_exclusive(struct buffer_head *bh); extern void ll_rw_block(int, int, struct buffer_head * bh[]); +extern void ll_rw_block_locked(int, int, struct buffer_head * bh[]); extern int is_read_only(kdev_t); extern void __brelse(struct buffer_head *); extern inline void brelse(struct buffer_head *buf) @@ -971,6 +999,12 @@ { if (buf) __bforget(buf); +} +extern void __bdrop(struct buffer_head *); +extern inline void bdrop(struct buffer_head *buf) +{ + if (buf) + __bdrop(buf); } extern void set_blocksize(kdev_t, int); extern unsigned int get_hardblocksize(kdev_t); --- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996 +++ linux/include/linux/raid0.h Sun Jan 16 17:45:53 2000 @@ -1,27 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone -{ - int zone_offset; /* Zone offset in md_dev */ - int dev_offset; /* Zone offset in real dev */ - int size; /* Zone size */ - int nb_dev; /* Number of devices attached to the zone */ - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */ -}; - -struct raid0_hash -{ - struct strip_zone *zone0, *zone1; -}; - -struct raid0_data -{ - struct raid0_hash *hash_table; /* Dynamically allocated */ - struct strip_zone *strip_zone; /* This one too */ - int nr_strip_zones; - struct strip_zone *smallest; - int nr_zones; -}; - -#endif --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/raid1.h Sun Jan 16 17:45:53 2000 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct 
mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. - * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif --- linux/include/linux/mm.h.orig Sun Jan 16 06:38:28 2000 +++ linux/include/linux/mm.h Sun Jan 16 17:45:53 2000 @@ -141,6 +141,9 @@ struct buffer_head * buffers; unsigned long virtual; /* nonzero if kmapped */ struct zone_struct *zone; +#if PAGE_TRACE + struct buffer_hist __hist; +#endif } mem_map_t; #define get_page(p) atomic_inc(&(p)->count) @@ -165,7 +168,6 @@ #define PG_highmem 12 /* bits 21-30 unused */ #define PG_reserved 31 - /* Make it prettier to test the above... */ #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) --- linux/include/linux/tqueue.h.orig Fri Oct 15 18:29:43 1999 +++ linux/include/linux/tqueue.h Sun Jan 16 17:45:53 2000 @@ -116,7 +116,8 @@ p = p -> next; mb(); save_p -> sync = 0; - (*f)(arg); + if (f) + (*f)(arg); } } } --- linux/include/linux/list.h.orig Mon Jul 12 07:50:27 1999 +++ linux/include/linux/list.h Sun Jan 16 17:45:53 2000 @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include + /* * Simple doubly linked list implementation. 
* @@ -26,6 +28,7 @@ (ptr)->next = (ptr); (ptr)->prev = (ptr); \ } while (0) +#define LIST_UNUSED(ptr) ((!(ptr)->next && !(ptr)->prev) || (((ptr)->next == (ptr)) && ((ptr)->prev == (ptr)))) /* * Insert a new entry between two known consecutive entries. * @@ -36,6 +39,8 @@ struct list_head * prev, struct list_head * next) { + if (!LIST_UNUSED(new)) + BUG(); next->prev = new; new->next = next; new->prev = prev; @@ -75,6 +80,8 @@ static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); + entry->prev = 0; + entry->next = 0; } static __inline__ int list_empty(struct list_head *head) --- linux/include/linux/wait.h.orig Fri Oct 15 18:29:42 1999 +++ linux/include/linux/wait.h Sun Jan 16 17:45:53 2000 @@ -143,6 +143,7 @@ #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif + INIT_LIST_HEAD(&q->task_list); } static inline int waitqueue_active(wait_queue_head_t *q) --- linux/include/linux/raid5.h.orig Wed May 12 17:41:15 1999 +++ linux/include/linux/raid5.h Sun Jan 16 17:45:53 2000 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int
new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - wait_queue_head_t wait; /* processes waiting for this stripe */ -}; - -/* - * Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - wait_queue_head_t wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif --- linux/include/linux/md.h.orig Tue May 11 23:46:24 1999 +++ linux/include/linux/md.h Sun Jan 16 17:45:53 2000 @@ -1,300 +0,0 @@ 
-/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include -#include - -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 36 -#define MD_PATCHLEVEL_VERSION 6 - -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024) - -/* ioctls */ -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4) - -/* - personalities : - Byte 0 : Chunk size factor - Byte 1 : Fault tolerance count for each physical device - ( 0 means no fault tolerance, - 0xFF means always tolerate faults), not used by now. - Byte 2 : Personality - Byte 3 : Reserved. - */ - -#define FAULT_SHIFT 8 -#define PERSONALITY_SHIFT 16 - -#define FACTOR_MASK 0x000000FFUL -#define FAULT_MASK 0x0000FF00UL -#define PERSONALITY_MASK 0x00FF0000UL - -#define MD_RESERVED 0 /* Not used by now */ -#define LINEAR (1UL << PERSONALITY_SHIFT) -#define STRIPED (2UL << PERSONALITY_SHIFT) -#define RAID0 STRIPED -#define RAID1 (3UL << PERSONALITY_SHIFT) -#define RAID5 (4UL << PERSONALITY_SHIFT) -#define MAX_PERSONALITY 5 - -/* - * MD superblock. - * - * The MD superblock maintains some statistics on each MD configuration. - * Each real device in the MD set contains it near the end of the device. 
- * Some of the ideas are copied from the ext2fs implementation. - * - * We currently use 4096 bytes as follows: - * - * word offset function - * - * 0 - 31 Constant generic MD device information. - * 32 - 63 Generic state information. - * 64 - 127 Personality specific information. - * 128 - 511 12 32-words descriptors of the disks in the raid set. - * 512 - 911 Reserved. - * 912 - 1023 Disk specific descriptor. - */ - -/* - * If x is the real device size in bytes, we return an apparent size of: - * - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES - * - * and place the 4kB superblock at offset y. - */ -#define MD_RESERVED_BYTES (64 * 1024) -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) - -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) - -#define MD_SB_BYTES 4096 -#define MD_SB_WORDS (MD_SB_BYTES / 4) -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) -#define MD_SB_SECTORS (MD_SB_BYTES / 512) - -/* - * The following are counted in 32-bit words - */ -#define MD_SB_GENERIC_OFFSET 0 -#define MD_SB_PERSONALITY_OFFSET 64 -#define MD_SB_DISKS_OFFSET 128 -#define MD_SB_DESCRIPTOR_OFFSET 992 - -#define MD_SB_GENERIC_CONSTANT_WORDS 32 -#define MD_SB_GENERIC_STATE_WORDS 32 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) -#define MD_SB_PERSONALITY_WORDS 64 -#define MD_SB_DISKS_WORDS 384 -#define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) - -/* - * Device "operational" state bits - */ -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */ -#define 
MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */ -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */ - -typedef struct md_device_descriptor_s { - __u32 number; /* 0 Device number in the entire set */ - __u32 major; /* 1 Device major number */ - __u32 minor; /* 2 Device minor number */ - __u32 raid_disk; /* 3 The role of the device in the raid set */ - __u32 state; /* 4 Operational state */ - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; -} md_descriptor_t; - -#define MD_SB_MAGIC 0xa92b4efc - -/* - * Superblock state bits - */ -#define MD_SB_CLEAN 0 -#define MD_SB_ERRORS 1 - -typedef struct md_superblock_s { - - /* - * Constant generic information - */ - __u32 md_magic; /* 0 MD identifier */ - __u32 major_version; /* 1 major version to which the set conforms */ - __u32 minor_version; /* 2 minor version to which the set conforms */ - __u32 patch_version; /* 3 patchlevel version to which the set conforms */ - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */ - __u32 set_magic; /* 5 Raid set identifier */ - __u32 ctime; /* 6 Creation time */ - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */ - __u32 size; /* 8 Apparent size of each individual disk, in kB */ - __u32 nr_disks; /* 9 Number of total disks in the raid set */ - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */ - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11]; - - /* - * Generic state information - */ - __u32 utime; /* 0 Superblock update time */ - __u32 state; /* 1 State bits (clean, ...) 
*/ - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */ - __u32 working_disks; /* 3 Number of working disks */ - __u32 failed_disks; /* 4 Number of failed disks */ - __u32 spare_disks; /* 5 Number of spare disks */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6]; - - /* - * Personality information - */ - __u32 parity_algorithm; - __u32 chunk_size; - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2]; - - /* - * Disks information - */ - md_descriptor_t disks[MD_SB_DISKS]; - - /* - * Reserved - */ - __u32 reserved[MD_SB_RESERVED_WORDS]; - - /* - * Active descriptor - */ - md_descriptor_t descriptor; -} md_superblock_t; - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * Kernel-based reconstruction is mostly working, but still requires - * some additional work. - */ -#define SUPPORT_RECONSTRUCTION 0 - -#define MAX_REAL 8 /* Max number of physical dev per md dev */ -#define MAX_MD_DEV 4 /* Max number of md dev */ - -#define FACTOR(a) ((a)->repartition & FACTOR_MASK) -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8) -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK) - -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10) - -struct real_dev -{ - kdev_t dev; /* Device number */ - int size; /* Device size (in blocks) */ - int offset; /* Real device offset (in blocks) in md dev - (only used in linear mode) */ - struct inode *inode; /* Lock inode */ - md_superblock_t *sb; - u32 sb_offset; -}; - -struct md_dev; - -#define SPARE_INACTIVE 0 -#define SPARE_WRITE 1 -#define SPARE_ACTIVE 2 - -struct md_personality -{ - char *name; - int (*map)(struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size); - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh); - void (*end_request)(struct buffer_head * bh, int uptodate); - int (*run)(int minor, struct md_dev *mddev); - int (*stop)(int minor, struct md_dev *mddev); - int (*status)(char *page, 
int minor, struct md_dev *mddev); - int (*ioctl)(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); - int max_invalid_dev; - int (*error_handler)(struct md_dev *mddev, kdev_t dev); - -/* - * Some personalities (RAID-1, RAID-5) can get disks hot-added and - * hot-removed. Hot removal is different from failure. (failure marks - * a disk inactive, but the disk is still part of the array) - */ - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev); - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev); - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state); -}; - -struct md_dev -{ - struct real_dev devices[MAX_REAL]; - struct md_personality *pers; - md_superblock_t *sb; - int sb_dirty; - int repartition; - int busy; - int nb_dev; - void *private; -}; - -struct md_thread { - void (*run) (void *data); - void *data; - wait_queue_head_t wqueue; - unsigned long flags; - struct semaphore *sem; - struct task_struct *tsk; -}; - -#define THREAD_WAKEUP 0 - -extern struct md_dev md_dev[MAX_MD_DEV]; -extern int md_size[MAX_MD_DEV]; -extern int md_maxreadahead[MAX_MD_DEV]; - -extern char *partition_name (kdev_t dev); - -extern int register_md_personality (int p_num, struct md_personality *p); -extern int unregister_md_personality (int p_num); -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data); -extern void md_unregister_thread (struct md_thread *thread); -extern void md_wakeup_thread(struct md_thread *thread); -extern int md_update_sb (int minor); -extern int md_do_sync(struct md_dev *mddev); - -#endif __KERNEL__ -#endif _MD_H --- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998 +++ linux/include/asm-i386/md.h Sun Jan 16 17:45:53 2000 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:57 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT 
sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-i386/page.h.orig Sun Jan 16 06:38:18 2000 +++ linux/include/asm-i386/page.h Sun Jan 16 17:45:53 2000 @@ -91,6 +91,8 @@ #define PAGE_BUG(page) do { \ BUG(); \ } while (0) +#define BH_BUG(bh) do { \ + BUG(); } while (0) #endif /* __ASSEMBLY__ */ --- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998 +++ linux/include/asm-alpha/md.h Sun Jan 16 17:45:53 2000 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:48 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998 +++ linux/include/asm-m68k/md.h Sun Jan 16 17:45:53 2000 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:04 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998 +++ linux/include/asm-sparc/md.h Sun Jan 16 17:45:53 2000 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:39 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-ppc/md.h.orig Tue Jan 13 00:18:13 1998 +++ linux/include/asm-ppc/md.h Sun Jan 16 17:45:53 2000 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:15 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/drivers/block/Makefile.orig Sun Jan 16 06:38:10 2000 +++ linux/drivers/block/Makefile Sun 
Jan 16 17:45:53 2000 @@ -346,10 +346,30 @@ endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o +CFLAGS_xor.o := $(PROFILING) -fomit-frame-pointer L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o + CFLAGS_xor.o := $(PROFILING) -fomit-frame-pointer M_OBJS += raid5.o + endif +endif + +ifeq ($(CONFIG_MD_TRANSLUCENT),y) +L_OBJS += translucent.o +else + ifeq ($(CONFIG_MD_TRANSLUCENT),m) + M_OBJS += translucent.o + endif +endif + +ifeq ($(CONFIG_MD_HSM),y) +L_OBJS += hsm.o +else + ifeq ($(CONFIG_MD_HSM),m) + M_OBJS += hsm.o endif endif --- linux/drivers/block/Config.in.orig Sun Jan 16 06:38:24 2000 +++ linux/drivers/block/Config.in Sun Jan 16 17:45:53 2000 @@ -189,12 +189,15 @@ if [ "$CONFIG_NET" = "y" ]; then tristate 'Network block device support' CONFIG_BLK_DEV_NBD fi -bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD +bool 'Multiple devices (Software RAID) driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then - tristate ' Linear (append) mode' CONFIG_MD_LINEAR - tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED - tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING - tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID + tristate ' Linear (append) mode' CONFIG_MD_LINEAR + tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED + tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING + tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM fi if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then bool ' Boot support (linear, striped)' CONFIG_MD_BOOT --- linux/drivers/block/linear.c.orig Mon Aug 9 19:23:09 1999 +++ linux/drivers/block/linear.c Sun Jan 16 17:45:53 2000 @@ -1,4 +1,3 @@ - /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -19,186 +18,204 @@ #include -#include +#include #include -#include 
-#include "linear.h" +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int linear_run (int minor, struct md_dev *mddev) -{ - int cur=0, i, size, dev0_size, nb_zone; - struct linear_data *data; - - MOD_INC_USE_COUNT; - - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL); - data=(struct linear_data *) mddev->private; - - /* - Find out the smallest device. This was previously done - at registry time, but since it violates modularity, - I moved it here... Any comment ? ;-) - */ - - data->smallest=mddev->devices; - for (i=1; inb_dev; i++) - if (data->smallest->size > mddev->devices[i].size) - data->smallest=mddev->devices+i; - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); - - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL); - - size=mddev->devices[cur].size; - - i=0; - while (curnb_dev) - { - data->hash_table[i].dev0=mddev->devices+cur; - - if (size>=data->smallest->size) /* If we completely fill the slot */ - { - data->hash_table[i++].dev1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==mddev->nb_dev) continue; - size=mddev->devices[cur].size; - } - - continue; - } - - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */ - { - data->hash_table[i].dev1=NULL; - continue; - } - - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=mddev->devices[cur].size; - data->hash_table[i++].dev1=mddev->devices+cur; - size-=(data->smallest->size - dev0_size); - } - - return 0; -} - -static int linear_stop (int minor, struct md_dev *mddev) -{ - struct linear_data *data=(struct linear_data *) mddev->private; - - kfree (data->hash_table); - kfree (data); - - MOD_DEC_USE_COUNT; - - return 0; -} - - -static int linear_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static int linear_run (mddev_t *mddev) { - struct linear_data *data=(struct linear_data *) 
mddev->private; - struct linear_hash *hash; - struct real_dev *tmp_dev; - long block; - - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (block >= (hash->dev0->size + hash->dev0->offset)) - { - if (!hash->dev1) - { - printk ("linear_map : hash->dev1==NULL for block %ld\n", block); - return (-1); - } + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. + */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 
1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + +static int linear_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; + + block = bh->b_blocknr * (bh->b_size >> 10); + hash = conf->hash_table + (block / conf->smallest->size); + + if (block >= (hash->dev0->size + hash->dev0->offset)) { + if (!hash->dev1) { + printk ("linear_make_request : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; - tmp_dev=hash->dev1; - } - else - tmp_dev=hash->dev0; - - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset) - printk ("Block %ld out of bounds on dev %s size %d offset %d\n", - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - - *rdev=tmp_dev->dev; - *rsector=(block-(tmp_dev->offset)) << 1; + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) { + printk ("linear_make_request: Block %ld out of bounds on dev %s size %d offset %d\n", block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + return -1; + } + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = (block - 
tmp_dev->offset) << 1; - return (0); + generic_make_request(rw, bh); + return 0; } -static int linear_status (char *page, int minor, struct md_dev *mddev) +static int linear_status (char *page, mddev_t *mddev) { - int sz=0; + int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG - int j; - struct linear_data *data=(struct linear_data *) mddev->private; + int j; + linear_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; jnr_zones; j++) - { - sz+=sprintf (page+sz, "[%s", - partition_name (data->hash_table[j].dev0->dev)); - - if (data->hash_table[j].dev1) - sz+=sprintf (page+sz, "/%s] ", - partition_name(data->hash_table[j].dev1->dev)); - else - sz+=sprintf (page+sz, "] "); - } - - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); #endif - sz+=sprintf (page+sz, " %dk rounding", 1<param.chunk_size/1024); + return sz; } -static struct md_personality linear_personality= +static mdk_personality_t linear_personality= { - "linear", - linear_map, - NULL, - NULL, - linear_run, - linear_stop, - linear_status, - NULL, /* no ioctls */ - 0 + "linear", + NULL, + linear_make_request, + NULL, + linear_run, + linear_stop, + linear_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL }; - #ifndef MODULE -void __init linear_init (void) +void md__init linear_init (void) { - register_md_personality (LINEAR, &linear_personality); + register_md_personality (LINEAR, &linear_personality); } #else int init_module (void) { - return (register_md_personality (LINEAR, &linear_personality)); + return (register_md_personality (LINEAR, &linear_personality)); } void cleanup_module (void) { - unregister_md_personality (LINEAR); + unregister_md_personality 
(LINEAR); } #endif + --- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996 +++ linux/drivers/block/linear.h Sun Jan 16 17:45:53 2000 @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct linear_hash -{ - struct real_dev *dev0, *dev1; -}; - -struct linear_data -{ - struct linear_hash *hash_table; /* Dynamically allocated */ - struct real_dev *smallest; - int nr_zones; -}; - -#endif --- linux/drivers/block/md.c.orig Sun Jan 16 06:38:26 2000 +++ linux/drivers/block/md.c Sun Jan 16 17:45:53 2000 @@ -1,21 +1,17 @@ - /* md.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - + Copyright (C) 1998, 1999, 2000 Ingo Molnar - A lot of inspiration came from hd.c ... + completely rewritten, based on the MD driver code from Marc Zyngier - kerneld support by Boris Tobotras - boot support for linear and striped mode by Harald Hoyer + Changes: - RAID-1/RAID-5 extensions by: - Ingo Molnar, Miguel de Icaza, Gadi Oxman + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher - Changes for kmod by: - Cyrus Durgin - This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -26,735 +22,2938 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so - * the extra system load does not show up that much. Increase it if your - * system can take more. 
- */ -#define SPEED_LIMIT 1024 +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #ifdef CONFIG_KMOD #include #endif -#include -#include #define __KERNEL_SYSCALLS__ #include +#include + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage int sys_setsid(void); + +extern unsigned long io_events[MAX_BLKDEV]; + #define MAJOR_NR MD_MAJOR #define MD_DRIVER #include -#include -#include -#include -#include #ifdef CONFIG_MD_BOOT -extern kdev_t name_to_kdev_t(char *line) __init; +extern kdev_t name_to_kdev_t(char *line) md__init; +#endif + +#define DEBUG 0 +#if DEBUG +# define dprintk(x...) printk(x) +#else +# define dprintk(x...) do { } while(0) #endif -static struct hd_struct md_hd_struct[MAX_MD_DEV]; -static int md_blocksizes[MAX_MD_DEV]; -int md_maxreadahead[MAX_MD_DEV]; -#if SUPPORT_RECONSTRUCTION -static struct md_thread *md_sync_thread = NULL; -#endif /* SUPPORT_RECONSTRUCTION */ +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; -int md_size[MAX_MD_DEV]={0, }; +int md_size[MAX_MD_DEVS] = {0, }; static void md_geninit (struct gendisk *); static struct gendisk md_gendisk= { - MD_MAJOR, - "md", - 0, - 1, - MAX_MD_DEV, - md_geninit, - md_hd_struct, - md_size, - MAX_MD_DEV, - NULL, - NULL + MD_MAJOR, + "md", + 0, + 1, + MAX_MD_DEVS, + md_geninit, + md_hd_struct, + md_size, + MAX_MD_DEVS, + NULL, + NULL +}; + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. 
Note that + * the RAID driver will use the maximum available bandwidth if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + +static struct ctl_table_header *md_table_header; + +static ctl_table md_table[] = { + {DEV_MD_SPEED_LIMIT, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_MD_SPEED_LIMIT, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table md_dir_table[] = { + {DEV_MD, "raid", NULL, 0, 0555, md_table}, + {0} }; -static struct md_personality *pers[MAX_PERSONALITY]={NULL, }; -struct md_dev md_dev[MAX_MD_DEV]; +static ctl_table md_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table}, + {0} +}; -int md_thread(void * arg); +static void md_register_sysctl(void) +{ + md_table_header = register_sysctl_table(md_root_table, 1); +} -static int legacy_raid_sb (int minor, int pnum) +void md_unregister_sysctl(void) { - int i, factor; + unregister_sysctl_table(md_table_header); +} - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); +void md_plug_device (request_queue_t *mdqueue, kdev_t dev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + request_queue_t *q; + mddev_t *mddev; - /***** - * do size and offset calculations.
- */ - for (i=0; iplugged)) { + mddev = kdev_to_mddev(dev); + ITERATE_RDEV(mddev,rdev,tmp) { + q = blk_get_queue(rdev->dev); + generic_unplug_device(q); + } + queue_task(&mdqueue->plug_tq, &tq_disk); } - if (pnum == RAID0 >> PERSONALITY_SHIFT) - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev; - return 0; } -static void free_sb (struct md_dev *mddev) +static void md_unplug_device (void * data) { - int i; - struct real_dev *realdev; + mdk_rdev_t * rdev; + struct md_list_head *tmp; + mddev_t *mddev = (mddev_t *)data; + request_queue_t *mdqueue = &mddev->queue, *q; - if (mddev->sb) { - free_page((unsigned long) mddev->sb); - mddev->sb = NULL; - } - for (i = 0; i nb_dev; i++) { - realdev = mddev->devices + i; - if (realdev->sb) { - free_page((unsigned long) realdev->sb); - realdev->sb = NULL; - } + clear_bit(0, (atomic_t *)&mdqueue->plugged); + ITERATE_RDEV(mddev,rdev,tmp) { + q = blk_get_queue(rdev->dev); + generic_unplug_device(q); } } /* - * Check one RAID superblock for generic plausibility + * Enables to iterate over all existing md arrays */ +static MD_LIST_HEAD(all_mddevs); -#define BAD_MAGIC KERN_ERR \ -"md: %s: invalid raid superblock magic (%x) on block %u\n" +/* + * The mapping between kdev and mddev is not necessarily a simple + * one! Eg. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev.
+ */ +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; +} + +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) +{ + unsigned int minor = MINOR(dev); -#define SUCCESS 0 -#define FAILURE -1 + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; +} -static int analyze_one_sb (struct real_dev * rdev) +static request_queue_t *md_get_queue (kdev_t dev) { - int ret = FAILURE; - struct buffer_head *bh; - kdev_t dev = rdev->dev; - md_superblock_t *sb; + mddev_t *mddev = kdev_to_mddev(dev); + + if (!mddev) + return NULL; + return &mddev->queue; +} + +static void do_md_request (request_queue_t * q) +{ + printk(KERN_ALERT "Got md request, not good..."); + BUG(); + return; +} + +void md_make_request (int rw, struct buffer_head * bh) +{ + mddev_t *mddev = kdev_to_mddev(bh->b_dev); + + if (!mddev || !mddev->pers) + bh->b_end_io(bh, 0); + else { + if ((rw == READ || rw == READA) && buffer_uptodate(bh)) + bh->b_end_io(bh, 1); + else + mddev->pers->make_request(mddev, rw, bh); + } +} + +static mddev_t * alloc_mddev (kdev_t dev) +{ + request_queue_t *q; + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + MD_INIT_LIST_HEAD(&mddev->disks); + 
MD_INIT_LIST_HEAD(&mddev->all_mddevs); + + q = &mddev->queue; + blk_init_queue(q, DEVICE_REQUEST); + blk_queue_pluggable(q, md_plug_device); + blk_queue_make_request(q, md_make_request); + + q->plug_tq.sync = 0; + q->plug_tq.routine = &md_unplug_device; + q->plug_tq.data = mddev; /* - * Read the superblock, it's at the end of the disk + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. */ - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]); - set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - - if (bh) { - sb = (md_superblock_t *) bh->b_data; - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, kdevname(dev), - sb->md_magic, rdev->sb_offset); - goto abort; - } - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - goto abort; - } - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES); + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); - rdev->size = sb->size; - } else - printk (NO_DEVICE,kdevname(rdev->dev)); - ret = SUCCESS; -abort: - if (bh) - brelse (bh); - return ret; + return mddev; } -#undef SUCCESS -#undef FAILURE +static void free_mddev (mddev_t *mddev) +{ + if (!mddev) { + MD_BUG(); + return; + } -#undef BAD_MAGIC -#undef OUT_OF_MEM -#undef NO_DEVICE + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); -/* - * Check a full RAID array for plausibility - */ + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); +} -#define INCONSISTENT KERN_ERR \ -"md: superblock inconsistency -- run ckraid\n" +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp 
= gendisk_head; -#define OUT_OF_DATE KERN_ERR \ -"md: superblock update time inconsistenty -- using the most recent one\n" + while (tmp != NULL) { + if (tmp->major == MAJOR(dev)) + return (tmp); + tmp = tmp->next; + } + return (NULL); +} -#define OLD_VERSION KERN_ALERT \ -"md: %s: unsupported raid array version %d.%d.%d\n" +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; -#define NOT_CLEAN KERN_ERR \ -"md: %s: raid array is not clean -- run ckraid\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} -#define NOT_CLEAN_IGNORE KERN_ERR \ -"md: %s: raid array is not clean -- reconstructing parity\n" +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; -#define UNKNOWN_LEVEL KERN_ERR \ -"md: %s: unsupported raid level %d\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); -static int analyze_sbs (int minor, int pnum) +char * partition_name (kdev_t dev) { - struct md_dev *mddev = md_dev + minor; - int i, N = mddev->nb_dev, out_of_date = 0; - struct real_dev * disks = mddev->devices; - md_superblock_t *sb, *freshest = NULL; + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp = device_names.next; - /* - * RAID-0 and linear don't use a RAID superblock - */ - if (pnum == RAID0 >> PERSONALITY_SHIFT || - pnum == LINEAR >> PERSONALITY_SHIFT) - return legacy_raid_sb (minor, pnum); + while (tmp != &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + if (!dname) + return nomem; /* - * Verify the RAID superblock on each real device + * ok, add this new device name to the list */ - for (i = 0; i < N; i++) - if (analyze_one_sb(disks+i)) - goto abort; + hd = 
find_gendisk (dev); + + if (!hd) + sprintf (dname->name, "[dev %s]", kdevname(dev)); + else + disk_name (hd, MINOR(dev), dname->name); + + dname->dev = dev; + MD_INIT_LIST_HEAD(&dname->list); + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +/* + * We check whether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. + */ +int md_check_ordering (mddev_t *mddev) +{ + int i, c; + mdk_rdev_t *rdev; + struct md_list_head *tmp; /* - * The superblock constant part has to be the same - * for all disks in the array. + * First, all devices must be fully functional */ - sb = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!sb) { - sb = disks[i].sb; - continue; - } - if (memcmp(sb, - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) { - printk (INCONSISTENT); + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); goto abort; } } - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array.
- */ - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL) + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); goto abort; - freshest = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!freshest) { - freshest = disks[i].sb; - continue; - } - /* - * Find the newest superblock version - */ - if (disks[i].sb->utime != freshest->utime) { - out_of_date = 1; - if (disks[i].sb->utime > freshest->utime) - freshest = disks[i].sb; - } } - if (out_of_date) - printk(OUT_OF_DATE); - memcpy (sb, freshest, sizeof(*freshest)); - - /* - * Check if we can support this RAID array - */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)), - sb->major_version, sb->minor_version, - sb->patch_version); + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); goto abort; } - /* - * We need to add this as a superblock option. 
+ * Now the numbering check */ -#if SUPPORT_RECONSTRUCTION - if (sb->state != (1 << MD_SB_CLEAN)) { - if (sb->level == 1) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (c == 0) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); goto abort; - } else - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor))); - } -#else - if (sb->state != (1 << MD_SB_CLEAN)) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; - } -#endif /* SUPPORT_RECONSTRUCTION */ - - switch (sb->level) { - case 1: - md_size[minor] = sb->size; - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD; - break; - case 4: - case 5: - md_size[minor] = sb->size * (sb->raid_disks - 1); - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1); - break; - default: - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)), - sb->level); + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); goto abort; + } } return 0; abort: - free_sb(mddev); return 1; } -#undef INCONSISTENT -#undef OUT_OF_DATE -#undef OLD_VERSION -#undef NOT_CLEAN -#undef OLD_LEVEL - -int md_update_sb(int minor) +static unsigned int zoned_raid_size (mddev_t *mddev) { - struct md_dev *mddev = md_dev + minor; - struct buffer_head *bh; - md_superblock_t *sb = mddev->sb; - struct real_dev *realdev; - kdev_t dev; - int i; - u32 sb_offset; + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; - sb->utime = CURRENT_TIME; - for (i = 0; i < mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (!realdev->sb) - continue; - dev = realdev->dev; - sb_offset = realdev->sb_offset; - set_blocksize(dev, MD_SB_BYTES); - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (bh) { - sb = 
(md_superblock_t *) bh->b_data; - memcpy(sb, mddev->sb, MD_SB_BYTES); - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4); - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } else - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev)); + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); +printk("mask %08x\n", mask); + + ITERATE_RDEV(mddev,rdev,tmp) { +printk(" rdev->size: %d\n", rdev->size); + rdev->size &= mask; +printk(" masked rdev->size: %d\n", rdev->size); + md_size[mdidx(mddev)] += rdev->size; +printk(" new md_size: %d\n", md_size[mdidx(mddev)]); } return 0; } -static int do_md_run (int minor, int repart) +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) { - int pnum, i, min, factor, err; + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} - if (!md_dev[minor].nb_dev) - return -EINVAL; - - if (md_dev[minor].pers) - return -EBUSY; +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" - md_dev[minor].repartition=repart; - - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT)) - >= MAX_PERSONALITY) - return -EINVAL; - - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */ - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){ - for (i = 0; i < md_dev [minor].nb_dev; i++) - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR) - return -EINVAL; - } - if (!pers[pnum]) - { -#ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module 
(module_name); - if (!pers[pnum]) -#endif - return -EINVAL; - } - - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); - - for (i=0; irun (minor, md_dev+minor))) - { - md_dev[minor].pers=NULL; - free_sb(md_dev + minor); - return (err); - } - - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT) - { - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(minor); - } - - /* FIXME : We assume here we have blocks - that are twice as large as sectors. - THIS MAY NOT BE TRUE !!! */ - md_hd_struct[minor].start_sect=0; - md_hd_struct[minor].nr_sects=md_size[minor]<<1; - - read_ahead[MD_MAJOR] = 128; - return (0); -} +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" -static int do_md_stop (int minor, struct inode *inode) +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" + +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb (mddev_t * mddev) { - int i; - - if (inode->i_count>1 || md_dev[minor].busy>1) { - /* - * ioctl : one open channel - */ - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", - minor, inode->i_count, md_dev[minor].busy); - return -EBUSY; - } - - if (md_dev[minor].pers) { - /* - * It is safe to call stop here, it only frees private - * data. Also, it tells us if a device is unstoppable - * (eg. resyncing is in progress) - */ - if (md_dev[minor].pers->stop (minor, md_dev+minor)) - return -EBUSY; - /* - * The device won't exist anymore -> flush it now - */ - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - if (md_dev[minor].sb) { - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN; - md_update_sb(minor); - } + if (mddev->sb) { + MD_BUG(); + return 0; } - - /* Remove locks. 
*/ - if (md_dev[minor].sb) - free_sb(md_dev + minor); - for (i=0; isb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page((unsigned long)mddev->sb); + return 0; } -static int do_md_add (int minor, kdev_t dev) +static int alloc_disk_sb (mdk_rdev_t * rdev) { - int i; - int hot_add=0; - struct real_dev *realdev; + if (rdev->sb) + MD_BUG(); - if (md_dev[minor].nb_dev==MAX_REAL) + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); return -EINVAL; + } + md_clear_page((unsigned long)rdev->sb); - if (!fs_may_mount (dev)) + return 0; +} + +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); + } +} + +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + if (!rdev) { + MD_BUG(); + return; + } + free_disk_sb(rdev); + rdev->faulty = 1; +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; + kdev_t dev = rdev->dev; + mdp_super_t *sb; + u32 sb_offset; + + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %d", partition_name(dev), + sb_offset); + fsync_dev(dev); + set_blocksize (dev, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + + if (bh) { + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + ret = 0; +abort: + if (bh) + brelse (bh); + return ret; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + 
sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. 
True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. 
+ */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + if (!rdev->inode) + return -ENOMEM; + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + if (!rdev->inode) + MD_BUG(); + iput(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 
sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + printk(" D %2d: ", i); + print_desc(desc); + } + printk(" THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void md_print_devices (void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk(" **********************************\n"); + printk(" * *\n"); + printk(" **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk(" **********************************\n"); + printk("\n"); +} + +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); 
+ + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +static mdk_rdev_t * find_rdev_all (kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + tmp = all_raid_disks.next; + while (tmp != &all_raid_disks) { + rdev = md_list_entry(tmp, mdk_rdev_t, all); + if (rdev->dev == dev) + return rdev; + tmp = tmp->next; + } + return NULL; +} + +#define GETBLK_FAILED KERN_ERR \ +"md: getblk failed for device %s\n" + +static int write_disk_sb(mdk_rdev_t * rdev) +{ + struct buffer_head *bh; + kdev_t dev; + u32 sb_offset, size; + mdp_super_t *sb; + + if (!rdev->sb) { + MD_BUG(); + return -1; + } + if (rdev->faulty) { + MD_BUG(); + return -1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return -1; + } + + dev = rdev->dev; + sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1); + if (rdev->sb_offset != sb_offset) { + printk("%s's sb offset has changed from %d to %d, skipping\n", partition_name(dev), rdev->sb_offset, sb_offset); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * it's size has changed to zero silently, and the MD code does + * not yet know that it's faulty. 
+ */ + size = calc_dev_size(dev, rdev->mddev, 1); + if (size != rdev->size) { + printk("%s's size has changed from %d to %d since import, skipping\n", partition_name(dev), rdev->size, size); + goto skip; + } + + printk("(write) %s's sb offset: %d\n", partition_name(dev), sb_offset); + fsync_dev(dev); + set_blocksize(dev, MD_SB_BYTES); + bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); + if (!bh) { + printk(GETBLK_FAILED, partition_name(dev)); + return 1; + } + memset(bh->b_data,0,bh->b_size); + sb = (mdp_super_t *) bh->b_data; + memcpy(sb, rdev->sb, MD_SB_BYTES); + + mark_buffer_uptodate(bh, 1); + mark_buffer_dirty(bh, 1); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + brelse(bh); + fsync_dev(dev); +skip: + return 0; +} +#undef GETBLK_FAILED KERN_ERR + +static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev) +{ + int i, ok = 0; + mdp_disk_t *desc; + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = mddev->sb->disks + i; +#if 0 + if (disk_faulty(desc)) { + if (MKDEV(desc->major,desc->minor) == rdev->dev) + ok = 1; + continue; + } +#endif + if (MKDEV(desc->major,desc->minor) == rdev->dev) { + rdev->sb->this_disk = *desc; + rdev->desc_nr = desc->number; + ok = 1; + break; + } + } + + if (!ok) { + MD_BUG(); + } +} + +static int sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + mdp_super_t *sb; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + sb = rdev->sb; + *sb = *mddev->sb; + set_this_disk(mddev, rdev); + sb->sb_csum = calc_sb_csum(sb); + } + return 0; +} + +int md_update_sb(mddev_t * mddev) +{ + int first, err, count = 100; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + __u64 ev; + +repeat: + mddev->sb->utime = CURRENT_TIME; + ev = get_unaligned(&mddev->sb->events); + ++ev; + put_unaligned(ev,&mddev->sb->events); + if (ev == (__u64)0) { + /* + * oops, this 64-bit counter should never wrap. 
+ * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + --ev; + put_unaligned(ev,&mddev->sb->events); + } + sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + printk(KERN_INFO "md: updating md%d RAID superblock on device\n", + mdidx(mddev)); + + first = 1; + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (!first) { + first = 0; + printk(", "); + } + if (rdev->faulty) + printk("(skipping faulty "); + printk("%s ", partition_name(rdev->dev)); + if (!rdev->faulty) { + printk("[events: %08lx]", + (unsigned long)get_unaligned(&rdev->sb->events)); + err += write_disk_sb(rdev); + } else + printk(")\n"); + } + printk(".\n"); + if (err) { + printk("errors occured during superblock update, repeating\n"); + if (--count) + goto repeat; + printk("excessive errors occured during superblock update, exiting\n"); + } + return 0; +} + +/* + * Import a device. If 'on_disk', then sanity check the superblock + * + * mark the device faulty if: + * + * - the device is nonexistent (zero size) + * - the device has no valid superblock + * + * a faulty rdev _never_ has rdev->sb set. + */ +static int md_import_device (kdev_t newdev, int on_disk) +{ + int err; + mdk_rdev_t *rdev; + unsigned int size; + + if (find_rdev_all(newdev)) + return -EEXIST; + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + printk("could not alloc mem for %s!\n", partition_name(newdev)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if (!fs_may_mount(newdev)) { + printk("md: can not import %s, has active inodes!\n", + partition_name(newdev)); + err = -EBUSY; + goto abort_free; + } + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->dev = newdev; + if (lock_rdev(rdev)) { + printk("md: could not lock %s, zero-size? 
Marking faulty.\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + rdev->desc_nr = -1; + rdev->faulty = 0; + + size = 0; + if (blk_size[MAJOR(newdev)]) + size = blk_size[MAJOR(newdev)][MINOR(newdev)]; + if (!size) { + printk("md: %s has zero size, marking faulty!\n", + partition_name(newdev)); + err = -EINVAL; + goto abort_free; + } + + if (on_disk) { + if ((err = read_disk_sb(rdev))) { + printk("md: could not read %s's sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + printk("md: %s has invalid sb, not importing!\n", + partition_name(newdev)); + goto abort_free; + } + + rdev->old_dev = MKDEV(rdev->sb->this_disk.major, + rdev->sb->this_disk.minor); + rdev->desc_nr = rdev->sb->this_disk.number; + } + md_list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + return 0; + +abort_free: + if (rdev->sb) { + if (rdev->inode) + unlock_rdev(rdev); + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + +/* + * Check a full RAID array for plausibility + */ + +#define INCONSISTENT KERN_ERR \ +"md: fatal superblock inconsistency in %s -- removing from array\n" + +#define OUT_OF_DATE KERN_ERR \ +"md: superblock update time inconsistency -- using the most recent one\n" + +#define OLD_VERSION KERN_ALERT \ +"md: md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md: md%d: raid array is not clean -- starting background reconstruction\n" + +#define UNKNOWN_LEVEL KERN_ERR \ +"md: md%d: unsupported raid level %d\n" + +static int analyze_sbs (mddev_t * mddev) +{ + int out_of_date = 0, i; + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if 
(check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + printk (INCONSISTENT, partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. + */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. (decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + __u64 ev = get_unaligned(&rdev->sb->events); + if (ev != (__u64)0) { + --ev; + put_unaligned(ev,&rdev->sb->events); + } + } + + printk("%s's event counter: %08lx\n", partition_name(rdev->dev), + (unsigned long)get_unaligned(&rdev->sb->events)); + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&freshest->sb->events); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + printk(OUT_OF_DATE); + printk("freshest: %s\n", partition_name(freshest->dev)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. 
+ */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices faulty + */ + __u64 ev1, ev2; + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ++ev1; + if (ev1 < ev2) { + printk("md: kicking non-fresh %s from array!\n", + partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * Fix up changed device names ... but only if this disk has a + * recent update time. Use faulty checksum ones too. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2, ev3; + if (rdev->faulty) { /* REMOVEME */ + MD_BUG(); + goto abort; + } + ev1 = get_unaligned(&rdev->sb->events); + ev2 = get_unaligned(&sb->events); + ev3 = ev2; + --ev3; + if ((rdev->dev != rdev->old_dev) && + ((ev1 == ev2) || (ev1 == ev3))) { + mdp_disk_t *desc; + + printk("md: device name has changed from %s to %s since last import!\n", partition_name(rdev->old_dev), partition_name(rdev->dev)); + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + desc = &sb->disks[rdev->desc_nr]; + if (rdev->old_dev != MKDEV(desc->major, desc->minor)) { + MD_BUG(); + goto abort; + } + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + desc = &rdev->sb->this_disk; + desc->major = MAJOR(rdev->dev); + desc->minor = MINOR(rdev->dev); + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + /* + * We kick faulty devices/descriptors immediately. 
+ */ + if (disk_faulty(desc)) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + printk("md%d: kicking faulty %s!\n", + mdidx(mddev),partition_name(rdev->dev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + if (dev == MKDEV(0,0)) + continue; + printk("md%d: removing former faulty %s!\n", + mdidx(mddev), partition_name(dev)); + } + remove_descriptor(desc, sb); + continue; + } + + if (dev == MKDEV(0,0)) + continue; + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + printk("md%d: former device %s is unavailable, removing from array!\n", mdidx(mddev), partition_name(dev)); + remove_descriptor(desc, sb); + } + + /* + * Double check wether all devices mentioned in the + * superblock are in the rdev ring. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + + if (disk_faulty(desc)) { + MD_BUG(); + goto abort; + } + + rdev = find_rdev(mddev, dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + } + + /* + * Do a final reality check. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + /* + * is the device unique? 
+ */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->dev == rdev->dev)) { + MD_BUG(); + goto abort; + } + } + } + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + printk (OLD_VERSION, mdidx(mddev), sb->major_version, + sb->minor_version, sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + printk (NOT_CLEAN_IGNORE, mdidx(mddev)); + + return 0; +abort: + return 1; +} + +#undef INCONSISTENT +#undef OUT_OF_DATE +#undef OLD_VERSION +#undef OLD_LEVEL + +static int device_size_calculation (mddev_t * mddev) +{ + int data_disks = 0, persistent; + unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. + * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + MD_BUG(); + continue; + } + rdev->size = calc_dev_size(rdev->dev, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + printk (KERN_WARNING + "Dev %s smaller than chunk_size: %dk < %dk\n", + partition_name(rdev->dev), + rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + printk (UNKNOWN_LEVEL, mdidx(mddev), sb->level); + goto abort; + } + if (!md_size[mdidx(mddev)]) + md_size[mdidx(mddev)] = sb->size * data_disks; + + readahead = MD_READAHEAD; + if 
((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) + readahead = mddev->sb->chunk_size * 4 * data_disks; + if (readahead < data_disks * MAX_SECTORS*512*2) + readahead = data_disks * MAX_SECTORS*512*2; + else { + if (sb->level == -3) + readahead = 0; + } + md_maxreadahead[mdidx(mddev)] = readahead; + + printk(KERN_INFO "md%d: max total readahead window set to %dk\n", + mdidx(mddev), readahead/1024); + + printk(KERN_INFO + "md%d: %d data-disks, max readahead per data-disk: %dk\n", + mdidx(mddev), data_disks, readahead/data_disks/1024); + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run (mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + + if (!mddev->nb_dev) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. 
+ */ + md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + mddev->param.chunk_size = chunk_size; + mddev->param.personality = pnum; + + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if ((pnum != RAID1) && (pnum != LINEAR) && !chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + return -EINVAL; + } + + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + fsync_dev(rdev->dev); + invalidate_buffers(rdev->dev); + } + + mddev->pers = pers[pnum]; + + err = mddev->pers->run(mddev); + if (err) { + printk("pers->run() failed ...\n"); + mddev->pers = NULL; + return -EINVAL; + } + + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + md_update_sb(mddev); + + /* + * md_size has units of 1K blocks, which are + * twice as large as sectors. 
+ */ + md_hd_struct[mdidx(mddev)].start_sect = 0; + md_hd_struct[mdidx(mddev)].nr_sects = md_size[mdidx(mddev)] << 1; + + read_ahead[MD_MAJOR] = 1024; + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + +#define OUT(x) do { err = (x); goto out; } while (0) + +static int restart_array (mddev_t *mddev) +{ + int err = 0; + + /* + * Complain if it has no devices + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + if (!mddev->ro) + OUT(-EBUSY); + + mddev->ro = 0; + set_device_ro(mddev_to_kdev(mddev), 0); + + printk (KERN_INFO + "md%d switched to read-write mode.\n", mdidx(mddev)); + /* + * Kick recovery or resync if necessary + */ + md_recover_arrays(); + if (mddev->pers->restart_resync) + mddev->pers->restart_resync(mddev); + } else + err = -EINVAL; + +out: + return err; +} + +#define STILL_MOUNTED KERN_WARNING \ +"md: md%d still mounted.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (!ro && !fs_may_mount (dev)) { + printk (STILL_MOUNTED, mdidx(mddev)); + OUT(-EBUSY); + } + + /* + * complain if it's already stopped + */ + if (!mddev->nb_dev) + OUT(-ENXIO); + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + /* + * sync and invalidate buffers because we cannot kill the + * main thread with valid IO transfers still around. + * the kernel lock protects us from new requests being + * added after invalidate_buffers(). 
+ */ + fsync_dev (mddev_to_kdev(mddev)); + fsync_dev (dev); + invalidate_buffers (dev); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. + */ + if (!mddev->recovery_running && !resync_interrupted) { + printk("marking sb clean...\n"); + mddev->sb->state |= 1 << MD_SB_CLEAN; + } + md_update_sb(mddev); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + export_array(mddev); + md_size[mdidx(mddev)] = 0; + md_hd_struct[mdidx(mddev)].nr_sects = 0; + free_mddev(mddev); + + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + +#undef OUT + +/* + * We have to safely support old arrays too. + */ +int detect_old_array (mdp_super_t *sb) +{ + if (sb->major_version > 0) + return 0; + if (sb->minor_version >= 90) + return 0; + + return -EINVAL; +} + + +static void autorun_array (mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int err; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return; + } + + printk("running: "); + + ITERATE_RDEV(mddev,rdev,tmp) { + printk("<%s>", partition_name(rdev->dev)); + } + printk("\nnow!\n"); + + err = do_md_run (mddev); + if (err) { + printk("do_md_run() returned %d\n", err); + /* + * prevent the writeback of an unrunnable array + */ + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. 
Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. + */ +static void autorun_devices (void) +{ + struct md_list_head candidates; + struct md_list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + printk("autorun ...\n"); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = md_list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + + printk("considering %s ...\n", partition_name(rdev0->dev)); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + printk("%s has same UUID as %s, but superblocks differ ...\n", partition_name(rdev->dev), partition_name(rdev0->dev)); + continue; + } + printk(" adding %s ...\n", partition_name(rdev->dev)); + md_list_del(&rdev->pending); + md_list_add(&rdev->pending, &candidates); + } + } + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev) { + printk("md%d already running, cannot run %s\n", + mdidx(mddev), partition_name(rdev0->dev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) + export_rdev(rdev); + continue; + } + mddev = alloc_mddev(md_kdev); + printk("created md%d\n", mdidx(mddev)); + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + autorun_array(mddev); + } + printk("... autorun DONE.\n"); +} + +/* + * import RAID devices based on one partition + * if possible, the array gets run as well. 
+ */ + +#define BAD_VERSION KERN_ERR \ +"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n" + +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_DEVICE KERN_ERR \ +"md: disabled device %s\n" + +#define AUTOADD_FAILED KERN_ERR \ +"md: auto-adding devices to md%d FAILED (error %d).\n" + +#define AUTOADD_FAILED_USED KERN_ERR \ +"md: cannot auto-add device %s to md%d, already used.\n" + +#define AUTORUN_FAILED KERN_ERR \ +"md: auto-running md%d FAILED (error %d).\n" + +#define MDDEV_BUSY KERN_ERR \ +"md: cannot auto-add to md%d, already running.\n" + +#define AUTOADDING KERN_INFO \ +"md: auto-adding devices to md%d, based on %s's superblock.\n" + +#define AUTORUNNING KERN_INFO \ +"md: auto-running md%d.\n" + +static int autostart_array (kdev_t startdev) +{ + int err = -EINVAL, i; + mdp_super_t *sb = NULL; + mdk_rdev_t *start_rdev = NULL, *rdev; + + if (md_import_device(startdev, 1)) { + printk("could not import %s!\n", partition_name(startdev)); + goto abort; + } + + start_rdev = find_rdev_all(startdev); + if (!start_rdev) { + MD_BUG(); + goto abort; + } + if (start_rdev->faulty) { + printk("can not autostart based on faulty %s!\n", + partition_name(startdev)); + goto abort; + } + md_list_add(&start_rdev->pending, &pending_raid_disks); + + sb = start_rdev->sb; + + err = detect_old_array(sb); + if (err) { + printk("array version is too old to be autostarted, use raidtools 0.90 mkraid --upgrade\nto upgrade the array without data loss!\n"); + goto abort; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + kdev_t dev; + + desc = sb->disks + i; + dev = MKDEV(desc->major, desc->minor); + + if (dev == MKDEV(0,0)) + continue; + if (dev == startdev) + continue; + if (md_import_device(dev, 1)) { + printk("could not import %s, trying to run array nevertheless.\n", partition_name(dev)); + continue; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + goto abort; + } + md_list_add(&rdev->pending, 
&pending_raid_disks); + } + + /* + * possibly return codes + */ + autorun_devices(); + return 0; + +abort: + if (start_rdev) + export_rdev(start_rdev); + return err; +} + +#undef BAD_VERSION +#undef OUT_OF_MEM +#undef NO_DEVICE +#undef AUTOADD_FAILED_USED +#undef AUTOADD_FAILED +#undef AUTORUN_FAILED +#undef AUTOADDING +#undef AUTORUNNING + +struct { + int set; + int noautodetect; + +} raid_setup_args md__initdata = { 0, 0 }; + +/* + * Searches all registered partitions for autorun RAID arrays + * at boot time. + */ +void md__init autodetect_raid(void) +{ +#ifdef CONFIG_AUTODETECT_RAID + struct gendisk *disk; + mdk_rdev_t *rdev; + int i; + + if (raid_setup_args.noautodetect) { + printk(KERN_INFO "skipping autodetection of RAID arrays\n"); + return; + } + printk(KERN_INFO "autodetecting RAID arrays\n"); + + for (disk = gendisk_head ; disk ; disk = disk->next) { + for (i = 0; i < disk->max_p*disk->max_nr; i++) { + kdev_t dev = MKDEV(disk->major,i); + + if (disk->part[i].type != LINUX_RAID_PARTITION) + continue; + + if (md_import_device(dev,1)) { + printk(KERN_ALERT "could not import %s!\n", + partition_name(dev)); + continue; + } + /* + * Sanity checks: + */ + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + md_list_add(&rdev->pending, &pending_raid_disks); + } + } + + autorun_devices(); +#endif +} + +static int get_version (void * arg) +{ + mdu_version_t ver; + + ver.major = MD_MAJOR_VERSION; + ver.minor = MD_MINOR_VERSION; + ver.patchlevel = MD_PATCHLEVEL_VERSION; + + if (md_copy_to_user(arg, &ver, sizeof(ver))) + return -EFAULT; + + return 0; +} + +#define SET_FROM_SB(x) info.x = mddev->sb->x +static int get_array_info (mddev_t * mddev, void * arg) +{ + mdu_array_info_t info; + + if (!mddev->sb) + return -EINVAL; + + SET_FROM_SB(major_version); + SET_FROM_SB(minor_version); + SET_FROM_SB(patch_version); + SET_FROM_SB(ctime); + SET_FROM_SB(level); + SET_FROM_SB(size); + SET_FROM_SB(nr_disks); 
+ SET_FROM_SB(raid_disks); + SET_FROM_SB(md_minor); + SET_FROM_SB(not_persistent); + + SET_FROM_SB(utime); + SET_FROM_SB(state); + SET_FROM_SB(active_disks); + SET_FROM_SB(working_disks); + SET_FROM_SB(failed_disks); + SET_FROM_SB(spare_disks); + + SET_FROM_SB(layout); + SET_FROM_SB(chunk_size); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x +static int get_disk_info (mddev_t * mddev, void * arg) +{ + mdu_disk_info_t info; + unsigned int nr; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + SET_FROM_SB(major); + SET_FROM_SB(minor); + SET_FROM_SB(raid_disk); + SET_FROM_SB(state); + + if (md_copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} +#undef SET_FROM_SB + +#define SET_SB(x) mddev->sb->disks[nr].x = info.x + +static int add_new_disk (mddev_t * mddev, void * arg) +{ + int err, size, persistent; + mdu_disk_info_t info; + mdk_rdev_t *rdev; + unsigned int nr; + kdev_t dev; + + if (!mddev->sb) + return -EINVAL; + + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + nr = info.number; + if (nr >= mddev->sb->nr_disks) + return -EINVAL; + + dev = MKDEV(info.major,info.minor); + + if (find_rdev_all(dev)) { + printk("device %s already used in a RAID array!\n", + partition_name(dev)); return -EBUSY; + } + + SET_SB(number); + SET_SB(major); + SET_SB(minor); + SET_SB(raid_disk); + SET_SB(state); + + if ((info.state & (1<old_dev = dev; + rdev->desc_nr = info.number; + + bind_rdev_to_array(rdev, mddev); + + persistent = !mddev->sb->not_persistent; + if (!persistent) + printk("nonpersistent superblock ...\n"); + if (!mddev->sb->chunk_size) + printk("no chunksize?\n"); + + size = calc_dev_size(dev, mddev, persistent); + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + if 
(!mddev->sb->size || (mddev->sb->size > size)) + mddev->sb->size = size; + } + + /* + * sync all other superblocks with the main superblock + */ + sync_sbs(mddev); + + return 0; +} +#undef SET_SB - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) { - printk("md_add(): zero device size, huh, bailing out.\n"); +static int hot_remove_disk (mddev_t * mddev, kdev_t dev) +{ + int err; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to remove %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); return -EINVAL; } - if (md_dev[minor].pers) { - /* - * The array is already running, hot-add the drive, or - * bail out: - */ - if (!md_dev[minor].pers->hot_add_disk) - return -EBUSY; - else - hot_add=1; + rdev = find_rdev(mddev, dev); + if (!rdev) + return -ENXIO; + + if (rdev->desc_nr == -1) { + MD_BUG(); + return -EINVAL; + } + disk = &mddev->sb->disks[rdev->desc_nr]; + if (disk_active(disk)) + goto busy; + if (disk_removed(disk)) { + MD_BUG(); + return -EINVAL; + } + + err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + if (err == -EBUSY) + goto busy; + if (err) { + MD_BUG(); + return -EINVAL; + } + + remove_descriptor(disk, mddev->sb); + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + md_update_sb(mddev); + + return 0; +busy: + printk("cannot remove active disk %s from md%d ... \n", + partition_name(dev), mdidx(mddev)); + return -EBUSY; +} + +static int hot_add_disk (mddev_t * mddev, kdev_t dev) +{ + int i, err, persistent; + unsigned int size; + mdk_rdev_t *rdev; + mdp_disk_t *disk; + + if (!mddev->pers) + return -ENODEV; + + printk("trying to hot-add %s to md%d ... 
\n", + partition_name(dev), mdidx(mddev)); + + if (!mddev->pers->diskop) { + printk("md%d: personality does not support diskops!\n", + mdidx(mddev)); + return -EINVAL; + } + + persistent = !mddev->sb->not_persistent; + size = calc_dev_size(dev, mddev, persistent); + + if (size < mddev->sb->size) { + printk("md%d: disk size %d blocks < array size %d\n", + mdidx(mddev), size, mddev->sb->size); + return -ENOSPC; + } + + rdev = find_rdev(mddev, dev); + if (rdev) + return -EBUSY; + + err = md_import_device (dev, 0); + if (err) { + printk("md: error, md_import_device() returned %d\n", err); + return -EINVAL; + } + rdev = find_rdev_all(dev); + if (!rdev) { + MD_BUG(); + return -EINVAL; + } + if (rdev->faulty) { + printk("md: can not hot-add faulty %s disk to md%d!\n", + partition_name(dev), mdidx(mddev)); + err = -EINVAL; + goto abort_export; } + bind_rdev_to_array(rdev, mddev); /* - * Careful. We cannot increase nb_dev for a running array. + * The rest should better be atomic, we can have disk failures + * noticed in interrupt contexts ... */ - i=md_dev[minor].nb_dev; - realdev = &md_dev[minor].devices[i]; - realdev->dev=dev; - - /* Lock the device by inserting a dummy inode. This doesn't - smell very good, but I need to be consistent with the - mount stuff, specially with fs_may_mount. If someone have - a better idea, please help ! 
*/ - - realdev->inode=get_empty_inode (); - if (!realdev->inode) - return -ENOMEM; - realdev->inode->i_dev=dev; /* don't care about other fields */ - insert_inode_hash (realdev->inode); - - /* Sizes are now rounded at run time */ - -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/ + rdev->old_dev = dev; + rdev->size = size; + rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent); + + disk = mddev->sb->disks + mddev->sb->raid_disks; + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)]; + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + printk("md%d: can not hot-add to full array!\n", mdidx(mddev)); + err = -EBUSY; + goto abort_unbind_export; + } - if (hot_add) { - /* - * Check the superblock for consistency. - * The personality itself has to check whether it's getting - * added with the proper flags. The personality has to be - * checked too. 
;) - */ - if (analyze_one_sb (realdev)) - return -EINVAL; + if (disk_removed(disk)) { /* - * hot_add has to bump up nb_dev itself + * reuse slot */ - if (md_dev[minor].pers->hot_add_disk (&md_dev[minor], dev)) { - /* - * FIXME: here we should free up the inode and stuff - */ - printk ("FIXME\n"); - return -EINVAL; + if (disk->number != i) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; } - } else - md_dev[minor].nb_dev++; + } else { + disk->number = i; + } - printk ("REGISTER_DEV %s to md%x done\n", partition_name(dev), minor); - return (0); + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) { + MD_BUG(); + err = -EINVAL; + goto abort_unbind_export; + } + + mark_disk_spare(disk); + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + + mddev->sb_dirty = 1; + + md_update_sb(mddev); + + /* + * Kick recovery, maybe this spare has to be added to the + * array immediately. 
+ */ + md_recover_arrays(); + + return 0; + +abort_unbind_export: + unbind_rdev_from_array(rdev); + +abort_export: + export_rdev(rdev); + return err; } -static int md_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +#define SET_SB(x) mddev->sb->x = info.x +static int set_array_info (mddev_t * mddev, void * arg) { - int minor, err; - struct hd_geometry *loc = (struct hd_geometry *) arg; + mdu_array_info_t info; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + if (mddev->sb) { + printk("array md%d already has a superblock!\n", + mdidx(mddev)); + return -EBUSY; + } - if (((minor=MINOR(inode->i_rdev)) & 0x80) && - (minor & 0x7f) < MAX_PERSONALITY && - pers[minor & 0x7f] && - pers[minor & 0x7f]->ioctl) - return (pers[minor & 0x7f]->ioctl (inode, file, cmd, arg)); - - if (minor >= MAX_MD_DEV) - return -EINVAL; + if (md_copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; - switch (cmd) - { - case REGISTER_DEV: - return do_md_add (minor, to_kdev_t ((dev_t) arg)); - - case START_MD: - return do_md_run (minor, (int) arg); - - case STOP_MD: - return do_md_stop (minor, inode); - - case BLKGETSIZE: /* Return device size */ - if (!arg) return -EINVAL; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].nr_sects, (long *) arg); - if (err) - return err; - break; - - - /* We have a problem here : there is no easy way to give a CHS - virtual geometry. We currently pretend that we have a 2 heads - 4 sectors (with a BIG number of cylinders...). This drives dosfs - just mad... 
;-) */ - - case HDIO_GETGEO: - if (!loc) return -EINVAL; - err = put_user (2, (char *) &loc->heads); - if (err) - return err; - err = put_user (4, (char *) &loc->sectors); - if (err) - return err; - err = put_user (md_hd_struct[minor].nr_sects/8, (short *) &loc->cylinders); - if (err) - return err; - err = put_user (md_hd_struct[MINOR(inode->i_rdev)].start_sect, - (long *) &loc->start); - if (err) - return err; - break; - - case BLKROSET: - case BLKROGET: - case BLKRAGET: - case BLKRASET: - case BLKFLSBUF: - return blk_ioctl(inode->i_rdev, cmd, arg); - - default: - return -EINVAL; - } + if (alloc_array_sb(mddev)) + return -ENOMEM; + + mddev->sb->major_version = MD_MAJOR_VERSION; + mddev->sb->minor_version = MD_MINOR_VERSION; + mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; + mddev->sb->ctime = CURRENT_TIME; + + SET_SB(level); + SET_SB(size); + SET_SB(nr_disks); + SET_SB(raid_disks); + SET_SB(md_minor); + SET_SB(not_persistent); + + SET_SB(state); + SET_SB(active_disks); + SET_SB(working_disks); + SET_SB(failed_disks); + SET_SB(spare_disks); + + SET_SB(layout); + SET_SB(chunk_size); + + mddev->sb->md_magic = MD_SB_MAGIC; + + /* + * Generate a 128 bit UUID + */ + get_random_bytes(&mddev->sb->set_uuid0, 4); + get_random_bytes(&mddev->sb->set_uuid1, 4); + get_random_bytes(&mddev->sb->set_uuid2, 4); + get_random_bytes(&mddev->sb->set_uuid3, 4); + + return 0; +} +#undef SET_SB - return (0); +static int set_disk_info (mddev_t * mddev, void * arg) +{ + printk("not yet"); + return -EINVAL; } -static int md_open (struct inode *inode, struct file *file) +static int clear_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int write_raid_info (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int protect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int unprotect_array (mddev_t * mddev) +{ + printk("not yet"); + return -EINVAL; +} + +static int set_disk_faulty (mddev_t *mddev, kdev_t dev) { - 
int minor=MINOR(inode->i_rdev); + int ret; - md_dev[minor].busy++; - return (0); /* Always succeed */ + fsync_dev(mddev_to_kdev(mddev)); + ret = md_error(mddev_to_kdev(mddev), dev); + return ret; } +static int md_ioctl (struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + unsigned int minor; + int err = 0; + struct hd_geometry *loc = (struct hd_geometry *) arg; + mddev_t *mddev = NULL; + kdev_t dev; + + if (!md_capable_admin()) + return -EACCES; + + dev = inode->i_rdev; + minor = MINOR(dev); + if (minor >= MAX_MD_DEVS) + return -EINVAL; + + /* + * Commands dealing with the RAID driver but not any + * particular array: + */ + switch (cmd) + { + case RAID_VERSION: + err = get_version((void *)arg); + goto done; + + case PRINT_RAID_DEBUG: + err = 0; + md_print_devices(); + goto done_unlock; + + case BLKGETSIZE: /* Return device size */ + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user(md_hd_struct[minor].nr_sects, + (long *) arg); + goto done; + + case BLKFLSBUF: + fsync_dev(dev); + invalidate_buffers(dev); + goto done; + + case BLKRASET: + if (arg > 0xff) { + err = -EINVAL; + goto abort; + } + read_ahead[MAJOR(dev)] = arg; + goto done; + + case BLKRAGET: + if (!arg) { + err = -EINVAL; + goto abort; + } + err = md_put_user (read_ahead[ + MAJOR(dev)], (long *) arg); + goto done; + default: + } + + /* + * Commands creating/starting a new array: + */ + + mddev = kdev_to_mddev(dev); + + switch (cmd) + { + case SET_ARRAY_INFO: + case START_ARRAY: + if (mddev) { + printk("array md%d already exists!\n", + mdidx(mddev)); + err = -EEXIST; + goto abort; + } + default: + } + + switch (cmd) + { + case SET_ARRAY_INFO: + mddev = alloc_mddev(dev); + if (!mddev) { + err = -ENOMEM; + goto abort; + } + /* + * alloc_mddev() should possibly self-lock. 
+ */ + err = lock_mddev(mddev); + if (err) { + printk("ioctl, reason %d, cmd %d\n", err, cmd); + goto abort; + } + err = set_array_info(mddev, (void *)arg); + if (err) { + printk("couldnt set array info. %d\n", err); + goto abort; + } + goto done_unlock; + + case START_ARRAY: + /* + * possibly make it lock the array ... + */ + err = autostart_array((kdev_t)arg); + if (err) { + printk("autostart %s failed!\n", + partition_name((kdev_t)arg)); + goto abort; + } + goto done; + + default: + } + + /* + * Commands querying/configuring an existing array: + */ + + if (!mddev) { + err = -ENODEV; + goto abort; + } + err = lock_mddev(mddev); + if (err) { + printk("ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); + goto abort; + } + + /* + * Commands even a read-only array can execute: + */ + switch (cmd) + { + case GET_ARRAY_INFO: + err = get_array_info(mddev, (void *)arg); + goto done_unlock; + + case GET_DISK_INFO: + err = get_disk_info(mddev, (void *)arg); + goto done_unlock; + + case RESTART_ARRAY_RW: + err = restart_array(mddev); + goto done_unlock; + + case STOP_ARRAY: + err = do_md_stop (mddev, 0); + goto done_unlock; + + case STOP_ARRAY_RO: + err = do_md_stop (mddev, 1); + goto done_unlock; + + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... 
;-) + */ + case HDIO_GETGEO: + if (!loc) { + err = -EINVAL; + goto abort_unlock; + } + err = md_put_user (2, (char *) &loc->heads); + if (err) + goto abort_unlock; + err = md_put_user (4, (char *) &loc->sectors); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8, + (short *) &loc->cylinders); + if (err) + goto abort_unlock; + err = md_put_user (md_hd_struct[minor].start_sect, + (long *) &loc->start); + goto done_unlock; + } + + /* + * The remaining ioctls are changing the state of the + * superblock, so we do not allow read-only arrays + * here: + */ + if (mddev->ro) { + err = -EROFS; + goto abort_unlock; + } + + switch (cmd) + { + case CLEAR_ARRAY: + err = clear_array(mddev); + goto done_unlock; + + case ADD_NEW_DISK: + err = add_new_disk(mddev, (void *)arg); + goto done_unlock; + + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case HOT_ADD_DISK: + err = hot_add_disk(mddev, (kdev_t)arg); + goto done_unlock; + + case SET_DISK_INFO: + err = set_disk_info(mddev, (void *)arg); + goto done_unlock; + + case WRITE_RAID_INFO: + err = write_raid_info(mddev); + goto done_unlock; + + case UNPROTECT_ARRAY: + err = unprotect_array(mddev); + goto done_unlock; + + case PROTECT_ARRAY: + err = protect_array(mddev); + goto done_unlock; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, (kdev_t)arg); + goto done_unlock; + + case RUN_ARRAY: + { + mdu_param_t param; + + err = md_copy_from_user(¶m, (mdu_param_t *)arg, + sizeof(param)); + if (err) + goto abort_unlock; + + err = do_md_run (mddev); + /* + * we have to clean up the mess if + * the array cannot be run for some + * reason ... 
+ */ + if (err) { + mddev->sb_dirty = 0; + do_md_stop (mddev, 0); + } + goto done_unlock; + } + + default: + printk(KERN_WARNING "%s(pid %d) used obsolete MD ioctl, upgrade your software to use new ictls.\n", current->comm, current->pid); + err = -EINVAL; + goto abort_unlock; + } + +done_unlock: +abort_unlock: + if (mddev) + unlock_mddev(mddev); + else + printk("huh11?\n"); + + return err; +done: + if (err) + printk("huh12?\n"); +abort: + return err; +} -static int md_release (struct inode *inode, struct file *file) +static int md_open (struct inode *inode, struct file *file) { - int minor=MINOR(inode->i_rdev); - md_dev[minor].busy--; - return 0; + /* + * Always succeed + */ + return (0); } static struct block_device_operations md_fops= { open: md_open, - release: md_release, ioctl: md_ioctl, }; + -int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size) +int md_map (kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size) { - if ((unsigned int) minor >= MAX_MD_DEV) - { - printk ("Bad md device %d\n", minor); - return (-1); - } - - if (!md_dev[minor].pers) - { - printk ("Oops ! 
md%d not running, giving up !\n", minor); - return (-1); - } + int err; + mddev_t *mddev = kdev_to_mddev(dev); + + if (!mddev || !mddev->pers) { + err = -ENXIO; + goto out; + } - return (md_dev[minor].pers->map(md_dev+minor, rdev, rsector, size)); + err = mddev->pers->map(mddev, dev, rdev, rsector, size); +out: + return err; } -int md_make_request (int minor, int rw, struct buffer_head * bh) +int md_thread(void * arg) { - if (md_dev [minor].pers->make_request) { - if (buffer_locked(bh)) - return 0; - set_bit(BH_Lock, &bh->b_state); - if (rw == WRITE) { - if (!buffer_dirty(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; - } + mdk_thread_t *thread = arg; + + md_lock_kernel(); + exit_mm(current); + exit_files(current); + exit_fs(current); + + /* + * Detach thread + */ + sys_setsid(); + sprintf(current->comm, thread->name); + md_init_signals(); + md_flush_signals(); + thread->tsk = current; + + /* + * md_thread is a 'system-thread', it's priority should be very + * high. We avoid resource deadlocks individually in each + * raid personality. (RAID5 does preallocation) We also use RR and + * the very same RT priority as kswapd, thus we will never get + * into a priority inversion deadlock. + * + * we definitely have to have equal or higher priority than + * bdflush, otherwise bdflush will deadlock if there are too + * many dirty RAID5 blocks. 
+ */ + current->policy = SCHED_OTHER; + current->priority = 40; +// md_unlock_kernel(); + + up(thread->sem); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&thread->wqueue, &wait); + if (!test_bit(THREAD_WAKEUP, &thread->flags)) { + set_task_state(current, TASK_INTERRUPTIBLE); + dprintk("thread %p went to sleep.\n", thread); + schedule(); + dprintk("thread %p woke up.\n", thread); + current->state = TASK_RUNNING; } - if (rw == READ || rw == READA) { - if (buffer_uptodate(bh)) { - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; - } + remove_wait_queue(&thread->wqueue, &wait); + clear_bit(THREAD_WAKEUP, &thread->flags); + + if (thread->run) { + thread->run(thread->data); + run_task_queue(&tq_disk); + } else + break; + if (md_signal_pending(current)) { + printk("%8s(%d) flushing signals.\n", current->comm, + current->pid); + md_flush_signals(); } - return (md_dev[minor].pers->make_request(md_dev+minor, rw, bh)); - } else { - make_request (MAJOR(bh->b_rdev), rw, bh); - return 0; } + up(thread->sem); + return 0; } -static void do_md_request (request_queue_t * q) -{ - printk ("Got md request, not good..."); - return; -} - -void md_wakeup_thread(struct md_thread *thread) +void md_wakeup_thread(mdk_thread_t *thread) { + dprintk("waking up MD thread %p.\n", thread); set_bit(THREAD_WAKEUP, &thread->flags); wake_up(&thread->wqueue); } -struct md_thread *md_register_thread (void (*run) (void *), void *data) +mdk_thread_t *md_register_thread (void (*run) (void *), + void *data, const char *name) { - struct md_thread *thread = (struct md_thread *) - kmalloc(sizeof(struct md_thread), GFP_KERNEL); + mdk_thread_t *thread; int ret; DECLARE_MUTEX_LOCKED(sem); - if (!thread) return NULL; + thread = (mdk_thread_t *) kmalloc + (sizeof(mdk_thread_t), GFP_KERNEL); + if (!thread) + return NULL; - memset(thread, 0, sizeof(struct md_thread)); - init_waitqueue_head(&thread->wqueue); + memset(thread, 0, sizeof(mdk_thread_t)); + 
md_init_waitqueue_head(&thread->wqueue); thread->sem = &sem; thread->run = run; thread->data = data; + thread->name = name; ret = kernel_thread(md_thread, thread, 0); if (ret < 0) { kfree(thread); @@ -764,277 +2963,460 @@ return thread; } -void md_unregister_thread (struct md_thread *thread) +void md_interrupt_thread (mdk_thread_t *thread) +{ + if (!thread->tsk) { + MD_BUG(); + return; + } + printk("interrupting MD-thread pid %d\n", thread->tsk->pid); + send_sig(SIGKILL, thread->tsk, 1); +} + +void md_unregister_thread (mdk_thread_t *thread) { DECLARE_MUTEX_LOCKED(sem); thread->sem = &sem; thread->run = NULL; - if (thread->tsk) - printk("Killing md_thread %d %p %s\n", - thread->tsk->pid, thread->tsk, thread->tsk->comm); - else - printk("Aiee. md_thread has 0 tsk\n"); - send_sig(SIGKILL, thread->tsk, 1); - printk("downing on %p\n", &sem); + thread->name = NULL; + if (!thread->tsk) { + MD_BUG(); + return; + } + md_interrupt_thread(thread); down(&sem); } -#define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM)) +void md_recover_arrays (void) +{ + if (!md_recovery_thread) { + MD_BUG(); + return; + } + md_wakeup_thread(md_recovery_thread); +} + -int md_thread(void * arg) +int md_error (kdev_t dev, kdev_t rdev) { - struct md_thread *thread = arg; + mddev_t *mddev = kdev_to_mddev(dev); + mdk_rdev_t * rrdev; + int rc; - lock_kernel(); - exit_mm(current); - exit_files(current); - exit_fs(current); - - current->session = 1; - current->pgrp = 1; - sprintf(current->comm, "md_thread"); - siginitsetinv(¤t->blocked, SHUTDOWN_SIGS); - thread->tsk = current; - up(thread->sem); + printk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",MAJOR(dev),MINOR(dev),MAJOR(rdev),MINOR(rdev), __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2),__builtin_return_address(3)); - for (;;) { - cli(); - if (!test_bit(THREAD_WAKEUP, &thread->flags)) { - do { - spin_lock(¤t->sigmask_lock); - flush_signals(current); - 
spin_unlock(¤t->sigmask_lock); - interruptible_sleep_on(&thread->wqueue); - cli(); - if (test_bit(THREAD_WAKEUP, &thread->flags)) - break; - if (!thread->run) { - sti(); - up(thread->sem); - return 0; - } - } while (signal_pending(current)); - } - sti(); - clear_bit(THREAD_WAKEUP, &thread->flags); - if (thread->run) { - thread->run(thread->data); - run_task_queue(&tq_disk); + if (!mddev) { + MD_BUG(); + return 0; + } + rrdev = find_rdev(mddev, rdev); + mark_rdev_faulty(rrdev); + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + md_interrupt_thread(md_recovery_thread); + if (mddev->pers->error_handler) { + rc = mddev->pers->error_handler(mddev, rdev); + md_recover_arrays(); + return rc; + } + return 0; +} + +static int status_unused (char * page) +{ + int sz = 0, i = 0; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + sz += sprintf(page + sz, "unused devices: "); + + ITERATE_RDEV_ALL(rdev,tmp) { + if (!rdev->same_set.next && !rdev->same_set.prev) { + /* + * The device is not yet used by any array. 
+ */ + i++; + sz += sprintf(page + sz, "%s ", + partition_name(rdev->dev)); } } + if (!i) + sz += sprintf(page + sz, ""); + + sz += sprintf(page + sz, "\n"); + return sz; } -EXPORT_SYMBOL(md_size); -EXPORT_SYMBOL(md_maxreadahead); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(md_dev); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_update_sb); -EXPORT_SYMBOL(md_map); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_do_sync); -#ifdef CONFIG_PROC_FS -static int md_status_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int status_resync (char * page, mddev_t * mddev) { - int sz = 0, i, j, size; - int begin = 0; + int sz = 0; + unsigned int blocksize, max_blocks, resync, res, dt, tt, et; - sz=sprintf( page, "Personalities : "); - for (i=0; iname); - page[sz-1]='\n'; - - sz+=sprintf (page+sz, "read_ahead "); - if (read_ahead[MD_MAJOR]==INT_MAX) - sz+=sprintf (page+sz, "not set\n"); + resync = mddev->curr_resync; + blocksize = blksize_size[MD_MAJOR][mdidx(mddev)]; + max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10); + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return 0; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + sz += sprintf(page + sz, "["); + for (i = 0; i < x; i++) + sz += sprintf(page + sz, "="); + sz += sprintf(page + sz, ">"); + for (i = 0; i < y; i++) + sz += sprintf(page + sz, "."); + sz += sprintf(page + sz, "] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + sz += sprintf(page + sz, " resync =%3u.%u%% (%u/%u)", + res/10, res % 10, resync, max_blocks); else - sz+=sprintf (page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + /* + * recovery ... 
+ */ + sz += sprintf(page + sz, " recovery =%3u.%u%% (%u/%u)", + res/10, res % 10, resync, max_blocks); - for (i=0; i= off+count) { - *eof = 1; - break; - } - sz+=sprintf (page+sz, "md%d : %sactive", - i, md_dev[i].pers ? "" : "in"); + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time until now + * tt: total time + * et: estimated finish time + */ + dt = ((jiffies - mddev->resync_start) / HZ); + tt = (dt * (max_blocks / (resync/100+1)))/100; + if (tt > dt) + et = tt - dt; + else + /* + * ignore rounding effects near finish time + */ + et = 0; + + sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6); - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %s", md_dev[i].pers->name); + return sz; +} - for (j=0, size=0; jname); + + sz += sprintf(page+sz, "\n"); + + + sz += sprintf(page+sz, "read_ahead "); + if (read_ahead[MD_MAJOR] == INT_MAX) + sz += sprintf(page+sz, "not set\n"); + else + sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); + + ITERATE_MDDEV(mddev,tmp) { + sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + sz += sprintf(page + sz, " (read-only)"); + sz += sprintf(page + sz, " %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + sz += sprintf(page + sz, " %s[%d]", + partition_name(rdev->dev), rdev->desc_nr); + if (rdev->faulty) { + sz += sprintf(page + sz, "(F)"); + continue; + } + size += rdev->size; } - if (md_dev[i].nb_dev) { - if (md_dev[i].pers) - sz+=sprintf (page+sz, " %d blocks", md_size[i]); + if (mddev->nb_dev) { + if (mddev->pers) + sz += sprintf(page + sz, "\n %d blocks", + md_size[mdidx(mddev)]); else - sz+=sprintf (page+sz, " %d blocks", size); + sz += sprintf(page + sz, "\n %d blocks", size); } - if (!md_dev[i].pers) { - sz+=sprintf (page+sz, "\n"); + if (!mddev->pers) { + sz += sprintf(page+sz, "\n"); continue; } - if (md_dev[i].pers->max_invalid_dev) - sz+=sprintf (page+sz, " maxfault=%ld", - MAX_FAULT(md_dev+i)); + sz += mddev->pers->status (page+sz, mddev); - sz+=md_dev[i].pers->status (page+sz, i, md_dev+i); - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, "\n "); + if (mddev->curr_resync) { + sz += status_resync (page+sz, mddev); + } else { + if (md_atomic_read(&mddev->resync_sem.count) != 1) + sz += sprintf(page + sz, " resync=DELAYED"); + } + sz += sprintf(page + sz, "\n"); } + sz += status_unused (page + sz); - sz -= off; - *start = page + off; - if (sz>count) - sz = count; - if (sz<0) - sz = 0; return sz; } -#endif - -static void md_geninit (struct gendisk *gdisk) -{ - int i; - - for(i=0;i MAX_MD_DEV) - panic ("md_error gets unknown device\n"); - if (!md_dev [minor].pers) - panic ("md_error gets an error for an unknown device\n"); - if (md_dev [minor].pers->error_handler) { - rc = md_dev [minor].pers->error_handler (md_dev+minor, rdev); -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(md_sync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - return rc; - } - return 0; -} - -int register_md_personality (int p_num, struct md_personality *p) +int 
register_md_personality (int pnum, mdk_personality_t *p) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - if (pers[i]) - return -EBUSY; + if (pers[pnum]) + return -EBUSY; - pers[i]=p; - printk ("%s personality registered\n", p->name); - return 0; + pers[pnum] = p; + printk(KERN_INFO "%s personality registered\n", p->name); + return 0; } -int unregister_md_personality (int p_num) +int unregister_md_personality (int pnum) { - int i=(p_num >> PERSONALITY_SHIFT); - - if (i >= MAX_PERSONALITY) - return -EINVAL; + if (pnum >= MAX_PERSONALITY) + return -EINVAL; - printk ("%s personality unregistered\n", pers[i]->name); - pers[i]=NULL; - return 0; + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; } -static md_descriptor_t *get_spare(struct md_dev *mddev) +static mdp_disk_t *get_spare(mddev_t *mddev) { - int i; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (!rdev->sb) { + MD_BUG(); continue; - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); continue; - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) + } + if (disk_active(disk)) continue; - return descriptor; + return disk; } return NULL; } +static int is_mddev_idle (mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + curr_events = io_events[MAJOR(rdev->dev)]; + + if (curr_events != rdev->last_events) { +// printk("!I(%d)", 
curr_events-rdev->last_events); + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + /* * parallel resyncing thread. - * - * FIXME: - make it abort with a dirty array on mdstop, now it just blocks - * - fix read error handing */ -int md_do_sync(struct md_dev *mddev) +/* + * Determine correct block size for this device. + */ +unsigned int device_bsize (kdev_t dev) +{ + unsigned int i, correct_size; + + correct_size = BLOCK_SIZE; + if (blksize_size[MAJOR(dev)]) { + i = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (i) + correct_size = i; + } + + return correct_size; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait); + +/* + * during resync we keep the buffer locked, this is how we implement + * 'exclusive ownership' of a buffer-cache element. This also protects + * against addition/removal of the cache element. (we are not the + * primary cache manager) + */ +static void end_buffer_io_mdresync(struct buffer_head *bh, int uptodate) +{ + mark_buffer_uptodate(bh, uptodate); + bh->b_dev_id = (void *)1; + wake_up(&bh->b_wait); +} + +static void release_bh (struct buffer_head **bhp) +{ + struct buffer_head *bh = *bhp; + if (!bh) + return; + bh->b_end_io = end_buffer_io_sync; + unlock_buffer(bh); + bdrop(bh); + *bhp = NULL; +} + +void wait_on_mdresync_buffer(struct buffer_head * bh) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + bget(bh); + add_wait_queue(&bh->b_wait, &wait); +repeat: + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (!bh->b_dev_id) { + schedule(); + goto repeat; + } + tsk->state = TASK_RUNNING; + remove_wait_queue(&bh->b_wait, &wait); + bput(bh); +} +#define RA_ORDER (1) +#define RA_PAGE_SIZE (PAGE_SIZE*(1<resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + printk(KERN_INFO "md: serializing resync, md%d has overlapping physical units 
with md%d!\n", mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; - blocksize = blksize_size[major][minor]; + blocksize = device_bsize(read_disk); max_blocks = blk_size[major][minor] / (blocksize >> 10); - printk("... resync log\n"); - printk(" .... mddev->nb_dev: %d\n", mddev->nb_dev); - printk(" .... raid array: %s\n", kdevname(read_disk)); - printk(" .... max_blocks: %d blocksize: %d\n", max_blocks, blocksize); - printk("md: syncing RAID array %s\n", kdevname(read_disk)); + printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev)); + printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec.\n", + sysctl_speed_limit_min); + printk(KERN_INFO "md: using maximum available idle IO bandwith (but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ + current->priority = 1; + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + starttime = jiffies; + mddev->resync_start = starttime; - mddev->busy++; + /* + * Tune reconstruction: + */ + window = md_maxreadahead[mdidx(mddev)]/1024; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; + printk(KERN_INFO "md: using %dk window, %d blocks.\n",window,nr_blocks); - starttime=jiffies; - for (j = 0; j < max_blocks; j++) { + for (j = 0; j < max_blocks; j += nr_blocks) { + if (j) + mddev->curr_resync = j; /* * B careful. When some1 mounts a non-'blocksize' filesystem * then we get the blocksize changed right under us. 
Go deal * with it transparently, recalculate 'blocksize', 'j' and * 'max_blocks': */ - curr_bsize = blksize_size[major][minor]; + curr_bsize = device_bsize(read_disk); if (curr_bsize != blocksize) { - diff_blocksize: + printk(KERN_INFO "md%d: blocksize changed\n", + mdidx(mddev)); +retry_read: if (curr_bsize > blocksize) /* * this is safe, rounds downwards. @@ -1044,109 +3426,394 @@ j *= blocksize/curr_bsize; blocksize = curr_bsize; + nr_blocks = window / (blocksize >> 10); + if (!nr_blocks || (nr_blocks > MAX_NR_BLOCKS)) + nr_blocks = MAX_NR_BLOCKS; max_blocks = blk_size[major][minor] / (blocksize >> 10); + printk("nr_blocks changed to %d (blocksize %d, j %d, max_blocks %d)\n", + nr_blocks, blocksize, j, max_blocks); + /* + * We will retry the current block-group + */ } - if ((bh = breada (read_disk, j, blocksize, j * blocksize, - max_blocks * blocksize)) != NULL) { - mark_buffer_dirty(bh, 1); - brelse(bh); - } else { + + /* + * Cleanup routines expect this + */ + for (k = 0; k < nr_blocks; k++) + if (bh[k]) + BH_BUG(bh[k]); + + chunk = nr_blocks; + if (chunk > max_blocks-j) + chunk = max_blocks-j; + + /* + * request buffer heads ... + */ + for (i = 0; i < chunk; i++) { /* - * FIXME: Ugly, but set_blocksize() isnt safe ... + * We get an exclusive lock to the bh. */ - curr_bsize = blksize_size[major][minor]; - if (curr_bsize != blocksize) - goto diff_blocksize; +repeat_getblk: + bh[i] = getblk_lock (read_disk, j+i, blocksize); + // FIXME: do this gracefully. + if (!bh[i]) + goto repeat_getblk; + } + + /* + * Read all (already locked) buffer heads ... + */ + for (i = 0; i < chunk; i++) { + bh[i]->b_end_io = end_buffer_io_mdresync; + bh[i]->b_dev_id = NULL; + set_bit(BH_Req, &bh[i]->b_state); + md_make_request(READ, bh[i]); + } + run_task_queue(&tq_disk); + + /* + * Wait for them to complete and verify that + * all of them are OK ... 
+ */ + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + wait_on_mdresync_buffer(bh[ii]); + if (!buffer_uptodate(bh[ii])) + goto read_error; + if (bh[ii]->b_end_io != end_buffer_io_mdresync) + BH_BUG(bh[ii]); /* - * It's a real read problem. FIXME, handle this - * a better way. + * We'll do the writeback here in the + * near future, to get better overlapping. */ - printk ( KERN_ALERT - "read error, stopping reconstruction.\n"); - mddev->busy--; - return 1; } /* - * Let's sleep some if we are faster than our speed limit: + * Write them out without marking them dirty! + * This enables us to optimize away IO at + * the personality level. */ - while (blocksize*j/(jiffies-starttime+1)*HZ/1024 > SPEED_LIMIT) - { - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); + for (i = 0; i < chunk; i++) { + bh[i]->b_dev_id = NULL; + md_make_request(WRITE, bh[i]); + } + run_task_queue(&tq_disk); + + for (i = 0; i < chunk; i++) { + ii = chunk-i-1; + + wait_on_mdresync_buffer(bh[ii]); + if (bh[ii]->b_end_io != end_buffer_io_mdresync) + BH_BUG(bh[ii]); + bh[ii]->b_dev_id = NULL; + if (spare && disk_faulty(spare)) { + for (k = 0; k < chunk; k++) + release_bh(bh+k); + printk(" \n "); + err = -EIO; + goto out; + } + if (!buffer_uptodate(bh[ii])) + goto write_error; } /* - * FIXME: put this status bar thing into /proc + * This is the normal 'everything went OK' case + * do a 'free-behind' logic, we sure dont need + * this buffer if it was the only user. */ - if (!(j%(max_blocks/100))) { - if (!(percent%10)) - printk (" %03d%% done.\n",percent); - else - printk ("."); - percent++; + for (i = 0; i < chunk; i++) + release_bh(bh+i); + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + printk("md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. 
+ * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. (things like an + * e2fsck being done on the RAID array should execute fast) + */ +repeat: + if (md_need_resched(current)) + schedule(); + + currspeed = (blocksize/1024)*j/((jiffies-starttime)/HZ + 1) + 1; + if (currspeed > sysctl_speed_limit_min) { + current->priority = 1; + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { + current->state = TASK_INTERRUPTIBLE; + md_schedule_timeout(HZ/4); + if (!md_signal_pending(current)) + goto repeat; + } + } else + current->priority = 40; } fsync_dev(read_disk); - printk("md: %s: sync done.\n", kdevname(read_disk)); - mddev->busy--; - return 0; + printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + up(&mddev->resync_sem); +out_nolock: + free_pages((unsigned long)bh, RA_ORDER); + mddev->curr_resync = 0; + wake_up(&resync_wait); + return err; + +write_error: + /* + * set_blocksize() might change the blocksize. This + * should not happen often, but it happens when eg. + * someone mounts a filesystem that has non-1k + * blocksize. set_blocksize() doesnt touch our + * buffer, but to avoid aliasing problems we change + * our internal blocksize too and retry the write. + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed during write\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; // we retry the read too. + } + + /* + * We were invalidated by the primary + * cache manager - hm, shouldnt happen, + * all invalidation synchronizes with + * the bh lock first. + */ + if (!test_bit(BH_Req, &bh[ii]->b_state)) + BH_BUG(bh[ii]); + /* + * It's a real write problem. We retry and bail out + * only if it's excessive. 
+ */ + if (max_write_errors) { + max_write_errors--; + printk ( KERN_WARNING "md%d: write error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; // we retry the read too. + } + printk (KERN_ALERT "too many write errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + err = -EIO; + goto out; + +read_error: + /* + * set_blocksize() might change the blocksize. This + * should not happen often, but it happens when eg. + * someone mounts a filesystem that has non-1k + * blocksize. set_blocksize() doesnt touch our + * buffer, but to avoid aliasing problems we change + * our internal blocksize too and retry the read. + */ + curr_bsize = device_bsize(read_disk); + if (curr_bsize != blocksize) { + printk(KERN_INFO "md%d: blocksize changed during read\n", + mdidx(mddev)); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; + } + + /* + * It's a real read problem. We retry and bail out + * only if it's excessive. + */ + if (max_read_errors) { + max_read_errors--; + printk ( KERN_WARNING "md%d: read error while reconstructing, at block %u(%d).\n", mdidx(mddev), j, blocksize); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + goto retry_read; + } + printk ( KERN_ALERT "too many read errors, stopping reconstruction.\n"); + for (k = 0; k < chunk; k++) + release_bh(bh+k); + err = -EIO; + goto out; } +#undef MAX_NR_BLOCKS + /* - * This is a kernel thread which: syncs a spare disk with the active array + * This is a kernel thread which syncs a spare disk with the active array * * the amount of foolproofing might seem to be a tad excessive, but an * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs * of my root partition with the first 0.5 gigs of my /home partition ... 
so * i'm a bit nervous ;) */ -void mdsyncd (void *data) +void md_do_recovery (void *data) { - int i; - struct md_dev *mddev; - md_superblock_t *sb; - md_descriptor_t *spare; - unsigned long flags; - - for (i = 0, mddev = md_dev; i < MAX_MD_DEV; i++, mddev++) { - if ((sb = mddev->sb) == NULL) + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct md_list_head *tmp; + + printk(KERN_INFO "md: recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) continue; if (sb->active_disks == sb->raid_disks) continue; - if (!sb->spare_disks) + if (!sb->spare_disks) { + printk(KERN_ERR "md%d: no spare disk to reconstruct array! -- continuing in degraded mode\n", mdidx(mddev)); continue; + } + /* + * now here we get the spare and resync it. + */ if ((spare = get_spare(mddev)) == NULL) continue; - if (!mddev->pers->mark_spare) + printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) continue; - if (mddev->pers->mark_spare(mddev, spare, SPARE_WRITE)) + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) continue; - if (md_do_sync(mddev) || (spare->state & (1 << MD_FAULTY_DEVICE))) { - mddev->pers->mark_spare(mddev, spare, SPARE_INACTIVE); + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = md_do_sync(mddev, spare); + if (err == -EIO) { + printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR) { + /* 
+ * Recovery got interrupted ... + * signal back that we have finished using the array. + */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; } - save_flags(flags); - cli(); - mddev->pers->mark_spare(mddev, spare, SPARE_ACTIVE); - spare->state |= (1 << MD_SYNC_DEVICE); - spare->state |= (1 << MD_ACTIVE_DEVICE); - sb->spare_disks--; - sb->active_disks++; mddev->sb_dirty = 1; - md_update_sb(mddev - md_dev); - restore_flags(flags); + md_update_sb(mddev); + goto restart; } + printk(KERN_INFO "md: recovery thread finished ...\n"); } +int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x) +{ + struct md_list_head *tmp; + mddev_t *mddev; + + if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT) + || (code == MD_SYS_POWER_OFF)) { + + printk(KERN_INFO "stopping all md devices.\n"); + + ITERATE_MDDEV(mddev,tmp) + do_md_stop (mddev, 1); + /* + * certain more exotic SCSI devices are known to be + * volatile wrt too early system reboots. While the + * right place to handle this issue is the given + * driver, we do want to have a safe RAID driver ... 
+ */ + md_mdelay(1000*1); + } + return NOTIFY_DONE; +} + +struct notifier_block md_notifier = { + md_notify_reboot, + NULL, + 0 +}; + +void md__init raid_setup(char *str, int *ints) +{ + char tmpline[100]; + int len, pos, nr, i; + + len = strlen(str) + 1; + nr = 0; + pos = 0; + + for (i = 0; i < len; i++) { + char c = str[i]; + + if (c == ',' || !c) { + tmpline[pos] = 0; + if (!strcmp(tmpline,"noautodetect")) + raid_setup_args.noautodetect = 1; + nr++; + pos = 0; + continue; + } + tmpline[pos] = c; + pos++; + } + raid_setup_args.set = 1; + return; +} + #ifdef CONFIG_MD_BOOT struct { unsigned long set; - int pers[MAX_MD_DEV]; - kdev_t devices[MAX_MD_DEV][MAX_REAL]; -} md_setup_args __initdata = { + int pers[MAX_MD_DEVS]; + kdev_t devices[MAX_MD_DEVS][MAX_REAL]; +} md_setup_args md__initdata = { 0,{0},{{0}} }; @@ -1161,7 +3828,7 @@ * the MD devices (by specifying multiple "md=" lines) * instead of just one. -- KTK */ -int __init md_setup(char *str) +static int __init md_setup(char *str) { int minor, level, factor, fault, i; kdev_t device; @@ -1173,31 +3840,31 @@ get_option(&str, &fault) != 2) { printk("md: Too few arguments supplied to md=.\n"); return 0; - } else if (minor >= MAX_MD_DEV) { - printk ("md: Minor device number too high.\n"); + } else if (minor >= MAX_MD_DEVS) { + printk ("md: Minor device number too high.\n"); return 0; } else if (md_setup_args.set & (1 << minor)) { printk ("md: Warning - md=%d,... 
has been specified twice;\n" " will discard the first definition.\n", minor); - } + } switch(level) { #ifdef CONFIG_MD_LINEAR case -1: level = LINEAR; pername = "linear"; - break; + break; #endif #ifdef CONFIG_MD_STRIPED case 0: level = STRIPED; pername = "striped"; - break; + break; #endif default: printk ("md: The kernel has not been configured for raid%d" " support!\n", level); return 0; - } + } devnames = str; for (i = 0; str; i++) { if ((device = name_to_kdev_t(str))) { @@ -1221,60 +3888,80 @@ md_setup_args.set |= (1 << minor); return 0; } - #endif +void hsm_init (void); +void translucent_init (void); void linear_init (void); void raid0_init (void); void raid1_init (void); void raid5_init (void); -int __init md_init (void) +int md__init md_init (void) { - printk ("md driver %d.%d.%d MAX_MD_DEV=%d, MAX_REAL=%d\n", - MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, - MAX_MD_DEV, MAX_REAL); - - if (register_blkdev (MD_MAJOR, "md", &md_fops)) - { - printk ("Unable to get major %d for md\n", MD_MAJOR); - return (-1); - } - - blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); - read_ahead[MD_MAJOR]=INT_MAX; - memset(md_dev, 0, MAX_MD_DEV * sizeof (struct md_dev)); - md_gendisk.next=gendisk_head; - - gendisk_head=&md_gendisk; - -#if SUPPORT_RECONSTRUCTION - if ((md_sync_thread = md_register_thread(mdsyncd, NULL)) == NULL) - printk("md: bug: md_sync_thread == NULL\n"); -#endif /* SUPPORT_RECONSTRUCTION */ + static char * name = "mdrecoveryd"; + + printk (KERN_INFO "md driver %d.%d.%d MAX_MD_DEVS=%d, MAX_REAL=%d\n", + MD_MAJOR_VERSION, MD_MINOR_VERSION, + MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MAX_REAL); + + if (register_blkdev (MD_MAJOR, "md", &md_fops)) + { + printk (KERN_ALERT "Unable to get major %d for md\n", MD_MAJOR); + return (-1); + } + blk_dev[MD_MAJOR].queue = md_get_queue; + + read_ahead[MD_MAJOR] = INT_MAX; + md_gendisk.next = gendisk_head; + + gendisk_head = &md_gendisk; + + md_recovery_thread = md_register_thread(md_do_recovery, NULL, 
name); + if (!md_recovery_thread) + printk(KERN_ALERT "bug: couldn't allocate md_recovery_thread\n"); + + md_register_reboot_notifier(&md_notifier); + md_register_sysctl(); + +#ifdef CONFIG_MD_HSM + hsm_init (); +#endif +#ifdef CONFIG_MD_TRANSLUCENT + translucent_init (); +#endif #ifdef CONFIG_MD_LINEAR - linear_init (); + linear_init (); #endif #ifdef CONFIG_MD_STRIPED - raid0_init (); + raid0_init (); #endif #ifdef CONFIG_MD_MIRRORING - raid1_init (); + raid1_init (); #endif #ifdef CONFIG_MD_RAID5 - raid5_init (); + raid5_init (); +#endif +#if defined(CONFIG_MD_RAID5) || defined(CONFIG_MD_RAID5_MODULE) + /* + * pick a XOR routine, runtime. + */ + calibrate_xor_block(); #endif - return (0); + + return (0); } #ifdef CONFIG_MD_BOOT -void __init md_setup_drive(void) +static void __init md_setup_drive(void) { + if(md_setup_args.set) + do_md_setup(md_setup_args.str, md_setup_args.ints); int minor, i; kdev_t dev; - for (minor = 0; minor < MAX_MD_DEV; minor++) { + for (minor = 0; minor < MAX_MD_DEVS; minor++) { if ((md_setup_args.set & (1 << minor)) == 0) continue; printk("md: Loading md%d.\n", minor); @@ -1286,3 +3973,42 @@ __setup("md=", md_setup); #endif + +MD_EXPORT_SYMBOL(md_size); +MD_EXPORT_SYMBOL(register_md_personality); +MD_EXPORT_SYMBOL(unregister_md_personality); +MD_EXPORT_SYMBOL(partition_name); +MD_EXPORT_SYMBOL(md_error); +MD_EXPORT_SYMBOL(md_recover_arrays); +MD_EXPORT_SYMBOL(md_register_thread); +MD_EXPORT_SYMBOL(md_unregister_thread); +MD_EXPORT_SYMBOL(md_update_sb); +MD_EXPORT_SYMBOL(md_map); +MD_EXPORT_SYMBOL(md_wakeup_thread); +MD_EXPORT_SYMBOL(md_do_sync); +MD_EXPORT_SYMBOL(md_print_devices); +MD_EXPORT_SYMBOL(find_rdev_nr); +MD_EXPORT_SYMBOL(md_check_ordering); +MD_EXPORT_SYMBOL(md_interrupt_thread); +MD_EXPORT_SYMBOL(mddev_map); + +static void md_geninit (struct gendisk *gdisk) +{ + int i; + + for(i = 0; i < MAX_MD_DEVS; i++) { + md_blocksizes[i] = 1024; + md_maxreadahead[i] = MD_READAHEAD; + md_gendisk.part[i].start_sect = -1; /* avoid partition 
check */ + md_gendisk.part[i].nr_sects = 0; + } + + printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + + blksize_size[MD_MAJOR] = md_blocksizes; + md_set_global_readahead(md_maxreadahead); + +#ifdef CONFIG_PROC_FS + create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL); +#endif +} --- linux/drivers/block/raid0.c.orig Fri May 8 09:17:13 1998 +++ linux/drivers/block/raid0.c Sun Jan 16 17:45:53 2000 @@ -1,9 +1,10 @@ - /* raid0.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER or + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + RAID-0 management functions. @@ -18,146 +19,201 @@ */ #include -#include -#include -#include +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int create_strip_zones (int minor, struct md_dev *mddev) +static int create_strip_zones (mddev_t *mddev) { - int i, j, c=0; - int current_offset=0; - struct real_dev *smallest_by_zone; - struct raid0_data *data=(struct raid0_data *) mddev->private; - - data->nr_strip_zones=1; - - for (i=1; inb_dev; i++) - { - for (j=0; jdevices[i].size==mddev->devices[j].size) - { - c=1; - break; - } - - if (!c) - data->nr_strip_zones++; - - c=0; - } - - if ((data->strip_zone=vmalloc(sizeof(struct strip_zone)*data->nr_strip_zones)) == NULL) - return 1; - - data->smallest=NULL; - - for (i=0; inr_strip_zones; i++) - { - data->strip_zone[i].dev_offset=current_offset; - smallest_by_zone=NULL; - c=0; - - for (j=0; jnb_dev; j++) - if (mddev->devices[j].size>current_offset) - { - data->strip_zone[i].dev[c++]=mddev->devices+j; - if (!smallest_by_zone || - smallest_by_zone->size > mddev->devices[j].size) - smallest_by_zone=mddev->devices+j; - } - - data->strip_zone[i].nb_dev=c; - data->strip_zone[i].size=(smallest_by_zone->size-current_offset)*c; - - if (!data->smallest || - data->smallest->size > data->strip_zone[i].size) - data->smallest=data->strip_zone+i; - - data->strip_zone[i].zone_offset=i ? 
(data->strip_zone[i-1].zone_offset+ - data->strip_zone[i-1].size) : 0; - current_offset=smallest_by_zone->size; - } - return 0; + int i, c, j, j1, j2; + int current_offset, curr_zone_offset; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + + /* + * The number of 'same size groups' + */ + conf->nr_strip_zones = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { + printk("raid0: looking at %s\n", partition_name(rdev1->dev)); + c = 0; + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { + printk("raid0: comparing %s(%d) with %s(%d)\n", partition_name(rdev1->dev), rdev1->size, partition_name(rdev2->dev), rdev2->size); + if (rdev2 == rdev1) { + printk("raid0: END\n"); + break; + } + if (rdev2->size == rdev1->size) + { + /* + * Not unique, dont count it as a new + * group + */ + printk("raid0: EQUAL\n"); + c = 1; + break; + } + printk("raid0: NOT EQUAL\n"); + } + if (!c) { + printk("raid0: ==> UNIQUE\n"); + conf->nr_strip_zones++; + printk("raid0: %d zones\n", conf->nr_strip_zones); + } + } + printk("raid0: FINAL %d zones\n", conf->nr_strip_zones); + + conf->strip_zone = vmalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones); + if (!conf->strip_zone) + return 1; + + + conf->smallest = NULL; + current_offset = 0; + curr_zone_offset = 0; + + for (i = 0; i < conf->nr_strip_zones; i++) + { + struct strip_zone *zone = conf->strip_zone + i; + + printk("zone %d\n", i); + zone->dev_offset = current_offset; + smallest = NULL; + c = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + + printk(" checking %s ...", partition_name(rdev->dev)); + if (rdev->size > current_offset) + { + printk(" contained as device %d\n", c); + zone->dev[c] = rdev; + c++; + if (!smallest || (rdev->size size)) { + smallest = rdev; + printk(" (%d) is smallest!.\n", rdev->size); + } + } else + printk(" nope.\n"); + } + + zone->nb_dev = c; + zone->size = (smallest->size - current_offset) * c; + printk(" zone->nb_dev: %d, size: %d\n",zone->nb_dev,zone->size); + + if (!conf->smallest 
|| (zone->size < conf->smallest->size)) + conf->smallest = zone; + + zone->zone_offset = curr_zone_offset; + curr_zone_offset += zone->size; + + current_offset = smallest->size; + printk("current zone offset: %d\n", current_offset); + } + printk("done.\n"); + return 0; } -static int raid0_run (int minor, struct md_dev *mddev) +static int raid0_run (mddev_t *mddev) { - int cur=0, i=0, size, zone0_size, nb_zone; - struct raid0_data *data; + int cur=0, i=0, size, zone0_size, nb_zone; + raid0_conf_t *conf; - MOD_INC_USE_COUNT; - - if ((mddev->private=vmalloc (sizeof (struct raid0_data))) == NULL) return 1; - data=(struct raid0_data *) mddev->private; - - if (create_strip_zones (minor, mddev)) - { - vfree(data); - return 1; - } - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); - - printk ("raid0 : Allocating %ld bytes for hash.\n",(long)sizeof(struct raid0_hash)*nb_zone); - if ((data->hash_table=vmalloc (sizeof (struct raid0_hash)*nb_zone)) == NULL) - { - vfree(data->strip_zone); - vfree(data); - return 1; - } - size=data->strip_zone[cur].size; - - i=0; - while (curnr_strip_zones) - { - data->hash_table[i].zone0=data->strip_zone+cur; - - if (size>=data->smallest->size)/* If we completely fill the slot */ - { - data->hash_table[i++].zone1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==data->nr_strip_zones) continue; - size=data->strip_zone[cur].size; - } - - continue; - } - - if (++cur==data->nr_strip_zones) /* Last dev, set unit1 as NULL */ - { - data->hash_table[i].zone1=NULL; - continue; - } - - zone0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=data->strip_zone[cur].size; - data->hash_table[i++].zone1=data->strip_zone+cur; - size-=(data->smallest->size - zone0_size); - } + MOD_INC_USE_COUNT; - return (0); + conf = vmalloc(sizeof (raid0_conf_t)); + if (!conf) + goto out; + mddev->private = (void *)conf; + + if (md_check_ordering(mddev)) { + printk("raid0: disks 
are not ordered, aborting!\n"); + goto out_free_conf; + } + + if (create_strip_zones (mddev)) + goto out_free_conf; + + printk("raid0 : md_size is %d blocks.\n", md_size[mdidx(mddev)]); + printk("raid0 : conf->smallest->size is %d blocks.\n", conf->smallest->size); + nb_zone = md_size[mdidx(mddev)]/conf->smallest->size + + (md_size[mdidx(mddev)] % conf->smallest->size ? 1 : 0); + printk("raid0 : nb_zone is %d.\n", nb_zone); + conf->nr_zones = nb_zone; + + printk("raid0 : Allocating %d bytes for hash.\n", + sizeof(struct raid0_hash)*nb_zone); + + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); + if (!conf->hash_table) + goto out_free_zone_conf; + size = conf->strip_zone[cur].size; + + i = 0; + while (cur < conf->nr_strip_zones) { + conf->hash_table[i].zone0 = conf->strip_zone + cur; + + /* + * If we completely fill the slot + */ + if (size >= conf->smallest->size) { + conf->hash_table[i++].zone1 = NULL; + size -= conf->smallest->size; + + if (!size) { + if (++cur == conf->nr_strip_zones) + continue; + size = conf->strip_zone[cur].size; + } + continue; + } + if (++cur == conf->nr_strip_zones) { + /* + * Last dev, set unit1 as NULL + */ + conf->hash_table[i].zone1=NULL; + continue; + } + + /* + * Here we use a 2nd dev to fill the slot + */ + zone0_size = size; + size = conf->strip_zone[cur].size; + conf->hash_table[i++].zone1 = conf->strip_zone + cur; + size -= (conf->smallest->size - zone0_size); + } + return 0; + +out_free_zone_conf: + vfree(conf->strip_zone); + conf->strip_zone = NULL; + +out_free_conf: + vfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return 1; } - -static int raid0_stop (int minor, struct md_dev *mddev) +static int raid0_stop (mddev_t *mddev) { - struct raid0_data *data=(struct raid0_data *) mddev->private; + raid0_conf_t *conf = mddev_to_conf(mddev); - vfree (data->hash_table); - vfree (data->strip_zone); - vfree (data); + vfree (conf->hash_table); + conf->hash_table = NULL; + vfree (conf->strip_zone); + 
conf->strip_zone = NULL; + vfree (conf); + mddev->private = NULL; - MOD_DEC_USE_COUNT; - return 0; + MOD_DEC_USE_COUNT; + return 0; } /* @@ -167,129 +223,142 @@ * Of course, those facts may not be valid anymore (and surely won't...) * Hey guys, there's some work out there ;-) */ -static int raid0_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static int raid0_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid0_data *data=(struct raid0_data *) mddev->private; - static struct raid0_hash *hash; - struct strip_zone *zone; - struct real_dev *tmp_dev; - int blk_in_chunk, factor, chunk, chunk_size; - long block, rblock; - - factor=FACTOR(mddev); - chunk_size=(1UL << FACTOR_SHIFT(factor)); - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - /* Sanity check */ - if ((chunk_size*2)<(*rsector % (chunk_size*2))+size) - { - printk ("raid0_convert : can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, *rsector, size); - return (-1); - } - - if (block >= (hash->zone0->size + - hash->zone0->zone_offset)) - { - if (!hash->zone1) - { - printk ("raid0_convert : hash->zone1==NULL for block %ld\n", block); - return (-1); - } - - zone=hash->zone1; - } - else - zone=hash->zone0; + unsigned long size = bh->b_size >> 10; + raid0_conf_t *conf = mddev_to_conf(mddev); + struct raid0_hash *hash; + struct strip_zone *zone; + mdk_rdev_t *tmp_dev; + int blk_in_chunk, chunksize_bits, chunk, chunk_size; + long block, rblock; + + chunk_size = mddev->param.chunk_size >> 10; + chunksize_bits = ffz(~chunk_size); + block = bh->b_blocknr * size; + hash = conf->hash_table + block / conf->smallest->size; + + /* Sanity check */ + if (chunk_size < (block % chunk_size) + size) + goto bad_map; + + if (!hash) + goto bad_hash; + + if (!hash->zone0) + goto bad_zone0; + + if (block >= (hash->zone0->size + hash->zone0->zone_offset)) { + if (!hash->zone1) + goto bad_zone1; + zone = hash->zone1; + 
} else + zone = hash->zone0; - blk_in_chunk=block & (chunk_size -1); - chunk=(block - zone->zone_offset) / (zone->nb_dev<<FACTOR_SHIFT(factor)); - tmp_dev=zone->dev[(block >> FACTOR_SHIFT(factor)) % zone->nb_dev]; - rblock=(chunk << FACTOR_SHIFT(factor)) + blk_in_chunk + zone->dev_offset; - - *rdev=tmp_dev->dev; - *rsector=rblock<<1; - - return (0); + blk_in_chunk = block & (chunk_size -1); + chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); + tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset; + + /* + * Important, at this point we are not guaranteed to be the only + * CPU modifying b_rdev and b_rsector! Only __make_request() later + * on serializes the IO. So in 2.4 we must never write temporary + * values to bh->b_rdev, like 2.2 and 2.0 did. + */ + bh->b_rdev = tmp_dev->dev; + bh->b_rsector = rblock << 1; + + generic_make_request(rw, bh); + + return 0; + +bad_map: + printk ("raid0_make_request bug: can't convert block across chunks or bigger than %dk %ld %ld\n", chunk_size, bh->b_rsector, size); + return -1; +bad_hash: + printk("raid0_make_request bug: hash==NULL for block %ld\n", block); + return -1; +bad_zone0: + printk ("raid0_make_request bug: hash->zone0==NULL for block %ld\n", block); + return -1; +bad_zone1: + printk ("raid0_make_request bug: hash->zone1==NULL for block %ld\n", block); + return -1; } - -static int raid0_status (char *page, int minor, struct md_dev *mddev) +static int raid0_status (char *page, mddev_t *mddev) { - int sz=0; + int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG - int j, k; - struct raid0_data *data=(struct raid0_data *) mddev->private; + int j, k; + raid0_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; j<data->nr_zones; j++) - { - sz+=sprintf (page+sz, "[z%d", - data->hash_table[j].zone0-data->strip_zone); - if (data->hash_table[j].zone1) - sz+=sprintf (page+sz, "/z%d] ", - data->hash_table[j].zone1-data->strip_zone); - else - sz+=sprintf 
(page+sz, "] "); - } + sz += sprintf(page + sz, " "); + for (j = 0; j < conf->nr_zones; j++) { + sz += sprintf(page + sz, "[z%d", + conf->hash_table[j].zone0 - conf->strip_zone); + if (conf->hash_table[j].zone1) + sz += sprintf(page+sz, "/z%d] ", + conf->hash_table[j].zone1 - conf->strip_zone); + else + sz += sprintf(page+sz, "] "); + } - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page + sz, "\n"); - for (j=0; j<data->nr_strip_zones; j++) - { - sz+=sprintf (page+sz, " z%d=[", j); - for (k=0; k<data->strip_zone[j].nb_dev; k++) - sz+=sprintf (page+sz, "%s/", - partition_name(data->strip_zone[j].dev[k]->dev)); - sz--; - sz+=sprintf (page+sz, "] zo=%d do=%d s=%d\n", - data->strip_zone[j].zone_offset, - data->strip_zone[j].dev_offset, - data->strip_zone[j].size); - } + for (j = 0; j < conf->nr_strip_zones; j++) { + sz += sprintf(page + sz, " z%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + sz += sprintf (page+sz, "%s/", partition_name( + conf->strip_zone[j].dev[k]->dev)); + sz--; + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", + conf->strip_zone[j].zone_offset, + conf->strip_zone[j].dev_offset, + conf->strip_zone[j].size); + } #endif - sz+=sprintf (page+sz, " %dk chunks", 1<<FACTOR_SHIFT(FACTOR(mddev))); - return (sz); + sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024); + return sz; } - -static struct md_personality raid0_personality= +static mdk_personality_t raid0_personality= { - "raid0", - raid0_map, - NULL, /* no special make_request */ - NULL, /* no special end_request */ - raid0_run, - raid0_stop, - raid0_status, - NULL, /* no ioctls */ - 0, - NULL, /* no error_handler */ - NULL, /* hot_add_disk */ - NULL, /* hot_remove_disk */ - NULL /* mark_spare */ + "raid0", + NULL, /* no special map */ + raid0_make_request, + NULL, /* no special end_request */ + raid0_run, + raid0_stop, + raid0_status, + NULL, /* no ioctls */ + 0, + NULL, /* no error_handler */ + NULL, /* no diskop */ + NULL, /* no stop resync */ + NULL /* no restart resync */ }; - #ifndef MODULE void raid0_init (void) { - register_md_personality (RAID0, &raid0_personality); 
+ register_md_personality (RAID0, &raid0_personality); } #else int init_module (void) { - return (register_md_personality (RAID0, &raid0_personality)); + return (register_md_personality (RAID0, &raid0_personality)); } void cleanup_module (void) { - unregister_md_personality (RAID0); + unregister_md_personality (RAID0); } #endif + --- linux/drivers/block/ll_rw_blk.c.orig Sun Jan 16 06:38:16 2000 +++ linux/drivers/block/ll_rw_blk.c Sun Jan 16 17:45:53 2000 @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -61,11 +62,16 @@ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; /* + * per-major idle-IO detection + */ +unsigned long io_events[MAX_BLKDEV] = {0, }; + +/* * used to wait on when there are no free requests */ DECLARE_WAIT_QUEUE_HEAD(wait_for_request); -/* This specifies how many sectors to read ahead on the disk. */ +/* This specifies how many sectors to read ahead on the disk. */ int read_ahead[MAX_BLKDEV] = {0, }; @@ -138,18 +144,24 @@ } /* - * Is called with the request spinlock aquired. * NOTE: the device-specific queue() functions * have to be atomic! 
*/ -static inline request_queue_t *get_queue(kdev_t dev) +request_queue_t * blk_get_queue (kdev_t dev) { int major = MAJOR(dev); struct blk_dev_struct *bdev = blk_dev + major; + unsigned long flags; + request_queue_t *ret; + spin_lock_irqsave(&io_request_lock,flags); if (bdev->queue) - return bdev->queue(dev); - return &blk_dev[major].request_queue; + ret = bdev->queue(dev); + else + ret = &blk_dev[major].request_queue; + spin_unlock_irqrestore(&io_request_lock,flags); + + return ret; } void blk_cleanup_queue(request_queue_t * q) @@ -159,12 +171,17 @@ void blk_queue_headactive(request_queue_t * q, int active) { - q->head_active = active; + q->head_active = active; +} + +void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug) +{ + q->plug_device_fn = plug; } -void blk_queue_pluggable(request_queue_t * q, int use_plug) +void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) { - q->use_plug = use_plug; + q->make_request_fn = mfn; } void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) @@ -173,8 +190,10 @@ q->current_request = NULL; q->merge_fn = NULL; q->merge_requests_fn = NULL; + q->merge_requests_fn = NULL; + q->make_request_fn = NULL; q->plug_tq.sync = 0; - q->plug_tq.routine = &unplug_device; + q->plug_tq.routine = &generic_unplug_device; q->plug_tq.data = q; q->plugged = 0; /* @@ -183,31 +202,11 @@ * use the appropriate functions to alter the queue properties. * as appropriate. */ - q->use_plug = 1; + q->plug_device_fn = NULL; q->head_active = 1; } /* - * remove the plug and let it rip.. 
- */ -void unplug_device(void * data) -{ - request_queue_t * q = (request_queue_t *) data; - unsigned long flags; - - spin_lock_irqsave(&io_request_lock,flags); - if( q->plugged ) - { - q->plugged = 0; - if( q->current_request != NULL ) - { - (q->request_fn)(q); - } - } - spin_unlock_irqrestore(&io_request_lock,flags); -} - -/* * "plug" the device if there are no outstanding requests: this will * force the transfer to start only after we have put all the requests * on the list. @@ -215,8 +214,12 @@ * This is called with interrupts off and no requests on the queue. * (and with the request spinlock aquired) */ -static inline void plug_device(request_queue_t * q) +inline void generic_plug_device (request_queue_t *q, kdev_t dev) { + if (MAJOR(dev) == MD_MAJOR) { + spin_unlock_irq(&io_request_lock); + BUG(); + } if (q->current_request) return; @@ -225,6 +228,23 @@ } /* + * remove the plug and let it rip.. + */ +void generic_unplug_device(void * data) +{ + request_queue_t * q = (request_queue_t *) data; + unsigned long flags; + + spin_lock_irqsave(&io_request_lock,flags); + if (q->plugged) { + q->plugged = 0; + if (q->current_request) + (q->request_fn)(q); + } + spin_unlock_irqrestore(&io_request_lock,flags); +} + +/* * look for a free request in the first N entries. * NOTE: interrupts must be disabled on the way in (on SMP the request queue * spinlock has to be aquired), and will still be disabled on the way out. @@ -321,7 +341,7 @@ } static inline void drive_stat_acct(struct request *req, - unsigned long nr_sectors, int new_io) + unsigned long nr_sectors, int new_io) { int major = MAJOR(req->rq_dev); int minor = MINOR(req->rq_dev); @@ -368,23 +388,17 @@ * which is important for drive_stat_acct() above. 
*/ -static void add_request(request_queue_t * q, struct request * req) +static inline void __add_request(request_queue_t * q, struct request * req) { int major = MAJOR(req->rq_dev); struct request * tmp; - unsigned long flags; drive_stat_acct(req, req->nr_sectors, 1); req->next = NULL; - /* - * We use the goto to reduce locking complexity - */ - spin_lock_irqsave(&io_request_lock,flags); - if (!(tmp = q->current_request)) { q->current_request = req; - goto out; + return; } for ( ; tmp->next ; tmp = tmp->next) { const int after_current = IN_ORDER(tmp,req); @@ -404,7 +418,7 @@ /* * FIXME(eric) I don't understand why there is a need for this * special case code. It clearly doesn't fit any more with - * the new queueing architecture, and it got added in 2.3.10. + * the new queueing architecture, and it got added in 2.3.10. * I am leaving this in here until I hear back from the COMPAQ * people. */ @@ -417,16 +431,13 @@ { (q->request_fn)(q); } - -out: - spin_unlock_irqrestore(&io_request_lock,flags); } /* * Has to be called with the request spinlock aquired */ static inline void attempt_merge (request_queue_t * q, - struct request *req, + struct request *req, int max_sectors, int max_segments) { @@ -445,23 +456,17 @@ if (total_segments > max_segments) return; - if( q->merge_requests_fn != NULL ) - { + if (q->merge_requests_fn) { /* * If we are not allowed to merge these requests, then * return. If we are allowed to merge, then the count * will have been updated to the appropriate number, * and we shouldn't do it here too. 
*/ - if( !(q->merge_requests_fn)(q, req, next) ) - { + if (!(q->merge_requests_fn)(q, req, next)) return; - } - } - else - { + } else req->nr_segments = total_segments; - } req->bhtail->b_reqnext = next->bh; req->bhtail = next->bhtail; @@ -471,11 +476,10 @@ wake_up (&wait_for_request); } -static void __make_request(request_queue_t * q, - int major, - int rw, +static inline void __make_request(request_queue_t * q, int rw, struct buffer_head * bh) { + int major = MAJOR(bh->b_rdev); unsigned int sector, count; struct request * req; int rw_ahead, max_req, max_sectors, max_segments; @@ -488,24 +492,22 @@ if (buffer_new(bh)) BUG(); - /* Only one thread can actually submit the I/O. */ - if (test_and_set_bit(BH_Lock, &bh->b_state)) - return; - if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; if (maxsector < count || maxsector - count < sector) { bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped); - /* This may well happen - the kernel calls bread() - without checking the size of the device, e.g., - when mounting a device. */ + if (!blk_size[major][MINOR(bh->b_rdev)]) + goto end_io; + /* This may well happen - the kernel calls bread() + without checking the size of the device, e.g., + when mounting a device. */ printk(KERN_INFO - "attempt to access beyond end of device\n"); + "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n", - kdevname(bh->b_rdev), rw, - (sector + count)>>1, - blk_size[major][MINOR(bh->b_rdev)]); + kdevname(bh->b_rdev), rw, + (sector + count)>>1, + blk_size[major][MINOR(bh->b_rdev)]); goto end_io; } } @@ -539,8 +541,7 @@ max_req = (NR_REQUEST * 2) / 3; break; default: - printk(KERN_ERR "make_request: bad block dev cmd," - " must be R/W/RA/WA\n"); + BUG(); goto end_io; } @@ -561,10 +562,12 @@ #endif /* look for a free request. */ - /* Loop uses two requests, 1 for loop and 1 for the real device. - * Cut max_req in half to avoid running out and deadlocking. 
*/ + /* + * Loop uses two requests, 1 for loop and 1 for the real device. + * Cut max_req in half to avoid running out and deadlocking. + */ if ((major == LOOP_MAJOR) || (major == NBD_MAJOR)) - max_req >>= 1; + max_req >>= 1; /* * Try to coalesce the new request with old requests @@ -580,71 +583,72 @@ req = q->current_request; if (!req) { /* MD and loop can't handle plugging without deadlocking */ - if (major != MD_MAJOR && major != LOOP_MAJOR && - major != DDV_MAJOR && major != NBD_MAJOR - && q->use_plug) - plug_device(q); /* is atomic */ + if (q->plug_device_fn) + q->plug_device_fn(q, bh->b_rdev); /* is atomic */ + else + generic_plug_device(q, bh->b_rdev); /* is atomic */ } else switch (major) { - /* - * FIXME(eric) - this entire switch statement is going away - * soon, and we will instead key off of q->head_active to decide - * whether the top request in the queue is active on the device - * or not. - */ - case IDE0_MAJOR: /* same as HD_MAJOR */ - case IDE1_MAJOR: - case FLOPPY_MAJOR: - case IDE2_MAJOR: - case IDE3_MAJOR: - case IDE4_MAJOR: - case IDE5_MAJOR: - case IDE6_MAJOR: - case IDE7_MAJOR: - case IDE8_MAJOR: - case IDE9_MAJOR: - case ACSI_MAJOR: - case MFM_ACORN_MAJOR: /* - * The scsi disk and cdrom drivers completely remove the request - * from the queue when they start processing an entry. For this - * reason it is safe to continue to add links to the top entry for - * those devices. + * FIXME(eric) - this entire switch statement is going + * away soon, and we will instead key off of q->head_active + * to decide whether the top request in the queue is active + * on the device or not. 
+ */ + case IDE0_MAJOR: /* same as HD_MAJOR */ + case IDE1_MAJOR: + case FLOPPY_MAJOR: + case IDE2_MAJOR: + case IDE3_MAJOR: + case IDE4_MAJOR: + case IDE5_MAJOR: + case IDE6_MAJOR: + case IDE7_MAJOR: + case IDE8_MAJOR: + case IDE9_MAJOR: + case ACSI_MAJOR: + case MFM_ACORN_MAJOR: + /* + * The scsi disk and cdrom drivers completely remove the + * request from the queue when they start processing an entry. + * For this reason it is safe to continue to add links to the + * top entry for those devices. * - * All other drivers need to jump over the first entry, as that - * entry may be busy being processed and we thus can't change it. + * All other drivers need to jump over the first entry, as + * that entry may be busy being processed and we thus can't + * change it. */ if (req == q->current_request) - req = req->next; + req = req->next; if (!req) break; /* fall through */ - case SCSI_DISK0_MAJOR: - case SCSI_DISK1_MAJOR: - case SCSI_DISK2_MAJOR: - case SCSI_DISK3_MAJOR: - case SCSI_DISK4_MAJOR: - case SCSI_DISK5_MAJOR: - case SCSI_DISK6_MAJOR: - case SCSI_DISK7_MAJOR: - case SCSI_CDROM_MAJOR: - case DAC960_MAJOR+0: - case DAC960_MAJOR+1: - case DAC960_MAJOR+2: - case DAC960_MAJOR+3: - case DAC960_MAJOR+4: - case DAC960_MAJOR+5: - case DAC960_MAJOR+6: - case DAC960_MAJOR+7: - case I2O_MAJOR: - case COMPAQ_SMART2_MAJOR+0: - case COMPAQ_SMART2_MAJOR+1: - case COMPAQ_SMART2_MAJOR+2: - case COMPAQ_SMART2_MAJOR+3: - case COMPAQ_SMART2_MAJOR+4: - case COMPAQ_SMART2_MAJOR+5: - case COMPAQ_SMART2_MAJOR+6: - case COMPAQ_SMART2_MAJOR+7: + case SCSI_DISK0_MAJOR: + case SCSI_DISK1_MAJOR: + case SCSI_DISK2_MAJOR: + case SCSI_DISK3_MAJOR: + case SCSI_DISK4_MAJOR: + case SCSI_DISK5_MAJOR: + case SCSI_DISK6_MAJOR: + case SCSI_DISK7_MAJOR: + case SCSI_CDROM_MAJOR: + case DAC960_MAJOR+0: + case DAC960_MAJOR+1: + case DAC960_MAJOR+2: + case DAC960_MAJOR+3: + case DAC960_MAJOR+4: + case DAC960_MAJOR+5: + case DAC960_MAJOR+6: + case DAC960_MAJOR+7: + case I2O_MAJOR: + case 
COMPAQ_SMART2_MAJOR+0: + case COMPAQ_SMART2_MAJOR+1: + case COMPAQ_SMART2_MAJOR+2: + case COMPAQ_SMART2_MAJOR+3: + case COMPAQ_SMART2_MAJOR+4: + case COMPAQ_SMART2_MAJOR+5: + case COMPAQ_SMART2_MAJOR+6: + case COMPAQ_SMART2_MAJOR+7: do { if (req->sem) @@ -659,36 +663,31 @@ if (req->sector + req->nr_sectors == sector) { /* * The merge_fn is a more advanced way - * of accomplishing the same task. Instead + * of accomplishing the same task. Instead * of applying a fixed limit of some sort * we instead define a function which can * determine whether or not it is safe to * merge the request or not. */ - if( q->merge_fn == NULL ) - { + if (!q->merge_fn) { if (req->bhtail->b_data + req->bhtail->b_size != bh->b_data) { if (req->nr_segments < max_segments) req->nr_segments++; else continue; } - } - else - { + } else { /* * See if this queue has rules that * may suggest that we shouldn't merge - * this + * this */ - if( !(q->merge_fn)(q, req, bh) ) - { + if (!(q->merge_fn)(q, req, bh)) continue; - } } req->bhtail->b_reqnext = bh; req->bhtail = bh; - req->nr_sectors += count; + req->nr_sectors += count; drive_stat_acct(req, count, 0); /* Can we now merge this req with the next? */ attempt_merge(q, req, max_sectors, max_segments); @@ -696,45 +695,40 @@ } else if (req->sector - count == sector) { /* * The merge_fn is a more advanced way - * of accomplishing the same task. Instead + * of accomplishing the same task. Instead * of applying a fixed limit of some sort * we instead define a function which can * determine whether or not it is safe to * merge the request or not. 
*/ - if( q->merge_fn == NULL ) - { + if (!q->merge_fn) { if (bh->b_data + bh->b_size != req->bh->b_data) { if (req->nr_segments < max_segments) req->nr_segments++; else continue; } - } - else - { + } else { /* * See if this queue has rules that * may suggest that we shouldn't merge - * this + * this */ - if( !(q->merge_fn)(q, req, bh) ) - { + if (!(q->merge_fn)(q, req, bh)) continue; - } } - bh->b_reqnext = req->bh; - req->bh = bh; - req->buffer = bh->b_data; - req->current_nr_sectors = count; - req->sector = sector; - req->nr_sectors += count; + bh->b_reqnext = req->bh; + req->bh = bh; + req->buffer = bh->b_data; + req->current_nr_sectors = count; + req->sector = sector; + req->nr_sectors += count; drive_stat_acct(req, count, 0); } else continue; spin_unlock_irqrestore(&io_request_lock,flags); - return; + return; } while ((req = req->next) != NULL); } @@ -742,13 +736,34 @@ /* find an unused request. */ req = get_request(max_req, bh->b_rdev); - spin_unlock_irqrestore(&io_request_lock,flags); - -/* if no request available: if rw_ahead, forget it; otherwise try again blocking.. */ + /* + * if no request available: if rw_ahead, forget it, + * otherwise try again blocking.. + */ if (!req) { + spin_unlock_irqrestore(&io_request_lock,flags); if (rw_ahead) goto end_io; req = __get_request_wait(max_req, bh->b_rdev); + spin_lock_irqsave(&io_request_lock,flags); + } + /* + * Dont start the IO if the buffer has been + * invalidated meanwhile. (we have to do this + * within the io request lock and atomically + * before adding the request, see buffer.c's + * insert_into_queues_exclusive() function. + */ + if (!test_bit(BH_Req, &bh->b_state)) { + req->rq_status = RQ_INACTIVE; + spin_unlock_irqrestore(&io_request_lock,flags); + /* + * A fake 'everything went ok' completion event. + * The bh doesnt matter anymore, but we should not + * signal errors to RAID levels. 
+ */ + bh->b_end_io(bh, 1); + return; } /* fill up the request-info, and add it to the queue */ @@ -763,52 +778,51 @@ req->bh = bh; req->bhtail = bh; req->next = NULL; - add_request(q, req); + __add_request(q, req); + spin_unlock_irqrestore(&io_request_lock, flags); return; end_io: bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); } -void make_request(int major,int rw, struct buffer_head * bh) +void generic_make_request(int rw, struct buffer_head * bh) { request_queue_t * q; unsigned long flags; - - q = get_queue(bh->b_dev); - __make_request(q, major, rw, bh); + q = blk_get_queue(bh->b_rdev); + + __make_request(q, rw, bh); spin_lock_irqsave(&io_request_lock,flags); - if( !q->plugged ) + if (q && !q->plugged) (q->request_fn)(q); spin_unlock_irqrestore(&io_request_lock,flags); } - /* This function can be used to request a number of buffers from a block device. Currently the only restriction is that all buffers must belong to the same device */ -void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) +static void __ll_rw_block(int rw, int nr, struct buffer_head * bh[],int haslock) { unsigned int major; int correct_size; - request_queue_t * q; - unsigned long flags; + request_queue_t *q; int i; - major = MAJOR(bh[0]->b_dev); - if (!(q = get_queue(bh[0]->b_dev))) { + q = blk_get_queue(bh[0]->b_dev); + if (!q) { printk(KERN_ERR "ll_rw_block: Trying to read nonexistent block-device %s (%ld)\n", kdevname(bh[0]->b_dev), bh[0]->b_blocknr); goto sorry; } - /* Determine correct block size for this device. */ + /* Determine correct block size for this device. */ correct_size = BLOCK_SIZE; if (blksize_size[major]) { i = blksize_size[major][MINOR(bh[0]->b_dev)]; @@ -816,7 +830,7 @@ correct_size = i; } - /* Verify requested block sizes. */ + /* Verify requested block sizes. 
*/ for (i = 0; i < nr; i++) { if (bh[i]->b_size != correct_size) { printk(KERN_NOTICE "ll_rw_block: device %s: " @@ -825,19 +839,6 @@ correct_size, bh[i]->b_size); goto sorry; } - - /* Md remaps blocks now */ - bh[i]->b_rdev = bh[i]->b_dev; - bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9); -#ifdef CONFIG_BLK_DEV_MD - if (major==MD_MAJOR && - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev, - &bh[i]->b_rsector, bh[i]->b_size >> 9)) { - printk (KERN_ERR - "Bad md_map in ll_rw_block\n"); - goto sorry; - } -#endif } if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) { @@ -847,25 +848,32 @@ } for (i = 0; i < nr; i++) { + /* Only one thread can actually submit the I/O. */ + if (haslock) { + if (!buffer_locked(bh[i])) + BH_BUG(bh[i]); + } else { + if (test_and_set_bit(BH_Lock, &bh[i]->b_state)) + continue; + if (bh[i]->b_end_io != end_buffer_io_sync) + BH_BUG(bh[i]); + } + io_events[major]++; set_bit(BH_Req, &bh[i]->b_state); -#ifdef CONFIG_BLK_DEV_MD - if (MAJOR(bh[i]->b_dev) == MD_MAJOR) { - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]); - continue; + + if (q->make_request_fn) + q->make_request_fn(rw, bh[i]); + else { + bh[i]->b_rdev = bh[i]->b_dev; + bh[i]->b_rsector = bh[i]->b_blocknr*(bh[i]->b_size>>9); + + generic_make_request(rw, bh[i]); } -#endif - __make_request(q, MAJOR(bh[i]->b_rdev), rw, bh[i]); } - spin_lock_irqsave(&io_request_lock,flags); - if( !q->plugged ) - { - (q->request_fn)(q); - } - spin_unlock_irqrestore(&io_request_lock,flags); return; - sorry: +sorry: for (i = 0; i < nr; i++) { mark_buffer_clean(bh[i]); /* remeber to refile it */ clear_bit(BH_Uptodate, &bh[i]->b_state); @@ -874,8 +882,18 @@ return; } +void ll_rw_block(int rw, int nr, struct buffer_head * bh[]) +{ + __ll_rw_block(rw, nr, bh, 0); +} + +void ll_rw_block_locked(int rw, int nr, struct buffer_head * bh[]) +{ + __ll_rw_block(rw, nr, bh, 1); +} + #ifdef CONFIG_STRAM_SWAP -extern int stram_device_init( void ); +extern int stram_device_init (void); #endif /* @@ -885,8 +903,7 @@ * 1 
means we are done */ -int -end_that_request_first( struct request *req, int uptodate, char *name ) +int end_that_request_first (struct request *req, int uptodate, char *name) { struct buffer_head * bh; int nsect; @@ -921,8 +938,7 @@ return 0; } -void -end_that_request_last( struct request *req ) +void end_that_request_last(struct request *req) { if (req->sem != NULL) up(req->sem); @@ -936,7 +952,7 @@ struct blk_dev_struct *dev; for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) { - dev->queue = NULL; + dev->queue = NULL; blk_init_queue(&dev->request_queue, NULL); } @@ -1017,7 +1033,7 @@ sbpcd_init(); #endif CONFIG_SBPCD #ifdef CONFIG_AZTCD - aztcd_init(); + aztcd_init(); #endif CONFIG_AZTCD #ifdef CONFIG_CDU535 sony535_init(); --- linux/drivers/block/raid1.c.orig Thu Aug 12 19:16:28 1999 +++ linux/drivers/block/raid1.c Sun Jan 16 17:45:53 2000 @@ -1,6 +1,9 @@ -/************************************************************************ +/* * raid1.c : Multiple Devices driver for Linux - * Copyright (C) 1996 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman * * RAID-1 management functions. * @@ -15,50 +18,52 @@ */ #include -#include #include -#include -#include -#include +#include #include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -/* - * The following can be used to debug the driver - */ -/*#define RAID1_DEBUG*/ -#ifdef RAID1_DEBUG -#define PRINTK(x) do { printk x; } while (0); -#else -#define PRINTK(x) do { ; } while (0); -#endif +#define MAX_LINEAR_SECTORS 128 #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? 
(a) : (b)) -static struct md_personality raid1_personality; -static struct md_thread *raid1_thread = NULL; +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock; struct buffer_head *raid1_retry_list = NULL; -static int __raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) +static void * raid1_kmalloc (int size) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; + void * ptr; + /* + * now we are rather fault tolerant than nice, but + * there are a couple of places in the RAID code where we + * simply can not afford to fail an allocation because + * there is no failure return path (eg. make_request()) + */ + while (!(ptr = kmalloc (size, GFP_KERNEL))) + printk ("raid1: out of memory, retrying...\n"); + + memset(ptr, 0, size); + return ptr; +} + +static int __raid1_map (mddev_t *mddev, kdev_t *rdev, unsigned long size) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. 
*/ - PRINTK(("raid1_map().\n")); - - for (i=0; i<n; i++) { - if (raid_conf->mirrors[i].operational) { - *rdev = raid_conf->mirrors[i].dev; + for (i = 0; i < disks; i++) { + if (conf->mirrors[i].operational) { + *rdev = conf->mirrors[i].dev; return (0); } } @@ -67,29 +72,26 @@ return (-1); } -static int raid1_map (struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size) -{ - return 0; -} - -void raid1_reschedule_retry (struct buffer_head *bh) +static void raid1_reschedule_retry (struct buffer_head *bh) { + unsigned long flags; struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); - PRINTK(("raid1_reschedule_retry().\n")); - + md_spin_lock_irqsave(&retry_list_lock, flags); r1_bh->next_retry = raid1_retry_list; raid1_retry_list = bh; - md_wakeup_thread(raid1_thread); + md_spin_unlock_irqrestore(&retry_list_lock, flags); + md_wakeup_thread(conf->thread); } /* - * raid1_end_buffer_io() is called when we have finished servicing a mirrored + * raid1_end_bh_io() is called when we have finished servicing a mirrored * operation and are ready to return a success/failure code to the buffer * cache layer. 
*/ -static inline void raid1_end_buffer_io(struct raid1_bh *r1_bh, int uptodate) +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) { struct buffer_head *bh = r1_bh->master_bh; @@ -97,27 +99,16 @@ kfree(r1_bh); } -int raid1_one_error=0; - void raid1_end_request (struct buffer_head *bh, int uptodate) { struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_dev_id); - unsigned long flags; - save_flags(flags); - cli(); - PRINTK(("raid1_end_request().\n")); - - if (raid1_one_error) { - raid1_one_error=0; - uptodate=0; - } /* * this branch is our 'one mirror IO has finished' event handler: */ if (!uptodate) md_error (bh->b_dev, bh->b_rdev); - else { + else /* * Set BH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher @@ -128,7 +119,6 @@ * wait for the 'master' buffer_head. */ set_bit (BH_Uptodate, &r1_bh->state); - } /* * We split up the read and write side, imho they are @@ -136,83 +126,58 @@ */ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) { - - PRINTK(("raid1_end_request(), read branch.\n")); - /* * we have only one buffer_head on the read side */ if (uptodate) { - PRINTK(("raid1_end_request(), read branch, uptodate.\n")); - raid1_end_buffer_io(r1_bh, uptodate); - restore_flags(flags); + raid1_end_bh_io(r1_bh, uptodate); return; } /* * oops, read error: */ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_reschedule_retry (bh); - restore_flags(flags); + partition_name(bh->b_dev), bh->b_blocknr); + raid1_reschedule_retry(bh); return; } /* - * WRITE. - */ - PRINTK(("raid1_end_request(), write branch.\n")); - - /* + * WRITE: + * * Let's see if all mirrored write operations have finished - * already [we have irqs off, so we can decrease]: + * already. 
*/ - if (!--r1_bh->remaining) { - struct md_dev *mddev = r1_bh->mddev; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - int i, n = raid_conf->raid_disks; - - PRINTK(("raid1_end_request(), remaining == 0.\n")); + if (atomic_dec_and_test(&r1_bh->remaining)) { + int i, disks = MD_SB_DISKS; - for ( i=0; i<n; i++) - if (r1_bh->mirror_bh[i]) kfree(r1_bh->mirror_bh[i]); + for ( i = 0; i < disks; i++) { + struct buffer_head *bh = r1_bh->mirror_bh[i]; + if (bh) { + // FIXME: make us a regular bcache member + kfree(bh); + } + } - raid1_end_buffer_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); + raid1_end_bh_io(r1_bh, test_bit(BH_Uptodate, &r1_bh->state)); } - else PRINTK(("raid1_end_request(), remaining == %u.\n", r1_bh->remaining)); - restore_flags(flags); } -/* This routine checks if the undelying device is an md device and in that - * case it maps the blocks before putting the request on the queue - */ -static inline void -map_and_make_request (int rw, struct buffer_head *bh) -{ - if (MAJOR (bh->b_rdev) == MD_MAJOR) - md_map (MINOR (bh->b_rdev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); - clear_bit(BH_Lock, &bh->b_state); - make_request (MAJOR (bh->b_rdev), rw, bh); -} - -static int -raid1_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) +static int raid1_make_request (mddev_t *mddev, int rw, + struct buffer_head * bh) { - - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; struct raid1_bh * r1_bh; - int n = raid_conf->raid_disks, i, sum_bhs = 0, switch_disks = 0, sectors; + int disks = MD_SB_DISKS; + int i, sum_bhs = 0, switch_disks = 0, sectors, skip_writeback = 0; struct mirror_info *mirror; - PRINTK(("raid1_make_request().\n")); - while (!( /* FIXME: now we are rather fault tolerant than nice */ - r1_bh = kmalloc (sizeof (struct raid1_bh), GFP_KERNEL) - ) ) - printk ("raid1_make_request(#1): out of memory\n"); - memset 
(r1_bh, 0, sizeof (struct raid1_bh)); + if (!buffer_locked(bh)) + BUG(); + r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); /* * make_request() can abort the operation when READA is being @@ -220,61 +185,94 @@ * * Currently, just replace the command with READ/WRITE. */ - if (rw == READA) rw = READ; + if (rw == READA) + rw = READ; - if (rw == WRITE) - mark_buffer_clean(bh); /* Too early ? */ + if (rw == WRITE) { + rw = WRITERAW; + /* + * we first clean the bh, then we start the IO, then + * when the IO has finished, we end_io the bh and + * mark it uptodate. This way we do not miss the + * case when the bh got dirty again during the IO. + * + * We do an important optimization here - if the + * buffer was not dirty and we are during resync or + * reconstruction, then we can skip writing it back + * to the master disk! (we still have to write it + * back to the other disks, because we are not sync + * yet.) + */ + skip_writeback = 0; + if (atomic_set_buffer_clean(bh)) + __mark_buffer_clean(bh); + else { + if (conf->resync_mirrors) + skip_writeback = 1; + else { + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + return 0; + } + } + } -/* - * i think the read and write branch should be separated completely, since we want - * to do read balancing on the read side for example. Comments? :) --mingo - */ + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? 
:) --mingo + */ - r1_bh->master_bh=bh; - r1_bh->mddev=mddev; + r1_bh->master_bh = bh; + r1_bh->mddev = mddev; r1_bh->cmd = rw; + bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); + + if (rw == READ) { + int last_used = conf->last_used; - if (rw==READ || rw==READA) { - int last_used = raid_conf->last_used; - PRINTK(("raid1_make_request(), read branch.\n")); - mirror = raid_conf->mirrors + last_used; + /* + * read balancing logic: + */ + mirror = conf->mirrors + last_used; bh->b_rdev = mirror->dev; sectors = bh->b_size >> 9; - if (bh->b_blocknr * sectors == raid_conf->next_sect) { - raid_conf->sect_count += sectors; - if (raid_conf->sect_count >= mirror->sect_limit) + + switch_disks = 0; + if (bh->b_blocknr * sectors == conf->next_sect) { + conf->sect_count += sectors; + if (conf->sect_count >= mirror->sect_limit) switch_disks = 1; } else switch_disks = 1; - raid_conf->next_sect = (bh->b_blocknr + 1) * sectors; - if (switch_disks) { - PRINTK(("read-balancing: switching %d -> %d (%d sectors)\n", last_used, mirror->next, raid_conf->sect_count)); - raid_conf->sect_count = 0; - last_used = raid_conf->last_used = mirror->next; + conf->next_sect = (bh->b_blocknr + 1) * sectors; + /* + * Do not switch disks if full resync is in progress ... + */ + if (switch_disks && !conf->resync_mirrors) { + conf->sect_count = 0; + last_used = conf->last_used = mirror->next; /* - * Do not switch to write-only disks ... resyncing - * is in progress + * Do not switch to write-only disks ... 
+ * reconstruction is in progress */ - while (raid_conf->mirrors[last_used].write_only) - raid_conf->last_used = raid_conf->mirrors[last_used].next; + while (conf->mirrors[last_used].write_only) + conf->last_used = conf->mirrors[last_used].next; } - PRINTK (("raid1 read queue: %d %d\n", MAJOR (bh->b_rdev), MINOR (bh->b_rdev))); bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; bh_req->b_dev_id = r1_bh; - map_and_make_request (rw, bh_req); + generic_make_request (rw, bh_req); return 0; } /* - * WRITE. + * WRITE: */ - PRINTK(("raid1_make_request(n=%d), write branch.\n",n)); - for (i = 0; i < n; i++) { + for (i = 0; i < disks; i++) { - if (!raid_conf->mirrors [i].operational) { + if (!conf->mirrors[i].operational) { /* * the r1_bh->mirror_bh[i] pointer remains NULL */ @@ -282,85 +280,93 @@ continue; } - /* - * We should use a private pool (size depending on NR_REQUEST), - * to avoid writes filling up the memory with bhs - * - * Such pools are much faster than kmalloc anyways (so we waste almost - * nothing by not using the master bh when writing and win alot of cleanness) - * - * but for now we are cool enough. --mingo - * - * It's safe to sleep here, buffer heads cannot be used in a shared - * manner in the write branch. 
Look how we lock the buffer at the beginning - * of this function to grok the difference ;) - */ - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mirror_bh[i] = kmalloc (sizeof (struct buffer_head), GFP_KERNEL) - ) ) - printk ("raid1_make_request(#2): out of memory\n"); - memset (mirror_bh[i], 0, sizeof (struct buffer_head)); - - /* - * prepare mirrored bh (fields ordered for max mem throughput): - */ - mirror_bh [i]->b_blocknr = bh->b_blocknr; - mirror_bh [i]->b_dev = bh->b_dev; - mirror_bh [i]->b_rdev = raid_conf->mirrors [i].dev; - mirror_bh [i]->b_rsector = bh->b_rsector; - mirror_bh [i]->b_state = (1<b_count, 1); - mirror_bh [i]->b_size = bh->b_size; - mirror_bh [i]->b_data = bh->b_data; - mirror_bh [i]->b_list = BUF_LOCKED; - mirror_bh [i]->b_end_io = raid1_end_request; - mirror_bh [i]->b_dev_id = r1_bh; - - r1_bh->mirror_bh[i] = mirror_bh[i]; - sum_bhs++; - } - - r1_bh->remaining = sum_bhs; - - PRINTK(("raid1_make_request(), write branch, sum_bhs=%d.\n",sum_bhs)); - - /* - * We have to be a bit careful about the semaphore above, thats why we - * start the requests separately. Since kmalloc() could fail, sleep and - * make_request() can sleep too, this is the safer solution. Imagine, - * end_request decreasing the semaphore before we could have set it up ... - * We could play tricks with the semaphore (presetting it and correcting - * at the end if sum_bhs is not 'n' but we have to do end_request by hand - * if all requests finish until we had a chance to set up the semaphore - * correctly ... lots of races). - */ - for (i = 0; i < n; i++) - if (mirror_bh [i] != NULL) - map_and_make_request (rw, mirror_bh [i]); + /* + * special case for reconstruction ... 
+ */ + if (skip_writeback && (i == conf->last_used)) { + mirror_bh[i] = NULL; + continue; + } + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mirror_bh[i] = raid1_kmalloc(sizeof(struct buffer_head)); + mirror_bh[i]->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mirror_bh[i]->b_blocknr = bh->b_blocknr; + mirror_bh[i]->b_dev = bh->b_dev; + mirror_bh[i]->b_rdev = conf->mirrors[i].dev; + mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_state = (1<b_size = bh->b_size; + mirror_bh[i]->b_data = bh->b_data; + mirror_bh[i]->b_list = BUF_LOCKED; + mirror_bh[i]->b_end_io = raid1_end_request; + mirror_bh[i]->b_dev_id = r1_bh; + + r1_bh->mirror_bh[i] = mirror_bh[i]; + sum_bhs++; + } + md_atomic_set(&r1_bh->remaining, sum_bhs); + + /* + * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). 
+ */ + for (i = 0; i < disks; i++) { + struct buffer_head *mbh = mirror_bh[i]; + if (mbh) + generic_make_request(rw, mbh); + } return (0); } -static int raid1_status (char *page, int minor, struct md_dev *mddev) +static int raid1_status (char *page, mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); int sz = 0, i; - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->mirrors [i].operational ? "U" : "_"); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } -static void raid1_fix_links (struct raid1_data *raid_conf, int failed_index) +static void unlink_disk (raid1_conf_t *conf, int target) { - int disks = raid_conf->raid_disks; - int j; + int disks = MD_SB_DISKS; + int i; - for (j = 0; j < disks; j++) - if (raid_conf->mirrors [j].next == failed_index) - raid_conf->mirrors [j].next = raid_conf->mirrors [failed_index].next; + for (i = 0; i < disks; i++) + if (conf->mirrors[i].next == target) + conf->mirrors[i].next = conf->mirrors[target].next; } #define LAST_DISK KERN_ALERT \ @@ -379,48 +385,53 @@ #define ALREADY_SYNCING KERN_INFO \ "raid1: syncing already in progress.\n" -static int raid1_error (struct md_dev *mddev, kdev_t dev) +static void mark_disk_bad (mddev_t *mddev, int failed) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - int disks = raid_conf->raid_disks; - int i; + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + unlink_disk(conf, failed); + 
mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + md_wakeup_thread(conf->thread); + conf->working_disks--; + printk (DISK_FAILED, partition_name (mirror->dev), + conf->working_disks); +} - PRINTK(("raid1_error called\n")); +static int raid1_error (mddev_t *mddev, kdev_t dev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; - if (raid_conf->working_disks == 1) { + if (conf->working_disks == 1) { /* * Uh oh, we can do nothing if this is our last disk, but * first check if this is a queued request for a device * which has just failed. */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) - if (mirror->dev == dev && !mirror->operational) + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && !mirrors[i].operational) return 0; + } printk (LAST_DISK); } else { - /* Mark disk as unusable */ - for (i = 0, mirror = raid_conf->mirrors; i < disks; - i++, mirror++) { - if (mirror->dev == dev && mirror->operational){ - mirror->operational = 0; - raid1_fix_links (raid_conf, i); - sb->disks[mirror->number].state |= - (1 << MD_FAULTY_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_SYNC_DEVICE); - sb->disks[mirror->number].state &= - ~(1 << MD_ACTIVE_DEVICE); - sb->active_disks--; - sb->working_disks--; - sb->failed_disks++; - mddev->sb_dirty = 1; - md_wakeup_thread(raid1_thread); - raid_conf->working_disks--; - printk (DISK_FAILED, kdevname (dev), - raid_conf->working_disks); + /* + * Mark disk as unusable + */ + for (i = 0; i < disks; i++) { + if (mirrors[i].dev==dev && mirrors[i].operational) { + mark_disk_bad(mddev, i); + break; } } } @@ -433,156 +444,300 @@ #undef START_SYNCING /* - * This is the personality-specific hot-addition routine + * Insert the spare disk into the drive-ring 
*/ +static void link_disk(raid1_conf_t *conf, struct mirror_info *mirror) +{ + int j, next; + int disks = MD_SB_DISKS; + struct mirror_info *p = conf->mirrors; -#define NO_SUPERBLOCK KERN_ERR \ -"raid1: cannot hot-add disk to the array with no RAID superblock\n" + for (j = 0; j < disks; j++, p++) + if (p->operational && !p->write_only) { + next = p->next; + p->next = mirror->raid_disk; + mirror->next = next; + return; + } -#define WRONG_LEVEL KERN_ERR \ -"raid1: hot-add: level of disk is not RAID-1\n" + printk("raid1: bug: no read-operational devices\n"); +} -#define HOT_ADD_SUCCEEDED KERN_INFO \ -"raid1: device %s hot-added\n" +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + printk("RAID1 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; + } + printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, + conf->raid_disks, conf->nr_disks); + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} -static int raid1_hot_add_disk (struct md_dev *mddev, kdev_t dev) +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { - unsigned long flags; - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; - struct mirror_info *mirror; - md_superblock_t *sb = mddev->sb; - struct real_dev * realdev; - int n; + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); /* - * The device has its superblock already read and it was found - * to be consistent for generic RAID usage. Now we check whether - * it's usable for RAID-1 hot addition. 
+ * find the disk ... */ + switch (state) { - n = mddev->nb_dev++; - realdev = &mddev->devices[n]; - if (!realdev->sb) { - printk (NO_SUPERBLOCK); - return -EINVAL; - } - if (realdev->sb->level != 1) { - printk (WRONG_LEVEL); - return -EINVAL; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - /* FIXME: are there other things left we could sanity-check? 
*/ + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; /* - * We have to disable interrupts, as our RAID-1 state is used - * from irq handlers as well. + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) */ - save_flags(flags); - cli(); + case DISKOP_SPARE_ACTIVE: - raid_conf->raid_disks++; - mirror = raid_conf->mirrors+n; + sdisk = conf->mirrors + spare_disk; + fdisk = conf->mirrors + failed_disk; - mirror->number=n; - mirror->raid_disk=n; - mirror->dev=dev; - mirror->next=0; /* FIXME */ - mirror->sect_limit=128; + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; - mirror->operational=0; - mirror->spare=1; - mirror->write_only=0; + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } - sb->disks[n].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[n].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[n].state &= ~(1 << MD_ACTIVE_DEVICE); - sb->nr_disks++; - sb->spare_disks++; + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } - restore_flags(flags); + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } - md_update_sb(MINOR(dev)); + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } - printk (HOT_ADD_SUCCEEDED, kdevname(realdev->dev)); + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, 
*sdisk); - return 0; -} + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); -#undef NO_SUPERBLOCK -#undef WRONG_LEVEL -#undef HOT_ADD_SUCCEEDED + *d = failed_desc; -/* - * Insert the spare disk into the drive-ring - */ -static void add_ring(struct raid1_data *raid_conf, struct mirror_info *mirror) -{ - int j, next; - struct mirror_info *p = raid_conf->mirrors; + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + link_disk(conf, fdisk); - for (j = 0; j < raid_conf->raid_disks; j++, p++) - if (p->operational && !p->write_only) { - next = p->next; - p->next = mirror->raid_disk; - mirror->next = next; - return; + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
+ */ + + conf->working_disks++; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; } - printk("raid1: bug: no read-operational devices\n"); + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->mirrors + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + conf->nr_disks++; + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid1_conf(conf); + return err; } -static int raid1_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, - int state) -{ - int i = 0, failed_disk = -1; - struct raid1_data *raid_conf = mddev->private; - struct mirror_info *mirror = raid_conf->mirrors; - md_descriptor_t *descriptor; - unsigned long flags; - for (i = 0; i < MD_SB_DISKS; i++, mirror++) { - if (mirror->spare && mirror->number == spare->number) - goto found; - } - return 1; -found: - for (i = 0, mirror = raid_conf->mirrors; i < raid_conf->raid_disks; - i++, mirror++) - if (!mirror->operational) - failed_disk = i; +#define IO_ERROR KERN_ALERT \ +"raid1: %s: unrecoverable I/O read error for block %lu\n" - save_flags(flags); - cli(); - switch (state) { - case SPARE_WRITE: - mirror->operational = 1; - mirror->write_only = 1; - raid_conf->raid_disks = MAX(raid_conf->raid_disks, - mirror->raid_disk + 1); - break; - case SPARE_INACTIVE: - mirror->operational = 0; - mirror->write_only = 0; - break; - case SPARE_ACTIVE: - mirror->spare = 0; - mirror->write_only = 0; - raid_conf->working_disks++; - add_ring(raid_conf, 
mirror); - - if (failed_disk != -1) { - descriptor = &mddev->sb->disks[raid_conf->mirrors[failed_disk].number]; - i = spare->raid_disk; - spare->raid_disk = descriptor->raid_disk; - descriptor->raid_disk = i; - } - break; - default: - printk("raid1_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; - } - restore_flags(flags); - return 0; -} +#define REDIRECT_SECTOR KERN_ERR \ +"raid1: %s: redirecting sector %lu to another mirror\n" /* * This is a kernel thread which: @@ -590,62 +745,94 @@ * 1. Retries failed read operations on working mirrors. * 2. Updates the raid superblock when problems encounter. */ -void raid1d (void *data) +static void raid1d (void *data) { + struct raid1_bh *r1_bh; struct buffer_head *bh; - kdev_t dev; unsigned long flags; - struct raid1_bh * r1_bh; - struct md_dev *mddev; + mddev_t *mddev; + kdev_t dev; - PRINTK(("raid1d() active\n")); - save_flags(flags); - cli(); - while (raid1_retry_list) { + for (;;) { + md_spin_lock_irqsave(&retry_list_lock, flags); bh = raid1_retry_list; + if (!bh) + break; r1_bh = (struct raid1_bh *)(bh->b_dev_id); raid1_retry_list = r1_bh->next_retry; - restore_flags(flags); + md_spin_unlock_irqrestore(&retry_list_lock, flags); - mddev = md_dev + MINOR(bh->b_dev); + mddev = kdev_to_mddev(bh->b_dev); if (mddev->sb_dirty) { - printk("dirty sb detected, updating.\n"); + printk(KERN_INFO "dirty sb detected, updating.\n"); mddev->sb_dirty = 0; - md_update_sb(MINOR(bh->b_dev)); + md_update_sb(mddev); } dev = bh->b_rdev; - __raid1_map (md_dev + MINOR(bh->b_dev), &bh->b_rdev, &bh->b_rsector, bh->b_size >> 9); + + __raid1_map (mddev, &bh->b_rdev, bh->b_size >> 9); if (bh->b_rdev == dev) { - printk (KERN_ALERT - "raid1: %s: unrecoverable I/O read error for block %lu\n", - kdevname(bh->b_dev), bh->b_blocknr); - raid1_end_buffer_io(r1_bh, 0); + printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr); + raid1_end_bh_io(r1_bh, 0); } else { - printk (KERN_ERR "raid1: %s: redirecting sector %lu to 
another mirror\n", - kdevname(bh->b_dev), bh->b_blocknr); - map_and_make_request (r1_bh->cmd, bh); + printk (REDIRECT_SECTOR, + partition_name(bh->b_dev), bh->b_blocknr); + generic_make_request (r1_bh->cmd, bh); } - cli(); } - restore_flags(flags); + md_spin_unlock_irqrestore(&retry_list_lock, flags); } +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + return; + } + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + up(&mddev->recovery_sem); +} + /* * This will catch the scenario in which one of the mirrors was * mounted as a normal device rather than as a part of a raid set. + * + * check_consistency is very personality-dependent, eg. RAID5 cannot + * do this check, it uses another method. 
*/ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid1_data *raid_conf = mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); + int disks = MD_SB_DISKS; kdev_t dev; struct buffer_head *bh = NULL; int i, rc = 0; char *buffer = NULL; - for (i = 0; i < raid_conf->raid_disks; i++) { - if (!raid_conf->mirrors[i].operational) + for (i = 0; i < disks; i++) { + printk("(checking disk %d)\n",i); + if (!conf->mirrors[i].operational) continue; - dev = raid_conf->mirrors[i].dev; + printk("(really checking disk %d)\n",i); + dev = conf->mirrors[i].dev; set_blocksize(dev, 4096); if ((bh = bread(dev, row / 4, 4096)) == NULL) break; @@ -674,166 +861,346 @@ return rc; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * we do not do this currently, as it's perfectly possible to + * have an inconsistent array when it's freshly created. Only + * newly written data has to be consistent. 
+ */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid1_run (int minor, struct md_dev *mddev) +#define INVALID_LEVEL KERN_WARNING \ +"raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"raid1: no operational mirrors for md%d\n" + +#define RUNNING_CKRAID KERN_ERR \ +"raid1: detected mirror differences -- running resync\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) { - struct raid1_data *raid_conf; - int i, j, raid_disk; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - kdevname(MKDEV(MD_MAJOR, minor)), sb->level); - MOD_DEC_USE_COUNT; - return -EIO; - } - /**** - * copy the now 
verified devices into our private RAID1 bookkeeping - * area. [whatever we allocate in raid1_run(), should be freed in - * raid1_stop()] - */ - - while (!( /* FIXME: now we are rather fault tolerant than nice */ - mddev->private = kmalloc (sizeof (struct raid1_data), GFP_KERNEL) - ) ) - printk ("raid1_run(): out of memory\n"); - raid_conf = mddev->private; - memset(raid_conf, 0, sizeof(*raid_conf)); - - PRINTK(("raid1_run(%d) called.\n", minor)); - - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid1: disabled mirror %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); + printk(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = raid1_kmalloc(sizeof(raid1_conf_t)); + mddev->private = conf; + if (!conf) { + printk(MEM_ERROR, mdidx(mddev)); + goto out; + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk(ERRORS, partition_name(rdev->dev)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); continue; } - - /* - * This is important -- we are using the descriptor on - * the disk only to get a pointer to the descriptor on - * the main superblock, which might be more recent. 
- */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid1: disabled mirror %s (errors detected)\n", kdevname(realdev->dev)); + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid1: disabled mirror %s (not in sync)\n", kdevname(realdev->dev)); + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + printk(NOT_IN_SYNC, + partition_name(rdev->dev)); continue; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid1: disabled mirror %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + printk(INCONSISTENT, + partition_name(rdev->dev)); continue; } - if (raid_conf->mirrors[raid_disk].operational) { - printk(KERN_ERR "raid1: disabled mirror %s (mirror %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(ALREADY_RUNNING, + partition_name(rdev->dev), + disk_idx); continue; } - printk(KERN_INFO "raid1: device %s operational as mirror %d\n", kdevname(realdev->dev), raid_disk); - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].operational = 1; - raid_conf->mirrors[raid_disk].sect_limit = 128; - raid_conf->working_disks++; + printk(OPERATIONAL, 
partition_name(rdev->dev), + disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid1: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->mirrors[raid_disk].number = descriptor->number; - raid_conf->mirrors[raid_disk].raid_disk = raid_disk; - raid_conf->mirrors[raid_disk].dev = mddev->devices [i].dev; - raid_conf->mirrors[raid_disk].sect_limit = 128; - - raid_conf->mirrors[raid_disk].operational = 0; - raid_conf->mirrors[raid_disk].write_only = 0; - raid_conf->mirrors[raid_disk].spare = 1; - } - } - if (!raid_conf->working_disks) { - printk(KERN_ERR "raid1: no operational mirrors for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; - } - - raid_conf->raid_disks = sb->raid_disks; - raid_conf->mddev = mddev; - - for (j = 0; !raid_conf->mirrors[j].operational; j++); - raid_conf->last_used = j; - for (i = raid_conf->raid_disks - 1; i >= 0; i--) { - if (raid_conf->mirrors[i].operational) { - PRINTK(("raid_conf->mirrors[%d].next == %d\n", i, j)); - raid_conf->mirrors[i].next = j; + printk(SPARE, partition_name(rdev->dev)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = rdev->dev; + disk->sect_limit = MAX_LINEAR_SECTORS; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + if (!conf->working_disks) { + printk(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + 
disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational; j++) + /* nothing */; + conf->last_used = j; + + /* + * initialize the 'working disks' list. + */ + for (i = conf->raid_disks - 1; i >= 0; i--) { + if (conf->mirrors[i].operational) { + conf->mirrors[i].next = j; j = i; } } - if (check_consistency(mddev)) { - printk(KERN_ERR "raid1: detected mirror differences -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - kfree(raid_conf); - mddev->private = NULL; - MOD_DEC_USE_COUNT; - return -EIO; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN))) { + /* + * we do sanity checks even if the device says + * it's clean ... + */ + if (check_consistency(mddev)) { + printk(RUNNING_CKRAID); + sb->state &= ~(1 << MD_SB_CLEAN); + } + } + + { + const char * name = "raid1d"; + + conf->thread = md_register_thread(raid1d, conf, name); + if (!conf->thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char * name = "raid1syncd"; + + conf->resync_thread = md_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + printk(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + printk(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + } + /* * Regenerate the "device is in sync with the raid set" bit for * each device. 
*/ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->mirrors[j].operational) + if (!conf->mirrors[j].operational) continue; - if (sb->disks[i].number == raid_conf->mirrors[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; - printk("raid1: raid set %s active with %d out of %d mirrors\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks); - /* Ok, everything is just fine now */ - return (0); + if (start_recovery) + md_recover_arrays(); + + + printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef RUNNING_CKRAID +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + md_interrupt_thread(conf->resync_thread); + printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; } -static int raid1_stop (int minor, struct md_dev *mddev) +static int 
raid1_stop (mddev_t *mddev) { - struct raid1_data *raid_conf = (struct raid1_data *) mddev->private; + raid1_conf_t *conf = mddev_to_conf(mddev); - kfree (raid_conf); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static struct md_personality raid1_personality= +static mdk_personality_t raid1_personality= { "raid1", - raid1_map, + NULL, raid1_make_request, raid1_end_request, raid1_run, @@ -842,15 +1209,13 @@ NULL, /* no ioctls */ 0, raid1_error, - raid1_hot_add_disk, - /* raid1_hot_remove_drive */ NULL, - raid1_mark_spare + raid1_diskop, + raid1_stop_resync, + raid1_restart_resync }; int raid1_init (void) { - if ((raid1_thread = md_register_thread(raid1d, NULL)) == NULL) - return -EBUSY; return register_md_personality (RAID1, &raid1_personality); } @@ -862,7 +1227,6 @@ void cleanup_module (void) { - md_unregister_thread (raid1_thread); unregister_md_personality (RAID1); } #endif --- linux/drivers/block/raid5.c.orig Tue Aug 31 20:30:47 1999 +++ linux/drivers/block/raid5.c Sun Jan 16 17:45:53 2000 @@ -1,6 +1,7 @@ -/***************************************************************************** +/* * raid5.c : Multiple Devices driver for Linux - * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar * * RAID-5 management functions. * @@ -14,130 +15,107 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ + #include #include #include -#include -#include +#include #include #include -#include -static struct md_personality raid5_personality; +static mdk_personality_t raid5_personality; /* * Stripe cache */ + #define NR_STRIPES 128 #define HASH_PAGES 1 #define HASH_PAGES_ORDER 0 #define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) #define HASH_MASK (NR_HASH - 1) -#define stripe_hash(raid_conf, sect, size) ((raid_conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) +#define stripe_hash(conf, sect, size) ((conf)->stripe_hashtbl[((sect) / (size >> 9)) & HASH_MASK]) /* * The following can be used to debug the driver */ #define RAID5_DEBUG 0 +#define CHECK_DEVLOCK() if (!conf->device_lock.lock) BUG() +#define CHECK_SHLOCK(sh) if (!stripe_locked(sh)) BUG() #if RAID5_DEBUG -#define PRINTK(x) do { printk x; } while (0); +#define PRINTK(x...) printk(x) +#define inline +#define __inline__ #else -#define PRINTK(x) do { ; } while (0) +#define inline +#define __inline__ +#define PRINTK(x...) do { } while (0) #endif +static void print_raid5_conf (raid5_conf_t *conf); + static inline int stripe_locked(struct stripe_head *sh) { return test_bit(STRIPE_LOCKED, &sh->state); } -static inline int stripe_error(struct stripe_head *sh) -{ - return test_bit(STRIPE_ERROR, &sh->state); -} - -/* - * Stripes are locked whenever new buffers can't be added to them. 
- */ -static inline void lock_stripe(struct stripe_head *sh) -{ - struct raid5_data *raid_conf = sh->raid_conf; - if (!test_and_set_bit(STRIPE_LOCKED, &sh->state)) { - PRINTK(("locking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes++; - } -} - -static inline void unlock_stripe(struct stripe_head *sh) +static void __unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - if (test_and_clear_bit(STRIPE_LOCKED, &sh->state)) { - PRINTK(("unlocking stripe %lu\n", sh->sector)); - raid_conf->nr_locked_stripes--; - wake_up(&sh->wait); - } + if (!md_test_and_clear_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + PRINTK("unlocking stripe %lu\n", sh->sector); + wake_up(&sh->wait); } -static inline void finish_stripe(struct stripe_head *sh) +static void finish_unlock_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - unlock_stripe(sh); + raid5_conf_t *conf = sh->raid_conf; sh->cmd = STRIPE_NONE; sh->phase = PHASE_COMPLETE; - raid_conf->nr_pending_stripes--; - raid_conf->nr_cached_stripes++; - wake_up(&raid_conf->wait_for_stripe); -} - -void __wait_on_stripe(struct stripe_head *sh) -{ - DECLARE_WAITQUEUE(wait, current); - - PRINTK(("wait_on_stripe %lu\n", sh->sector)); - sh->count++; - add_wait_queue(&sh->wait, &wait); -repeat: - set_current_state(TASK_UNINTERRUPTIBLE); - if (stripe_locked(sh)) { - schedule(); - goto repeat; - } - PRINTK(("wait_on_stripe %lu done\n", sh->sector)); - remove_wait_queue(&sh->wait, &wait); - sh->count--; - current->state = TASK_RUNNING; -} - -static inline void wait_on_stripe(struct stripe_head *sh) -{ - if (stripe_locked(sh)) - __wait_on_stripe(sh); + atomic_dec(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_cached_stripes); + __unlock_stripe(sh); + atomic_dec(&sh->count); + wake_up(&conf->wait_for_stripe); } -static inline void remove_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static void remove_hash(raid5_conf_t *conf, struct stripe_head *sh) { - 
PRINTK(("remove_hash(), stripe %lu\n", sh->sector)); + PRINTK("remove_hash(), stripe %lu\n", sh->sector); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); if (sh->hash_pprev) { if (sh->hash_next) sh->hash_next->hash_pprev = sh->hash_pprev; *sh->hash_pprev = sh->hash_next; sh->hash_pprev = NULL; - raid_conf->nr_hashed_stripes--; + atomic_dec(&conf->nr_hashed_stripes); } } -static inline void insert_hash(struct raid5_data *raid_conf, struct stripe_head *sh) +static void lock_get_bh (struct buffer_head *bh) +{ + while (md_test_and_set_bit(BH_Lock, &bh->b_state)) + __wait_on_buffer(bh); + bget(bh); +} + +static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) { - struct stripe_head **shp = &stripe_hash(raid_conf, sh->sector, sh->size); + struct stripe_head **shp = &stripe_hash(conf, sh->sector, sh->size); - PRINTK(("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", sh->sector, raid_conf->nr_hashed_stripes)); + PRINTK("insert_hash(), stripe %lu, nr_hashed_stripes %d\n", + sh->sector, atomic_read(&conf->nr_hashed_stripes)); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); if ((sh->hash_next = *shp) != NULL) (*shp)->hash_pprev = &sh->hash_next; *shp = sh; sh->hash_pprev = shp; - raid_conf->nr_hashed_stripes++; + atomic_inc(&conf->nr_hashed_stripes); } static struct buffer_head *get_free_buffer(struct stripe_head *sh, int b_size) @@ -145,13 +123,18 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->buffer_pool) == NULL) - return NULL; + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->buffer_pool; + if (!bh) + goto out_unlock; sh->buffer_pool = bh->b_next; bh->b_size = b_size; - restore_flags(flags); + if (bcount(bh) != 0) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -160,12 +143,17 @@ struct buffer_head *bh; unsigned long flags; - save_flags(flags); - cli(); - if ((bh = sh->bh_pool) == NULL) - return NULL; + CHECK_SHLOCK(sh); + 
md_spin_lock_irqsave(&sh->stripe_lock, flags); + bh = sh->bh_pool; + if (!bh) + goto out_unlock; sh->bh_pool = bh->b_next; - restore_flags(flags); + if (bcount(bh) != 0) + BUG(); +out_unlock: + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + return bh; } @@ -173,55 +161,58 @@ { unsigned long flags; - save_flags(flags); - cli(); + if (bcount(bh) != 0) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->buffer_pool; sh->buffer_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } static void put_free_bh(struct stripe_head *sh, struct buffer_head *bh) { unsigned long flags; - save_flags(flags); - cli(); + if (bcount(bh) != 0) + BUG(); + CHECK_SHLOCK(sh); + md_spin_lock_irqsave(&sh->stripe_lock, flags); bh->b_next = sh->bh_pool; sh->bh_pool = bh; - restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); } -static struct stripe_head *get_free_stripe(struct raid5_data *raid_conf) +static struct stripe_head *get_free_stripe(raid5_conf_t *conf) { struct stripe_head *sh; - unsigned long flags; - save_flags(flags); - cli(); - if ((sh = raid_conf->free_sh_list) == NULL) { - restore_flags(flags); - return NULL; - } - raid_conf->free_sh_list = sh->free_next; - raid_conf->nr_free_sh--; - if (!raid_conf->nr_free_sh && raid_conf->free_sh_list) - printk ("raid5: bug: free_sh_list != NULL, nr_free_sh == 0\n"); - restore_flags(flags); - if (sh->hash_pprev || sh->nr_pending || sh->count) - printk("get_free_stripe(): bug\n"); + md_spin_lock_irq(&conf->device_lock); + sh = conf->free_sh_list; + if (!sh) + goto out; + conf->free_sh_list = sh->free_next; + atomic_dec(&conf->nr_free_sh); + if (!atomic_read(&conf->nr_free_sh) && conf->free_sh_list) + BUG(); + if (sh->hash_pprev || md_atomic_read(&sh->nr_pending) || + atomic_read(&sh->count)) + BUG(); +out: + md_spin_unlock_irq(&conf->device_lock); return sh; } -static void put_free_stripe(struct raid5_data *raid_conf, struct stripe_head 
*sh) +static void __put_free_stripe(raid5_conf_t *conf, struct stripe_head *sh) { - unsigned long flags; - - save_flags(flags); - cli(); - sh->free_next = raid_conf->free_sh_list; - raid_conf->free_sh_list = sh; - raid_conf->nr_free_sh++; - restore_flags(flags); + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + clear_bit(STRIPE_LOCKED, &sh->state); + sh->free_next = conf->free_sh_list; + conf->free_sh_list = sh; + atomic_inc(&conf->nr_free_sh); } static void shrink_buffers(struct stripe_head *sh, int num) @@ -229,7 +220,8 @@ struct buffer_head *bh; while (num--) { - if ((bh = get_free_buffer(sh, -1)) == NULL) + bh = get_free_buffer(sh, -1); + if (!bh) return; free_page((unsigned long) bh->b_data); kfree(bh); @@ -241,26 +233,30 @@ struct buffer_head *bh; while (num--) { - if ((bh = get_free_bh(sh)) == NULL) + bh = get_free_bh(sh); + if (!bh) return; kfree(bh); } } -static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) +static int grow_raid5_buffers(struct stripe_head *sh, int num, int b_size, int priority) { struct buffer_head *bh; while (num--) { - if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL) + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) return 1; memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); bh->b_data = (char *) __get_free_page(priority); if (!bh->b_data) { kfree(bh); return 1; } bh->b_size = b_size; + bh_set(bh, 0); put_free_buffer(sh, bh); } return 0; @@ -271,259 +267,303 @@ struct buffer_head *bh; while (num--) { - if ((bh = kmalloc(sizeof(struct buffer_head), priority)) == NULL) + bh = kmalloc(sizeof(struct buffer_head), priority); + if (!bh) return 1; memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); put_free_bh(sh, bh); } return 0; } -static void raid5_kfree_buffer(struct stripe_head *sh, struct buffer_head *bh) +static void raid5_free_buffer(struct stripe_head *sh, struct buffer_head *bh) { 
- unsigned long flags; - - save_flags(flags); - cli(); put_free_buffer(sh, bh); - restore_flags(flags); } -static void raid5_kfree_bh(struct stripe_head *sh, struct buffer_head *bh) +static void raid5_free_bh(struct stripe_head *sh, struct buffer_head *bh) { - unsigned long flags; - - save_flags(flags); - cli(); put_free_bh(sh, bh); - restore_flags(flags); } -static void raid5_kfree_old_bh(struct stripe_head *sh, int i) +static void raid5_free_old_bh(struct stripe_head *sh, int i) { - if (!sh->bh_old[i]) { - printk("raid5_kfree_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i); - return; - } - raid5_kfree_buffer(sh, sh->bh_old[i]); + CHECK_SHLOCK(sh); + if (!sh->bh_old[i]) + BUG(); + raid5_free_buffer(sh, sh->bh_old[i]); sh->bh_old[i] = NULL; } static void raid5_update_old_bh(struct stripe_head *sh, int i) { - PRINTK(("stripe %lu, idx %d, updating cache copy\n", sh->sector, i)); - if (!sh->bh_copy[i]) { - printk("raid5_update_old_bh: bug: sector %lu, index %d not present\n", sh->sector, i); - return; - } + CHECK_SHLOCK(sh); + PRINTK("stripe %lu, idx %d, updating cache copy\n", sh->sector, i); + if (!sh->bh_copy[i]) + BUG(); if (sh->bh_old[i]) - raid5_kfree_old_bh(sh, i); + raid5_free_old_bh(sh, i); sh->bh_old[i] = sh->bh_copy[i]; sh->bh_copy[i] = NULL; } -static void kfree_stripe(struct stripe_head *sh) +static void free_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, j; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, j; - PRINTK(("kfree_stripe called, stripe %lu\n", sh->sector)); - if (sh->phase != PHASE_COMPLETE || stripe_locked(sh) || sh->count) { - printk("raid5: kfree_stripe(), sector %lu, phase %d, locked %d, count %d\n", sh->sector, sh->phase, stripe_locked(sh), sh->count); + if (atomic_read(&sh->count) != 0) + BUG(); + CHECK_DEVLOCK(); + CHECK_SHLOCK(sh); + PRINTK("free_stripe called, stripe %lu\n", sh->sector); + if (sh->phase != PHASE_COMPLETE || 
atomic_read(&sh->count)) { + PRINTK("raid5: free_stripe(), sector %lu, phase %d, count %d\n", sh->sector, sh->phase, atomic_read(&sh->count)); return; } for (j = 0; j < disks; j++) { if (sh->bh_old[j]) - raid5_kfree_old_bh(sh, j); + raid5_free_old_bh(sh, j); if (sh->bh_new[j] || sh->bh_copy[j]) - printk("raid5: bug: sector %lu, new %p, copy %p\n", sh->sector, sh->bh_new[j], sh->bh_copy[j]); + BUG(); } - remove_hash(raid_conf, sh); - put_free_stripe(raid_conf, sh); + remove_hash(conf, sh); + __put_free_stripe(conf, sh); } -static int shrink_stripe_cache(struct raid5_data *raid_conf, int nr) +static int shrink_stripe_cache(raid5_conf_t *conf, int nr) { struct stripe_head *sh; int i, count = 0; - PRINTK(("shrink_stripe_cache called, %d/%d, clock %d\n", nr, raid_conf->nr_hashed_stripes, raid_conf->clock)); + PRINTK("shrink_stripe_cache called, %d/%d, clock %d\n", nr, atomic_read(&conf->nr_hashed_stripes), conf->clock); + md_spin_lock_irq(&conf->device_lock); for (i = 0; i < NR_HASH; i++) { -repeat: - sh = raid_conf->stripe_hashtbl[(i + raid_conf->clock) & HASH_MASK]; + sh = conf->stripe_hashtbl[(i + conf->clock) & HASH_MASK]; for (; sh; sh = sh->hash_next) { if (sh->phase != PHASE_COMPLETE) continue; - if (stripe_locked(sh)) + if (atomic_read(&sh->count)) continue; - if (sh->count) + /* + * Try to lock this stripe: + */ + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) continue; - kfree_stripe(sh); + free_stripe(sh); if (++count == nr) { - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); - raid_conf->clock = (i + raid_conf->clock) & HASH_MASK; - return nr; + conf->clock = (i + conf->clock) & HASH_MASK; + goto out; } - goto repeat; } } - PRINTK(("shrink completed, nr_hashed_stripes %d\n", raid_conf->nr_hashed_stripes)); +out: + md_spin_unlock_irq(&conf->device_lock); + PRINTK("shrink completed, nr_hashed_stripes %d\n", atomic_read(&conf->nr_hashed_stripes)); return count; } -static struct stripe_head *find_stripe(struct raid5_data 
*raid_conf, unsigned long sector, int size) +void __wait_lock_stripe(struct stripe_head *sh) { - struct stripe_head *sh; + MD_DECLARE_WAITQUEUE(wait, current); - if (raid_conf->buffer_size != size) { - PRINTK(("switching size, %d --> %d\n", raid_conf->buffer_size, size)); - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - raid_conf->buffer_size = size; + PRINTK("wait_lock_stripe %lu\n", sh->sector); + if (!atomic_read(&sh->count)) + BUG(); + add_wait_queue(&sh->wait, &wait); +repeat: + set_current_state(TASK_UNINTERRUPTIBLE); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) { + schedule(); + goto repeat; } + PRINTK("wait_lock_stripe %lu done\n", sh->sector); + remove_wait_queue(&sh->wait, &wait); + current->state = TASK_RUNNING; +} - PRINTK(("find_stripe, sector %lu\n", sector)); - for (sh = stripe_hash(raid_conf, sector, size); sh; sh = sh->hash_next) - if (sh->sector == sector && sh->raid_conf == raid_conf) { - if (sh->size == size) { - PRINTK(("found stripe %lu\n", sector)); - return sh; - } else { - PRINTK(("switching size for %lu, %d --> %d\n", sector, sh->size, size)); - kfree_stripe(sh); - break; - } +static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + + PRINTK("__find_stripe, sector %lu\n", sector); + for (sh = stripe_hash(conf, sector, size); sh; sh = sh->hash_next) { + if (sh->sector == sector && sh->raid_conf == conf) { + if (sh->size != size) + BUG(); + return sh; } - PRINTK(("stripe %lu not in cache\n", sector)); + } + PRINTK("__stripe %lu not in cache\n", sector); return NULL; } -static int grow_stripes(struct raid5_data *raid_conf, int num, int priority) +static inline struct stripe_head *alloc_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + struct buffer_head *buffer_pool, *bh_pool; + + PRINTK("alloc_stripe called\n"); + + while ((sh = get_free_stripe(conf)) == NULL) { + shrink_stripe_cache(conf, conf->max_nr_stripes / 
8); + sh = get_free_stripe(conf); + if (sh) + break; + md_wakeup_thread(conf->thread); + PRINTK("waiting for some stripes to complete\n"); + sleep_on(&conf->wait_for_stripe); + } + + buffer_pool = sh->buffer_pool; + bh_pool = sh->bh_pool; + memset(sh, 0, sizeof(*sh)); + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + sh->buffer_pool = buffer_pool; + sh->bh_pool = bh_pool; + sh->phase = PHASE_COMPLETE; + sh->cmd = STRIPE_NONE; + sh->raid_conf = conf; + sh->sector = sector; + sh->size = size; + atomic_inc(&conf->nr_cached_stripes); + + return sh; +} + +static struct stripe_head *get_lock_stripe(raid5_conf_t *conf, unsigned long sector, int size) +{ + struct stripe_head *sh, *new = NULL; + + PRINTK("get_stripe, sector %lu\n", sector); + + /* + * Do this in set_blocksize()! + */ + if (conf->buffer_size != size) { + PRINTK("switching size, %d --> %d\n", conf->buffer_size, size); + shrink_stripe_cache(conf, conf->max_nr_stripes); + conf->buffer_size = size; + } + +repeat: + md_spin_lock_irq(&conf->device_lock); + sh = __find_stripe(conf, sector, size); + if (!sh) { + if (!new) { + md_spin_unlock_irq(&conf->device_lock); + new = alloc_stripe(conf, sector, size); + goto repeat; + } + sh = new; + new = NULL; + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + insert_hash(conf, sh); + atomic_inc(&sh->count); + md_spin_unlock_irq(&conf->device_lock); + } else { + atomic_inc(&sh->count); + if (new) { + if (md_test_and_set_bit(STRIPE_LOCKED, &new->state)) + BUG(); + __put_free_stripe(conf, new); + } + md_spin_unlock_irq(&conf->device_lock); + PRINTK("get_stripe, waiting, sector %lu\n", sector); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + __wait_lock_stripe(sh); + } + return sh; +} + +static int grow_stripes(raid5_conf_t *conf, int num, int priority) { struct stripe_head *sh; while (num--) { - if ((sh = kmalloc(sizeof(struct stripe_head), priority)) == NULL) + sh = kmalloc(sizeof(struct stripe_head), priority); + if (!sh) 
return 1; memset(sh, 0, sizeof(*sh)); - if (grow_buffers(sh, 2 * raid_conf->raid_disks, PAGE_SIZE, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); + sh->raid_conf = conf; + sh->stripe_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&sh->wait); + + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + if (grow_raid5_buffers(sh, 2 * conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); kfree(sh); return 1; } - if (grow_bh(sh, raid_conf->raid_disks, priority)) { - shrink_buffers(sh, 2 * raid_conf->raid_disks); - shrink_bh(sh, raid_conf->raid_disks); + if (grow_bh(sh, conf->raid_disks, priority)) { + shrink_buffers(sh, 2 * conf->raid_disks); + shrink_bh(sh, conf->raid_disks); kfree(sh); return 1; } - put_free_stripe(raid_conf, sh); - raid_conf->nr_stripes++; + md_spin_lock_irq(&conf->device_lock); + __put_free_stripe(conf, sh); + atomic_inc(&conf->nr_stripes); + md_spin_unlock_irq(&conf->device_lock); } return 0; } -static void shrink_stripes(struct raid5_data *raid_conf, int num) +static void shrink_stripes(raid5_conf_t *conf, int num) { struct stripe_head *sh; while (num--) { - sh = get_free_stripe(raid_conf); + sh = get_free_stripe(conf); if (!sh) break; - shrink_buffers(sh, raid_conf->raid_disks * 2); - shrink_bh(sh, raid_conf->raid_disks); + if (md_test_and_set_bit(STRIPE_LOCKED, &sh->state)) + BUG(); + shrink_buffers(sh, conf->raid_disks * 2); + shrink_bh(sh, conf->raid_disks); kfree(sh); - raid_conf->nr_stripes--; - } -} - -static struct stripe_head *kmalloc_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) -{ - struct stripe_head *sh = NULL, *tmp; - struct buffer_head *buffer_pool, *bh_pool; - - PRINTK(("kmalloc_stripe called\n")); - - while ((sh = get_free_stripe(raid_conf)) == NULL) { - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes / 8); - if ((sh = get_free_stripe(raid_conf)) != NULL) - break; - if (!raid_conf->nr_pending_stripes) - printk("raid5: bug: nr_free_sh == 
0, nr_pending_stripes == 0\n"); - md_wakeup_thread(raid_conf->thread); - PRINTK(("waiting for some stripes to complete\n")); - sleep_on(&raid_conf->wait_for_stripe); + atomic_dec(&conf->nr_stripes); } - - /* - * The above might have slept, so perhaps another process - * already created the stripe for us.. - */ - if ((tmp = find_stripe(raid_conf, sector, size)) != NULL) { - put_free_stripe(raid_conf, sh); - wait_on_stripe(tmp); - return tmp; - } - if (sh) { - buffer_pool = sh->buffer_pool; - bh_pool = sh->bh_pool; - memset(sh, 0, sizeof(*sh)); - sh->buffer_pool = buffer_pool; - sh->bh_pool = bh_pool; - sh->phase = PHASE_COMPLETE; - sh->cmd = STRIPE_NONE; - sh->raid_conf = raid_conf; - sh->sector = sector; - sh->size = size; - raid_conf->nr_cached_stripes++; - insert_hash(raid_conf, sh); - } else printk("raid5: bug: kmalloc_stripe() == NULL\n"); - return sh; -} - -static struct stripe_head *get_stripe(struct raid5_data *raid_conf, unsigned long sector, int size) -{ - struct stripe_head *sh; - - PRINTK(("get_stripe, sector %lu\n", sector)); - sh = find_stripe(raid_conf, sector, size); - if (sh) - wait_on_stripe(sh); - else - sh = kmalloc_stripe(raid_conf, sector, size); - return sh; } -static struct buffer_head *raid5_kmalloc_buffer(struct stripe_head *sh, int b_size) +static struct buffer_head *raid5_alloc_buffer(struct stripe_head *sh, int b_size) { struct buffer_head *bh; - if ((bh = get_free_buffer(sh, b_size)) == NULL) - printk("raid5: bug: raid5_kmalloc_buffer() == NULL\n"); + bh = get_free_buffer(sh, b_size); + if (!bh) + BUG(); return bh; } -static struct buffer_head *raid5_kmalloc_bh(struct stripe_head *sh) +static struct buffer_head *raid5_alloc_bh(struct stripe_head *sh) { struct buffer_head *bh; - if ((bh = get_free_bh(sh)) == NULL) - printk("raid5: bug: raid5_kmalloc_bh() == NULL\n"); + bh = get_free_bh(sh); + if (!bh) + BUG(); return bh; } -static inline void raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) +static void 
raid5_end_buffer_io (struct stripe_head *sh, int i, int uptodate) { struct buffer_head *bh = sh->bh_new[i]; + PRINTK("raid5_end_buffer_io %lu, uptodate: %d.\n", bh->b_rsector, uptodate); sh->bh_new[i] = NULL; - raid5_kfree_bh(sh, sh->bh_req[i]); + raid5_free_bh(sh, sh->bh_req[i]); sh->bh_req[i] = NULL; + PRINTK("calling %p->end_io: %p.\n", bh, bh->b_end_io); bh->b_end_io(bh, uptodate); if (!uptodate) printk(KERN_ALERT "raid5: %s: unrecoverable I/O error for " - "block %lu\n", kdevname(bh->b_dev), bh->b_blocknr); + "block %lu\n", partition_name(bh->b_dev), bh->b_blocknr); } static inline void raid5_mark_buffer_uptodate (struct buffer_head *bh, int uptodate) @@ -537,36 +577,35 @@ static void raid5_end_request (struct buffer_head * bh, int uptodate) { struct stripe_head *sh = bh->b_dev_id; - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks, i; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; unsigned long flags; - PRINTK(("end_request %lu, nr_pending %d\n", sh->sector, sh->nr_pending)); - save_flags(flags); - cli(); + PRINTK("end_request %lu, nr_pending %d, uptodate: %d, (caller: %p,%p,%p,%p).\n", sh->sector, atomic_read(&sh->nr_pending), uptodate, __builtin_return_address(0),__builtin_return_address(1),__builtin_return_address(2), __builtin_return_address(3)); + md_spin_lock_irqsave(&sh->stripe_lock, flags); raid5_mark_buffer_uptodate(bh, uptodate); - --sh->nr_pending; - if (!sh->nr_pending) { - md_wakeup_thread(raid_conf->thread); - atomic_inc(&raid_conf->nr_handle); - } if (!uptodate) md_error(bh->b_dev, bh->b_rdev); - if (raid_conf->failed_disks) { + if (conf->failed_disks) { for (i = 0; i < disks; i++) { - if (raid_conf->disks[i].operational) + if (conf->disks[i].operational) continue; if (bh != sh->bh_old[i] && bh != sh->bh_req[i] && bh != sh->bh_copy[i]) continue; - if (bh->b_rdev != raid_conf->disks[i].dev) + if (bh->b_rdev != conf->disks[i].dev) continue; set_bit(STRIPE_ERROR, &sh->state); } } - 
restore_flags(flags); + md_spin_unlock_irqrestore(&sh->stripe_lock, flags); + + if (atomic_dec_and_test(&sh->nr_pending)) { + atomic_inc(&conf->nr_handle); + md_wakeup_thread(conf->thread); + } } -static int raid5_map (struct md_dev *mddev, kdev_t *rdev, +static int raid5_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { /* No complex mapping used: the core of the work is done in the @@ -577,21 +616,22 @@ static void raid5_build_block (struct stripe_head *sh, struct buffer_head *bh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; char *b_data; - kdev_t dev = MKDEV(MD_MAJOR, minor); + kdev_t dev = mddev_to_kdev(mddev); int block = sh->sector / (sh->size >> 9); - b_data = ((volatile struct buffer_head *) bh)->b_data; + b_data = bh->b_data; memset (bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); init_buffer(bh, raid5_end_request, sh); bh->b_dev = dev; bh->b_blocknr = block; - ((volatile struct buffer_head *) bh)->b_data = b_data; - bh->b_rdev = raid_conf->disks[i].dev; + bh->b_data = b_data; + + bh->b_rdev = conf->disks[i].dev; bh->b_rsector = sh->sector; bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); @@ -599,33 +639,62 @@ bh->b_list = BUF_LOCKED; } -static int raid5_error (struct md_dev *mddev, kdev_t dev) +static int raid5_error (mddev_t *mddev, kdev_t dev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; struct disk_info *disk; int i; - PRINTK(("raid5_error called\n")); - raid_conf->resync_parity = 0; - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) + PRINTK("raid5_error called\n"); + conf->resync_parity = 0; + for (i = 0, disk = conf->disks; i < conf->raid_disks; 
i++, disk++) { if (disk->dev == dev && disk->operational) { disk->operational = 0; - sb->disks[disk->number].state |= (1 << MD_FAULTY_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_SYNC_DEVICE); - sb->disks[disk->number].state &= ~(1 << MD_ACTIVE_DEVICE); + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; - raid_conf->working_disks--; - raid_conf->failed_disks++; - md_wakeup_thread(raid_conf->thread); + conf->working_disks--; + conf->failed_disks++; + md_wakeup_thread(conf->thread); + printk (KERN_ALERT + "raid5: Disk failure on %s, disabling device." + " Operation continuing on %d devices\n", + partition_name (dev), conf->working_disks); + return -EIO; + } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->dev == dev) { printk (KERN_ALERT - "RAID5: Disk failure on %s, disabling device." - "Operation continuing on %d devices\n", - kdevname (dev), raid_conf->working_disks); + "raid5: Disk failure on spare %s\n", + partition_name (dev)); + if (!conf->spare->operational) { + MD_BUG(); + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks+disk->number); + mark_disk_nonsync(sb->disks+disk->number); + mark_disk_inactive(sb->disks+disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + return -EIO; } + } + MD_BUG(); return 0; } @@ -633,15 +702,14 @@ * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. 
*/ -static inline unsigned long -raid5_compute_sector (int r_sector, unsigned int raid_disks, unsigned int data_disks, - unsigned int * dd_idx, unsigned int * pd_idx, - struct raid5_data *raid_conf) +static unsigned long raid5_compute_sector(int r_sector, unsigned int raid_disks, + unsigned int data_disks, unsigned int * dd_idx, + unsigned int * pd_idx, raid5_conf_t *conf) { unsigned int stripe; int chunk_number, chunk_offset; unsigned long new_sector; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; /* First compute the information on this sector */ @@ -664,9 +732,9 @@ /* * Select the parity disk based on the user selected algorithm. */ - if (raid_conf->level == 4) + if (conf->level == 4) *pd_idx = data_disks; - else switch (raid_conf->algorithm) { + else switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: *pd_idx = data_disks - stripe % raid_disks; if (*dd_idx >= *pd_idx) @@ -686,7 +754,7 @@ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } /* @@ -707,16 +775,16 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i) { - struct raid5_data *raid_conf = sh->raid_conf; - int raid_disks = raid_conf->raid_disks, data_disks = raid_disks - 1; + raid5_conf_t *conf = sh->raid_conf; + int raid_disks = conf->raid_disks, data_disks = raid_disks - 1; unsigned long new_sector = sh->sector, check; - int sectors_per_chunk = raid_conf->chunk_size >> 9; + int sectors_per_chunk = conf->chunk_size >> 9; unsigned long stripe = new_sector / sectors_per_chunk; int chunk_offset = new_sector % sectors_per_chunk; int chunk_number, dummy1, dummy2, dd_idx = i; unsigned long r_sector, blocknr; - switch (raid_conf->algorithm) { + switch (conf->algorithm) { case ALGORITHM_LEFT_ASYMMETRIC: case ALGORITHM_RIGHT_ASYMMETRIC: if (i > sh->pd_idx) @@ -729,14 +797,14 @@ i -= 
(sh->pd_idx + 1); break; default: - printk ("raid5: unsupported algorithm %d\n", raid_conf->algorithm); + printk ("raid5: unsupported algorithm %d\n", conf->algorithm); } chunk_number = stripe * data_disks + i; r_sector = chunk_number * sectors_per_chunk + chunk_offset; blocknr = r_sector / (sh->size >> 9); - check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, raid_conf); + check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf); if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) { printk("compute_blocknr: map not correct\n"); return 0; @@ -744,144 +812,148 @@ return blocknr; } -#ifdef HAVE_ARCH_XORBLOCK -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - __xor_block((char *) dest->b_data, (char *) source->b_data, dest->b_size); -} -#else -static void xor_block(struct buffer_head *dest, struct buffer_head *source) -{ - long lines = dest->b_size / (sizeof (long)) / 8, i; - long *destp = (long *) dest->b_data, *sourcep = (long *) source->b_data; - - for (i = lines; i > 0; i--) { - *(destp + 0) ^= *(sourcep + 0); - *(destp + 1) ^= *(sourcep + 1); - *(destp + 2) ^= *(sourcep + 2); - *(destp + 3) ^= *(sourcep + 3); - *(destp + 4) ^= *(sourcep + 4); - *(destp + 5) ^= *(sourcep + 5); - *(destp + 6) ^= *(sourcep + 6); - *(destp + 7) ^= *(sourcep + 7); - destp += 8; - sourcep += 8; - } -} -#endif - static void compute_block(struct stripe_head *sh, int dd_idx) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - PRINTK(("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx)); + PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx); if (sh->bh_old[dd_idx] == NULL) - sh->bh_old[dd_idx] = raid5_kmalloc_buffer(sh, sh->size); + sh->bh_old[dd_idx] = raid5_alloc_buffer(sh, sh->size); 
raid5_build_block(sh, sh->bh_old[dd_idx], dd_idx); memset(sh->bh_old[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_old[dd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == dd_idx) continue; if (sh->bh_old[i]) { - xor_block(sh->bh_old[dd_idx], sh->bh_old[i]); - continue; - } else + bh_ptr[count++] = sh->bh_old[i]; + } else { printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i); + } + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if(count != 1) { + xor_block(count, &bh_ptr[0]); } raid5_mark_buffer_uptodate(sh->bh_old[dd_idx], 1); } static void compute_parity(struct stripe_head *sh, int method) { - struct raid5_data *raid_conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; - PRINTK(("compute_parity, stripe %lu, method %d\n", sh->sector, method)); + PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method); for (i = 0; i < disks; i++) { if (i == pd_idx || !sh->bh_new[i]) continue; if (!sh->bh_copy[i]) - sh->bh_copy[i] = raid5_kmalloc_buffer(sh, sh->size); + sh->bh_copy[i] = raid5_alloc_buffer(sh, sh->size); raid5_build_block(sh, sh->bh_copy[i], i); - mark_buffer_clean(sh->bh_new[i]); + if (atomic_set_buffer_clean(sh->bh_new[i])) + atomic_set_buffer_dirty(sh->bh_copy[i]); memcpy(sh->bh_copy[i]->b_data, sh->bh_new[i]->b_data, sh->size); } - if (sh->bh_copy[pd_idx] == NULL) - sh->bh_copy[pd_idx] = raid5_kmalloc_buffer(sh, sh->size); + if (sh->bh_copy[pd_idx] == NULL) { + sh->bh_copy[pd_idx] = raid5_alloc_buffer(sh, sh->size); + atomic_set_buffer_dirty(sh->bh_copy[pd_idx]); + } raid5_build_block(sh, sh->bh_copy[pd_idx], sh->pd_idx); if (method == RECONSTRUCT_WRITE) { memset(sh->bh_copy[pd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == 
sh->pd_idx) continue; if (sh->bh_new[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + } else if (sh->bh_old[i]) { + bh_ptr[count++] = sh->bh_old[i]; } - if (sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } else if (method == READ_MODIFY_WRITE) { memcpy(sh->bh_copy[pd_idx]->b_data, sh->bh_old[pd_idx]->b_data, sh->size); + bh_ptr[0] = sh->bh_copy[pd_idx]; + count = 1; for (i = 0; i < disks; i++) { if (i == sh->pd_idx) continue; if (sh->bh_new[i] && sh->bh_old[i]) { - xor_block(sh->bh_copy[pd_idx], sh->bh_copy[i]); - xor_block(sh->bh_copy[pd_idx], sh->bh_old[i]); - continue; + bh_ptr[count++] = sh->bh_copy[i]; + bh_ptr[count++] = sh->bh_old[i]; + } + if (count >= (MAX_XOR_BLOCKS - 1)) { + xor_block(count, &bh_ptr[0]); + count = 1; } } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } } raid5_mark_buffer_uptodate(sh->bh_copy[pd_idx], 1); } static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw) { - struct raid5_data *raid_conf = sh->raid_conf; + raid5_conf_t *conf = sh->raid_conf; struct buffer_head *bh_req; - if (sh->bh_new[dd_idx]) { - printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); - printk("forcing oops.\n"); - *(int*)0=0; - } - - set_bit(BH_Lock, &bh->b_state); + PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector); + CHECK_SHLOCK(sh); + if (sh->bh_new[dd_idx]) + BUG(); - bh_req = raid5_kmalloc_bh(sh); + bh_req = raid5_alloc_bh(sh); raid5_build_block(sh, bh_req, dd_idx); bh_req->b_data = bh->b_data; if (sh->phase == PHASE_COMPLETE && sh->cmd == STRIPE_NONE) { + PRINTK("stripe s#%lu => PHASE_BEGIN (%s)\n", sh->sector, rw == READ ? "read" : "write"); sh->phase = PHASE_BEGIN; sh->cmd = (rw == READ) ? 
STRIPE_READ : STRIPE_WRITE; - raid_conf->nr_pending_stripes++; - atomic_inc(&raid_conf->nr_handle); + atomic_inc(&conf->nr_pending_stripes); + atomic_inc(&conf->nr_handle); + PRINTK("# of pending stripes: %u, # of handle: %u\n", atomic_read(&conf->nr_pending_stripes), atomic_read(&conf->nr_handle)); } sh->bh_new[dd_idx] = bh; sh->bh_req[dd_idx] = bh_req; sh->cmd_new[dd_idx] = rw; sh->new[dd_idx] = 1; + + PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx); } static void complete_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - int disks = raid_conf->raid_disks; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; int i, new = 0; - PRINTK(("complete_stripe %lu\n", sh->sector)); + PRINTK("complete_stripe %lu\n", sh->sector); for (i = 0; i < disks; i++) { if (sh->cmd == STRIPE_WRITE && i == sh->pd_idx) raid5_update_old_bh(sh, i); if (sh->bh_new[i]) { + PRINTK("stripe %lu finishes new bh, sh->new == %d\n", sh->sector, sh->new[i]); if (!sh->new[i]) { #if 0 if (sh->cmd == STRIPE_WRITE) { @@ -904,13 +976,230 @@ printk("raid5: bug, completed STRIPE_WRITE with new == %d\n", new); } if (!new) - finish_stripe(sh); + finish_unlock_stripe(sh); else { - PRINTK(("stripe %lu, new == %d\n", sh->sector, new)); + PRINTK("stripe %lu, new == %d\n", sh->sector, new); sh->phase = PHASE_BEGIN; } } + +static int is_stripe_allclean(struct stripe_head *sh, int disks) +{ + int i; + + return 0; + for (i = 0; i < disks; i++) { + if (sh->bh_new[i]) + if (test_bit(BH_Dirty, &sh->bh_new[i])) + return 0; + if (sh->bh_old[i]) + if (test_bit(BH_Dirty, &sh->bh_old[i])) + return 0; + } + return 1; +} + +static void handle_stripe_write (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_write, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i, allclean; + struct 
buffer_head *bh; + unsigned int block; + int method1 = INT_MAX, method2 = INT_MAX; + + /* + * Attempt to add entries :-) + */ + if (nr_write != disks - 1) { + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i]) + continue; + block = (int) compute_blocknr(sh, i); + bh = get_hash_table(mddev_to_kdev(mddev), block, sh->size); + if (!bh) + continue; + if (buffer_dirty(bh) && !md_test_and_set_bit(BH_Lock, &bh->b_state)) { + PRINTK("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block); + add_stripe_bh(sh, bh, i, WRITE); + sh->new[i] = 0; + nr_write++; + if (sh->bh_old[i]) { + nr_cache_overwrite++; + nr_cache_other--; + } else + if (!operational[i]) { + nr_failed_overwrite++; + nr_failed_other--; + } + } + bput(bh); + } + } + PRINTK("handle_stripe() -- begin writing, stripe %lu\n", sh->sector); + /* + * Writing, need to update parity buffer. + * + * Compute the number of I/O requests in the "reconstruct + * write" and "read modify write" methods. + */ + if (!nr_failed_other) + method1 = (disks - 1) - (nr_write + nr_cache_other); + if (!nr_failed_overwrite && !parity_failed) + method2 = nr_write - nr_cache_overwrite + (1 - parity); + + if (method1 == INT_MAX && method2 == INT_MAX) + BUG(); + PRINTK("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2); + + if (!method1 || !method2) { + allclean = is_stripe_allclean(sh, disks); + sh->phase = PHASE_WRITE; + compute_parity(sh, method1 <= method2 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); + + for (i = 0; i < disks; i++) { + if (!operational[i] && !conf->spare && !conf->resync_parity) + continue; + bh = sh->bh_copy[i]; + if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) + printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); + if (i == sh->pd_idx && !bh) + printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); + if (bh) { + PRINTK("making request for buffer %d\n", i); + lock_get_bh(bh); + if (!operational[i] && !conf->resync_parity) { + PRINTK("writing spare %d\n", i); + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->spare->dev; + generic_make_request(WRITERAW, bh); + } else { +#if 0 + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->disks[i].dev; + generic_make_request(WRITERAW, bh); +#else + if (!allclean || (i==sh->pd_idx)) { + PRINTK("writing dirty %d\n", i); + atomic_inc(&sh->nr_pending); + bh->b_rdev = conf->disks[i].dev; + generic_make_request(WRITERAW, bh); + } else { + PRINTK("not writing clean %d\n", i); + raid5_end_request(bh, 1); + sh->new[i] = 0; + } +#endif + } + bput(bh); + } + } + PRINTK("handle_stripe() %lu, writing back %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + + if (method1 < method2) { + sh->write_method = RECONSTRUCT_WRITE; + for (i = 0; i < disks; i++) { + if (i == sh->pd_idx) + continue; + if (sh->bh_new[i] || sh->bh_old[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } else { + sh->write_method = READ_MODIFY_WRITE; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!sh->bh_new[i] && i != sh->pd_idx) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + } + } + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (!sh->bh_old[i]) + continue; + if (test_bit(BH_Uptodate, &sh->bh_old[i]->b_state)) + continue; + lock_get_bh(sh->bh_old[i]); + 
atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, sh->bh_old[i]); + bput(sh->bh_old[i]); + } + PRINTK("handle_stripe() %lu, reading %d old buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + +/* + * Reading + */ +static void handle_stripe_read (mddev_t *mddev , raid5_conf_t *conf, + struct stripe_head *sh, int nr_read, int * operational, int disks, + int parity, int parity_failed, int nr_cache, int nr_cache_other, + int nr_failed_other, int nr_cache_overwrite, int nr_failed_overwrite) +{ + int i; + int method1 = INT_MAX; + + method1 = nr_read - nr_cache_overwrite; + + PRINTK("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1); + + if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { + PRINTK("read %lu completed from cache\n", sh->sector); + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (!sh->bh_old[i]) + compute_block(sh, i); + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + } + complete_stripe(sh); + return; + } + if (nr_failed_overwrite) { + sh->phase = PHASE_READ_OLD; + for (i = 0; i < disks; i++) { + if (sh->bh_old[i]) + continue; + if (!operational[i]) + continue; + sh->bh_old[i] = raid5_alloc_buffer(sh, sh->size); + raid5_build_block(sh, sh->bh_old[i], i); + lock_get_bh(sh->bh_old[i]); + atomic_inc(&sh->nr_pending); + sh->bh_old[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, sh->bh_old[i]); + bput(sh->bh_old[i]); + } + PRINTK("handle_stripe() %lu, phase READ_OLD, pending %d buffers\n", sh->sector, md_atomic_read(&sh->nr_pending)); + return; + } + sh->phase = PHASE_READ; + for (i = 0; i < disks; i++) { + if (!sh->bh_new[i]) + continue; + if (sh->bh_old[i]) { + memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); + continue; + } + lock_get_bh(sh->bh_req[i]); + atomic_inc(&sh->nr_pending); + sh->bh_req[i]->b_rdev = conf->disks[i].dev; + generic_make_request(READ, 
sh->bh_req[i]); + bput(sh->bh_req[i]); + } + PRINTK("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, md_atomic_read(&sh->nr_pending)); +} + /* * handle_stripe() is our main logic routine. Note that: * @@ -921,39 +1210,31 @@ * 2. We should be careful to set sh->nr_pending whenever we sleep, * to prevent re-entry of handle_stripe() for the same sh. * - * 3. raid_conf->failed_disks and disk->operational can be changed + * 3. conf->failed_disks and disk->operational can be changed * from an interrupt. This complicates things a bit, but it allows * us to stop issuing requests for a failed drive as soon as possible. */ static void handle_stripe(struct stripe_head *sh) { - struct raid5_data *raid_conf = sh->raid_conf; - struct md_dev *mddev = raid_conf->mddev; - int minor = (int) (mddev - md_dev); - struct buffer_head *bh; - int disks = raid_conf->raid_disks; - int i, nr = 0, nr_read = 0, nr_write = 0; - int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0, parity = 0; + raid5_conf_t *conf = sh->raid_conf; + mddev_t *mddev = conf->mddev; + int disks = conf->raid_disks; + int i, nr_read = 0, nr_write = 0, parity = 0; + int nr_cache = 0, nr_cache_other = 0, nr_cache_overwrite = 0; int nr_failed_other = 0, nr_failed_overwrite = 0, parity_failed = 0; - int reading = 0, nr_writing = 0; - int method1 = INT_MAX, method2 = INT_MAX; - int block; - unsigned long flags; - int operational[MD_SB_DISKS], failed_disks = raid_conf->failed_disks; + int operational[MD_SB_DISKS], failed_disks = conf->failed_disks; - PRINTK(("handle_stripe(), stripe %lu\n", sh->sector)); - if (sh->nr_pending) { - printk("handle_stripe(), stripe %lu, io still pending\n", sh->sector); - return; - } - if (sh->phase == PHASE_COMPLETE) { - printk("handle_stripe(), stripe %lu, already complete\n", sh->sector); - return; - } + PRINTK("handle_stripe(), stripe %lu\n", sh->sector); + if (!stripe_locked(sh)) + BUG(); + if (md_atomic_read(&sh->nr_pending)) + BUG(); + if (sh->phase == PHASE_COMPLETE) 
+ BUG(); - atomic_dec(&raid_conf->nr_handle); + atomic_dec(&conf->nr_handle); - if (test_and_clear_bit(STRIPE_ERROR, &sh->state)) { + if (md_test_and_clear_bit(STRIPE_ERROR, &sh->state)) { printk("raid5: restarting stripe %lu\n", sh->sector); sh->phase = PHASE_BEGIN; } @@ -968,16 +1249,18 @@ return; } - save_flags(flags); - cli(); + md_spin_lock_irq(&conf->device_lock); for (i = 0; i < disks; i++) { - operational[i] = raid_conf->disks[i].operational; - if (i == sh->pd_idx && raid_conf->resync_parity) + operational[i] = conf->disks[i].operational; + if (i == sh->pd_idx && conf->resync_parity) operational[i] = 0; } - failed_disks = raid_conf->failed_disks; - restore_flags(flags); + failed_disks = conf->failed_disks; + md_spin_unlock_irq(&conf->device_lock); + /* + * Make this one more graceful? + */ if (failed_disks > 1) { for (i = 0; i < disks; i++) { if (sh->bh_new[i]) { @@ -985,269 +1268,125 @@ continue; } } - finish_stripe(sh); + finish_unlock_stripe(sh); return; } + PRINTK("=== stripe index START ===\n"); for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) + PRINTK("disk %d, ", i); + if (sh->bh_old[i]) { nr_cache++; + PRINTK(" (old cached, %d)", nr_cache); + } if (i == sh->pd_idx) { - if (sh->bh_old[i]) + PRINTK(" PARITY."); + if (sh->bh_old[i]) { + PRINTK(" CACHED."); parity = 1; - else if(!operational[i]) - parity_failed = 1; + } else { + PRINTK(" UNCACHED."); + if (!operational[i]) { + PRINTK(" FAILED."); + parity_failed = 1; + } + } + PRINTK("\n"); continue; } if (!sh->bh_new[i]) { - if (sh->bh_old[i]) + PRINTK(" (no new data block) "); + if (sh->bh_old[i]) { + PRINTK(" (but old block cached) "); nr_cache_other++; - else if (!operational[i]) - nr_failed_other++; + } else { + if (!operational[i]) { + PRINTK(" (because failed disk) "); + nr_failed_other++; + } else + PRINTK(" (no old block either) "); + } + PRINTK("\n"); continue; } sh->new[i] = 0; - nr++; - if (sh->cmd_new[i] == READ) + if (sh->cmd_new[i] == READ) { nr_read++; - if (sh->cmd_new[i] == WRITE) - 
nr_write++; - if (sh->bh_old[i]) - nr_cache_overwrite++; - else if (!operational[i]) - nr_failed_overwrite++; - } - - if (nr_write && nr_read) - printk("raid5: bug, nr_write == %d, nr_read == %d, sh->cmd == %d\n", nr_write, nr_read, sh->cmd); - - if (nr_write) { - /* - * Attempt to add entries :-) - */ - if (nr_write != disks - 1) { - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i]) - continue; - block = (int) compute_blocknr(sh, i); - bh = get_hash_table(MKDEV(MD_MAJOR, minor), block, sh->size); - if (bh) { - if (atomic_read(&bh->b_count) == 1 && - buffer_dirty(bh) && - !buffer_locked(bh)) { - PRINTK(("Whee.. sector %lu, index %d (%d) found in the buffer cache!\n", sh->sector, i, block)); - add_stripe_bh(sh, bh, i, WRITE); - sh->new[i] = 0; - nr++; nr_write++; - if (sh->bh_old[i]) { - nr_cache_overwrite++; - nr_cache_other--; - } else if (!operational[i]) { - nr_failed_overwrite++; - nr_failed_other--; - } - } - atomic_dec(&bh->b_count); - } - } + PRINTK(" (new READ %d)", nr_read); } - PRINTK(("handle_stripe() -- begin writing, stripe %lu\n", sh->sector)); - /* - * Writing, need to update parity buffer. - * - * Compute the number of I/O requests in the "reconstruct - * write" and "read modify write" methods. - */ - if (!nr_failed_other) - method1 = (disks - 1) - (nr_write + nr_cache_other); - if (!nr_failed_overwrite && !parity_failed) - method2 = nr_write - nr_cache_overwrite + (1 - parity); - - if (method1 == INT_MAX && method2 == INT_MAX) - printk("raid5: bug: method1 == method2 == INT_MAX\n"); - PRINTK(("handle_stripe(), sector %lu, nr_write %d, method1 %d, method2 %d\n", sh->sector, nr_write, method1, method2)); - - if (!method1 || !method2) { - lock_stripe(sh); - sh->nr_pending++; - sh->phase = PHASE_WRITE; - compute_parity(sh, method1 <= method2 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); - for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) - continue; - if (i == sh->pd_idx || sh->bh_new[i]) - nr_writing++; - } - - sh->nr_pending = nr_writing; - PRINTK(("handle_stripe() %lu, writing back %d\n", sh->sector, sh->nr_pending)); - - for (i = 0; i < disks; i++) { - if (!operational[i] && !raid_conf->spare && !raid_conf->resync_parity) - continue; - bh = sh->bh_copy[i]; - if (i != sh->pd_idx && ((bh == NULL) ^ (sh->bh_new[i] == NULL))) - printk("raid5: bug: bh == %p, bh_new[%d] == %p\n", bh, i, sh->bh_new[i]); - if (i == sh->pd_idx && !bh) - printk("raid5: bug: bh == NULL, i == pd_idx == %d\n", i); - if (bh) { - bh->b_state |= (1<b_state); - if (!operational[i] && !raid_conf->resync_parity) { - bh->b_rdev = raid_conf->spare->dev; - make_request(MAJOR(raid_conf->spare->dev), WRITE, bh); - } else - make_request(MAJOR(raid_conf->disks[i].dev), WRITE, bh); - } - } - return; - } - - lock_stripe(sh); - sh->nr_pending++; - if (method1 < method2) { - sh->write_method = RECONSTRUCT_WRITE; - for (i = 0; i < disks; i++) { - if (i == sh->pd_idx) - continue; - if (sh->bh_new[i] || sh->bh_old[i]) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - reading++; - } - } else { - sh->write_method = READ_MODIFY_WRITE; - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!sh->bh_new[i] && i != sh->pd_idx) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - reading++; - } - } - sh->phase = PHASE_READ_OLD; - sh->nr_pending = reading; - PRINTK(("handle_stripe() %lu, reading %d old buffers\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (!sh->bh_old[i]) - continue; - if (buffer_uptodate(sh->bh_old[i])) - continue; - clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); - } - 
} else { - /* - * Reading - */ - method1 = nr_read - nr_cache_overwrite; - lock_stripe(sh); - sh->nr_pending++; - - PRINTK(("handle_stripe(), sector %lu, nr_read %d, nr_cache %d, method1 %d\n", sh->sector, nr_read, nr_cache, method1)); - if (!method1 || (method1 == 1 && nr_cache == disks - 1)) { - PRINTK(("read %lu completed from cache\n", sh->sector)); - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (!sh->bh_old[i]) - compute_block(sh, i); - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - } - sh->nr_pending--; - complete_stripe(sh); - return; + if (sh->cmd_new[i] == WRITE) { + nr_write++; + PRINTK(" (new WRITE %d)", nr_write); } - if (nr_failed_overwrite) { - sh->phase = PHASE_READ_OLD; - sh->nr_pending = (disks - 1) - nr_cache; - PRINTK(("handle_stripe() %lu, phase READ_OLD, pending %d\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (sh->bh_old[i]) - continue; - if (!operational[i]) - continue; - sh->bh_old[i] = raid5_kmalloc_buffer(sh, sh->size); - raid5_build_block(sh, sh->bh_old[i], i); - clear_bit(BH_Lock, &sh->bh_old[i]->b_state); - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_old[i]); - } + if (sh->bh_old[i]) { + nr_cache_overwrite++; + PRINTK(" (overwriting old %d)", nr_cache_overwrite); } else { - sh->phase = PHASE_READ; - sh->nr_pending = nr_read - nr_cache_overwrite; - PRINTK(("handle_stripe() %lu, phase READ, pending %d\n", sh->sector, sh->nr_pending)); - for (i = 0; i < disks; i++) { - if (!sh->bh_new[i]) - continue; - if (sh->bh_old[i]) { - memcpy(sh->bh_new[i]->b_data, sh->bh_old[i]->b_data, sh->size); - continue; - } - make_request(MAJOR(raid_conf->disks[i].dev), READ, sh->bh_req[i]); + if (!operational[i]) { + nr_failed_overwrite++; + PRINTK(" (overwriting failed %d)", nr_failed_overwrite); } } + PRINTK("\n"); } + PRINTK("=== stripe index END ===\n"); + + if (nr_write && nr_read) + BUG(); + + if (nr_write) + handle_stripe_write( + mddev, conf, sh, nr_write, operational, 
disks, + parity, parity_failed, nr_cache, nr_cache_other, nr_failed_other, + nr_cache_overwrite, nr_failed_overwrite + ); + else + handle_stripe_read( + mddev, conf, sh, nr_read, operational, disks, + parity, parity_failed, nr_cache, nr_cache_other, + nr_failed_other, nr_cache_overwrite, nr_failed_overwrite + ); } -static int raid5_make_request (struct md_dev *mddev, int rw, struct buffer_head * bh) + +static int raid5_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - const unsigned int raid_disks = raid_conf->raid_disks; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; const unsigned int data_disks = raid_disks - 1; unsigned int dd_idx, pd_idx; unsigned long new_sector; struct stripe_head *sh; - if (rw == READA) rw = READ; + if (rw == READA) + rw = READ; - new_sector = raid5_compute_sector(bh->b_rsector, raid_disks, data_disks, - &dd_idx, &pd_idx, raid_conf); + new_sector = raid5_compute_sector(bh->b_blocknr*(bh->b_size>>9), + raid_disks, data_disks, &dd_idx, &pd_idx, conf); - PRINTK(("raid5_make_request, sector %lu\n", new_sector)); -repeat: - sh = get_stripe(raid_conf, new_sector, bh->b_size); + PRINTK("raid5_make_request, sector %lu\n", new_sector); + sh = get_lock_stripe(conf, new_sector, bh->b_size); +#if 0 if ((rw == READ && sh->cmd == STRIPE_WRITE) || (rw == WRITE && sh->cmd == STRIPE_READ)) { - PRINTK(("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd)); + PRINTK("raid5: lock contention, rw == %d, sh->cmd == %d\n", rw, sh->cmd); lock_stripe(sh); - if (!sh->nr_pending) + if (!md_atomic_read(&sh->nr_pending)) handle_stripe(sh); goto repeat; } +#endif sh->pd_idx = pd_idx; if (sh->phase != PHASE_COMPLETE && sh->phase != PHASE_BEGIN) - PRINTK(("stripe %lu catching the bus!\n", sh->sector)); - if (sh->bh_new[dd_idx]) { - printk("raid5: bug: stripe->bh_new[%d], sector %lu exists\n", dd_idx, sh->sector); 
- printk("raid5: bh %p, bh_new %p\n", bh, sh->bh_new[dd_idx]); - lock_stripe(sh); - md_wakeup_thread(raid_conf->thread); - wait_on_stripe(sh); - goto repeat; - } + PRINTK("stripe %lu catching the bus!\n", sh->sector); + if (sh->bh_new[dd_idx]) + BUG(); add_stripe_bh(sh, bh, dd_idx, rw); - md_wakeup_thread(raid_conf->thread); + md_wakeup_thread(conf->thread); return 0; } -static void unplug_devices(struct stripe_head *sh) -{ -#if 0 - struct raid5_data *raid_conf = sh->raid_conf; - int i; - - for (i = 0; i < raid_conf->raid_disks; i++) - unplug_device(blk_dev + MAJOR(raid_conf->disks[i].dev)); -#endif -} - /* * This is our raid5 kernel thread. * @@ -1258,56 +1397,53 @@ static void raid5d (void *data) { struct stripe_head *sh; - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; - int i, handled = 0, unplug = 0; - unsigned long flags; - - PRINTK(("+++ raid5d active\n")); - + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + int i, handled; + + PRINTK("+++ raid5d active\n"); + + handled = 0; + md_spin_lock_irq(&conf->device_lock); + clear_bit(THREAD_WAKEUP, &conf->thread->flags); +repeat_pass: if (mddev->sb_dirty) { + md_spin_unlock_irq(&conf->device_lock); mddev->sb_dirty = 0; - md_update_sb((int) (mddev - md_dev)); + md_update_sb(mddev); + md_spin_lock_irq(&conf->device_lock); } for (i = 0; i < NR_HASH; i++) { repeat: - sh = raid_conf->stripe_hashtbl[i]; + sh = conf->stripe_hashtbl[i]; for (; sh; sh = sh->hash_next) { - if (sh->raid_conf != raid_conf) + if (sh->raid_conf != conf) continue; if (sh->phase == PHASE_COMPLETE) continue; - if (sh->nr_pending) + if (md_atomic_read(&sh->nr_pending)) continue; - if (sh->sector == raid_conf->next_sector) { - raid_conf->sector_count += (sh->size >> 9); - if (raid_conf->sector_count >= 128) - unplug = 1; - } else - unplug = 1; - if (unplug) { - PRINTK(("unplugging devices, sector == %lu, count == %d\n", sh->sector, raid_conf->sector_count)); - unplug_devices(sh); - unplug = 0; - 
raid_conf->sector_count = 0; - } - raid_conf->next_sector = sh->sector + (sh->size >> 9); + md_spin_unlock_irq(&conf->device_lock); + if (!atomic_read(&sh->count)) + BUG(); + handled++; handle_stripe(sh); + md_spin_lock_irq(&conf->device_lock); goto repeat; } } - if (raid_conf) { - PRINTK(("%d stripes handled, nr_handle %d\n", handled, atomic_read(&raid_conf->nr_handle))); - save_flags(flags); - cli(); - if (!atomic_read(&raid_conf->nr_handle)) - clear_bit(THREAD_WAKEUP, &raid_conf->thread->flags); + if (conf) { + PRINTK("%d stripes handled, nr_handle %d\n", handled, md_atomic_read(&conf->nr_handle)); + if (test_and_clear_bit(THREAD_WAKEUP, &conf->thread->flags) && + md_atomic_read(&conf->nr_handle)) + goto repeat_pass; } - PRINTK(("--- raid5d inactive\n")); + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); } -#if SUPPORT_RECONSTRUCTION /* * Private kernel thread for parity reconstruction after an unclean * shutdown. Reconstruction on spare drives in case of a failed drive @@ -1315,44 +1451,67 @@ */ static void raid5syncd (void *data) { - struct raid5_data *raid_conf = data; - struct md_dev *mddev = raid_conf->mddev; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; - if (!raid_conf->resync_parity) + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (md_do_sync(mddev,NULL)) { + up(&mddev->recovery_sem); + printk("raid5: resync aborted!\n"); return; - md_do_sync(mddev); - raid_conf->resync_parity = 0; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + printk("raid5: resync finished.\n"); } -#endif /* SUPPORT_RECONSTRUCTION */ -static int __check_consistency (struct md_dev *mddev, int row) +static int __check_consistency (mddev_t *mddev, int row) { - struct raid5_data *raid_conf = mddev->private; + raid5_conf_t *conf = mddev->private; kdev_t dev; - struct buffer_head *bh[MD_SB_DISKS], tmp; - int i, rc = 0, nr = 0; - - if (raid_conf->working_disks != 
raid_conf->raid_disks) - return 0; - tmp.b_size = 4096; - if ((tmp.b_data = (char *) get_free_page(GFP_KERNEL)) == NULL) - return 0; + struct buffer_head *bh[MD_SB_DISKS], *tmp = NULL; + int i, ret = 0, nr = 0, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + + if (conf->working_disks != conf->raid_disks) + goto out; + tmp = get_unused_bh(); + tmp->b_size = 4096; + tmp->b_data = (char *) get_free_page(GFP_KERNEL); + if (!tmp->b_data) + goto out; + md_clear_page((unsigned long)tmp->b_data); memset(bh, 0, MD_SB_DISKS * sizeof(struct buffer_head *)); - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; set_blocksize(dev, 4096); - if ((bh[i] = bread(dev, row / 4, 4096)) == NULL) + bh[i] = bread(dev, row / 4, 4096); + if (!bh[i]) break; nr++; } - if (nr == raid_conf->raid_disks) { - for (i = 1; i < nr; i++) - xor_block(&tmp, bh[i]); - if (memcmp(tmp.b_data, bh[0]->b_data, 4096)) - rc = 1; + if (nr == conf->raid_disks) { + bh_ptr[0] = tmp; + count = 1; + for (i = 1; i < nr; i++) { + bh_ptr[count++] = bh[i]; + if (count == MAX_XOR_BLOCKS) { + xor_block(count, &bh_ptr[0]); + count = 1; + } + } + if (count != 1) { + xor_block(count, &bh_ptr[0]); + } + if (memcmp(tmp->b_data, bh[0]->b_data, 4096)) + ret = 1; } - for (i = 0; i < raid_conf->raid_disks; i++) { - dev = raid_conf->disks[i].dev; + for (i = 0; i < conf->raid_disks; i++) { + dev = conf->disks[i].dev; if (bh[i]) { bforget(bh[i]); bh[i] = NULL; @@ -1360,289 +1519,668 @@ fsync_dev(dev); invalidate_buffers(dev); } - free_page((unsigned long) tmp.b_data); - return rc; + free_page((unsigned long) tmp->b_data); +out: + if (tmp) + put_unused_bh(tmp); + return ret; } -static int check_consistency (struct md_dev *mddev) +static int check_consistency (mddev_t *mddev) { - int size = mddev->sb->size; - int row; + if (__check_consistency(mddev, 0)) +/* + * We are not checking this currently, as it's legitimate to have + * an 
inconsistent array, at creation time. + */ + return 0; - for (row = 0; row < size; row += size / 8) - if (__check_consistency(mddev, row)) - return 1; return 0; } -static int raid5_run (int minor, struct md_dev *mddev) +static int raid5_run (mddev_t *mddev) { - struct raid5_data *raid_conf; + raid5_conf_t *conf; int i, j, raid_disk, memory; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; - struct real_dev *realdev; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; MOD_INC_USE_COUNT; if (sb->level != 5 && sb->level != 4) { - printk("raid5: %s: raid level not set to 4/5 (%d)\n", kdevname(MKDEV(MD_MAJOR, minor)), sb->level); + printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level); MOD_DEC_USE_COUNT; return -EIO; } - mddev->private = kmalloc (sizeof (struct raid5_data), GFP_KERNEL); - if ((raid_conf = mddev->private) == NULL) + mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) goto abort; - memset (raid_conf, 0, sizeof (*raid_conf)); - raid_conf->mddev = mddev; + memset (conf, 0, sizeof (*conf)); + conf->mddev = mddev; - if ((raid_conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) + if ((conf->stripe_hashtbl = (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL) goto abort; - memset(raid_conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - - init_waitqueue_head(&raid_conf->wait_for_stripe); - PRINTK(("raid5_run(%d) called.\n", minor)); + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); - for (i = 0; i < mddev->nb_dev; i++) { - realdev = &mddev->devices[i]; - if (!realdev->sb) { - printk(KERN_ERR "raid5: disabled device %s (couldn't access raid superblock)\n", kdevname(realdev->dev)); - continue; - } + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + 
md_init_waitqueue_head(&conf->wait_for_stripe); + PRINTK("raid5_run(md%d) called.\n", mdidx(mddev)); + ITERATE_RDEV(mddev,rdev,tmp) { /* * This is important -- we are using the descriptor on * the disk only to get a pointer to the descriptor on * the main superblock, which might be more recent. */ - descriptor = &sb->disks[realdev->sb->descriptor.number]; - if (descriptor->state & (1 << MD_FAULTY_DEVICE)) { - printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", kdevname(realdev->dev)); + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; continue; } - if (descriptor->state & (1 << MD_ACTIVE_DEVICE)) { - if (!(descriptor->state & (1 << MD_SYNC_DEVICE))) { - printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", kdevname(realdev->dev)); - continue; + if (disk_active(desc)) { + if (!disk_sync(desc)) { + printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev)); + MD_BUG(); + goto abort; } - raid_disk = descriptor->raid_disk; - if (descriptor->number > sb->nr_disks || raid_disk > sb->raid_disks) { - printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", kdevname(realdev->dev)); + if (raid_disk > sb->raid_disks) { + printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev)); continue; } - if (raid_conf->disks[raid_disk].operational) { - printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", kdevname(realdev->dev), raid_disk); + if (disk->operational) { + printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", 
partition_name(rdev->dev), raid_disk); continue; } - printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", kdevname(realdev->dev), raid_disk); + printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk); - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices[i].dev; - raid_conf->disks[raid_disk].operational = 1; + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->operational = 1; + disk->used_slot = 1; - raid_conf->working_disks++; + conf->working_disks++; } else { /* * Must be a spare disk .. */ - printk(KERN_INFO "raid5: spare disk %s\n", kdevname(realdev->dev)); - raid_disk = descriptor->raid_disk; - raid_conf->disks[raid_disk].number = descriptor->number; - raid_conf->disks[raid_disk].raid_disk = raid_disk; - raid_conf->disks[raid_disk].dev = mddev->devices [i].dev; - - raid_conf->disks[raid_disk].operational = 0; - raid_conf->disks[raid_disk].write_only = 0; - raid_conf->disks[raid_disk].spare = 1; - } - } - raid_conf->raid_disks = sb->raid_disks; - raid_conf->failed_disks = raid_conf->raid_disks - raid_conf->working_disks; - raid_conf->mddev = mddev; - raid_conf->chunk_size = sb->chunk_size; - raid_conf->level = sb->level; - raid_conf->algorithm = sb->parity_algorithm; - raid_conf->max_nr_stripes = NR_STRIPES; + printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; - if (raid_conf->working_disks != sb->raid_disks && sb->state != (1 << MD_SB_CLEAN)) { - printk(KERN_ALERT "raid5: raid set %s not clean and not all disks are operational -- run ckraid\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } } - if (!raid_conf->chunk_size || raid_conf->chunk_size % 
4) { - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", raid_conf->chunk_size, kdevname(MKDEV(MD_MAJOR, minor))); + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * 0 for a fully functional array, 1 for a degraded array. + */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + +#if 0 + for (i = 0; i < conf->raid_disks; i++) { + if (!conf->disks[i].used_slot) { + MD_BUG(); + goto abort; + } + } +#endif + if (!conf->chunk_size || conf->chunk_size % 4) { + printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); goto abort; } - if (raid_conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { - printk(KERN_ERR "raid5: unsupported parity algorithm %d for %s\n", raid_conf->algorithm, kdevname(MKDEV(MD_MAJOR, minor))); + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev)); goto abort; } - if (raid_conf->failed_disks > 1) { - printk(KERN_ERR "raid5: not enough operational devices for %s (%d/%d failed)\n", kdevname(MKDEV(MD_MAJOR, minor)), raid_conf->failed_disks, raid_conf->raid_disks); + if (conf->failed_disks > 1) { + printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks); goto abort; } - if ((sb->state & (1 << MD_SB_CLEAN)) && check_consistency(mddev)) { - printk(KERN_ERR "raid5: 
detected raid-5 xor inconsistenty -- run ckraid\n"); - sb->state |= 1 << MD_SB_ERRORS; - goto abort; + if (conf->working_disks != sb->raid_disks) { + printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev)); + start_recovery = 1; } - if ((raid_conf->thread = md_register_thread(raid5d, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + if (!start_recovery && (sb->state & (1 << MD_SB_CLEAN)) && + check_consistency(mddev)) { + printk(KERN_ERR "raid5: detected raid-5 superblock xor inconsistency -- running resync\n"); + sb->state &= ~(1 << MD_SB_CLEAN); } -#if SUPPORT_RECONSTRUCTION - if ((raid_conf->resync_thread = md_register_thread(raid5syncd, raid_conf)) == NULL) { - printk(KERN_ERR "raid5: couldn't allocate thread for %s\n", kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; + { + const char * name = "raid5d"; + + conf->thread = md_register_thread(raid5d, conf, name); + if (!conf->thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } } -#endif /* SUPPORT_RECONSTRUCTION */ - memory = raid_conf->max_nr_stripes * (sizeof(struct stripe_head) + - raid_conf->raid_disks * (sizeof(struct buffer_head) + + memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + + conf->raid_disks * (sizeof(struct buffer_head) + 2 * (sizeof(struct buffer_head) + PAGE_SIZE))) / 1024; - if (grow_stripes(raid_conf, raid_conf->max_nr_stripes, GFP_KERNEL)) { + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); goto abort; } else - printk(KERN_INFO "raid5: allocated %dkB for %s\n", memory, kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev)); /* * Regenerate the "device 
is in sync with the raid set" bit for * each device. */ - for (i = 0; i < sb->nr_disks ; i++) { - sb->disks[i].state &= ~(1 << MD_SYNC_DEVICE); + for (i = 0; i < MD_SB_DISKS ; i++) { + mark_disk_nonsync(sb->disks + i); for (j = 0; j < sb->raid_disks; j++) { - if (!raid_conf->disks[j].operational) + if (!conf->disks[j].operational) continue; - if (sb->disks[i].number == raid_conf->disks[j].number) - sb->disks[i].state |= 1 << MD_SYNC_DEVICE; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); } } - sb->active_disks = raid_conf->working_disks; + sb->active_disks = conf->working_disks; if (sb->active_disks == sb->raid_disks) - printk("raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); else - printk(KERN_ALERT "raid5: raid level %d set %s active with %d out of %d devices, algorithm %d\n", raid_conf->level, kdevname(MKDEV(MD_MAJOR, minor)), sb->active_disks, sb->raid_disks, raid_conf->algorithm); + printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm); + + if (!start_recovery && ((sb->state & (1 << MD_SB_CLEAN))==0)) { + const char * name = "raid5syncd"; + + conf->resync_thread = md_register_thread(raid5syncd, conf,name); + if (!conf->resync_thread) { + printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev)); + goto abort; + } - if ((sb->state & (1 << MD_SB_CLEAN)) == 0) { - printk("raid5: raid set %s not clean; re-constructing parity\n", kdevname(MKDEV(MD_MAJOR, minor))); - raid_conf->resync_parity = 1; -#if SUPPORT_RECONSTRUCTION - md_wakeup_thread(raid_conf->resync_thread); -#endif /* 
SUPPORT_RECONSTRUCTION */ + printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev)); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); } + print_raid5_conf(conf); + if (start_recovery) + md_recover_arrays(); + print_raid5_conf(conf); + /* Ok, everything is just fine now */ return (0); abort: - if (raid_conf) { - if (raid_conf->stripe_hashtbl) - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); } mddev->private = NULL; - printk(KERN_ALERT "raid5: failed to run raid set %s\n", kdevname(MKDEV(MD_MAJOR, minor))); + printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev)); MOD_DEC_USE_COUNT; return -EIO; } -static int raid5_stop (int minor, struct md_dev *mddev) +static int raid5_stop_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_thread_t *thread = conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + md_interrupt_thread(thread); + printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid5_restart_resync (mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + printk("raid5: waking up raid5resync.\n"); + conf->resync_parity = 1; + md_wakeup_thread(conf->resync_thread); + return 1; + } else + printk("raid5: no restart-resync needed.\n"); + return 0; +} + + +static int raid5_stop (mddev_t *mddev) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; - shrink_stripe_cache(raid_conf, raid_conf->max_nr_stripes); - shrink_stripes(raid_conf, raid_conf->max_nr_stripes); - 
md_unregister_thread(raid_conf->thread); -#if SUPPORT_RECONSTRUCTION - md_unregister_thread(raid_conf->resync_thread); -#endif /* SUPPORT_RECONSTRUCTION */ - free_pages((unsigned long) raid_conf->stripe_hashtbl, HASH_PAGES_ORDER); - kfree(raid_conf); + shrink_stripe_cache(conf, conf->max_nr_stripes); + shrink_stripes(conf, conf->max_nr_stripes); + md_unregister_thread(conf->thread); + if (conf->resync_thread) + md_unregister_thread(conf->resync_thread); + free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER); + kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } -static int raid5_status (char *page, int minor, struct md_dev *mddev) +#if RAID5_DEBUG +static void print_sh (struct stripe_head *sh) +{ + int i; + + printk("sh %lu, phase %d, size %d, pd_idx %d, state %ld, cmd %d.\n", sh->sector, sh->phase, sh->size, sh->pd_idx, sh->state, sh->cmd); + printk("sh %lu, write_method %d, nr_pending %d, count %d.\n", sh->sector, sh->write_method, atomic_read(&sh->nr_pending), atomic_read(&sh->count)); + printk("sh %lu, ", sh->sector); + for (i = 0; i < MD_SB_DISKS; i++) { + if (sh->bh_old[i]) + printk("(old%d: %p) ", i, sh->bh_old[i]); + if (sh->bh_new[i]) + printk("(new%d: %p) ", i, sh->bh_new[i]); + if (sh->bh_copy[i]) + printk("(copy%d: %p) ", i, sh->bh_copy[i]); + if (sh->bh_req[i]) + printk("(req%d: %p) ", i, sh->bh_req[i]); + } + printk("\n"); + for (i = 0; i < MD_SB_DISKS; i++) + printk("%d(%d/%d) ", i, sh->cmd_new[i], sh->new[i]); + printk("\n"); +} + +static void printall (raid5_conf_t *conf) { - struct raid5_data *raid_conf = (struct raid5_data *) mddev->private; - md_superblock_t *sb = mddev->sb; + struct stripe_head *sh; + int i; + + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + print_sh(sh); + } + } + md_spin_unlock_irq(&conf->device_lock); + + PRINTK("--- raid5d inactive\n"); +} +#endif + +static int 
raid5_status (char *page, mddev_t *mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; int sz = 0, i; - sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->parity_algorithm); - sz += sprintf (page+sz, " [%d/%d] [", raid_conf->raid_disks, raid_conf->working_disks); - for (i = 0; i < raid_conf->raid_disks; i++) - sz += sprintf (page+sz, "%s", raid_conf->disks[i].operational ? "U" : "_"); + sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); +#if RAID5_DEBUG +#define D(x) \ + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) + D(nr_handle); + D(nr_stripes); + D(nr_hashed_stripes); + D(nr_locked_stripes); + D(nr_pending_stripes); + D(nr_cached_stripes); + D(nr_free_sh); + printall(conf); +#endif return sz; } -static int raid5_mark_spare(struct md_dev *mddev, md_descriptor_t *spare, int state) +static void print_raid5_conf (raid5_conf_t *conf) { - int i = 0, failed_disk = -1; - struct raid5_data *raid_conf = mddev->private; - struct disk_info *disk = raid_conf->disks; - unsigned long flags; - md_superblock_t *sb = mddev->sb; - md_descriptor_t *descriptor; + int i; + struct disk_info *tmp; - for (i = 0; i < MD_SB_DISKS; i++, disk++) { - if (disk->spare && disk->number == spare->number) - goto found; + printk("RAID5 conf printout:\n"); + if (!conf) { + printk("(conf==NULL)\n"); + return; } - return 1; -found: - for (i = 0, disk = raid_conf->disks; i < raid_conf->raid_disks; i++, disk++) - if (!disk->operational) - failed_disk = i; - if (failed_disk == -1) - return 1; - save_flags(flags); - cli(); + printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + + 
for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + partition_name(tmp->dev)); + } +} + +static int raid5_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc, *added_desc; + + print_raid5_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ switch (state) { - case SPARE_WRITE: - disk->operational = 1; - disk->write_only = 1; - raid_conf->spare = disk; - break; - case SPARE_INACTIVE: - disk->operational = 0; - disk->write_only = 0; - raid_conf->spare = NULL; - break; - case SPARE_ACTIVE: - disk->spare = 0; - disk->write_only = 0; - descriptor = &sb->disks[raid_conf->disks[failed_disk].number]; - i = spare->raid_disk; - disk->raid_disk = spare->raid_disk = descriptor->raid_disk; - if (disk->raid_disk != failed_disk) - printk("raid5: disk->raid_disk != failed_disk"); - descriptor->raid_disk = i; - - raid_conf->spare = NULL; - raid_conf->working_disks++; - raid_conf->failed_disks--; - raid_conf->disks[failed_disk] = *disk; - break; - default: - printk("raid5_mark_spare: bug: state == %d\n", state); - restore_flags(flags); - return 1; + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. 
+ */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (!tmp->used_slot) { + added_disk = i; + break; + } + } + if (added_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; } - restore_flags(flags); - return 0; + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. 
(only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
+ */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + rdisk->dev = MKDEV(0,0); + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_ADD_DISK: + adisk = conf->disks + added_disk; + added_desc = *d; + + if (added_disk != added_desc->number) { + MD_BUG(); + err = 1; + goto abort; + } + + adisk->number = added_desc->number; + adisk->raid_disk = added_desc->raid_disk; + adisk->dev = MKDEV(added_desc->major,added_desc->minor); + + adisk->operational = 0; + adisk->write_only = 0; + adisk->spare = 1; + adisk->used_slot = 1; + + + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid5_conf(conf); + return err; } -static struct md_personality raid5_personality= +static mdk_personality_t raid5_personality= { "raid5", raid5_map, @@ -1654,14 +2192,19 @@ NULL, /* no ioctls */ 0, raid5_error, - /* raid5_hot_add_disk, */ NULL, - /* raid1_hot_remove_drive */ NULL, - raid5_mark_spare + raid5_diskop, + raid5_stop_resync, + raid5_restart_resync }; int raid5_init (void) { - return register_md_personality (RAID5, &raid5_personality); + int err; + + err = register_md_personality (RAID5, &raid5_personality); + if (err) + return err; + return 0; } #ifdef MODULE --- linux/drivers/block/translucent.c.orig Sun Jan 16 11:26:03 2000 +++ linux/drivers/block/translucent.c Sun Jan 16 17:45:53 2000 @@ -0,0 +1,136 @@ +/* + translucent.c : Translucent RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + Translucent mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +static int translucent_run (mddev_t *mddev) +{ + translucent_conf_t *conf; + mdk_rdev_t *rdev; + int i; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (mddev->nb_dev != 2) { + printk("translucent: this mode needs 2 disks, aborting!\n"); + goto out; + } + + if (md_check_ordering(mddev)) { + printk("translucent: disks are not ordered, aborting!\n"); + goto out; + } + + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + dev_info_t *disk = conf->disks + i; + + disk->dev = rdev->dev; + disk->size = rdev->size; + } + + return 0; + +out: + if (conf) + kfree(conf); + + MOD_DEC_USE_COUNT; + return 1; +} + +static int translucent_stop (mddev_t *mddev) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int translucent_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size) +{ + translucent_conf_t *conf = mddev_to_conf(mddev); + + *rdev = conf->disks[0].dev; + + return 0; +} + +static int translucent_status (char *page, mddev_t *mddev) +{ + int sz = 0; + + sz += sprintf(page+sz, " %d%% full", 10); + return sz; +} + + +static mdk_personality_t translucent_personality= +{ + "translucent", + translucent_map, + NULL, + NULL, + translucent_run, + translucent_stop, + translucent_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void translucent_init (void)) +{ + register_md_personality (TRANSLUCENT, &translucent_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (TRANSLUCENT, &translucent_personality)); +} 
+ +void cleanup_module (void) +{ + unregister_md_personality (TRANSLUCENT); +} + +#endif + --- linux/drivers/block/xor.c.orig Sun Jan 16 11:26:03 2000 +++ linux/drivers/block/xor.c Sun Jan 16 17:45:53 2000 @@ -0,0 +1,1894 @@ +/* + * xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999 Ingo Molnar, Matti Aarnio, Jakub Jelinek + * + * + * optimized RAID-5 checksumming functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#define BH_TRACE 0 +#include +#include +#ifdef __sparc_v9__ +#include +#include +#include +#endif + +/* + * we use the 'XOR function template' to register multiple xor + * functions runtime. The kernel measures their speed upon bootup + * and decides which one to use. (compile-time registration is + * not enough as certain CPU features like MMX can only be detected + * runtime) + * + * this architecture makes it pretty easy to add new routines + * that are faster on certain CPUs, without killing other CPU's + * 'native' routine. 
Although the current routines are believed + * to be the physically fastest ones on all CPUs tested, but + * feel free to prove me wrong and add yet another routine =B-) + * --mingo + */ + +#define MAX_XOR_BLOCKS 5 + +#define XOR_ARGS (unsigned int count, struct buffer_head **bh_ptr) + +typedef void (*xor_block_t) XOR_ARGS; +xor_block_t xor_block = NULL; + +#ifndef __sparc_v9__ + +struct xor_block_template; + +struct xor_block_template { + char * name; + xor_block_t xor_block; + int speed; + struct xor_block_template * next; +}; + +struct xor_block_template * xor_functions = NULL; + +#define XORBLOCK_TEMPLATE(x) \ +static void xor_block_##x XOR_ARGS; \ +static struct xor_block_template t_xor_block_##x = \ + { #x, xor_block_##x, 0, NULL }; \ +static void xor_block_##x XOR_ARGS + +#ifdef __i386__ + +#ifdef CONFIG_X86_XMM +/* + * Cache avoiding checksumming functions utilizing KNI instructions + * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) + */ + +XORBLOCK_TEMPLATE(pIII_kni) +{ + char xmm_save[16*4]; + int cr0; + int lines = (bh_ptr[0]->b_size>>8); + + __asm__ __volatile__ ( + "movl %%cr0,%0 ;\n\t" + "clts ;\n\t" + "movups %%xmm0,(%1) ;\n\t" + "movups %%xmm1,0x10(%1) ;\n\t" + "movups %%xmm2,0x20(%1) ;\n\t" + "movups %%xmm3,0x30(%1) ;\n\t" + : "=r" (cr0) + : "r" (xmm_save) + : "memory" ); + +#define OFFS(x) "8*("#x"*2)" +#define PF0(x) \ + " prefetcht0 "OFFS(x)"(%1) ;\n" +#define LD(x,y) \ + " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" +#define ST(x,y) \ + " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" +#define PF1(x) \ + " prefetchnta "OFFS(x)"(%2) ;\n" +#define PF2(x) \ + " prefetchnta "OFFS(x)"(%3) ;\n" +#define PF3(x) \ + " prefetchnta "OFFS(x)"(%4) ;\n" +#define PF4(x) \ + " prefetchnta "OFFS(x)"(%5) ;\n" +#define PF5(x) \ + " prefetchnta "OFFS(x)"(%6) ;\n" +#define XO1(x,y) \ + " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" +#define XO2(x,y) \ + " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" +#define XO3(x,y) \ + " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" +#define XO4(x,y) \ + " 
xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" +#define XO5(x,y) \ + " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + PF1(i) \ + PF1(i+2) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + 
" addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i+2) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + PF2(i) \ + PF2(i+2) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + PF3(i) \ + PF3(i+2) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + PF4(i) \ + PF4(i+2) \ + PF0(i+4) \ + PF0(i+6) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + XO4(i+1,1) \ + XO4(i+2,2) \ + XO4(i+3,3) \ + ST(i,0) \ + ST(i+1,1) \ + ST(i+2,2) \ + ST(i+3,3) \ + + + PF0(0) + PF0(2) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $256, %1 ;\n" + " addl $256, %2 ;\n" + " addl $256, %3 ;\n" + " addl $256, %4 ;\n" + " addl $256, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( + "sfence ;\n\t" + "movups (%1),%%xmm0 ;\n\t" + "movups 0x10(%1),%%xmm1 ;\n\t" + "movups 0x20(%1),%%xmm2 ;\n\t" + "movups 0x30(%1),%%xmm3 ;\n\t" + "movl %0,%%cr0 ;\n\t" + : + : "r" (cr0), "r" (xmm_save) + : "memory" ); +} + +#undef OFFS +#undef LD +#undef ST +#undef PF0 +#undef PF1 +#undef PF2 +#undef PF3 +#undef PF4 +#undef PF5 +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef XO5 +#undef BLOCK + +#endif /* CONFIG_X86_XMM */ + +/* + * high-speed RAID5 checksumming functions utilizing MMX instructions + * Copyright (C) 1998 Ingo Molnar + */ +XORBLOCK_TEMPLATE(pII_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>7); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; 
fwait\n"::"m"(fpu_save[0]) ); + +#define LD(x,y) \ + " movq 8*("#x")(%1), %%mm"#y" ;\n" +#define ST(x,y) \ + " movq %%mm"#y", 8*("#x")(%1) ;\n" +#define XO1(x,y) \ + " pxor 8*("#x")(%2), %%mm"#y" ;\n" +#define XO2(x,y) \ + " pxor 8*("#x")(%3), %%mm"#y" ;\n" +#define XO3(x,y) \ + " pxor 8*("#x")(%4), %%mm"#y" ;\n" +#define XO4(x,y) \ + " pxor 8*("#x")(%5), %%mm"#y" ;\n" + + switch(count) { + case 2: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + ST(i,0) \ + XO1(i+1,1) \ + ST(i+1,1) \ + XO1(i+2,2) \ + ST(i+2,2) \ + XO1(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory"); + break; + case 3: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + ST(i,0) \ + XO2(i+1,1) \ + ST(i+1,1) \ + XO2(i+2,2) \ + ST(i+2,2) \ + XO2(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory"); + break; + case 4: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + ST(i,0) \ + XO3(i+1,1) \ + ST(i+1,1) \ + XO3(i+2,2) \ + ST(i+2,2) \ + XO3(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" 
+ " addl $128, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory"); + break; + case 5: + __asm__ __volatile__ ( +#undef BLOCK +#define BLOCK(i) \ + LD(i,0) \ + LD(i+1,1) \ + LD(i+2,2) \ + LD(i+3,3) \ + XO1(i,0) \ + XO1(i+1,1) \ + XO1(i+2,2) \ + XO1(i+3,3) \ + XO2(i,0) \ + XO2(i+1,1) \ + XO2(i+2,2) \ + XO2(i+3,3) \ + XO3(i,0) \ + XO3(i+1,1) \ + XO3(i+2,2) \ + XO3(i+3,3) \ + XO4(i,0) \ + ST(i,0) \ + XO4(i+1,1) \ + ST(i+1,1) \ + XO4(i+2,2) \ + ST(i+2,2) \ + XO4(i+3,3) \ + ST(i+3,3) + + " .align 32,0x90 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addl $128, %1 ;\n" + " addl $128, %2 ;\n" + " addl $128, %3 ;\n" + " addl $128, %4 ;\n" + " addl $128, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory"); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} + +#undef LD +#undef XO1 +#undef XO2 +#undef XO3 +#undef XO4 +#undef ST +#undef BLOCK + +XORBLOCK_TEMPLATE(p5_mmx) +{ + char fpu_save[108]; + int lines = (bh_ptr[0]->b_size>>6); + + if (!(current->flags & PF_USEDFPU)) + __asm__ __volatile__ ( " clts;\n"); + + __asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) ); + + switch(count) { + case 2: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq 40(%1), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 48(%1), 
%%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data) + : "memory" ); + break; + case 3: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data) + : "memory" ); + break; + case 4: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor (%4), %%mm0 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " pxor 8(%4), %%mm1 ;\n" + 
" movq %%mm0, (%1) ;\n" + " movq 32(%1), %%mm4 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " pxor 16(%4), %%mm2 ;\n" + " movq %%mm1, 8(%1) ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 40(%2), %%mm5 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 24(%4), %%mm3 ;\n" + " movq %%mm3, 24(%1) ;\n" + " movq 56(%1), %%mm7 ;\n" + " movq 48(%1), %%mm6 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 56(%2), %%mm7 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data) + : "memory" ); + break; + case 5: + __asm__ __volatile__ ( + + " .align 32,0x90 ;\n" + " 1: ;\n" + " movq (%1), %%mm0 ;\n" + " movq 8(%1), %%mm1 ;\n" + " pxor (%2), %%mm0 ;\n" + " pxor 8(%2), %%mm1 ;\n" + " movq 16(%1), %%mm2 ;\n" + " pxor (%3), %%mm0 ;\n" + " pxor 8(%3), %%mm1 ;\n" + " pxor 16(%2), %%mm2 ;\n" + " pxor (%4), %%mm0 ;\n" + " pxor 8(%4), %%mm1 ;\n" + " pxor 16(%3), %%mm2 ;\n" + " movq 24(%1), %%mm3 ;\n" + " pxor (%5), %%mm0 ;\n" + " pxor 8(%5), %%mm1 ;\n" + " movq %%mm0, (%1) ;\n" + " pxor 16(%4), %%mm2 ;\n" + " pxor 24(%2), %%mm3 ;\n" + " movq %%mm1, 8(%1) ;\n" + " pxor 16(%5), %%mm2 ;\n" + " pxor 24(%3), %%mm3 ;\n" + " movq 32(%1), %%mm4 ;\n" + " movq %%mm2, 16(%1) ;\n" + " pxor 24(%4), %%mm3 ;\n" + " pxor 32(%2), %%mm4 ;\n" + " movq 40(%1), %%mm5 ;\n" + " pxor 24(%5), %%mm3 ;\n" + " pxor 32(%3), %%mm4 ;\n" + " pxor 40(%2), %%mm5 ;\n" + " movq %%mm3, 24(%1) ;\n" + " pxor 32(%4), %%mm4 ;\n" + " pxor 40(%3), %%mm5 ;\n" + " movq 48(%1), 
%%mm6 ;\n" + " movq 56(%1), %%mm7 ;\n" + " pxor 32(%5), %%mm4 ;\n" + " pxor 40(%4), %%mm5 ;\n" + " pxor 48(%2), %%mm6 ;\n" + " pxor 56(%2), %%mm7 ;\n" + " movq %%mm4, 32(%1) ;\n" + " pxor 48(%3), %%mm6 ;\n" + " pxor 56(%3), %%mm7 ;\n" + " pxor 40(%5), %%mm5 ;\n" + " pxor 48(%4), %%mm6 ;\n" + " pxor 56(%4), %%mm7 ;\n" + " movq %%mm5, 40(%1) ;\n" + " pxor 48(%5), %%mm6 ;\n" + " pxor 56(%5), %%mm7 ;\n" + " movq %%mm6, 48(%1) ;\n" + " movq %%mm7, 56(%1) ;\n" + + " addl $64, %1 ;\n" + " addl $64, %2 ;\n" + " addl $64, %3 ;\n" + " addl $64, %4 ;\n" + " addl $64, %5 ;\n" + " decl %0 ;\n" + " jnz 1b ;\n" + + : + : "r" (lines), + "r" (bh_ptr[0]->b_data), + "r" (bh_ptr[1]->b_data), + "r" (bh_ptr[2]->b_data), + "r" (bh_ptr[3]->b_data), + "r" (bh_ptr[4]->b_data) + : "memory" ); + break; + } + + __asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) ); + + if (!(current->flags & PF_USEDFPU)) + stts(); +} +#endif /* __i386__ */ +#endif /* !__sparc_v9__ */ + +#ifdef __sparc_v9__ +/* + * High speed xor_block operation for RAID4/5 utilizing the + * UltraSparc Visual Instruction Set. + * + * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + * Requirements: + * !(((long)dest | (long)sourceN) & (64 - 1)) && + * !(len & 127) && len >= 256 + * + * It is done in pure assembly, as otherwise gcc makes it + * a non-leaf function, which is not what we want. + * Also, we don't measure the speeds as on other architectures, + * as the measuring routine does not take into account cold caches + * and the fact that xor_block_VIS bypasses the caches. + * xor_block_32regs might be 5% faster for count 2 if caches are hot + * and things just right (for count 3 VIS is about as fast as 32regs for + * hot caches and for count 4 and 5 VIS is faster by good margin always), + * but I think it is better not to pollute the caches. 
+ * Actually, if I'd just fight for speed for hot caches, I could + * write a hybrid VIS/integer routine, which would do always two + * 64B blocks in VIS and two in IEUs, but I really care more about + * caches. + */ +extern void *VISenter(void); +extern void xor_block_VIS XOR_ARGS; + +void __xor_block_VIS(void) +{ +__asm__ (" + .globl xor_block_VIS +xor_block_VIS: + ldx [%%o1 + 0], %%o4 + ldx [%%o1 + 8], %%o3 + ldx [%%o4 + %1], %%g5 + ldx [%%o4 + %0], %%o4 + ldx [%%o3 + %0], %%o3 + rd %%fprs, %%o5 + andcc %%o5, %2, %%g0 + be,pt %%icc, 297f + sethi %%hi(%5), %%g1 + jmpl %%g1 + %%lo(%5), %%g7 + add %%g7, 8, %%g7 +297: wr %%g0, %4, %%fprs + membar #LoadStore|#StoreLoad|#StoreStore + sub %%g5, 64, %%g5 + ldda [%%o4] %3, %%f0 + ldda [%%o3] %3, %%f16 + cmp %%o0, 4 + bgeu,pt %%xcc, 10f + cmp %%o0, 3 + be,pn %%xcc, 13f + mov -64, %%g1 + sub %%g5, 64, %%g5 + rd %%asi, %%g1 + wr %%g0, %3, %%asi + +2: ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + ldda [%%o4 + 128] %%asi, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + add %%o4, 128, %%o4 + fxor %%f36, %%f52, %%f52 + add %%o3, 128, %%o3 + fxor %%f38, %%f54, %%f54 + subcc %%g5, 128, %%g5 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 - 64] %%asi + bne,pt %%xcc, 2b + ldda [%%o3] %3, %%f16 + + ldda [%%o4 + 64] %%asi, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + stda %%f16, [%%o4] %3 + ldda [%%o3 + 64] %%asi, %%f48 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor 
%%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + 64] %%asi + membar #Sync|#StoreStore|#StoreLoad + wr %%g0, 0, %%fprs + retl + wr %%g1, %%g0, %%asi + +13: ldx [%%o1 + 16], %%o2 + ldx [%%o2 + %0], %%o2 + +3: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 3b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +10: cmp %%o0, 5 + be,pt %%xcc, 15f + mov -64, %%g1 + +14: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + +4: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, 
%%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + add %%o2, 64, %%o2 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + ldda [%%o4] %3, %%f0 + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + add %%o0, 64, %%o0 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 4b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f16 + fxor %%f2, %%f18, %%f18 + fxor %%f4, %%f20, %%f20 + fxor %%f6, %%f22, %%f22 + fxor %%f8, %%f24, %%f24 + fxor %%f10, %%f26, %%f26 + fxor %%f12, %%f28, %%f28 + fxor %%f14, %%f30, %%f30 + ldda [%%o0] %3, %%f48 + fxor %%f16, %%f32, %%f32 + fxor %%f18, %%f34, %%f34 + fxor %%f20, %%f36, %%f36 + fxor %%f22, %%f38, %%f38 + fxor %%f24, %%f40, %%f40 + fxor %%f26, %%f42, %%f42 + fxor %%f28, %%f44, %%f44 + fxor %%f30, %%f46, %%f46 + membar #Sync + fxor %%f32, %%f48, %%f48 + fxor %%f34, %%f50, %%f50 + fxor %%f36, %%f52, %%f52 + fxor %%f38, %%f54, %%f54 + fxor %%f40, %%f56, %%f56 + fxor %%f42, %%f58, %%f58 + fxor %%f44, %%f60, %%f60 + fxor %%f46, %%f62, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + +15: ldx [%%o1 + 16], %%o2 + ldx [%%o1 + 24], %%o0 + ldx [%%o1 + 32], %%o1 + ldx [%%o2 + %0], %%o2 + ldx [%%o0 + %0], %%o0 + ldx [%%o1 + %0], %%o1 + +5: ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + add %%o4, 64, %%o4 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + add %%o3, 64, %%o3 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + add %%o2, 64, %%o2 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor 
%%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + add %%o0, 64, %%o0 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + add %%o1, 64, %%o1 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + ldda [%%o4] %3, %%f0 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + subcc %%g5, 64, %%g5 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4 + %%g1] %3 + bne,pt %%xcc, 5b + ldda [%%o3] %3, %%f16 + + ldda [%%o2] %3, %%f32 + fxor %%f0, %%f16, %%f48 + fxor %%f2, %%f18, %%f50 + fxor %%f4, %%f20, %%f52 + fxor %%f6, %%f22, %%f54 + fxor %%f8, %%f24, %%f56 + fxor %%f10, %%f26, %%f58 + fxor %%f12, %%f28, %%f60 + fxor %%f14, %%f30, %%f62 + ldda [%%o0] %3, %%f16 + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + ldda [%%o1] %3, %%f32 + fxor %%f48, %%f16, %%f48 + fxor %%f50, %%f18, %%f50 + fxor %%f52, %%f20, %%f52 + fxor %%f54, %%f22, %%f54 + fxor %%f56, %%f24, %%f56 + fxor %%f58, %%f26, %%f58 + fxor %%f60, %%f28, %%f60 + fxor %%f62, %%f30, %%f62 + membar #Sync + fxor %%f48, %%f32, %%f48 + fxor %%f50, %%f34, %%f50 + fxor %%f52, %%f36, %%f52 + fxor %%f54, %%f38, %%f54 + fxor %%f56, %%f40, %%f56 + fxor %%f58, %%f42, %%f58 + fxor %%f60, %%f44, %%f60 + fxor %%f62, %%f46, %%f62 + stda %%f48, [%%o4] %3 + membar #Sync|#StoreStore|#StoreLoad + retl + wr %%g0, 0, %%fprs + " : : + "i" (&((struct buffer_head *)0)->b_data), + "i" (&((struct buffer_head *)0)->b_data), + "i" (FPRS_FEF|FPRS_DU), "i" (ASI_BLK_P), + "i" (FPRS_FEF), "i" (VISenter)); +} +#endif /* __sparc_v9__ */ + +#if defined(__sparc__) && !defined(__sparc_v9__) +/* + * High speed xor_block operation 
for RAID4/5 utilizing the + * ldd/std SPARC instructions. + * + * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz) + * + */ + +XORBLOCK_TEMPLATE(SPARC) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1 = (long *) bh_ptr[1]->b_data; + long *source2, *source3, *source4; + + switch (count) { + case 2: + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1) : "g2", "g3", "g4", "g5", "o0", + "o1", "o2", "o3", "o4", "o5", "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), 
"r" (source2) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + } + break; + case 4: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + } + break; + case 5: + source2 = (long *) bh_ptr[2]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source4 = (long *) bh_ptr[4]->b_data; + for (i = lines; i > 0; i--) { + __asm__ __volatile__(" + ldd [%0 + 0x00], %%g2 + ldd [%0 + 0x08], %%g4 + ldd [%0 + 0x10], %%o0 + ldd [%0 + 0x18], %%o2 + ldd [%1 + 0x00], %%o4 + ldd [%1 + 0x08], %%l0 + ldd [%1 + 0x10], %%l2 + ldd [%1 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 
+ xor %%g3, %%o5, %%g3 + ldd [%2 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%2 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%2 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%2 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%3 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%3 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%3 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%3 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + ldd [%4 + 0x00], %%o4 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + ldd [%4 + 0x08], %%l0 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + ldd [%4 + 0x10], %%l2 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + ldd [%4 + 0x18], %%l4 + xor %%g2, %%o4, %%g2 + xor %%g3, %%o5, %%g3 + xor %%g4, %%l0, %%g4 + xor %%g5, %%l1, %%g5 + xor %%o0, %%l2, %%o0 + xor %%o1, %%l3, %%o1 + xor %%o2, %%l4, %%o2 + xor %%o3, %%l5, %%o3 + std %%g2, [%0 + 0x00] + std %%g4, [%0 + 0x08] + std %%o0, [%0 + 0x10] + std %%o2, [%0 + 0x18] + " : : "r" (destp), "r" (source1), "r" (source2), "r" (source3), "r" (source4) + : "g2", "g3", "g4", "g5", "o0", "o1", "o2", "o3", "o4", "o5", + "l0", "l1", "l2", "l3", "l4", "l5"); + destp += 8; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + } + break; + } +} +#endif /* __sparc_v[78]__ */ + +#ifndef __sparc_v9__ + +/* + * this one works reasonably on any x86 CPU + * (send me an assembly version for inclusion if you can make it faster) + * + * this one is just as fast as written in pure assembly on x86. + * the reason for this separate version is that the + * fast open-coded xor routine "32reg" produces suboptimal code + * on x86, due to lack of registers. 
+ */ +XORBLOCK_TEMPLATE(8regs) +{ + int len = bh_ptr[0]->b_size; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + long lines = len / (sizeof (long)) / 8, i; + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 7) ^= *(source1 + 7); + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= 
*(source3 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + *(destp + 0) ^= *(source1 + 0); + *(destp + 0) ^= *(source2 + 0); + *(destp + 0) ^= *(source3 + 0); + *(destp + 0) ^= *(source4 + 0); + *(destp + 1) ^= *(source1 + 1); + *(destp + 1) ^= *(source2 + 1); + *(destp + 1) ^= *(source3 + 1); + *(destp + 1) ^= *(source4 + 1); + *(destp + 2) ^= *(source1 + 2); + *(destp + 2) ^= *(source2 + 2); + *(destp + 2) ^= *(source3 + 2); + *(destp + 2) ^= *(source4 + 2); + *(destp + 3) ^= *(source1 + 3); + *(destp + 3) ^= *(source2 + 3); + *(destp + 3) ^= *(source3 + 3); + *(destp + 3) ^= *(source4 + 3); + *(destp + 4) ^= *(source1 + 4); + *(destp + 4) ^= *(source2 + 4); + *(destp + 4) ^= *(source3 + 4); + *(destp + 4) ^= *(source4 + 4); + *(destp + 5) ^= *(source1 + 5); + *(destp + 5) ^= *(source2 + 5); + *(destp + 5) ^= *(source3 + 5); + *(destp + 5) ^= *(source4 + 5); + *(destp + 6) ^= *(source1 + 6); + *(destp + 6) ^= *(source2 + 6); + *(destp + 6) ^= *(source3 + 6); + *(destp + 6) ^= *(source4 + 6); + *(destp + 7) ^= *(source1 + 7); + *(destp + 7) ^= *(source2 + 7); + *(destp + 7) ^= *(source3 + 7); + *(destp + 7) ^= *(source4 + 7); + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * platform independent RAID5 checksum calculation, this should + * be very fast on any platform that has a decent amount of + * registers. 
(32 or more) + */ +XORBLOCK_TEMPLATE(32regs) +{ + int size = bh_ptr[0]->b_size; + int lines = size / (sizeof (long)) / 8, i; + long *destp = (long *) bh_ptr[0]->b_data; + long *source1, *source2, *source3, *source4; + + /* LOTS of registers available... + We do explicit loop-unrolling here for code which + favours RISC machines. In fact this is almost direct + RISC assembly on Alpha and SPARC :-) */ + + + switch(count) { + case 2: + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + destp[0] = d0; /* Store the result (in bursts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in bursts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + destp += 8; + } + break; + case 3: + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. 
*/ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + destp[0] = d0; /* Store the result (in bursts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in bursts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + destp += 8; + } + break; + case 4: + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + destp[0] = d0; /* Store the result (in bursts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in bursts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + destp += 8; + } + break; + case 5: + source4 = (long *) bh_ptr[4]->b_data; + source3 = (long *) bh_ptr[3]->b_data; + source2 = (long *) bh_ptr[2]->b_data; + 
source1 = (long *) bh_ptr[1]->b_data; + for (i = lines; i > 0; i--) { + register long d0, d1, d2, d3, d4, d5, d6, d7; + d0 = destp[0]; /* Pull the stuff into registers */ + d1 = destp[1]; /* ... in bursts, if possible. */ + d2 = destp[2]; + d3 = destp[3]; + d4 = destp[4]; + d5 = destp[5]; + d6 = destp[6]; + d7 = destp[7]; + d0 ^= source1[0]; + d1 ^= source1[1]; + d2 ^= source1[2]; + d3 ^= source1[3]; + d4 ^= source1[4]; + d5 ^= source1[5]; + d6 ^= source1[6]; + d7 ^= source1[7]; + d0 ^= source2[0]; + d1 ^= source2[1]; + d2 ^= source2[2]; + d3 ^= source2[3]; + d4 ^= source2[4]; + d5 ^= source2[5]; + d6 ^= source2[6]; + d7 ^= source2[7]; + d0 ^= source3[0]; + d1 ^= source3[1]; + d2 ^= source3[2]; + d3 ^= source3[3]; + d4 ^= source3[4]; + d5 ^= source3[5]; + d6 ^= source3[6]; + d7 ^= source3[7]; + d0 ^= source4[0]; + d1 ^= source4[1]; + d2 ^= source4[2]; + d3 ^= source4[3]; + d4 ^= source4[4]; + d5 ^= source4[5]; + d6 ^= source4[6]; + d7 ^= source4[7]; + destp[0] = d0; /* Store the result (in bursts) */ + destp[1] = d1; + destp[2] = d2; + destp[3] = d3; + destp[4] = d4; /* Store the result (in bursts) */ + destp[5] = d5; + destp[6] = d6; + destp[7] = d7; + source1 += 8; + source2 += 8; + source3 += 8; + source4 += 8; + destp += 8; + } + break; + } +} + +/* + * (the -6*32 shift factor colors the cache) + */ +#define SIZE (PAGE_SIZE-6*32) + +static void xor_speed ( struct xor_block_template * func, + struct buffer_head *b1, struct buffer_head *b2) +{ + int speed; + unsigned long now; + int i, count, max; + struct buffer_head *bh_ptr[6]; + + func->next = xor_functions; + xor_functions = func; + bh_ptr[0] = b1; + bh_ptr[1] = b2; + + /* + * count the number of XORs done during a whole jiffy. + * calculate the speed of checksumming from this. 
+ * (we use a 2-page allocation to have guaranteed + * color L1-cache layout) + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + func->xor_block(2,bh_ptr); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ*SIZE/1024); + func->speed = speed; + + printk( " %-10s: %5d.%03d MB/sec\n", func->name, + speed / 1000, speed % 1000); +} + +static inline void pick_fastest_function(void) +{ + struct xor_block_template *f, *fastest; + + fastest = xor_functions; + for (f = fastest; f; f = f->next) { + if (f->speed > fastest->speed) + fastest = f; + } +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + fastest = &t_xor_block_pIII_kni; + } +#endif + xor_block = fastest->xor_block; + printk( "using fastest function: %s (%d.%03d MB/sec)\n", fastest->name, + fastest->speed / 1000, fastest->speed % 1000); +} + +static struct buffer_head b1, b2; + +void calibrate_xor_block(void) +{ + memset(&b1,0,sizeof(b1)); + b2 = b1; + + b1.b_data = (char *) md__get_free_pages(GFP_KERNEL,2); + if (!b1.b_data) { + pick_fastest_function(); + return; + } + b2.b_data = b1.b_data + 2*PAGE_SIZE + SIZE; + + b1.b_size = SIZE; + + printk(KERN_INFO "raid5: measuring checksumming speed\n"); + + sti(); /* should be safe */ + +#if defined(__sparc__) && !defined(__sparc_v9__) + printk(KERN_INFO "raid5: trying high-speed SPARC checksum routine\n"); + xor_speed(&t_xor_block_SPARC,&b1,&b2); +#endif + +#ifdef CONFIG_X86_XMM + if (boot_cpu_data.mmu_cr4_features & X86_CR4_OSXMMEXCPT) { + printk(KERN_INFO + "raid5: KNI detected, trying cache-avoiding KNI checksum routine\n"); + /* we force the use of the KNI xor block because it + can write around l2. we may also be able + to load into the l1 only depending on how + the cpu deals with a load to a line that is + being prefetched. 
+ */ + xor_speed(&t_xor_block_pIII_kni,&b1,&b2); + } +#endif /* CONFIG_X86_XMM */ + +#ifdef __i386__ + + if (md_cpu_has_mmx()) { + printk(KERN_INFO + "raid5: MMX detected, trying high-speed MMX checksum routines\n"); + xor_speed(&t_xor_block_pII_mmx,&b1,&b2); + xor_speed(&t_xor_block_p5_mmx,&b1,&b2); + } + +#endif /* __i386__ */ + + + xor_speed(&t_xor_block_8regs,&b1,&b2); + xor_speed(&t_xor_block_32regs,&b1,&b2); + + free_pages((unsigned long)b1.b_data,2); + pick_fastest_function(); +} + +#else /* __sparc_v9__ */ + +void calibrate_xor_block(void) +{ + printk(KERN_INFO "raid5: using high-speed VIS checksum routine\n"); + xor_block = xor_block_VIS; +} + +#endif /* __sparc_v9__ */ + +MD_EXPORT_SYMBOL(xor_block); + --- linux/drivers/block/hsm.c.orig Sun Jan 16 11:26:03 2000 +++ linux/drivers/block/hsm.c Sun Jan 16 17:45:53 2000 @@ -0,0 +1,840 @@ +/* + hsm.c : HSM RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + HSM mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + + +#define DEBUG_HSM 1 + +#if DEBUG_HSM +#define dprintk(x,y...) printk(x,##y) +#else +#define dprintk(x,y...) 
do { } while (0) +#endif + +void print_bh(struct buffer_head *bh) +{ + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh, + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev, + bh->b_rsector, bh->b_this_page, bh->b_state, + bh->b_next_free, bh->b_count, bh->b_data, + bh->b_list, bh->b_flushtime + ); +} + +static int check_bg (pv_t *pv, pv_block_group_t * bg) +{ + int i, free = 0; + + dprintk("checking bg ...\n"); + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + if (pv_pptr_free(bg->blocks + i)) { + free++; + if (test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d set?\n", i); + } + } else { + if (!test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d not set?\n", i); + } + } + } + dprintk("%d free blocks in bg ...\n", free); + return free; +} + +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr) +{ + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + struct buffer_head *bh; + + dprintk("... getting BG at %u ...\n", bg_pos); + + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return; + } + desc->bg = (pv_block_group_t *) bh->b_data; + desc->free_blocks = check_bg(pv, desc->bg); +} + +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + pv_pptr_t * bptr = desc->bg->blocks + i; + if (pv_pptr_free(bptr)) { + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + + if (test_bit(i, desc->bg->used_bitmap)) { + MD_BUG(); + continue; + } + bptr->u.used.owner.log_id = lv->log_id; + bptr->u.used.owner.log_index = lblock; + index->data.phys_nr = pv->phys_nr; + index->data.phys_block = bg_pos + i + 1; + set_bit(i, desc->bg->used_bitmap); + desc->free_blocks--; + dprintk(".....free blocks left in bg %p: %d\n", + desc->bg, desc->free_blocks); + return 0; + } + } + return -ENOSPC; +} + +static int __get_free_block (lv_t *lv, pv_t *pv, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + 
dprintk("trying to get free block for lblock %d ...\n", lblock); + + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + pv_bg_desc_t *desc = pv->bg_array + i; + + dprintk("looking at desc #%d (%p)...\n", i, desc->bg); + if (!desc->bg) + get_bg(pv, desc, i); + + if (desc->bg && desc->free_blocks) + return find_free_block(lv, pv, desc, i, + lblock, index); + } + dprintk("hsm: pv %s full!\n", partition_name(pv->dev)); + return -ENOSPC; +} + +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + int err; + + if (!lv->free_indices) + return -ENOSPC; + + /* fix me */ + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index); + + if (err || !index->data.phys_block) { + MD_BUG(); + return -ENOSPC; + } + + lv->free_indices--; + + return 0; +} + +/* + * fix me: wordsize assumptions ... + */ +#define INDEX_BITS 8 +#define INDEX_DEPTH (32/INDEX_BITS) +#define INDEX_MASK ((1< [.", index->data.phys_nr, + index->data.phys_block, index->cpu_addr); + + tmp = index_child(index); + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) + dprintk("(%d->%d)", i, index_block(lv, tmp)); + tmp++; + } + dprintk(".]\n"); +} + +static int read_index_group (lv_t *lv, lv_lptr_t *index) +{ + lv_lptr_t *index_group, *tmp; + struct buffer_head *bh; + int i; + + dprintk("reading index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -EIO; + } + if (!buffer_uptodate(bh)) + MD_BUG(); + + index_group = (lv_lptr_t *) bh->b_data; + tmp = index_group; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) { + dprintk("index group has BLOCK %d, non-present.\n", i); + tmp->cpu_addr = 0; + } + tmp++; + } + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("have read index group %p at block %d.\n", + index_group, index_block(lv, index)); + print_index_list(lv, index); + + return 0; 
+} + +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + struct buffer_head *bh; + lv_lptr_t * index_group; + + if (get_free_block(lv, lblock, index)) + return -ENOSPC; + + dprintk("creating block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), + index_block(lv, index), HSM_BLOCKSIZE); + + index_group = (lv_lptr_t *) bh->b_data; + md_clear_page(index_group); + mark_buffer_uptodate(bh, 1); + + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("allocated index group %p at block %d.\n", + index_group, index_block(lv, index)); + return 0; +} + +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (!index_present(index)) { + dprintk("no group, level %u, pos %u\n", l, idx); + if (alloc_index_group(lv, lblock, index)) + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) { + dprintk("no data, pos %u\n", idx); + if (get_free_block(lv, lblock, index)) + return NULL; + return index; + } + MD_BUG(); + return index; +} + +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (index_free(index)) + return NULL; + if (!index_present(index)) + read_index_group(lv, index); + if (!index_present(index)) { + MD_BUG(); + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) + return NULL; + return index; +} + +static int read_root_index(lv_t *lv) +{ + int err; + lv_lptr_t *index = &lv->root_index; + + if (!index_block(lv, index)) { + printk("LV has no root index yet, creating.\n"); + + err 
= alloc_index_group (lv, 0, index); + if (err) { + printk("could not create index group, err:%d\n", err); + return err; + } + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx = + lv->root_index.data; + } else { + printk("LV already has a root index.\n"); + printk("... at <%s:%d>.\n", + partition_name(index_dev(lv, index)), + index_block(lv, index)); + + read_index_group(lv, index); + } + return 0; +} + +static int init_pv(pv_t *pv) +{ + struct buffer_head *bh; + pv_sb_t *pv_sb; + + bh = bread (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + pv_sb = (pv_sb_t *) bh->b_data; + pv->pv_sb = pv_sb; + + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) { + printk("%s is not a PV, has magic %x instead of %x!\n", + partition_name(pv->dev), pv_sb->pv_magic, + HSM_PV_SB_MAGIC); + return -1; + } + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev), + pv->phys_nr); + printk("... created under HSM version %d.%d.%d, at %x.\n", + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime); + printk("... total # of blocks: %d (%d left unallocated).\n", + pv_sb->pv_total_size, pv_sb->pv_blocks_left); + + printk("... block size: %d bytes.\n", pv_sb->pv_block_size); + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size); + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size); + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups); + + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) { + MD_BUG(); + return 1; + } + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL); + if (!pv->bg_array) { + MD_BUG(); + return 1; + } + memset(pv->bg_array, 0, PAGE_SIZE); + + return 0; +} + +static int free_pv(pv_t *pv) +{ + struct buffer_head *bh; + + dprintk("freeing PV %d ...\n", pv->phys_nr); + + if (pv->bg_array) { + int i; + + dprintk(".... 
freeing BGs ...\n"); + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2; + pv_bg_desc_t *desc = pv->bg_array + i; + + if (desc->bg) { + dprintk(".... freeing BG %d ...\n", i); + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + } + } + free_page((unsigned long)pv->bg_array); + } else + MD_BUG(); + + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + + return 0; +} + +struct semaphore hsm_sem = MUTEX; + +#define HSM_SECTORS (HSM_BLOCKSIZE/512) + +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long bsectors) +{ + lv_t *lv = kdev_to_lv(dev); + lv_lptr_t *index; + unsigned int lblock = *rsector / HSM_SECTORS; + unsigned int offset = *rsector % HSM_SECTORS; + int err = -EIO; + + if (!lv) { + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev)); + goto out; + } + if (offset + bsectors > HSM_SECTORS) { + MD_BUG(); + goto out; + } + down(&hsm_sem); + index = find_index(lv, lblock); + if (!index) { + printk("no block %u yet ... 
allocating\n", lblock); + index = alloc_fixed_index(lv, lblock); + } + + err = 0; + + printk(" %u <%s : %ld(%ld)> -> ", lblock, + partition_name(*rdev), *rsector, bsectors); + + *rdev = index_dev(lv, index); + *rsector = index_block(lv, index) * HSM_SECTORS + offset; + + printk(" <%s : %ld> %u\n", + partition_name(*rdev), *rsector, index_block(lv, index)); + + up(&hsm_sem); +out: + return err; +} + +static void free_index (lv_t *lv, lv_lptr_t * index) +{ + struct buffer_head *bh; + + printk("tryin to get cached block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE); + + printk("....FREEING "); + print_index_list(lv, index); + + if (bh) { + if (!buffer_uptodate(bh)) + MD_BUG(); + if ((lv_lptr_t *)bh->b_data != index_child(index)) { + printk("huh? b_data is %p, index content is %p.\n", + bh->b_data, index_child(index)); + } else + printk("good, b_data == index content == %p.\n", + index_child(index)); + printk("b_count == %d, writing.\n", bh->b_count); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + printk("done.\n"); + } else { + printk("FAILED!\n"); + } + print_index_list(lv, index); + index_child(index) = NULL; +} + +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*8]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + index = index_0; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + free_index_group(lv, level-1, + index_child(index)); + dprintk("%s freeing index group block %p ...", + dots, index_child(index)); + 
free_index(lv, index); + } + } + index++; + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void free_lv_indextree (lv_t *lv) +{ + dprintk("freeing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..freeing root index %p ...", index_child(&lv->root_index)); + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr, + lv->root_index.data.phys_block, lv->root_index.cpu_addr); + free_index(lv, &lv->root_index); + dprintk("..INDEX TREE done.\n"); + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */ + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices; +} + +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*5]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + index = index_0 + i; + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + print_index_group(lv, level-1, + index_child(index)); + } + } + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void print_lv (lv_t *lv) +{ + dprintk("printing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..INDEX TREE done.\n"); +} + +static int map_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + mddev_t *mddev = lv->vg->mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + if 
(kdev_to_mddev(dev)) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1; + md_size[nr] = md_size[mdidx(mddev)]; + add_mddev_mapping(mddev, dev, lv); + + return 0; +} + +static int unmap_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = 0; + md_size[nr] = 0; + del_mddev_mapping(lv->vg->mddev, dev); + + return 0; +} + +static int init_vg (vg_t *vg) +{ + int i; + lv_t *lv; + kdev_t dev; + vg_sb_t *vg_sb; + struct buffer_head *bh; + lv_descriptor_t *lv_desc; + + /* + * fix me: read all PVs and compare the SB + */ + dev = vg->pv_array[0].dev; + bh = bread (dev, 1, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + vg_sb = (vg_sb_t *) bh->b_data; + vg->vg_sb = vg_sb; + + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) { + printk("%s is not a valid VG, has magic %x instead of %x!\n", + partition_name(dev), vg_sb->vg_magic, + HSM_VG_SB_MAGIC); + return -1; + } + + vg->nr_lv = 0; + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + unsigned int id; + lv_desc = vg->vg_sb->lv_array + i; + + id = lv_desc->lv_id; + if (!id) { + printk("... 
LV desc %d empty\n", i); + continue; + } + if (id >= HSM_MAX_LVS_PER_VG) { + MD_BUG(); + continue; + } + + lv = vg->lv_array + id; + if (lv->vg) { + MD_BUG(); + continue; + } + lv->log_id = id; + lv->vg = vg; + lv->max_indices = lv_desc->lv_max_indices; + lv->free_indices = lv_desc->lv_free_indices; + lv->root_index.data = lv_desc->lv_root_idx; + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id); + + vg->nr_lv++; + + map_lv(lv); + if (read_root_index(lv)) { + vg->nr_lv--; + unmap_lv(lv); + memset(lv, 0, sizeof(*lv)); + } + } + if (vg->nr_lv != vg_sb->nr_lvs) + MD_BUG(); + + return 0; +} + +static int hsm_run (mddev_t *mddev) +{ + int i; + vg_t *vg; + mdk_rdev_t *rdev; + + MOD_INC_USE_COUNT; + + vg = kmalloc (sizeof (*vg), GFP_KERNEL); + if (!vg) + goto out; + memset(vg, 0, sizeof(*vg)); + mddev->private = vg; + vg->mddev = mddev; + + if (md_check_ordering(mddev)) { + printk("hsm: disks are not ordered, aborting!\n"); + goto out; + } + + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE); + + vg->nr_pv = mddev->nb_dev; + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + pv_t *pv = vg->pv_array + i; + + pv->dev = rdev->dev; + fsync_dev (pv->dev); + set_blocksize (pv->dev, HSM_BLOCKSIZE); + pv->phys_nr = i; + if (init_pv(pv)) + goto out; + } + + init_vg(vg); + + return 0; + +out: + if (vg) { + kfree(vg); + mddev->private = NULL; + } + MOD_DEC_USE_COUNT; + + return 1; +} + +static int hsm_stop (mddev_t *mddev) +{ + lv_t *lv; + vg_t *vg; + int i; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + print_lv(lv); + free_lv_indextree(lv); + unmap_lv(lv); + } + for (i = 0; i < vg->nr_pv; i++) + free_pv(vg->pv_array + i); + + kfree(vg); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int hsm_status (char *page, mddev_t *mddev) +{ + int sz = 0, i; + lv_t *lv; + vg_t *vg; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + sz += 
sprintf(page+sz, " ", lv->log_id, + lv->max_indices - lv->free_indices, lv->max_indices); + } + return sz; +} + + +static mdk_personality_t hsm_personality= +{ + "hsm", + hsm_map, + NULL, + NULL, + hsm_run, + hsm_stop, + hsm_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void hsm_init (void)) +{ + register_md_personality (HSM, &hsm_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (HSM, &hsm_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (HSM); +} + +#endif + +/* + * This Linus-trick catches bugs via the linker. + */ + +extern void __BUG__in__hsm_dot_c_1(void); +extern void __BUG__in__hsm_dot_c_2(void); +extern void __BUG__in__hsm_dot_c_3(void); +extern void __BUG__in__hsm_dot_c_4(void); +extern void __BUG__in__hsm_dot_c_5(void); +extern void __BUG__in__hsm_dot_c_6(void); +extern void __BUG__in__hsm_dot_c_7(void); + +void bugcatcher (void) +{ + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_1(); + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_2(); + + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_4(); + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_3(); + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_6(); + + if (sizeof(lv_lptr_t) != 16) + __BUG__in__hsm_dot_c_5(); + if (sizeof(pv_pptr_t) != 16) + __BUG__in__hsm_dot_c_6(); +} + --- linux/drivers/block/blkpg.c.orig Wed May 26 18:30:31 1999 +++ linux/drivers/block/blkpg.c Sun Jan 16 17:45:53 2000 @@ -65,20 +65,6 @@ return g; } -/* moved here from md.c - will be discarded later */ -char *partition_name (kdev_t dev) { - static char name[40]; /* kdevname returns 32 bytes */ - /* disk_name requires 32 bytes */ - struct gendisk *hd = get_gendisk (dev); - - if (!hd) { - sprintf (name, "[dev %s]", kdevname(dev)); - return (name); - } - - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */ -} - /* * Add a 
partition. * --- linux/drivers/char/sysrq.c.orig Sun Jan 16 06:38:08 2000 +++ linux/drivers/char/sysrq.c Sun Jan 16 17:45:53 2000 @@ -72,6 +72,14 @@ #ifdef CONFIG_VT case 'k': /* K -- SAK */ printk("SAK\n"); +{ + int i; + + printk("clearing profiling buffer\n"); + for (i = 0; i < prof_len; i++) + prof_buffer[i] = 0; +} + if (tty) do_SAK(tty); reset_vc(fg_console); --- linux/drivers/scsi/scsi_lib.c.orig Sun Jan 16 06:38:25 2000 +++ linux/drivers/scsi/scsi_lib.c Sun Jan 16 17:45:53 2000 @@ -229,6 +229,8 @@ /* * Just hit the requeue function for the queue. */ + if (!q->request_fn) + goto out; q->request_fn(q); SDpnt = (Scsi_Device *) q->queuedata; @@ -287,6 +289,7 @@ SHpnt->some_device_starved = 0; } } +out: spin_unlock_irqrestore(&io_request_lock, flags); } --- linux/drivers/scsi/sd.c.orig Sun Jan 16 06:38:26 2000 +++ linux/drivers/scsi/sd.c Sun Jan 16 17:45:53 2000 @@ -241,6 +241,8 @@ dpnt = &rscsi_disks[target]; if (!dpnt) return NULL; /* No such device */ + if (!dpnt->device) + return NULL; return &dpnt->device->request_queue; } --- linux/drivers/scsi/scsi.c.orig Sun Jan 16 06:38:25 2000 +++ linux/drivers/scsi/scsi.c Sun Jan 16 17:45:53 2000 @@ -2339,6 +2339,7 @@ * guaranteed this device doesn't corrupt an ongoing data transfer. 
*/ if (!strncmp("add-single-device", buffer + 5, 17)) { + unsigned long flags; p = buffer + 23; host = simple_strtoul(p, &p, 0); @@ -2349,14 +2350,17 @@ printk("scsi singledevice %d %d %d %d\n", host, channel, id, lun); + spin_lock_irqsave(&io_request_lock,flags); for (HBA_ptr = scsi_hostlist; HBA_ptr; HBA_ptr = HBA_ptr->next) { if (HBA_ptr->host_no == host) { break; } } err = -ENXIO; - if (!HBA_ptr) - goto out; + if (!HBA_ptr) { + spin_unlock_irqrestore(&io_request_lock,flags); + goto out_add; + } for (scd = HBA_ptr->host_queue; scd; scd = scd->next) { if ((scd->channel == channel @@ -2365,10 +2369,11 @@ break; } } + spin_unlock_irqrestore(&io_request_lock,flags); err = -ENOSYS; if (scd) - goto out; /* We do not yet support unplugging */ + goto out_add; /* We do not yet support unplugging */ scan_scsis(HBA_ptr, 1, channel, id, lun); @@ -2379,6 +2384,7 @@ (HBA_ptr->select_queue_depths) (HBA_ptr, HBA_ptr->host_queue); err = length; +out_add: goto out; } /* @@ -2393,6 +2399,7 @@ * */ else if (!strncmp("remove-single-device", buffer + 5, 20)) { + unsigned long flags; p = buffer + 26; host = simple_strtoul(p, &p, 0); @@ -2400,7 +2407,7 @@ id = simple_strtoul(p + 1, &p, 0); lun = simple_strtoul(p + 1, &p, 0); - + spin_lock_irqsave(&io_request_lock,flags); for (HBA_ptr = scsi_hostlist; HBA_ptr; HBA_ptr = HBA_ptr->next) { if (HBA_ptr->host_no == host) { break; @@ -2408,7 +2415,7 @@ } err = -ENODEV; if (!HBA_ptr) - goto out; + goto out_remove; for (scd = HBA_ptr->host_queue; scd; scd = scd->next) { if ((scd->channel == channel @@ -2419,11 +2426,11 @@ } if (scd == NULL) - goto out; /* there is no such device attached */ + goto out_remove; /* there is no such device attached */ err = -EBUSY; if (scd->access_count) - goto out; + goto out_remove; SDTpnt = scsi_devicelist; while (SDTpnt != NULL) { @@ -2454,12 +2461,13 @@ blk_cleanup_queue(&scd->request_queue); scsi_init_free((char *) scd, sizeof(Scsi_Device)); } else { - goto out; + goto out_remove; } err = 0; +out_remove: + 
spin_unlock_irqrestore(&io_request_lock,flags); } out: - free_page((unsigned long) buffer); return err; } --- linux/arch/i386/kernel/traps.c.orig Sun Jan 16 06:38:28 2000 +++ linux/arch/i386/kernel/traps.c Sun Jan 16 17:45:53 2000 @@ -383,10 +383,10 @@ if (last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... + * wait a few IRQs (30 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*HZ) { + if (alert_counter[cpu] == 30*HZ) { spin_lock(&nmi_print_lock); console_lock.lock = 0; // we are in trouble anyway printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu); --- linux/arch/i386/defconfig.orig Sun Jan 16 06:38:28 2000 +++ linux/arch/i386/defconfig Sun Jan 16 17:45:54 2000 @@ -115,7 +115,15 @@ # # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_NBD is not set -# CONFIG_BLK_DEV_MD is not set +CONFIG_BLK_DEV_MD=y +CONFIG_AUTODETECT_RAID=y +CONFIG_MD_TRANSLUCENT=y +CONFIG_MD_LINEAR=y +CONFIG_MD_STRIPED=y +CONFIG_MD_MIRRORING=y +CONFIG_MD_RAID5=y +CONFIG_MD_BOOT=y +CONFIG_BLK_DEV_HSM=y # CONFIG_BLK_DEV_RAM is not set # CONFIG_BLK_DEV_XD is not set # CONFIG_BLK_DEV_DAC960 is not set --- linux/arch/sparc64/kernel/ioctl32.c.orig Sun Jan 16 06:38:28 2000 +++ linux/arch/sparc64/kernel/ioctl32.c Sun Jan 16 17:45:54 2000 @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -2040,11 +2040,24 @@ case BLKRASET: /* 0x09 */ - case REGISTER_DEV: - case REGISTER_DEV_NEW: - case START_MD: - case STOP_MD: - + case RAID_VERSION: + case GET_ARRAY_INFO: + case GET_DISK_INFO: + case CLEAR_ARRAY: + case ADD_NEW_DISK: + case HOT_REMOVE_DISK: + case SET_ARRAY_INFO: + case SET_DISK_INFO: + case WRITE_RAID_INFO: + case UNPROTECT_ARRAY: + case PROTECT_ARRAY: + case HOT_ADD_DISK: + case RUN_ARRAY: + case START_ARRAY: + case STOP_ARRAY: + case STOP_ARRAY_RO: + case RESTART_ARRAY_RW: + /* Big K */ case PIO_FONT: case GIO_FONT: --- 
linux/arch/sparc64/kernel/sparc64_ksyms.c.orig Sun Jan 16 06:38:24 2000 +++ linux/arch/sparc64/kernel/sparc64_ksyms.c Sun Jan 16 17:45:54 2000 @@ -81,7 +81,8 @@ extern int sys32_ioctl(unsigned int fd, unsigned int cmd, u32 arg); extern int (*handle_mathemu)(struct pt_regs *, struct fpustate *); extern long sparc32_open(const char * filename, int flags, int mode); - +extern void VISenter(void); + extern void bcopy (const char *, char *, int); extern int __ashrdi3(int, int); @@ -313,6 +314,9 @@ * and will always be 'void __ret_efault(void)'. */ EXPORT_SYMBOL_NOVERS(__ret_efault); +/* VISenter is defined in assembly as well. + */ +EXPORT_SYMBOL_NOVERS(VISenter); /* No version information on these, as gcc produces such symbols. */ EXPORT_SYMBOL_NOVERS(memcmp); --- linux/Documentation/Configure.help.orig Sun Jan 16 06:38:27 2000 +++ linux/Documentation/Configure.help Sun Jan 16 17:45:54 2000 @@ -1224,7 +1224,7 @@ called on26.o. You must also have a high-level driver for the type of device that you want to support. -Multiple devices driver support +Multiple devices (Software RAID) driver support CONFIG_BLK_DEV_MD This driver lets you combine several hard disk partitions into one logical block device. This can be used to simply append one @@ -1242,6 +1242,13 @@ If unsure, say N. +Autodetect RAID partitions +CONFIG_AUTODETECT_RAID + This feature lets the kernel detect RAID partitions on bootup. + An autodetect RAID partition is a normal partition with partition + type 0xfd. Use this if you want to boot RAID devices, or want to + run them automatically. + Linear (append) mode CONFIG_MD_LINEAR If you say Y here, then your multiple devices driver will be able to @@ -1320,6 +1327,21 @@ Documentation/modules.txt. If unsure, say Y. + +Translucent Block Device Support (EXPERIMENTAL) +CONFIG_MD_TRANSLUCENT + DO NOT USE THIS STUFF YET! + + currently there is only a placeholder there as the implementation + is not yet usable. 
+Hierarchical Storage Management support (EXPERIMENTAL) +CONFIG_MD_HSM + DO NOT USE THIS STUFF YET! + + I have released this so people can comment on the architecture, + but user-space tools are still unusable so there is nothing much + you can do with this unless you are a kernel hacker. Boot support (linear, striped) CONFIG_MD_BOOT --- linux/MAINTAINERS.orig Sun Jan 16 06:38:26 2000 +++ linux/MAINTAINERS Sun Jan 16 17:45:54 2000 @@ -847,6 +847,11 @@ L: linux-smp@vger.rutgers.edu S: Maintained +SOFTWARE RAID (MD) SUPPORT +P: Ingo Molnar +M: mingo@redhat.com +S: Maintained + SONIC NETWORK DRIVER P: Thomas Bogendoerfer M: tsbogend@alpha.franken.de --- linux/Makefile.orig Sun Jan 16 06:38:27 2000 +++ linux/Makefile Sun Jan 16 17:45:54 2000 @@ -92,7 +92,7 @@ CPPFLAGS += -D__SMP__ endif -CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer +CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -O2 -fno-omit-frame-pointer -g AFLAGS := $(CPPFLAGS) # use '-fno-strict-aliasing', but only if the compiler can take it