--- linux/fs/ext2/fsync.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/ext2/fsync.c Sun Jan 16 17:45:52 2000 @@ -60,7 +60,7 @@ return 0; } ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); + bput(bh); return 0; } --- linux/fs/ext2/truncate.c.orig Sun Jan 16 06:38:14 2000 +++ linux/fs/ext2/truncate.c Sun Jan 16 17:45:52 2000 @@ -104,26 +104,18 @@ { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); u32 * ind = (u32 *) bh->b_data; - int i, retry; - - /* Make sure both buffers are unlocked */ - do { - retry = 0; - if (buffer_locked(bh)) { - __wait_on_buffer(bh); - retry = 1; - } - if (ind_bh && buffer_locked(ind_bh)) { - __wait_on_buffer(ind_bh); - retry = 1; - } - } while (retry); + int i, retry = 0; + /* + * We do not have to wait for IO completion, proper handling + * of bforgotten buffers with IO on them is done by the + * buffer-cache and IO layer. + */ for (i = 0; i < addr_per_block; i++) if (*(ind++)) goto in_use; - if (atomic_read(&bh->b_count) == 1) { + if (bcount(bh) == 1) { int tmp; tmp = le32_to_cpu(*p); *p = 0; @@ -151,9 +143,6 @@ return retry; } -#define DATA_BUFFER_USED(bh) \ - (atomic_read(&bh->b_count) || buffer_locked(bh)) - static int trunc_direct (struct inode * inode) { int i, retry = 0; @@ -225,7 +214,8 @@ for (i = indirect_block ; i < addr_per_block ; i++) { u32 * ind = i + (u32 *) ind_bh->b_data; - wait_on_buffer(ind_bh); + if (!buffer_uptodate(ind_bh)) + wait_on_buffer(ind_bh); tmp = le32_to_cpu(*ind); if (!tmp) continue; --- linux/fs/ext2/inode.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/ext2/inode.c Sun Jan 16 17:45:52 2000 @@ -204,6 +204,11 @@ if (tmp) { if (metadata) { result = getblk (inode->i_dev, tmp, blocksize); + if (!buffer_uptodate(result)) { + ll_rw_block (READ, 1, &result); + if (!buffer_uptodate(result)) + wait_on_buffer(result); + } if (tmp == le32_to_cpu(*p)) return result; brelse (result); @@ -255,8 +260,6 @@ } if (metadata) { result = getblk (inode->i_dev, tmp, blocksize); - if (!buffer_uptodate(result)) - 
wait_on_buffer(result); memset(result->b_data, 0, blocksize); mark_buffer_uptodate(result, 1); mark_buffer_dirty(result, 1); @@ -318,7 +321,8 @@ goto out; if (!buffer_uptodate(bh)) { ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (!buffer_uptodate(bh)) goto out; } @@ -328,6 +332,11 @@ if (tmp) { if (metadata) { result = getblk (bh->b_dev, tmp, blocksize); + if (!buffer_uptodate(result)) { + ll_rw_block (READ, 1, &result); + if (!buffer_uptodate(result)) + wait_on_buffer(result); + } if (tmp == le32_to_cpu(*p)) goto out; brelse (result); @@ -366,8 +375,6 @@ goto out; if (metadata) { result = getblk (bh->b_dev, tmp, blocksize); - if (!buffer_uptodate(result)) - wait_on_buffer(result); memset(result->b_data, 0, inode->i_sb->s_blocksize); mark_buffer_uptodate(result, 1); mark_buffer_dirty(result, 1); @@ -602,7 +609,8 @@ if (buffer_uptodate(bh)) return bh; ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (buffer_uptodate(bh)) return bh; brelse (bh); --- linux/fs/ext2/namei.c.orig Sun Jan 16 11:32:27 2000 +++ linux/fs/ext2/namei.c Sun Jan 16 17:45:52 2000 @@ -104,7 +104,8 @@ offset += sb->s_blocksize; continue; } - wait_on_buffer (bh); + if (!buffer_uptodate(bh)) + wait_on_buffer (bh); if (!buffer_uptodate(bh)) { /* * read error: all bets are off --- linux/fs/partitions/check.c.orig Mon Aug 30 19:24:14 1999 +++ linux/fs/partitions/check.c Sun Jan 16 17:45:52 2000 @@ -18,6 +18,7 @@ #include #include #include +#include #include "check.h" @@ -323,6 +324,12 @@ else #endif rd_load(); +#endif +#ifdef CONFIG_BLK_DEV_MD + { + extern void autodetect_raid(void); + autodetect_raid(); + } #endif #ifdef CONFIG_MD_BOOT md_setup_drive(); --- linux/fs/buffer.c.orig Sun Jan 16 06:38:25 2000 +++ linux/fs/buffer.c Sun Jan 16 17:45:52 2000 @@ -28,6 +28,12 @@ /* async buffer flushing, 1999 Andrea Arcangeli */ +/* + * Integrated buffer and page cache, improved buffer freeing, + * 
+ * 2000 Ingo Molnar + */ + #include #include #include @@ -60,7 +66,7 @@ #define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) #define NR_RESERVED (2*MAX_BUF_PER_PAGE) -#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this +#define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this number of unused buffer heads */ /* Anti-deadlock ordering: @@ -77,12 +83,14 @@ static struct buffer_head *lru_list[NR_LIST]; static spinlock_t lru_list_lock = SPIN_LOCK_UNLOCKED; + static int nr_buffers_type[NR_LIST] = {0,}; static unsigned long size_buffers_type[NR_LIST] = {0,}; static struct buffer_head * unused_list = NULL; static int nr_unused_buffer_heads = 0; static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; + static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); struct bh_free_head { @@ -94,6 +102,7 @@ kmem_cache_t *bh_cachep; static int grow_buffers(int size); +static int __try_to_free_buffers(struct page * page, int priority); /* This is used by some architectures to estimate available memory. 
*/ atomic_t buffermem_pages = ATOMIC_INIT(0); @@ -109,7 +118,7 @@ */ union bdflush_param { struct { - int nfract; /* Percentage of buffer cache dirty to + int nfract; /* Percentage of buffer cache dirty to activate bdflush */ int ndirty; /* Maximum number of dirty blocks to write out per wake-cycle */ @@ -141,21 +150,20 @@ */ void __wait_on_buffer(struct buffer_head * bh) { - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DECLARE_WAITQUEUE(wait, current); - atomic_inc(&bh->b_count); + bget(bh); add_wait_queue(&bh->b_wait, &wait); repeat: run_task_queue(&tq_disk); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_task_state(current, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { schedule(); goto repeat; } - tsk->state = TASK_RUNNING; + current->state = TASK_RUNNING; remove_wait_queue(&bh->b_wait, &wait); - atomic_dec(&bh->b_count); + bput(bh); } /* Call sync_buffers with wait!=0 to ensure that the call does not @@ -193,22 +201,27 @@ for (i = nr_buffers_type[BUF_DIRTY]*2 ; i-- > 0 ; bh = next) { next = bh->b_next_free; - if (!lru_list[BUF_DIRTY]) + bget(bh); + if (!lru_list[BUF_DIRTY]) { + bput(bh); break; - if (dev && bh->b_dev != dev) + } + if (dev && bh->b_dev != dev) { + bput(bh); continue; + } if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1; + bput(bh); continue; } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); - atomic_dec(&bh->b_count); + bput(bh); goto repeat; } @@ -218,20 +231,22 @@ if (wait && buffer_req(bh) && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_uptodate(bh)) { err = -EIO; + bput(bh); continue; } /* Don't write clean buffers. Don't write ANY buffers * on the third pass. 
*/ - if (!buffer_dirty(bh) || pass >= 2) + if (!buffer_dirty(bh) || pass >= 2) { + bput(bh); continue; + } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); retry = 1; + bput(bh); goto repeat; } @@ -248,21 +263,23 @@ break; if (dev && bh->b_dev != dev) continue; + bget(bh); if (buffer_locked(bh)) { /* Buffer is locked; skip it unless wait is * requested AND pass > 0. */ if (!wait || !pass) { retry = 1; + bput(bh); continue; } - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); wait_on_buffer (bh); spin_lock(&lru_list_lock); - atomic_dec(&bh->b_count); + bput(bh); goto repeat2; } + bput(bh); } spin_unlock(&lru_list_lock); @@ -317,7 +334,7 @@ /* * filp may be NULL if called via the msync of a vma. */ - + int file_fsync(struct file *filp, struct dentry *dentry) { struct inode * inode = dentry->d_inode; @@ -412,39 +429,6 @@ return err; } -void invalidate_buffers(kdev_t dev) -{ - int nlist; - - spin_lock(&lru_list_lock); - for(nlist = 0; nlist < NR_LIST; nlist++) { - struct buffer_head * bh; - int i; - retry: - bh = lru_list[nlist]; - if (!bh) - continue; - for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) { - if (bh->b_dev != dev) - continue; - if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); - spin_unlock(&lru_list_lock); - wait_on_buffer(bh); - spin_lock(&lru_list_lock); - atomic_dec(&bh->b_count); - goto retry; - } - if (atomic_read(&bh->b_count)) - continue; - clear_bit(BH_Protected, &bh->b_state); - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Dirty, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - } - } - spin_unlock(&lru_list_lock); -} /* After several hours of tedious analysis, the following hash * function won. Do not mess with it... 
-DaveM @@ -456,6 +440,10 @@ static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) { + if (bh->b_dev == B_FREE) + BUG(); + if (bh->b_pprev) + BUG(); if ((bh->b_next = *head) != NULL) bh->b_next->b_pprev = &bh->b_next; *head = bh; @@ -464,12 +452,35 @@ static __inline__ void __hash_unlink(struct buffer_head *bh) { + if (!bh->b_pprev) { + if (test_bit(BH_Mapped, &bh->b_state)) + BH_BUG(bh); + return; + } if (bh->b_next) bh->b_next->b_pprev = bh->b_pprev; *(bh->b_pprev) = bh->b_next; bh->b_pprev = NULL; + clear_bit(BH_Mapped, &bh->b_state); } +static inline struct buffer_head * __get_hash_table(struct buffer_head **head, kdev_t dev, int block, int size) +{ + struct buffer_head *bh; + + for (bh = *head; bh; bh = bh->b_next) + if (bh->b_blocknr == block && + bh->b_size == size && + bh->b_dev == dev) + break; + if (bh) { + bget(bh); + if (!buffer_mapped(bh)) + BH_BUG(bh); + } + + return bh; +} static void __insert_into_lru_list(struct buffer_head * bh, int blist) { struct buffer_head **bhp = &lru_list[blist]; @@ -488,9 +499,17 @@ static void __remove_from_lru_list(struct buffer_head * bh, int blist) { + if (bh->b_dev == B_FREE) + BUG(); if (bh->b_prev_free || bh->b_next_free) { - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; + if (!bh->b_prev_free) + BH_BUG(bh); + else + bh->b_prev_free->b_next_free = bh->b_next_free; + if (!bh->b_next_free) + BH_BUG(bh); + else + bh->b_next_free->b_prev_free = bh->b_prev_free; if (lru_list[blist] == bh) lru_list[blist] = bh->b_next_free; if (lru_list[blist] == bh) @@ -503,11 +522,19 @@ static void __remove_from_free_list(struct buffer_head * bh, int index) { + if (bh->b_dev != B_FREE) + BUG(); if(bh->b_next_free == bh) free_list[index].list = NULL; else { - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; + if (!bh->b_prev_free) + BH_BUG(bh); + else + bh->b_prev_free->b_next_free = bh->b_next_free; + if 
(!bh->b_next_free) + BH_BUG(bh); + else + bh->b_next_free->b_prev_free = bh->b_prev_free; if (free_list[index].list == bh) free_list[index].list = bh->b_next_free; } @@ -518,49 +545,192 @@ * because they control the visibility of a buffer head * to the rest of the kernel. */ -static __inline__ void __remove_from_queues(struct buffer_head *bh) +static inline void __remove_from_queues(struct buffer_head *bh) { - write_lock(&hash_table_lock); - if (bh->b_pprev) - __hash_unlink(bh); + __hash_unlink(bh); __remove_from_lru_list(bh, bh->b_list); +} + +static inline struct buffer_head * insert_into_queues_atomic (struct buffer_head *bh, int lock) +{ + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr), *alias; + + if (!buffer_mapped(bh)) + BUG(); + +repeat: + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + alias = __get_hash_table(head, bh->b_dev, bh->b_blocknr, bh->b_size); + if (alias) { + if (lock) { + if (test_and_set_bit(BH_Lock, &alias->b_state)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + wait_on_buffer(alias); + brelse(alias); + goto repeat; + } + } + bh = alias; + } else { + __hash_link(bh, head); + __insert_into_lru_list(bh, bh->b_list); + } write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + + return bh; } -static void insert_into_queues(struct buffer_head *bh) +void insert_into_queues_exclusive (struct buffer_head *bh) { - struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr), *alias; + + if (!buffer_mapped(bh)) + BUG(); + if (!buffer_locked(bh)) + BUG(); spin_lock(&lru_list_lock); write_lock(&hash_table_lock); + alias = __get_hash_table(head, bh->b_dev, bh->b_blocknr, bh->b_size); + if (alias) { + int gotlock = 1; + /* + * If IO is going on for this bh we have to + * synchronize with it, but only if it's not + * an invalidated buffer. 
+ * + * SUBTLE: the fact that we are atomically testing BH_Req + * in ll_rw_block ensures that we cannot accidentally + * write/read this new bh before the old IO finishes. + */ + clear_bit(BH_Req, &alias->b_state); + clear_bit(BH_Uptodate, &alias->b_state); + clear_bit(BH_Dirty, &alias->b_state); + if (test_and_set_bit(BH_Lock, &alias->b_state)) + gotlock = 0; + if (!alias->b_pprev) + BH_BUG(alias); + __remove_from_queues(alias); + if (gotlock) + clear_bit(BH_Lock, &alias->b_state); + bput(alias); + } __hash_link(bh, head); __insert_into_lru_list(bh, bh->b_list); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); } -/* This function must only run if there are no other - * references _anywhere_ to this buffer head. +/* + * Reserve NR_RESERVED buffer heads for async IO requests to avoid + * no-buffer-head deadlock. Return NULL on failure; waiting for + * buffer heads is now handled in create_buffers(). */ -static void put_last_free(struct buffer_head * bh) +static struct buffer_head * __get_unused_bh(int async) { - struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; - struct buffer_head **bhp = &head->list; + struct buffer_head * bh; - spin_lock(&head->lock); - bh->b_dev = B_FREE; - if(!*bhp) { - *bhp = bh; - bh->b_prev_free = bh; + /* + * It's a common case that the unused list is empty, + * thus this 'unsafe' optimization. (we read the counter + * without the spinlock held) If we are really low on + * buffer heads, then we'll re-check the counter with the + * spinlock held anyway, in the async case. + */ + if (nr_unused_buffer_heads > NR_RESERVED) { + spin_lock(&unused_list_lock); +reserve_async: + bh = unused_list; + unused_list = bh->b_next_free; + nr_unused_buffer_heads--; + spin_unlock(&unused_list_lock); + return bh; } - bh->b_next_free = *bhp; - bh->b_prev_free = (*bhp)->b_prev_free; - (*bhp)->b_prev_free->b_next_free = bh; - (*bhp)->b_prev_free = bh; - spin_unlock(&head->lock); + + /* This is critical. 
We can't swap out pages to get + * more buffer heads, because the swap-out may need + * more buffer-heads itself. Thus SLAB_BUFFER. + */ + bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + if (bh) { + memset(bh, 0, sizeof(*bh)); + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + return bh; + } + + /* + * If we need an async buffer, use the reserved buffer heads. + */ + if (async) { + spin_lock(&unused_list_lock); + if (unused_list) + goto reserve_async; + spin_unlock(&unused_list_lock); + } +#if 0 + /* + * (Pending further analysis ...) + * Ordinary (non-async) requests can use a different memory priority + * to free up pages. Any swapping thus generated will use async + * buffer heads. + */ + if(!async && + (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { + memset(bh, 0, sizeof(*bh)); + init_waitqueue_head(&bh->b_wait); + return bh; + } +#endif + return NULL; } /* + * Note: the caller should wake up the buffer_wait list if needed. + */ +static void __put_unused_bh(struct buffer_head * bh) +{ + if (!PageLocked(bh->b_page)) + BUG(); + if (bh->b_pprev) + BH_BUG(bh); + if (bh->b_prev_free || bh->b_next_free) + BH_BUG(bh); + if (bcount(bh) || test_bit(BH_Lock, &bh->b_state)) + BH_BUG(bh); + if (waitqueue_active(&bh->b_wait)) + BH_BUG(bh); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { + memset(bh, 0x77, sizeof(*bh)); + kmem_cache_free(bh_cachep, bh); + } else { + memset(bh, 0x55, sizeof(*bh)); + bh->b_blocknr = -1; + init_waitqueue_head(&bh->b_wait); + nr_unused_buffer_heads++; + bh->b_next_free = unused_list; + bh->b_this_page = NULL; + unused_list = bh; + } +} + +struct buffer_head * get_unused_bh (void) +{ + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER); + memset(bh, 0, sizeof(*bh)); + return bh; +} + +void put_unused_bh(struct buffer_head * bh) +{ + kmem_cache_free(bh_cachep, bh); +} +/* * Why like this, I hear you say... The reason is race-conditions. 
* As we don't lock buffers (unless we are reading them, that is), * something might happen to it while we sleep (ie a read-error @@ -573,13 +743,7 @@ struct buffer_head *bh; read_lock(&hash_table_lock); - for(bh = *head; bh; bh = bh->b_next) - if (bh->b_blocknr == block && - bh->b_size == size && - bh->b_dev == dev) - break; - if (bh) - atomic_inc(&bh->b_count); + bh = __get_hash_table(head, dev, block, size); read_unlock(&hash_table_lock); return bh; @@ -604,6 +768,43 @@ return 0; } +static int __unlink_drop_bh (struct buffer_head *bh) +{ + struct page *page = bh_page(bh); + int freed = 0; + + + if (test_bit(BH_Lock, &bh->b_state)) + BH_BUG(bh); + + __remove_from_queues(bh); + if (!TryLockPage(page)) { + /* + * Safe because the page must have buffers and we just + * managed to lock it. + */ + get_page(page); + if (__try_to_free_buffers(page, 0)) { + /* + * We never remove the mapping prior to removing + * page->buffers. This means that !page->mappings + * are pure buffer-cache pages. + */ + if (!page->mapping) { + if (!page->lru.next && !page->lru.prev) + BUG(); + else + lru_cache_del(page); + atomic_dec(&buffermem_pages); + } + freed = 1; + } + UnlockPage(page); + put_page(page); + } + return freed; +} + void set_blocksize(kdev_t dev, int size) { extern int *blksize_size[]; @@ -630,39 +831,97 @@ * around on the free list, and we can get in a loop if we are not careful. */ for(nlist = 0; nlist < NR_LIST; nlist++) { - repeat: + /* + * Contrary to sync() here we also have to lock the + * hash table, because we might unhash entries. + * setblocksize() is a rare operation so this is not + * a real performance problem. 
+ */ spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); +repeat: bh = lru_list[nlist]; for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { if(!bh) break; - bhnext = bh->b_next_free; + bhnext = bh->b_next_free; if (bh->b_dev != dev) continue; if (bh->b_size == size) continue; - if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + bget(bh); + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); wait_on_buffer(bh); - atomic_dec(&bh->b_count); + bput(bh); goto repeat; } + bput(bh); if (bh->b_dev == dev && bh->b_size != size) { clear_bit(BH_Dirty, &bh->b_state); clear_bit(BH_Uptodate, &bh->b_state); clear_bit(BH_Req, &bh->b_state); - } - if (atomic_read(&bh->b_count) == 0) { - __remove_from_queues(bh); - put_last_free(bh); + clear_bit(BH_Lock, &bh->b_state); + if (!bcount(bh)) + /* + * Careful, if we freed a page then + * we might have freed bhnext as + * well. + */ + if (__unlink_drop_bh(bh)) + goto repeat; } } + write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); } } +void invalidate_buffers (kdev_t dev) +{ + int nlist; + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + + for (nlist = 0; nlist < NR_LIST; nlist++) { + struct buffer_head *bh, *bhnext; + int i; + retry: + bh = lru_list[nlist]; + if (!bh) + continue; + for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) { + bhnext = bh->b_next_free; + if (bh->b_dev != dev) + continue; + bget(bh); + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + wait_on_buffer(bh); + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + bput(bh); + goto retry; + } + bput(bh); + clear_bit(BH_Lock, &bh->b_state); + if (bcount(bh)) + continue; + clear_bit(BH_Protected, &bh->b_state); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + if (__unlink_drop_bh(bh)) + goto retry; + } + } + 
write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); +} /* * We used to try various strange things. Let's not. */ @@ -680,22 +939,23 @@ bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_dev_id = dev_id; + bh->b_rdev = MKDEV(0,0); } -static void end_buffer_io_sync(struct buffer_head *bh, int uptodate) +void end_buffer_io_sync(struct buffer_head *bh, int uptodate) { mark_buffer_uptodate(bh, uptodate); unlock_buffer(bh); } -static void end_buffer_io_bad(struct buffer_head *bh, int uptodate) +void end_buffer_io_bad(struct buffer_head *bh, int uptodate) { mark_buffer_uptodate(bh, uptodate); unlock_buffer(bh); BUG(); } -static void end_buffer_io_async(struct buffer_head * bh, int uptodate) +static void end_buffer_io_page(struct buffer_head * bh, int uptodate) { static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED; unsigned long flags; @@ -724,10 +984,10 @@ */ spin_lock_irqsave(&page_uptodate_lock, flags); unlock_buffer(bh); - atomic_dec(&bh->b_count); + bh->b_end_io = end_buffer_io_sync; tmp = bh->b_this_page; while (tmp != bh) { - if (tmp->b_end_io == end_buffer_io_async && buffer_locked(tmp)) + if (tmp->b_end_io == end_buffer_io_page && test_bit(BH_Lock, &tmp->b_state)) goto still_busy; tmp = tmp->b_this_page; } @@ -767,13 +1027,24 @@ * 14.02.92: changed it to sync dirty buffers a bit: better performance * when the filesystem starts to get full of dirty blocks (I hope). 
*/ -struct buffer_head * getblk(kdev_t dev, int block, int size) +static inline struct buffer_head * __getblk (kdev_t dev, int block, int size, int lock) { - struct buffer_head * bh; + struct buffer_head **head = &hash(dev, block); + struct buffer_head *bh, *tmp; int isize; repeat: - bh = get_hash_table(dev, block, size); + write_lock(&hash_table_lock); + bh = __get_hash_table(head, dev, block, size); + if (bh && lock) { + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + write_unlock(&hash_table_lock); + wait_on_buffer(bh); + brelse(bh); + goto repeat; + } + } + write_unlock(&hash_table_lock); if (bh) goto out; @@ -782,7 +1053,9 @@ bh = free_list[isize].list; if (bh) { __remove_from_free_list(bh, isize); - atomic_set(&bh->b_count, 1); + if (bcount(bh)) + BH_BUG(bh); + bget(bh); } spin_unlock(&free_list[isize].lock); if (!bh) @@ -794,10 +1067,23 @@ init_buffer(bh, end_buffer_io_sync, NULL); bh->b_dev = dev; bh->b_blocknr = block; - bh->b_state = 1 << BH_Mapped; + set_bit(BH_Mapped, &bh->b_state); + if (lock) { + set_bit(BH_Lock, &bh->b_state); + } - /* Insert the buffer into the regular lists */ - insert_into_queues(bh); + /* + * Insert the buffer into the regular lists, handle + * the case where someone else added a bh while we + * were allocating. 
+ */ + tmp = insert_into_queues_atomic(bh, lock); + if (tmp != bh) { + clear_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + bforget(bh); + bh = tmp; + } goto out; /* @@ -811,9 +1097,21 @@ return bh; } -/* -1 -> no need to flush - 0 -> async flush - 1 -> sync flush (wait for I/O completation) */ +struct buffer_head * getblk (kdev_t dev, int block, int size) +{ + return __getblk(dev, block, size, 0); +} + +struct buffer_head * getblk_lock (kdev_t dev, int block, int size) +{ + return __getblk(dev, block, size, 1); +} + +/* + * -1 -> no need to flush + * 0 -> async flush + * 1 -> sync flush (wait for I/O completion) + */ static int balance_dirty_state(kdev_t dev) { unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit; @@ -851,7 +1149,6 @@ static inline void __mark_dirty(struct buffer_head *bh, int flag) { bh->b_flushtime = jiffies + (flag ? bdf_prm.b_un.age_super : bdf_prm.b_un.age_buffer); - clear_bit(BH_New, &bh->b_state); refile_buffer(bh); } @@ -867,6 +1164,7 @@ static __inline__ void __refile_buffer(struct buffer_head *bh) { int dispose = BUF_CLEAN; + if (buffer_locked(bh)) dispose = BUF_LOCKED; if (buffer_dirty(bh)) @@ -892,11 +1190,35 @@ { touch_buffer(buf); - if (atomic_read(&buf->b_count)) { - atomic_dec(&buf->b_count); + if (bcount(buf)) { + bput(buf); return; } - printk("VFS: brelse: Trying to free free buffer\n"); + printk("brelse: Trying to free free buffer\n"); + BUG(); +} + +static inline void __bforget_generic (struct buffer_head * buf, + int destroy_dirty) +{ + /* + * Grab the lru lock here to block bdflush. 
+ */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!bput_and_test(buf) || test_bit(BH_Lock, &buf->b_state)) { + goto in_use; + } + if (!destroy_dirty && test_bit(BH_Dirty, &buf->b_state)) { + goto in_use; + } + clear_bit(BH_Uptodate, &buf->b_state); + clear_bit(BH_Req, &buf->b_state); + clear_bit(BH_Dirty, &buf->b_state); + __unlink_drop_bh(buf); +in_use: + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } /* @@ -907,23 +1229,17 @@ */ void __bforget(struct buffer_head * buf) { - /* grab the lru lock here to block bdflush. */ - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); - if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) - goto in_use; - if (buf->b_pprev) - __hash_unlink(buf); - write_unlock(&hash_table_lock); - __remove_from_lru_list(buf, buf->b_list); - spin_unlock(&lru_list_lock); - buf->b_state = 0; - put_last_free(buf); - return; + __bforget_generic(buf, 1); +} - in_use: - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); +/* + * bdrop() frees the buffer if it does not result in information + * loss (ie. the buffer is not dirty). bdrop() is basically a + * 'free behind' brelse(). + */ +void __bdrop(struct buffer_head * buf) +{ + __bforget_generic(buf, 0); } /* @@ -938,7 +1254,13 @@ if (buffer_uptodate(bh)) return bh; ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); + /* + * ll_rw_block might have slept and someone else might have + * requested the buffer meanwhile - thus re-check the uptodate + * flag. 
+ */ + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); @@ -972,22 +1294,22 @@ index = BUFSIZE_INDEX(bh->b_size); if (buffer_uptodate(bh)) - return(bh); + return(bh); else ll_rw_block(READ, 1, &bh); blocks = (filesize - pos) >> (9+index); if (blocks < (read_ahead[MAJOR(dev)] >> index)) blocks = read_ahead[MAJOR(dev)] >> index; - if (blocks > NBUF) + if (blocks > NBUF) blocks = NBUF; /* if (blocks) printk("breada (new) %d blocks\n",blocks); */ bhlist[0] = bh; j = 1; - for(i=1; i1) - ll_rw_block(READA, (j-1), bhlist+1); - for(i=1; i 1) + ll_rw_block(READA, (j-1), bhlist+1); + for (i = 1; i < j; i++) brelse(bhlist[i]); /* Wait for this buffer, and then continue on. */ bh = bhlist[0]; - wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); return NULL; } -/* - * Note: the caller should wake up the buffer_wait list if needed. - */ -static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) -{ - if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { - kmem_cache_free(bh_cachep, bh); - } else { - bh->b_blocknr = -1; - init_waitqueue_head(&bh->b_wait); - nr_unused_buffer_heads++; - bh->b_next_free = unused_list; - bh->b_this_page = NULL; - unused_list = bh; - } -} - -/* - * Reserve NR_RESERVED buffer heads for async IO requests to avoid - * no-buffer-head deadlock. Return NULL on failure; waiting for - * buffer heads is now handled in create_buffers(). - */ -static struct buffer_head * get_unused_buffer_head(int async) -{ - struct buffer_head * bh; - - spin_lock(&unused_list_lock); - if (nr_unused_buffer_heads > NR_RESERVED) { - bh = unused_list; - unused_list = bh->b_next_free; - nr_unused_buffer_heads--; - spin_unlock(&unused_list_lock); - return bh; - } - spin_unlock(&unused_list_lock); - - /* This is critical. We can't swap out pages to get - * more buffer heads, because the swap-out may need - * more buffer-heads itself. Thus SLAB_BUFFER. 
- */ - if((bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); - return bh; - } - - /* - * If we need an async buffer, use the reserved buffer heads. - */ - if (async) { - spin_lock(&unused_list_lock); - if (unused_list) { - bh = unused_list; - unused_list = bh->b_next_free; - nr_unused_buffer_heads--; - spin_unlock(&unused_list_lock); - return bh; - } - spin_unlock(&unused_list_lock); - } -#if 0 - /* - * (Pending further analysis ...) - * Ordinary (non-async) requests can use a different memory priority - * to free up pages. Any swapping thus generated will use async - * buffer heads. - */ - if(!async && - (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); - return bh; - } -#endif - - return NULL; -} - void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) { bh->b_page = page; @@ -1109,49 +1354,66 @@ * buffers. * The async flag is used to differentiate async IO (paging, swapping) * from ordinary buffer allocations, and only async requests are allowed - * to sleep waiting for buffer heads. + * to sleep waiting for buffer heads. 
*/ static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) { - struct buffer_head *bh, *head; + struct buffer_head *bh, *head, *tail; long offset; try_again: - head = NULL; + head = tail = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = get_unused_buffer_head(async); + bh = __get_unused_bh(async); if (!bh) goto no_grow; - bh->b_dev = B_FREE; /* Flag as unused */ - bh->b_this_page = head; - head = bh; - + bh->b_next = NULL; + bh->b_blocknr = -1; + bh->b_size = size; + bh->b_list = BUF_CLEAN; + bh->b_dev = MKDEV(0,0); /* Flag as unused */ + bh->b_rdev = MKDEV(0,0); /* Flag as unused */ bh->b_state = 0; - bh->b_next_free = NULL; + bh->b_flushtime = 0; + + bh->b_next_free = bh->b_prev_free = NULL; + bh->b_this_page = head; + bh->b_reqnext = NULL; bh->b_pprev = NULL; - atomic_set(&bh->b_count, 0); - bh->b_size = size; set_bh_page(bh, page, offset); - bh->b_list = BUF_CLEAN; bh->b_end_io = end_buffer_io_bad; + bh->b_dev_id = NULL; + bh->b_rsector = -1; + init_waitqueue_head(&bh->b_wait); + + bh_set(bh, 0); + + if (!tail) + tail = bh; + head = bh; } + tail->b_this_page = head; return head; /* * In case anything failed, we just free everything we got. */ no_grow: if (head) { + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); do { bh = head; head = head->b_this_page; - __put_unused_buffer_head(bh); + __put_unused_bh(bh); } while (head); spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); /* Wake up any waiters ... */ wake_up(&buffer_wait); @@ -1160,7 +1422,7 @@ /* * Return failure for non-async IO requests. Async IO requests * are not allowed to fail, so we have to wait until buffer heads - * become available. But we don't want tasks sleeping with + * become available. But we don't want tasks sleeping with * partially complete buffers, so all were released above. */ if (!async) @@ -1169,12 +1431,12 @@ /* We're _really_ low on memory. 
Now we just * wait for old buffer heads to become free due to * finishing IO. Since this is an async request and - * the reserve list is empty, we're sure there are + * the reserve list is empty, we're sure there are * async buffer heads in use. */ run_task_queue(&tq_disk); - /* + /* * Set our state for sleeping, then check again for buffer heads. * This ensures we won't miss a wake_up from an interrupt. */ @@ -1182,7 +1444,19 @@ goto try_again; } -static int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) + +static void __set_page_buffers(struct page *page, struct buffer_head *head) +{ + if (head && page->buffers) + BUG(); + if (!head && !page->buffers) + BUG(); + if (!PageLocked(page)) + BUG(); + page->buffers = head; +} + +static __inline__ int create_page_buffers(int rw, struct page *page, kdev_t dev, int b[], int size) { struct buffer_head *head, *bh, *tail; int block; @@ -1199,32 +1473,38 @@ BUG(); if (!head) BUG(); - tail = head; - for (bh = head; bh; bh = bh->b_this_page) { + bh = head; + do { block = *(b++); tail = bh; - init_buffer(bh, end_buffer_io_async, NULL); + init_buffer(bh, end_buffer_io_page, NULL); bh->b_dev = dev; bh->b_blocknr = block; - + if (!block) + BUG(); set_bit(BH_Mapped, &bh->b_state); - } - tail->b_this_page = head; + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + bh = bh->b_this_page; + } while (bh != head); + + if (tail->b_this_page != head) + BUG(); get_page(page); - page->buffers = head; + __set_page_buffers(page, head); + return 0; } -static void unmap_buffer(struct buffer_head * bh) +static __inline__ void unmap_buffer (struct buffer_head * bh) { - if (buffer_mapped(bh)) - { - mark_buffer_clean(bh); - wait_on_buffer(bh); + if (!PageLocked(bh->b_page)) + BUG(); + if (buffer_mapped(bh)) { clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Mapped, &bh->b_state); clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); } } @@ -1271,10 +1551,19 @@ * instead. 
*/ if (!offset) { - if (!try_to_free_buffers(page)) { - atomic_inc(&buffermem_pages); + /* + * Dont be too agressive dropping cached bhs on + * the same page. + */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + if (!__try_to_free_buffers(page, 0)) { + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); return 0; } + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); } return 1; @@ -1288,37 +1577,21 @@ if (page->buffers) BUG(); - bh = head; + bh = tail = head; do { bh->b_dev = inode->i_dev; bh->b_blocknr = 0; bh->b_end_io = end_buffer_io_bad; + bh_set(bh, 0); tail = bh; bh = bh->b_this_page; - } while (bh); - tail->b_this_page = head; - page->buffers = head; + } while (bh != head); + if (tail->b_this_page != head) + BUG(); + __set_page_buffers(page, head); get_page(page); } -static void unmap_underlying_metadata(struct buffer_head * bh) -{ -#if 0 - if (buffer_new(bh)) { - struct buffer_head *old_bh; - - old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); - if (old_bh) { - unmap_buffer(old_bh); - /* Here we could run brelse or bforget. We use - bforget because it will try to put the buffer - in the freelist. */ - __bforget(old_bh); - } - } -#endif -} - /* * block_write_full_page() is SMP-safe - currently it's still * being called with the kernel lock held, but the code is ready. @@ -1347,6 +1620,7 @@ do { if (!bh) BUG(); + bget(bh); /* * If the buffer isn't up-to-date, we can't be sure @@ -1356,15 +1630,24 @@ * Leave it to the low-level FS to make all those * decisions (block #0 may actually be a valid block) */ - bh->b_end_io = end_buffer_io_sync; if (!buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_sync; err = inode->i_op->get_block(inode, block, bh, 1); - if (err) + clear_bit(BH_New, &bh->b_state); + if (err) { + bput(bh); goto out; - unmap_underlying_metadata(bh); + } + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + /* + * bdflush will take care of it. 
+ */ + clear_bit(BH_Lock, &bh->b_state); } set_bit(BH_Uptodate, &bh->b_state); mark_buffer_dirty(bh,0); + bput(bh); bh = bh->b_this_page; block++; @@ -1385,7 +1668,7 @@ unsigned long block; int err = 0, partial = 0, need_balance_dirty = 0; unsigned blocksize, bbits; - struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait, *tmp; char *kaddr = (char *)kmap(page); blocksize = inode->i_sb->s_blocksize; @@ -1403,41 +1686,68 @@ */ for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { + int newblock; if (!bh) BUG(); - block_end = block_start+blocksize; + block_end = block_start + blocksize; if (block_end <= zerofrom) continue; if (block_start >= to) break; - bh->b_end_io = end_buffer_io_sync; + bget(bh); + newblock = 0; if (!buffer_mapped(bh)) { + bh->b_end_io = end_buffer_io_sync; err = inode->i_op->get_block(inode, block, bh, 1); - if (err) + /* + * We have to clear the New bit before inserting + * into the hash, otherwise bdflush and other + * external cache managers might see and do IO to it. + */ + newblock = test_and_clear_bit(BH_New, &bh->b_state); + if (err) { + while(wait_bh > wait) { + tmp = *--wait_bh; + bput(tmp); + } + bput(bh); goto out; - unmap_underlying_metadata(bh); + } + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + /* + * bdflush will write it out. + */ + clear_bit(BH_Lock, &bh->b_state); } - if (buffer_new(bh)) { + if (newblock) { zeroto = block_end; if (block_start < zerofrom) zerofrom = block_start; + bput(bh); continue; } if (!buffer_uptodate(bh) && (block_start < zerofrom || block_end > to)) { ll_rw_block(READ, 1, &bh); *wait_bh++=bh; - } + } else + bput(bh); } /* * If we issued read requests - let them complete. 
*/ + err = 0; while(wait_bh > wait) { - wait_on_buffer(*--wait_bh); - err = -EIO; - if (!buffer_uptodate(*wait_bh)) - goto out; + tmp = *--wait_bh; + if (!buffer_uptodate(tmp)) + wait_on_buffer(tmp); + if (!buffer_uptodate(tmp)) + err = -EIO; + bput(tmp); } + if (err) + goto out; /* * Now we can copy the data. */ @@ -1470,6 +1780,7 @@ bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; + bget(bh); if (block_end <= zerofrom || block_start >= zeroto) { if (!buffer_uptodate(bh)) partial = 1; @@ -1480,6 +1791,7 @@ need_balance_dirty = 1; } } + bput(bh); } if (need_balance_dirty) @@ -1537,7 +1849,6 @@ return err ? err : bytes; } - /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. @@ -1580,6 +1891,8 @@ kiobuf_wait_for_io(kiobuf); + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); iosize = 0; @@ -1592,10 +1905,12 @@ amount of IO before the first error. */ iosize = 0; } - __put_unused_buffer_head(tmp); + __put_unused_bh(tmp); } spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); if (iosize) return iosize; @@ -1663,7 +1978,7 @@ while (length > 0) { blocknr = b[bufind++]; - tmp = get_unused_buffer_head(0); + tmp = get_unused_bh(); if (!tmp) { err = -ENOMEM; goto error; @@ -1726,11 +2041,14 @@ error: /* We got an error allocating the bh'es. Just free the current buffer_heads and exit. */ + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); spin_lock(&unused_list_lock); - for (i = bhind; --i >= 0; ) { - __put_unused_buffer_head(bh[bhind]); - } + for (i = bhind; --i >= 0; ) + __put_unused_bh(bh[bhind]); spin_unlock(&unused_list_lock); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); goto finished; } @@ -1744,7 +2062,7 @@ * kernel lock held - but the code is ready. 
* * FIXME: we need a swapper_inode->get_block function to remove - * some of the bmap kludges and interface ugliness here. + * some of the bmap kludges and interface ugliness here. */ int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size) { @@ -1753,7 +2071,6 @@ if (!PageLocked(page)) panic("brw_page: page not locked for I/O"); -// clear_bit(PG_error, &page->flags); /* * We pretty much rely on the page lock for this, because * create_page_buffers() might sleep. @@ -1772,20 +2089,24 @@ do { block = *(b++); - if (fresh && (atomic_read(&bh->b_count) != 0)) + if (fresh && bcount(bh)) BUG(); + bget(bh); if (rw == READ) { if (!fresh) BUG(); if (!buffer_uptodate(bh)) { arr[nr++] = bh; - atomic_inc(&bh->b_count); } } else { /* WRITE */ if (!bh->b_blocknr) { if (!block) BUG(); + if (buffer_mapped(bh)) + BUG(); bh->b_blocknr = block; + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); } else { if (!block) BUG(); @@ -1793,8 +2114,8 @@ set_bit(BH_Uptodate, &bh->b_state); set_bit(BH_Dirty, &bh->b_state); arr[nr++] = bh; - atomic_inc(&bh->b_count); } + bput(bh); bh = bh->b_this_page; } while (bh != head); if (rw == READ) @@ -1802,14 +2123,14 @@ if ((rw == READ) && nr) { if (Page_Uptodate(page)) BUG(); - ll_rw_block(rw, nr, arr); + ll_rw_block_locked(rw, nr, arr); } else { if (!nr && rw == READ) { SetPageUptodate(page); UnlockPage(page); } if (nr && (rw == WRITE)) - ll_rw_block(rw, nr, arr); + ll_rw_block_locked(rw, nr, arr); } return 0; } @@ -1843,8 +2164,11 @@ i = 0; do { - if (buffer_uptodate(bh)) + bget(bh); + if (buffer_uptodate(bh)) { + bput(bh); continue; + } if (!buffer_mapped(bh)) { inode->i_op->get_block(inode, iblock, bh, 0); @@ -1853,21 +2177,24 @@ kaddr = kmap(page); memset((char *)(kaddr + i*blocksize), 0, blocksize); set_bit(BH_Uptodate, &bh->b_state); + bput(bh); continue; } + init_buffer(bh, end_buffer_io_page, NULL); + set_bit(BH_Lock, &bh->b_state); + insert_into_queues_exclusive(bh); + arr[nr] = bh; + nr++; } - - 
init_buffer(bh, end_buffer_io_async, NULL); - atomic_inc(&bh->b_count); - arr[nr] = bh; - nr++; } while (i++, iblock++, (bh = bh->b_this_page) != head); ++current->maj_flt; if (nr) { if (Page_Uptodate(page)) BUG(); - ll_rw_block(READ, nr, arr); + ll_rw_block_locked(READ, nr, arr); + for (i = 0; i < nr; i++) + bput(arr[i]); } else { /* * all buffers are uptodate - we can set the page @@ -1929,7 +2256,7 @@ static int grow_buffers(int size) { struct page * page; - struct buffer_head *bh, *tmp; + struct buffer_head *bh, *head, *tail; struct buffer_head * insert_point; int isize; @@ -1941,37 +2268,44 @@ page = alloc_page(GFP_BUFFER); if (!page) goto out; - bh = create_buffers(page, size, 0); - if (!bh) + if (page->mapping) + BUG(); + if (PageLocked(page)) + BUG(); + lock_page(page); + head = create_buffers(page, size, 0); + if (!head) goto no_buffer_head; isize = BUFSIZE_INDEX(size); spin_lock(&free_list[isize].lock); insert_point = free_list[isize].list; - tmp = bh; - while (1) { + tail = bh = head; + do { + bh->b_dev = B_FREE; if (insert_point) { - tmp->b_next_free = insert_point->b_next_free; - tmp->b_prev_free = insert_point; - insert_point->b_next_free->b_prev_free = tmp; - insert_point->b_next_free = tmp; + bh->b_next_free = insert_point->b_next_free; + bh->b_prev_free = insert_point; + insert_point->b_next_free->b_prev_free = bh; + insert_point->b_next_free = bh; } else { - tmp->b_prev_free = tmp; - tmp->b_next_free = tmp; + bh->b_prev_free = bh; + bh->b_next_free = bh; } - insert_point = tmp; - if (tmp->b_this_page) - tmp = tmp->b_this_page; - else - break; - } - tmp->b_this_page = bh; + insert_point = bh; + tail = bh; + bh = bh->b_this_page; + } while (bh != head); + + if (tail->b_this_page != bh) + BUG(); free_list[isize].list = bh; + __set_page_buffers(page, bh); + lru_cache_add(page); spin_unlock(&free_list[isize].lock); + UnlockPage(page); - page->buffers = bh; - lru_cache_add(page); atomic_inc(&buffermem_pages); return 1; @@ -1984,36 +2318,30 @@ /* * Can 
the buffer be thrown out? */ -#define BUFFER_BUSY_BITS ((1<b_count) | ((bh)->b_state & BUFFER_BUSY_BITS)) +#define BUFFER_BUSY_BITS ((1<b_state & BUFFER_BUSY_BITS)) -/* - * try_to_free_buffers() checks if all the buffers on this particular page - * are unused, and free's the page if so. - * - * Wake up bdflush() if this fails - if we're running low on memory due - * to dirty buffers, we need to flush them out as quickly as possible. - * - * NOTE: There are quite a number of ways that threads of control can - * obtain a reference to a buffer head within a page. So we must - * lock out all of these paths to cleanly toss the page. - */ -int try_to_free_buffers(struct page * page) +static int __try_to_free_buffers(struct page * page, int priority) { - struct buffer_head * tmp, * bh = page->buffers; + struct buffer_head *tmp, *p, *bh = page->buffers; int index = BUFSIZE_INDEX(bh->b_size); int ret; - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); + if (!PageLocked(page)) + BUG(); spin_lock(&free_list[index].lock); tmp = bh; do { - struct buffer_head * p = tmp; - + p = tmp; tmp = tmp->b_this_page; if (buffer_busy(p)) goto busy_buffer_page; + /* + * Remove the page only if none of it bhs is uptodate. + * (ie. all bhs got either dropped or invalidated) + */ + if (!priority && test_bit(BH_Uptodate, &bh->b_state)) + goto busy_buffer_page; } while (tmp != bh); spin_lock(&unused_list_lock); @@ -2025,28 +2353,31 @@ /* The buffer can be either on the regular * queues or on the free list.. 
*/ + bget(p); if (p->b_dev == B_FREE) { + if (buffer_mapped(p)) + BH_BUG(p); __remove_from_free_list(p, index); - } else { - if (p->b_pprev) - __hash_unlink(p); - __remove_from_lru_list(p, p->b_list); } - __put_unused_buffer_head(p); + if (buffer_mapped(p)) { + if (p->b_dev == B_FREE) + BH_BUG(p); + __remove_from_queues(p); + } + bput(p); + __put_unused_bh(p); } while (tmp != bh); + __set_page_buffers(page, NULL); + /* And free the page */ spin_unlock(&unused_list_lock); /* Wake up anyone waiting for buffer heads */ wake_up(&buffer_wait); - /* And free the page */ - page->buffers = NULL; __free_page(page); ret = 1; out: spin_unlock(&free_list[index].lock); - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); return ret; busy_buffer_page: @@ -2056,6 +2387,29 @@ goto out; } +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and free's the page if so. + * + * Wake up bdflush() if this fails - if we're running low on memory due + * to dirty buffers, we need to flush them out as quickly as possible. + * + * NOTE: There are quite a number of ways that threads of control can + * obtain a reference to a buffer head within a page. So we must + * lock out all of these paths to cleanly toss the page. 
+ */ +int try_to_free_buffers(struct page * page) +{ + int ret; + + spin_lock(&lru_list_lock); + write_lock(&hash_table_lock); + ret = __try_to_free_buffers(page, 1); + write_unlock(&hash_table_lock); + spin_unlock(&lru_list_lock); + return ret; +} + /* ================== Debugging =================== */ void show_buffers(void) @@ -2087,7 +2441,7 @@ protected++; if (buffer_dirty(bh)) dirty++; - if (atomic_read(&bh->b_count)) + if (bcount(bh)) used++, lastused = found; bh = bh->b_next_free; } while (bh != lru_list[nlist]); @@ -2224,34 +2578,37 @@ { next = bh->b_next_free; - if (!buffer_dirty(bh)) - { + bget(bh); + if (!buffer_dirty(bh)) { __refile_buffer(bh); + bput(bh); continue; } - if (buffer_locked(bh)) + if (buffer_locked(bh)) { + bput(bh); continue; + } - if (check_flushtime) - { + if (check_flushtime) { /* The dirty lru list is chronogical ordered so if the current bh is not yet timed out, then also all the following bhs will be too young. */ - if (time_before(jiffies, bh->b_flushtime)) + if (time_before(jiffies, bh->b_flushtime)) { + bput(bh); goto out_unlock; - } - else - { - if (++flushed > bdf_prm.b_un.ndirty) + } + } else { + if (++flushed > bdf_prm.b_un.ndirty) { + bput(bh); goto out_unlock; + } } /* OK, now we are committed to write it out. */ - atomic_inc(&bh->b_count); spin_unlock(&lru_list_lock); ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); + bput(bh); if (current->need_resched) schedule(); @@ -2261,10 +2618,10 @@ spin_unlock(&lru_list_lock); } -/* - * Here we attempt to write back old buffers. We also try to flush inodes - * and supers as well, since this function is essentially "update", and - * otherwise there would be no way of ensuring that these quantities ever +/* + * Here we attempt to write back old buffers. We also try to flush inodes + * and supers as well, since this function is essentially "update", and + * otherwise there would be no way of ensuring that these quantities ever * get written back. 
Ideally, we would have a timestamp on the inodes * and superblocks so that we could write back only the old ones as well */ @@ -2283,8 +2640,8 @@ } /* This is the interface to bdflush. As we get more sophisticated, we can - * pass tuning parameters to this "process", to adjust how it behaves. - * We would want to verify each parameter, however, to make sure that it + * pass tuning parameters to this "process", to adjust how it behaves. + * We would want to verify each parameter, however, to make sure that it * is reasonable. */ asmlinkage long sys_bdflush(int func, long data) @@ -2329,7 +2686,7 @@ } /* Having func 0 used to launch the actual bdflush and then never - * return (unless explicitly killed). We return zero here to + * return (unless explicitly killed). We return zero here to * remain semi-compatible with present update(8) programs. */ return 0; @@ -2340,12 +2697,12 @@ * the syscall above, but now we launch it ourselves internally with * kernel_thread(...) directly after the first thread in init/main.c */ -int bdflush(void * unused) +int bdflush(void * unused) { /* * We have a bare-bones task_struct, and really should fill * in a few more things so "top" and /proc/2/{exe,root,cwd} - * display semi-sane things. Not real crucial though... + * display semi-sane things. Not real crucial though... */ current->session = 1; @@ -2394,7 +2751,7 @@ * You don't need to change your userspace configuration since * the userspace `update` will do_exit(0) at the first sys_bdflush(). 
*/ -int kupdate(void * unused) +int kupdate(void * unused) { struct task_struct * tsk = current; int interval; @@ -2452,6 +2809,7 @@ kernel_thread(kupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); return 0; } + module_init(bdflush_init) --- linux/fs/inode.c.orig Sun Jan 16 06:38:26 2000 +++ linux/fs/inode.c Sun Jan 16 17:45:52 2000 @@ -494,6 +494,7 @@ if (inode) { spin_lock(&inode_lock); + INIT_LIST_HEAD(&inode->i_list); list_add(&inode->i_list, &inode_in_use); inode->i_sb = NULL; inode->i_dev = 0; @@ -525,7 +526,9 @@ /* We released the lock, so.. */ old = find_inode(sb, ino, head, find_actor, opaque); if (!old) { + INIT_LIST_HEAD(&inode->i_list); list_add(&inode->i_list, &inode_in_use); + INIT_LIST_HEAD(&inode->i_hash); list_add(&inode->i_hash, head); inode->i_sb = sb; inode->i_dev = sb->s_dev; @@ -722,15 +725,19 @@ int bmap(struct inode * inode, int block) { - struct buffer_head tmp; + struct buffer_head *tmp; + int ret = 0; if (inode->i_op && inode->i_op->get_block) { - tmp.b_state = 0; - tmp.b_blocknr = 0; - inode->i_op->get_block(inode, block, &tmp, 0); - return tmp.b_blocknr; + tmp = get_unused_bh(); + tmp->b_state = 0; + tmp->b_blocknr = 0; + bh_set(tmp, 0); + inode->i_op->get_block(inode, block, tmp, 0); + ret = tmp->b_blocknr; + put_unused_bh(tmp); } - return 0; + return ret; } /* --- linux/fs/ioctl.c.orig Mon Aug 23 20:15:53 1999 +++ linux/fs/ioctl.c Sun Jan 16 17:45:52 2000 @@ -19,7 +19,8 @@ switch (cmd) { case FIBMAP: { - struct buffer_head tmp; + int ret; + struct buffer_head *tmp; if (inode->i_op == NULL) return -EBADF; @@ -30,10 +31,15 @@ if ((error = get_user(block, (int *) arg)) != 0) return error; - tmp.b_state = 0; - tmp.b_blocknr = 0; - inode->i_op->get_block(inode, block, &tmp, 0); - return put_user(tmp.b_blocknr, (int *) arg); + tmp = get_unused_bh(); + tmp->b_state = 0; + tmp->b_blocknr = 0; + bh_set(tmp, 1); + + inode->i_op->get_block(inode, block, tmp, 0); + ret = put_user(tmp->b_blocknr, (int *) arg); + put_unused_bh(tmp); + return 
ret; } case FIGETBSZ: if (inode->i_sb == NULL) --- linux/fs/dcache.c.orig Sun Jan 16 06:38:25 2000 +++ linux/fs/dcache.c Sun Jan 16 17:45:52 2000 @@ -458,12 +458,12 @@ dentry->d_inode = NULL; dentry->d_parent = NULL; dentry->d_sb = NULL; + INIT_LIST_HEAD(&dentry->d_child); if (parent) { dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; list_add(&dentry->d_child, &parent->d_subdirs); - } else - INIT_LIST_HEAD(&dentry->d_child); + } dentry->d_mounts = dentry; dentry->d_covers = dentry; --- linux/init/main.c.orig Sun Jan 16 06:38:25 2000 +++ linux/init/main.c Sun Jan 16 17:45:52 2000 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -674,6 +675,9 @@ while (pid != wait(&i)); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); +#endif error = change_root(real_root_dev,"/initrd"); if (error) printk(KERN_ERR "Change root to /initrd: " --- linux/kernel/ksyms.c.orig Sun Jan 16 06:38:28 2000 +++ linux/kernel/ksyms.c Sun Jan 16 17:45:52 2000 @@ -263,8 +263,8 @@ EXPORT_SYMBOL(ioctl_by_bdev); EXPORT_SYMBOL(gendisk_head); EXPORT_SYMBOL(resetup_one_dev); -EXPORT_SYMBOL(unplug_device); -EXPORT_SYMBOL(make_request); +EXPORT_SYMBOL(generic_unplug_device); +EXPORT_SYMBOL(generic_make_request); EXPORT_SYMBOL(tq_disk); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(refile_buffer); @@ -381,7 +381,6 @@ EXPORT_SYMBOL(kdevname); EXPORT_SYMBOL(bdevname); EXPORT_SYMBOL(cdevname); -EXPORT_SYMBOL(partition_name); /* md.c only */ EXPORT_SYMBOL(simple_strtoul); EXPORT_SYMBOL(system_utsname); /* UTS data */ EXPORT_SYMBOL(uts_sem); /* UTS semaphore */ --- linux/mm/slab.c.orig Sun Jan 16 06:38:28 2000 +++ linux/mm/slab.c Sun Jan 16 17:45:52 2000 @@ -535,6 +535,7 @@ */ while (i--) { PageClearSlab(page); + INIT_LIST_HEAD(&page->list); page++; } free_pages((unsigned long)addr, cachep->c_gfporder); --- linux/mm/filemap.c.orig Sun Jan 16 06:38:28 2000 +++ linux/mm/filemap.c Sun Jan 16 17:45:52 
2000 @@ -224,40 +224,48 @@ spin_lock(&pagemap_lru_lock); - while (count > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + while (count-- > 0 && (page_lru = lru_cache.prev) != &lru_cache) { + page = list_entry(page_lru, struct page, lru); - list_del(page_lru); + + /* avoid unscalable SMP locking */ + if (!page->buffers && page_count(page) > 1) + continue; + /* + * We do this first because this synchronizes the page + * lock with LRU-removal, needed by try_to_free_buffers(). + */ + if (TryLockPage(page)) + continue; dispose = &lru_cache; + list_del(page_lru); + if (test_and_clear_bit(PG_referenced, &page->flags)) /* Roll the page at the top of the lru list, * we could also be more aggressive putting * the page in the young-dispose-list, so * avoiding to free young pages in each pass. */ - goto dispose_continue; + goto dispose_unlock_noput_continue; dispose = &old; /* don't account passes over not DMA pages */ if (zone && (!memclass(page->zone, zone))) - goto dispose_continue; + goto dispose_unlock_noput_continue; - count--; + + /* avoid freeing the page while it's locked */ + get_page(page); + spin_unlock(&pagemap_lru_lock); dispose = &young; - if (TryLockPage(page)) - goto dispose_continue; /* Release the pagemap_lru lock even if the page is not yet queued in any lru queue since we have just locked down the page so nobody else may SMP race with us running a lru_cache_del() (lru_cache_del() always run with the page locked down ;). */ - spin_unlock(&pagemap_lru_lock); - - /* avoid unscalable SMP locking */ - if (!page->buffers && page_count(page) > 1) - goto unlock_noput_continue; /* Take the pagecache_lock spinlock held to avoid other tasks to notice the page while we are looking at its @@ -265,8 +273,8 @@ in one atomic transaction after checking its page count. */ spin_lock(&pagecache_lock); - /* avoid freeing the page while it's locked */ - get_page(page); + if (page_count(page) == 1) + BUG(); /* Is it a buffer page? 
*/ if (page->buffers) { @@ -274,10 +282,8 @@ if (!try_to_free_buffers(page)) goto unlock_continue; /* page was locked, inode can't go away under us */ - if (!page->mapping) { - atomic_dec(&buffermem_pages); + if (!page->mapping) goto made_buffer_progress; - } spin_lock(&pagecache_lock); } @@ -312,6 +318,8 @@ goto cache_unlock_continue; } + if (page->buffers || page->mapping) + printk("huh?\n"); dispose = &forget; printk(KERN_ERR "shrink_mmap: unknown LRU page!\n"); @@ -320,21 +328,18 @@ unlock_continue: UnlockPage(page); put_page(page); -dispose_relock_continue: /* even if the dispose list is local, a truncate_inode_page() may remove a page from its queue so always synchronize with the lru lock while accesing the page->lru field */ spin_lock(&pagemap_lru_lock); +dispose_continue: list_add(page_lru, dispose); continue; - -unlock_noput_continue: +dispose_unlock_noput_continue: + count++; UnlockPage(page); - goto dispose_relock_continue; - -dispose_continue: - list_add(page_lru, dispose); + goto dispose_continue; } goto out; @@ -386,8 +391,11 @@ { struct buffer_head *bh, *head = page->buffers; + if (!PageLocked(page)) + BUG(); bh = head; do { + bget(bh); if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh)) continue; @@ -402,11 +410,14 @@ int error = 0; struct buffer_head *bh, *head = page->buffers; + if (!PageLocked(page)) + BUG(); bh = head; do { wait_on_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) error = -EIO; + bput(bh); } while ((bh = bh->b_this_page) != head); return error; } @@ -470,7 +481,6 @@ struct address_space *mapping, unsigned long offset, struct page **hash) { - struct page *alias; unsigned long flags; flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced)); @@ -480,9 +490,6 @@ add_page_to_inode_queue(mapping, page); __add_page_to_hash_queue(page, hash); lru_cache_add(page); - alias = __find_page_nolock(mapping, offset, *hash); - if (alias != page) - BUG(); } void add_to_page_cache(struct page * 
page, struct address_space * mapping, unsigned long offset) --- linux/mm/page_alloc.c.orig Sun Jan 16 06:38:25 2000 +++ linux/mm/page_alloc.c Sun Jan 16 17:45:52 2000 @@ -78,11 +78,11 @@ return; if (page-mem_map >= max_mapnr) - BUG(); + PAGE_BUG(page); if (PageSwapCache(page)) - BUG(); + PAGE_BUG(page); if (PageLocked(page)) - BUG(); + PAGE_BUG(page); zone = page->zone; @@ -562,7 +562,7 @@ memlist_init(&p->list); } - offset = lmem_map - mem_map; + offset = lmem_map - mem_map; for (j = 0; j < MAX_NR_ZONES; j++) { zone_t *zone = pgdat->node_zones + j; unsigned long mask = -1; @@ -570,7 +570,7 @@ size = zones_size[j]; - printk("zone(%ld): %ld pages.\n", j, size); + printk("zone(%ld): %ld pages, offset %d.\n", j, size, offset); zone->size = size; zone->name = zone_names[j]; zone->lock = SPIN_LOCK_UNLOCKED; --- linux/include/linux/genhd.h.orig Fri Oct 15 18:29:44 1999 +++ linux/include/linux/genhd.h Sun Jan 16 17:45:52 2000 @@ -19,6 +19,7 @@ #define WIN98_EXTENDED_PARTITION 0x0f #define LINUX_SWAP_PARTITION 0x82 +#define LINUX_RAID_PARTITION 0xfd /* autodetect RAID partition */ #ifdef CONFIG_SOLARIS_X86_PARTITION #define SOLARIS_X86_PARTITION LINUX_SWAP_PARTITION @@ -45,6 +46,7 @@ struct hd_struct { long start_sect; long nr_sects; + int type; /* currently RAID or normal */ }; struct gendisk { --- linux/include/linux/raid/linear.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/linear.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,32 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + kdev_t dev; + int size; + unsigned int offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/hsm_p.h.orig Sun 
Jan 16 11:26:02 2000 +++ linux/include/linux/raid/hsm_p.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,237 @@ +#ifndef _HSM_P_H +#define _HSM_P_H + +#define HSM_BLOCKSIZE 4096 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4) +#define PACKED __attribute__ ((packed)) + +/* + * Identifies a block in physical space + */ +typedef struct phys_idx_s { + __u16 phys_nr; + __u32 phys_block; + +} PACKED phys_idx_t; + +/* + * Identifies a block in logical space + */ +typedef struct log_idx_s { + __u16 log_id; + __u32 log_index; + +} PACKED log_idx_t; + +/* + * Describes one PV + */ +#define HSM_PV_SB_MAGIC 0xf091ae9fU + +#define HSM_PV_SB_GENERIC_WORDS 32 +#define HSM_PV_SB_RESERVED_WORDS \ + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS) + +/* + * On-disk PV identification data, on block 0 in any PV. + */ +typedef struct pv_sb_s +{ + __u32 pv_magic; /* 0 */ + + __u32 pv_uuid0; /* 1 */ + __u32 pv_uuid1; /* 2 */ + __u32 pv_uuid2; /* 3 */ + __u32 pv_uuid3; /* 4 */ + + __u32 pv_major; /* 5 */ + __u32 pv_minor; /* 6 */ + __u32 pv_patch; /* 7 */ + + __u32 pv_ctime; /* 8 Creation time */ + + __u32 pv_total_size; /* 9 size of this PV, in blocks */ + __u32 pv_first_free; /* 10 first free block */ + __u32 pv_first_used; /* 11 first used block */ + __u32 pv_blocks_left; /* 12 unallocated blocks */ + __u32 pv_bg_size; /* 13 size of a block group, in blocks */ + __u32 pv_block_size; /* 14 size of blocks, in bytes */ + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */ + __u32 pv_block_groups; /* 16 number of block groups */ + + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17]; + + /* + * Reserved + */ + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS]; + +} PACKED pv_sb_t; + +/* + * this is pretty much arbitrary, but has to be less than ~64 + */ +#define HSM_MAX_LVS_PER_VG 32 + +#define HSM_VG_SB_GENERIC_WORDS 32 + +#define LV_DESCRIPTOR_WORDS 8 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \ + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS) + +#if 
(HSM_PV_SB_RESERVED_WORDS < 0) +#error you messed this one up dude ... +#endif + +typedef struct lv_descriptor_s +{ + __u32 lv_id; /* 0 */ + phys_idx_t lv_root_idx; /* 1 */ + __u16 __reserved; /* 2 */ + __u32 lv_max_indices; /* 3 */ + __u32 lv_free_indices; /* 4 */ + __u32 md_id; /* 5 */ + + __u32 reserved[LV_DESCRIPTOR_WORDS - 6]; + +} PACKED lv_descriptor_t; + +#define HSM_VG_SB_MAGIC 0x98320d7aU +/* + * On-disk VG identification data, in block 1 on all PVs + */ +typedef struct vg_sb_s +{ + __u32 vg_magic; /* 0 */ + __u32 nr_lvs; /* 1 */ + + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2]; + + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG]; + /* + * Reserved + */ + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS]; + +} PACKED vg_sb_t; + +/* + * Describes one LV + */ + +#define HSM_LV_SB_MAGIC 0xe182bd8aU + +/* do we need lv_sb_t? */ + +typedef struct lv_sb_s +{ + /* + * On-disk LV identifier + */ + __u32 lv_magic; /* 0 LV identifier */ + __u32 lv_uuid0; /* 1 */ + __u32 lv_uuid1; /* 2 */ + __u32 lv_uuid2; /* 3 */ + __u32 lv_uuid3; /* 4 */ + + __u32 lv_major; /* 5 PV identifier */ + __u32 lv_minor; /* 6 PV identifier */ + __u32 lv_patch; /* 7 PV identifier */ + + __u32 ctime; /* 8 Creation time */ + __u32 size; /* 9 size of this LV, in blocks */ + phys_idx_t start; /* 10 position of root index block */ + log_idx_t first_free; /* 11-12 first free index */ + + /* + * Reserved + */ + __u32 reserved[HSM_BLOCKSIZE_WORDS-13]; + +} PACKED lv_sb_t; + +/* + * Pointer pointing from the physical space, points to + * the LV owning this block. It also contains various + * statistics about the physical block. 
+ */ +typedef struct pv_pptr_s +{ + union { + /* case 1 */ + struct { + log_idx_t owner; + log_idx_t predicted; + __u32 last_referenced; + } used; + /* case 2 */ + struct { + __u16 log_id; + __u16 __unused1; + __u32 next_free; + __u32 __unused2; + __u32 __unused3; + } free; + } u; +} PACKED pv_pptr_t; + +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr) +{ + return !pptr->u.free.log_id; +} + + +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1)) + +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1) +/* + * A table of pointers filling up a single block, managing + * the next DATA_BLOCKS_PER_BG physical blocks. Such block + * groups form the physical space of blocks. + */ +typedef struct pv_block_group_s +{ + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8]; + + pv_pptr_t blocks[DATA_BLOCKS_PER_BG]; + +} PACKED pv_block_group_t; + +/* + * Pointer from the logical space, points to + * the (PV,block) containing this logical block + */ +typedef struct lv_lptr_s +{ + phys_idx_t data; + __u16 __reserved; + __u32 cpu_addr; + __u32 __reserved2; + +} PACKED lv_lptr_t; + +static __inline__ int index_free (const lv_lptr_t * index) +{ + return !index->data.phys_block; +} + +static __inline__ int index_present (const lv_lptr_t * index) +{ + return index->cpu_addr; +} + + +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t)) +/* + * A table of pointers filling up a single block, managing + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form + * the logical space of blocks. 
+ */ +typedef struct lv_index_block_s +{ + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK]; + +} PACKED lv_index_block_t; + +#endif + --- linux/include/linux/raid/md.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,95 @@ +/* + md.h : Multiple Devices driver for Linux + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + Copyright (C) 1994-96 Marc ZYNGIER + or + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_H +#define _MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. 
+ */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +extern int md_size[MAX_MD_DEVS]; +extern struct hd_struct md_hd_struct[MAX_MD_DEVS]; + +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * partition_name (kdev_t dev); +extern int register_md_personality (int p_num, mdk_personality_t *p); +extern int unregister_md_personality (int p_num); +extern mdk_thread_t * md_register_thread (void (*run) (void *data), + void *data, const char *name); +extern void md_unregister_thread (mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_interrupt_thread (mdk_thread_t *thread); +extern int md_update_sb (mddev_t *mddev); +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void md_recover_arrays (void); +extern int md_check_ordering (mddev_t *mddev); +extern void autodetect_raid(void); +extern struct gendisk * find_gendisk (kdev_t dev); +extern int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x); +extern int md_error (kdev_t mddev, kdev_t rdev); + +#ifdef CONFIG_BLK_DEV_MD +extern void raid_setup(char *str,int *ints) md__init; +#endif + +extern void md_print_devices (void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +#endif /* _MD_H */ + --- linux/include/linux/raid/md_compatible.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_compatible.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,551 @@ + +/* + md_compatible.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2 + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version.
+ + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#ifndef _MD_COMPATIBLE_H +#define _MD_COMPATIBLE_H + +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s)) + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return x86_capability & 0x00800000; +} +#endif + +/* 002 */ +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE) + +/* 003 */ +/* + * someone please suggest a sane compatibility layer for modules + */ +#define MD_EXPORT_SYMBOL(x) + +/* 004 */ +static inline unsigned long +md_copy_from_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_READ,from,n); + if (!err) + memcpy_fromfs(to, from, n); + return err; +} + +/* 005 */ +extern inline unsigned long +md_copy_to_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_WRITE,to,n); + if (!err) + memcpy_tofs(to, from, n); + return err; +} + +/* 006 */ +#define md_put_user(x,ptr) \ +({ \ + int __err; \ + \ + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \ + if (!__err) \ + put_user(x,ptr); \ + __err; \ +}) + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return suser(); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + current->signal = 0; +} + +/* 010 */ +#define __S(nr) (1<<((nr)-1)) +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + current->blocked = ~(__S(SIGKILL)); +} +#undef __S + +/* 011 */ +extern inline unsigned long md_signal_pending (struct task_struct * tsk) +{ + return (tsk->signal & ~tsk->blocked); +} + +/* 012 */ +#define md_set_global_readahead(x) 
read_ahead[MD_MAJOR] = MD_READAHEAD + +/* 013 */ +#define md_mdelay(n) (\ + {unsigned long msec=(n); while (msec--) udelay(1000);}) + +/* 014 */ +#define MD_SYS_DOWN 0 +#define MD_SYS_HALT 0 +#define MD_SYS_POWER_OFF 0 + +/* 015 */ +#define md_register_reboot_notifier(x) + +/* 016 */ +extern __inline__ unsigned long +md_test_and_set_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + set_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 017 */ +extern __inline__ unsigned long +md_test_and_clear_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + clear_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 018 */ +#define md_atomic_read(x) (*(volatile int *)(x)) +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y)) + +/* 019 */ +extern __inline__ void md_lock_kernel (void) +{ +#if __SMP__ + lock_kernel(); + syscall_count++; +#endif +} + +extern __inline__ void md_unlock_kernel (void) +{ +#if __SMP__ + syscall_count--; + unlock_kernel(); +#endif +} +/* 020 */ + +#define md__init +#define md__initdata +#define md__initfunc(__arginit) __arginit + +/* 021 */ + +/* 022 */ + +struct md_list_head { + struct md_list_head *next, *prev; +}; + +#define MD_LIST_HEAD(name) \ + struct md_list_head name = { &name, &name } + +#define MD_INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +static __inline__ void md__list_add(struct md_list_head * new, + struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static __inline__ void md_list_add(struct md_list_head *new, + struct md_list_head *head) +{ + md__list_add(new, head, head->next); +} + +static __inline__ void md__list_del(struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = prev; + 
prev->next = next; +} + +static __inline__ void md_list_del(struct md_list_head *entry) +{ + md__list_del(entry->prev, entry->next); +} + +static __inline__ int md_list_empty(struct md_list_head *head) +{ + return head->next == head; +} + +#define md_list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/* 023 */ + +static __inline__ signed long md_schedule_timeout(signed long timeout) +{ + current->timeout = jiffies + timeout; + schedule(); + return 0; +} + +/* 024 */ +#define md_need_resched(tsk) (need_resched) + +/* 025 */ +typedef struct { int gcc_is_buggy; } md_spinlock_t; +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 } + +#define md_spin_lock() do { } while (0) +#define md_spin_unlock() do { } while (0) +#define md_spin_lock_irq cli +#define md_spin_unlock_irq sti +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags) +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0) + +/* 026 */ +typedef struct wait_queue * md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) \ + struct wait_queue (w) = { (t), NULL } +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) \ + static struct wait_queue *x = (struct wait_queue *)NULL +#define md_init_waitqueue_head init_waitqueue + +/* END */ + +#else + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,3,0) + +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define 
MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(&current->sigmask_lock); + flush_signals(current); + spin_unlock(&current->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(&current->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define md_spin_lock spin_lock +#define md_spin_unlock spin_unlock +#define md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* 026
*/ +typedef struct wait_queue * md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) \ + struct wait_queue (w) = { (t), NULL } +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) \ + static struct wait_queue *x = (struct wait_queue *)NULL +#define md_init_waitqueue_head init_waitqueue + +/* END */ + +#else + +/** 2.3/2.4 stuff: **/ + +#include +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(&current->sigmask_lock); + flush_signals(current); + spin_unlock(&current->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(&current->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define
md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define md_spin_lock spin_lock +#define md_spin_unlock spin_unlock +#define md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* 026 */ +typedef wait_queue_head_t md_wait_queue_head_t; +#define MD_DECLARE_WAITQUEUE(w,t) DECLARE_WAITQUEUE((w),(t)) +#define MD_DECLARE_WAIT_QUEUE_HEAD(x) DECLARE_WAIT_QUEUE_HEAD(x) +#define md_init_waitqueue_head init_waitqueue_head + +/* END */ + +#endif + +#endif + +#endif /* _MD_COMPATIBLE_H */ + --- linux/include/linux/raid/md_k.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_k.h Sun Jan 16 17:45:52 2000 @@ -0,0 +1,341 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#define MD_RESERVED 0UL +#define LINEAR 1UL +#define STRIPED 2UL +#define RAID0 STRIPED +#define RAID1 3UL +#define RAID5 4UL +#define TRANSLUCENT 5UL +#define HSM 6UL +#define MAX_PERSONALITY 7UL + +extern inline int pers_to_level (int pers) +{ + switch (pers) { + case HSM: return -3; + case TRANSLUCENT: return -2; + case LINEAR: return -1; + case RAID0: return 0; + case RAID1: return 1; + case RAID5: return 5; + } + panic("pers_to_level()"); +} + +extern inline int level_to_pers (int level) +{ + switch (level) { + case -3: return HSM; + case -2: return TRANSLUCENT; + case -1: return LINEAR; + case 0: return RAID0; + case 1: return RAID1; + case 4: + case 5: return RAID5; + } + return MD_RESERVED; +} + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +#if (MINORBITS != 8) +#error MD doesnt handle bigger kdev yet +#endif + +#define MAX_REAL 12 /* Max number of disks per md dev */ +#define MAX_MD_DEVS (1<state & (1 << MD_DISK_FAULTY); +} + +extern inline int disk_active(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_ACTIVE); +} + +extern inline int disk_sync(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_SYNC); +} + +extern inline int disk_spare(mdp_disk_t * d) +{ + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); +} + +extern inline int disk_removed(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_faulty(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_FAULTY); +} + +extern inline void mark_disk_active(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_sync(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_SYNC); +} + +extern inline void mark_disk_spare(mdp_disk_t * d) +{ + d->state = 0; +} + +extern inline void mark_disk_removed(mdp_disk_t * d) +{ + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_inactive(mdp_disk_t * d) +{ + d->state &= ~(1 << 
MD_DISK_ACTIVE); +} + +extern inline void mark_disk_nonsync(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_SYNC); +} + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct md_list_head same_set; /* RAID devices within the same set */ + struct md_list_head all; /* all RAID devices */ + struct md_list_head pending; /* undetected RAID devices */ + + kdev_t dev; /* Device number */ + kdev_t old_dev; /* "" when it was last imported */ + int size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + unsigned long last_events; /* IO event timestamp */ + + struct inode *inode; /* Lock inode */ + struct file filp; /* Lock file */ + + mdp_super_t *sb; + int sb_offset; + + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_REMOVE_DISK 3 +#define DISKOP_HOT_ADD_DISK 4 + +typedef struct mdk_personality_s mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + int __minor; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + mdu_param_t param; + int ro; + unsigned int curr_resync; + unsigned long resync_start; + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + struct md_list_head all_mddevs; + request_queue_t queue; +}; + +struct mdk_personality_s +{ + char *name; + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); + void (*end_request)(struct buffer_head * bh, int uptodate); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*ioctl)(struct inode *inode, struct file *file, + unsigned int cmd, 
unsigned long arg); + int max_invalid_dev; + int (*error_handler)(mddev_t *mddev, kdev_t dev); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); +}; + + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. + */ +extern inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +extern inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. 
+ */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = find_rdev_nr(mddev, i), i < (mddev)->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +extern inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +extern inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = (x); \ + (x) = (y); (y) = __tmp; } while (0) + +typedef struct mdk_thread_s { + void (*run) (void *data); + void *data; + md_wait_queue_head_t wqueue; + unsigned long flags; + struct semaphore *sem; + struct task_struct *tsk; + const char *name; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +#define MAX_DISKNAME_LEN 32 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char name [MAX_DISKNAME_LEN]; +} dev_name_t; + +#endif /* _MD_K_H */ + --- linux/include/linux/raid/md_p.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_p.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,161 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version.
+ + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 991 27 32-words descriptors of the disks in the raid set. + * (no reserved words are left over with 27 descriptors) + * 992 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y.
+ */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) (((x) & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0
+#define MD_SB_ERRORS 1 + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ + __u64 events; /* 7 number of superblock updates (64-bit!)
*/ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#endif /* _MD_P_H */ + --- linux/include/linux/raid/md_u.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/md_u.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif /* _MD_U_H */ + --- linux/include/linux/raid/raid0.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid0.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,33 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + int zone_offset; /* Zone offset in md_dev */ + int dev_offset; /* Zone offset in real dev */ + int size; /* Zone size */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) (mddev)->private) + +#endif --- linux/include/linux/raid/raid1.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid1.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,65 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +struct mirror_info {
+ int number; + int raid_disk; + kdev_t dev; + int next; + int sect_limit; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + mdk_thread_t *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; + md_spinlock_t device_lock; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head bh_req; + struct buffer_head *next_retry; +}; + +#endif --- linux/include/linux/raid/raid5.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/raid5.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,115 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +struct disk_info { + kdev_t dev; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct stripe_head { + md_spinlock_t stripe_lock; + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct stripe_head *free_next; /* pool of free sh's */ + struct buffer_head *buffer_pool; /* pool of free buffers */ + struct buffer_head *bh_pool; /* pool of free bh's */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image 
*/ + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + atomic_t nr_pending; /* nr of pending cmds */ + unsigned long state; /* state flags */ + int cmd; /* stripe cmd */ + atomic_t count; /* nr of waiters */ + int write_method; /* reconstruct-write / read-modify-write */ + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ + md_wait_queue_head_t wait; /* processes waiting for this stripe */ +}; + +/* + * Phase + */ +#define PHASE_BEGIN 0 +#define PHASE_READ_OLD 1 +#define PHASE_WRITE 2 +#define PHASE_READ 3 +#define PHASE_COMPLETE 4 + +/* + * Write method + */ +#define METHOD_NONE 0 +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 + +/* + * Stripe state + */ +#define STRIPE_LOCKED 0 +#define STRIPE_ERROR 1 + +/* + * Stripe commands + */ +#define STRIPE_NONE 0 +#define STRIPE_WRITE 1 +#define STRIPE_READ 2 + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + mdk_thread_t *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int sector_count; + unsigned long next_sector; + atomic_t nr_handle; + struct stripe_head *next_free_stripe; + atomic_t nr_stripes; + int resync_parity; + int max_nr_stripes; + int clock; + atomic_t nr_hashed_stripes; + atomic_t nr_locked_stripes; + atomic_t nr_pending_stripes; + atomic_t nr_cached_stripes; + + /* + * Free stripes pool + */ + atomic_t 
nr_free_sh; + struct stripe_head *free_sh_list; + md_wait_queue_head_t wait_for_stripe; + + md_spinlock_t device_lock; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif --- linux/include/linux/raid/translucent.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/translucent.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,23 @@ +#ifndef _TRANSLUCENT_H +#define _TRANSLUCENT_H + +#include + +typedef struct dev_info dev_info_t; + +struct dev_info { + kdev_t dev; + int size; +}; + +struct translucent_private_data +{ + dev_info_t disks[MD_SB_DISKS]; +}; + + +typedef struct translucent_private_data translucent_conf_t; + +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/xor.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/xor.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,12 @@ +#ifndef _XOR_H +#define _XOR_H + +#include + +#define MAX_XOR_BLOCKS 5 + +extern void calibrate_xor_block(void); +extern void (*xor_block)(unsigned int count, + struct buffer_head **bh_ptr); + +#endif --- linux/include/linux/raid/hsm.h.orig Sun Jan 16 11:26:02 2000 +++ linux/include/linux/raid/hsm.h Sun Jan 16 17:45:53 2000 @@ -0,0 +1,65 @@ +#ifndef _HSM_H +#define _HSM_H + +#include + +#if __alpha__ +#error fix cpu_addr on Alpha first +#endif + +#include + +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr) +#define index_dev(lv,index) index_pv((lv),(index))->dev +#define index_block(lv,index) (index)->data.phys_block +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr)) + +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr)) + + +typedef struct pv_bg_desc_s { + unsigned int free_blocks; + pv_block_group_t *bg; +} pv_bg_desc_t; + +typedef struct pv_s pv_t; 
+typedef struct vg_s vg_t; +typedef struct lv_s lv_t; + +struct pv_s +{ + int phys_nr; + kdev_t dev; + pv_sb_t *pv_sb; + pv_bg_desc_t *bg_array; +}; + +struct lv_s +{ + int log_id; + vg_t *vg; + + unsigned int max_indices; + unsigned int free_indices; + lv_lptr_t root_index; + + kdev_t dev; +}; + +struct vg_s +{ + int nr_pv; + pv_t pv_array [MD_SB_DISKS]; + + int nr_lv; + lv_t lv_array [HSM_MAX_LVS_PER_VG]; + + vg_sb_t *vg_sb; + mddev_t *mddev; +}; + +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data) +#define mddev_to_vg(mddev) ((vg_t *) mddev->private) + +#endif + --- linux/include/linux/sysctl.h.orig Sun Jan 16 06:38:26 2000 +++ linux/include/linux/sysctl.h Sun Jan 16 17:45:53 2000 @@ -504,7 +504,8 @@ enum { DEV_CDROM=1, DEV_HWMON=2, - DEV_PARPORT=3 + DEV_PARPORT=3, + DEV_MD=4 }; /* /proc/sys/dev/cdrom */ @@ -544,6 +545,11 @@ /* /proc/sys/dev/parport/parport n/devices/device n */ enum { DEV_PARPORT_DEVICE_TIMESLICE=1, +}; + +/* /proc/sys/dev/md */ +enum { + DEV_MD_SPEED_LIMIT=1 }; #ifdef __KERNEL__ --- linux/include/linux/blkdev.h.orig Sun Jan 16 06:38:16 2000 +++ linux/include/linux/blkdev.h Sun Jan 16 17:45:53 2000 @@ -36,14 +36,17 @@ }; typedef struct request_queue request_queue_t; -typedef int (merge_request_fn) (request_queue_t *, - struct request * req, - struct buffer_head *); -typedef int (merge_requests_fn) (request_queue_t *, - struct request * req, - struct request * req2); -typedef void (request_fn_proc) (request_queue_t *); +typedef int (merge_request_fn) (request_queue_t *q, + struct request *req, + struct buffer_head *bh); +typedef int (merge_requests_fn) (request_queue_t *q, + struct request *req, + struct request *req2); +typedef void (request_fn_proc) (request_queue_t *q); typedef request_queue_t * (queue_proc) (kdev_t dev); +typedef void (make_request_fn) (int rw, struct buffer_head *bh); +typedef void (plug_device_fn) (request_queue_t *q, kdev_t device); +typedef void (unplug_device_fn) (void *q); struct request_queue { @@ -51,6 
+54,8 @@ request_fn_proc * request_fn; merge_request_fn * merge_fn; merge_requests_fn * merge_requests_fn; + make_request_fn * make_request_fn; + plug_device_fn * plug_device_fn; /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. @@ -71,12 +76,6 @@ * not. */ char head_active; - - /* - * Boolean that indicates whether we should use plugging on - * this queue or not. - */ - char use_plug; }; struct blk_dev_struct { @@ -105,8 +104,10 @@ extern struct blk_dev_struct blk_dev[MAX_BLKDEV]; extern wait_queue_head_t wait_for_request; extern void resetup_one_dev(struct gendisk *dev, int drive); -extern void unplug_device(void * data); -extern void make_request(int major,int rw, struct buffer_head * bh); +extern void generic_unplug_device(void * data); +extern void generic_plug_device (request_queue_t *q, kdev_t dev); +extern void generic_make_request(int rw, struct buffer_head * bh); +extern request_queue_t * blk_get_queue(kdev_t dev); /* * Access functions for manipulating queue properties @@ -114,12 +115,8 @@ extern void blk_init_queue(request_queue_t *, request_fn_proc *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_headactive(request_queue_t *, int); -extern void blk_queue_pluggable(request_queue_t *, int); - -/* md needs this function to remap requests */ -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size); -extern int md_make_request (int minor, int rw, struct buffer_head * bh); -extern int md_error (kdev_t mddev, kdev_t rdev); +extern void blk_queue_pluggable(request_queue_t *, plug_device_fn *); +extern void blk_queue_make_request(request_queue_t *, make_request_fn *); extern int * blk_size[MAX_BLKDEV]; --- linux/include/linux/fs.h.orig Sun Jan 16 06:38:28 2000 +++ linux/include/linux/fs.h Sun Jan 16 17:45:53 2000 @@ -22,6 +22,7 @@ #include #include +#include struct poll_table_struct; @@ -210,8 +211,8 @@ /* First cache line: */ struct buffer_head *b_next; /* Hash 
queue list */ unsigned long b_blocknr; /* block number */ - unsigned short b_size; /* block size */ - unsigned short b_list; /* List that this buffer appears */ + int b_size; /* block size */ + int b_list; /* List that this buffer appears */ kdev_t b_dev; /* device (B_FREE = free) */ atomic_t b_count; /* users using this block */ @@ -225,7 +226,7 @@ struct buffer_head *b_reqnext; /* request queue */ struct buffer_head **b_pprev; /* doubly linked list of hash-queue */ - char * b_data; /* pointer to data block (512 byte) */ + char * b_data; /* pointer to data block */ struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_dev_id; @@ -237,8 +238,17 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); +extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate); +extern void end_buffer_io_bad(struct buffer_head *bh, int uptodate); -#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) + +#define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) + +#define bcount(bh) (atomic_read(&((bh)->b_count))) +#define bh_set(bh,v) do { atomic_set(&(bh)->b_count,v); } while (0) +#define bget(bh) do { atomic_inc(&(bh)->b_count); } while (0) +#define bput(bh) do { atomic_dec(&(bh)->b_count); } while (0) +#define bput_and_test(bh) (atomic_dec_and_test(&(bh)->b_count)) #define buffer_uptodate(bh) __buffer_state(bh,Uptodate) #define buffer_dirty(bh) __buffer_state(bh,Dirty) @@ -248,11 +258,12 @@ #define buffer_new(bh) __buffer_state(bh,New) #define buffer_protected(bh) __buffer_state(bh,Protected) -#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) +#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) +#define bh_page(bh) page_cache_entry((bh)->b_data) extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); 
-#define touch_buffer(bh) set_bit(PG_referenced, &bh->b_page->flags) +#define touch_buffer(bh) set_bit(PG_referenced, &(bh)->b_page->flags) #include #include @@ -811,6 +822,7 @@ extern int try_to_free_buffers(struct page *); extern void refile_buffer(struct buffer_head * buf); + #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* Buffers scheduled for write */ #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ @@ -822,6 +834,8 @@ */ extern inline void mark_buffer_uptodate(struct buffer_head * bh, int on) { + if (0 && !bcount(bh)) + BH_BUG(bh); if (on) set_bit(BH_Uptodate, &bh->b_state); else @@ -832,11 +846,15 @@ extern inline void __mark_buffer_clean(struct buffer_head *bh) { + if (0 && !bcount(bh)) + BH_BUG(bh); refile_buffer(bh); } extern inline void mark_buffer_clean(struct buffer_head * bh) { + if (0 && !bcount(bh)) + BH_BUG(bh); if (atomic_set_buffer_clean(bh)) __mark_buffer_clean(bh); } @@ -860,6 +878,11 @@ extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag) { + /* + * We must not touch buffers we do not own. 
+ */ + if (0 && !bcount(bh)) + BH_BUG(bh); if (!atomic_set_buffer_dirty(bh)) __mark_buffer_dirty(bh, flag); } @@ -958,7 +981,12 @@ extern void file_moveto(struct file *new, struct file *old); extern struct buffer_head * get_hash_table(kdev_t, int, int); extern struct buffer_head * getblk(kdev_t, int, int); +extern struct buffer_head * getblk_lock(kdev_t, int, int); +extern struct buffer_head * get_unused_bh(void); +extern void put_unused_bh(struct buffer_head * bh); +extern void insert_into_queues_exclusive(struct buffer_head *bh); extern void ll_rw_block(int, int, struct buffer_head * bh[]); +extern void ll_rw_block_locked(int, int, struct buffer_head * bh[]); extern int is_read_only(kdev_t); extern void __brelse(struct buffer_head *); extern inline void brelse(struct buffer_head *buf) @@ -971,6 +999,12 @@ { if (buf) __bforget(buf); +} +extern void __bdrop(struct buffer_head *); +extern inline void bdrop(struct buffer_head *buf) +{ + if (buf) + __bdrop(buf); } extern void set_blocksize(kdev_t, int); extern unsigned int get_hardblocksize(kdev_t); --- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996 +++ linux/include/linux/raid0.h Sun Jan 16 17:45:53 2000 @@ -1,27 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone -{ - int zone_offset; /* Zone offset in md_dev */ - int dev_offset; /* Zone offset in real dev */ - int size; /* Zone size */ - int nb_dev; /* Number of devices attached to the zone */ - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */ -}; - -struct raid0_hash -{ - struct strip_zone *zone0, *zone1; -}; - -struct raid0_data -{ - struct raid0_hash *hash_table; /* Dynamically allocated */ - struct strip_zone *strip_zone; /* This one too */ - int nr_strip_zones; - struct strip_zone *smallest; - int nr_zones; -}; - -#endif --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/raid1.h Sun Jan 16 17:45:53 2000 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct 
mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. - * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif --- linux/include/linux/mm.h.orig Sun Jan 16 06:38:28 2000 +++ linux/include/linux/mm.h Sun Jan 16 17:45:53 2000 @@ -141,6 +141,9 @@ struct buffer_head * buffers; unsigned long virtual; /* nonzero if kmapped */ struct zone_struct *zone; +#if PAGE_TRACE + struct buffer_hist __hist; +#endif } mem_map_t; #define get_page(p) atomic_inc(&(p)->count) @@ -165,7 +168,6 @@ #define PG_highmem 12 /* bits 21-30 unused */ #define PG_reserved 31 - /* Make it prettier to test the above... */ #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) --- linux/include/linux/tqueue.h.orig Fri Oct 15 18:29:43 1999 +++ linux/include/linux/tqueue.h Sun Jan 16 17:45:53 2000 @@ -116,7 +116,8 @@ p = p -> next; mb(); save_p -> sync = 0; - (*f)(arg); + if (f) + (*f)(arg); } } } --- linux/include/linux/list.h.orig Mon Jul 12 07:50:27 1999 +++ linux/include/linux/list.h Sun Jan 16 17:45:53 2000 @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include + /* * Simple doubly linked list implementation. 
* @@ -26,6 +28,7 @@ (ptr)->next = (ptr); (ptr)->prev = (ptr); \ } while (0) +#define LIST_UNUSED(ptr) ((!(ptr)->next && !(ptr)->prev) || (((ptr)->next == (ptr)) && ((ptr)->prev == (ptr)))) /* * Insert a new entry between two known consecutive entries. * @@ -36,6 +39,8 @@ struct list_head * prev, struct list_head * next) { + if (!LIST_UNUSED(new)) + BUG(); next->prev = new; new->next = next; new->prev = prev; @@ -75,6 +80,8 @@ static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); + entry->prev = 0; + entry->next = 0; } static __inline__ int list_empty(struct list_head *head) --- linux/include/linux/wait.h.orig Fri Oct 15 18:29:42 1999 +++ linux/include/linux/wait.h Sun Jan 16 17:45:53 2000 @@ -143,6 +143,7 @@ #if WAITQUEUE_DEBUG q->__magic = (long)&q->__magic; #endif + INIT_LIST_HEAD(&q->task_list); } static inline int waitqueue_active(wait_queue_head_t *q) --- linux/include/linux/raid5.h.orig Wed May 12 17:41:15 1999 +++ linux/include/linux/raid5.h Sun Jan 16 17:45:53 2000 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int 
new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - wait_queue_head_t wait; /* processes waiting for this stripe */ -}; - -/* - * Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - wait_queue_head_t wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif --- linux/include/linux/md.h.orig Tue May 11 23:46:24 1999 +++ linux/include/linux/md.h Sun Jan 16 17:45:53 2000 @@ -1,300 +0,0 @@ 
-/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include -#include - -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 36 -#define MD_PATCHLEVEL_VERSION 6 - -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024) - -/* ioctls */ -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4) - -/* - personalities : - Byte 0 : Chunk size factor - Byte 1 : Fault tolerance count for each physical device - ( 0 means no fault tolerance, - 0xFF means always tolerate faults), not used by now. - Byte 2 : Personality - Byte 3 : Reserved. - */ - -#define FAULT_SHIFT 8 -#define PERSONALITY_SHIFT 16 - -#define FACTOR_MASK 0x000000FFUL -#define FAULT_MASK 0x0000FF00UL -#define PERSONALITY_MASK 0x00FF0000UL - -#define MD_RESERVED 0 /* Not used by now */ -#define LINEAR (1UL << PERSONALITY_SHIFT) -#define STRIPED (2UL << PERSONALITY_SHIFT) -#define RAID0 STRIPED -#define RAID1 (3UL << PERSONALITY_SHIFT) -#define RAID5 (4UL << PERSONALITY_SHIFT) -#define MAX_PERSONALITY 5 - -/* - * MD superblock. - * - * The MD superblock maintains some statistics on each MD configuration. - * Each real device in the MD set contains it near the end of the device. 
- * Some of the ideas are copied from the ext2fs implementation. - * - * We currently use 4096 bytes as follows: - * - * word offset function - * - * 0 - 31 Constant generic MD device information. - * 32 - 63 Generic state information. - * 64 - 127 Personality specific information. - * 128 - 511 12 32-words descriptors of the disks in the raid set. - * 512 - 911 Reserved. - * 912 - 1023 Disk specific descriptor. - */ - -/* - * If x is the real device size in bytes, we return an apparent size of: - * - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES - * - * and place the 4kB superblock at offset y. - */ -#define MD_RESERVED_BYTES (64 * 1024) -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) - -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) - -#define MD_SB_BYTES 4096 -#define MD_SB_WORDS (MD_SB_BYTES / 4) -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) -#define MD_SB_SECTORS (MD_SB_BYTES / 512) - -/* - * The following are counted in 32-bit words - */ -#define MD_SB_GENERIC_OFFSET 0 -#define MD_SB_PERSONALITY_OFFSET 64 -#define MD_SB_DISKS_OFFSET 128 -#define MD_SB_DESCRIPTOR_OFFSET 992 - -#define MD_SB_GENERIC_CONSTANT_WORDS 32 -#define MD_SB_GENERIC_STATE_WORDS 32 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) -#define MD_SB_PERSONALITY_WORDS 64 -#define MD_SB_DISKS_WORDS 384 -#define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) - -/* - * Device "operational" state bits - */ -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */ -#define 
MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */ -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */ - -typedef struct md_device_descriptor_s { - __u32 number; /* 0 Device number in the entire set */ - __u32 major; /* 1 Device major number */ - __u32 minor; /* 2 Device minor number */ - __u32 raid_disk; /* 3 The role of the device in the raid set */ - __u32 state; /* 4 Operational state */ - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; -} md_descriptor_t; - -#define MD_SB_MAGIC 0xa92b4efc - -/* - * Superblock state bits - */ -#define MD_SB_CLEAN 0 -#define MD_SB_ERRORS 1 - -typedef struct md_superblock_s { - - /* - * Constant generic information - */ - __u32 md_magic; /* 0 MD identifier */ - __u32 major_version; /* 1 major version to which the set conforms */ - __u32 m