--- linux/init/main.c.orig Tue Jan 16 13:30:09 2001 +++ linux/init/main.c Tue Jan 16 13:42:03 2001 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -549,7 +550,7 @@ #ifdef CONFIG_BLK_DEV_FD { "fd", 0x0200 }, #endif -#ifdef CONFIG_MD_BOOT +#if CONFIG_MD_BOOT || CONFIG_AUTODETECT_RAID { "md", 0x0900 }, #endif #ifdef CONFIG_BLK_DEV_XD @@ -1060,6 +1061,9 @@ #ifdef CONFIG_MD_BOOT { "md=", md_setup}, #endif +#if CONFIG_BLK_DEV_MD + { "raid=", raid_setup}, +#endif #ifdef CONFIG_ADBMOUSE { "adb_buttons=", adb_mouse_setup }, #endif @@ -1637,6 +1641,9 @@ while (pid != wait(&i)); if (MAJOR(real_root_dev) != RAMDISK_MAJOR || MINOR(real_root_dev) != 0) { +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); +#endif error = change_root(real_root_dev,"/initrd"); if (error) printk(KERN_ERR "Change root to /initrd: " --- linux/include/linux/raid/hsm.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/hsm.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,65 @@ +#ifndef _HSM_H +#define _HSM_H + +#include + +#if __alpha__ +#error fix cpu_addr on Alpha first +#endif + +#include + +#define index_pv(lv,index) ((lv)->vg->pv_array+(index)->data.phys_nr) +#define index_dev(lv,index) index_pv((lv),(index))->dev +#define index_block(lv,index) (index)->data.phys_block +#define index_child(index) ((lv_lptr_t *)((index)->cpu_addr)) + +#define ptr_to_cpuaddr(ptr) ((__u32) (ptr)) + + +typedef struct pv_bg_desc_s { + unsigned int free_blocks; + pv_block_group_t *bg; +} pv_bg_desc_t; + +typedef struct pv_s pv_t; +typedef struct vg_s vg_t; +typedef struct lv_s lv_t; + +struct pv_s +{ + int phys_nr; + kdev_t dev; + pv_sb_t *pv_sb; + pv_bg_desc_t *bg_array; +}; + +struct lv_s +{ + int log_id; + vg_t *vg; + + unsigned int max_indices; + unsigned int free_indices; + lv_lptr_t root_index; + + kdev_t dev; +}; + +struct vg_s +{ + int nr_pv; + pv_t pv_array [MD_SB_DISKS]; + + int nr_lv; + lv_t lv_array [HSM_MAX_LVS_PER_VG]; + + vg_sb_t *vg_sb; + mddev_t *mddev; +}; + +#define kdev_to_lv(dev) ((lv_t *) mddev_map[MINOR(dev)].data) +#define mddev_to_vg(mddev) ((vg_t *) mddev->private) + +#endif + --- linux/include/linux/raid/hsm_p.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/hsm_p.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,237 @@ +#ifndef _HSM_P_H +#define _HSM_P_H + +#define HSM_BLOCKSIZE 4096 +#define HSM_BLOCKSIZE_WORDS (HSM_BLOCKSIZE/4) +#define PACKED __attribute__ ((packed)) + +/* + * Identifies a block in physical space + */ +typedef struct phys_idx_s { + __u16 phys_nr; + __u32 phys_block; + +} PACKED phys_idx_t; + +/* + * Identifies a block in logical space + */ +typedef struct log_idx_s { + __u16 log_id; + __u32 log_index; + +} PACKED log_idx_t; + +/* + * Describes one PV + */ +#define HSM_PV_SB_MAGIC 0xf091ae9fU + +#define HSM_PV_SB_GENERIC_WORDS 32 +#define HSM_PV_SB_RESERVED_WORDS \ + (HSM_BLOCKSIZE_WORDS - HSM_PV_SB_GENERIC_WORDS) + +/* + * On-disk PV identification data, on block 0 in any PV. + */ +typedef struct pv_sb_s +{ + __u32 pv_magic; /* 0 */ + + __u32 pv_uuid0; /* 1 */ + __u32 pv_uuid1; /* 2 */ + __u32 pv_uuid2; /* 3 */ + __u32 pv_uuid3; /* 4 */ + + __u32 pv_major; /* 5 */ + __u32 pv_minor; /* 6 */ + __u32 pv_patch; /* 7 */ + + __u32 pv_ctime; /* 8 Creation time */ + + __u32 pv_total_size; /* 9 size of this PV, in blocks */ + __u32 pv_first_free; /* 10 first free block */ + __u32 pv_first_used; /* 11 first used block */ + __u32 pv_blocks_left; /* 12 unallocated blocks */ + __u32 pv_bg_size; /* 13 size of a block group, in blocks */ + __u32 pv_block_size; /* 14 size of blocks, in bytes */ + __u32 pv_pptr_size; /* 15 size of block descriptor, in bytes */ + __u32 pv_block_groups; /* 16 number of block groups */ + + __u32 __reserved1[HSM_PV_SB_GENERIC_WORDS - 17]; + + /* + * Reserved + */ + __u32 __reserved2[HSM_PV_SB_RESERVED_WORDS]; + +} PACKED pv_sb_t; + +/* + * this is pretty much arbitrary, but has to be less than ~64 + */ +#define HSM_MAX_LVS_PER_VG 32 + +#define HSM_VG_SB_GENERIC_WORDS 32 + +#define LV_DESCRIPTOR_WORDS 8 +#define HSM_VG_SB_RESERVED_WORDS (HSM_BLOCKSIZE_WORDS - \ + LV_DESCRIPTOR_WORDS*HSM_MAX_LVS_PER_VG - HSM_VG_SB_GENERIC_WORDS) + +#if (HSM_PV_SB_RESERVED_WORDS < 0) +#error you messed this one up dude ... +#endif + +typedef struct lv_descriptor_s +{ + __u32 lv_id; /* 0 */ + phys_idx_t lv_root_idx; /* 1 */ + __u16 __reserved; /* 2 */ + __u32 lv_max_indices; /* 3 */ + __u32 lv_free_indices; /* 4 */ + __u32 md_id; /* 5 */ + + __u32 reserved[LV_DESCRIPTOR_WORDS - 6]; + +} PACKED lv_descriptor_t; + +#define HSM_VG_SB_MAGIC 0x98320d7aU +/* + * On-disk VG identification data, in block 1 on all PVs + */ +typedef struct vg_sb_s +{ + __u32 vg_magic; /* 0 */ + __u32 nr_lvs; /* 1 */ + + __u32 __reserved1[HSM_VG_SB_GENERIC_WORDS - 2]; + + lv_descriptor_t lv_array [HSM_MAX_LVS_PER_VG]; + /* + * Reserved + */ + __u32 __reserved2[HSM_VG_SB_RESERVED_WORDS]; + +} PACKED vg_sb_t; + +/* + * Describes one LV + */ + +#define HSM_LV_SB_MAGIC 0xe182bd8aU + +/* do we need lv_sb_t? */ + +typedef struct lv_sb_s +{ + /* + * On-disk LV identifier + */ + __u32 lv_magic; /* 0 LV identifier */ + __u32 lv_uuid0; /* 1 */ + __u32 lv_uuid1; /* 2 */ + __u32 lv_uuid2; /* 3 */ + __u32 lv_uuid3; /* 4 */ + + __u32 lv_major; /* 5 PV identifier */ + __u32 lv_minor; /* 6 PV identifier */ + __u32 lv_patch; /* 7 PV identifier */ + + __u32 ctime; /* 8 Creation time */ + __u32 size; /* 9 size of this LV, in blocks */ + phys_idx_t start; /* 10 position of root index block */ + log_idx_t first_free; /* 11-12 first free index */ + + /* + * Reserved + */ + __u32 reserved[HSM_BLOCKSIZE_WORDS-13]; + +} PACKED lv_sb_t; + +/* + * Pointer pointing from the physical space, points to + * the LV owning this block. It also contains various + * statistics about the physical block. + */ +typedef struct pv_pptr_s +{ + union { + /* case 1 */ + struct { + log_idx_t owner; + log_idx_t predicted; + __u32 last_referenced; + } used; + /* case 2 */ + struct { + __u16 log_id; + __u16 __unused1; + __u32 next_free; + __u32 __unused2; + __u32 __unused3; + } free; + } u; +} PACKED pv_pptr_t; + +static __inline__ int pv_pptr_free (const pv_pptr_t * pptr) +{ + return !pptr->u.free.log_id; +} + + +#define DATA_BLOCKS_PER_BG ((HSM_BLOCKSIZE*8)/(8*sizeof(pv_pptr_t)+1)) + +#define TOTAL_BLOCKS_PER_BG (DATA_BLOCKS_PER_BG+1) +/* + * A table of pointers filling up a single block, managing + * the next DATA_BLOCKS_PER_BG physical blocks. Such block + * groups form the physical space of blocks. + */ +typedef struct pv_block_group_s +{ + __u8 used_bitmap[(DATA_BLOCKS_PER_BG+7)/8]; + + pv_pptr_t blocks[DATA_BLOCKS_PER_BG]; + +} PACKED pv_block_group_t; + +/* + * Pointer from the logical space, points to + * the (PV,block) containing this logical block + */ +typedef struct lv_lptr_s +{ + phys_idx_t data; + __u16 __reserved; + __u32 cpu_addr; + __u32 __reserved2; + +} PACKED lv_lptr_t; + +static __inline__ int index_free (const lv_lptr_t * index) +{ + return !index->data.phys_block; +} + +static __inline__ int index_present (const lv_lptr_t * index) +{ + return index->cpu_addr; +} + + +#define HSM_LPTRS_PER_BLOCK (HSM_BLOCKSIZE/sizeof(lv_lptr_t)) +/* + * A table of pointers filling up a single block, managing + * HSM_LPTRS_PER_BLOCK logical blocks. Such block groups form + * the logical space of blocks. + */ +typedef struct lv_index_block_s +{ + lv_lptr_t blocks[HSM_LPTRS_PER_BLOCK]; + +} PACKED lv_index_block_t; + +#endif + --- linux/include/linux/raid/linear.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/linear.h Tue Jan 16 13:44:37 2001 @@ -0,0 +1,32 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +#include + +struct dev_info { + kdev_t dev; + int size; + unsigned int offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/md.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/md.h Tue Jan 16 13:47:20 2001 @@ -0,0 +1,96 @@ +/* + md.h : Multiple Devices driver for Linux + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + Copyright (C) 1994-96 Marc ZYNGIER + or + + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_H +#define _MD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. + */ +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +extern int md_size[MAX_MD_DEVS]; +extern struct hd_struct md_hd_struct[MAX_MD_DEVS]; + +extern void add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * partition_name (kdev_t dev); +extern int register_md_personality (int p_num, mdk_personality_t *p); +extern int unregister_md_personality (int p_num); +extern mdk_thread_t * md_register_thread (void (*run) (void *data), + void *data, const char *name); +extern void md_unregister_thread (mdk_thread_t *thread); +extern void md_wakeup_thread(mdk_thread_t *thread); +extern void md_interrupt_thread (mdk_thread_t *thread); +extern int md_update_sb (mddev_t *mddev); +extern int md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void md_recover_arrays (void); +extern int md_check_ordering (mddev_t *mddev); +extern void autodetect_raid(void); +extern struct gendisk * find_gendisk (kdev_t dev); +extern int md_notify_reboot(struct notifier_block *this, + unsigned long code, void *x); +#if CONFIG_BLK_DEV_MD +extern void raid_setup(char *str,int *ints) md__init; +#endif +#ifdef CONFIG_MD_BOOT +extern void md_setup(char *str,int *ints) md__init; +#endif + +extern void md_print_devices (void); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } + +#endif _MD_H + --- linux/include/linux/raid/md_compatible.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/md_compatible.h Tue Jan 16 13:47:19 2001 @@ -0,0 +1,387 @@ + +/* + md.h : Multiple Devices driver compatibility layer for Linux 2.0/2.2 + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#ifndef _MD_COMPATIBLE_H +#define _MD_COMPATIBLE_H + +#define LinuxVersionCode(v, p, s) (((v)<<16)+((p)<<8)+(s)) + +#if LINUX_VERSION_CODE < LinuxVersionCode(2,1,0) + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y,GFP_KERNEL) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return x86_capability & 0x00800000; +} +#endif + +/* 002 */ +#define md_clear_page(page) memset((void *)(page), 0, PAGE_SIZE) + +/* 003 */ +/* + * someone please suggest a sane compatibility layer for modules + */ +#define MD_EXPORT_SYMBOL(x) + +/* 004 */ +static inline unsigned long +md_copy_from_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_READ,from,n); + if (!err) + memcpy_fromfs(to, from, n); + return err; +} + +/* 005 */ +extern inline unsigned long +md_copy_to_user(void *to, const void *from, unsigned long n) +{ + int err; + + err = verify_area(VERIFY_WRITE,to,n); + if (!err) + memcpy_tofs(to, from, n); + return err; +} + +/* 006 */ +#define md_put_user(x,ptr) \ +({ \ + int __err; \ + \ + __err = verify_area(VERIFY_WRITE,ptr,sizeof(*ptr)); \ + if (!__err) \ + put_user(x,ptr); \ + __err; \ +}) + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return suser(); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + current->signal = 0; +} + +/* 010 */ +#define __S(nr) (1<<((nr)-1)) +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + current->blocked = ~(__S(SIGKILL)); +} +#undef __S + +/* 011 */ +extern inline unsigned long md_signal_pending (struct task_struct * tsk) +{ + return (tsk->signal & ~tsk->blocked); +} + +/* 012 */ +#define md_set_global_readahead(x) read_ahead[MD_MAJOR] = MD_READAHEAD + +/* 013 */ +#define md_mdelay(n) (\ + {unsigned long msec=(n); while (msec--) udelay(1000);}) + +/* 014 */ +#define MD_SYS_DOWN 0 +#define MD_SYS_HALT 0 +#define MD_SYS_POWER_OFF 0 + +/* 015 */ +#define md_register_reboot_notifier(x) + +/* 016 */ +extern __inline__ unsigned long +md_test_and_set_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + set_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 017 */ +extern __inline__ unsigned long +md_test_and_clear_bit(int nr, void * addr) +{ + unsigned long flags; + unsigned long oldbit; + + save_flags(flags); + cli(); + oldbit = test_bit(nr,addr); + clear_bit(nr,addr); + restore_flags(flags); + return oldbit; +} + +/* 018 */ +#define md_atomic_read(x) (*(volatile int *)(x)) +#define md_atomic_set(x,y) (*(volatile int *)(x) = (y)) + +/* 019 */ +extern __inline__ void md_lock_kernel (void) +{ +#if __SMP__ + lock_kernel(); + syscall_count++; +#endif +} + +extern __inline__ void md_unlock_kernel (void) +{ +#if __SMP__ + syscall_count--; + unlock_kernel(); +#endif +} +/* 020 */ + +#define md__init +#define md__initdata +#define md__initfunc(__arginit) __arginit + +/* 021 */ + +/* 022 */ + +struct md_list_head { + struct md_list_head *next, *prev; +}; + +#define MD_LIST_HEAD(name) \ + struct md_list_head name = { &name, &name } + +#define MD_INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +static __inline__ void md__list_add(struct md_list_head * new, + struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static __inline__ void md_list_add(struct md_list_head *new, + struct md_list_head *head) +{ + md__list_add(new, head, head->next); +} + +static __inline__ void md__list_del(struct md_list_head * prev, + struct md_list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +static __inline__ void md_list_del(struct md_list_head *entry) +{ + md__list_del(entry->prev, entry->next); +} + +static __inline__ int md_list_empty(struct md_list_head *head) +{ + return head->next == head; +} + +#define md_list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/* 023 */ + +static __inline__ signed long md_schedule_timeout(signed long timeout) +{ + current->timeout = jiffies + timeout; + schedule(); + return 0; +} + +/* 024 */ +#define md_need_resched(tsk) (need_resched) + +/* 025 */ +typedef struct { int gcc_is_buggy; } md_spinlock_t; +#define MD_SPIN_LOCK_UNLOCKED (md_spinlock_t) { 0 } + +#define md_spin_lock_irq cli +#define md_spin_unlock_irq sti +#define md_spin_unlock_irqrestore(x,flags) restore_flags(flags) +#define md_spin_lock_irqsave(x,flags) do { save_flags(flags); cli(); } while (0) + +/* END */ + +#else + +#include +#include + +/* 000 */ +#define md__get_free_pages(x,y) __get_free_pages(x,y) + +#ifdef __i386__ +/* 001 */ +extern __inline__ int md_cpu_has_mmx(void) +{ + return boot_cpu_data.x86_capability & X86_FEATURE_MMX; +} +#endif + +/* 002 */ +#define md_clear_page(page) clear_page(page) + +/* 003 */ +#define MD_EXPORT_SYMBOL(x) EXPORT_SYMBOL(x) + +/* 004 */ +#define md_copy_to_user(x,y,z) copy_to_user(x,y,z) + +/* 005 */ +#define md_copy_from_user(x,y,z) copy_from_user(x,y,z) + +/* 006 */ +#define md_put_user put_user + +/* 007 */ +extern inline int md_capable_admin(void) +{ + return capable(CAP_SYS_ADMIN); +} + +/* 008 */ +#define MD_FILE_TO_INODE(file) ((file)->f_dentry->d_inode) + +/* 009 */ +extern inline void md_flush_signals (void) +{ + spin_lock(¤t->sigmask_lock); + flush_signals(current); + spin_unlock(¤t->sigmask_lock); +} + +/* 010 */ +extern inline void md_init_signals (void) +{ + current->exit_signal = SIGCHLD; + siginitsetinv(¤t->blocked, sigmask(SIGKILL)); +} + +/* 011 */ +#define md_signal_pending signal_pending + +/* 012 */ +extern inline void md_set_global_readahead(int * table) +{ + max_readahead[MD_MAJOR] = table; +} + +/* 013 */ +#define md_mdelay(x) mdelay(x) + +/* 014 */ +#define MD_SYS_DOWN SYS_DOWN +#define MD_SYS_HALT SYS_HALT +#define MD_SYS_POWER_OFF SYS_POWER_OFF + +/* 015 */ +#define md_register_reboot_notifier register_reboot_notifier + +/* 016 */ +#define md_test_and_set_bit test_and_set_bit + +/* 017 */ +#define md_test_and_clear_bit test_and_clear_bit + +/* 018 */ +#define md_atomic_read atomic_read +#define md_atomic_set atomic_set + +/* 019 */ +#define md_lock_kernel lock_kernel +#define md_unlock_kernel unlock_kernel + +/* 020 */ + +#include + +#define md__init __init +#define md__initdata __initdata +#define md__initfunc(__arginit) __initfunc(__arginit) + +/* 021 */ + + +/* 022 */ + +#define md_list_head list_head +#define MD_LIST_HEAD(name) LIST_HEAD(name) +#define MD_INIT_LIST_HEAD(ptr) INIT_LIST_HEAD(ptr) +#define md_list_add list_add +#define md_list_del list_del +#define md_list_empty list_empty + +#define md_list_entry(ptr, type, member) list_entry(ptr, type, member) + +/* 023 */ + +#define md_schedule_timeout schedule_timeout + +/* 024 */ +#define md_need_resched(tsk) ((tsk)->need_resched) + +/* 025 */ +#define md_spinlock_t spinlock_t +#define MD_SPIN_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +#define md_spin_lock_irq spin_lock_irq +#define md_spin_unlock_irq spin_unlock_irq +#define md_spin_unlock_irqrestore spin_unlock_irqrestore +#define md_spin_lock_irqsave spin_lock_irqsave + +/* END */ + +#endif + +#endif _MD_COMPATIBLE_H + --- linux/include/linux/raid/md_k.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/md_k.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,338 @@ +/* + md_k.h : kernel internal structure of the Linux MD driver + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_K_H +#define _MD_K_H + +#define MD_RESERVED 0UL +#define LINEAR 1UL +#define STRIPED 2UL +#define RAID0 STRIPED +#define RAID1 3UL +#define RAID5 4UL +#define TRANSLUCENT 5UL +#define HSM 6UL +#define MAX_PERSONALITY 7UL + +extern inline int pers_to_level (int pers) +{ + switch (pers) { + case HSM: return -3; + case TRANSLUCENT: return -2; + case LINEAR: return -1; + case RAID0: return 0; + case RAID1: return 1; + case RAID5: return 5; + } + panic("pers_to_level()"); +} + +extern inline int level_to_pers (int level) +{ + switch (level) { + case -3: return HSM; + case -2: return TRANSLUCENT; + case -1: return LINEAR; + case 0: return RAID0; + case 1: return RAID1; + case 4: + case 5: return RAID5; + } + return MD_RESERVED; +} + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +#if (MINORBITS != 8) +#error MD doesnt handle bigger kdev yet +#endif + +#define MAX_REAL 12 /* Max number of disks per md dev */ +#define MAX_MD_DEVS (1<state & (1 << MD_DISK_FAULTY); +} + +extern inline int disk_active(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_ACTIVE); +} + +extern inline int disk_sync(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_SYNC); +} + +extern inline int disk_spare(mdp_disk_t * d) +{ + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); +} + +extern inline int disk_removed(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_faulty(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_FAULTY); +} + +extern inline void mark_disk_active(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_sync(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_SYNC); +} + +extern inline void mark_disk_spare(mdp_disk_t * d) +{ + d->state = 0; +} + +extern inline void mark_disk_removed(mdp_disk_t * d) +{ + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); +} + +extern inline void mark_disk_inactive(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_ACTIVE); +} + +extern inline void mark_disk_nonsync(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_SYNC); +} + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct md_list_head same_set; /* RAID devices within the same set */ + struct md_list_head all; /* all RAID devices */ + struct md_list_head pending; /* undetected RAID devices */ + + kdev_t dev; /* Device number */ + kdev_t old_dev; /* "" when it was last imported */ + int size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + unsigned long last_events; /* IO event timestamp */ + + struct inode *inode; /* Lock inode */ + struct file filp; /* Lock file */ + + mdp_super_t *sb; + int sb_offset; + + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_REMOVE_DISK 3 +#define DISKOP_HOT_ADD_DISK 4 + +typedef struct mdk_personality_s mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + int __minor; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + mdu_param_t param; + int ro; + unsigned int curr_resync; + unsigned long resync_start; + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + struct md_list_head all_mddevs; +}; + +struct mdk_personality_s +{ + char *name; + int (*map)(mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); + int (*make_request)(mddev_t *mddev, int rw, struct buffer_head * bh); + void (*end_request)(struct buffer_head * bh, int uptodate); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*ioctl)(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + int max_invalid_dev; + int (*error_handler)(mddev_t *mddev, kdev_t dev); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); +}; + + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. + */ +extern inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +extern inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. + */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +extern inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +extern inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ + x = y; y = __tmp; } while (0) + +typedef struct mdk_thread_s { + void (*run) (void *data); + void *data; + struct wait_queue *wqueue; + unsigned long flags; + struct semaphore *sem; + struct task_struct *tsk; + const char *name; +} mdk_thread_t; + +#define THREAD_WAKEUP 0 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char name [MAX_DISKNAME_LEN]; +} dev_name_t; + +#endif _MD_K_H + --- linux/include/linux/raid/md_p.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/md_p.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,161 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. + */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DISKS_WORDS 384 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) +#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 14 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ + __u64 events; /* 7 number of superblock updates (64-bit!) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#endif _MD_P_H + --- linux/include/linux/raid/md_u.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/md_u.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) */ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif _MD_U_H + --- linux/include/linux/raid/raid0.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/raid0.h Tue Jan 16 13:44:37 2001 @@ -0,0 +1,33 @@ +#ifndef _RAID0_H +#define _RAID0_H + +#include + +struct strip_zone +{ + int zone_offset; /* Zone offset in md_dev */ + int dev_offset; /* Zone offset in real dev */ + int size; /* Zone size */ + int nb_dev; /* # of devices attached to the zone */ + mdk_rdev_t *dev[MAX_REAL]; /* Devices attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/raid1.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/raid1.h Tue Jan 16 13:44:37 2001 @@ -0,0 +1,64 @@ +#ifndef _RAID1_H +#define _RAID1_H + +#include + +struct mirror_info { + int number; + int raid_disk; + kdev_t dev; + int next; + int sect_limit; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + mdk_thread_t *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh [MD_SB_DISKS]; + struct buffer_head bh_req; + struct buffer_head *next_retry; +}; + +#endif --- linux/include/linux/raid/raid5.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/raid5.h Tue Jan 16 13:44:37 2001 @@ -0,0 +1,113 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +struct disk_info { + kdev_t dev; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct stripe_head { + md_spinlock_t stripe_lock; + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct stripe_head *free_next; /* pool of free sh's */ + struct buffer_head *buffer_pool; /* pool of free buffers */ + struct buffer_head *bh_pool; /* pool of free bh's */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ + struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ + struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ + struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ + int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ + int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + atomic_t nr_pending; /* nr of pending cmds */ + unsigned long state; /* state flags */ + int cmd; /* stripe cmd */ + int count; /* nr of waiters */ + int write_method; /* reconstruct-write / read-modify-write */ + int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ + struct wait_queue *wait; /* processes waiting for this stripe */ +}; + +/* + * Phase + */ +#define PHASE_BEGIN 0 +#define PHASE_READ_OLD 1 +#define PHASE_WRITE 2 +#define PHASE_READ 3 +#define PHASE_COMPLETE 4 + +/* + * Write method + */ +#define METHOD_NONE 0 +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 + +/* + * Stripe state + */ +#define STRIPE_LOCKED 0 +#define STRIPE_ERROR 1 + +/* + * Stripe commands + */ +#define STRIPE_NONE 0 +#define STRIPE_WRITE 1 +#define STRIPE_READ 2 + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + mdk_thread_t *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int sector_count; + unsigned long next_sector; + atomic_t nr_handle; + struct stripe_head *next_free_stripe; + int nr_stripes; + int resync_parity; + int max_nr_stripes; + int clock; + int nr_hashed_stripes; + int nr_locked_stripes; + int nr_pending_stripes; + int nr_cached_stripes; + + /* + * Free stripes pool + */ + int nr_free_sh; + struct stripe_head *free_sh_list; + struct wait_queue *wait_for_stripe; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +#endif --- linux/include/linux/raid/translucent.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/translucent.h Tue Jan 16 13:42:03 2001 @@ -0,0 +1,23 @@ +#ifndef _TRANSLUCENT_H +#define _TRANSLUCENT_H + +#include + +typedef struct dev_info dev_info_t; + +struct dev_info { + kdev_t dev; + int size; +}; + +struct translucent_private_data +{ + dev_info_t disks[MD_SB_DISKS]; +}; + + +typedef struct translucent_private_data translucent_conf_t; + +#define mddev_to_conf(mddev) ((translucent_conf_t *) mddev->private) + +#endif --- linux/include/linux/raid/xor.h.orig Tue Jan 16 13:42:03 2001 +++ linux/include/linux/raid/xor.h Tue Jan 16 13:44:35 2001 @@ -0,0 +1,12 @@ +#ifndef _XOR_H +#define _XOR_H + +#include + +#define MAX_XOR_BLOCKS 5 + +extern void calibrate_xor_block(void); +extern void (*xor_block)(unsigned int count, + struct buffer_head **bh_ptr); + +#endif --- linux/include/linux/blkdev.h.orig Mon Dec 11 01:49:44 2000 +++ linux/include/linux/blkdev.h Tue Jan 16 13:47:19 2001 @@ -91,8 +91,9 @@ extern void make_request(int major,int rw, struct buffer_head * bh); /* md needs this function to remap requests */ -extern int md_map (int minor, kdev_t *rdev, unsigned long *rsector, unsigned long size); -extern int md_make_request (int minor, int rw, struct buffer_head * bh); +extern int md_map (kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long size); +extern int md_make_request (struct buffer_head * bh, int rw); extern int md_error (kdev_t mddev, kdev_t rdev); extern int * blk_size[MAX_BLKDEV]; --- linux/include/linux/fs.h.orig Tue Jan 16 13:30:09 2001 +++ linux/include/linux/fs.h Tue Jan 16 13:47:18 2001 @@ -191,6 +191,7 @@ #define BH_Req 3 /* 0 if the buffer has been invalidated */ #define BH_Protected 6 /* 1 if the buffer is protected */ #define BH_Wait_IO 7 /* 1 if we should throttle on this buffer */ +#define BH_LowPrio 8 /* 1 if the buffer is lowprio */ /* * Try to keep the most commonly used fields in single cache lines (16 @@ -784,6 +785,7 @@ extern void refile_buffer(struct buffer_head * buf); extern void set_writetime(struct buffer_head * buf, int flag); extern int try_to_free_buffers(struct page *, int); +extern void cache_drop_behind(struct buffer_head *bh); extern int nr_buffers; extern long buffermem; @@ -814,6 +816,25 @@ } } +extern inline void mark_buffer_highprio(struct buffer_head * bh) +{ + clear_bit(BH_LowPrio, &bh->b_state); +} + +extern inline void mark_buffer_lowprio(struct buffer_head * bh) +{ + /* + * dirty buffers cannot be marked lowprio. + */ + if (!buffer_dirty(bh)) + set_bit(BH_LowPrio, &bh->b_state); +} + +static inline int buffer_lowprio(struct buffer_head * bh) +{ + return test_bit(BH_LowPrio, &bh->b_state); +} + extern inline void mark_buffer_dirty(struct buffer_head * bh, int flag) { if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { @@ -821,6 +842,23 @@ if (bh->b_list != BUF_DIRTY) refile_buffer(bh); } + /* + * if a buffer gets marked dirty then it has to lose + * it's lowprio state. + */ + mark_buffer_highprio(bh); +} + +extern inline void mark_buffer_dirty_lowprio(struct buffer_head * bh) +{ + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { + if (bh->b_list != BUF_DIRTY) + refile_buffer(bh); + /* + * Mark it lowprio only if it was not dirty before! + */ + set_bit(BH_LowPrio, &bh->b_state); + } } extern int check_disk_change(kdev_t dev); @@ -898,6 +936,7 @@ extern struct buffer_head * find_buffer(kdev_t dev, int block, int size); extern void ll_rw_block(int, int, struct buffer_head * bh[]); extern int is_read_only(kdev_t); +extern int is_device_idle(kdev_t); extern void __brelse(struct buffer_head *); extern inline void brelse(struct buffer_head *buf) { @@ -913,8 +952,12 @@ extern void set_blocksize(kdev_t dev, int size); extern unsigned int get_hardblocksize(kdev_t dev); extern struct buffer_head * bread(kdev_t dev, int block, int size); +extern struct buffer_head * buffer_ready (kdev_t dev, int block, int size); +extern void bread_ahead (kdev_t dev, int block, int size); extern struct buffer_head * breada(kdev_t dev,int block, int size, unsigned int pos, unsigned int filesize); +extern struct buffer_head * breada_blocks(kdev_t dev,int block, + int size, int blocks); extern int brw_page(int, struct page *, kdev_t, int [], int, int); --- linux/include/linux/md.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/md.h Tue Jan 16 13:42:03 2001 @@ -1,300 +0,0 @@ -/* - md.h : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#ifndef _MD_H -#define _MD_H - -#include -#include -#include - -/* - * Different major versions are not compatible. - * Different minor versions are only downward compatible. - * Different patchlevel versions are downward and upward compatible. - */ -#define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 36 -#define MD_PATCHLEVEL_VERSION 6 - -#define MD_DEFAULT_DISK_READAHEAD (256 * 1024) - -/* ioctls */ -#define REGISTER_DEV _IO (MD_MAJOR, 1) -#define START_MD _IO (MD_MAJOR, 2) -#define STOP_MD _IO (MD_MAJOR, 3) -#define REGISTER_DEV_NEW _IO (MD_MAJOR, 4) - -/* - personalities : - Byte 0 : Chunk size factor - Byte 1 : Fault tolerance count for each physical device - ( 0 means no fault tolerance, - 0xFF means always tolerate faults), not used by now. - Byte 2 : Personality - Byte 3 : Reserved. - */ - -#define FAULT_SHIFT 8 -#define PERSONALITY_SHIFT 16 - -#define FACTOR_MASK 0x000000FFUL -#define FAULT_MASK 0x0000FF00UL -#define PERSONALITY_MASK 0x00FF0000UL - -#define MD_RESERVED 0 /* Not used by now */ -#define LINEAR (1UL << PERSONALITY_SHIFT) -#define STRIPED (2UL << PERSONALITY_SHIFT) -#define RAID0 STRIPED -#define RAID1 (3UL << PERSONALITY_SHIFT) -#define RAID5 (4UL << PERSONALITY_SHIFT) -#define MAX_PERSONALITY 5 - -/* - * MD superblock. - * - * The MD superblock maintains some statistics on each MD configuration. - * Each real device in the MD set contains it near the end of the device. - * Some of the ideas are copied from the ext2fs implementation. - * - * We currently use 4096 bytes as follows: - * - * word offset function - * - * 0 - 31 Constant generic MD device information. - * 32 - 63 Generic state information. - * 64 - 127 Personality specific information. - * 128 - 511 12 32-words descriptors of the disks in the raid set. - * 512 - 911 Reserved. - * 912 - 1023 Disk specific descriptor. - */ - -/* - * If x is the real device size in bytes, we return an apparent size of: - * - * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES - * - * and place the 4kB superblock at offset y. - */ -#define MD_RESERVED_BYTES (64 * 1024) -#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) -#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) - -#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) -#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) - -#define MD_SB_BYTES 4096 -#define MD_SB_WORDS (MD_SB_BYTES / 4) -#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) -#define MD_SB_SECTORS (MD_SB_BYTES / 512) - -/* - * The following are counted in 32-bit words - */ -#define MD_SB_GENERIC_OFFSET 0 -#define MD_SB_PERSONALITY_OFFSET 64 -#define MD_SB_DISKS_OFFSET 128 -#define MD_SB_DESCRIPTOR_OFFSET 992 - -#define MD_SB_GENERIC_CONSTANT_WORDS 32 -#define MD_SB_GENERIC_STATE_WORDS 32 -#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) -#define MD_SB_PERSONALITY_WORDS 64 -#define MD_SB_DISKS_WORDS 384 -#define MD_SB_DESCRIPTOR_WORDS 32 -#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) -#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) -#define MD_SB_DISKS (MD_SB_DISKS_WORDS / MD_SB_DESCRIPTOR_WORDS) - -/* - * Device "operational" state bits - */ -#define MD_FAULTY_DEVICE 0 /* Device is faulty / operational */ -#define MD_ACTIVE_DEVICE 1 /* Device is a part or the raid set / spare disk */ -#define MD_SYNC_DEVICE 2 /* Device is in sync with the raid set */ - -typedef struct md_device_descriptor_s { - __u32 number; /* 0 Device number in the entire set */ - __u32 major; /* 1 Device major number */ - __u32 minor; /* 2 Device minor number */ - __u32 raid_disk; /* 3 The role of the device in the raid set */ - __u32 state; /* 4 Operational state */ - __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; -} md_descriptor_t; - -#define MD_SB_MAGIC 0xa92b4efc - -/* - * Superblock state bits - */ -#define MD_SB_CLEAN 0 -#define MD_SB_ERRORS 1 - -typedef struct md_superblock_s { - - /* - * Constant generic information - */ - __u32 md_magic; /* 0 MD identifier */ - __u32 major_version; /* 1 major version to which the set conforms */ - __u32 minor_version; /* 2 minor version to which the set conforms */ - __u32 patch_version; /* 3 patchlevel version to which the set conforms */ - __u32 gvalid_words; /* 4 Number of non-reserved words in this section */ - __u32 set_magic; /* 5 Raid set identifier */ - __u32 ctime; /* 6 Creation time */ - __u32 level; /* 7 Raid personality (mirroring, raid5, ...) */ - __u32 size; /* 8 Apparent size of each individual disk, in kB */ - __u32 nr_disks; /* 9 Number of total disks in the raid set */ - __u32 raid_disks; /* 10 Number of disks in a fully functional raid set */ - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 11]; - - /* - * Generic state information - */ - __u32 utime; /* 0 Superblock update time */ - __u32 state; /* 1 State bits (clean, ...) */ - __u32 active_disks; /* 2 Number of currently active disks (some non-faulty disks might not be in sync) */ - __u32 working_disks; /* 3 Number of working disks */ - __u32 failed_disks; /* 4 Number of failed disks */ - __u32 spare_disks; /* 5 Number of spare disks */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6]; - - /* - * Personality information - */ - __u32 parity_algorithm; - __u32 chunk_size; - __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2]; - - /* - * Disks information - */ - md_descriptor_t disks[MD_SB_DISKS]; - - /* - * Reserved - */ - __u32 reserved[MD_SB_RESERVED_WORDS]; - - /* - * Active descriptor - */ - md_descriptor_t descriptor; -} md_superblock_t; - -#ifdef __KERNEL__ - -#include -#include -#include -#include - -/* - * Kernel-based reconstruction is mostly working, but still requires - * some additional work. - */ -#define SUPPORT_RECONSTRUCTION 0 - -#define MAX_REAL 8 /* Max number of physical dev per md dev */ -#define MAX_MD_DEV 4 /* Max number of md dev */ - -#define FACTOR(a) ((a)->repartition & FACTOR_MASK) -#define MAX_FAULT(a) (((a)->repartition & FAULT_MASK)>>8) -#define PERSONALITY(a) ((a)->repartition & PERSONALITY_MASK) - -#define FACTOR_SHIFT(a) (PAGE_SHIFT + (a) - 10) - -struct real_dev -{ - kdev_t dev; /* Device number */ - int size; /* Device size (in blocks) */ - int offset; /* Real device offset (in blocks) in md dev - (only used in linear mode) */ - struct inode *inode; /* Lock inode */ - md_superblock_t *sb; - u32 sb_offset; -}; - -struct md_dev; - -#define SPARE_INACTIVE 0 -#define SPARE_WRITE 1 -#define SPARE_ACTIVE 2 - -struct md_personality -{ - char *name; - int (*map)(struct md_dev *mddev, kdev_t *rdev, - unsigned long *rsector, unsigned long size); - int (*make_request)(struct md_dev *mddev, int rw, struct buffer_head * bh); - void (*end_request)(struct buffer_head * bh, int uptodate); - int (*run)(int minor, struct md_dev *mddev); - int (*stop)(int minor, struct md_dev *mddev); - int (*status)(char *page, int minor, struct md_dev *mddev); - int (*ioctl)(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); - int max_invalid_dev; - int (*error_handler)(struct md_dev *mddev, kdev_t dev); - -/* - * Some personalities (RAID-1, RAID-5) can get disks hot-added and - * hot-removed. Hot removal is different from failure. (failure marks - * a disk inactive, but the disk is still part of the array) - */ - int (*hot_add_disk) (struct md_dev *mddev, kdev_t dev); - int (*hot_remove_disk) (struct md_dev *mddev, kdev_t dev); - int (*mark_spare) (struct md_dev *mddev, md_descriptor_t *descriptor, int state); -}; - -struct md_dev -{ - struct real_dev devices[MAX_REAL]; - struct md_personality *pers; - md_superblock_t *sb; - int sb_dirty; - int repartition; - int busy; - int nb_dev; - void *private; -}; - -struct md_thread { - void (*run) (void *data); - void *data; - struct wait_queue *wqueue; - unsigned long flags; - struct semaphore *sem; - struct task_struct *tsk; -}; - -#define THREAD_WAKEUP 0 - -extern struct md_dev md_dev[MAX_MD_DEV]; -extern int md_size[MAX_MD_DEV]; -extern int md_maxreadahead[MAX_MD_DEV]; - -extern char *partition_name (kdev_t dev); - -extern int register_md_personality (int p_num, struct md_personality *p); -extern int unregister_md_personality (int p_num); -extern struct md_thread *md_register_thread (void (*run) (void *data), void *data); -extern void md_unregister_thread (struct md_thread *thread); -extern void md_wakeup_thread(struct md_thread *thread); -extern int md_update_sb (int minor); -extern int md_do_sync(struct md_dev *mddev); - -#endif __KERNEL__ -#endif _MD_H --- linux/include/linux/raid0.h.orig Tue Oct 29 14:20:24 1996 +++ linux/include/linux/raid0.h Tue Jan 16 13:42:03 2001 @@ -1,27 +0,0 @@ -#ifndef _RAID0_H -#define _RAID0_H - -struct strip_zone -{ - int zone_offset; /* Zone offset in md_dev */ - int dev_offset; /* Zone offset in real dev */ - int size; /* Zone size */ - int nb_dev; /* Number of devices attached to the zone */ - struct real_dev *dev[MAX_REAL]; /* Devices attached to the zone */ -}; - -struct raid0_hash -{ - struct strip_zone *zone0, *zone1; -}; - -struct raid0_data -{ - struct raid0_hash *hash_table; /* Dynamically allocated */ - struct strip_zone *strip_zone; /* This one too */ - int nr_strip_zones; - struct strip_zone *smallest; - int nr_zones; -}; - -#endif --- linux/include/linux/raid1.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/raid1.h Tue Jan 16 13:42:03 2001 @@ -1,49 +0,0 @@ -#ifndef _RAID1_H -#define _RAID1_H - -#include - -struct mirror_info { - int number; - int raid_disk; - kdev_t dev; - int next; - int sect_limit; - - /* - * State bits: - */ - int operational; - int write_only; - int spare; -}; - -struct raid1_data { - struct md_dev *mddev; - struct mirror_info mirrors[MD_SB_DISKS]; /* RAID1 devices, 2 to MD_SB_DISKS */ - int raid_disks; - int working_disks; /* Number of working disks */ - int last_used; - unsigned long next_sect; - int sect_count; - int resync_running; -}; - -/* - * this is our 'private' 'collective' RAID1 buffer head. - * it contains information about what kind of IO operations were started - * for this RAID5 operation, and about their status: - */ - -struct raid1_bh { - unsigned int remaining; - int cmd; - unsigned long state; - struct md_dev *mddev; - struct buffer_head *master_bh; - struct buffer_head *mirror_bh [MD_SB_DISKS]; - struct buffer_head bh_req; - struct buffer_head *next_retry; -}; - -#endif --- linux/include/linux/raid5.h.orig Fri May 8 09:17:13 1998 +++ linux/include/linux/raid5.h Tue Jan 16 13:42:03 2001 @@ -1,110 +0,0 @@ -#ifndef _RAID5_H -#define _RAID5_H - -#ifdef __KERNEL__ -#include -#include - -struct disk_info { - kdev_t dev; - int operational; - int number; - int raid_disk; - int write_only; - int spare; -}; - -struct stripe_head { - struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ - struct stripe_head *free_next; /* pool of free sh's */ - struct buffer_head *buffer_pool; /* pool of free buffers */ - struct buffer_head *bh_pool; /* pool of free bh's */ - struct raid5_data *raid_conf; - struct buffer_head *bh_old[MD_SB_DISKS]; /* disk image */ - struct buffer_head *bh_new[MD_SB_DISKS]; /* buffers of the MD device (present in buffer cache) */ - struct buffer_head *bh_copy[MD_SB_DISKS]; /* copy on write of bh_new (bh_new can change from under us) */ - struct buffer_head *bh_req[MD_SB_DISKS]; /* copy of bh_new (only the buffer heads), queued to the lower levels */ - int cmd_new[MD_SB_DISKS]; /* READ/WRITE for new */ - int new[MD_SB_DISKS]; /* buffer added since the last handle_stripe() */ - unsigned long sector; /* sector of this row */ - int size; /* buffers size */ - int pd_idx; /* parity disk index */ - int nr_pending; /* nr of pending cmds */ - unsigned long state; /* state flags */ - int cmd; /* stripe cmd */ - int count; /* nr of waiters */ - int write_method; /* reconstruct-write / read-modify-write */ - int phase; /* PHASE_BEGIN, ..., PHASE_COMPLETE */ - struct wait_queue *wait; /* processes waiting for this stripe */ -}; - -/* - * Phase - */ -#define PHASE_BEGIN 0 -#define PHASE_READ_OLD 1 -#define PHASE_WRITE 2 -#define PHASE_READ 3 -#define PHASE_COMPLETE 4 - -/* - * Write method - */ -#define METHOD_NONE 0 -#define RECONSTRUCT_WRITE 1 -#define READ_MODIFY_WRITE 2 - -/* - * Stripe state - */ -#define STRIPE_LOCKED 0 -#define STRIPE_ERROR 1 - -/* - * Stripe commands - */ -#define STRIPE_NONE 0 -#define STRIPE_WRITE 1 -#define STRIPE_READ 2 - -struct raid5_data { - struct stripe_head **stripe_hashtbl; - struct md_dev *mddev; - struct md_thread *thread, *resync_thread; - struct disk_info disks[MD_SB_DISKS]; - struct disk_info *spare; - int buffer_size; - int chunk_size, level, algorithm; - int raid_disks, working_disks, failed_disks; - int sector_count; - unsigned long next_sector; - atomic_t nr_handle; - struct stripe_head *next_free_stripe; - int nr_stripes; - int resync_parity; - int max_nr_stripes; - int clock; - int nr_hashed_stripes; - int nr_locked_stripes; - int nr_pending_stripes; - int nr_cached_stripes; - - /* - * Free stripes pool - */ - int nr_free_sh; - struct stripe_head *free_sh_list; - struct wait_queue *wait_for_stripe; -}; - -#endif - -/* - * Our supported algorithms - */ -#define ALGORITHM_LEFT_ASYMMETRIC 0 -#define ALGORITHM_RIGHT_ASYMMETRIC 1 -#define ALGORITHM_LEFT_SYMMETRIC 2 -#define ALGORITHM_RIGHT_SYMMETRIC 3 - -#endif --- linux/include/linux/sysctl.h.orig Mon Dec 11 01:49:44 2000 +++ linux/include/linux/sysctl.h Tue Jan 16 13:42:03 2001 @@ -435,6 +435,7 @@ enum { DEV_CDROM=1, DEV_HWMON=2, + DEV_MD=3, DEV_MAC_HID=5 }; @@ -446,6 +447,11 @@ DEV_CDROM_DEBUG=4, DEV_CDROM_LOCK=5, DEV_CDROM_CHECK_MEDIA=6 +}; + +/* /proc/sys/dev/md */ +enum { + DEV_MD_SPEED_LIMIT=1 }; /* /proc/sys/dev/mac_hid */ --- linux/include/asm-i386/md.h.orig Fri May 8 09:17:13 1998 +++ linux/include/asm-i386/md.h Tue Jan 16 13:42:03 2001 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:57 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-alpha/md.h.orig Fri May 8 09:17:13 1998 +++ linux/include/asm-alpha/md.h Tue Jan 16 13:42:03 2001 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:11:48 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-m68k/md.h.orig Fri May 8 09:15:22 1998 +++ linux/include/asm-m68k/md.h Tue Jan 16 13:42:04 2001 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:04 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-sparc/md.h.orig Tue Jan 13 00:15:54 1998 +++ linux/include/asm-sparc/md.h Tue Jan 16 13:42:04 2001 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1 1997/12/15 15:12:39 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-ppc/md.h.orig Wed Oct 27 02:53:42 1999 +++ linux/include/asm-ppc/md.h Tue Jan 16 13:42:04 2001 @@ -1,13 +0,0 @@ -/* $Id: md.h,v 1.1.4.1 1999/08/13 18:30:41 davem dead $ - * md.h: High speed xor_block operation for RAID4/5 - * - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -/* #define HAVE_ARCH_XORBLOCK */ - -#define MD_XORBLOCK_ALIGNMENT sizeof(long) - -#endif /* __ASM_MD_H */ --- linux/include/asm-sparc64/md.h.orig Tue Jan 13 00:15:58 1998 +++ linux/include/asm-sparc64/md.h Tue Jan 16 13:42:04 2001 @@ -1,91 +0,0 @@ -/* $Id: md.h,v 1.2 1997/12/27 16:28:38 jj Exp $ - * md.h: High speed xor_block operation for RAID4/5 - * utilizing the UltraSparc Visual Instruction Set. - * - * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#ifndef __ASM_MD_H -#define __ASM_MD_H - -#include -#include - -#define HAVE_ARCH_XORBLOCK - -#define MD_XORBLOCK_ALIGNMENT 64 - -/* void __xor_block (char *dest, char *src, long len) - * { - * while (len--) *dest++ ^= *src++; - * } - * - * Requirements: - * !(((long)dest | (long)src) & (MD_XORBLOCK_ALIGNMENT - 1)) && - * !(len & 127) && len >= 256 - */ - -static inline void __xor_block (char *dest, char *src, long len) -{ - __asm__ __volatile__ (" - wr %%g0, %3, %%fprs - wr %%g0, %4, %%asi - membar #LoadStore|#StoreLoad|#StoreStore - sub %2, 128, %2 - ldda [%0] %4, %%f0 - ldda [%1] %4, %%f16 -1: ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - ldda [%0 + 128] %%asi, %%f0 - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - add %0, 128, %0 - fxor %%f36, %%f52, %%f52 - add %1, 128, %1 - fxor %%f38, %%f54, %%f54 - subcc %2, 128, %2 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 - 64] %%asi - bne,pt %%xcc, 1b - ldda [%1] %4, %%f16 - ldda [%0 + 64] %%asi, %%f32 - fxor %%f0, %%f16, %%f16 - fxor %%f2, %%f18, %%f18 - fxor %%f4, %%f20, %%f20 - fxor %%f6, %%f22, %%f22 - fxor %%f8, %%f24, %%f24 - fxor %%f10, %%f26, %%f26 - fxor %%f12, %%f28, %%f28 - fxor %%f14, %%f30, %%f30 - stda %%f16, [%0] %4 - ldda [%1 + 64] %%asi, %%f48 - membar #Sync - fxor %%f32, %%f48, %%f48 - fxor %%f34, %%f50, %%f50 - fxor %%f36, %%f52, %%f52 - fxor %%f38, %%f54, %%f54 - fxor %%f40, %%f56, %%f56 - fxor %%f42, %%f58, %%f58 - fxor %%f44, %%f60, %%f60 - fxor %%f46, %%f62, %%f62 - stda %%f48, [%0 + 64] %%asi - membar #Sync|#StoreStore|#StoreLoad - wr %%g0, 0, %%fprs - " : : - "r" (dest), "r" (src), "r" (len), "i" (FPRS_FEF), "i" (ASI_BLK_P) : - "cc", "memory"); -} - -#endif /* __ASM_MD_H */ --- linux/drivers/block/Config.in.orig Mon Dec 11 01:49:41 2000 +++ linux/drivers/block/Config.in Tue Jan 16 13:42:04 2001 @@ -103,10 +103,13 @@ fi bool 'Multiple devices driver support' CONFIG_BLK_DEV_MD if [ "$CONFIG_BLK_DEV_MD" = "y" ]; then + bool 'Autodetect RAID partitions' CONFIG_AUTODETECT_RAID tristate ' Linear (append) mode' CONFIG_MD_LINEAR tristate ' RAID-0 (striping) mode' CONFIG_MD_STRIPED tristate ' RAID-1 (mirroring) mode' CONFIG_MD_MIRRORING tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 + tristate ' Translucent mode' CONFIG_MD_TRANSLUCENT + tristate ' Hierarchical Storage Management support' CONFIG_MD_HSM fi if [ "$CONFIG_MD_LINEAR" = "y" -o "$CONFIG_MD_STRIPED" = "y" ]; then bool ' Boot support (linear, striped)' CONFIG_MD_BOOT --- linux/drivers/block/Makefile.orig Mon Dec 11 01:49:41 2000 +++ linux/drivers/block/Makefile Tue Jan 16 13:42:04 2001 @@ -294,10 +294,28 @@ endif ifeq ($(CONFIG_MD_RAID5),y) +LX_OBJS += xor.o L_OBJS += raid5.o else ifeq ($(CONFIG_MD_RAID5),m) + LX_OBJS += xor.o M_OBJS += raid5.o + endif +endif + +ifeq ($(CONFIG_MD_TRANSLUCENT),y) +L_OBJS += translucent.o +else + ifeq ($(CONFIG_MD_TRANSLUCENT),m) + M_OBJS += translucent.o + endif +endif + +ifeq ($(CONFIG_MD_HSM),y) +L_OBJS += hsm.o +else + ifeq ($(CONFIG_MD_HSM),m) + M_OBJS += hsm.o endif endif --- linux/drivers/block/genhd.c.orig Tue Jan 16 13:30:06 2001 +++ linux/drivers/block/genhd.c Tue Jan 16 13:42:04 2001 @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_ARCH_S390 #include @@ -1785,6 +1786,9 @@ else #endif rd_load(); +#endif +#ifdef CONFIG_BLK_DEV_MD + autodetect_raid(); #endif #ifdef CONFIG_MD_BOOT md_setup_drive(); --- linux/drivers/block/hsm.c.orig Tue Jan 16 13:42:04 2001 +++ linux/drivers/block/hsm.c Tue Jan 16 13:42:04 2001 @@ -0,0 +1,840 @@ +/* + hsm.c : HSM RAID driver for Linux + Copyright (C) 1998 Ingo Molnar + + HSM mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include + +#include +#include + +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + + +#define DEBUG_HSM 1 + +#if DEBUG_HSM +#define dprintk(x,y...) printk(x,##y) +#else +#define dprintk(x,y...) do { } while (0) +#endif + +void print_bh(struct buffer_head *bh) +{ + dprintk("bh %p: %lx %lx %x %x %lx %p %lx %p %x %p %x %lx\n", bh, + bh->b_blocknr, bh->b_size, bh->b_dev, bh->b_rdev, + bh->b_rsector, bh->b_this_page, bh->b_state, + bh->b_next_free, bh->b_count, bh->b_data, + bh->b_list, bh->b_flushtime + ); +} + +static int check_bg (pv_t *pv, pv_block_group_t * bg) +{ + int i, free = 0; + + dprintk("checking bg ...\n"); + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + if (pv_pptr_free(bg->blocks + i)) { + free++; + if (test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d set?\n", i); + } + } else { + if (!test_bit(i, bg->used_bitmap)) { + printk("hm, bit %d not set?\n", i); + } + } + } + dprintk("%d free blocks in bg ...\n", free); + return free; +} + +static void get_bg (pv_t *pv, pv_bg_desc_t *desc, int nr) +{ + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + struct buffer_head *bh; + + dprintk("... getting BG at %u ...\n", bg_pos); + + bh = bread (pv->dev, bg_pos, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return; + } + desc->bg = (pv_block_group_t *) bh->b_data; + desc->free_blocks = check_bg(pv, desc->bg); +} + +static int find_free_block (lv_t *lv, pv_t *pv, pv_bg_desc_t *desc, int nr, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + for (i = 0; i < pv->pv_sb->pv_bg_size-1; i++) { + pv_pptr_t * bptr = desc->bg->blocks + i; + if (pv_pptr_free(bptr)) { + unsigned int bg_pos = nr * pv->pv_sb->pv_bg_size + 2; + + if (test_bit(i, desc->bg->used_bitmap)) { + MD_BUG(); + continue; + } + bptr->u.used.owner.log_id = lv->log_id; + bptr->u.used.owner.log_index = lblock; + index->data.phys_nr = pv->phys_nr; + index->data.phys_block = bg_pos + i + 1; + set_bit(i, desc->bg->used_bitmap); + desc->free_blocks--; + dprintk(".....free blocks left in bg %p: %d\n", + desc->bg, desc->free_blocks); + return 0; + } + } + return -ENOSPC; +} + +static int __get_free_block (lv_t *lv, pv_t *pv, + unsigned int lblock, lv_lptr_t * index) +{ + int i; + + dprintk("trying to get free block for lblock %d ...\n", lblock); + + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + pv_bg_desc_t *desc = pv->bg_array + i; + + dprintk("looking at desc #%d (%p)...\n", i, desc->bg); + if (!desc->bg) + get_bg(pv, desc, i); + + if (desc->bg && desc->free_blocks) + return find_free_block(lv, pv, desc, i, + lblock, index); + } + dprintk("hsm: pv %s full!\n", partition_name(pv->dev)); + return -ENOSPC; +} + +static int get_free_block (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + int err; + + if (!lv->free_indices) + return -ENOSPC; + + /* fix me */ + err = __get_free_block(lv, lv->vg->pv_array + 0, lblock, index); + + if (err || !index->data.phys_block) { + MD_BUG(); + return -ENOSPC; + } + + lv->free_indices--; + + return 0; +} + +/* + * fix me: wordsize assumptions ... + */ +#define INDEX_BITS 8 +#define INDEX_DEPTH (32/INDEX_BITS) +#define INDEX_MASK ((1< [.", index->data.phys_nr, + index->data.phys_block, index->cpu_addr); + + tmp = index_child(index); + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) + dprintk("(%d->%d)", i, index_block(lv, tmp)); + tmp++; + } + dprintk(".]\n"); +} + +static int read_index_group (lv_t *lv, lv_lptr_t *index) +{ + lv_lptr_t *index_group, *tmp; + struct buffer_head *bh; + int i; + + dprintk("reading index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = bread(index_dev(lv, index), index_block(lv, index), HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -EIO; + } + if (!buffer_uptodate(bh)) + MD_BUG(); + + index_group = (lv_lptr_t *) bh->b_data; + tmp = index_group; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index_block(lv, tmp)) { + dprintk("index group has BLOCK %d, non-present.\n", i); + tmp->cpu_addr = 0; + } + tmp++; + } + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("have read index group %p at block %d.\n", + index_group, index_block(lv, index)); + print_index_list(lv, index); + + return 0; +} + +static int alloc_index_group (lv_t *lv, unsigned int lblock, lv_lptr_t * index) +{ + struct buffer_head *bh; + lv_lptr_t * index_group; + + if (get_free_block(lv, lblock, index)) + return -ENOSPC; + + dprintk("creating block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), + index_block(lv, index), HSM_BLOCKSIZE); + + index_group = (lv_lptr_t *) bh->b_data; + md_clear_page(index_group); + mark_buffer_uptodate(bh, 1); + + index->cpu_addr = ptr_to_cpuaddr(index_group); + + dprintk("allocated index group %p at block %d.\n", + index_group, index_block(lv, index)); + return 0; +} + +static lv_lptr_t * alloc_fixed_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (!index_present(index)) { + dprintk("no group, level %u, pos %u\n", l, idx); + if (alloc_index_group(lv, lblock, index)) + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) { + dprintk("no data, pos %u\n", idx); + if (get_free_block(lv, lblock, index)) + return NULL; + return index; + } + MD_BUG(); + return index; +} + +static lv_lptr_t * find_index (lv_t *lv, unsigned int lblock) +{ + lv_lptr_t * index = index_child(&lv->root_index); + int idx, l; + + for (l = INDEX_DEPTH-1; l >= 0; l--) { + idx = (lblock >> (INDEX_BITS*l)) & INDEX_MASK; + index += idx; + if (!l) + break; + if (index_free(index)) + return NULL; + if (!index_present(index)) + read_index_group(lv, index); + if (!index_present(index)) { + MD_BUG(); + return NULL; + } + index = index_child(index); + } + if (!index_block(lv,index)) + return NULL; + return index; +} + +static int read_root_index(lv_t *lv) +{ + int err; + lv_lptr_t *index = &lv->root_index; + + if (!index_block(lv, index)) { + printk("LV has no root index yet, creating.\n"); + + err = alloc_index_group (lv, 0, index); + if (err) { + printk("could not create index group, err:%d\n", err); + return err; + } + lv->vg->vg_sb->lv_array[lv->log_id].lv_root_idx = + lv->root_index.data; + } else { + printk("LV already has a root index.\n"); + printk("... at <%s:%d>.\n", + partition_name(index_dev(lv, index)), + index_block(lv, index)); + + read_index_group(lv, index); + } + return 0; +} + +static int init_pv(pv_t *pv) +{ + struct buffer_head *bh; + pv_sb_t *pv_sb; + + bh = bread (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + pv_sb = (pv_sb_t *) bh->b_data; + pv->pv_sb = pv_sb; + + if (pv_sb->pv_magic != HSM_PV_SB_MAGIC) { + printk("%s is not a PV, has magic %x instead of %x!\n", + partition_name(pv->dev), pv_sb->pv_magic, + HSM_PV_SB_MAGIC); + return -1; + } + printk("%s detected as a valid PV (#%d).\n", partition_name(pv->dev), + pv->phys_nr); + printk("... created under HSM version %d.%d.%d, at %x.\n", + pv_sb->pv_major, pv_sb->pv_minor, pv_sb->pv_patch, pv_sb->pv_ctime); + printk("... total # of blocks: %d (%d left unallocated).\n", + pv_sb->pv_total_size, pv_sb->pv_blocks_left); + + printk("... block size: %d bytes.\n", pv_sb->pv_block_size); + printk("... block descriptor size: %d bytes.\n", pv_sb->pv_pptr_size); + printk("... block group size: %d blocks.\n", pv_sb->pv_bg_size); + printk("... # of block groups: %d.\n", pv_sb->pv_block_groups); + + if (pv_sb->pv_block_groups*sizeof(pv_bg_desc_t) > PAGE_SIZE) { + MD_BUG(); + return 1; + } + pv->bg_array = (pv_bg_desc_t *)__get_free_page(GFP_KERNEL); + if (!pv->bg_array) { + MD_BUG(); + return 1; + } + memset(pv->bg_array, 0, PAGE_SIZE); + + return 0; +} + +static int free_pv(pv_t *pv) +{ + struct buffer_head *bh; + + dprintk("freeing PV %d ...\n", pv->phys_nr); + + if (pv->bg_array) { + int i; + + dprintk(".... freeing BGs ...\n"); + for (i = 0; i < pv->pv_sb->pv_block_groups; i++) { + unsigned int bg_pos = i * pv->pv_sb->pv_bg_size + 2; + pv_bg_desc_t *desc = pv->bg_array + i; + + if (desc->bg) { + dprintk(".... freeing BG %d ...\n", i); + bh = getblk (pv->dev, bg_pos, HSM_BLOCKSIZE); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + } + } + free_page((unsigned long)pv->bg_array); + } else + MD_BUG(); + + bh = getblk (pv->dev, 0, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + + return 0; +} + +struct semaphore hsm_sem = MUTEX; + +#define HSM_SECTORS (HSM_BLOCKSIZE/512) + +static int hsm_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, + unsigned long *rsector, unsigned long bsectors) +{ + lv_t *lv = kdev_to_lv(dev); + lv_lptr_t *index; + unsigned int lblock = *rsector / HSM_SECTORS; + unsigned int offset = *rsector % HSM_SECTORS; + int err = -EIO; + + if (!lv) { + printk("HSM: md%d not a Logical Volume!\n", mdidx(mddev)); + goto out; + } + if (offset + bsectors > HSM_SECTORS) { + MD_BUG(); + goto out; + } + down(&hsm_sem); + index = find_index(lv, lblock); + if (!index) { + printk("no block %u yet ... allocating\n", lblock); + index = alloc_fixed_index(lv, lblock); + } + + err = 0; + + printk(" %u <%s : %ld(%ld)> -> ", lblock, + partition_name(*rdev), *rsector, bsectors); + + *rdev = index_dev(lv, index); + *rsector = index_block(lv, index) * HSM_SECTORS + offset; + + printk(" <%s : %ld> %u\n", + partition_name(*rdev), *rsector, index_block(lv, index)); + + up(&hsm_sem); +out: + return err; +} + +static void free_index (lv_t *lv, lv_lptr_t * index) +{ + struct buffer_head *bh; + + printk("tryin to get cached block for index group <%s:%d>\n", + partition_name(index_dev(lv, index)), index_block(lv, index)); + + bh = getblk(index_dev(lv, index), index_block(lv, index),HSM_BLOCKSIZE); + + printk("....FREEING "); + print_index_list(lv, index); + + if (bh) { + if (!buffer_uptodate(bh)) + MD_BUG(); + if ((lv_lptr_t *)bh->b_data != index_child(index)) { + printk("huh? b_data is %p, index content is %p.\n", + bh->b_data, index_child(index)); + } else + printk("good, b_data == index content == %p.\n", + index_child(index)); + printk("b_count == %d, writing.\n", bh->b_count); + mark_buffer_dirty(bh, 1); + brelse(bh); + brelse(bh); + printk("done.\n"); + } else { + printk("FAILED!\n"); + } + print_index_list(lv, index); + index_child(index) = NULL; +} + +static void free_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*8]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + index = index_0; + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + free_index_group(lv, level-1, + index_child(index)); + dprintk("%s freeing index group block %p ...", + dots, index_child(index)); + free_index(lv, index); + } + } + index++; + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void free_lv_indextree (lv_t *lv) +{ + dprintk("freeing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + free_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..freeing root index %p ...", index_child(&lv->root_index)); + dprintk("root block <%u,%u,%x>\n", lv->root_index.data.phys_nr, + lv->root_index.data.phys_block, lv->root_index.cpu_addr); + free_index(lv, &lv->root_index); + dprintk("..INDEX TREE done.\n"); + fsync_dev(lv->vg->pv_array[0].dev); /* fix me */ + lv->vg->vg_sb->lv_array[lv->log_id].lv_free_indices = lv->free_indices; +} + +static void print_index_group (lv_t *lv, int level, lv_lptr_t * index_0) +{ + char dots [3*5]; + lv_lptr_t * index; + int i, nr_dots; + + nr_dots = (INDEX_DEPTH-level)*3; + memcpy(dots,"...............",nr_dots); + dots[nr_dots] = 0; + + dprintk("%s level %d index group block:\n", dots, level); + + + for (i = 0; i < HSM_LPTRS_PER_BLOCK; i++) { + index = index_0 + i; + if (index->data.phys_block) { + dprintk("%s block <%u,%u,%x>\n", dots, + index->data.phys_nr, + index->data.phys_block, + index->cpu_addr); + if (level && index_present(index)) { + dprintk("%s==> deeper one level\n", dots); + print_index_group(lv, level-1, + index_child(index)); + } + } + } + dprintk("%s DONE: level %d index group block.\n", dots, level); +} + +static void print_lv (lv_t *lv) +{ + dprintk("printing LV %d ...\n", lv->log_id); + dprintk("..root index: %p\n", index_child(&lv->root_index)); + dprintk("..INDEX TREE:\n"); + print_index_group(lv, INDEX_DEPTH-1, index_child(&lv->root_index)); + dprintk("..INDEX TREE done.\n"); +} + +static int map_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + mddev_t *mddev = lv->vg->mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + if (kdev_to_mddev(dev)) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = md_size[mdidx(mddev)] << 1; + md_size[nr] = md_size[mdidx(mddev)]; + add_mddev_mapping(mddev, dev, lv); + + return 0; +} + +static int unmap_lv (lv_t *lv) +{ + kdev_t dev = lv->dev; + unsigned int nr = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return -1; + } + md_hd_struct[nr].start_sect = 0; + md_hd_struct[nr].nr_sects = 0; + md_size[nr] = 0; + del_mddev_mapping(lv->vg->mddev, dev); + + return 0; +} + +static int init_vg (vg_t *vg) +{ + int i; + lv_t *lv; + kdev_t dev; + vg_sb_t *vg_sb; + struct buffer_head *bh; + lv_descriptor_t *lv_desc; + + /* + * fix me: read all PVs and compare the SB + */ + dev = vg->pv_array[0].dev; + bh = bread (dev, 1, HSM_BLOCKSIZE); + if (!bh) { + MD_BUG(); + return -1; + } + + vg_sb = (vg_sb_t *) bh->b_data; + vg->vg_sb = vg_sb; + + if (vg_sb->vg_magic != HSM_VG_SB_MAGIC) { + printk("%s is not a valid VG, has magic %x instead of %x!\n", + partition_name(dev), vg_sb->vg_magic, + HSM_VG_SB_MAGIC); + return -1; + } + + vg->nr_lv = 0; + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + unsigned int id; + lv_desc = vg->vg_sb->lv_array + i; + + id = lv_desc->lv_id; + if (!id) { + printk("... LV desc %d empty\n", i); + continue; + } + if (id >= HSM_MAX_LVS_PER_VG) { + MD_BUG(); + continue; + } + + lv = vg->lv_array + id; + if (lv->vg) { + MD_BUG(); + continue; + } + lv->log_id = id; + lv->vg = vg; + lv->max_indices = lv_desc->lv_max_indices; + lv->free_indices = lv_desc->lv_free_indices; + lv->root_index.data = lv_desc->lv_root_idx; + lv->dev = MKDEV(MD_MAJOR, lv_desc->md_id); + + vg->nr_lv++; + + map_lv(lv); + if (read_root_index(lv)) { + vg->nr_lv--; + unmap_lv(lv); + memset(lv, 0, sizeof(*lv)); + } + } + if (vg->nr_lv != vg_sb->nr_lvs) + MD_BUG(); + + return 0; +} + +static int hsm_run (mddev_t *mddev) +{ + int i; + vg_t *vg; + mdk_rdev_t *rdev; + + MOD_INC_USE_COUNT; + + vg = kmalloc (sizeof (*vg), GFP_KERNEL); + if (!vg) + goto out; + memset(vg, 0, sizeof(*vg)); + mddev->private = vg; + vg->mddev = mddev; + + if (md_check_ordering(mddev)) { + printk("hsm: disks are not ordered, aborting!\n"); + goto out; + } + + set_blocksize (mddev_to_kdev(mddev), HSM_BLOCKSIZE); + + vg->nr_pv = mddev->nb_dev; + ITERATE_RDEV_ORDERED(mddev,rdev,i) { + pv_t *pv = vg->pv_array + i; + + pv->dev = rdev->dev; + fsync_dev (pv->dev); + set_blocksize (pv->dev, HSM_BLOCKSIZE); + pv->phys_nr = i; + if (init_pv(pv)) + goto out; + } + + init_vg(vg); + + return 0; + +out: + if (vg) { + kfree(vg); + mddev->private = NULL; + } + MOD_DEC_USE_COUNT; + + return 1; +} + +static int hsm_stop (mddev_t *mddev) +{ + lv_t *lv; + vg_t *vg; + int i; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + print_lv(lv); + free_lv_indextree(lv); + unmap_lv(lv); + } + for (i = 0; i < vg->nr_pv; i++) + free_pv(vg->pv_array + i); + + kfree(vg); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static int hsm_status (char *page, mddev_t *mddev) +{ + int sz = 0, i; + lv_t *lv; + vg_t *vg; + + vg = mddev_to_vg(mddev); + + for (i = 0; i < HSM_MAX_LVS_PER_VG; i++) { + lv = vg->lv_array + i; + if (!lv->log_id) + continue; + sz += sprintf(page+sz, " ", lv->log_id, + lv->max_indices - lv->free_indices, lv->max_indices); + } + return sz; +} + + +static mdk_personality_t hsm_personality= +{ + "hsm", + hsm_map, + NULL, + NULL, + hsm_run, + hsm_stop, + hsm_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL +}; + +#ifndef MODULE + +md__initfunc(void hsm_init (void)) +{ + register_md_personality (HSM, &hsm_personality); +} + +#else + +int init_module (void) +{ + return (register_md_personality (HSM, &hsm_personality)); +} + +void cleanup_module (void) +{ + unregister_md_personality (HSM); +} + +#endif + +/* + * This Linus-trick catches bugs via the linker. + */ + +extern void __BUG__in__hsm_dot_c_1(void); +extern void __BUG__in__hsm_dot_c_2(void); +extern void __BUG__in__hsm_dot_c_3(void); +extern void __BUG__in__hsm_dot_c_4(void); +extern void __BUG__in__hsm_dot_c_5(void); +extern void __BUG__in__hsm_dot_c_6(void); +extern void __BUG__in__hsm_dot_c_7(void); + +void bugcatcher (void) +{ + if (sizeof(pv_block_group_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_1(); + if (sizeof(lv_index_block_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_2(); + + if (sizeof(pv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_4(); + if (sizeof(lv_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_3(); + if (sizeof(vg_sb_t) != HSM_BLOCKSIZE) + __BUG__in__hsm_dot_c_6(); + + if (sizeof(lv_lptr_t) != 16) + __BUG__in__hsm_dot_c_5(); + if (sizeof(pv_pptr_t) != 16) + __BUG__in__hsm_dot_c_6(); +} + --- linux/drivers/block/linear.c.orig Sat Nov 8 20:39:12 1997 +++ linux/drivers/block/linear.c Tue Jan 16 13:42:04 2001 @@ -1,4 +1,3 @@ - /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -19,186 +18,207 @@ #include -#include +#include #include -#include -#include "linear.h" +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY -static int linear_run (int minor, struct md_dev *mddev) +static int linear_run (mddev_t *mddev) { - int cur=0, i, size, dev0_size, nb_zone; - struct linear_data *data; - - MOD_INC_USE_COUNT; - - mddev->private=kmalloc (sizeof (struct linear_data), GFP_KERNEL); - data=(struct linear_data *) mddev->private; - - /* - Find out the smallest device. This was previously done - at registry time, but since it violates modularity, - I moved it here... Any comment ? ;-) - */ - - data->smallest=mddev->devices; - for (i=1; inb_dev; i++) - if (data->smallest->size > mddev->devices[i].size) - data->smallest=mddev->devices+i; - - nb_zone=data->nr_zones= - md_size[minor]/data->smallest->size + - (md_size[minor]%data->smallest->size ? 1 : 0); - - data->hash_table=kmalloc (sizeof (struct linear_hash)*nb_zone, GFP_KERNEL); - - size=mddev->devices[cur].size; + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + /* + * Find the smallest device. + */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = + md_size[mdidx(mddev)] / conf->smallest->size + + ((md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table->dev1 = disk; + table++; + } + size += disk->size; + + while (size) { + table->dev0 = disk; + size -= conf->smallest->size; + if (size < 0) + break; + table->dev1 = NULL; + table++; + } + } + table->dev1 = NULL; + + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); - i=0; - while (curnb_dev) - { - data->hash_table[i].dev0=mddev->devices+cur; + MOD_DEC_USE_COUNT; - if (size>=data->smallest->size) /* If we completely fill the slot */ - { - data->hash_table[i++].dev1=NULL; - size-=data->smallest->size; - - if (!size) - { - if (++cur==mddev->nb_dev) continue; - size=mddev->devices[cur].size; - } - - continue; - } - - if (++cur==mddev->nb_dev) /* Last dev, set dev1 as NULL */ - { - data->hash_table[i].dev1=NULL; - continue; - } - - dev0_size=size; /* Here, we use a 2nd dev to fill the slot */ - size=mddev->devices[cur].size; - data->hash_table[i++].dev1=mddev->devices+cur; - size-=(data->smallest->size - dev0_size); - } - - return 0; -} - -static int linear_stop (int minor, struct md_dev *mddev) -{ - struct linear_data *data=(struct linear_data *) mddev->private; - - kfree (data->hash_table); - kfree (data); - - MOD_DEC_USE_COUNT; - - return 0; + return 0; } -static int linear_map (struct md_dev *mddev, kdev_t *rdev, +static int linear_map (mddev_t *mddev, kdev_t dev, kdev_t *rdev, unsigned long *rsector, unsigned long size) { - struct linear_data *data=(struct linear_data *) mddev->private; - struct linear_hash *hash; - struct real_dev *tmp_dev; - long block; - - block=*rsector >> 1; - hash=data->hash_table+(block/data->smallest->size); - - if (block >= (hash->dev0->size + hash->dev0->offset)) - { - if (!hash->dev1) - { - printk ("linear_map : hash->dev1==NULL for block %ld\n", block); - return (-1); - } - - tmp_dev=hash->dev1; - } - else - tmp_dev=hash->dev0; + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + long block; + + block = *rsector >> 1; + hash = conf->hash_table + (block / conf->smallest->size); + + if (block >= (hash->dev0->size + hash->dev0->offset)) + { + if (!hash->dev1) + { + printk ("linear_map : hash->dev1==NULL for block %ld\n", + block); + return -1; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; - if (block >= (tmp_dev->size + tmp_dev->offset) || block < tmp_dev->offset) - printk ("Block %ld out of bounds on dev %s size %d offset %d\n", - block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); + if (block >= (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) + printk ("Block %ld out of bounds on dev %s size %d offset %d\n", + block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); - *rdev=tmp_dev->dev; - *rsector=(block-(tmp_dev->offset)) << 1; + *rdev = tmp_dev->dev; + *rsector = (block - tmp_dev->offset) << 1; - return (0); + return 0; } -static int linear_status (char *page, int minor, struct md_dev *mddev) +static int linear_status (char *page, mddev_t *mddev) { - int sz=0; + int sz=0; #undef MD_DEBUG #ifdef MD_DEBUG - int j; - struct linear_data *data=(struct linear_data *) mddev->private; + int j; + linear_conf_t *conf = mddev_to_conf(mddev); - sz+=sprintf (page+sz, " "); - for (j=0; jnr_zones; j++) - { - sz+=sprintf (page+sz, "[%s", - partition_name (data->hash_table[j].dev0->dev)); - - if (data->hash_table[j].dev1) - sz+=sprintf (page+sz, "/%s] ", - partition_name(data->hash_table[j].dev1->dev)); - else - sz+=sprintf (page+sz, "] "); - } - - sz+=sprintf (page+sz, "\n"); + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); #endif - sz+=sprintf (page+sz, " %dk rounding", 1<param.chunk_size/1024); + return sz; } -static struct md_personality linear_personality= +static mdk_personality_t linear_personality= { - "linear", - linear_map, - NULL, - NULL, - linear_run, - linear_stop, - linear_status, - NULL, /* no ioctls */ - 0 + "linear", + linear_map, + NULL, + NULL, + linear_run, + linear_stop, + linear_status, + NULL, + 0, + NULL, + NULL, + NULL, + NULL }; - #ifndef MODULE -__initfunc(void linear_init (void)) +md__initfunc(void linear_init (void)) { - register_md_personality (LINEAR, &linear_personality); + register_md_personality (LINEAR, &linear_personality); } #else int init_module (void) { - return (register_md_personality (LINEAR, &linear_personality)); + return (register_md_personality (LINEAR, &linear_personality)); } void cleanup_module (void) { - unregister_md_personality (LINEAR); + unregister_md_personality (LINEAR); } #endif + --- linux/drivers/block/linear.h.orig Fri Nov 22 15:07:23 1996 +++ linux/drivers/block/linear.h Tue Jan 16 13:42:04 2001 @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct linear_hash -{ - struct real_dev *dev0, *dev1; -}; - -struct linear_data -{ - struct linear_hash *hash_table; /* Dynamically allocated */ - struct real_dev *smallest; - int nr_zones; -}; - -#endif --- linux/drivers/block/ll_rw_blk.c.orig Mon Dec 11 01:49:41 2000 +++ linux/drivers/block/ll_rw_blk.c Tue Jan 16 13:42:04 2001 @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -53,6 +54,11 @@ spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; /* + * per-major idle-IO detection + */ +unsigned long io_events[MAX_BLKDEV] = {0, }; + +/* * used to wait on when there are no free requests */ struct wait_queue * wait_for_request; @@ -583,6 +589,8 @@ return; /* Maybe the above fixes it, and maybe it doesn't boot. Life is interesting */ lock_buffer(bh); + if (!buffer_lowprio(bh)) + io_events[major]++; if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; @@ -832,7 +840,7 @@ bh[i]->b_rsector=bh[i]->b_blocknr*(bh[i]->b_size >> 9); #ifdef CONFIG_BLK_DEV_MD if (major==MD_MAJOR && - md_map (MINOR(bh[i]->b_dev), &bh[i]->b_rdev, + md_map (bh[i]->b_dev, &bh[i]->b_rdev, &bh[i]->b_rsector, bh[i]->b_size >> 9)) { printk (KERN_ERR "Bad md_map in ll_rw_block\n"); @@ -852,7 +860,7 @@ set_bit(BH_Req, &bh[i]->b_state); #ifdef CONFIG_BLK_DEV_MD if (MAJOR(bh[i]->b_dev) == MD_MAJOR) { - md_make_request(MINOR (bh[i]->b_dev), rw, bh[i]); + md_make_request(bh[i], rw); continue; } #endif --- linux/drivers/block/md.c.orig Mon Sep 4 19:39:16 2000 +++ linux/drivers/block/md.c Tue Jan 16 13:42:04 2001 @@ -1,21 +1,17 @@ - /* md.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - + Copyright (C) 1998, 1999 Ingo Molnar - A lot of inspiration came from hd.c ... + completely rewritten, based on the MD driver code from Marc Zyngier - kerneld support by Boris Tobotras - boot support for linear and striped mode by Harald Hoyer + Changes: - RAID-1/RAID-5 extensions by: - Ingo Molnar, Miguel de Icaza, Gadi Oxman + - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar + - boot support for linear and striped mode by Harald Hoyer + - kerneld support by Boris Tobotras + - kmod support by: Cyrus Durgin + - RAID0 bugfixes: Mark Anthony Lisher - Changes for kmod by: - Cyrus Durgin - This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -26,807 +22,3007 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* - * Current RAID-1,4,5 parallel reconstruction speed limit is 1024 KB/sec, so - * the extra system load does not show up that much. Increase it if your - * system can take more. - */ -#define SPEED_LIMIT 1024 +#include +#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #ifdef CONFIG_KMOD #include #endif -#include -#include #define __KERNEL_SYSCALLS__ #include +#include + +extern asmlinkage int sys_sched_yield(void); +extern asmlinkage int sys_setsid(void); + +extern unsigned long io_events[MAX_BLKDEV]; + #define MAJOR_NR MD_MAJOR #define MD_DRIVER #include -#include -#include -#include #ifdef CONFIG_MD_BOOT -extern kdev_t name_to_kdev_t(char *line) __init; +extern kdev_t name_to_kdev_t(char *line) md__init; #endif -static struct hd_struct md_hd_struct[MAX_MD_DEV]; -static int md_blocksizes[MAX_MD_DEV]; -int md_maxreadahead[MAX_MD_DEV]; -#if SUPPORT_RECONSTRUCTION -static struct md_thread *md_sync_thread = NULL; -#endif /* SUPPORT_RECONSTRUCTION */ +static mdk_personality_t *pers[MAX_PERSONALITY] = {NULL, }; + +/* + * these have to be allocated separately because external + * subsystems want to have a pre-defined structure + */ +struct hd_struct md_hd_struct[MAX_MD_DEVS]; +static int md_blocksizes[MAX_MD_DEVS]; +static int md_maxreadahead[MAX_MD_DEVS]; +static mdk_thread_t *md_recovery_thread = NULL; -int md_size[MAX_MD_DEV]={0, }; +int md_size[MAX_MD_DEVS] = {0, }; static void md_geninit (struct gendisk *); static struct gendisk md_gendisk= { - MD_MAJOR, - "md", - 0, - 1, - MAX_MD_DEV, - md_geninit, - md_hd_struct, - md_size, - MAX_MD_DEV, - NULL, - NULL + MD_MAJOR, + "md", + 0, + 1, + MAX_MD_DEVS, + md_geninit, + md_hd_struct, + md_size, + MAX_MD_DEVS, + NULL, + NULL }; -static struct md_personality *pers[MAX_PERSONALITY]={NULL, }; -struct md_dev md_dev[MAX_MD_DEV]; - -int md_thread(void * arg); +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. + * + * you can change it via /proc/sys/dev/speed-limit + */ -static struct gendisk *find_gendisk (kdev_t dev) -{ - struct gendisk *tmp=gendisk_head; +static int sysctl_speed_limit = 100; - while (tmp != NULL) - { - if (tmp->major==MAJOR(dev)) - return (tmp); - - tmp=tmp->next; - } +static struct ctl_table_header *md_table_header; - return (NULL); -} +static ctl_table md_table[] = { + {DEV_MD_SPEED_LIMIT, "speed-limit", + &sysctl_speed_limit, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; -char *partition_name (kdev_t dev) -{ - static char name[40]; /* This should be long - enough for a device name ! */ - struct gendisk *hd = find_gendisk (dev); +static ctl_table md_dir_table[] = { + {DEV_MD, "md", NULL, 0, 0555, md_table}, + {0} +}; - if (!hd) - { - sprintf (name, "[dev %s]", kdevname(dev)); - return (name); - } +static ctl_table md_root_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, md_dir_table}, + {0} +}; - return disk_name (hd, MINOR(dev), name); /* routine in genhd.c */ +static void md_register_sysctl(void) +{ + md_table_header = register_sysctl_table(md_root_table, 1); } -static int legacy_raid_sb (int minor, int pnum) +void md_unregister_sysctl(void) { - int i, factor; + unregister_sysctl_table(md_table_header); +} + +/* + * The mapping between kdev and mddev is not necessary a simple + * one! Eg. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev. + */ +dev_mapping_t mddev_map [MAX_MD_DEVS] = { {NULL, 0}, }; - factor = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); +void add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); - /***** - * do size and offset calculations. - */ - for (i=0; i> PERSONALITY_SHIFT) - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * md_dev[minor].nb_dev; - return 0; + if (mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + mddev_map[minor].mddev = mddev; + mddev_map[minor].data = data; } -static void free_sb (struct md_dev *mddev) +void del_mddev_mapping (mddev_t * mddev, kdev_t dev) { - int i; - struct real_dev *realdev; + unsigned int minor = MINOR(dev); - if (mddev->sb) { - free_page((unsigned long) mddev->sb); - mddev->sb = NULL; + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; } - for (i = 0; i nb_dev; i++) { - realdev = mddev->devices + i; - if (realdev->sb) { - free_page((unsigned long) realdev->sb); - realdev->sb = NULL; - } + if (mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; } + mddev_map[minor].mddev = NULL; + mddev_map[minor].data = NULL; } /* - * Check one RAID superblock for generic plausibility + * Enables to iterate over all existing md arrays */ +static MD_LIST_HEAD(all_mddevs); -#define BAD_MAGIC KERN_ERR \ -"md: %s: invalid raid superblock magic (%x) on block %u\n" +static mddev_t * alloc_mddev (kdev_t dev) +{ + mddev_t * mddev; -#define OUT_OF_MEM KERN_ALERT \ -"md: out of memory.\n" + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); -#define NO_DEVICE KERN_ERR \ -"md: disabled device %s\n" + mddev->__minor = MINOR(dev); + mddev->reconfig_sem = MUTEX; + mddev->recovery_sem = MUTEX; + mddev->resync_sem = MUTEX; + MD_INIT_LIST_HEAD(&mddev->disks); + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. + */ + add_mddev_mapping(mddev, dev, 0); + md_list_add(&mddev->all_mddevs, &all_mddevs); -#define SUCCESS 0 -#define FAILURE -1 + return mddev; +} -static int analyze_one_sb (struct real_dev * rdev) +static void free_mddev (mddev_t *mddev) { - int ret = FAILURE; - struct buffer_head *bh; - kdev_t dev = rdev->dev; - md_superblock_t *sb; + if (!mddev) { + MD_BUG(); + return; + } /* - * Read the superblock, it's at the end of the disk + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) */ - rdev->sb_offset = MD_NEW_SIZE_BLOCKS (blk_size[MAJOR(dev)][MINOR(dev)]); - set_blocksize (dev, MD_SB_BYTES); - bh = bread (dev, rdev->sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - - if (bh) { - sb = (md_superblock_t *) bh->b_data; - if (sb->md_magic != MD_SB_MAGIC) { - printk (BAD_MAGIC, kdevname(dev), - sb->md_magic, rdev->sb_offset); - goto abort; - } - rdev->sb = (md_superblock_t *) __get_free_page(GFP_KERNEL); - if (!rdev->sb) { - printk (OUT_OF_MEM); - goto abort; - } - memcpy (rdev->sb, bh->b_data, MD_SB_BYTES); + while (md_atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (md_atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); - rdev->size = sb->size; - } else - printk (NO_DEVICE,kdevname(rdev->dev)); - ret = SUCCESS; -abort: - if (bh) - brelse (bh); - return ret; + del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + md_list_del(&mddev->all_mddevs); + MD_INIT_LIST_HEAD(&mddev->all_mddevs); + kfree(mddev); } -#undef SUCCESS -#undef FAILURE - -#undef BAD_MAGIC -#undef OUT_OF_MEM -#undef NO_DEVICE -/* - * Check a full RAID array for plausibility - */ +struct gendisk * find_gendisk (kdev_t dev) +{ + struct gendisk *tmp = gendisk_head; -#define INCONSISTENT KERN_ERR \ -"md: superblock inconsistency -- run ckraid\n" + while (tmp != NULL) { + if (tmp->major == MAJOR(dev)) + return (tmp); + tmp = tmp->next; + } + return (NULL); +} -#define OUT_OF_DATE KERN_ERR \ -"md: superblock update time inconsistenty -- using the most recent one\n" +mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct md_list_head *tmp; -#define OLD_VERSION KERN_ALERT \ -"md: %s: unsupported raid array version %d.%d.%d\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} -#define NOT_CLEAN KERN_ERR \ -"md: %s: raid array is not clean -- run ckraid\n" +mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; -#define NOT_CLEAN_IGNORE KERN_ERR \ -"md: %s: raid array is not clean -- reconstructing parity\n" + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} -#define UNKNOWN_LEVEL KERN_ERR \ -"md: %s: unsupported raid level %d\n" +static MD_LIST_HEAD(device_names); -static int analyze_sbs (int minor, int pnum) +char * partition_name (kdev_t dev) { - struct md_dev *mddev = md_dev + minor; - int i, N = mddev->nb_dev, out_of_date = 0; - struct real_dev * disks = mddev->devices; - md_superblock_t *sb, *freshest = NULL; + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct md_list_head *tmp = device_names.next; - /* - * RAID-0 and linear don't use a RAID superblock - */ - if (pnum == RAID0 >> PERSONALITY_SHIFT || - pnum == LINEAR >> PERSONALITY_SHIFT) - return legacy_raid_sb (minor, pnum); + while (tmp != &device_names) { + dname = md_list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + if (!dname) + return nomem; /* - * Verify the RAID superblock on each real device + * ok, add this new device name to the list */ - for (i = 0; i < N; i++) - if (analyze_one_sb(disks+i)) - goto abort; + hd = find_gendisk (dev); + + if (!hd) + sprintf (dname->name, "[dev %s]", kdevname(dev)); + else + disk_name (hd, MINOR(dev), dname->name); + + dname->dev = dev; + md_list_add(&dname->list, &device_names); + + return dname->name; +} + +static unsigned int calc_dev_sboffset (kdev_t dev, mddev_t *mddev, + int persistent) +{ + unsigned int size = 0; + + if (blk_size[MAJOR(dev)]) + size = blk_size[MAJOR(dev)][MINOR(dev)]; + if (persistent) + size = MD_NEW_SIZE_BLOCKS(size); + return size; +} + +static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) +{ + unsigned int size; + + size = calc_dev_sboffset(dev, mddev, persistent); + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +/* + * We check wether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. + */ +int md_check_ordering (mddev_t *mddev) +{ + int i, c; + mdk_rdev_t *rdev; + struct md_list_head *tmp; /* - * The superblock constant part has to be the same - * for all disks in the array. + * First, all devices must be fully functional */ - sb = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!sb) { - sb = disks[i].sb; - continue; - } - if (memcmp(sb, - disks[i].sb, MD_SB_GENERIC_CONSTANT_WORDS * 4)) { - printk (INCONSISTENT); + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + printk("md: md%d's device %s faulty, aborting.\n", + mdidx(mddev), partition_name(rdev->dev)); goto abort; } } - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array. - */ - if ((sb = mddev->sb = (md_superblock_t *) __get_free_page (GFP_KERNEL)) == NULL) + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); goto abort; - freshest = NULL; - for (i = 0; i < N; i++) { - if (!disks[i].sb) - continue; - if (!freshest) { - freshest = disks[i].sb; - continue; - } - /* - * Find the newest superblock version - */ - if (disks[i].sb->utime != freshest->utime) { - out_of_date = 1; - if (disks[i].sb->utime > freshest->utime) - freshest = disks[i].sb; - } } - if (out_of_date) - printk(OUT_OF_DATE); - memcpy (sb, freshest, sizeof(*freshest)); - - /* - * Check if we can support this RAID array - */ - if (sb->major_version != MD_MAJOR_VERSION || - sb->minor_version > MD_MINOR_VERSION) { - - printk (OLD_VERSION, kdevname(MKDEV(MD_MAJOR, minor)), - sb->major_version, sb->minor_version, - sb->patch_version); + if (mddev->nb_dev != mddev->sb->raid_disks) { + printk("md: md%d, array needs %d disks, has %d, aborting.\n", + mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); goto abort; } - /* - * We need to add this as a superblock option. + * Now the numbering check */ -#if SUPPORT_RECONSTRUCTION - if (sb->state != (1 << MD_SB_CLEAN)) { - if (sb->level == 1) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (c == 0) { + printk("md: md%d, missing disk #%d, aborting.\n", + mdidx(mddev), i); goto abort; - } else - printk (NOT_CLEAN_IGNORE, kdevname(MKDEV(MD_MAJOR, minor))); - } -#else - if (sb->state != (1 << MD_SB_CLEAN)) { - printk (NOT_CLEAN, kdevname(MKDEV(MD_MAJOR, minor))); - goto abort; - } -#endif /* SUPPORT_RECONSTRUCTION */ - - switch (sb->level) { - case 1: - md_size[minor] = sb->size; - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD; - break; - case 4: - case 5: - md_size[minor] = sb->size * (sb->raid_disks - 1); - md_maxreadahead[minor] = MD_DEFAULT_DISK_READAHEAD * (sb->raid_disks - 1); - break; - default: - printk (UNKNOWN_LEVEL, kdevname(MKDEV(MD_MAJOR, minor)), - sb->level); + } + if (c > 1) { + printk("md: md%d, too many disks #%d, aborting.\n", + mdidx(mddev), i); goto abort; + } } return 0; abort: - free_sb(mddev); return 1; } -#undef INCONSISTENT -#undef OUT_OF_DATE -#undef OLD_VERSION -#undef NOT_CLEAN -#undef OLD_LEVEL - -int md_update_sb(int minor) +static unsigned int zoned_raid_size (mddev_t *mddev) { - struct md_dev *mddev = md_dev + minor; - struct buffer_head *bh; - md_superblock_t *sb = mddev->sb; - struct real_dev *realdev; - kdev_t dev; - int i; - u32 sb_offset; + unsigned int mask; + mdk_rdev_t * rdev; + struct md_list_head *tmp; - sb->utime = CURRENT_TIME; - for (i = 0; i < mddev->nb_dev; i++) { - realdev = mddev->devices + i; - if (!realdev->sb) - continue; - dev = realdev->dev; - sb_offset = realdev->sb_offset; - set_blocksize(dev, MD_SB_BYTES); - printk("md: updating raid superblock on device %s, sb_offset == %u\n", kdevname(dev), sb_offset); - bh = getblk(dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (bh) { - sb = (md_superblock_t *) bh->b_data; - memcpy(sb, mddev->sb, MD_SB_BYTES); - memcpy(&sb->descriptor, sb->disks + realdev->sb->descriptor.number, MD_SB_DESCRIPTOR_WORDS * 4); - mark_buffer_uptodate(bh, 1); - mark_buffer_dirty(bh, 1); - ll_rw_block(WRITE, 1, &bh); - wait_on_buffer(bh); - bforget(bh); - fsync_dev(dev); - invalidate_buffers(dev); - } else - printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev)); + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); +printk("mask %08x\n", mask); + + ITERATE_RDEV(mddev,rdev,tmp) { +printk(" rdev->size: %d\n", rdev->size); + rdev->size &= mask; +printk(" masked rdev->size: %d\n", rdev->size); + md_size[mdidx(mddev)] += rdev->size; +printk(" new md_size: %d\n", md_size[mdidx(mddev)]); } return 0; } -static int do_md_run (int minor, int repart) +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) { - int pnum, i, min, factor, err; + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = 0; + disk->minor = 0; + mark_disk_removed(disk); +} - if (!md_dev[minor].nb_dev) - return -EINVAL; - - if (md_dev[minor].pers) - return -EBUSY; +#define BAD_MAGIC KERN_ERR \ +"md: invalid raid superblock magic on %s\n" - md_dev[minor].repartition=repart; - - if ((pnum=PERSONALITY(&md_dev[minor]) >> (PERSONALITY_SHIFT)) - >= MAX_PERSONALITY) - return -EINVAL; - - /* Only RAID-1 and RAID-5 can have MD devices as underlying devices */ - if (pnum != (RAID1 >> PERSONALITY_SHIFT) && pnum != (RAID5 >> PERSONALITY_SHIFT)){ - for (i = 0; i < md_dev [minor].nb_dev; i++) - if (MAJOR (md_dev [minor].devices [i].dev) == MD_MAJOR) - return -EINVAL; - } - if (!pers[pnum]) - { -#ifdef CONFIG_KMOD - char module_name[80]; - sprintf (module_name, "md-personality-%d", pnum); - request_module (module_name); - if (!pers[pnum]) -#endif - return -EINVAL; - } - - factor = min = 1 << FACTOR_SHIFT(FACTOR((md_dev+minor))); - - for (i=0; irun (minor, md_dev+minor))) - { - md_dev[minor].pers=NULL; - free_sb(md_dev + minor); - return (err); - } - - if (pnum != RAID0 >> PERSONALITY_SHIFT && pnum != LINEAR >> PERSONALITY_SHIFT) - { - md_dev[minor].sb->state &= ~(1 << MD_SB_CLEAN); - md_update_sb(minor); - } - - /* FIXME : We assume here we have blocks - that are twice as large as sectors. - THIS MAY NOT BE TRUE !!! */ - md_hd_struct[minor].start_sect=0; - md_hd_struct[minor].nr_sects=md_size[minor]<<1; - - read_ahead[MD_MAJOR] = 128; - return (0); -} +#define OUT_OF_MEM KERN_ALERT \ +"md: out of memory.\n" + +#define NO_SB KERN_ERR \ +"md: disabled device %s, could not read superblock.\n" -static int do_md_stop (int minor, struct inode *inode) +#define BAD_CSUM KERN_WARNING \ +"md: invalid superblock checksum on %s\n" + +static int alloc_array_sb (mddev_t * mddev) { - int i; - - if (inode->i_count>1 || md_dev[minor].busy>1) { - /* - * ioctl : one open channel - */ - printk ("STOP_MD md%x failed : i_count=%d, busy=%d\n", - minor, inode->i_count, md_dev[minor].busy); - return -EBUSY; - } - - if (md_dev[minor].pers) { - /* - * It is safe to call stop here, it only frees private - * data. Also, it tells us if a device is unstoppable - * (eg. resyncing is in progress) - */ - if (md_dev[minor].pers->stop (minor, md_dev+minor)) - return -EBUSY; - /* - * The device won't exist anymore -> flush it now - */ - fsync_dev (inode->i_rdev); - invalidate_buffers (inode->i_rdev); - if (md_dev[minor].sb) { - md_dev[minor].sb->state |= 1 << MD_SB_CLEAN; - md_update_sb(minor); - } + if (mddev->sb) { + MD_BUG(); + return 0; } - - /* Remove locks. */ - if (md_dev[minor].sb) - free_sb(md_dev + minor); - for (i=0; isb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) + return -ENOMEM; + md_clear_page((unsigned long)mddev->sb); + return 0; } -static int do_md_add (int minor, kdev_t dev) +static int alloc_disk_sb (mdk_rdev_t * rdev) { - int i; - int hot_add=0; - struct real_dev *realdev; + if (rdev->sb) + MD_BUG(); - if (md_dev[minor].nb_dev==MAX_REAL) + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + printk (OUT_OF_MEM); return -EINVAL; + } + md_clear_page((unsigned long)rdev->sb); - if (!fs_may_mount (dev)) - return -EBUSY; + return 0; +} - if (blk_size[MAJOR(dev)] == NULL || blk_size[MAJOR(dev)][MINOR(dev)] == 0) { - printk("md_add(): zero device size, huh, bailing out.\n"); - return -EINVAL; +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->faulty) + MD_BUG(); } +} - if (md_dev[minor].pers) { - /* - * The array is already running, hot-add the drive, or - * bail out: - */ - if (!md_dev[minor].pers->hot_add_disk) - return -EBUSY; - else - hot_add=1; +static void mark_rdev_faulty (mdk_rdev_t * rdev) +{ + unsigned long flags; + + if (!rdev) { + MD_BUG(); + return; } + save_flags(flags); + cli(); + free_disk_sb(rdev); + rdev->faulty = 1; + restore_flags(flags); +} + +static int read_disk_sb (mdk_rdev_t * rdev) +{ + int ret = -EINVAL; + struct buffer_head *bh = NULL; + kdev_t dev = rdev->dev; + mdp_super_t *sb; + u32 sb_offset; + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + /* - * Careful. We cannot increase nb_dev for a running array. + * Calculate the position of the superblock, + * it's at the end of the disk */ - i=md_dev[minor].nb_dev; - realdev = &md_dev[minor].devices[i]; - realdev->dev=dev; - - /* Lock the device by inserting a dummy inode. This doesn't - smell very good, but I need to be consistent with the - mount stuff, specially with fs_may_mount. If someone have - a better idea, please help ! */ - - realdev->inode=get_empty_inode (); - realdev->inode->i_dev=dev; /* don't care about other fields */ - insert_inode_hash (realdev->inode); - - /* Sizes are now rounded at run time */ - -/* md_dev[minor].devices[i].size=gen_real->sizes[MINOR(dev)]; HACKHACK*/ - - realdev->size=blk_size[MAJOR(dev)][MINOR(dev)]; + sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1); + rdev->sb_offset = sb_offset; + printk("(read) %s's sb offset: %d", partition_name(dev), + sb_offset); + fsync_dev(dev); + set_blocksize (dev, MD_SB_BYTES); + bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); - if (hot_add) { + if (bh) { + sb = (mdp_super_t *) bh->b_data; + memcpy (rdev->sb, sb, MD_SB_BYTES); + } else { + printk (NO_SB,partition_name(rdev->dev)); + goto abort; + } + printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + ret = 0; +abort: + if (bh) + brelse (bh); + return ret; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + printk (BAD_MAGIC, partition_name(rdev->dev)); + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + printk (BAD_MINOR, partition_name(rdev->dev), + sb->md_minor); + goto abort; + } + + if (calc_sb_csum(sb) != sb->sb_csum) + printk(BAD_CSUM, partition_name(rdev->dev)); + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = find_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + printk( KERN_WARNING +"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n" +" protection against single-disk failure might be compromised.\n", + mdidx(mddev), partition_name(rdev->dev), + partition_name(same_pdev->dev)); + + md_list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + printk("bind<%s,%d>\n", partition_name(rdev->dev), mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + md_list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + printk("unbind<%s,%d>\n", partition_name(rdev->dev), + rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + +/* + * prevent the device from being mounted, repartitioned or + * otherwise reused by a RAID array (or any other kernel + * subsystem), by opening the device. [simply getting an + * inode is not enough, the SCSI module usage code needs + * an explicit open() on the device] + */ +static int lock_rdev (mdk_rdev_t *rdev) +{ + int err = 0; + + /* + * First insert a dummy inode. + */ + if (rdev->inode) + MD_BUG(); + rdev->inode = get_empty_inode(); + /* + * we dont care about any other fields + */ + rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; + insert_inode_hash(rdev->inode); + + memset(&rdev->filp, 0, sizeof(rdev->filp)); + rdev->filp.f_mode = 3; /* read write */ + err = blkdev_open(rdev->inode, &rdev->filp); + if (err) { + printk("blkdev_open() failed: %d\n", err); + clear_inode(rdev->inode); + rdev->inode = NULL; + } + return err; +} + +static void unlock_rdev (mdk_rdev_t *rdev) +{ + blkdev_release(rdev->inode); + if (!rdev->inode) + MD_BUG(); + clear_inode(rdev->inode); + rdev->inode = NULL; +} + +static void export_rdev (mdk_rdev_t * rdev) +{ + printk("export_rdev(%s)\n",partition_name(rdev->dev)); + if (rdev->mddev) + MD_BUG(); + unlock_rdev(rdev); + free_disk_sb(rdev); + md_list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + printk("(%s was pending)\n",partition_name(rdev->dev)); + md_list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + unbind_rdev_from_array(rdev); + export_rdev(rdev); +} + +static void export_array (mddev_t *mddev) +{ + struct md_list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +#undef BAD_CSUM +#undef BAD_MAGIC +#undef OUT_OF_MEM +#undef NO_SB + +static void print_desc(mdp_disk_t *desc) +{ + printk(" DISK\n", desc->number, + partition_name(MKDEV(desc->major,desc->minor)), + desc->major,desc->minor,desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + printk(" D %2d: ", i); + print_desc(desc); + } + printk(" THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk(" rdev %s: O:%s, SZ:%08d F:%d DN:%d ", + partition_name(rdev->dev), partition_name(rdev->old_dev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void md_print_devices (void) +{ + struct md_list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk(" **********************************\n"); + printk(" * *\n"); + printk(" **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", partition_name(rdev->dev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rde