Hi, Following on from this message are the 16 patches of GFS2 which we are again posting for review. Since the last posting we have, I hope, addressed all the issues raised as well as fixing a number of bugs. In particular, we have now only one new exported symbol (see patch 16 of the series). It is now possible to mount the gfs2meta filesystem for a particular gfs2 filesystem at the same time as the gfs2 filesystem is mounted. In fact we enforce that rule at least for the time being due to dependencies in mounting the two filesystems. Christoph requested that we find a different way to deal with the problem of the VFS checking the file size in the direct i/o read path before handing control to GFS2 (in other words at a point when the size may change). We have done this with a two line patch to filemap.c (also in patch 16 of the series) so that its logic is now very similar to that followed in the direct i/o write path. This has eliminated the need for us to duplicate parts of the core VFS code inside GFS2. As usual the git tree is at: git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6.git It is based off a fairly recent pull from Linus' tree and includes the DLM which is not included in the patch set being posted here. Steve. [PATCH 01/16] GFS2: Core header files The header files for the core of GFS2. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/gfs2.h | 31 ++ fs/gfs2/incore.h | 659 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/gfs2_ondisk.h | 443 +++++++++++++++++++++++++++++ 3 files changed, 1133 insertions(+) --- /dev/null +++ b/fs/gfs2/gfs2.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_WAIT = 0, + WAIT = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/incore.h @@ -0,0 +1,659 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#define DIO_FORCE 0x00000001 +#define DIO_CLEAN 0x00000002 +#define DIO_DIRTY 0x00000004 +#define DIO_START 0x00000008 +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 +#define DIO_DATA 0x00000040 +#define DIO_RELEASE 0x00000080 +#define DIO_ALL 0x00000100 + +struct gfs2_log_operations; +struct gfs2_log_element; +struct gfs2_bitmap; +struct gfs2_rgrpd; +struct gfs2_bufdata; +struct gfs2_glock_operations; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_alloc; +struct gfs2_inode; +struct gfs2_file; +struct gfs2_revoke; +struct gfs2_revoke_replay; +struct gfs2_quota_data; +struct gfs2_log_buf; +struct gfs2_trans; +struct gfs2_ail; +struct gfs2_jdesc; +struct gfs2_args; +struct gfs2_tune; +struct gfs2_gl_hash_bucket; +struct gfs2_sbd; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +struct gfs2_log_element { + struct list_head le_list; + const struct gfs2_log_operations *le_ops; +}; + +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + uint32_t bi_offset; + uint32_t bi_start; + uint32_t bi_len; +}; + +struct gfs2_rgrpd { + struct list_head rd_list; /* Link with superblock */ + struct list_head rd_list_mru; + struct list_head rd_recent; /* Recently used rgrps */ + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + struct gfs2_rindex rd_ri; + struct gfs2_rgrp rd_rg; + uint64_t rd_rg_vn; + struct gfs2_bitmap *rd_bits; + unsigned int rd_bh_count; + struct mutex rd_mutex; + uint32_t rd_free_clone; + struct gfs2_log_element rd_le; + uint32_t rd_last_alloc_data; + uint32_t rd_last_alloc_meta; + struct gfs2_sbd *rd_sbd; +}; + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + + struct list_head bd_list_tr; + struct gfs2_log_element bd_le; + + struct gfs2_ail *bd_ail; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +struct gfs2_glock_operations { + void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state, + int flags); + void (*go_xmote_bh) (struct gfs2_glock * gl); + void (*go_drop_th) (struct gfs2_glock * gl); + void (*go_drop_bh) (struct gfs2_glock * gl); + void (*go_sync) (struct gfs2_glock * gl, int flags); + void (*go_inval) (struct gfs2_glock * gl, int flags); + int (*go_demote_ok) (struct gfs2_glock * gl); + int (*go_lock) (struct gfs2_holder * gh); + void (*go_unlock) (struct gfs2_holder * gh); + void (*go_callback) (struct gfs2_glock * gl, unsigned int state); + void (*go_greedy) (struct gfs2_glock * gl); + const int go_type; +}; + +enum { + /* Actions */ + HIF_MUTEX = 0, + HIF_PROMOTE = 1, + HIF_DEMOTE = 2, + HIF_GREEDY = 3, + + /* States */ + HIF_ALLOCED = 4, + HIF_DEALLOC = 5, + HIF_HOLDER = 6, + HIF_FIRST = 7, + HIF_ABORTED = 9, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct task_struct *gh_owner; + unsigned int gh_state; + unsigned gh_flags; + + int gh_error; + unsigned long gh_iflags; + struct completion gh_wait; + unsigned long gh_ip; +}; + +enum { + GLF_PLUG = 0, + GLF_LOCK = 1, + GLF_STICKY = 2, + GLF_PREFETCH = 3, + GLF_DIRTY = 5, + GLF_SKIP_WAITERS2 = 6, + GLF_GREEDY = 7, +}; + +struct gfs2_glock { + struct list_head gl_list; + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + struct kref gl_ref; + + spinlock_t gl_spin; + + unsigned int gl_state; + struct task_struct *gl_owner; + unsigned long gl_ip; + struct list_head gl_holders; + struct list_head gl_waiters1; /* HIF_MUTEX */ + struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ + struct list_head gl_waiters3; /* HIF_PROMOTE */ + + const struct gfs2_glock_operations *gl_ops; + + struct gfs2_holder *gl_req_gh; + gfs2_glop_bh_t gl_req_bh; + + lm_lock_t *gl_lock; + char *gl_lvb; + atomic_t gl_lvb_count; + + uint64_t gl_vn; + unsigned long gl_stamp; + void *gl_object; + + struct gfs2_gl_hash_bucket *gl_bucket; + struct list_head gl_reclaim; + + struct gfs2_sbd *gl_sbd; + + struct inode *gl_aspace; + struct gfs2_log_element gl_le; + struct list_head gl_ail_list; + atomic_t gl_ail_count; +}; + +struct gfs2_alloc { + /* Quota stuff */ + + struct gfs2_quota_data *al_qd[4]; + struct gfs2_holder al_qd_ghs[4]; + unsigned int al_qd_num; + + u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + u32 al_alloced; /* Filled in by gfs2_alloc_*() */ + + /* Filled in by gfs2_inplace_reserve() */ + + unsigned int al_line; + char *al_file; + struct gfs2_holder al_ri_gh; + struct gfs2_holder al_rgd_gh; + struct gfs2_rgrpd *al_rgd; + +}; + +enum { + GIF_QD_LOCKED = 1, + GIF_PAGED = 2, + GIF_SW_PAGED = 3, +}; + +struct gfs2_inode { + struct inode i_inode; + struct gfs2_inum i_num; + + unsigned long i_flags; /* GIF_... */ + + uint64_t i_vn; + struct gfs2_dinode i_di; /* To be replaced by ref to block */ + + struct gfs2_glock *i_gl; /* Move into i_gh? */ + struct gfs2_holder i_iopen_gh; + struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_alloc i_alloc; + uint64_t i_last_rg_alloc; + + spinlock_t i_spin; + struct rw_semaphore i_rw_mutex; + unsigned int i_greedy; + unsigned long i_last_pfault; + + struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. + */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +/* To be removed? */ +static inline struct gfs2_sbd *GFS2_SB(struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +enum { + GFF_DID_DIRECT_ALLOC = 0, + GFF_EXLOCK = 1, +}; + +struct gfs2_file { + unsigned long f_flags; /* GFF_... */ + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke { + struct gfs2_log_element rv_le; + uint64_t rv_blkno; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + uint64_t rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_USER = 0, + QDF_CHANGE = 1, + QDF_LOCKED = 2, +}; + +struct gfs2_quota_lvb { + uint32_t qb_magic; + uint32_t __pad; + uint64_t qb_limit; /* Hard limit of # blocks to alloc */ + uint64_t qb_warn; /* Warn user when alloc is above this # */ + int64_t qb_value; /* Current # blocks allocated */ +}; + +struct gfs2_quota_data { + struct list_head qd_list; + unsigned int qd_count; + + uint32_t qd_id; + unsigned long qd_flags; /* QDF_... */ + + int64_t qd_change; + int64_t qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_count; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + uint64_t qd_sync_gen; + unsigned long qd_last_warn; + unsigned long qd_last_touched; +}; + +struct gfs2_log_buf { + struct list_head lb_list; + struct buffer_head *lb_bh; + struct buffer_head *lb_real; +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + + struct gfs2_holder tr_t_gh; + + int tr_touched; + + unsigned int tr_num_buf; + unsigned int tr_num_buf_new; + unsigned int tr_num_buf_rm; + struct list_head tr_list_buf; + + unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; +}; + +struct gfs2_ail { + struct list_head ai_list; + + unsigned int ai_first; + struct list_head ai_ail1_list; + struct list_head ai_ail2_list; + + uint64_t ai_sync_gen; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + + struct inode *jd_inode; + unsigned int jd_jid; + int jd_dirty; + + unsigned int jd_blocks; +}; + +#define GFS2_GLOCKD_DEFAULT 1 +#define GFS2_GLOCKD_MAX 16 + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + int ar_spectator; /* Don't get a journal because we're always RO */ + int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ + int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ + int ar_localcaching; /* Local-style caching (dangerous on multihost) */ + int ar_debug; /* Oops on errors instead of trying to be graceful */ + int ar_upgrade; /* Upgrade ondisk/multihost format */ + unsigned int ar_num_glockd; /* Number of glockd threads */ + int ar_posix_acl; /* Enable posix acls */ + int ar_quota; /* off/account/on */ + int ar_suiddir; /* suiddir support */ + int ar_data; /* ordered/writeback */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_ilimit; + unsigned int gt_ilimit_tries; + unsigned int gt_ilimit_min; + unsigned int gt_demote_secs; /* Cache retention for unheld glock */ + unsigned int gt_incore_log_blocks; + unsigned int gt_log_flush_secs; + unsigned int gt_jindex_refresh_secs; /* Check for new journal index */ + + unsigned int gt_scand_secs; + unsigned int gt_recoverd_secs; + unsigned int gt_logd_secs; + unsigned int gt_quotad_secs; + + unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_cache_secs; + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_atime_quantum; /* Min secs between atime updates */ + unsigned int gt_new_files_jdata; + unsigned int gt_new_files_directio; + unsigned int gt_max_atomic_write; /* Split big writes into this size */ + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_lockdump_size; + unsigned int gt_stall_secs; /* Detects trouble! */ + unsigned int gt_complain_secs; + unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ + unsigned int gt_entries_per_readdir; + unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */ + unsigned int gt_greedy_default; + unsigned int gt_greedy_quantum; + unsigned int gt_greedy_max; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +struct gfs2_gl_hash_bucket { + rwlock_t hb_lock; + struct list_head hb_list; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_SHUTDOWN = 2, + SDF_NOATIME = 3, +}; + +#define GFS2_GL_HASH_SHIFT 13 +#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) +#define GFS2_FSNAME_LEN 256 + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct super_block *sd_vfs_meta; + struct kobject sd_kobj; + unsigned long sd_flags; /* SDF_... */ + struct gfs2_sb sd_sb; + + /* Constants computed on mount */ + + uint32_t sd_fsb2bb; + uint32_t sd_fsb2bb_shift; + uint32_t sd_diptrs; /* Number of pointers in a dinode */ + uint32_t sd_inptrs; /* Number of pointers in a indirect block */ + uint32_t sd_jbsize; /* Size of a journaled data block */ + uint32_t sd_hash_bsize; /* sizeof(exhash block) */ + uint32_t sd_hash_bsize_shift; + uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */ + uint32_t sd_qc_per_block; + uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */ + uint32_t sd_max_height; /* Max height of a file's metadata tree */ + uint64_t sd_heightsize[GFS2_MAX_META_HEIGHT]; + uint32_t sd_max_jheight; /* Max height of journaled file's meta tree */ + uint64_t sd_jheightsize[GFS2_MAX_META_HEIGHT]; + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct gfs2_gl_hash_bucket sd_gl_hash[GFS2_GL_HASH_SIZE]; + struct list_head sd_reclaim_list; + spinlock_t sd_reclaim_lock; + wait_queue_head_t sd_reclaim_wq; + atomic_t sd_reclaim_count; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_trans_gl; + + /* Inode Stuff */ + + struct inode *sd_master_dir; + struct inode *sd_jindex; + struct inode *sd_inum_inode; + struct inode *sd_statfs_inode; + struct inode *sd_ir_inode; + struct inode *sd_sc_inode; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* Inum stuff */ + + struct mutex sd_inum_mutex; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct mutex sd_statfs_mutex; + struct gfs2_statfs_change sd_statfs_master; + struct gfs2_statfs_change sd_statfs_local; + unsigned long sd_statfs_sync_time; + + /* Resource group stuff */ + + uint64_t sd_rindex_vn; + spinlock_t sd_rindex_spin; + struct mutex sd_rindex_mutex; + struct list_head sd_rindex_list; + struct list_head sd_rindex_mru_list; + struct list_head sd_rindex_recent_list; + struct gfs2_rgrpd *sd_rindex_forward; + unsigned int sd_rgrps; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + unsigned long sd_jindex_refresh_time; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + + struct gfs2_holder sd_ir_gh; + struct gfs2_holder sd_sc_gh; + struct gfs2_holder sd_qc_gh; + + /* Daemon stuff */ + + struct task_struct *sd_scand_process; + struct task_struct *sd_recoverd_process; + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; + unsigned int sd_glockd_num; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + spinlock_t sd_quota_spin; + struct mutex sd_quota_mutex; + + unsigned int sd_quota_slots; + unsigned int sd_quota_chunks; + unsigned char **sd_quota_bitmap; + + uint64_t sd_quota_sync_gen; + unsigned long sd_quota_sync_time; + + /* Log stuff */ + + spinlock_t sd_log_lock; + + unsigned int sd_log_blks_reserved; + unsigned int sd_log_commited_buf; + unsigned int sd_log_commited_revoke; + + unsigned int sd_log_num_gl; + unsigned int sd_log_num_buf; + unsigned int sd_log_num_revoke; + unsigned int sd_log_num_rg; + unsigned int sd_log_num_databuf; + unsigned int sd_log_num_jdata; + unsigned int sd_log_num_hdrs; + + struct list_head sd_log_le_gl; + struct list_head sd_log_le_buf; + struct list_head sd_log_le_revoke; + struct list_head sd_log_le_rg; + struct list_head sd_log_le_databuf; + + unsigned int sd_log_blks_free; + struct mutex sd_log_reserve_mutex; + + uint64_t sd_log_sequence; + unsigned int sd_log_head; + unsigned int sd_log_tail; + int sd_log_idle; + + unsigned long sd_log_flush_time; + struct rw_semaphore sd_log_flush_lock; + struct list_head sd_log_flush_list; + + unsigned int sd_log_flush_head; + uint64_t sd_log_flush_wrapped; + + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + uint64_t sd_ail_sync_gen; + + /* Replay stuff */ + + struct list_head sd_revoke_list; + unsigned int sd_replay_tail; + + unsigned int sd_found_blocks; + unsigned int sd_found_revokes; + unsigned int sd_replayed_blocks; + + /* For quiescing the filesystem */ + + struct gfs2_holder sd_freeze_gh; + struct mutex sd_freeze_lock; + unsigned int sd_freeze_count; + + /* Counters */ + + atomic_t sd_glock_count; + atomic_t sd_glock_held_count; + atomic_t sd_inode_count; + atomic_t sd_reclaimed; + + char sd_fsname[GFS2_FSNAME_LEN]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct vfsmount *sd_gfs2mnt; +}; + +#endif /* __INCORE_DOT_H__ */ + --- /dev/null +++ b/include/linux/gfs2_ondisk.h @@ -0,0 +1,443 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __GFS2_ONDISK_DOT_H__ +#define __GFS2_ONDISK_DOT_H__ + +#define GFS2_MAGIC 0x01161970 +#define GFS2_BASIC_BLOCK 512 +#define GFS2_BASIC_BLOCK_SHIFT 9 + +/* Lock numbers of the LM_TYPE_NONDISK type */ + +#define GFS2_MOUNT_LOCK 0 +#define GFS2_LIVE_LOCK 1 +#define GFS2_TRANS_LOCK 2 +#define GFS2_RENAME_LOCK 3 + +/* Format numbers for various metadata types */ + +#define GFS2_FORMAT_NONE 0 +#define GFS2_FORMAT_SB 100 +#define GFS2_FORMAT_RG 200 +#define GFS2_FORMAT_RB 300 +#define GFS2_FORMAT_DI 400 +#define GFS2_FORMAT_IN 500 +#define GFS2_FORMAT_LF 600 +#define GFS2_FORMAT_JD 700 +#define GFS2_FORMAT_LH 800 +#define GFS2_FORMAT_LD 900 +#define GFS2_FORMAT_LB 1000 +#define GFS2_FORMAT_EA 1600 +#define GFS2_FORMAT_ED 1700 +#define GFS2_FORMAT_QC 1400 +/* These are format numbers for entities contained in files */ +#define GFS2_FORMAT_RI 1100 +#define GFS2_FORMAT_DE 1200 +#define GFS2_FORMAT_QU 1500 +/* These are part of the superblock */ +#define GFS2_FORMAT_FS 1801 +#define GFS2_FORMAT_MULTI 1900 + +/* + * An on-disk inode number + */ + +struct gfs2_inum { + __be64 no_formal_ino; + __be64 no_addr; +}; + +static inline int gfs2_inum_equal(const struct gfs2_inum *ino1, + const struct gfs2_inum *ino2) +{ + return ino1->no_formal_ino == ino2->no_formal_ino && + ino1->no_addr == ino2->no_addr; +} + +/* + * Generic metadata head structure + * Every inplace buffer logged in the journal must start with this. + */ + +#define GFS2_METATYPE_NONE 0 +#define GFS2_METATYPE_SB 1 +#define GFS2_METATYPE_RG 2 +#define GFS2_METATYPE_RB 3 +#define GFS2_METATYPE_DI 4 +#define GFS2_METATYPE_IN 5 +#define GFS2_METATYPE_LF 6 +#define GFS2_METATYPE_JD 7 +#define GFS2_METATYPE_LH 8 +#define GFS2_METATYPE_LD 9 +#define GFS2_METATYPE_LB 12 +#define GFS2_METATYPE_EA 10 +#define GFS2_METATYPE_ED 11 +#define GFS2_METATYPE_QC 14 + +struct gfs2_meta_header { + __be32 mh_magic; + __be32 mh_type; + __be64 __pad0; /* Was generation number in gfs1 */ + __be32 mh_format; + __be32 __pad1; /* Was incarnation number in gfs1 */ +}; + +/* + * super-block structure + * + * It's probably good if SIZEOF_SB <= GFS2_BASIC_BLOCK (512 bytes) + * + * Order is important, need to be able to read old superblocks to do on-disk + * version upgrades. + */ + +/* Address of superblock in GFS2 basic blocks */ +#define GFS2_SB_ADDR 128 + +/* The lock number for the superblock (must be zero) */ +#define GFS2_SB_LOCK 0 + +/* Requirement: GFS2_LOCKNAME_LEN % 8 == 0 + Includes: the fencing zero at the end */ +#define GFS2_LOCKNAME_LEN 64 + +struct gfs2_sb { + struct gfs2_meta_header sb_header; + + __be32 sb_fs_format; + __be32 sb_multihost_format; + __u32 __pad0; /* Was superblock flags in gfs1 */ + + __be32 sb_bsize; + __be32 sb_bsize_shift; + __u32 __pad1; /* Was journal segment size in gfs1 */ + + struct gfs2_inum sb_master_dir; /* Was jindex dinode in gfs1 */ + struct gfs2_inum __pad2; /* Was rindex dinode in gfs1 */ + struct gfs2_inum sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; + /* In gfs1, quota and license dinodes followed */ +}; + +/* + * resource index structure + */ + +struct gfs2_rindex { + __be64 ri_addr; /* grp block disk address */ + __be32 ri_length; /* length of rgrp header in fs blocks */ + __u32 __pad; + + __be64 ri_data0; /* first data location */ + __be32 ri_data; /* num of data blocks in rgrp */ + + __be32 ri_bitbytes; /* number of bytes in data bitmaps */ + + __u8 ri_reserved[64]; +}; + +/* + * resource group header structure + */ + +/* Number of blocks per byte in rgrp */ +#define GFS2_NBBY 4 +#define GFS2_BIT_SIZE 2 +#define GFS2_BIT_MASK 0x00000003 + +#define GFS2_BLKST_FREE 0 +#define GFS2_BLKST_USED 1 +#define GFS2_BLKST_UNLINKED 2 +#define GFS2_BLKST_DINODE 3 + +#define GFS2_RGF_JOURNAL 0x00000001 +#define GFS2_RGF_METAONLY 0x00000002 +#define GFS2_RGF_DATAONLY 0x00000004 +#define GFS2_RGF_NOALLOC 0x00000008 + +struct gfs2_rgrp { + struct gfs2_meta_header rg_header; + + __be32 rg_flags; + __be32 rg_free; + __be32 rg_dinodes; + __be32 __pad; + __be64 rg_igeneration; + + __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */ +}; + +/* + * quota structure + */ + +struct gfs2_quota { + __be64 qu_limit; + __be64 qu_warn; + __be64 qu_value; + __u8 qu_reserved[64]; +}; + +/* + * dinode structure + */ + +#define GFS2_MAX_META_HEIGHT 10 +#define GFS2_DIR_MAX_DEPTH 17 + +#define DT2IF(dt) (((dt) << 12) & S_IFMT) +#define IF2DT(sif) (((sif) & S_IFMT) >> 12) + +enum { + gfs2fl_Jdata = 0, + gfs2fl_ExHash = 1, + gfs2fl_Unused = 2, + gfs2fl_EaIndirect = 3, + gfs2fl_Directio = 4, + gfs2fl_Immutable = 5, + gfs2fl_AppendOnly = 6, + gfs2fl_NoAtime = 7, + gfs2fl_Sync = 8, + gfs2fl_System = 9, + gfs2fl_TruncInProg = 29, + gfs2fl_InheritDirectio = 30, + gfs2fl_InheritJdata = 31, +}; + +/* Dinode flags */ +#define GFS2_DIF_JDATA 0x00000001 +#define GFS2_DIF_EXHASH 0x00000002 +#define GFS2_DIF_UNUSED 0x00000004 /* only in gfs1 */ +#define GFS2_DIF_EA_INDIRECT 0x00000008 +#define GFS2_DIF_DIRECTIO 0x00000010 +#define GFS2_DIF_IMMUTABLE 0x00000020 +#define GFS2_DIF_APPENDONLY 0x00000040 +#define GFS2_DIF_NOATIME 0x00000080 +#define GFS2_DIF_SYNC 0x00000100 +#define GFS2_DIF_SYSTEM 0x00000200 /* New in gfs2 */ +#define GFS2_DIF_TRUNC_IN_PROG 0x20000000 /* New in gfs2 */ +#define GFS2_DIF_INHERIT_DIRECTIO 0x40000000 +#define GFS2_DIF_INHERIT_JDATA 0x80000000 + +struct gfs2_dinode { + struct gfs2_meta_header di_header; + + struct gfs2_inum di_num; + + __be32 di_mode; /* mode of file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to this file */ + __be64 di_size; /* number of bytes in file */ + __be64 di_blocks; /* number of blocks in file */ + __be64 di_atime; /* time last accessed */ + __be64 di_mtime; /* time last modified */ + __be64 di_ctime; /* time last changed */ + __be32 di_major; /* device major number */ + __be32 di_minor; /* device minor number */ + + /* This section varies from gfs1. Padding added to align with + * remainder of dinode + */ + __be64 di_goal_meta; /* rgrp to alloc from next */ + __be64 di_goal_data; /* data block goal */ + __be64 di_generation; /* generation number for NFS */ + + __be32 di_flags; /* GFS2_DIF_... */ + __be32 di_payload_format; /* GFS2_FORMAT_... */ + __u16 __pad1; /* Was ditype in gfs1 */ + __be16 di_height; /* height of metadata */ + __u32 __pad2; /* Unused incarnation number from gfs1 */ + + /* These only apply to directories */ + __u16 __pad3; /* Padding */ + __be16 di_depth; /* Number of bits in the table */ + __be32 di_entries; /* The number of entries in the directory */ + + struct gfs2_inum __pad4; /* Unused even in current gfs1 */ + + __be64 di_eattr; /* extended attribute block number */ + + __u8 di_reserved[56]; +}; + +/* + * directory structure - many of these per directory file + */ + +#define GFS2_FNAMESIZE 255 +#define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7) + +struct gfs2_dirent { + struct gfs2_inum de_inum; + __be32 de_hash; + __be16 de_rec_len; + __be16 de_name_len; + __be16 de_type; + __u8 __pad[14]; +}; + +/* + * Header of leaf directory nodes + */ + +struct gfs2_leaf { + struct gfs2_meta_header lf_header; + + __be16 lf_depth; /* Depth of leaf */ + __be16 lf_entries; /* Number of dirents in leaf */ + __be32 lf_dirent_format; /* Format of the dirents */ + __be64 lf_next; /* Next leaf, if overflow */ + + __u8 lf_reserved[64]; +}; + +/* + * Extended attribute header format + */ + +#define GFS2_EA_MAX_NAME_LEN 255 +#define GFS2_EA_MAX_DATA_LEN 65536 + +#define GFS2_EATYPE_UNUSED 0 +#define GFS2_EATYPE_USR 1 +#define GFS2_EATYPE_SYS 2 +#define GFS2_EATYPE_SECURITY 3 + +#define GFS2_EATYPE_LAST 3 +#define GFS2_EATYPE_VALID(x) ((x) <= GFS2_EATYPE_LAST) + +#define GFS2_EAFLAG_LAST 0x01 /* last ea in block */ + +struct gfs2_ea_header { + __be32 ea_rec_len; + __be32 ea_data_len; + __u8 ea_name_len; /* no NULL pointer after the string */ + __u8 ea_type; /* GFS2_EATYPE_... */ + __u8 ea_flags; /* GFS2_EAFLAG_... */ + __u8 ea_num_ptrs; + __u32 __pad; +}; + +/* + * Log header structure + */ + +#define GFS2_LOG_HEAD_UNMOUNT 0x00000001 /* log is clean */ + +struct gfs2_log_header { + struct gfs2_meta_header lh_header; + + __be64 lh_sequence; /* Sequence number of this transaction */ + __be32 lh_flags; /* GFS2_LOG_HEAD_... */ + __be32 lh_tail; /* Block number of log tail */ + __be32 lh_blkno; + __be32 lh_hash; +}; + +/* + * Log type descriptor + */ + +#define GFS2_LOG_DESC_METADATA 300 +/* ld_data1 is the number of metadata blocks in the descriptor. + ld_data2 is unused. */ + +#define GFS2_LOG_DESC_REVOKE 301 +/* ld_data1 is the number of revoke blocks in the descriptor. + ld_data2 is unused. */ + +#define GFS2_LOG_DESC_JDATA 302 +/* ld_data1 is the number of data blocks in the descriptor. + ld_data2 is unused. */ + +struct gfs2_log_descriptor { + struct gfs2_meta_header ld_header; + + __be32 ld_type; /* GFS2_LOG_DESC_... */ + __be32 ld_length; /* Number of buffers in this chunk */ + __be32 ld_data1; /* descriptor-specific field */ + __be32 ld_data2; /* descriptor-specific field */ + + __u8 ld_reserved[32]; +}; + +/* + * Inum Range + * Describe a range of formal inode numbers allocated to + * one machine to assign to inodes. + */ + +#define GFS2_INUM_QUANTUM 1048576 + +struct gfs2_inum_range { + __be64 ir_start; + __be64 ir_length; +}; + +/* + * Statfs change + * Describes an change to the pool of free and allocated + * blocks. + */ + +struct gfs2_statfs_change { + __be64 sc_total; + __be64 sc_free; + __be64 sc_dinodes; +}; + +/* + * Quota change + * Describes an allocation change for a particular + * user or group. + */ + +#define GFS2_QCF_USER 0x00000001 + +struct gfs2_quota_change { + __be64 qc_change; + __be32 qc_flags; /* GFS2_QCF_... */ + __be32 qc_id; +}; + +#ifdef __KERNEL__ +/* Translation functions */ + +extern void gfs2_inum_in(struct gfs2_inum *no, char *buf); +extern void gfs2_inum_out(const struct gfs2_inum *no, char *buf); +extern void gfs2_sb_in(struct gfs2_sb *sb, char *buf); +extern void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf); +extern void gfs2_rindex_out(struct gfs2_rindex *ri, char *buf); +extern void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf); +extern void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf); +extern void gfs2_quota_in(struct gfs2_quota *qu, char *buf); +extern void gfs2_quota_out(struct gfs2_quota *qu, char *buf); +extern void gfs2_dinode_in(struct gfs2_dinode *di, char *buf); +extern void gfs2_dinode_out(struct gfs2_dinode *di, char *buf); +extern void gfs2_ea_header_in(struct gfs2_ea_header *ea, char *buf); +extern void gfs2_ea_header_out(struct gfs2_ea_header *ea, char *buf); +extern void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf); +extern void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf); +extern void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf); +extern void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf); +extern void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf); + +/* Printing functions */ + +extern void gfs2_rindex_print(struct gfs2_rindex *ri); +extern void gfs2_dinode_print(struct gfs2_dinode *di); + +#endif /* __KERNEL__ */ + +#endif /* __GFS2_ONDISK_DOT_H__ */ [PATCH 02/16] GFS2: Core locking interface The core GFS2 locking code, including the interface to the pluggable locking modules. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/glock.c | 2254 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/glock.h | 151 +++ fs/gfs2/lm.c | 244 +++++ fs/gfs2/lm.h | 41 fs/gfs2/lm_interface.h | 290 ++++++ fs/gfs2/locking.c | 191 ++++ 6 files changed, 3171 insertions(+) --- /dev/null +++ b/fs/gfs2/lm.c @@ -0,0 +1,244 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "lm.h" +#include "super.h" +#include "util.h" +#include "lvb.h" + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguements + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + char *proto = sdp->sd_proto_name; + char *table = sdp->sd_table_name; + int flags = 0; + int error; + + if (sdp->sd_args.ar_spectator) + flags |= LM_MFLAG_SPECTATOR; + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, + gfs2_glock_cb, sdp, + GFS2_MIN_LVB_SIZE, flags, + &sdp->sd_lockstruct, &sdp->sd_kobj); + if (error) { + fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", + proto, table, sdp->sd_args.ar_hostdata); + goto out; + } + + if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= + GFS2_MIN_LVB_SIZE)) { + gfs2_unmount_lockproto(&sdp->sd_lockstruct); + goto out; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, + sdp->sd_lockstruct.ls_jid); + + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + + if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && + !sdp->sd_args.ar_ignore_local_fs) { + sdp->sd_args.ar_localflocks = 1; + sdp->sd_args.ar_localcaching = 1; + } + + out: + return error; +} + +void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_others_may_mount( + sdp->sd_lockstruct.ls_lockspace); +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + gfs2_unmount_lockproto(&sdp->sd_lockstruct); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +{ + va_list args; + + if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + fs_err(sdp, "about to withdraw from the cluster\n"); + BUG_ON(sdp->sd_args.ar_debug); + + + fs_err(sdp, "waiting for outstanding I/O\n"); + + /* FIXME: suspend dm device so oustanding bio's complete + and all further io requests fail */ + + fs_err(sdp, "telling LM to withdraw\n"); + gfs2_withdraw_lockproto(&sdp->sd_lockstruct); + fs_err(sdp, "withdrawn\n"); + dump_stack(); + + return -1; +} + +int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, + lm_lock_t **lockp) +{ + int error; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + else + error = sdp->sd_lockstruct.ls_ops->lm_get_lock( + sdp->sd_lockstruct.ls_lockspace, name, lockp); + return error; +} + +void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_put_lock(lock); +} + +unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags) +{ + int ret; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = 0; + else + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, + cur_state, + req_state, flags); + return ret; +} + +unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock, + unsigned int cur_state) +{ + int ret; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = 0; + else + ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); + return ret; +} + +void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_cancel(lock); +} + +int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp) +{ + int error; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + else + error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); + return error; +} + +void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb); +} + +#if 0 +void gfs2_lm_sync_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_sync_lvb(lock, lvb); +} +#endif /* 0 */ + +int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + else + error = sdp->sd_lockstruct.ls_ops->lm_plock_get( + sdp->sd_lockstruct.ls_lockspace, + name, file, fl); + return error; +} + +int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + int error; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + else + error = sdp->sd_lockstruct.ls_ops->lm_plock( + sdp->sd_lockstruct.ls_lockspace, + name, file, cmd, fl); + return error; +} + +int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + else + error = sdp->sd_lockstruct.ls_ops->lm_punlock( + sdp->sd_lockstruct.ls_lockspace, + name, file, fl); + return error; +} + +void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_recovery_done( + sdp->sd_lockstruct.ls_lockspace, jid, message); +} + --- /dev/null +++ b/fs/gfs2/lm.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LM_DOT_H__ +#define __LM_DOT_H__ + +int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent); +void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp); +void gfs2_lm_unmount(struct gfs2_sbd *sdp); +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +__attribute__ ((format(printf, 2, 3))); +int gfs2_lm_get_lock(struct gfs2_sbd *sdp, + struct lm_lockname *name, lm_lock_t **lockp); +void gfs2_lm_put_lock(struct gfs2_sbd *sdp, lm_lock_t *lock); +unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, lm_lock_t *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags); +unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, lm_lock_t *lock, + unsigned int cur_state); +void gfs2_lm_cancel(struct gfs2_sbd *sdp, lm_lock_t *lock); +int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char **lvbp); +void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, lm_lock_t *lock, char *lvb); +int gfs2_lm_plock_get(struct gfs2_sbd *sdp, + struct lm_lockname *name, + struct file *file, struct file_lock *fl); +int gfs2_lm_plock(struct gfs2_sbd *sdp, + struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl); +int gfs2_lm_punlock(struct gfs2_sbd *sdp, + struct lm_lockname *name, + struct file *file, struct file_lock *fl); +void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, + unsigned int jid, unsigned int message); + +#endif /* __LM_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/locking.c @@ -0,0 +1,191 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lm_interface.h" + +struct lmh_wrapper { + struct list_head lw_list; + struct lm_lockops *lw_ops; +}; + +/* List of registered low-level locking protocols. A file system selects one + of them by name at mount time, e.g. lock_nolock, lock_dlm. */ + +static struct list_head lmh_list; +static struct mutex lmh_lock; + +/** + * gfs2_register_lockproto - Register a low-level locking protocol + * @proto: the protocol definition + * + * Returns: 0 on success, -EXXX on failure + */ + +int gfs2_register_lockproto(struct lm_lockops *proto) +{ + struct lmh_wrapper *lw; + + mutex_lock(&lmh_lock); + + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { + mutex_unlock(&lmh_lock); + printk(KERN_INFO "GFS2: protocol %s already exists\n", + proto->lm_proto_name); + return -EEXIST; + } + } + + lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL); + if (!lw) { + mutex_unlock(&lmh_lock); + return -ENOMEM; + } + + lw->lw_ops = proto; + list_add(&lw->lw_list, &lmh_list); + + mutex_unlock(&lmh_lock); + + return 0; +} + +/** + * gfs2_unregister_lockproto - Unregister a low-level locking protocol + * @proto: the protocol definition + * + */ + +void gfs2_unregister_lockproto(struct lm_lockops *proto) +{ + struct lmh_wrapper *lw; + + mutex_lock(&lmh_lock); + + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { + list_del(&lw->lw_list); + mutex_unlock(&lmh_lock); + kfree(lw); + return; + } + } + + mutex_unlock(&lmh_lock); + + printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n", + proto->lm_proto_name); +} + +/** + * gfs2_mount_lockproto - Mount a lock protocol + * @proto_name - the name of the protocol + * @table_name - the name of the lock space + * @host_data - data specific to this host + * @cb - the callback to the code using the lock module + * @fsdata - data to pass back with the callback + * @min_lvb_size - the mininum LVB size that the caller can deal with + * @flags - LM_MFLAG_* + * @lockstruct - a structure returned describing the mount + * + * Returns: 0 on success, -EXXX on failure + */ + +int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + struct lmh_wrapper *lw = NULL; + int try = 0; + int error, found; + +retry: + mutex_lock(&lmh_lock); + + found = 0; + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { + found = 1; + break; + } + } + + if (!found) { + if (!try && capable(CAP_SYS_MODULE)) { + try = 1; + mutex_unlock(&lmh_lock); + request_module(proto_name); + goto retry; + } + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name); + error = -ENOENT; + goto out; + } + + if (!try_module_get(lw->lw_ops->lm_owner)) { + try = 0; + mutex_unlock(&lmh_lock); + msleep(1000); + goto retry; + } + + error = lw->lw_ops->lm_mount(table_name, host_data, cb, fsdata, + min_lvb_size, flags, lockstruct, fskobj); + if (error) + module_put(lw->lw_ops->lm_owner); +out: + mutex_unlock(&lmh_lock); + return error; +} + +void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) +{ + mutex_lock(&lmh_lock); + lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_owner) + module_put(lockstruct->ls_ops->lm_owner); + mutex_unlock(&lmh_lock); +} + +/** + * gfs2_withdraw_lockproto - abnormally unmount a lock module + * @lockstruct: the lockstruct passed into mount + * + */ + +void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct) +{ + mutex_lock(&lmh_lock); + lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_owner) + module_put(lockstruct->ls_ops->lm_owner); + mutex_unlock(&lmh_lock); +} + +void __init gfs2_init_lmh(void) +{ + mutex_init(&lmh_lock); + INIT_LIST_HEAD(&lmh_list); +} + +EXPORT_SYMBOL_GPL(gfs2_register_lockproto); +EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto); + --- /dev/null +++ b/fs/gfs2/lm_interface.h @@ -0,0 +1,290 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LM_INTERFACE_DOT_H__ +#define __LM_INTERFACE_DOT_H__ + +/* + * Opaque handles represent the lock module's lockspace structure, the lock + * module's lock structures, and GFS's file system (superblock) structure. + */ + +typedef void lm_lockspace_t; +typedef void lm_lock_t; +typedef void lm_fsdata_t; + +typedef void (*lm_callback_t) (lm_fsdata_t *fsdata, unsigned int type, + void *data); + +/* + * lm_mount() flags + * + * LM_MFLAG_SPECTATOR + * GFS is asking to join the filesystem's lockspace, but it doesn't want to + * modify the filesystem. The lock module shouldn't assign a journal to the FS + * mount. It shouldn't send recovery callbacks to the FS mount. If the node + * dies or withdraws, all locks can be wiped immediately. + */ + +#define LM_MFLAG_SPECTATOR 0x00000001 + +/* + * lm_lockstruct flags + * + * LM_LSFLAG_LOCAL + * The lock_nolock module returns LM_LSFLAG_LOCAL to GFS, indicating that GFS + * can make single-node optimizations. + */ + +#define LM_LSFLAG_LOCAL 0x00000001 + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. + * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_PRIORITY + * Override fairness considerations. Suppose a lock is held in a shared state + * and there is a pending request for the deferred state. A shared lock + * request with the priority flag would be allowed to bypass the deferred + * request and directly join the other shared lock. A shared lock request + * without the priority flag might be forced to wait until the deferred + * requested had acquired and released the lock. + */ + +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 + +/* + * lm_lock() and lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CACHEABLE + * The lock hasn't been released so GFS can continue to cache data for it. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + * LM_OUT_ASYNC + * The result of the request will be returned in an LM_CB_ASYNC callback. + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CACHEABLE 0x00000004 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ASYNC 0x00000080 +#define LM_OUT_ERROR 0x00000100 + +/* + * lm_callback_t types + * + * LM_CB_NEED_E LM_CB_NEED_D LM_CB_NEED_S + * Blocking callback, a remote node is requesting the given lock in + * EXCLUSIVE, DEFERRED, or SHARED. + * + * LM_CB_NEED_RECOVERY + * The given journal needs to be recovered. + * + * LM_CB_DROPLOCKS + * Reduce the number of cached locks. + * + * LM_CB_ASYNC + * The given lock has been granted. + */ + +#define LM_CB_NEED_E 257 +#define LM_CB_NEED_D 258 +#define LM_CB_NEED_S 259 +#define LM_CB_NEED_RECOVERY 260 +#define LM_CB_DROPLOCKS 261 +#define LM_CB_ASYNC 262 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + + +struct lm_lockname { + uint64_t ln_number; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type)) \ + +struct lm_async_cb { + struct lm_lockname lc_name; + int lc_ret; +}; + +struct lm_lockstruct; + +struct lm_lockops { + char lm_proto_name[256]; + + /* + * Mount/Unmount + */ + + int (*lm_mount) (char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj); + + void (*lm_others_may_mount) (lm_lockspace_t *lockspace); + + void (*lm_unmount) (lm_lockspace_t *lockspace); + + void (*lm_withdraw) (lm_lockspace_t *lockspace); + + /* + * Lock oriented operations + */ + + int (*lm_get_lock) (lm_lockspace_t *lockspace, + struct lm_lockname *name, lm_lock_t **lockp); + + void (*lm_put_lock) (lm_lock_t *lock); + + unsigned int (*lm_lock) (lm_lock_t *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags); + + unsigned int (*lm_unlock) (lm_lock_t *lock, unsigned int cur_state); + + void (*lm_cancel) (lm_lock_t *lock); + + int (*lm_hold_lvb) (lm_lock_t *lock, char **lvbp); + void (*lm_unhold_lvb) (lm_lock_t *lock, char *lvb); + void (*lm_sync_lvb) (lm_lock_t *lock, char *lvb); + + /* + * Posix Lock oriented operations + */ + + int (*lm_plock_get) (lm_lockspace_t *lockspace, + struct lm_lockname *name, + struct file *file, struct file_lock *fl); + + int (*lm_plock) (lm_lockspace_t *lockspace, + struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl); + + int (*lm_punlock) (lm_lockspace_t *lockspace, + struct lm_lockname *name, + struct file *file, struct file_lock *fl); + + /* + * Client oriented operations + */ + + void (*lm_recovery_done) (lm_lockspace_t *lockspace, unsigned int jid, + unsigned int message); + + struct module *lm_owner; +}; + +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lvb_size - size in bytes of lock value blocks + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + * ls_flags - lock module features + */ + +struct lm_lockstruct { + unsigned int ls_jid; + unsigned int ls_first; + unsigned int ls_lvb_size; + lm_lockspace_t *ls_lockspace; + struct lm_lockops *ls_ops; + int ls_flags; +}; + +void __init gfs2_init_lmh(void); + +/* + * Lock module bottom interface. A lock module makes itself available to GFS + * with these functions. + */ + +int gfs2_register_lockproto(struct lm_lockops *proto); + +void gfs2_unregister_lockproto(struct lm_lockops *proto); + +/* + * Lock module top interface. GFS calls these functions when mounting or + * unmounting a file system. + */ + +int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj); + +void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct); + +void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct); + +#endif /* __LM_INTERFACE_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/glock.c @@ -0,0 +1,2254 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" + +/* Must be kept in sync with the beginning of struct gfs2_glock */ +struct glock_plug { + struct list_head gl_list; + unsigned long gl_flags; +}; + +struct greedy { + struct gfs2_holder gr_gh; + struct work_struct gr_work; +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); +static int dump_glock(struct gfs2_glock *gl); + +/** + * relaxed_state_ok - is a requested lock compatible with the current lock mode? + * @actual: the current state of the lock + * @requested: the lock state that was requested by the caller + * @flags: the modifier flags passed in by the caller + * + * Returns: 1 if the locks are compatible, 0 otherwise + */ + +static inline int relaxed_state_ok(unsigned int actual, unsigned requested, + int flags) +{ + if (actual == requested) + return 1; + + if (flags & GL_EXACT) + return 0; + + if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) + return 1; + + if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) + return 1; + + return 0; +} + +/** + * gl_hash() - Turn glock number into hash bucket number + * @lock: The glock number + * + * Returns: The number of the corresponding hash bucket + */ + +static unsigned int gl_hash(const struct lm_lockname *name) +{ + unsigned int h; + + h = jhash(&name->ln_number, sizeof(uint64_t), 0); + h = jhash(&name->ln_type, sizeof(unsigned int), h); + h &= GFS2_GL_HASH_MASK; + + return h; +} + +/** + * glock_free() - Perform a few checks and then release struct gfs2_glock + * @gl: The glock to release + * + * Also calls lock module to release its internal structure for this glock. + * + */ + +static void glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + + gfs2_lm_put_lock(sdp, gl->gl_lock); + + if (aspace) + gfs2_aspace_put(aspace); + + kmem_cache_free(gfs2_glock_cachep, gl); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +void gfs2_glock_hold(struct gfs2_glock *gl) +{ + kref_get(&gl->gl_ref); +} + +/* All work is done after the return from kref_put() so we + can release the write_lock before the free. */ + +static void kill_glock(struct kref *kref) +{ + struct gfs2_glock *gl = container_of(kref, struct gfs2_glock, gl_ref); + struct gfs2_sbd *sdp = gl->gl_sbd; + + gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); + gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); + gfs2_assert(sdp, list_empty(&gl->gl_holders)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters2)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +int gfs2_glock_put(struct gfs2_glock *gl) +{ + struct gfs2_gl_hash_bucket *bucket = gl->gl_bucket; + int rv = 0; + + write_lock(&bucket->hb_lock); + if (kref_put(&gl->gl_ref, kill_glock)) { + list_del_init(&gl->gl_list); + write_unlock(&bucket->hb_lock); + BUG_ON(spin_is_locked(&gl->gl_spin)); + glock_free(gl); + rv = 1; + goto out; + } + write_unlock(&bucket->hb_lock); +out: + return rv; +} + +/** + * queue_empty - check to see if a glock's queue is empty + * @gl: the glock + * @head: the head of the queue to check + * + * This function protects the list in the event that a process already + * has a holder on the list and is adding a second holder for itself. + * The glmutex lock is what generally prevents processes from working + * on the same glock at once, but the special case of adding a second + * holder for yourself ("recursive" locking) doesn't involve locking + * glmutex, making the spin lock necessary. + * + * Returns: 1 if the queue is empty + */ + +static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head) +{ + int empty; + spin_lock(&gl->gl_spin); + empty = list_empty(head); + spin_unlock(&gl->gl_spin); + return empty; +} + +/** + * search_bucket() - Find struct gfs2_glock by lock number + * @bucket: the bucket to search + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *search_bucket(struct gfs2_gl_hash_bucket *bucket, + const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_glock *gl; + + list_for_each_entry(gl, &bucket->hb_list, gl_list) { + if (test_bit(GLF_PLUG, &gl->gl_flags)) + continue; + if (!lm_name_equal(&gl->gl_name, name)) + continue; + if (gl->gl_sbd != sdp) + continue; + + kref_get(&gl->gl_ref); + + return gl; + } + + return NULL; +} + +/** + * gfs2_glock_find() - Find glock by lock number + * @sdp: The GFS2 superblock + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *gfs2_glock_find(struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_gl_hash_bucket *bucket = &sdp->sd_gl_hash[gl_hash(name)]; + struct gfs2_glock *gl; + + read_lock(&bucket->hb_lock); + gl = search_bucket(bucket, sdp, name); + read_unlock(&bucket->hb_lock); + + return gl; +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. + * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, uint64_t number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct lm_lockname name; + struct gfs2_glock *gl, *tmp; + struct gfs2_gl_hash_bucket *bucket; + int error; + + name.ln_number = number; + name.ln_type = glops->go_type; + bucket = &sdp->sd_gl_hash[gl_hash(&name)]; + + read_lock(&bucket->hb_lock); + gl = search_bucket(bucket, sdp, &name); + read_unlock(&bucket->hb_lock); + + if (gl || !create) { + *glp = gl; + return 0; + } + + gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); + if (!gl) + return -ENOMEM; + + gl->gl_flags = 0; + gl->gl_name = name; + kref_init(&gl->gl_ref); + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_owner = NULL; + gl->gl_ip = 0; + gl->gl_ops = glops; + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + gl->gl_vn = 0; + gl->gl_stamp = jiffies; + gl->gl_object = NULL; + gl->gl_bucket = bucket; + gl->gl_sbd = sdp; + gl->gl_aspace = NULL; + lops_init_le(&gl->gl_le, &gfs2_glock_lops); + + /* If this glock protects actual on-disk data or metadata blocks, + create a VFS inode to manage the pages/buffers holding them. */ + if (glops == &gfs2_inode_glops || + glops == &gfs2_rgrp_glops) { + gl->gl_aspace = gfs2_aspace_get(sdp); + if (!gl->gl_aspace) { + error = -ENOMEM; + goto fail; + } + } + + error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock); + if (error) + goto fail_aspace; + + write_lock(&bucket->hb_lock); + tmp = search_bucket(bucket, sdp, &name); + if (tmp) { + write_unlock(&bucket->hb_lock); + glock_free(gl); + gl = tmp; + } else { + list_add_tail(&gl->gl_list, &bucket->hb_list); + write_unlock(&bucket->hb_lock); + } + + *glp = gl; + + return 0; + +fail_aspace: + if (gl->gl_aspace) + gfs2_aspace_put(gl->gl_aspace); +fail: + kmem_cache_free(gfs2_glock_cachep, gl); + return error; +} + +/** + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_owner = current; + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; + init_completion(&gh->gh_wait); + + if (gh->gh_state == LM_ST_EXCLUSIVE) + gh->gh_flags |= GL_LOCAL_EXCL; + + gfs2_glock_hold(gl); +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. + * + */ + +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + if (gh->gh_state == LM_ST_EXCLUSIVE) + gh->gh_flags |= GL_LOCAL_EXCL; + + gh->gh_iflags &= 1 << HIF_ALLOCED; + gh->gh_ip = (unsigned long)__builtin_return_address(0); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; +} + +/** + * gfs2_holder_get - get a struct gfs2_holder structure + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gfp_flags: + * + * Figure out how big an impact this function has. Either: + * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd + * 2) Leave it like it is + * + * Returns: the holder structure, NULL on ENOMEM + */ + +static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, + unsigned int state, + int flags, gfp_t gfp_flags) +{ + struct gfs2_holder *gh; + + gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags); + if (!gh) + return NULL; + + gfs2_holder_init(gl, state, flags, gh); + set_bit(HIF_ALLOCED, &gh->gh_iflags); + gh->gh_ip = (unsigned long)__builtin_return_address(0); + return gh; +} + +/** + * gfs2_holder_put - get rid of a struct gfs2_holder structure + * @gh: the holder structure + * + */ + +static void gfs2_holder_put(struct gfs2_holder *gh) +{ + gfs2_holder_uninit(gh); + kfree(gh); +} + +/** + * rq_mutex - process a mutex request in the queue + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_mutex(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + list_del_init(&gh->gh_list); + /* gh->gh_error never examined. */ + set_bit(GLF_LOCK, &gl->gl_flags); + complete(&gh->gh_wait); + + return 1; +} + +/** + * rq_promote - process a promote request in the queue + * @gh: the glock holder + * + * Acquire a new inter-node lock, or change a lock state to more restrictive. + * + * Returns: 1 if the queue is blocked + */ + +static int rq_promote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { + if (list_empty(&gl->gl_holders)) { + gl->gl_req_gh = gh; + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (atomic_read(&sdp->sd_reclaim_count) > + gfs2_tune_get(sdp, gt_reclaim_limit) && + !(gh->gh_flags & LM_FLAG_PRIORITY)) { + gfs2_reclaim_glock(sdp); + gfs2_reclaim_glock(sdp); + } + + glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + spin_lock(&gl->gl_spin); + } + return 1; + } + + if (list_empty(&gl->gl_holders)) { + set_bit(HIF_FIRST, &gh->gh_iflags); + set_bit(GLF_LOCK, &gl->gl_flags); + } else { + struct gfs2_holder *next_gh; + if (gh->gh_flags & GL_LOCAL_EXCL) + return 1; + next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, + gh_list); + if (next_gh->gh_flags & GL_LOCAL_EXCL) + return 1; + } + + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh->gh_error = 0; + set_bit(HIF_HOLDER, &gh->gh_iflags); + + complete(&gh->gh_wait); + + return 0; +} + +/** + * rq_demote - process a demote request in the queue + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (!list_empty(&gl->gl_holders)) + return 1; + + if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) { + list_del_init(&gh->gh_list); + gh->gh_error = 0; + spin_unlock(&gl->gl_spin); + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + spin_lock(&gl->gl_spin); + } else { + gl->gl_req_gh = gh; + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (gh->gh_state == LM_ST_UNLOCKED || + gl->gl_state != LM_ST_EXCLUSIVE) + glops->go_drop_th(gl); + else + glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + + spin_lock(&gl->gl_spin); + } + + return 0; +} + +/** + * rq_greedy - process a queued request to drop greedy status + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_greedy(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + list_del_init(&gh->gh_list); + /* gh->gh_error never examined. */ + clear_bit(GLF_GREEDY, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + gfs2_holder_uninit(gh); + kfree(container_of(gh, struct greedy, gr_gh)); + + spin_lock(&gl->gl_spin); + + return 0; +} + +/** + * run_queue - process holder structures on a glock + * @gl: the glock + * + */ +static void run_queue(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + int blocked = 1; + + for (;;) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + break; + + if (!list_empty(&gl->gl_waiters1)) { + gh = list_entry(gl->gl_waiters1.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_MUTEX, &gh->gh_iflags)) + blocked = rq_mutex(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else if (!list_empty(&gl->gl_waiters2) && + !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) { + gh = list_entry(gl->gl_waiters2.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) + blocked = rq_demote(gh); + else if (test_bit(HIF_GREEDY, &gh->gh_iflags)) + blocked = rq_greedy(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else if (!list_empty(&gl->gl_waiters3)) { + gh = list_entry(gl->gl_waiters3.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_PROMOTE, &gh->gh_iflags)) + blocked = rq_promote(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else + break; + + if (blocked) + break; + } +} + +/** + * gfs2_glmutex_lock - acquire a local lock on a glock + * @gl: the glock + * + * Gives caller exclusive access to manipulate a glock structure. + */ + +static void gfs2_glmutex_lock(struct gfs2_glock *gl) +{ + struct gfs2_holder gh; + + gfs2_holder_init(gl, 0, 0, &gh); + set_bit(HIF_MUTEX, &gh.gh_iflags); + + spin_lock(&gl->gl_spin); + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + list_add_tail(&gh.gh_list, &gl->gl_waiters1); + else { + gl->gl_owner = current; + gl->gl_ip = (unsigned long)__builtin_return_address(0); + complete(&gh.gh_wait); + } + spin_unlock(&gl->gl_spin); + + wait_for_completion(&gh.gh_wait); + gfs2_holder_uninit(&gh); +} + +/** + * gfs2_glmutex_trylock - try to acquire a local lock on a glock + * @gl: the glock + * + * Returns: 1 if the glock is acquired + */ + +static int gfs2_glmutex_trylock(struct gfs2_glock *gl) +{ + int acquired = 1; + + spin_lock(&gl->gl_spin); + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + acquired = 0; + else { + gl->gl_owner = current; + gl->gl_ip = (unsigned long)__builtin_return_address(0); + } + spin_unlock(&gl->gl_spin); + + return acquired; +} + +/** + * gfs2_glmutex_unlock - release a local lock on a glock + * @gl: the glock + * + */ + +static void gfs2_glmutex_unlock(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + gl->gl_owner = NULL; + gl->gl_ip = 0; + run_queue(gl); + BUG_ON(!spin_is_locked(&gl->gl_spin)); + spin_unlock(&gl->gl_spin); +} + +/** + * handle_callback - add a demote request to a lock's queue + * @gl: the glock + * @state: the state the caller wants us to change to + * + * Note: This may fail sliently if we are out of memory. + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state) +{ + struct gfs2_holder *gh, *new_gh = NULL; + +restart: + spin_lock(&gl->gl_spin); + + list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { + if (test_bit(HIF_DEMOTE, &gh->gh_iflags) && + gl->gl_req_gh != gh) { + if (gh->gh_state != state) + gh->gh_state = LM_ST_UNLOCKED; + goto out; + } + } + + if (new_gh) { + list_add_tail(&new_gh->gh_list, &gl->gl_waiters2); + new_gh = NULL; + } else { + spin_unlock(&gl->gl_spin); + + new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL); + if (!new_gh) + return; + set_bit(HIF_DEMOTE, &new_gh->gh_iflags); + set_bit(HIF_DEALLOC, &new_gh->gh_iflags); + + goto restart; + } + +out: + spin_unlock(&gl->gl_spin); + + if (new_gh) + gfs2_holder_put(new_gh); +} + +void gfs2_glock_inode_squish(struct inode *inode) +{ + struct gfs2_holder gh; + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh); + set_bit(HIF_DEMOTE, &gh.gh_iflags); + spin_lock(&gl->gl_spin); + gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders)); + list_add_tail(&gh.gh_list, &gl->gl_waiters2); + run_queue(gl); + spin_unlock(&gl->gl_spin); + wait_for_completion(&gh.gh_wait); + gfs2_holder_uninit(&gh); +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put(gl); + } + + gl->gl_state = new_state; +} + +/** + * xmote_bh - Called after the lock module is done acquiring a lock + * @gl: The glock in question + * @ret: the int returned from the lock module + * + */ + +static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh = gl->gl_req_gh; + int prev_state = gl->gl_state; + int op_done = 1; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); + + state_change(gl, ret & LM_OUT_ST_MASK); + + if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) { + if (glops->go_inval) + glops->go_inval(gl, DIO_METADATA | DIO_DATA); + } else if (gl->gl_state == LM_ST_DEFERRED) { + /* We might not want to do this here. + Look at moving to the inode glops. */ + if (glops->go_inval) + glops->go_inval(gl, DIO_DATA); + } + + /* Deal with each possible exit condition */ + + if (!gh) + gl->gl_stamp = jiffies; + + else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = -EIO; + spin_unlock(&gl->gl_spin); + + } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + if (gl->gl_state == gh->gh_state || + gl->gl_state == LM_ST_UNLOCKED) + gh->gh_error = 0; + else { + if (gfs2_assert_warn(sdp, gh->gh_flags & + (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1) + fs_warn(sdp, "ret = 0x%.8X\n", ret); + gh->gh_error = GLR_TRYFAILED; + } + spin_unlock(&gl->gl_spin); + + if (ret & LM_OUT_CANCELED) + handle_callback(gl, LM_ST_UNLOCKED); /* Lame */ + + } else if (ret & LM_OUT_CANCELED) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = GLR_CANCELED; + spin_unlock(&gl->gl_spin); + + } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { + spin_lock(&gl->gl_spin); + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh->gh_error = 0; + set_bit(HIF_HOLDER, &gh->gh_iflags); + spin_unlock(&gl->gl_spin); + + set_bit(HIF_FIRST, &gh->gh_iflags); + + op_done = 0; + + } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = GLR_TRYFAILED; + spin_unlock(&gl->gl_spin); + + } else { + if (gfs2_assert_withdraw(sdp, 0) == -1) + fs_err(sdp, "ret = 0x%.8X\n", ret); + } + + if (glops->go_xmote_bh) + glops->go_xmote_bh(gl); + + if (op_done) { + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + } + + gfs2_glock_put(gl); + + if (gh) { + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + } +} + +/** + * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock + * @gl: The glock in question + * @state: the requested state + * @flags: modifier flags to the lock call + * + */ + +void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | + LM_FLAG_NOEXP | LM_FLAG_ANY | + LM_FLAG_PRIORITY); + unsigned int lck_ret; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); + gfs2_assert_warn(sdp, state != gl->gl_state); + + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (glops->go_sync) + glops->go_sync(gl, + DIO_METADATA | DIO_DATA | DIO_RELEASE); + } + + gfs2_glock_hold(gl); + gl->gl_req_bh = xmote_bh; + + lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); + + if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) + return; + + if (lck_ret & LM_OUT_ASYNC) + gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); + else + xmote_bh(gl, lck_ret); +} + +/** + * drop_bh - Called after a lock module unlock completes + * @gl: the glock + * @ret: the return status + * + * Doesn't wake up the process waiting on the struct gfs2_holder (if any) + * Doesn't drop the reference on the glock the top half took out + * + */ + +static void drop_bh(struct gfs2_glock *gl, unsigned int ret) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh = gl->gl_req_gh; + + clear_bit(GLF_PREFETCH, &gl->gl_flags); + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, !ret); + + state_change(gl, LM_ST_UNLOCKED); + + if (glops->go_inval) + glops->go_inval(gl, DIO_METADATA | DIO_DATA); + + if (gh) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = 0; + spin_unlock(&gl->gl_spin); + } + + if (glops->go_drop_bh) + glops->go_drop_bh(gl); + + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + gfs2_glock_put(gl); + + if (gh) { + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + } +} + +/** + * gfs2_glock_drop_th - call into the lock module to unlock a lock + * @gl: the glock + * + */ + +void gfs2_glock_drop_th(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned int ret; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); + + if (gl->gl_state == LM_ST_EXCLUSIVE) { + if (glops->go_sync) + glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE); + } + + gfs2_glock_hold(gl); + gl->gl_req_bh = drop_bh; + + ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); + + if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) + return; + + if (!ret) + drop_bh(gl, ret); + else + gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); +} + +/** + * do_cancels - cancel requests for locks stuck waiting on an expire flag + * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock + * + * Don't cancel GL_NOCANCEL requests. + */ + +static void do_cancels(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_spin); + + while (gl->gl_req_gh != gh && + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + !list_empty(&gh->gh_list)) { + if (gl->gl_req_bh && + !(gl->gl_req_gh && + (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { + spin_unlock(&gl->gl_spin); + gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); + msleep(100); + spin_lock(&gl->gl_spin); + } else { + spin_unlock(&gl->gl_spin); + msleep(100); + spin_lock(&gl->gl_spin); + } + } + + spin_unlock(&gl->gl_spin); +} + +/** + * glock_wait_internal - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +static int glock_wait_internal(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (test_bit(HIF_ABORTED, &gh->gh_iflags)) + return -EIO; + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + spin_lock(&gl->gl_spin); + if (gl->gl_req_gh != gh && + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + !list_empty(&gh->gh_list)) { + list_del_init(&gh->gh_list); + gh->gh_error = GLR_TRYFAILED; + run_queue(gl); + spin_unlock(&gl->gl_spin); + return gh->gh_error; + } + spin_unlock(&gl->gl_spin); + } + + if (gh->gh_flags & LM_FLAG_PRIORITY) + do_cancels(gh); + + wait_for_completion(&gh->gh_wait); + + if (gh->gh_error) + return gh->gh_error; + + gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); + gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, + gh->gh_state, + gh->gh_flags)); + + if (test_bit(HIF_FIRST, &gh->gh_iflags)) { + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + + if (glops->go_lock) { + gh->gh_error = glops->go_lock(gh); + if (gh->gh_error) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + spin_unlock(&gl->gl_spin); + } + } + + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + } + + return gh->gh_error; +} + +static inline struct gfs2_holder * +find_holder_by_owner(struct list_head *head, struct task_struct *owner) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, head, gh_list) { + if (gh->gh_owner == owner) + return gh; + } + + return NULL; +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + */ + +static void add_to_queue(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_holder *existing; + + BUG_ON(!gh->gh_owner); + + existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); + if (existing) { + print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); + printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid); + printk(KERN_INFO "lock type : %d lock state : %d\n", + existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); + print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); + printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid); + printk(KERN_INFO "lock type : %d lock state : %d\n", + gl->gl_name.ln_type, gl->gl_state); + BUG(); + } + + existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner); + if (existing) { + print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); + print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); + BUG(); + } + + if (gh->gh_flags & LM_FLAG_PRIORITY) + list_add(&gh->gh_list, &gl->gl_waiters3); + else + list_add_tail(&gh->gh_list, &gl->gl_waiters3); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + int error = 0; + +restart: + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + set_bit(HIF_ABORTED, &gh->gh_iflags); + return -EIO; + } + + set_bit(HIF_PROMOTE, &gh->gh_iflags); + + spin_lock(&gl->gl_spin); + add_to_queue(gh); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + if (!(gh->gh_flags & GL_ASYNC)) { + error = glock_wait_internal(gh); + if (error == GLR_CANCELED) { + msleep(100); + goto restart; + } + } + + clear_bit(GLF_PREFETCH, &gl->gl_flags); + + if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP)) + dump_glock(gl); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + int ready = 0; + + spin_lock(&gl->gl_spin); + + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + ready = 1; + else if (list_empty(&gh->gh_list)) { + if (gh->gh_error == GLR_CANCELED) { + spin_unlock(&gl->gl_spin); + msleep(100); + if (gfs2_glock_nq(gh)) + return 1; + return 0; + } else + ready = 1; + } + + spin_unlock(&gl->gl_spin); + + return ready; +} + +/** + * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC + * @gh: the holder structure + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + int error; + + error = glock_wait_internal(gh); + if (error == GLR_CANCELED) { + msleep(100); + gh->gh_flags &= ~GL_ASYNC; + error = gfs2_glock_nq(gh); + } + + return error; +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ + +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED); + + gfs2_glmutex_lock(gl); + + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + + if (list_empty(&gl->gl_holders)) { + spin_unlock(&gl->gl_spin); + + if (glops->go_unlock) + glops->go_unlock(gh); + + gl->gl_stamp = jiffies; + + spin_lock(&gl->gl_spin); + } + + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); +} + +/** + * gfs2_glock_prefetch - Try to prefetch a glock + * @gl: the glock + * @state: the state to prefetch in + * @flags: flags passed to go_xmote_th() + * + */ + +static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + spin_lock(&gl->gl_spin); + + if (test_bit(GLF_LOCK, &gl->gl_flags) || + !list_empty(&gl->gl_holders) || + !list_empty(&gl->gl_waiters1) || + !list_empty(&gl->gl_waiters2) || + !list_empty(&gl->gl_waiters3) || + relaxed_state_ok(gl->gl_state, state, flags)) { + spin_unlock(&gl->gl_spin); + return; + } + + set_bit(GLF_PREFETCH, &gl->gl_flags); + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + glops->go_xmote_th(gl, state, flags); +} + +static void greedy_work(void *data) +{ + struct greedy *gr = data; + struct gfs2_holder *gh = &gr->gr_gh; + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); + + if (glops->go_greedy) + glops->go_greedy(gl); + + spin_lock(&gl->gl_spin); + + if (list_empty(&gl->gl_waiters2)) { + clear_bit(GLF_GREEDY, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_holder_uninit(gh); + kfree(gr); + } else { + gfs2_glock_hold(gl); + list_add_tail(&gh->gh_list, &gl->gl_waiters2); + run_queue(gl); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); + } +} + +/** + * gfs2_glock_be_greedy - + * @gl: + * @time: + * + * Returns: 0 if go_greedy will be called, 1 otherwise + */ + +int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time) +{ + struct greedy *gr; + struct gfs2_holder *gh; + + if (!time || gl->gl_sbd->sd_args.ar_localcaching || + test_and_set_bit(GLF_GREEDY, &gl->gl_flags)) + return 1; + + gr = kmalloc(sizeof(struct greedy), GFP_KERNEL); + if (!gr) { + clear_bit(GLF_GREEDY, &gl->gl_flags); + return 1; + } + gh = &gr->gr_gh; + + gfs2_holder_init(gl, 0, 0, gh); + set_bit(HIF_GREEDY, &gh->gh_iflags); + INIT_WORK(&gr->gr_work, greedy_work, gr); + + set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); + schedule_delayed_work(&gr->gr_work, time); + + return 0; +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the aquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, uint64_t number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a; + struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b; + struct lm_lockname *a = &gh_a->gh_gl->gl_name; + struct lm_lockname *b = &gh_b->gh_gl->gl_name; + int ret = 0; + + if (a->ln_number > b->ln_number) + ret = 1; + else if (a->ln_number < b->ln_number) + ret = -1; + else { + if (gh_a->gh_state == LM_ST_SHARED && + gh_b->gh_state == LM_ST_EXCLUSIVE) + ret = 1; + else if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && + (gh_b->gh_flags & GL_LOCAL_EXCL)) + ret = 1; + } + + return ret; +} + +/** + * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); + + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Figure out how big an impact this function has. Either: + * 1) Replace this code with code that calls gfs2_glock_prefetch() + * 2) Forget async stuff and just call nq_m_sync() + * 3) Leave it like it is + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + int *e; + unsigned int x; + int borked = 0, serious = 0; + int error = 0; + + if (!num_gh) + return 0; + + if (num_gh == 1) { + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + return gfs2_glock_nq(ghs); + } + + e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL); + if (!e) + return -ENOMEM; + + for (x = 0; x < num_gh; x++) { + ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC; + error = gfs2_glock_nq(&ghs[x]); + if (error) { + borked = 1; + serious = error; + num_gh = x; + break; + } + } + + for (x = 0; x < num_gh; x++) { + error = e[x] = glock_wait_internal(&ghs[x]); + if (error) { + borked = 1; + if (error != GLR_TRYFAILED && error != GLR_CANCELED) + serious = error; + } + } + + if (!borked) { + kfree(e); + return 0; + } + + for (x = 0; x < num_gh; x++) + if (!e[x]) + gfs2_glock_dq(&ghs[x]); + + if (serious) + error = serious; + else { + for (x = 0; x < num_gh; x++) + gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags, + &ghs[x]); + error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e); + } + + kfree(e); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + unsigned int x; + + for (x = 0; x < num_gh; x++) + gfs2_glock_dq(&ghs[x]); +} + +/** + * gfs2_glock_dq_uninit_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + unsigned int x; + + for (x = 0; x < num_gh; x++) + gfs2_glock_dq_uninit(&ghs[x]); +} + +/** + * gfs2_glock_prefetch_num - prefetch a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the aquisition + * + * Returns: errno + */ + +void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags) +{ + struct gfs2_glock *gl; + int error; + + if (atomic_read(&sdp->sd_reclaim_count) < + gfs2_tune_get(sdp, gt_reclaim_limit)) { + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + gfs2_glock_prefetch(gl, state, flags); + gfs2_glock_put(gl); + } + } +} + +/** + * gfs2_lvb_hold - attach a LVB from a glock + * @gl: The glock in question + * + */ + +int gfs2_lvb_hold(struct gfs2_glock *gl) +{ + int error; + + gfs2_glmutex_lock(gl); + + if (!atomic_read(&gl->gl_lvb_count)) { + error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); + if (error) { + gfs2_glmutex_unlock(gl); + return error; + } + gfs2_glock_hold(gl); + } + atomic_inc(&gl->gl_lvb_count); + + gfs2_glmutex_unlock(gl); + + return 0; +} + +/** + * gfs2_lvb_unhold - detach a LVB from a glock + * @gl: The glock in question + * + */ + +void gfs2_lvb_unhold(struct gfs2_glock *gl) +{ + gfs2_glock_hold(gl); + gfs2_glmutex_lock(gl); + + gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); + if (atomic_dec_and_test(&gl->gl_lvb_count)) { + gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); + gl->gl_lvb = NULL; + gfs2_glock_put(gl); + } + + gfs2_glmutex_unlock(gl); + gfs2_glock_put(gl); +} + +#if 0 +void gfs2_lvb_sync(struct gfs2_glock *gl) +{ + gfs2_glmutex_lock(gl); + + gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count)); + if (!gfs2_assert_warn(gl->gl_sbd, gfs2_glock_is_held_excl(gl))) + gfs2_lm_sync_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); + + gfs2_glmutex_unlock(gl); +} +#endif /* 0 */ + +static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, + unsigned int state) +{ + struct gfs2_glock *gl; + + gl = gfs2_glock_find(sdp, name); + if (!gl) + return; + + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl, state); + handle_callback(gl, state); + + spin_lock(&gl->gl_spin); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_cb - Callback used by locking module + * @fsdata: Pointer to the superblock + * @type: Type of callback + * @data: Type dependent data pointer + * + * Called by the locking module when it wants to tell us something. + * Either we need to drop a lock, one of our ASYNC requests completed, or + * a journal from another client needs to be recovered. + */ + +void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data) +{ + struct gfs2_sbd *sdp = (struct gfs2_sbd *)fsdata; + + switch (type) { + case LM_CB_NEED_E: + blocking_cb(sdp, data, LM_ST_UNLOCKED); + return; + + case LM_CB_NEED_D: + blocking_cb(sdp, data, LM_ST_DEFERRED); + return; + + case LM_CB_NEED_S: + blocking_cb(sdp, data, LM_ST_SHARED); + return; + + case LM_CB_ASYNC: { + struct lm_async_cb *async = data; + struct gfs2_glock *gl; + + gl = gfs2_glock_find(sdp, &async->lc_name); + if (gfs2_assert_warn(sdp, gl)) + return; + if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) + gl->gl_req_bh(gl, async->lc_ret); + gfs2_glock_put(gl); + return; + } + + case LM_CB_NEED_RECOVERY: + gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data); + if (sdp->sd_recoverd_process) + wake_up_process(sdp->sd_recoverd_process); + return; + + case LM_CB_DROPLOCKS: + gfs2_gl_hash_clear(sdp, NO_WAIT); + gfs2_quota_scan(sdp); + return; + + default: + gfs2_assert_warn(sdp, 0); + return; + } +} + +/** + * gfs2_iopen_go_callback - Try to kick the inode/vnode associated with an + * iopen glock from memory + * @io_gl: the iopen glock + * @state: the state into which the glock should be put + * + */ + +void gfs2_iopen_go_callback(struct gfs2_glock *io_gl, unsigned int state) +{ + + if (state != LM_ST_UNLOCKED) + return; + /* FIXME: remove this? */ +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int demote = 1; + + if (test_bit(GLF_STICKY, &gl->gl_flags)) + demote = 0; + else if (test_bit(GLF_PREFETCH, &gl->gl_flags)) + demote = time_after_eq(jiffies, + gl->gl_stamp + + gfs2_tune_get(sdp, gt_prefetch_secs) * HZ); + else if (glops->go_demote_ok) + demote = glops->go_demote_ok(gl); + + return demote; +} + +/** + * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + */ + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + spin_lock(&sdp->sd_reclaim_lock); + if (list_empty(&gl->gl_reclaim)) { + gfs2_glock_hold(gl); + list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); + atomic_inc(&sdp->sd_reclaim_count); + } + spin_unlock(&sdp->sd_reclaim_lock); + + wake_up(&sdp->sd_reclaim_wq); +} + +/** + * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list + * @sdp: the filesystem + * + * Called from gfs2_glockd() glock reclaim daemon, or when promoting a + * different glock and we notice that there are a lot of glocks in the + * reclaim list. + * + */ + +void gfs2_reclaim_glock(struct gfs2_sbd *sdp) +{ + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_reclaim_lock); + if (list_empty(&sdp->sd_reclaim_list)) { + spin_unlock(&sdp->sd_reclaim_lock); + return; + } + gl = list_entry(sdp->sd_reclaim_list.next, + struct gfs2_glock, gl_reclaim); + list_del_init(&gl->gl_reclaim); + spin_unlock(&sdp->sd_reclaim_lock); + + atomic_dec(&sdp->sd_reclaim_count); + atomic_inc(&sdp->sd_reclaimed); + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED && + demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED); + gfs2_glmutex_unlock(gl); + } + + gfs2_glock_put(gl); +} + +/** + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * @bucket: the bucket + * + * Returns: 1 if the bucket has entries + */ + +static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, + struct gfs2_gl_hash_bucket *bucket) +{ + struct glock_plug plug; + struct list_head *tmp; + struct gfs2_glock *gl; + int entries; + + /* Add "plug" to end of bucket list, work back up list from there */ + memset(&plug.gl_flags, 0, sizeof(unsigned long)); + set_bit(GLF_PLUG, &plug.gl_flags); + + write_lock(&bucket->hb_lock); + list_add(&plug.gl_list, &bucket->hb_list); + write_unlock(&bucket->hb_lock); + + for (;;) { + write_lock(&bucket->hb_lock); + + for (;;) { + tmp = plug.gl_list.next; + + if (tmp == &bucket->hb_list) { + list_del(&plug.gl_list); + entries = !list_empty(&bucket->hb_list); + write_unlock(&bucket->hb_lock); + return entries; + } + gl = list_entry(tmp, struct gfs2_glock, gl_list); + + /* Move plug up list */ + list_move(&plug.gl_list, &gl->gl_list); + + if (test_bit(GLF_PLUG, &gl->gl_flags)) + continue; + + /* examiner() must glock_put() */ + gfs2_glock_hold(gl); + + break; + } + + write_unlock(&bucket->hb_lock); + + examiner(gl); + } +} + +/** + * scan_glock - look at a glock and see if we can reclaim it + * @gl: the glock to look at + * + */ + +static void scan_glock(struct gfs2_glock *gl) +{ + if (gl->gl_ops == &gfs2_inode_glops) + goto out; + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED && + demote_ok(gl)) + goto out_schedule; + gfs2_glmutex_unlock(gl); + } +out: + gfs2_glock_put(gl); + return; + +out_schedule: + gfs2_glmutex_unlock(gl); + gfs2_glock_schedule_for_reclaim(gl); + gfs2_glock_put(gl); +} + +/** + * gfs2_scand_internal - Look for glocks and inodes to toss from memory + * @sdp: the filesystem + * + */ + +void gfs2_scand_internal(struct gfs2_sbd *sdp) +{ + unsigned int x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { + examine_bucket(scan_glock, sdp, &sdp->sd_gl_hash[x]); + cond_resched(); + } +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + int released; + + spin_lock(&sdp->sd_reclaim_lock); + if (!list_empty(&gl->gl_reclaim)) { + list_del_init(&gl->gl_reclaim); + atomic_dec(&sdp->sd_reclaim_count); + spin_unlock(&sdp->sd_reclaim_lock); + released = gfs2_glock_put(gl); + gfs2_assert(sdp, !released); + } else { + spin_unlock(&sdp->sd_reclaim_lock); + } + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED); + + gfs2_glmutex_unlock(gl); + } + + gfs2_glock_put(gl); +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * @wait: wait until it's all gone + * + * Called when unmounting the filesystem, or when inter-node lock manager + * requests DROPLOCKS because it is running out of capacity. + */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +{ + unsigned long t; + unsigned int x; + int cont; + + t = jiffies; + + for (;;) { + cont = 0; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + if (examine_bucket(clear_glock, sdp, + &sdp->sd_gl_hash[x])) + cont = 1; + + if (!wait || !cont) + break; + + if (time_after_eq(jiffies, + t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) { + fs_warn(sdp, "Unmount seems to be stalled. " + "Dumping lock state...\n"); + gfs2_dump_lockstate(sdp); + t = jiffies; + } + + invalidate_inodes(sdp->sd_vfs); + msleep(10); + } +} + +/* + * Diagnostic routines to help debug distributed deadlock + */ + +/** + * dump_holder - print information about a glock holder + * @str: a string naming the type of holder + * @gh: the glock holder + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_holder(char *str, struct gfs2_holder *gh) +{ + unsigned int x; + int error = -ENOBUFS; + + printk(KERN_INFO " %s\n", str); + printk(KERN_INFO " owner = %ld\n", + (gh->gh_owner) ? (long)gh->gh_owner->pid : -1); + printk(KERN_INFO " gh_state = %u\n", gh->gh_state); + printk(KERN_INFO " gh_flags ="); + for (x = 0; x < 32; x++) + if (gh->gh_flags & (1 << x)) + printk(" %u", x); + printk(" \n"); + printk(KERN_INFO " error = %d\n", gh->gh_error); + printk(KERN_INFO " gh_iflags ="); + for (x = 0; x < 32; x++) + if (test_bit(x, &gh->gh_iflags)) + printk(" %u", x); + printk(" \n"); + print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip); + + error = 0; + + return error; +} + +/** + * dump_inode - print information about an inode + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_inode(struct gfs2_inode *ip) +{ + unsigned int x; + int error = -ENOBUFS; + + printk(KERN_INFO " Inode:\n"); + printk(KERN_INFO " num = %llu %llu\n", + (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr); + printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode)); + printk(KERN_INFO " i_flags ="); + for (x = 0; x < 32; x++) + if (test_bit(x, &ip->i_flags)) + printk(" %u", x); + printk(" \n"); + + error = 0; + + return error; +} + +/** + * dump_glock - print information about a glock + * @gl: the glock + * @count: where we are in the buffer + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_glock(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + unsigned int x; + int error = -ENOBUFS; + + spin_lock(&gl->gl_spin); + + printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", + gl, + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + printk(KERN_INFO " gl_flags ="); + for (x = 0; x < 32; x++) + if (test_bit(x, &gl->gl_flags)) + printk(" %u", x); + printk(" \n"); + printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref.refcount)); + printk(KERN_INFO " gl_state = %u\n", gl->gl_state); + printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm); + print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip); + printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); + printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); + printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); + printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no"); + printk(KERN_INFO " le = %s\n", + (list_empty(&gl->gl_le.le_list)) ? "no" : "yes"); + printk(KERN_INFO " reclaim = %s\n", + (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); + if (gl->gl_aspace) + printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", + gl->gl_aspace, + gl->gl_aspace->i_mapping->nrpages); + else + printk(KERN_INFO " aspace = no\n"); + printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count)); + if (gl->gl_req_gh) { + error = dump_holder("Request", gl->gl_req_gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + error = dump_holder("Holder", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { + error = dump_holder("Waiter1", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { + error = dump_holder("Waiter2", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { + error = dump_holder("Waiter3", gh); + if (error) + goto out; + } + if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { + if (!test_bit(GLF_LOCK, &gl->gl_flags) && + list_empty(&gl->gl_holders)) { + error = dump_inode(gl->gl_object); + if (error) + goto out; + } else { + error = -ENOBUFS; + printk(KERN_INFO " Inode: busy\n"); + } + } + + error = 0; + + out: + spin_unlock(&gl->gl_spin); + + return error; +} + +/** + * gfs2_dump_lockstate - print out the current lockstate + * @sdp: the filesystem + * @ub: the buffer to copy the information into + * + * If @ub is NULL, dump the lockstate to the console. + * + */ + +static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) +{ + struct gfs2_gl_hash_bucket *bucket; + struct gfs2_glock *gl; + unsigned int x; + int error = 0; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { + bucket = &sdp->sd_gl_hash[x]; + + read_lock(&bucket->hb_lock); + + list_for_each_entry(gl, &bucket->hb_list, gl_list) { + if (test_bit(GLF_PLUG, &gl->gl_flags)) + continue; + + error = dump_glock(gl); + if (error) + break; + } + + read_unlock(&bucket->hb_lock); + + if (error) + break; + } + + + return error; +} + --- /dev/null +++ b/fs/gfs2/glock.h @@ -0,0 +1,151 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +/* Flags for lock requests; used in gfs2_holder gh_flag field. + From lm_interface.h: +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 */ + +#define GL_LOCAL_EXCL 0x00000020 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_ATIME 0x00000200 +#define GL_NOCACHE 0x00000400 +#define GL_NOCANCEL 0x00001000 +#define GL_AOP 0x00004000 +#define GL_DUMP 0x00008000 + +#define GLR_TRYFAILED 13 +#define GLR_CANCELED 14 + +static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + int locked = 0; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_spin); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (gh->gh_owner == current) { + locked = 1; + break; + } + } + spin_unlock(&gl->gl_spin); + + return locked; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return (gl->gl_state == LM_ST_EXCLUSIVE); +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return (gl->gl_state == LM_ST_DEFERRED); +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return (gl->gl_state == LM_ST_SHARED); +} + +static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3); + spin_unlock(&gl->gl_spin); + return ret; +} + +int gfs2_glock_get(struct gfs2_sbd *sdp, + uint64_t number, const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +int gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); + +void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags); +void gfs2_glock_drop_th(struct gfs2_glock *gl); + +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +void gfs2_glock_dq(struct gfs2_holder *gh); + +int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time); + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, + uint64_t number, const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh); + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, uint64_t number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags); +void gfs2_glock_inode_squish(struct inode *inode); + +/** + * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, int flags, + struct gfs2_holder *gh) +{ + int error; + + gfs2_holder_init(gl, state, flags, gh); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + +/* Lock Value Block functions */ + +int gfs2_lvb_hold(struct gfs2_glock *gl); +void gfs2_lvb_unhold(struct gfs2_glock *gl); + +void gfs2_glock_cb(lm_fsdata_t *fsdata, unsigned int type, void *data); + +void gfs2_iopen_go_callback(struct gfs2_glock *gl, unsigned int state); + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); +void gfs2_reclaim_glock(struct gfs2_sbd *sdp); + +void gfs2_scand_internal(struct gfs2_sbd *sdp); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); + +#endif /* __GLOCK_DOT_H__ */ [PATCH 03/16] GFS2: bmap and inode functions The GFS2 block mapping and inode core code. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/bmap.c | 1236 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/bmap.h | 27 + fs/gfs2/format.h | 21 fs/gfs2/glops.c | 562 ++++++++++++++++++++++ fs/gfs2/glops.h | 23 fs/gfs2/inode.c | 1344 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/inode.h | 56 ++ 7 files changed, 3269 insertions(+) --- /dev/null +++ b/fs/gfs2/bmap.c @@ -0,0 +1,1236 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "ops_address.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + __u16 mp_list[GFS2_MAX_META_HEIGHT]; +}; + +typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, uint64_t *top, + uint64_t *bottom, unsigned int height, + void *data); + +struct strip_mine { + int sm_first; + unsigned int sm_height; +}; + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @private: any locked page held by the caller process + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + uint64_t block, struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct buffer_head *bh; + int release = 0; + + if (!page || page->index) { + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + release = 1; + } + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_di.di_size); + memset(kaddr + ip->i_di.di_size, 0, + PAGE_CACHE_SIZE - ip->i_di.di_size); + kunmap(page); + + SetPageUptodate(page); + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Uptodate)); + + bh = page_buffers(page); + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED) || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + mark_buffer_dirty(bh); + + if (release) { + unlock_page(page); + page_cache_release(page); + } + + return 0; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * @unstuffer: the routine that handles unstuffing a non-zero length file + * @private: private data for the unstuffer + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + uint64_t block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (ip->i_di.di_size) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + if (isdir) { + block = gfs2_alloc_meta(ip); + + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + block = gfs2_alloc_data(ip); + + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (ip->i_di.di_size) { + *(uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)) = + cpu_to_be64(block); + ip->i_di.di_blocks++; + } + + ip->i_di.di_height = 1; + + gfs2_dinode_out(&ip->i_di, dibh->b_data); + + out_brelse: + brelse(dibh); + + out: + up_write(&ip->i_rw_mutex); + + return error; +} + +/** + * calc_tree_height - Calculate the height of a metadata tree + * @ip: The GFS2 inode + * @size: The proposed size of the file + * + * Work out how tall a metadata tree needs to be in order to accommodate a + * file of a particular size. If size is less than the current size of + * the inode, then the current size of the inode is used instead of the + * supplied one. + * + * Returns: the height the tree should be + */ + +static unsigned int calc_tree_height(struct gfs2_inode *ip, uint64_t size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + uint64_t *arr; + unsigned int max, height; + + if (ip->i_di.di_size > size) + size = ip->i_di.di_size; + + if (gfs2_is_dir(ip)) { + arr = sdp->sd_jheightsize; + max = sdp->sd_max_jheight; + } else { + arr = sdp->sd_heightsize; + max = sdp->sd_max_height; + } + + for (height = 0; height < max; height++) + if (arr[height] >= size) + break; + + return height; +} + +/** + * build_height - Build a metadata tree of the requested height + * @ip: The GFS2 inode + * @height: The height to build to + * + * + * Returns: errno + */ + +static int build_height(struct inode *inode, unsigned height) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned new_height = height - ip->i_di.di_height; + struct buffer_head *dibh; + struct buffer_head *blocks[GFS2_MAX_META_HEIGHT]; + int error; + u64 *bp; + u64 bn; + unsigned n; + + if (height <= ip->i_di.di_height) + return 0; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + for(n = 0; n < new_height; n++) { + bn = gfs2_alloc_meta(ip); + blocks[n] = gfs2_meta_new(ip->i_gl, bn); + gfs2_trans_add_bh(ip->i_gl, blocks[n], 1); + } + + n = 0; + bn = blocks[0]->b_blocknr; + if (new_height > 1) { + for(; n < new_height-1; n++) { + gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, + GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(blocks[n], + sizeof(struct gfs2_meta_header)); + bp = (u64 *)(blocks[n]->b_data + + sizeof(struct gfs2_meta_header)); + *bp = cpu_to_be64(blocks[n+1]->b_blocknr); + brelse(blocks[n]); + blocks[n] = NULL; + } + } + gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(blocks[n]); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + bp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + *bp = cpu_to_be64(bn); + ip->i_di.di_height += new_height; + ip->i_di.di_blocks += new_height; + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + return error; +} + +/** + * find_metapath - Find path through the metadata tree + * @ip: The inode pointer + * @mp: The metapath to return the result in + * @block: The disk block to look up + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. + * + * find_metapath() would return a struct metapath structure set to: + * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, + * and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. + * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(struct gfs2_inode *ip, uint64_t block, + struct metapath *mp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + uint64_t b = block; + unsigned int i; + + for (i = ip->i_di.di_height; i--;) + mp->mp_list[i] = (__u16)do_div(b, sdp->sd_inptrs); + +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @bh: The buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. + */ + +static inline u64 *metapointer(struct buffer_head *bh, int *boundary, + unsigned int height, const struct metapath *mp) +{ + unsigned int head_size = (height > 0) ? + sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); + u64 *ptr; + *boundary = 0; + ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height]; + if (ptr + 1 == (u64*)(bh->b_data + bh->b_size)) + *boundary = 1; + return ptr; +} + +/** + * lookup_block - Get the next metadata block in metadata tree + * @ip: The GFS2 inode + * @bh: Buffer containing the pointers to metadata blocks + * @height: The height of the tree (0 = dinode) + * @mp: The metapath + * @create: Non-zero if we may create a new meatdata block + * @new: Used to indicate if we did create a new metadata block + * @block: the returned disk block number + * + * Given a metatree, complete to a particular height, checks to see if the next + * height of the tree exists. If not the next height of the tree is created. + * The block number of the next height of the metadata tree is returned. + * + */ + +static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, + unsigned int height, struct metapath *mp, int create, + int *new, uint64_t *block) +{ + int boundary; + uint64_t *ptr = metapointer(bh, &boundary, height, mp); + + if (*ptr) { + *block = be64_to_cpu(*ptr); + return boundary; + } + + *block = 0; + + if (!create) + return 0; + + if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) + *block = gfs2_alloc_data(ip); + else + *block = gfs2_alloc_meta(ip); + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + *ptr = cpu_to_be64(*block); + ip->i_di.di_blocks++; + + *new = 1; + return 0; +} + +/** + * gfs2_block_pointers - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @new: Value/Result argument (1 = may create/did create new blocks) + * @boundary: gets set if we've hit a block boundary + * @mp: metapath to use + * + * Find the block number on the current device which corresponds to an + * inode's block. If the block had to be created, "new" will be set. + * + * Returns: errno + */ + +static struct buffer_head *gfs2_block_pointers(struct inode *inode, u64 lblock, + int *new, u64 *dblock, + int *boundary, + struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + int create = *new; + unsigned int bsize; + unsigned int height; + unsigned int end_of_metadata; + unsigned int x; + int error = 0; + + *new = 0; + *dblock = 0; + + if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) + goto out; + + bsize = (gfs2_is_dir(ip)) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; + + height = calc_tree_height(ip, (lblock + 1) * bsize); + if (ip->i_di.di_height < height) { + if (!create) + goto out; + + error = build_height(inode, height); + if (error) + goto out; + } + + find_metapath(ip, lblock, mp); + end_of_metadata = ip->i_di.di_height - 1; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + for (x = 0; x < end_of_metadata; x++) { + lookup_block(ip, bh, x, mp, create, new, dblock); + brelse(bh); + if (!*dblock) + goto out; + + error = gfs2_meta_indirect_buffer(ip, x+1, *dblock, *new, &bh); + if (error) + goto out; + } + + *boundary = lookup_block(ip, bh, end_of_metadata, mp, create, new, dblock); + if (*new) { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + } + return bh; +out: + return ERR_PTR(error); +} + + +static inline void bmap_lock(struct inode *inode, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + if (create) + down_write(&ip->i_rw_mutex); + else + down_read(&ip->i_rw_mutex); +} + +static inline void bmap_unlock(struct inode *inode, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + if (create) + up_write(&ip->i_rw_mutex); + else + up_read(&ip->i_rw_mutex); +} + +int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary) +{ + struct metapath mp; + struct buffer_head *bh; + int create = *new; + + bmap_lock(inode, create); + bh = gfs2_block_pointers(inode, lblock, new, dblock, boundary, &mp); + bmap_unlock(inode, create); + if (!bh) + return 0; + if (IS_ERR(bh)) + return PTR_ERR(bh); + brelse(bh); + return 0; +} + +int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct metapath mp; + struct buffer_head *bh; + int boundary; + int create = *new; + + BUG_ON(!extlen); + BUG_ON(!dblock); + BUG_ON(!new); + + bmap_lock(inode, create); + bh = gfs2_block_pointers(inode, lblock, new, dblock, &boundary, &mp); + *extlen = 1; + + if (bh && !IS_ERR(bh) && *dblock && !*new) { + u64 tmp_dblock; + int tmp_new; + unsigned int nptrs; + unsigned end_of_metadata = ip->i_di.di_height - 1; + + nptrs = (end_of_metadata) ? sdp->sd_inptrs : sdp->sd_diptrs; + while (++mp.mp_list[end_of_metadata] < nptrs) { + lookup_block(ip, bh, end_of_metadata, &mp, 0, &tmp_new, &tmp_dblock); + if (*dblock + *extlen != tmp_dblock) + break; + (*extlen)++; + } + } + bmap_unlock(inode, create); + if (!bh) + return 0; + if (IS_ERR(bh)) + return PTR_ERR(bh); + brelse(bh); + return 0; +} + +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @bc: the call to make for each piece of metadata + * @data: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + uint64_t block, int first, block_call_t bc, + void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + uint64_t *top, *bottom; + uint64_t bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) + + mp->mp_list[0]; + bottom = (uint64_t *)(bh->b_data + sizeof(struct gfs2_dinode)) + + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (uint64_t *)(bh->b_data + mh_size) + + ((first) ? mp->mp_list[height] : 0); + + bottom = (uint64_t *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = bc(ip, dibh, bh, top, bottom, height, data); + if (error) + goto out; + + if (height < ip->i_di.di_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, bc, data); + if (error) + break; + } + + out: + brelse(bh); + + return error; +} + +/** + * do_strip - Look for a layer a particular layer of the file and strip it off + * @ip: the inode + * @dibh: the dinode buffer + * @bh: A buffer of pointers + * @top: The first pointer in the buffer + * @bottom: One more than the last pointer + * @height: the height this buffer is at + * @data: a pointer to a struct strip_mine + * + * Returns: errno + */ + +static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, uint64_t *top, uint64_t *bottom, + unsigned int height, void *data) +{ + struct strip_mine *sm = data; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + uint64_t bn, bstart; + uint32_t blen; + uint64_t *p; + unsigned int rg_blocks = 0; + int metadata; + unsigned int revokes = 0; + int x; + int error; + + if (!*top) + sm->sm_first = 0; + + if (height != sm->sm_height) + return 0; + + if (sm->sm_first) { + top++; + sm->sm_first = 0; + } + + metadata = (height != ip->i_di.di_height - 1); + if (metadata) + revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; + + error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh); + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + + bstart = bn; + blen = 1; + } + } + + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; /* Nothing to do */ + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + RES_QUOTA, + revokes); + if (error) + goto out_rg_gunlock; + + down_write(&ip->i_rw_mutex); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) { + if (metadata) + gfs2_free_meta(ip, bstart, blen); + else + gfs2_free_data(ip, bstart, blen); + } + + bstart = bn; + blen = 1; + } + + *p = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) { + if (metadata) + gfs2_free_meta(ip, bstart, blen); + else + gfs2_free_data(ip, bstart, blen); + } + + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + gfs2_dinode_out(&ip->i_di, dibh->b_data); + + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + + out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); + + out_rlist: + gfs2_rlist_free(&rlist); + + out: + gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh); + + return error; +} + +/** + * do_grow - Make a file look bigger than it is + * @ip: the inode + * @size: the size to set the file to + * + * Called with an exclusive lock on @ip. + * + * Returns: errno + */ + +static int do_grow(struct gfs2_inode *ip, uint64_t size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct buffer_head *dibh; + unsigned int h; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_height + RES_DATA; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, + sdp->sd_max_height + al->al_rgd->rd_ri.ri_length + + RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto out_end_trans; + } + + h = calc_tree_height(ip, size); + if (ip->i_di.di_height < h) { + down_write(&ip->i_rw_mutex); + error = build_height(&ip->i_inode, h); + up_write(&ip->i_rw_mutex); + if (error) + goto out_end_trans; + } + } + + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + out_end_trans: + gfs2_trans_end(sdp); + + out_ipres: + gfs2_inplace_release(ip); + + out_gunlock_q: + gfs2_quota_unlock(ip); + + out: + gfs2_alloc_put(ip); + + return error; +} + + +/** + * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * + * This is partly borrowed from ext3. + */ +static int gfs2_block_truncate_page(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t from = inode->i_size; + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr; + int err; + + page = grab_cache_page(mapping, index); + if (!page) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + + if (!buffer_mapped(bh)) { + gfs2_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int trunc_start(struct gfs2_inode *ip, uint64_t size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int journaled = gfs2_is_jdata(ip); + int error; + + error = gfs2_trans_begin(sdp, + RES_DINODE + ((journaled) ? RES_JDATA : 0), 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (gfs2_is_stuffed(ip)) { + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); + error = 1; + + } else { + if (size & (uint64_t)(sdp->sd_sb.sb_bsize - 1)) + error = gfs2_block_truncate_page(ip->i_inode.i_mapping); + + if (!error) { + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + } + } + + brelse(dibh); + + out: + gfs2_trans_end(sdp); + + return error; +} + +static int trunc_dealloc(struct gfs2_inode *ip, uint64_t size) +{ + unsigned int height = ip->i_di.di_height; + uint64_t lblock; + struct metapath mp; + int error; + + if (!size) + lblock = 0; + else + lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; + + find_metapath(ip, lblock, &mp); + gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + while (height--) { + struct strip_mine sm; + sm.sm_first = !!size; + sm.sm_height = height; + + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + if (error) + break; + } + + gfs2_quota_unhold(ip); + + out: + gfs2_alloc_put(ip); + return error; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!ip->i_di.di_size) { + ip->i_di.di_height = 0; + ip->i_di.di_goal_meta = + ip->i_di.di_goal_data = + ip->i_num.no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + } + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + out: + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + + return error; +} + +/** + * do_shrink - make a file smaller + * @ip: the inode + * @size: the size to make the file + * @truncator: function to truncate the last partial block + * + * Called with an exclusive lock on @ip. + * + * Returns: errno + */ + +static int do_shrink(struct gfs2_inode *ip, uint64_t size) +{ + int error; + + error = trunc_start(ip, size); + if (error < 0) + return error; + if (error > 0) + return 0; + + error = trunc_dealloc(ip, size); + if (!error) + error = trunc_end(ip); + + return error; +} + +/** + * gfs2_truncatei - make a file a given size + * @ip: the inode + * @size: the size to make the file + * @truncator: function to truncate the last partial block + * + * The file size can grow, shrink, or stay the same size. + * + * Returns: errno + */ + +int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size) +{ + int error; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode))) + return -EINVAL; + + if (size > ip->i_di.di_size) + error = do_grow(ip, size); + else + error = do_shrink(ip, size); + + return error; +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = trunc_dealloc(ip, ip->i_di.di_size); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return trunc_dealloc(ip, 0); +} + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + if (gfs2_is_dir(ip)) { + *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; + *ind_blocks = 3 * (sdp->sd_max_jheight - 1); + } else { + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + } + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * @alloc_required: set to 1 if an alloc is required, 0 otherwise + * + * Returns: errno + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset, + unsigned int len, int *alloc_required) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + uint64_t lblock, lblock_stop, dblock; + uint32_t extlen; + int new = 0; + int error = 0; + + *alloc_required = 0; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > + sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + *alloc_required = 1; + return 0; + } + + if (gfs2_is_dir(ip)) { + unsigned int bsize = sdp->sd_jbsize; + lblock = offset; + do_div(lblock, bsize); + lblock_stop = offset + len + bsize - 1; + do_div(lblock_stop, bsize); + } else { + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + } + + for (; lblock < lblock_stop; lblock += extlen) { + error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); + if (error) + return error; + + if (!dblock) { + *alloc_required = 1; + return 0; + } + } + + return 0; +} + --- /dev/null +++ b/fs/gfs2/bmap.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +int gfs2_block_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, int *boundary); +int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); + +int gfs2_truncatei(struct gfs2_inode *ip, uint64_t size); +int gfs2_truncatei_resume(struct gfs2_inode *ip); +int gfs2_file_dealloc(struct gfs2_inode *ip); + +void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks); +int gfs2_write_alloc_required(struct gfs2_inode *ip, uint64_t offset, + unsigned int len, int *alloc_required); + +#endif /* __BMAP_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/format.h @@ -0,0 +1,21 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __FORMAT_DOT_H__ +#define __FORMAT_DOT_H__ + +static const uint32_t gfs2_old_fs_formats[] = { + 0 +}; + +static const uint32_t gfs2_old_multihost_formats[] = { + 0 +}; + +#endif /* __FORMAT_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/glops.c @@ -0,0 +1,562 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" + + +/** + * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock + * @gl: the glock + * + */ + +static void gfs2_pte_inval(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + struct inode *inode; + + ip = gl->gl_object; + inode = &ip->i_inode; + if (!ip || !S_ISREG(ip->i_di.di_mode)) + return; + + if (!test_bit(GIF_PAGED, &ip->i_flags)) + return; + + unmap_shared_mapping_range(inode->i_mapping, 0, 0); + + if (test_bit(GIF_SW_PAGED, &ip->i_flags)) + set_bit(GLF_DIRTY, &gl->gl_flags); + + clear_bit(GIF_SW_PAGED, &ip->i_flags); +} + +/** + * gfs2_page_inval - Invalidate all pages associated with a glock + * @gl: the glock + * + */ + +static void gfs2_page_inval(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + struct inode *inode; + + ip = gl->gl_object; + inode = &ip->i_inode; + if (!ip || !S_ISREG(ip->i_di.di_mode)) + return; + + truncate_inode_pages(inode->i_mapping, 0); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages); + clear_bit(GIF_PAGED, &ip->i_flags); +} + +/** + * gfs2_page_sync - Sync the data pages (not metadata) associated with a glock + * @gl: the glock + * @flags: DIO_START | DIO_WAIT + * + * Syncs data (not metadata) for a regular file. + * No-op for all other types. + */ + +static void gfs2_page_sync(struct gfs2_glock *gl, int flags) +{ + struct gfs2_inode *ip; + struct inode *inode; + struct address_space *mapping; + int error = 0; + + ip = gl->gl_object; + inode = &ip->i_inode; + if (!ip || !S_ISREG(ip->i_di.di_mode)) + return; + + mapping = inode->i_mapping; + + if (flags & DIO_START) + filemap_fdatawrite(mapping); + if (!error && (flags & DIO_WAIT)) + error = filemap_fdatawait(mapping); + + /* Put back any errors cleared by filemap_fdatawait() + so they can be caught by someone who can pass them + up to user space. */ + + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else if (error) + set_bit(AS_EIO, &mapping->flags); + +} + +/** + * meta_go_sync - sync out the metadata for this glock + * @gl: the glock + * @flags: DIO_* + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * not return to caller to demote/unlock the glock until I/O is complete. + */ + +static void meta_go_sync(struct gfs2_glock *gl, int flags) +{ + if (!(flags & DIO_METADATA)) + return; + + if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT); + if (flags & DIO_RELEASE) + gfs2_ail_empty_gl(gl); + } + +} + +/** + * meta_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + */ + +static void meta_go_inval(struct gfs2_glock *gl, int flags) +{ + if (!(flags & DIO_METADATA)) + return; + + gfs2_meta_inval(gl); + gl->gl_vn++; +} + +/** + * inode_go_xmote_th - promote/demote a glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + if (gl->gl_state != LM_ST_UNLOCKED) + gfs2_pte_inval(gl); + gfs2_glock_xmote_th(gl, state, flags); +} + +/** + * inode_go_xmote_bh - After promoting/demoting a glock + * @gl: the glock + * + */ + +static void inode_go_xmote_bh(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh = gl->gl_req_gh; + struct buffer_head *bh; + int error; + + if (gl->gl_state != LM_ST_UNLOCKED && + (!gh || !(gh->gh_flags & GL_SKIP))) { + error = gfs2_meta_read(gl, gl->gl_name.ln_number, DIO_START, + &bh); + if (!error) + brelse(bh); + } +} + +/** + * inode_go_drop_th - unlock a glock + * @gl: the glock + * + * Invoked from rq_demote(). + * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long) + * is being purged from our node's glock cache; we're dropping lock. + */ + +static void inode_go_drop_th(struct gfs2_glock *gl) +{ + gfs2_pte_inval(gl); + gfs2_glock_drop_th(gl); +} + +/** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * @flags: + * + */ + +static void inode_go_sync(struct gfs2_glock *gl, int flags) +{ + int meta = (flags & DIO_METADATA); + int data = (flags & DIO_DATA); + + if (test_bit(GLF_DIRTY, &gl->gl_flags)) { + if (meta && data) { + gfs2_page_sync(gl, flags | DIO_START); + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT); + gfs2_page_sync(gl, flags | DIO_WAIT); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } else if (meta) { + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl, flags | DIO_START | DIO_WAIT); + } else if (data) + gfs2_page_sync(gl, flags | DIO_START | DIO_WAIT); + if (flags & DIO_RELEASE) + gfs2_ail_empty_gl(gl); + } + +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + int meta = (flags & DIO_METADATA); + int data = (flags & DIO_DATA); + + if (meta) { + gfs2_meta_inval(gl); + gl->gl_vn++; + } + if (data) + gfs2_page_inval(gl); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + int demote = 0; + + if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) + demote = 1; + else if (!sdp->sd_args.ar_localcaching && + time_after_eq(jiffies, gl->gl_stamp + + gfs2_tune_get(sdp, gt_demote_secs) * HZ)) + demote = 1; + + return demote; +} + +/** + * inode_go_lock - operation done after an inode lock is locked by a process + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int inode_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip) + return 0; + + if (ip->i_vn != gl->gl_vn) { + error = gfs2_inode_refresh(ip); + if (error) + return error; + gfs2_inode_attr_in(ip); + } + + if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_flags & GL_LOCAL_EXCL)) + error = gfs2_truncatei_resume(ip); + + return error; +} + +/** + * inode_go_unlock - operation done before an inode lock is unlocked by a + * process + * @gl: the glock + * @flags: + * + */ + +static void inode_go_unlock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_inode *ip = gl->gl_object; + + if (ip) { + if (test_bit(GLF_DIRTY, &gl->gl_flags)) + gfs2_inode_attr_in(ip); + + gfs2_meta_cache_flush(ip); + } +} + +/** + * inode_greedy - + * @gl: the glock + * + */ + +static void inode_greedy(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum); + unsigned int max = gfs2_tune_get(sdp, gt_greedy_max); + unsigned int new_time; + + spin_lock(&ip->i_spin); + + if (time_after(ip->i_last_pfault + quantum, jiffies)) { + new_time = ip->i_greedy + quantum; + if (new_time > max) + new_time = max; + } else { + new_time = ip->i_greedy - quantum; + if (!new_time || new_time > max) + new_time = 1; + } + + ip->i_greedy = new_time; + + spin_unlock(&ip->i_spin); + + iput(&ip->i_inode); +} + +/** + * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int rgrp_go_demote_ok(struct gfs2_glock *gl) +{ + return !gl->gl_aspace->i_mapping->nrpages; +} + +/** + * rgrp_go_lock - operation done after an rgrp lock is locked by + * a first holder on this node. + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int rgrp_go_lock(struct gfs2_holder *gh) +{ + return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); +} + +/** + * rgrp_go_unlock - operation done before an rgrp lock is unlocked by + * a last holder on this node. + * @gl: the glock + * @flags: + * + */ + +static void rgrp_go_unlock(struct gfs2_holder *gh) +{ + gfs2_rgrp_bh_put(gh->gh_gl->gl_object); +} + +/** + * trans_go_xmote_th - promote/demote the transaction glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } + + gfs2_glock_xmote_th(gl, state, flags); +} + +/** + * trans_go_xmote_bh - After promoting/demoting the transaction glock + * @gl: the glock + * + */ + +static void trans_go_xmote_bh(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header head; + int error; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode)); + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + gfs2_consist(sdp); + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) + gfs2_consist(sdp); + + /* Initialize some head of the log stuff */ + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + } +} + +/** + * trans_go_drop_th - unlock the transaction glock + * @gl: the glock + * + * We want to sync the device even with localcaching. Remember + * that localcaching journal replay only marks buffers dirty. + */ + +static void trans_go_drop_th(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } + + gfs2_glock_drop_th(gl); +} + +/** + * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int quota_go_demote_ok(struct gfs2_glock *gl) +{ + return !atomic_read(&gl->gl_lvb_count); +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_META +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_xmote_th = inode_go_xmote_th, + .go_xmote_bh = inode_go_xmote_bh, + .go_drop_th = inode_go_drop_th, + .go_sync = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_lock = inode_go_lock, + .go_unlock = inode_go_unlock, + .go_greedy = inode_greedy, + .go_type = LM_TYPE_INODE +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_sync = meta_go_sync, + .go_inval = meta_go_inval, + .go_demote_ok = rgrp_go_demote_ok, + .go_lock = rgrp_go_lock, + .go_unlock = rgrp_go_unlock, + .go_type = LM_TYPE_RGRP +}; + +const struct gfs2_glock_operations gfs2_trans_glops = { + .go_xmote_th = trans_go_xmote_th, + .go_xmote_bh = trans_go_xmote_bh, + .go_drop_th = trans_go_drop_th, + .go_type = LM_TYPE_NONDISK +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_callback = gfs2_iopen_go_callback, + .go_type = LM_TYPE_IOPEN +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_FLOCK +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_NONDISK +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_demote_ok = quota_go_demote_ok, + .go_type = LM_TYPE_QUOTA +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_JOURNAL +}; + --- /dev/null +++ b/fs/gfs2/glops.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; + +#endif /* __GLOPS_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/inode.c @@ -0,0 +1,1344 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "eattr.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "ops_address.h" +#include "ops_file.h" +#include "ops_inode.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode + * @ip: The GFS2 inode (with embedded disk inode data) + * @inode: The Linux VFS inode + * + */ + +void gfs2_inode_attr_in(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + + inode->i_ino = ip->i_num.no_addr; + + switch (ip->i_di.di_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = MKDEV(ip->i_di.di_major, ip->i_di.di_minor); + break; + default: + inode->i_rdev = 0; + break; + }; + + inode->i_mode = ip->i_di.di_mode; + inode->i_nlink = ip->i_di.di_nlink; + inode->i_uid = ip->i_di.di_uid; + inode->i_gid = ip->i_di.di_gid; + i_size_write(inode, ip->i_di.di_size); + inode->i_atime.tv_sec = ip->i_di.di_atime; + inode->i_mtime.tv_sec = ip->i_di.di_mtime; + inode->i_ctime.tv_sec = ip->i_di.di_ctime; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = ip->i_di.di_blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + + if (ip->i_di.di_flags & GFS2_DIF_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; +} + +/** + * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode + * @ip: The GFS2 inode + * + * Only copy out the attributes that we want the VFS layer + * to be able to modify. + */ + +void gfs2_inode_attr_out(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + + gfs2_assert_withdraw(GFS2_SB(inode), + (ip->i_di.di_mode & S_IFMT) == (inode->i_mode & S_IFMT)); + ip->i_di.di_mode = inode->i_mode; + ip->i_di.di_uid = inode->i_uid; + ip->i_di.di_gid = inode->i_gid; + ip->i_di.di_atime = inode->i_atime.tv_sec; + ip->i_di.di_mtime = inode->i_mtime.tv_sec; + ip->i_di.di_ctime = inode->i_ctime.tv_sec; +} + +static int iget_test(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inum *inum = opaque; + + if (ip && ip->i_num.no_addr == inum->no_addr) + return 1; + + return 0; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inum *inum = opaque; + + ip->i_num = *inum; + return 0; +} + +struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum) +{ + return ilookup5(sb, (unsigned long)inum->no_formal_ino, + iget_test, inum); +} + +static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum) +{ + return iget5_locked(sb, (unsigned long)inum->no_formal_ino, + iget_test, iget_set, inum); +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @inum: The inode number + * @type: The type of the inode + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type) +{ + struct inode *inode = gfs2_iget(sb, inum); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *io_gl; + int error; + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = DT2IF(type); + inode->u.generic_ip = ip; + inode->i_mode = mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + inode->i_fop = &gfs2_file_fops; + inode->i_mapping->a_ops = &gfs2_file_aops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_dev_iops; + } + + error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + ip->i_vn = ip->i_gl->gl_vn - 1; + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + gfs2_glock_put(io_gl); + unlock_new_inode(inode); + } + + return inode; +fail_iopen: + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + iput(inode); + return ERR_PTR(error); +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { + brelse(dibh); + return -EIO; + } + + gfs2_dinode_in(&ip->i_di, dibh->b_data); + + brelse(dibh); + + if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino) + return -ESTALE; + + ip->i_vn = ip->i_gl->gl_vn; + + return 0; +} + +int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct gfs2_rgrpd *rgd; + int error; + + if (ip->i_di.di_blocks != 1) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + goto out_qs; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_rindex_relse; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + goto out_rindex_relse; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1); + if (error) + goto out_rg_gunlock; + + gfs2_trans_add_gl(ip->i_gl); + + gfs2_free_di(rgd, ip); + + gfs2_trans_end(sdp); + clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); +out_rindex_relse: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_qs: + gfs2_quota_unhold(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_change_nlink - Change nlink count on inode + * @ip: The GFS2 inode + * @diff: The change in the nlink count required + * + * Returns: errno + */ + +int gfs2_change_nlink(struct gfs2_inode *ip, int diff) +{ + struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info; + struct buffer_head *dibh; + uint32_t nlink; + int error; + + BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink); + nlink = ip->i_di.di_nlink + diff; + + /* If we are reducing the nlink count, but the new value ends up being + bigger than the old one, we must have underflowed. */ + if (diff < 0 && nlink > ip->i_di.di_nlink) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + ip->i_di.di_nlink = nlink; + ip->i_di.di_ctime = get_seconds(); + ip->i_inode.i_nlink = nlink; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + mark_inode_dirty(&ip->i_inode); + + if (ip->i_di.di_nlink == 0) { + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rg_gh; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + error = -EIO; + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + if (!rgd) + goto out_norgrp; + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + goto out_norgrp; + + gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ + gfs2_glock_dq_uninit(&rg_gh); +out_norgrp: + gfs2_glock_dq_uninit(&ri_gh); + } +out: + return error; +} + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + gfs2_str2qstr(&qstr, name); + return gfs2_lookupi(dip, &qstr, 1, NULL); +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @d_gh: An initialized holder for the directory glock + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * @i_gh: An uninitialized holder for the new inode glock + * + * There will always be a vnode (Linux VFS inode) for the d_gh inode unless + * @is_root is true. + * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root, struct nameidata *nd) + +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + struct gfs2_inum inum; + unsigned int type; + int error = 0; + struct inode *inode = NULL; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == sb->s_root->d_inode)) { + igrab(dir); + return dir; + } + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + + if (!is_root) { + error = permission(dir, MAY_EXEC, NULL); + if (error) + goto out; + } + + error = gfs2_dir_search(dir, name, &inum, &type); + if (error) + goto out; + + inode = gfs2_inode_lookup(sb, &inum, type); + +out: + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode; +} + +static int pick_formal_ino_1(struct gfs2_sbd *sdp, uint64_t *formal_ino) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); + struct buffer_head *bh; + struct gfs2_inum_range ir; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + mutex_lock(&sdp->sd_inum_mutex); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) { + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + return error; + } + + gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + + if (ir.ir_length) { + *formal_ino = ir.ir_start++; + ir.ir_length--; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_inum_range_out(&ir, + bh->b_data + sizeof(struct gfs2_dinode)); + brelse(bh); + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + return 0; + } + + brelse(bh); + + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + + return 1; +} + +static int pick_formal_ino_2(struct gfs2_sbd *sdp, uint64_t *formal_ino) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct gfs2_inum_range ir; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out; + mutex_lock(&sdp->sd_inum_mutex); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_end_trans; + + gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + + if (!ir.ir_length) { + struct buffer_head *m_bh; + uint64_t x, y; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out_brelse; + + x = *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)); + x = y = be64_to_cpu(x); + ir.ir_start = x; + ir.ir_length = GFS2_INUM_QUANTUM; + x += GFS2_INUM_QUANTUM; + if (x < y) + gfs2_consist_inode(m_ip); + x = cpu_to_be64(x); + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + *(uint64_t *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x; + + brelse(m_bh); + } + + *formal_ino = ir.ir_start++; + ir.ir_length--; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + +out_brelse: + brelse(bh); +out_end_trans: + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +static int pick_formal_ino(struct gfs2_sbd *sdp, uint64_t *inum) +{ + int error; + + error = pick_formal_ino_1(sdp, inum); + if (error <= 0) + return error; + + error = pick_formal_ino_2(sdp, inum); + + return error; +} + +/** + * create_ok - OK to create a new on-disk inode here? + * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + unsigned int mode) +{ + int error; + + error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_di.di_nlink) + return -EPERM; + + error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + return -EEXIST; + default: + return error; + } + + if (dip->i_di.di_entries == (uint32_t)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_di.di_nlink == (uint32_t)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, + unsigned int *uid, unsigned int *gid) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) { + if (S_ISDIR(*mode)) + *mode |= S_ISUID; + else if (dip->i_di.di_uid != current->fsuid) + *mode &= ~07111; + *uid = dip->i_di.di_uid; + } else + *uid = current->fsuid; + + if (dip->i_di.di_mode & S_ISGID) { + if (S_ISDIR(*mode)) + *mode |= S_ISGID; + *gid = dip->i_di.di_gid; + } else + *gid = current->fsgid; +} + +static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum, + u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + int error; + + gfs2_alloc_get(dip); + + dip->i_alloc.al_requested = RES_DINODE; + error = gfs2_inplace_reserve(dip); + if (error) + goto out; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + if (error) + goto out_ipreserv; + + inum->no_addr = gfs2_alloc_di(dip, generation); + + gfs2_trans_end(sdp); + +out_ipreserv: + gfs2_inplace_release(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: the directory this inode is being created in + * @gl: The glock covering the new inode + * @inum: the inode number + * @mode: the file permissions + * @uid: + * @gid: + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + const struct gfs2_inum *inum, unsigned int mode, + unsigned int uid, unsigned int gid, + const u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_dinode *di; + struct buffer_head *dibh; + + dibh = gfs2_meta_new(gl, inum->no_addr); + gfs2_trans_add_bh(gl, dibh, 1); + gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + di = (struct gfs2_dinode *)dibh->b_data; + + di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); + di->di_num.no_addr = cpu_to_be64(inum->no_addr); + di->di_mode = cpu_to_be32(mode); + di->di_uid = cpu_to_be32(uid); + di->di_gid = cpu_to_be32(gid); + di->di_nlink = cpu_to_be32(0); + di->di_size = cpu_to_be64(0); + di->di_blocks = cpu_to_be64(1); + di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds()); + di->di_major = di->di_minor = cpu_to_be32(0); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); + di->di_generation = cpu_to_be64(*generation); + di->di_flags = cpu_to_be32(0); + + if (S_ISREG(mode)) { + if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || + gfs2_tune_get(sdp, gt_new_files_directio)) + di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); + } else if (S_ISDIR(mode)) { + di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + GFS2_DIF_INHERIT_DIRECTIO); + di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + GFS2_DIF_INHERIT_JDATA); + } + + di->__pad1 = 0; + di->di_payload_format = cpu_to_be32(0); + di->di_height = cpu_to_be32(0); + di->__pad2 = 0; + di->__pad3 = 0; + di->di_depth = cpu_to_be16(0); + di->di_entries = cpu_to_be32(0); + memset(&di->__pad4, 0, sizeof(di->__pad4)); + di->di_eattr = cpu_to_be64(0); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + + brelse(dibh); +} + +static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + unsigned int mode, const struct gfs2_inum *inum, + const u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + unsigned int uid, gid; + int error; + + munge_mode_uid_gid(dip, &mode, &uid, &gid); + gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, uid, gid); + if (error) + goto out; + + error = gfs2_quota_check(dip, uid, gid); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); + if (error) + goto out_quota; + + init_dinode(dip, gl, inum, mode, uid, gid, generation); + gfs2_quota_change(dip, +1, uid, gid); + gfs2_trans_end(sdp); + +out_quota: + gfs2_quota_unlock(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al; + int alloc_required; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto fail; + + error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); + if (alloc_required < 0) + goto fail; + if (alloc_required) { + error = gfs2_quota_check(dip, dip->i_di.di_uid, + dip->i_di.di_gid); + if (error) + goto fail_quota_locks; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 2 * RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode)); + if (error) + goto fail_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + ip->i_di.di_nlink = 1; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); + +fail_ipreserv: + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + +fail_quota_locks: + gfs2_quota_unlock(dip); + +fail: + gfs2_alloc_put(dip); + return error; +} + +/** + * gfs2_createi - Create a new inode + * @ghs: An array of two holders + * @name: The name of the new file + * @mode: the permissions on the new inode + * + * @ghs[0] is an initialized holder for the directory + * @ghs[1] is the holder for the inode lock + * + * If the return value is not NULL, the glocks on both the directory and the new + * file are held. A transaction has been started and an inplace reservation + * is held, as well. + * + * Returns: An inode + */ + +struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, + unsigned int mode) +{ + struct inode *inode; + struct gfs2_inode *dip = ghs->gh_gl->gl_object; + struct inode *dir = &dip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_inum inum; + int error; + u64 generation; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq(ghs); + if (error) + goto fail; + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + error = pick_formal_ino(sdp, &inum.no_formal_ino); + if (error) + goto fail_gunlock; + + error = alloc_dinode(dip, &inum, &generation); + if (error) + goto fail_gunlock; + + if (inum.no_addr < dip->i_num.no_addr) { + gfs2_glock_dq(ghs); + + error = gfs2_glock_nq_num(sdp, inum.no_addr, + &gfs2_inode_glops, LM_ST_EXCLUSIVE, + GL_SKIP, ghs + 1); + if (error) { + return ERR_PTR(error); + } + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq(ghs); + if (error) { + gfs2_glock_dq_uninit(ghs + 1); + return ERR_PTR(error); + } + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock2; + } else { + error = gfs2_glock_nq_num(sdp, inum.no_addr, + &gfs2_inode_glops, LM_ST_EXCLUSIVE, + GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock; + } + + error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation); + if (error) + goto fail_gunlock2; + + inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode)); + if (IS_ERR(inode)) + goto fail_gunlock2; + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; + + error = gfs2_acl_create(dip, GFS2_I(inode)); + if (error) + goto fail_iput; + + error = link_dinode(dip, name, GFS2_I(inode)); + if (error) + goto fail_iput; + + if (!inode) + return ERR_PTR(-ENOMEM); + return inode; + +fail_iput: + iput(inode); +fail_gunlock2: + gfs2_glock_dq_uninit(ghs + 1); +fail_gunlock: + gfs2_glock_dq(ghs); +fail: + return ERR_PTR(error); +} + +/** + * gfs2_rmdiri - Remove a directory + * @dip: The parent directory of the directory to be removed + * @name: The name of the directory to be removed + * @ip: The GFS2 inode of the directory to be removed + * + * Assumes Glocks on dip and ip are held + * + * Returns: errno + */ + +int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct qstr dotname; + int error; + + if (ip->i_di.di_entries != 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + error = gfs2_dir_del(dip, name); + if (error) + return error; + + error = gfs2_change_nlink(dip, -1); + if (error) + return error; + + gfs2_str2qstr(&dotname, "."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + gfs2_str2qstr(&dotname, ".."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + error = gfs2_change_nlink(ip, -2); + if (error) + return error; + + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_inum inum; + unsigned int type; + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_di.di_mode & S_ISVTX) && + dip->i_di.di_uid != current->fsuid && + ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + if (error) + return error; + + error = gfs2_dir_search(&dip->i_inode, name, &inum, &type); + if (error) + return error; + + if (!gfs2_inum_equal(&inum, &ip->i_num)) + return -ENOENT; + + if (IF2DT(ip->i_di.di_mode) != type) { + gfs2_consist_inode(dip); + return -EIO; + } + + return 0; +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + struct qstr dotdot; + int error = 0; + + gfs2_str2qstr(&dotdot, ".."); + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_readlinki - return the contents of a symlink + * @ip: the symlink's inode + * @buf: a pointer to the buffer to be filled + * @len: a pointer to the length of @buf + * + * If @buf is too small, a piece of memory is kmalloc()ed and needs + * to be freed by the caller. + * + * Returns: errno + */ + +int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +{ + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int x; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); + error = gfs2_glock_nq_atime(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + if (!ip->i_di.di_size) { + gfs2_consist_inode(ip); + error = -EIO; + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + x = ip->i_di.di_size + 1; + if (x > *len) { + *buf = kmalloc(x, GFP_KERNEL); + if (!*buf) { + error = -ENOMEM; + goto out_brelse; + } + } + + memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); + *len = x; + +out_brelse: + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + return error; +} + +/** + * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and + * conditionally update the inode's atime + * @gh: the holder to acquire + * + * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap + * Update if the difference between the current time and the inode's current + * atime is greater than an interval specified at mount. + * + * Returns: errno + */ + +int gfs2_glock_nq_atime(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + int64_t curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum); + unsigned int state; + int flags; + int error; + + if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || + gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || + gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops)) + return -EINVAL; + + state = gh->gh_state; + flags = gh->gh_flags; + + error = gfs2_glock_nq(gh); + if (error) + return error; + + if (test_bit(SDF_NOATIME, &sdp->sd_flags) || + (sdp->sd_vfs->s_flags & MS_RDONLY)) + return 0; + + curtime = get_seconds(); + if (curtime - ip->i_di.di_atime >= quantum) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, + gh); + error = gfs2_glock_nq(gh); + if (error) + return error; + + /* Verify that atime hasn't been updated while we were + trying to get exclusive lock. */ + + curtime = get_seconds(); + if (curtime - ip->i_di.di_atime >= quantum) { + struct buffer_head *dibh; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error == -EROFS) + return 0; + if (error) + goto fail; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + + ip->i_di.di_atime = curtime; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + gfs2_trans_end(sdp); + } + + /* If someone else has asked for the glock, + unlock and let them have it. Then reacquire + in the original state. */ + if (gfs2_glock_is_blocking(gl)) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(state, flags, gh); + return gfs2_glock_nq(gh); + } + } + + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); +fail: + gfs2_glock_dq(gh); + return error; +} + +/** + * glock_compare_atime - Compare two struct gfs2_glock structures for sort + * @arg_a: the first structure + * @arg_b: the second structure + * + * Returns: 1 if A > B + * -1 if A < B + * 0 if A = B + */ + +static int glock_compare_atime(const void *arg_a, const void *arg_b) +{ + struct gfs2_holder *gh_a = *(struct gfs2_holder **)arg_a; + struct gfs2_holder *gh_b = *(struct gfs2_holder **)arg_b; + struct lm_lockname *a = &gh_a->gh_gl->gl_name; + struct lm_lockname *b = &gh_b->gh_gl->gl_name; + int ret = 0; + + if (a->ln_number > b->ln_number) + ret = 1; + else if (a->ln_number < b->ln_number) + ret = -1; + else { + if (gh_a->gh_state == LM_ST_SHARED && + gh_b->gh_state == LM_ST_EXCLUSIVE) + ret = 1; + else if (gh_a->gh_state == LM_ST_SHARED && + (gh_b->gh_flags & GL_ATIME)) + ret = 1; + } + + return ret; +} + +/** + * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an + * atime update + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder **p; + unsigned int x; + int error = 0; + + if (!num_gh) + return 0; + + if (num_gh == 1) { + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + if (ghs->gh_flags & GL_ATIME) + error = gfs2_glock_nq_atime(ghs); + else + error = gfs2_glock_nq(ghs); + return error; + } + + p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL); + if (!p) + return -ENOMEM; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL); + + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + if (p[x]->gh_flags & GL_ATIME) + error = gfs2_glock_nq_atime(p[x]); + else + error = gfs2_glock_nq(p[x]); + + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + kfree(p); + return error; +} + + +static int +__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + error = inode_setattr(&ip->i_inode, attr); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + gfs2_inode_attr_out(ip); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + return error; +} + +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Called with a reference on the vnode. + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(ip, attr); + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(ip, attr); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + --- /dev/null +++ b/fs/gfs2/inode.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +static inline int gfs2_is_stuffed(struct gfs2_inode *ip) +{ + return !ip->i_di.di_height; +} + +static inline int gfs2_is_jdata(struct gfs2_inode *ip) +{ + return ip->i_di.di_flags & GFS2_DIF_JDATA; +} + +static inline int gfs2_is_dir(struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_di.di_mode); +} + +void gfs2_inode_attr_in(struct gfs2_inode *ip); +void gfs2_inode_attr_out(struct gfs2_inode *ip); +struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type); +struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum); + +int gfs2_inode_refresh(struct gfs2_inode *ip); + +int gfs2_dinode_dealloc(struct gfs2_inode *inode); +int gfs2_change_nlink(struct gfs2_inode *ip, int diff); +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root, struct nameidata *nd); +struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, + unsigned int mode); +int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); +int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); + +int gfs2_glock_nq_atime(struct gfs2_holder *gh); +int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs); + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); + +#endif /* __INODE_DOT_H__ */ + [PATCH 04/16] GFS2: Daemons and address space operations The GFS2 kernel threads, logging operations and page cache interface code. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/daemon.c | 196 +++++++++++ fs/gfs2/daemon.h | 19 + fs/gfs2/lops.c | 811 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/lops.h | 96 +++++ fs/gfs2/main.c | 148 +++++++++ fs/gfs2/meta_io.c | 779 +++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/meta_io.h | 74 ++++ fs/gfs2/ondisk.c | 308 ++++++++++++++++++ fs/gfs2/ops_address.c | 819 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_address.h | 18 + 10 files changed, 3268 insertions(+) --- /dev/null +++ b/fs/gfs2/daemon.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "daemon.h" +#include "glock.h" +#include "log.h" +#include "quota.h" +#include "recovery.h" +#include "super.h" +#include "util.h" + +/* This uses schedule_timeout() instead of msleep() because it's good for + the daemons to wake up more often than the timeout when unmounting so + the user's unmount doesn't sit there forever. + + The kthread functions used to start these daemons block and flush signals. */ + +/** + * gfs2_scand - Look for cached glocks and inodes to toss from memory + * @sdp: Pointer to GFS2 superblock + * + * One of these daemons runs, finding candidates to add to sd_reclaim_list. + * See gfs2_glockd() + */ + +int gfs2_scand(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_scand_internal(sdp); + t = gfs2_tune_get(sdp, gt_scand_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_glockd - Reclaim unused glock structures + * @sdp: Pointer to GFS2 superblock + * + * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. + * Number of daemons can be set by user, with num_glockd mount option. + */ + +int gfs2_glockd(void *data) +{ + struct gfs2_sbd *sdp = data; + + while (!kthread_should_stop()) { + while (atomic_read(&sdp->sd_reclaim_count)) + gfs2_reclaim_glock(sdp); + + wait_event_interruptible(sdp->sd_reclaim_wq, + (atomic_read(&sdp->sd_reclaim_count) || + kthread_should_stop())); + } + + return 0; +} + +/** + * gfs2_recoverd - Recover dead machine's journals + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_recoverd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_check_journals(sdp); + t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. + */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_holder ji_gh; + unsigned long t; + + while (!kthread_should_stop()) { + /* Advance the log tail */ + + t = sdp->sd_log_flush_time + + gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; + + gfs2_ail1_empty(sdp, DIO_ALL); + + if (time_after_eq(jiffies, t)) { + gfs2_log_flush(sdp, NULL); + sdp->sd_log_flush_time = jiffies; + } + + /* Check for latest journal index */ + + t = sdp->sd_jindex_refresh_time + + gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ; + + if (time_after_eq(jiffies, t)) { + if (!gfs2_jindex_hold(sdp, &ji_gh)) + gfs2_glock_dq_uninit(&ji_gh); + sdp->sd_jindex_refresh_time = jiffies; + } + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + int error; + + while (!kthread_should_stop()) { + /* Update the master statfs file */ + + t = sdp->sd_statfs_sync_time + + gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; + + if (time_after_eq(jiffies, t)) { + error = gfs2_statfs_sync(sdp); + if (error && + error != -EROFS && + !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "quotad: (1) error=%d\n", error); + sdp->sd_statfs_sync_time = jiffies; + } + + /* Update quota file */ + + t = sdp->sd_quota_sync_time + + gfs2_tune_get(sdp, gt_quota_quantum) * HZ; + + if (time_after_eq(jiffies, t)) { + error = gfs2_quota_sync(sdp); + if (error && + error != -EROFS && + !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "quotad: (2) error=%d\n", error); + sdp->sd_quota_sync_time = jiffies; + } + + gfs2_quota_scan(sdp); + + t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + --- /dev/null +++ b/fs/gfs2/daemon.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __DAEMON_DOT_H__ +#define __DAEMON_DOT_H__ + +int gfs2_scand(void *data); +int gfs2_glockd(void *data); +int gfs2_recoverd(void *data); +int gfs2_logd(void *data); +int gfs2_quotad(void *data); + +#endif /* __DAEMON_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/lops.c @@ -0,0 +1,811 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_glock *gl; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + if (!list_empty(&le->le_list)) + return; + + gl = container_of(le, struct gfs2_glock, gl_le); + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) + return; + gfs2_glock_hold(gl); + set_bit(GLF_DIRTY, &gl->gl_flags); + + gfs2_log_lock(sdp); + sdp->sd_log_num_gl++; + list_add(&le->le_list, &sdp->sd_log_le_gl); + gfs2_log_unlock(sdp); +} + +static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_gl; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list); + list_del_init(&gl->gl_le.le_list); + sdp->sd_log_num_gl--; + + gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)); + gfs2_glock_put(gl); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_gl); +} + +static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr; + + if (!list_empty(&bd->bd_list_tr)) + return; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + + if (!list_empty(&le->le_list)) + return; + + gfs2_trans_add_gl(bd->bd_gl); + + gfs2_meta_check(sdp, bd->bd_bh); + gfs2_pin(sdp, bd->bd_bh); + + gfs2_log_lock(sdp); + sdp->sd_log_num_buf++; + list_add(&le->le_list, &sdp->sd_log_le_buf); + gfs2_log_unlock(sdp); + + tr->tr_num_buf_new++; +} + +static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_list_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); + list_del_init(&bd->bd_list_tr); + tr->tr_num_buf--; + } + gfs2_assert_warn(sdp, !tr->tr_num_buf); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + unsigned int total = sdp->sd_log_num_buf; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + unsigned int limit; + unsigned int num; + unsigned n; + __be64 *ptr; + + offset += (sizeof(__be64) - 1); + offset &= ~(sizeof(__be64) - 1); + limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + /* for 4k blocks, limit = 503 */ + + bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + while(total) { + num = total; + if (total > limit) + num = limit; + bh = gfs2_log_get_buf(sdp); + sdp->sd_log_num_hdrs++; + ld = (struct gfs2_log_descriptor *)bh->b_data; + ptr = (__be64 *)(bh->b_data + offset); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + + n = 0; + list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, + bd_le.le_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (++n >= num) + break; + } + + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + + n = 0; + list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, + bd_le.le_list) { + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + if (++n >= num) + break; + } + + total -= num; + } +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_buf--; + + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_blocks = 0; + sdp->sd_replayed_blocks = 0; +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + uint64_t blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl, + DIO_START | DIO_WAIT); + return; + } + if (pass != 1) + return; + + gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_revoke++; + + gfs2_log_lock(sdp); + sdp->sd_log_num_revoke++; + list_add(&le->le_list, &sdp->sd_log_le_revoke); + gfs2_log_unlock(sdp); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_meta_header *mh; + struct buffer_head *bh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_revoke *rv; + + if (!sdp->sd_log_num_revoke) + return; + + bh = gfs2_log_get_buf(sdp); + ld = (struct gfs2_log_descriptor *)bh->b_data; + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE); + ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, + sizeof(uint64_t))); + ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + offset = sizeof(struct gfs2_log_descriptor); + + while (!list_empty(head)) { + rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list); + list_del_init(&rv->rv_le.le_list); + sdp->sd_log_num_revoke--; + + if (offset + sizeof(uint64_t) > sdp->sd_sb.sb_bsize) { + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + + bh = gfs2_log_get_buf(sdp); + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno); + kfree(rv); + + offset += sizeof(uint64_t); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_revokes = 0; + sdp->sd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + uint64_t blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB); + + while (offset + sizeof(uint64_t) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(sdp, blkno, start); + if (error < 0) + return error; + else if (error) + sdp->sd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(uint64_t); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(sdp); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, sdp->sd_found_revokes); + + gfs2_revoke_clean(sdp); +} + +static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + if (!list_empty(&le->le_list)) + return; + + rgd = container_of(le, struct gfs2_rgrpd, rd_le); + gfs2_rgrp_bh_hold(rgd); + + gfs2_log_lock(sdp); + sdp->sd_log_num_rg++; + list_add(&le->le_list, &sdp->sd_log_le_rg); + gfs2_log_unlock(sdp); +} + +static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_rg; + struct gfs2_rgrpd *rgd; + + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); + list_del_init(&rgd->rd_le.le_list); + sdp->sd_log_num_rg--; + + gfs2_rgrp_repolish_clones(rgd); + gfs2_rgrp_bh_put(rgd); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); +} + +/** + * databuf_lo_add - Add a databuf to the transaction. + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. + */ +static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr = current->journal_info; + struct address_space *mapping = bd->bd_bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + + tr->tr_touched = 1; + if (list_empty(&bd->bd_list_tr) && + (ip->i_di.di_flags & GFS2_DIF_JDATA)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_buf_new++; + } + gfs2_trans_add_gl(bd->bd_gl); + gfs2_log_lock(sdp); + if (list_empty(&le->le_list)) { + if (ip->i_di.di_flags & GFS2_DIF_JDATA) + sdp->sd_log_num_jdata++; + sdp->sd_log_num_databuf++; + list_add(&le->le_list, &sdp->sd_log_le_databuf); + } + gfs2_log_unlock(sdp); +} + +static int gfs2_check_magic(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + void *kaddr; + __be32 *ptr; + int rv = 0; + + kaddr = kmap_atomic(page, KM_USER0); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + rv = 1; + kunmap_atomic(page, KM_USER0); + + return rv; +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + * Here we scan through the lists of buffers and make the assumption + * that any buffer thats been pinned is being journaled, and that + * any unpinned buffer is an ordered write data buffer and therefore + * will be written back rather than journaled. + */ +static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +{ + LIST_HEAD(started); + struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt; + struct buffer_head *bh = NULL; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + struct gfs2_log_descriptor *ld; + unsigned int limit; + unsigned int total_dbuf = sdp->sd_log_num_databuf; + unsigned int total_jdata = sdp->sd_log_num_jdata; + unsigned int num, n; + __be64 *ptr = NULL; + + offset += (2*sizeof(__be64) - 1); + offset &= ~(2*sizeof(__be64) - 1); + limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + + /* + * Start writing ordered buffers, write journaled buffers + * into the log along with a header + */ + gfs2_log_lock(sdp); + bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, + bd_le.le_list); + while(total_dbuf) { + num = total_jdata; + if (num > limit) + num = limit; + n = 0; + list_for_each_entry_safe_continue(bd1, bdt, + &sdp->sd_log_le_databuf, + bd_le.le_list) { + /* An ordered write buffer */ + if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) { + list_move(&bd1->bd_le.le_list, &started); + if (bd1 == bd2) { + bd2 = NULL; + bd2 = list_prepare_entry(bd2, + &sdp->sd_log_le_databuf, + bd_le.le_list); + } + total_dbuf--; + if (bd1->bd_bh) { + get_bh(bd1->bd_bh); + if (buffer_dirty(bd1->bd_bh)) { + gfs2_log_unlock(sdp); + wait_on_buffer(bd1->bd_bh); + ll_rw_block(WRITE, 1, + &bd1->bd_bh); + gfs2_log_lock(sdp); + } + brelse(bd1->bd_bh); + continue; + } + continue; + } else if (bd1->bd_bh) { /* A journaled buffer */ + int magic; + gfs2_log_unlock(sdp); + if (!bh) { + bh = gfs2_log_get_buf(sdp); + sdp->sd_log_num_hdrs++; + ld = (struct gfs2_log_descriptor *) + bh->b_data; + ptr = (__be64 *)(bh->b_data + offset); + ld->ld_header.mh_magic = + cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = + cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = + cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = + cpu_to_be32(GFS2_LOG_DESC_JDATA); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + } + magic = gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + *ptr++ = cpu_to_be64((__u64)magic); + clear_buffer_escaped(bd1->bd_bh); + if (unlikely(magic != 0)) + set_buffer_escaped(bd1->bd_bh); + gfs2_log_lock(sdp); + if (n++ > num) + break; + } else if (!bd1->bd_bh) { + total_dbuf--; + sdp->sd_log_num_databuf--; + list_del_init(&bd1->bd_le.le_list); + if (bd1 == bd2) { + bd2 = NULL; + bd2 = list_prepare_entry(bd2, + &sdp->sd_log_le_databuf, + bd_le.le_list); + } + kmem_cache_free(gfs2_bufdata_cachep, bd1); + } + } + gfs2_log_unlock(sdp); + if (bh) { + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + bh = NULL; + } + n = 0; + gfs2_log_lock(sdp); + list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, + bd_le.le_list) { + if (!bd2->bd_bh) + continue; + /* copy buffer if it needs escaping */ + gfs2_log_unlock(sdp); + if (unlikely(buffer_escaped(bd2->bd_bh))) { + void *kaddr; + struct page *page = bd2->bd_bh->b_page; + bh = gfs2_log_get_buf(sdp); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(bh->b_data, + kaddr + bh_offset(bd2->bd_bh), + sdp->sd_sb.sb_bsize); + kunmap_atomic(page, KM_USER0); + *(__be32 *)bh->b_data = 0; + } else { + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + } + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + bh = NULL; + total_dbuf -= num; + total_jdata -= num; + } + gfs2_log_unlock(sdp); + + /* Wait on all ordered buffers */ + while (!list_empty(&started)) { + gfs2_log_lock(sdp); + bd1 = list_entry(started.next, struct gfs2_bufdata, + bd_le.le_list); + list_del_init(&bd1->bd_le.le_list); + sdp->sd_log_num_databuf--; + bh = bd1->bd_bh; + if (bh) { + bh->b_private = NULL; + get_bh(bh); + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + brelse(bh); + } else + gfs2_log_unlock(sdp); + + kmem_cache_free(gfs2_bufdata_cachep, bd1); + } + + /* We've removed all the ordered write bufs here, so only jdata left */ + gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + uint64_t blkno; + uint64_t esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl, + DIO_START | DIO_WAIT); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(ip->i_gl, DIO_START | DIO_WAIT); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_databuf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_databuf--; + sdp->sd_log_num_jdata--; + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata); +} + + +const struct gfs2_log_operations gfs2_glock_lops = { + .lo_add = glock_lo_add, + .lo_after_commit = glock_lo_after_commit, + .lo_name = "glock" +}; + +const struct gfs2_log_operations gfs2_buf_lops = { + .lo_add = buf_lo_add, + .lo_incore_commit = buf_lo_incore_commit, + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf" +}; + +const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_add = revoke_lo_add, + .lo_before_commit = revoke_lo_before_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke" +}; + +const struct gfs2_log_operations gfs2_rg_lops = { + .lo_add = rg_lo_add, + .lo_after_commit = rg_lo_after_commit, + .lo_name = "rg" +}; + +const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_add = databuf_lo_add, + .lo_incore_commit = buf_lo_incore_commit, + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf" +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_glock_lops, + &gfs2_buf_lops, + &gfs2_revoke_lops, + &gfs2_rg_lops, + &gfs2_databuf_lops, + NULL +}; + --- /dev/null +++ b/fs/gfs2/lops.h @@ -0,0 +1,96 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +extern const struct gfs2_log_operations gfs2_glock_lops; +extern const struct gfs2_log_operations gfs2_buf_lops; +extern const struct gfs2_log_operations gfs2_revoke_lops; +extern const struct gfs2_log_operations gfs2_rg_lops; +extern const struct gfs2_log_operations gfs2_databuf_lops; + +extern const struct gfs2_log_operations *gfs2_log_ops[]; + +static inline void lops_init_le(struct gfs2_log_element *le, + const struct gfs2_log_operations *lops) +{ + INIT_LIST_HEAD(&le->le_list); + le->le_ops = lops; +} + +static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + if (le->le_ops->lo_add) + le->le_ops->lo_add(sdp, le); +} + +static inline void lops_incore_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_incore_commit) + gfs2_log_ops[x]->lo_incore_commit(sdp, tr); +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, ai); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/main.c @@ -0,0 +1,148 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "ops_fstype.h" +#include "sys.h" +#include "util.h" + +static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct gfs2_inode *ip = foo; + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + inode_init_once(&ip->i_inode); + spin_lock_init(&ip->i_spin); + init_rwsem(&ip->i_rw_mutex); + memset(ip->i_cache, 0, sizeof(ip->i_cache)); + } +} + +static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct gfs2_glock *gl = foo; + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_LIST_HEAD(&gl->gl_list); + spin_lock_init(&gl->gl_spin); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_waiters1); + INIT_LIST_HEAD(&gl->gl_waiters2); + INIT_LIST_HEAD(&gl->gl_waiters3); + gl->gl_lvb = NULL; + atomic_set(&gl->gl_lvb_count, 0); + INIT_LIST_HEAD(&gl->gl_reclaim); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + } +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + gfs2_init_lmh(); + + error = gfs2_sys_init(); + if (error) + return error; + + error = -ENOMEM; + + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, 0, + gfs2_init_glock_once, NULL); + if (!gfs2_glock_cachep) + goto fail; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_PANIC|SLAB_MEM_SPREAD), + gfs2_init_inode_once, NULL); + if (!gfs2_inode_cachep) + goto fail; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL, NULL); + if (!gfs2_bufdata_cachep) + goto fail; + + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_unregister; + + printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); + + return 0; + +fail_unregister: + unregister_filesystem(&gfs2_fs_type); +fail: + if (gfs2_bufdata_cachep) + kmem_cache_destroy(gfs2_bufdata_cachep); + + if (gfs2_inode_cachep) + kmem_cache_destroy(gfs2_inode_cachep); + + if (gfs2_glock_cachep) + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + --- /dev/null +++ b/fs/gfs2/meta_io.c @@ -0,0 +1,779 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "ops_address.h" + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) +#define buffer_in_io(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock))) + +static int aspace_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + gfs2_assert_warn(inode->i_sb->s_fs_info, 0); + return -EOPNOTSUPP; +} + +static int gfs2_aspace_writepage(struct page *page, + struct writeback_control *wbc) +{ + return block_write_full_page(page, aspace_get_block, wbc); +} + +static const struct address_space_operations aspace_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +/** + * gfs2_aspace_get - Create and initialize a struct inode structure + * @sdp: the filesystem the aspace is in + * + * Right now a struct inode is just a struct inode. Maybe Linux + * will supply a more lightweight address space construct (that works) + * in the future. + * + * Make sure pages/buffers in this aspace aren't in high memory. + * + * Returns: the aspace + */ + +struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) +{ + struct inode *aspace; + + aspace = new_inode(sdp->sd_vfs); + if (aspace) { + mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); + aspace->i_mapping->a_ops = &aspace_aops; + aspace->i_size = ~0ULL; + aspace->u.generic_ip = NULL; + insert_inode_hash(aspace); + } + return aspace; +} + +void gfs2_aspace_put(struct inode *aspace) +{ + remove_inode_hash(aspace); + iput(aspace); +} + +/** + * gfs2_ail1_start_one - Start I/O on a part of the AIL + * @sdp: the filesystem + * @tr: the part of the AIL + * + */ + +void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + int retry; + + BUG_ON(!spin_is_locked(&sdp->sd_log_lock)); + + do { + retry = 0; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) { + gfs2_log_unlock(sdp); + gfs2_io_error_bh(sdp, bh); + gfs2_log_lock(sdp); + } + list_move(&bd->bd_ail_st_list, + &ai->ai_ail2_list); + continue; + } + + if (!buffer_dirty(bh)) + continue; + + list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + ll_rw_block(WRITE, 1, &bh); + gfs2_log_lock(sdp); + + retry = 1; + break; + } + } while (retry); +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (buffer_busy(bh)) { + if (flags & DIO_ALL) + continue; + else + break; + } + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + } + + return list_empty(&ai->ai_ail1_list); +} + +/** + * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &ai->ai_ail2_list; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->prev, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_ail == ai); + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); + } +} + +/** + * ail_empty_gl - remove all buffers for a given lock from the AIL + * @gl: the glock + * + * None of the buffers should be dirty, locked, or pinned. + */ + +void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int blocks; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + uint64_t blkno; + int error; + + blocks = atomic_read(&gl->gl_ail_count); + if (!blocks) + return; + + error = gfs2_trans_begin(sdp, 0, blocks); + if (gfs2_assert_withdraw(sdp, !error)) + return; + + gfs2_log_lock(sdp); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, + bd_ail_gl_list); + bh = bd->bd_bh; + blkno = bh->b_blocknr; + gfs2_assert_withdraw(sdp, !buffer_busy(bh)); + + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&gl->gl_ail_count); + brelse(bh); + gfs2_log_unlock(sdp); + + gfs2_trans_add_revoke(sdp, blkno); + + gfs2_log_lock(sdp); + } + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + gfs2_log_unlock(sdp); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +/** + * gfs2_meta_inval - Invalidate all buffers associated with a glock + * @gl: the glock + * + */ + +void gfs2_meta_inval(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + struct address_space *mapping = gl->gl_aspace->i_mapping; + + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + + atomic_inc(&aspace->i_writecount); + truncate_inode_pages(mapping, 0); + atomic_dec(&aspace->i_writecount); + + gfs2_assert_withdraw(sdp, !mapping->nrpages); +} + +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * @flags: DIO_START | DIO_WAIT + * + */ + +void gfs2_meta_sync(struct gfs2_glock *gl, int flags) +{ + struct address_space *mapping = gl->gl_aspace->i_mapping; + int error = 0; + + if (flags & DIO_START) + filemap_fdatawrite(mapping); + if (!error && (flags & DIO_WAIT)) + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + +/** + * getbuf - Get a buffer with a given address space + * @sdp: the filesystem + * @aspace: the address space + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace, + uint64_t blkno, int create) +{ + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(aspace->i_mapping, index); + if (page) + break; + yield(); + } + } else { + page = find_lock_page(aspace->i_mapping, index); + if (!page) + return NULL; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno) +{ + struct buffer_head *bh; + bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags to gfs2_dreread() + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, int flags, + struct buffer_head **bhp) +{ + int error; + + *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE); + error = gfs2_meta_reread(gl->gl_sbd, *bhp, flags); + if (error) + brelse(*bhp); + + return error; +} + +/** + * gfs2_meta_reread - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to read + * @flags: Flags that control the read + * + * Returns: errno + */ + +int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags) +{ + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + if (flags & DIO_FORCE) + clear_buffer_uptodate(bh); + + if ((flags & DIO_START) && !buffer_uptodate(bh)) + ll_rw_block(READ, 1, &bh); + + if (flags & DIO_WAIT) { + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + return -EIO; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + } + + return 0; +} + +/** + * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer + * @gl: the glock the buffer belongs to + * @bh: The buffer to be attached to + * @meta: Flag to indicate whether its metadata or not + */ + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta) +{ + struct gfs2_bufdata *bd; + + if (meta) + lock_page(bh->b_page); + + if (bh->b_private) { + if (meta) + unlock_page(bh->b_page); + return; + } + + bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), + memset(bd, 0, sizeof(struct gfs2_bufdata)); + bd->bd_bh = bh; + bd->bd_gl = gl; + + INIT_LIST_HEAD(&bd->bd_list_tr); + if (meta) { + lops_init_le(&bd->bd_le, &gfs2_buf_lops); + } else { + lops_init_le(&bd->bd_le, &gfs2_databuf_lops); + } + bh->b_private = bd; + + if (meta) + unlock_page(bh->b_page); +} + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to be pinned + * + */ + +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd = bh->b_private; + + gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + + wait_on_buffer(bh); + + /* If this buffer is in the AIL and it has already been written + to in-place disk block, remove it from the AIL. */ + + gfs2_log_lock(sdp); + if (bd->bd_ail && !buffer_in_io(bh)) + list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + gfs2_log_unlock(sdp); + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + + get_bh(bh); +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @ai: + * + */ + +void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd = bh->b_private; + + gfs2_assert_withdraw(sdp, buffer_uptodate(bh)); + + if (!buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + gfs2_log_lock(sdp); + if (bd->bd_ail) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_ail = ai; + list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + gfs2_log_unlock(sdp); +} + +/** + * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *aspace = ip->i_gl->gl_aspace; + struct buffer_head *bh; + + while (blen) { + bh = getbuf(sdp, aspace, bstart, NO_CREATE); + if (bh) { + struct gfs2_bufdata *bd = bh->b_private; + + if (test_clear_buffer_pinned(bh)) { + struct gfs2_trans *tr = current->journal_info; + gfs2_log_lock(sdp); + list_del_init(&bd->bd_le.le_list); + gfs2_assert_warn(sdp, sdp->sd_log_num_buf); + sdp->sd_log_num_buf--; + gfs2_log_unlock(sdp); + tr->tr_num_buf_rm++; + brelse(bh); + } + if (bd) { + gfs2_log_lock(sdp); + if (bd->bd_ail) { + uint64_t blkno = bh->b_blocknr; + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bh); + gfs2_log_unlock(sdp); + gfs2_trans_add_revoke(sdp, blkno); + } else + gfs2_log_unlock(sdp); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); + unlock_buffer(bh); + + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_cache_flush - get rid of any references on buffers for this inode + * @ip: The GFS2 inode + * + * This releases buffers that are in the most-recently-used array of + * blocks used for indirect block addressing for this inode. + */ + +void gfs2_meta_cache_flush(struct gfs2_inode *ip) +{ + struct buffer_head **bh_slot; + unsigned int x; + + spin_lock(&ip->i_spin); + + for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) { + bh_slot = &ip->i_cache[x]; + if (!*bh_slot) + break; + brelse(*bh_slot); + *bh_slot = NULL; + } + + spin_unlock(&ip->i_spin); +} + +/** + * gfs2_meta_indirect_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @num: The block number (device relative) of the buffer + * @new: Non-zero if we may create a new buffer + * @bhp: the buffer is returned here + * + * Try to use the gfs2_inode's MRU metadata tree cache. + * + * Returns: errno + */ + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num, + int new, struct buffer_head **bhp) +{ + struct buffer_head *bh, **bh_slot = ip->i_cache + height; + int error; + + spin_lock(&ip->i_spin); + bh = *bh_slot; + if (bh) { + if (bh->b_blocknr == num) + get_bh(bh); + else + bh = NULL; + } + spin_unlock(&ip->i_spin); + + if (bh) { + if (new) + meta_prep_new(bh); + else { + error = gfs2_meta_reread(GFS2_SB(&ip->i_inode), bh, + DIO_START | DIO_WAIT); + if (error) { + brelse(bh); + return error; + } + } + } else { + if (new) + bh = gfs2_meta_new(ip->i_gl, num); + else { + error = gfs2_meta_read(ip->i_gl, num, + DIO_START | DIO_WAIT, &bh); + if (error) + return error; + } + + spin_lock(&ip->i_spin); + if (*bh_slot != bh) { + brelse(*bh_slot); + *bh_slot = bh; + get_bh(bh); + } + spin_unlock(&ip->i_spin); + } + + if (new) { + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), height)) { + brelse(bh); + return -EIO; + } + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + } else if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, + (height) ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)) { + brelse(bh); + return -EIO; + } + + *bhp = bh; + + return 0; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + */ + +void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + struct buffer_head *first_bh, *bh; + uint32_t max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + int error; + + if (!extlen || !max_ra) + return; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = getbuf(sdp, aspace, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + if (!buffer_locked(first_bh)) { + error = gfs2_meta_reread(sdp, first_bh, DIO_START); + if (error) + goto out; + } + + dblock++; + extlen--; + + while (extlen) { + bh = getbuf(sdp, aspace, dblock, CREATE); + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + error = gfs2_meta_reread(sdp, bh, DIO_START); + brelse(bh); + if (error) + goto out; + } else + brelse(bh); + + dblock++; + extlen--; + + if (buffer_uptodate(first_bh)) + break; + } + + out: + brelse(first_bh); +} + +/** + * gfs2_meta_syncfs - sync all the buffers in a filesystem + * @sdp: the filesystem + * + */ + +void gfs2_meta_syncfs(struct gfs2_sbd *sdp) +{ + gfs2_log_flush(sdp, NULL); + for (;;) { + gfs2_ail1_start(sdp, DIO_ALL); + if (gfs2_ail1_empty(sdp, DIO_ALL)) + break; + msleep(10); + } +} + --- /dev/null +++ b/fs/gfs2/meta_io.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); +void gfs2_aspace_put(struct inode *aspace); + +void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai); +int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags); +void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai); +void gfs2_ail_empty_gl(struct gfs2_glock *gl); + +void gfs2_meta_inval(struct gfs2_glock *gl); +void gfs2_meta_sync(struct gfs2_glock *gl, int flags); + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, uint64_t blkno); +int gfs2_meta_read(struct gfs2_glock *gl, uint64_t blkno, + int flags, struct buffer_head **bhp); +int gfs2_meta_reread(struct gfs2_sbd *sdp, struct buffer_head *bh, int flags); + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta); +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai); + +void gfs2_meta_wipe(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen); + +void gfs2_meta_cache_flush(struct gfs2_inode *ip); +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, uint64_t num, + int new, struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp); +} + +void gfs2_meta_ra(struct gfs2_glock *gl, uint64_t dblock, uint32_t extlen); +void gfs2_meta_syncfs(struct gfs2_sbd *sdp); + +#endif /* __DIO_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/ondisk.c @@ -0,0 +1,308 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include + +#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \ + struct->member); + +/* + * gfs2_xxx_in - read in an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order buffer + * + * gfs2_xxx_out - write out an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order buffer + * + * gfs2_xxx_print - print out an xxx struct + * first arg: the cpu-order structure + */ + +void gfs2_inum_in(struct gfs2_inum *no, char *buf) +{ + struct gfs2_inum *str = (struct gfs2_inum *)buf; + + no->no_formal_ino = be64_to_cpu(str->no_formal_ino); + no->no_addr = be64_to_cpu(str->no_addr); +} + +void gfs2_inum_out(const struct gfs2_inum *no, char *buf) +{ + struct gfs2_inum *str = (struct gfs2_inum *)buf; + + str->no_formal_ino = cpu_to_be64(no->no_formal_ino); + str->no_addr = cpu_to_be64(no->no_addr); +} + +static void gfs2_inum_print(struct gfs2_inum *no) +{ + printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino); + printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr); +} + +static void gfs2_meta_header_in(struct gfs2_meta_header *mh, char *buf) +{ + struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf; + + mh->mh_magic = be32_to_cpu(str->mh_magic); + mh->mh_type = be32_to_cpu(str->mh_type); + mh->mh_format = be32_to_cpu(str->mh_format); +} + +static void gfs2_meta_header_out(struct gfs2_meta_header *mh, char *buf) +{ + struct gfs2_meta_header *str = (struct gfs2_meta_header *)buf; + + str->mh_magic = cpu_to_be32(mh->mh_magic); + str->mh_type = cpu_to_be32(mh->mh_type); + str->mh_format = cpu_to_be32(mh->mh_format); +} + +static void gfs2_meta_header_print(struct gfs2_meta_header *mh) +{ + pv(mh, mh_magic, "0x%.8X"); + pv(mh, mh_type, "%u"); + pv(mh, mh_format, "%u"); +} + +void gfs2_sb_in(struct gfs2_sb *sb, char *buf) +{ + struct gfs2_sb *str = (struct gfs2_sb *)buf; + + gfs2_meta_header_in(&sb->sb_header, buf); + + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + + gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir); + gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); +} + +void gfs2_rindex_in(struct gfs2_rindex *ri, char *buf) +{ + struct gfs2_rindex *str = (struct gfs2_rindex *)buf; + + ri->ri_addr = be64_to_cpu(str->ri_addr); + ri->ri_length = be32_to_cpu(str->ri_length); + ri->ri_data0 = be64_to_cpu(str->ri_data0); + ri->ri_data = be32_to_cpu(str->ri_data); + ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes); + +} + +void gfs2_rindex_print(struct gfs2_rindex *ri) +{ + printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr); + pv(ri, ri_length, "%u"); + + printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0); + pv(ri, ri_data, "%u"); + + pv(ri, ri_bitbytes, "%u"); +} + +void gfs2_rgrp_in(struct gfs2_rgrp *rg, char *buf) +{ + struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf; + + gfs2_meta_header_in(&rg->rg_header, buf); + rg->rg_flags = be32_to_cpu(str->rg_flags); + rg->rg_free = be32_to_cpu(str->rg_free); + rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); + rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); +} + +void gfs2_rgrp_out(struct gfs2_rgrp *rg, char *buf) +{ + struct gfs2_rgrp *str = (struct gfs2_rgrp *)buf; + + gfs2_meta_header_out(&rg->rg_header, buf); + str->rg_flags = cpu_to_be32(rg->rg_flags); + str->rg_free = cpu_to_be32(rg->rg_free); + str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + +void gfs2_quota_in(struct gfs2_quota *qu, char *buf) +{ + struct gfs2_quota *str = (struct gfs2_quota *)buf; + + qu->qu_limit = be64_to_cpu(str->qu_limit); + qu->qu_warn = be64_to_cpu(str->qu_warn); + qu->qu_value = be64_to_cpu(str->qu_value); +} + +void gfs2_dinode_in(struct gfs2_dinode *di, char *buf) +{ + struct gfs2_dinode *str = (struct gfs2_dinode *)buf; + + gfs2_meta_header_in(&di->di_header, buf); + gfs2_inum_in(&di->di_num, (char *)&str->di_num); + + di->di_mode = be32_to_cpu(str->di_mode); + di->di_uid = be32_to_cpu(str->di_uid); + di->di_gid = be32_to_cpu(str->di_gid); + di->di_nlink = be32_to_cpu(str->di_nlink); + di->di_size = be64_to_cpu(str->di_size); + di->di_blocks = be64_to_cpu(str->di_blocks); + di->di_atime = be64_to_cpu(str->di_atime); + di->di_mtime = be64_to_cpu(str->di_mtime); + di->di_ctime = be64_to_cpu(str->di_ctime); + di->di_major = be32_to_cpu(str->di_major); + di->di_minor = be32_to_cpu(str->di_minor); + + di->di_goal_meta = be64_to_cpu(str->di_goal_meta); + di->di_goal_data = be64_to_cpu(str->di_goal_data); + di->di_generation = be64_to_cpu(str->di_generation); + + di->di_flags = be32_to_cpu(str->di_flags); + di->di_payload_format = be32_to_cpu(str->di_payload_format); + di->di_height = be16_to_cpu(str->di_height); + + di->di_depth = be16_to_cpu(str->di_depth); + di->di_entries = be32_to_cpu(str->di_entries); + + di->di_eattr = be64_to_cpu(str->di_eattr); + +} + +void gfs2_dinode_out(struct gfs2_dinode *di, char *buf) +{ + struct gfs2_dinode *str = (struct gfs2_dinode *)buf; + + gfs2_meta_header_out(&di->di_header, buf); + gfs2_inum_out(&di->di_num, (char *)&str->di_num); + + str->di_mode = cpu_to_be32(di->di_mode); + str->di_uid = cpu_to_be32(di->di_uid); + str->di_gid = cpu_to_be32(di->di_gid); + str->di_nlink = cpu_to_be32(di->di_nlink); + str->di_size = cpu_to_be64(di->di_size); + str->di_blocks = cpu_to_be64(di->di_blocks); + str->di_atime = cpu_to_be64(di->di_atime); + str->di_mtime = cpu_to_be64(di->di_mtime); + str->di_ctime = cpu_to_be64(di->di_ctime); + str->di_major = cpu_to_be32(di->di_major); + str->di_minor = cpu_to_be32(di->di_minor); + + str->di_goal_meta = cpu_to_be64(di->di_goal_meta); + str->di_goal_data = cpu_to_be64(di->di_goal_data); + str->di_generation = cpu_to_be64(di->di_generation); + + str->di_flags = cpu_to_be32(di->di_flags); + str->di_payload_format = cpu_to_be32(di->di_payload_format); + str->di_height = cpu_to_be16(di->di_height); + + str->di_depth = cpu_to_be16(di->di_depth); + str->di_entries = cpu_to_be32(di->di_entries); + + str->di_eattr = cpu_to_be64(di->di_eattr); + +} + +void gfs2_dinode_print(struct gfs2_dinode *di) +{ + gfs2_meta_header_print(&di->di_header); + gfs2_inum_print(&di->di_num); + + pv(di, di_mode, "0%o"); + pv(di, di_uid, "%u"); + pv(di, di_gid, "%u"); + pv(di, di_nlink, "%u"); + printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); + printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks); + printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime); + printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime); + printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime); + pv(di, di_major, "%u"); + pv(di, di_minor, "%u"); + + printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta); + printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data); + + pv(di, di_flags, "0x%.8X"); + pv(di, di_payload_format, "%u"); + pv(di, di_height, "%u"); + + pv(di, di_depth, "%u"); + pv(di, di_entries, "%u"); + + printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr); +} + +void gfs2_log_header_in(struct gfs2_log_header *lh, char *buf) +{ + struct gfs2_log_header *str = (struct gfs2_log_header *)buf; + + gfs2_meta_header_in(&lh->lh_header, buf); + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); +} + +void gfs2_inum_range_in(struct gfs2_inum_range *ir, char *buf) +{ + struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf; + + ir->ir_start = be64_to_cpu(str->ir_start); + ir->ir_length = be64_to_cpu(str->ir_length); +} + +void gfs2_inum_range_out(struct gfs2_inum_range *ir, char *buf) +{ + struct gfs2_inum_range *str = (struct gfs2_inum_range *)buf; + + str->ir_start = cpu_to_be64(ir->ir_start); + str->ir_length = cpu_to_be64(ir->ir_length); +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, char *buf) +{ + struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free = be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +void gfs2_statfs_change_out(struct gfs2_statfs_change *sc, char *buf) +{ + struct gfs2_statfs_change *str = (struct gfs2_statfs_change *)buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +void gfs2_quota_change_in(struct gfs2_quota_change *qc, char *buf) +{ + struct gfs2_quota_change *str = (struct gfs2_quota_change *)buf; + + qc->qc_change = be64_to_cpu(str->qc_change); + qc->qc_flags = be32_to_cpu(str->qc_flags); + qc->qc_id = be32_to_cpu(str->qc_id); +} + --- /dev/null +++ b/fs/gfs2/ops_address.c @@ -0,0 +1,819 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "ops_address.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "ops_file.h" +#include "util.h" +#include "glops.h" + + +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +int gfs2_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int new = create; + uint64_t dblock; + int error; + int boundary; + + error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary); + if (error) + return error; + + if (!dblock) + return 0; + + map_bh(bh_result, inode->i_sb, dblock); + if (new) + set_buffer_new(bh_result); + if (boundary) + set_buffer_boundary(bh_result); + + return 0; +} + +/** + * get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int new = 0; + uint64_t dblock; + int error; + int boundary; + + error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary); + if (error) + return error; + + if (dblock) + map_bh(bh_result, inode->i_sb, dblock); + else if (gfs2_assert_withdraw(GFS2_SB(inode), !create)) + error = -EIO; + if (boundary) + set_buffer_boundary(bh_result); + + return error; +} + +static int get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int new = 0; + u64 dblock; + int error, boundary; + + error = gfs2_block_map(inode, lblock, &new, &dblock, &boundary); + if (error) + return error; + + if (dblock) { + map_bh(bh_result, inode->i_sb, dblock); + if (boundary) + set_buffer_boundary(bh_result); + } + + return 0; +} +/** + * gfs2_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + * Some of this is copied from block_write_full_page() although we still + * call it to do most of the work. + */ + +static int gfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int error; + int done_trans = 0; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { + unlock_page(page); + return -EIO; + } + if (current->journal_info) + goto out_ignore; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) { + error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (error) + goto out_ignore; + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + done_trans = 1; + } + error = block_write_full_page(page, get_block_noalloc, wbc); + if (done_trans) + gfs2_trans_end(sdp); + gfs2_meta_cache_flush(ip); + return error; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int zero_readpage(struct page *page) +{ + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(page, KM_USER0); + + SetPageUptodate(page); + + return 0; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + void *kaddr; + int error; + + /* Only the first page of a stuffed file might contain data */ + if (unlikely(page->index)) + return zero_readpage(page); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_di.di_size); + memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); + kunmap_atomic(page, KM_USER0); + + brelse(dibh); + + SetPageUptodate(page); + + return 0; +} + + +/** + * gfs2_readpage - readpage with locking + * @file: The file to read a page for. N.B. This may be NULL if we are + * reading an internal file. + * @page: The page to read + * + * Returns: errno + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct gfs2_holder gh; + int error; + int do_unlock = 0; + + if (likely(file != &gfs2_internal_file_sentinal)) { + if (file) { + struct gfs2_file *gf = file->private_data; + if (test_bit(GFF_EXLOCK, &gf->f_flags)) + goto skip_lock; + } + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh); + do_unlock = 1; + error = gfs2_glock_nq_m_atime(1, &gh); + if (unlikely(error)) + goto out_unlock; + } + +skip_lock: + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else + error = mpage_readpage(page, gfs2_get_block); + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + + if (file != &gfs2_internal_file_sentinal) { + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + } +out: + return error; +out_unlock: + unlock_page(page); + if (do_unlock) + gfs2_holder_uninit(&gh); + goto out; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We have to handle stuffed files here too. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. + * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as + * well as read-ahead. + */ +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + unsigned page_idx; + int ret; + int do_unlock = 0; + + if (likely(file != &gfs2_internal_file_sentinal)) { + if (file) { + struct gfs2_file *gf = file->private_data; + if (test_bit(GFF_EXLOCK, &gf->f_flags)) + goto skip_lock; + } + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh); + do_unlock = 1; + ret = gfs2_glock_nq_m_atime(1, &gh); + if (ret == GLR_TRYFAILED) + goto out_noerror; + if (unlikely(ret)) + goto out_unlock; + } +skip_lock: + if (gfs2_is_stuffed(ip)) { + struct pagevec lru_pvec; + pagevec_init(&lru_pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + ret = stuffed_readpage(ip, page); + unlock_page(page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + pagevec_lru_add(&lru_pvec); + ret = 0; + } else { + /* What we really want to do .... */ + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); + } + + if (do_unlock) { + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + } +out: + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +out_noerror: + ret = 0; +out_unlock: + /* unlock all pages, we can't do any I/O right now */ + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + unlock_page(page); + page_cache_release(page); + } + if (do_unlock) + gfs2_holder_uninit(&gh); + goto out; +} + +/** + * gfs2_prepare_write - Prepare to write a page to a file + * @file: The file to write to + * @page: The page which is to be prepared for writing + * @from: From (byte range within page) + * @to: To (byte range within page) + * + * Returns: errno + */ + +static int gfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required; + int error = 0; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from; + loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + struct gfs2_alloc *al; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh); + error = gfs2_glock_nq_m_atime(1, &ip->i_gh); + if (error) + goto out_uninit; + + gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks); + + error = gfs2_write_alloc_required(ip, pos, from - to, &alloc_required); + if (error) + goto out_unlock; + + + if (alloc_required) { + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc_put; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_qunlock; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, 0); + if (error) + goto out; + + if (gfs2_is_stuffed(ip)) { + if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) + error = stuffed_readpage(ip, page); + goto out; + } + +prepare_write: + error = block_prepare_write(page, from, to, gfs2_get_block); + +out: + if (error) { + gfs2_trans_end(sdp); + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq_m(1, &ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + } + + return error; +} + +/** + * gfs2_commit_write - Commit write to a file + * @file: The file to write to + * @page: The page containing the data + * @from: From (byte range within page) + * @to: To (byte range within page) + * + * Returns: errno + */ + +static int gfs2_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error = -EOPNOTSUPP; + struct buffer_head *dibh; + struct gfs2_alloc *al = &ip->i_alloc;; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) + goto fail_nounlock; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_endtrans; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) { + uint64_t file_size; + void *kaddr; + + file_size = ((uint64_t)page->index << PAGE_CACHE_SHIFT) + to; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from, + (char *)kaddr + from, to - from); + kunmap_atomic(page, KM_USER0); + + SetPageUptodate(page); + + if (inode->i_size < file_size) + i_size_write(inode, file_size); + } else { + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || + gfs2_is_jdata(ip)) + gfs2_page_add_databufs(ip, page, from, to); + error = generic_commit_write(file, page, from, to); + if (error) + goto fail; + } + + if (ip->i_di.di_size < inode->i_size) + ip->i_di.di_size = inode->i_size; + + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return 0; + +fail: + brelse(dibh); +fail_endtrans: + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); +fail_nounlock: + ClearPageUptodate(page); + return error; +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_get_block); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) + discard_buffer(sdp, bh); + + curr_off = next_off; + bh = next; + } while (bh != head); + + if (!offset) + try_to_release_page(page, 0); + + return; +} + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + if (rw == READ) + mutex_lock(&inode->i_mutex); + /* + * Shared lock, even if its a write, since we do no allocation + * on this path. All we need change is atime. + */ + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + rv = gfs2_glock_nq_m_atime(1, &gh); + if (rv) + goto out; + + if (offset > i_size_read(inode)) + goto out; + + /* + * Should we return an error here? I can't see that O_DIRECT for + * a journaled file makes any sense. For now we'll silently fall + * back to buffered I/O, likewise we do the same for stuffed + * files since they are (a) small and (b) unaligned. + */ + if (gfs2_is_jdata(ip)) + goto out; + + if (gfs2_is_stuffed(ip)) + goto out; + + rv = blockdev_direct_IO_own_locking(rw, iocb, inode, + inode->i_sb->s_bdev, + iov, offset, nr_segs, + get_block_direct, NULL); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + if (rw == READ) + mutex_unlock(&inode->i_mutex); + + return rv; +} + +/** + * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out. + * @bh: the buffer we're stuck on + * + */ + +static void stuck_releasepage(struct buffer_head *bh) +{ + struct inode *inode = bh->b_page->mapping->host; + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_bufdata *bd = bh->b_private; + struct gfs2_glock *gl; +static unsigned limit = 0; + + if (limit++ > 3) + return; + + fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode); + fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n", + (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count)); + fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh)); + fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL"); + + if (!bd) + return; + + gl = bd->bd_gl; + + fs_warn(sdp, "gl = (%u, %llu)\n", + gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); + + fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n", + (list_empty(&bd->bd_list_tr)) ? "no" : "yes", + (list_empty(&bd->bd_le.le_list)) ? "no" : "yes"); + + if (gl->gl_ops == &gfs2_inode_glops) { + struct gfs2_inode *ip = gl->gl_object; + unsigned int x; + + if (!ip) + return; + + fs_warn(sdp, "ip = %llu %llu\n", + (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr); + + for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) + fs_warn(sdp, "ip->i_cache[%u] = %s\n", + x, (ip->i_cache[x]) ? "!NULL" : "NULL"); + } +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *aspace = page->mapping->host; + struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ; + + if (!page_has_buffers(page)) + goto out; + + head = bh = page_buffers(page); + do { + while (atomic_read(&bh->b_count)) { + if (!atomic_read(&aspace->i_writecount)) + return 0; + + if (time_after_eq(jiffies, t)) { + stuck_releasepage(bh); + /* should we withdraw here? */ + return 0; + } + + yield(); + } + + gfs2_assert_warn(sdp, !buffer_pinned(bh)); + gfs2_assert_warn(sdp, !buffer_dirty(bh)); + + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + gfs2_assert_warn(sdp, !bd->bd_ail); + bd->bd_bh = NULL; + if (!list_empty(&bd->bd_le.le_list)) + bd = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + +out: + return try_to_free_buffers(page); +} + +const struct address_space_operations gfs2_file_aops = { + .writepage = gfs2_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .prepare_write = gfs2_prepare_write, + .commit_write = gfs2_commit_write, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, +}; + --- /dev/null +++ b/fs/gfs2/ops_address.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_ADDRESS_DOT_H__ +#define __OPS_ADDRESS_DOT_H__ + +extern const struct address_space_operations gfs2_file_aops; +extern int gfs2_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create); +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); + +#endif /* __OPS_ADDRESS_DOT_H__ */ [PATCH 05/16] GFS2: File and inode operations File operations, superblock operations and inode operations code for GFS2. This also includes a new header file, iflags.h which is designed to abstract the (originally ext2 only, but now used by many different fs) get/set flags ioctl by having one central place in which to register filesystem flags. Given favourable reviews, I'll submit some patches to update the other fileststems to define their flags in terms of those in iflags.h. Note that this doesn't change the interface of the other filesystems since the values of the flags are identical to those previously defined. To the best of my knowledge, GFS2 is the only filesystem which requires the addition of flags above and beyond those defined by ext2/3 so if there are others and we've clashed with them, please let me know. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/ops_file.c | 812 ++++++++++++++++++++++++++++++++++ fs/gfs2/ops_file.h | 20 fs/gfs2/ops_fstype.c | 980 +++++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_fstype.h | 16 fs/gfs2/ops_inode.c | 1165 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/ops_inode.h | 18 include/linux/iflags.h | 102 ++++ 7 files changed, 3113 insertions(+) --- /dev/null +++ b/fs/gfs2/ops_file.c @@ -0,0 +1,812 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "log.h" +#include "meta_io.h" +#include "ops_file.h" +#include "ops_vm.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "eaops.h" + +/* "bad" is for NFS support */ +struct filldir_bad_entry { + char *fbe_name; + unsigned int fbe_length; + uint64_t fbe_offset; + struct gfs2_inum fbe_inum; + unsigned int fbe_type; +}; + +struct filldir_bad { + struct gfs2_sbd *fdb_sbd; + + struct filldir_bad_entry *fdb_entry; + unsigned int fdb_entry_num; + unsigned int fdb_entry_off; + + char *fdb_name; + unsigned int fdb_name_size; + unsigned int fdb_name_off; +}; + +/* For regular, non-NFS */ +struct filldir_reg { + struct gfs2_sbd *fdr_sbd; + int fdr_prefetch; + + filldir_t fdr_filldir; + void *fdr_opaque; +}; + +/* + * Most fields left uninitialised to catch anybody who tries to + * use them. f_flags set to prevent file_accessed() from touching + * any other part of this. Its use is purely as a flag so that we + * know (in readpage()) whether or not do to locking. + */ +struct file gfs2_internal_file_sentinal = { + .f_flags = O_NOATIME|O_RDONLY, +}; + +static int gfs2_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + memcpy(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct inode *inode = &ip->i_inode; + read_descriptor_t desc; + desc.written = 0; + desc.arg.buf = buf; + desc.count = size; + desc.error = 0; + do_generic_mapping_read(inode->i_mapping, ra_state, + &gfs2_internal_file_sentinal, pos, &desc, + gfs2_read_actor); + return desc.written ? desc.written : desc.error; +} + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = remote_llseek(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = remote_llseek(file, offset, origin); + + return error; +} + +/** + * filldir_reg_func - Report a directory entry to the caller of gfs2_dir_read() + * @opaque: opaque data used by the function + * @name: the name of the directory entry + * @length: the length of the name + * @offset: the entry's offset in the directory + * @inum: the inode number the entry points to + * @type: the type of inode the entry points to + * + * Returns: 0 on success, 1 if buffer full + */ + +static int filldir_reg_func(void *opaque, const char *name, unsigned int length, + uint64_t offset, struct gfs2_inum *inum, + unsigned int type) +{ + struct filldir_reg *fdr = (struct filldir_reg *)opaque; + struct gfs2_sbd *sdp = fdr->fdr_sbd; + int error; + + error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset, + inum->no_addr, type); + if (error) + return 1; + + if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) { + gfs2_glock_prefetch_num(sdp, + inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY); + gfs2_glock_prefetch_num(sdp, + inum->no_addr, &gfs2_iopen_glops, + LM_ST_SHARED, LM_FLAG_TRY); + } + + return 0; +} + +/** + * readdir_reg - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int readdir_reg(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct filldir_reg fdr; + struct gfs2_holder d_gh; + uint64_t offset = file->f_pos; + int error; + + fdr.fdr_sbd = GFS2_SB(dir); + fdr.fdr_prefetch = 1; + fdr.fdr_filldir = filldir; + fdr.fdr_opaque = dirent; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); + error = gfs2_glock_nq_atime(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, &fdr, filldir_reg_func); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * filldir_bad_func - Report a directory entry to the caller of gfs2_dir_read() + * @opaque: opaque data used by the function + * @name: the name of the directory entry + * @length: the length of the name + * @offset: the entry's offset in the directory + * @inum: the inode number the entry points to + * @type: the type of inode the entry points to + * + * For supporting NFS. + * + * Returns: 0 on success, 1 if buffer full + */ + +static int filldir_bad_func(void *opaque, const char *name, unsigned int length, + uint64_t offset, struct gfs2_inum *inum, + unsigned int type) +{ + struct filldir_bad *fdb = (struct filldir_bad *)opaque; + struct gfs2_sbd *sdp = fdb->fdb_sbd; + struct filldir_bad_entry *fbe; + + if (fdb->fdb_entry_off == fdb->fdb_entry_num || + fdb->fdb_name_off + length > fdb->fdb_name_size) + return 1; + + fbe = &fdb->fdb_entry[fdb->fdb_entry_off]; + fbe->fbe_name = fdb->fdb_name + fdb->fdb_name_off; + memcpy(fbe->fbe_name, name, length); + fbe->fbe_length = length; + fbe->fbe_offset = offset; + fbe->fbe_inum = *inum; + fbe->fbe_type = type; + + fdb->fdb_entry_off++; + fdb->fdb_name_off += length; + + if (!(length == 1 && *name == '.')) { + gfs2_glock_prefetch_num(sdp, + inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY); + gfs2_glock_prefetch_num(sdp, + inum->no_addr, &gfs2_iopen_glops, + LM_ST_SHARED, LM_FLAG_TRY); + } + + return 0; +} + +/** + * readdir_bad - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * For supporting NFS. + * + * Returns: errno + */ + +static int readdir_bad(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct filldir_reg fdr; + unsigned int entries, size; + struct filldir_bad *fdb; + struct gfs2_holder d_gh; + uint64_t offset = file->f_pos; + unsigned int x; + struct filldir_bad_entry *fbe; + int error; + + entries = gfs2_tune_get(sdp, gt_entries_per_readdir); + size = sizeof(struct filldir_bad) + + entries * (sizeof(struct filldir_bad_entry) + GFS2_FAST_NAME_SIZE); + + fdb = kzalloc(size, GFP_KERNEL); + if (!fdb) + return -ENOMEM; + + fdb->fdb_sbd = sdp; + fdb->fdb_entry = (struct filldir_bad_entry *)(fdb + 1); + fdb->fdb_entry_num = entries; + fdb->fdb_name = ((char *)fdb) + sizeof(struct filldir_bad) + + entries * sizeof(struct filldir_bad_entry); + fdb->fdb_name_size = entries * GFS2_FAST_NAME_SIZE; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); + error = gfs2_glock_nq_atime(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + goto out; + } + + error = gfs2_dir_read(dir, &offset, fdb, filldir_bad_func); + + gfs2_glock_dq_uninit(&d_gh); + + fdr.fdr_sbd = sdp; + fdr.fdr_prefetch = 0; + fdr.fdr_filldir = filldir; + fdr.fdr_opaque = dirent; + + for (x = 0; x < fdb->fdb_entry_off; x++) { + fbe = &fdb->fdb_entry[x]; + + error = filldir_reg_func(&fdr, + fbe->fbe_name, fbe->fbe_length, + fbe->fbe_offset, + &fbe->fbe_inum, fbe->fbe_type); + if (error) { + file->f_pos = fbe->fbe_offset; + error = 0; + goto out; + } + } + + file->f_pos = offset; + + out: + kfree(fdb); + + return error; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int error; + + if (strcmp(current->comm, "nfsd") != 0) + error = readdir_reg(file, dirent, filldir); + else + error = readdir_bad(file, dirent, filldir); + + return error; +} + +static const u32 iflags_to_gfs2[32] = { + [iflag_Sync] = GFS2_DIF_SYNC, + [iflag_Immutable] = GFS2_DIF_IMMUTABLE, + [iflag_Append] = GFS2_DIF_APPENDONLY, + [iflag_NoAtime] = GFS2_DIF_NOATIME, + [iflag_Index] = GFS2_DIF_EXHASH, + [iflag_JournalData] = GFS2_DIF_JDATA, + [iflag_DirectIO] = GFS2_DIF_DIRECTIO, +}; + +static const u32 gfs2_to_iflags[32] = { + [gfs2fl_Sync] = IFLAG_SYNC, + [gfs2fl_Immutable] = IFLAG_IMMUTABLE, + [gfs2fl_AppendOnly] = IFLAG_APPEND, + [gfs2fl_NoAtime] = IFLAG_NOATIME, + [gfs2fl_ExHash] = IFLAG_INDEX, + [gfs2fl_Jdata] = IFLAG_JOURNAL_DATA, + [gfs2fl_Directio] = IFLAG_DIRECTIO, + [gfs2fl_InheritDirectio] = IFLAG_DIRECTIO, + [gfs2fl_InheritJdata] = IFLAG_JOURNAL_DATA, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 iflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + error = gfs2_glock_nq_m_atime(1, &gh); + if (error) + return error; + + iflags = iflags_cvt(gfs2_to_iflags, ip->i_di.di_flags); + if (put_user(iflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return error; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_DIRECTIO| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_DIRECTIO| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + flags = ip->i_di.di_flags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + if (S_ISDIR(inode->i_mode)) { + if ((new_flags ^ flags) & GFS2_DIF_JDATA) + new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA); + if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO) + new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO); + } + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = permission(inode, MAY_WRITE, NULL); + if (error) + goto out; + } + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_flags = new_flags; + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + u32 iflags, gfsflags; + if (get_user(iflags, ptr)) + return -EFAULT; + gfsflags = iflags_cvt(iflags_to_gfs2, iflags); + return do_gfs2_set_flags(filp, gfsflags, ~0); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case IFLAGS_GET_IOC: + return gfs2_get_flags(filp, (u32 __user *)arg); + case IFLAGS_SET_IOC: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * Returns: 0 or error code + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); + error = gfs2_glock_nq_atime(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + /* This is VM_MAYWRITE instead of VM_WRITE because a call + to mprotect() can turn on VM_WRITE later. */ + + if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == + (VM_MAYSHARE | VM_MAYWRITE)) + vma->vm_ops = &gfs2_vm_ops_sharewrite; + else + vma->vm_ops = &gfs2_vm_ops_private; + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_di.di_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + ip->i_di.di_size > MAX_NON_LFS) { + error = -EFBIG; + goto fail_gunlock; + } + + /* Listen to the Direct I/O flag */ + + if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) + file->f_flags |= O_DIRECT; + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @dentry: the dentry that points to the inode to sync + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + + return 0; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockname name = + { .ln_number = ip->i_num.no_addr, + .ln_type = LM_TYPE_PLOCK }; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + + if (sdp->sd_args.ar_localflocks) { + if (IS_GETLK(cmd)) { + struct file_lock tmp; + int ret; + ret = posix_test_lock(file, fl, &tmp); + fl->fl_type = F_UNLCK; + if (ret) + memcpy(fl, &tmp, sizeof(struct file_lock)); + return 0; + } else { + return posix_lock_file_wait(file, fl); + } + } + + if (IS_GETLK(cmd)) + return gfs2_lm_plock_get(sdp, &name, file, fl); + else if (fl->fl_type == F_UNLCK) + return gfs2_lm_punlock(sdp, &name, file, fl); + else + return gfs2_lm_plock(sdp, &name, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = ((IS_SETLKW(cmd)) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + gfs2_glock_hold(gl); + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_uninit(fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), + ip->i_num.no_addr, &gfs2_flock_glops, + CREATE, &gl); + if (error) + goto out; + } + + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) + gfs2_glock_dq_uninit(fl_gh); + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + + if (sdp->sd_args.ar_localflocks) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else + return do_flock(file, cmd, fl); +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = generic_file_read, + .readv = generic_file_readv, + .aio_read = generic_file_aio_read, + .write = generic_file_write, + .writev = generic_file_writev, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .sendfile = generic_file_sendfile, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, +}; + --- /dev/null +++ b/include/linux/iflags.h @@ -0,0 +1,102 @@ +#ifndef _LINUX_IFLAGS_H +#define _LINUX_IFLAGS_H + +/* + * A universal set of inode flags. + * + * Originally taken from ext2/3 with additions for other filesystems. + * Filesystems supporting this interface should interoperate with + * the lsattr and chattr command line tools. + * + * This interface is supported in whole or in part by: + * ext2 + * ext3 + * xfs + * jfs + * gfs2 + * + */ + +#define IFLAGS_GET_IOC _IOR('f', 1, long) +#define IFLAGS_SET_IOC _IOW('f', 2, long) + +/* + * These values are provided for use as indices of an array + * for use with the iflags_cvt function below + */ +enum { + iflag_SecureRm = 0, /* Secure deletion */ + iflag_Unrm = 1, /* Undelete */ + iflag_Compress = 2, /* Compress file */ + iflag_Sync = 3, /* Synchronous updates */ + iflag_Immutable = 4, /* Immutable */ + iflag_Append = 5, /* Append */ + iflag_NoDump = 6, /* Don't dump file */ + iflag_NoAtime = 7, /* No atime updates */ + /* Reserved for compression usage */ + iflag_Dirty = 8, + iflag_ComprBlk = 9, /* One or more compressed clusters */ + iflag_NoComp = 10, /* Don't compress */ + iflag_Ecompr = 11, /* Compression error */ + /* End of compression flags */ + iflag_Btree = 12, /* btree format dir */ + iflag_Index = 12, /* hash-indexed directory */ + iflag_Imagic = 13, /* AFS directory */ + iflag_JournalData = 14, /* file data should be journaled */ + iflag_NoTail = 15, /* file tail should not be merged */ + iflag_DirSync = 16, /* dirsync behaviour */ + iflag_TopDir = 17, /* Top of directory hierarchies */ + iflag_Extent = 19, /* Extents */ + iflag_DirectIO = 20, /* Always use direct I/O on this file */ + iflag_Reserved = 31 /* reserved for ext2/3 lib */ +}; + +#define __IFL(x) (1<<(iflag_##x)) +#define IFLAG_SECRM __IFL(SecureRm) /* 0x00000001 */ +#define IFLAG_UNRM __IFL(Unrm) /* 0x00000002 */ +#define IFLAG_COMPR __IFL(Compr) /* 0x00000004 */ +#define IFLAG_SYNC __IFL(Sync) /* 0x00000008 */ +#define IFLAG_IMMUTABLE __IFL(Immutable) /* 0x00000010 */ +#define IFLAG_APPEND __IFL(Append) /* 0x00000020 */ +#define IFLAG_NODUMP __IFL(NoDump) /* 0x00000040 */ +#define IFLAG_NOATIME __IFL(NoAtime) /* 0x00000080 */ +#define IFLAG_DIRTY __IFL(Dirty) /* 0x00000100 */ +#define IFLAG_COMPRBLK __IFL(ComprBlk) /* 0x00000200 */ +#define IFLAG_NOCOMP __IFL(NoComp) /* 0x00000400 */ +#define IFLAG_ECOMPR __IFL(Ecompr) /* 0x00000800 */ +#define IFLAG_BTREE __IFL(Btree) /* 0x00001000 */ +#define IFLAG_INDEX __IFL(Index) /* 0x00001000 */ +#define IFLAG_IMAGIC __IFL(Imagic) /* 0x00002000 */ +#define IFLAG_JOURNAL_DATA __IFL(JournalData) /* 0x00004000 */ +#define IFLAG_NOTAIL __IFL(NoTail) /* 0x00008000 */ +#define IFLAG_DIRSYNC __IFL(DirSync) /* 0x00010000 */ +#define IFLAG_TOPDIR __IFL(TopDir) /* 0x00020000 */ +#define IFLAG_EXTENT __IFL(Extent) /* 0x00080000 */ +#define IFLAG_DIRECTIO __IFL(DirectIO) /* 0x00100000 */ +#define IFLAG_RESERVED __IFL(Reserved) /* 0x80000000 */ + +#ifdef __KERNEL__ +/** + * iflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between IFLAGS values and + * the filesystem's own flags values. + * + * Returns: the converted flags + */ +static inline u32 iflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} +#endif /* __KERNEL__ */ + +#endif /* _LINUX_IFLAGS_H */ --- /dev/null +++ b/fs/gfs2/ops_file.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_FILE_DOT_H__ +#define __OPS_FILE_DOT_H__ +extern struct file gfs2_internal_file_sentinal; +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); + +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +#endif /* __OPS_FILE_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/ops_fstype.c @@ -0,0 +1,980 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "daemon.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "mount.h" +#include "ops_export.h" +#include "ops_fstype.h" +#include "ops_super.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" + +#define DO 0 +#define UNDO 1 + +extern struct dentry_operations gfs2_dops; + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + unsigned int x; + + sdp = vmalloc(sizeof(struct gfs2_sbd)); + if (!sdp) + return NULL; + + memset(sdp, 0, sizeof(struct gfs2_sbd)); + + sb->s_fs_info = sdp; + sdp->sd_vfs = sb; + + gfs2_tune_init(&sdp->sd_tune); + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { + rwlock_init(&sdp->sd_gl_hash[x].hb_lock); + INIT_LIST_HEAD(&sdp->sd_gl_hash[x].hb_list); + } + INIT_LIST_HEAD(&sdp->sd_reclaim_list); + spin_lock_init(&sdp->sd_reclaim_lock); + init_waitqueue_head(&sdp->sd_reclaim_wq); + + mutex_init(&sdp->sd_inum_mutex); + spin_lock_init(&sdp->sd_statfs_spin); + mutex_init(&sdp->sd_statfs_mutex); + + spin_lock_init(&sdp->sd_rindex_spin); + mutex_init(&sdp->sd_rindex_mutex); + INIT_LIST_HEAD(&sdp->sd_rindex_list); + INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + spin_lock_init(&sdp->sd_quota_spin); + mutex_init(&sdp->sd_quota_mutex); + + spin_lock_init(&sdp->sd_log_lock); + + INIT_LIST_HEAD(&sdp->sd_log_le_gl); + INIT_LIST_HEAD(&sdp->sd_log_le_buf); + INIT_LIST_HEAD(&sdp->sd_log_le_revoke); + INIT_LIST_HEAD(&sdp->sd_log_le_rg); + INIT_LIST_HEAD(&sdp->sd_log_le_databuf); + + mutex_init(&sdp->sd_log_reserve_mutex); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + INIT_LIST_HEAD(&sdp->sd_log_flush_list); + + INIT_LIST_HEAD(&sdp->sd_revoke_list); + + mutex_init(&sdp->sd_freeze_lock); + + return sdp; +} + +static void init_vfs(struct super_block *sb, unsigned noatime) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) + set_bit(noatime, &sdp->sd_flags); + + /* Don't let the VFS update atimes. GFS2 handles this itself. */ + sb->s_flags |= MS_NOATIME | MS_NODIRATIME; +} + +static int init_names(struct gfs2_sbd *sdp, int silent) +{ + struct gfs2_sb *sb = NULL; + char *proto, *table; + int error = 0; + + proto = sdp->sd_args.ar_lockproto; + table = sdp->sd_args.ar_locktable; + + /* Try to autodetect */ + + if (!proto[0] || !table[0]) { + struct buffer_head *bh; + bh = sb_getblk(sdp->sd_vfs, + GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + lock_buffer(bh); + clear_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + sb = kmalloc(sizeof(struct gfs2_sb), GFP_KERNEL); + if (!sb) { + brelse(bh); + return -ENOMEM; + } + gfs2_sb_in(sb, bh->b_data); + brelse(bh); + + error = gfs2_check_sb(sdp, sb, silent); + if (error) + goto out; + + if (!proto[0]) + proto = sb->sb_lockproto; + if (!table[0]) + table = sb->sb_locktable; + } + + if (!table[0]) + table = sdp->sd_vfs->s_id; + + snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); + snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); + + out: + kfree(sb); + + return error; +} + +static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, + int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_trans; + + p = kthread_run(gfs2_scand, sdp, "gfs2_scand"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start scand thread: %d\n", error); + return error; + } + sdp->sd_scand_process = p; + + for (sdp->sd_glockd_num = 0; + sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; + sdp->sd_glockd_num++) { + p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start glockd thread: %d\n", error); + goto fail; + } + sdp->sd_glockd_process[sdp->sd_glockd_num] = p; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + mount_gh); + if (error) { + fs_err(sdp, "can't acquire mount glock: %d\n", error); + goto fail; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_LIVE_LOCK, &gfs2_nondisk_glops, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + if (error) { + fs_err(sdp, "can't acquire live glock: %d\n", error); + goto fail_mount; + } + + error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops, + CREATE, &sdp->sd_rename_gl); + if (error) { + fs_err(sdp, "can't create rename glock: %d\n", error); + goto fail_live; + } + + error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, + CREATE, &sdp->sd_trans_gl); + if (error) { + fs_err(sdp, "can't create transaction glock: %d\n", error); + goto fail_rename; + } + set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); + + return 0; + +fail_trans: + gfs2_glock_put(sdp->sd_trans_gl); + +fail_rename: + gfs2_glock_put(sdp->sd_rename_gl); + +fail_live: + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + +fail_mount: + gfs2_glock_dq_uninit(mount_gh); + +fail: + while (sdp->sd_glockd_num--) + kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); + + kthread_stop(sdp->sd_scand_process); + + return error; +} + +static struct inode *gfs2_lookup_root(struct super_block *sb, + struct gfs2_inum *inum) +{ + return gfs2_inode_lookup(sb, inum, DT_DIR); +} + +static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + struct gfs2_inum *inum; + struct inode *inode; + int error = 0; + + if (undo) { + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + } + return 0; + } + + error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (error) { + fs_err(sdp, "can't acquire superblock glock: %d\n", error); + return error; + } + + error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); + if (error) { + fs_err(sdp, "can't read superblock: %d\n", error); + goto out; + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + error = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + error = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + inum = &sdp->sd_sb.sb_root_dir; + if (sb->s_type == &gfs2meta_fs_type) + inum = &sdp->sd_sb.sb_master_dir; + inode = gfs2_lookup_root(sb, inum); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + fs_err(sdp, "can't read in root inode: %d\n", error); + goto out; + } + + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + fs_err(sdp, "can't get root dentry\n"); + error = -ENOMEM; + iput(inode); + } + sb->s_root->d_op = &gfs2_dops; +out: + gfs2_glock_dq_uninit(&sb_gh); + return error; +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct gfs2_holder ji_gh; + struct task_struct *p; + struct gfs2_inode *ip; + int jindex = 1; + int error = 0; + + if (undo) { + jindex = 0; + goto fail_recoverd; + } + + sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + ip = GFS2_I(sdp->sd_jindex); + set_bit(GLF_STICKY, &ip->i_gl->gl_flags); + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EINVAL; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + } + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_jinode_gh; + } + } + + gfs2_lm_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_jinode_gh; + } + } + + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + gfs2_glock_dq_uninit(&ji_gh); + jindex = 0; + + p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start recoverd thread: %d\n", error); + goto fail_jinode_gh; + } + sdp->sd_recoverd_process = p; + + return 0; + + fail_recoverd: + kthread_stop(sdp->sd_recoverd_process); + + fail_jinode_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + + fail_journal_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + + fail_jindex: + gfs2_jindex_free(sdp); + if (jindex) + gfs2_glock_dq_uninit(&ji_gh); + + fail: + iput(sdp->sd_jindex); + + return error; +} + + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct gfs2_inode *ip; + struct inode *inode; + + if (undo) + goto fail_qinode; + + inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + fs_err(sdp, "can't read in master directory: %d\n", error); + goto fail; + } + sdp->sd_master_dir = inode; + + error = init_journal(sdp, undo); + if (error) + goto fail_master; + + /* Read in the master inode number inode */ + sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); + if (IS_ERR(sdp->sd_inum_inode)) { + error = PTR_ERR(sdp->sd_inum_inode); + fs_err(sdp, "can't read in inum inode: %d\n", error); + goto fail_journal; + } + + + /* Read in the master statfs inode */ + sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail_inum; + } + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_statfs; + } + ip = GFS2_I(sdp->sd_rindex); + set_bit(GLF_STICKY, &ip->i_gl->gl_flags); + sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); + +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); + +fail_statfs: + iput(sdp->sd_statfs_inode); + +fail_inum: + iput(sdp->sd_inum_inode); +fail_journal: + init_journal(sdp, UNDO); +fail_master: + iput(sdp->sd_master_dir); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); + sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_ir_inode)) { + error = PTR_ERR(sdp->sd_ir_inode); + fs_err(sdp, "can't find local \"ir\" file: %d\n", error); + goto fail; + } + + sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_sc_inode)) { + error = PTR_ERR(sdp->sd_sc_inode); + fs_err(sdp, "can't find local \"sc\" file: %d\n", error); + goto fail_ir_i; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_ir_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_ir_gh); + if (error) { + fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); + goto fail_qc_i; + } + + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto fail_ir_gh; + } + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_ut_gh; + } + + return 0; + + fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + + fail_ut_gh: + + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + + fail_ir_gh: + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + + fail_qc_i: + iput(sdp->sd_qc_inode); + + fail_ut_i: + + iput(sdp->sd_sc_inode); + + fail_ir_i: + iput(sdp->sd_ir_inode); + + fail: + if (pn) + iput(pn); + return error; +} + +static int init_threads(struct gfs2_sbd *sdp, int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_quotad; + + sdp->sd_log_flush_time = jiffies; + sdp->sd_jindex_refresh_time = jiffies; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + sdp->sd_statfs_sync_time = jiffies; + sdp->sd_quota_sync_time = jiffies; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + + return 0; + + +fail_quotad: + kthread_stop(sdp->sd_quotad_process); +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + +/** + * fill_super - Read in superblock + * @sb: The VFS superblock + * @data: Mount options + * @silent: Don't complain if it's not a GFS2 filesystem + * + * Returns: errno + */ + +static int fill_super(struct super_block *sb, void *data, int silent) +{ + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + + error = gfs2_mount_args(sdp, (char *)data, 0); + if (error) { + printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + goto fail; + } + + init_vfs(sb, SDF_NOATIME); + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. */ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + + error = init_names(sdp, silent); + if (error) + goto fail; + + error = gfs2_sys_fs_add(sdp); + if (error) + goto fail; + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_sys; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent, DO); + if (error) + goto fail_locking; + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + error = init_threads(sdp, DO); + if (error) + goto fail_per_node; + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_threads; + } + } + + gfs2_glock_dq_uninit(&mount_gh); + + return 0; + + fail_threads: + init_threads(sdp, UNDO); + + fail_per_node: + init_per_node(sdp, UNDO); + + fail_inodes: + init_inodes(sdp, UNDO); + + fail_sb: + init_sb(sdp, 0, UNDO); + + fail_locking: + init_locking(sdp, &mount_gh, UNDO); + + fail_lm: + gfs2_gl_hash_clear(sdp, WAIT); + gfs2_lm_unmount(sdp); + while (invalidate_inodes(sb)) + yield(); + + fail_sys: + gfs2_sys_fs_del(sdp); + + fail: + vfree(sdp); + sb->s_fs_info = NULL; + + return error; +} + +static int gfs2_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct super_block *sb; + struct gfs2_sbd *sdp; + int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); + if (error) + goto out; + sb = mnt->mnt_sb; + sdp = (struct gfs2_sbd*)sb->s_fs_info; + sdp->sd_gfs2mnt = mnt; +out: + return error; +} + +static int fill_super_meta(struct super_block *sb, struct super_block *new, + void *data, int silent) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + int error = 0; + + new->s_fs_info = sdp; + sdp->sd_vfs_meta = sb; + + init_vfs(new, SDF_NOATIME); + + /* Get the master inode */ + inode = igrab(sdp->sd_master_dir); + + new->s_root = d_alloc_root(inode); + if (!new->s_root) { + fs_err(sdp, "can't get root dentry\n"); + error = -ENOMEM; + iput(inode); + } + new->s_root->d_op = &gfs2_dops; + + return error; +} +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return (void *)s->s_bdev == data; +} + +static struct super_block* get_gfs2_sb(const char *dev_name) +{ + struct kstat stat; + struct nameidata nd; + struct file_system_type *fstype; + struct super_block *sb = NULL, *s; + struct list_head *l; + int error; + + error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + if (error) { + printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", + dev_name); + goto out; + } + error = vfs_getattr(nd.mnt, nd.dentry, &stat); + + fstype = get_fs_type("gfs2"); + list_for_each(l, &fstype->fs_supers) { + s = list_entry(l, struct super_block, s_instances); + if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || + (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) { + sb = s; + goto free_nd; + } + } + + printk(KERN_WARNING "GFS2: Unrecognized block device or " + "mount point %s", dev_name); + +free_nd: + path_release(&nd); +out: + return sb; +} + +static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + int error = 0; + struct super_block *sb = NULL, *new; + struct gfs2_sbd *sdp; + char *gfs2mnt = NULL; + + sb = get_gfs2_sb(dev_name); + if (!sb) { + printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + error = -ENOENT; + goto error; + } + sdp = (struct gfs2_sbd*) sb->s_fs_info; + if (sdp->sd_vfs_meta) { + printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); + error = -EBUSY; + goto error; + } + mutex_lock(&sb->s_bdev->bd_mount_mutex); + new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); + mutex_unlock(&sb->s_bdev->bd_mount_mutex); + if (IS_ERR(new)) { + error = PTR_ERR(new); + goto error; + } + module_put(fs_type->owner); + new->s_flags = flags; + strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); + sb_set_blocksize(new, sb->s_blocksize); + error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&new->s_umount); + deactivate_super(new); + goto error; + } + + new->s_flags |= MS_ACTIVE; + + /* Grab a reference to the gfs2 mount point */ + atomic_inc(&sdp->sd_gfs2mnt->mnt_count); + return simple_set_mnt(mnt, new); +error: + if (gfs2mnt) + kfree(gfs2mnt); + return error; +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + kill_block_super(sb); +} + +static void gfs2_kill_sb_meta(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + generic_shutdown_super(sb); + sdp->sd_vfs_meta = NULL; + atomic_dec(&sdp->sd_gfs2mnt->mnt_count); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = gfs2_get_sb, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = gfs2_get_sb_meta, + .kill_sb = gfs2_kill_sb_meta, + .owner = THIS_MODULE, +}; + --- /dev/null +++ b/fs/gfs2/ops_fstype.h @@ -0,0 +1,16 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_FSTYPE_DOT_H__ +#define __OPS_FSTYPE_DOT_H__ + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; + +#endif /* __OPS_FSTYPE_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/ops_inode.c @@ -0,0 +1,1165 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "ops_dentry.h" +#include "ops_inode.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + for (;;) { + inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode); + if (!IS_ERR(inode)) { + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + gfs2_glock_dq_uninit_m(2, ghs); + mark_inode_dirty(inode); + break; + } else if (PTR_ERR(inode) != -EEXIST || + (nd->intent.open.flags & O_EXCL)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + if (inode) { + if (!IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + break; + } else { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + } + } + + d_instantiate(dentry, inode); + + return 0; +} + +/** + * gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @nd: passed from Linux VFS, ignored by us + * + * Called by the VFS layer. Lock dir and call gfs2_lookupi() + * + * Returns: errno + */ + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + + dentry->d_op = &gfs2_dops; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + if (inode && IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + + return NULL; +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". + * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + int alloc_required; + int error; + + if (S_ISDIR(ip->i_di.di_mode)) + return -EPERM; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); + if (error) + goto out_gunlock; + + error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_di.di_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_di.di_entries == (uint32_t)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_di.di_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_di.di_nlink == (uint32_t)-1) + goto out_gunlock; + + alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_quota_check(dip, dip->i_di.di_uid, + dip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 2 * RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num, + IF2DT(ip->i_di.di_mode)); + if (error) + goto out_end_trans; + + error = gfs2_change_nlink(ip, +1); + +out_end_trans: + gfs2_trans_end(sdp); + +out_ipres: + if (alloc_required) + gfs2_inplace_release(dip); + +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(dip); + +out_alloc: + if (alloc_required) + gfs2_alloc_put(dip); + +out_gunlock: + gfs2_glock_dq_m(2, ghs); + +out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + + if (!error) { + atomic_inc(&inode->i_count); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + } + + return error; +} + +/** + * gfs2_unlink - Unlink a file + * @dir: The inode of the directory containing the file to unlink + * @dentry: The file itself + * + * Unlink a file. Call gfs2_unlinki() + * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder ghs[2]; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_del(dip, &dentry->d_name); + if (error) + goto out_end_trans; + + error = gfs2_change_nlink(ip, -1); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq_m(2, ghs); +out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + int size; + int error; + + /* Must be stuffed with a null terminator for gfs2_follow_link() */ + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_size = size; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + gfs2_dinode_out(&ip->i_di, dibh->b_data); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, + size); + brelse(dibh); + } + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + int error; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_nlink = 2; + ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + ip->i_di.di_flags |= GFS2_DIF_JDATA; + ip->i_di.di_payload_format = GFS2_FORMAT_DE; + ip->i_di.di_entries = 2; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + struct qstr str; + + gfs2_str2qstr(&str, "."); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = DT_DIR; + di->di_entries = cpu_to_be32(1); + + gfs2_str2qstr(&str, ".."); + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + + gfs2_inum_out(&dip->i_num, (char *) &dent->de_inum); + dent->de_type = DT_DIR; + + gfs2_dinode_out(&ip->i_di, (char *)di); + + brelse(dibh); + } + + error = gfs2_change_nlink(dip, +1); + gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_rmdir - Remove a directory + * @dir: The parent directory of the directory to be removed + * @dentry: The dentry of the directory to remove + * + * Remove a directory. Call gfs2_rmdiri() + * + * Returns: errno + */ + +static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder ghs[2]; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + if (ip->i_di.di_entries < 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + error = -EIO; + goto out_gunlock; + } + if (ip->i_di.di_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_rmdiri(dip, &dentry->d_name, ip); + + gfs2_trans_end(sdp); + + out_gunlock: + gfs2_glock_dq_m(2, ghs); + + out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + + return error; +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @rdev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + uint32_t major = 0, minor = 0; + int error; + + switch (mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + major = MAJOR(dev); + minor = MINOR(dev); + break; + case S_IFIFO: + case S_IFSOCK: + break; + default: + return -EOPNOTSUPP; + }; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, mode); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_major = major; + ip->i_di.di_minor = minor; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[4], r_gh; + unsigned int num_gh; + int dir_rename = 0; + int alloc_required; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + /* Make sure we aren't trying to move a dirctory into it's subdir */ + + if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) { + dir_rename = 1; + + error = gfs2_glock_nq_init(sdp->sd_rename_gl, + LM_ST_EXCLUSIVE, 0, + &r_gh); + if (error) + goto out; + + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + + error = gfs2_glock_nq_m(num_gh, ghs); + if (error) + goto out_uninit; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (S_ISDIR(nip->i_di.di_mode)) { + if (nip->i_di.di_entries < 2) { + if (gfs2_consist_inode(nip)) + gfs2_dinode_print(&nip->i_di); + error = -EIO; + goto out_gunlock; + } + if (nip->i_di.di_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); + if (error) + goto out_gunlock; + + error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_di.di_nlink) { + error = -EINVAL; + goto out_gunlock; + } + if (ndip->i_di.di_entries == (uint32_t)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_di.di_mode) && + ndip->i_di.di_nlink == (uint32_t)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = permission(odentry->d_inode, MAY_WRITE, NULL); + if (error) + goto out_gunlock; + } + + alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(ndip); + + error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_quota_check(ndip, ndip->i_di.di_uid, + ndip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(ndip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 4 * RES_DINODE + 4 * RES_LEAF + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) { + if (S_ISDIR(nip->i_di.di_mode)) + error = gfs2_rmdiri(ndip, &ndentry->d_name, nip); + else { + error = gfs2_dir_del(ndip, &ndentry->d_name); + if (error) + goto out_end_trans; + error = gfs2_change_nlink(nip, -1); + } + if (error) + goto out_end_trans; + } + + if (dir_rename) { + struct qstr name; + gfs2_str2qstr(&name, ".."); + + error = gfs2_change_nlink(ndip, +1); + if (error) + goto out_end_trans; + error = gfs2_change_nlink(odip, -1); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, &odentry->d_name); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num, + IF2DT(ip->i_di.di_mode)); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (alloc_required) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(ndip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(ndip); +out_gunlock: + gfs2_glock_dq_m(num_gh, ghs); +out_uninit: + for (x = 0; x < num_gh; x++) + gfs2_holder_uninit(ghs + x); +out_gunlock_r: + if (dir_rename) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +/** + * gfs2_readlink - Read the value of a symlink + * @dentry: the symlink + * @buf: the buffer to read the symlink data into + * @size: the size of the buffer + * + * Returns: errno + */ + +static int gfs2_readlink(struct dentry *dentry, char __user *user_buf, + int user_size) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + char array[GFS2_FAST_NAME_SIZE], *buf = array; + unsigned int len = GFS2_FAST_NAME_SIZE; + int error; + + error = gfs2_readlinki(ip, &buf, &len); + if (error) + return error; + + if (user_size > len - 1) + user_size = len - 1; + + if (copy_to_user(user_buf, buf, user_size)) + error = -EFAULT; + else + error = user_size; + + if (buf != array) + kfree(buf); + + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. It is optimised for symlinks + * under GFS2_FAST_NAME_SIZE. + * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + char array[GFS2_FAST_NAME_SIZE], *buf = array; + unsigned int len = GFS2_FAST_NAME_SIZE; + int error; + + error = gfs2_readlinki(ip, &buf, &len); + if (!error) { + error = vfs_follow_link(nd, buf); + if (buf != array) + kfree(buf); + } + + return ERR_PTR(error); +} + +/** + * gfs2_permission - + * @inode: + * @mask: + * @nd: passed from Linux VFS, ignored by us + * + * Returns: errno + */ + +static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + if (ip->i_vn == ip->i_gl->gl_vn) + return generic_permission(inode, mask, gfs2_check_acl); + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (!error) { + error = generic_permission(inode, mask, gfs2_check_acl_locked); + gfs2_glock_dq_uninit(&i_gh); + } + + return error; +} + +static int setattr_size(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (attr->ia_size != ip->i_di.di_size) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + error = gfs2_truncatei(ip, attr->ia_size); + if (error) + return error; + + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + uint32_t ouid, ogid, nuid, ngid; + int error; + + ouid = ip->i_di.di_uid; + ogid = ip->i_di.di_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) + ouid = nuid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) + ogid = ngid = NO_QUOTA_CHANGE; + + gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out_alloc; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = inode_setattr(inode, attr); + gfs2_assert_warn(sdp, !error); + gfs2_inode_attr_out(ip); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); + gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); + } + + out_end_trans: + gfs2_trans_end(sdp); + + out_gunlock_q: + gfs2_quota_unlock(ip); + + out_alloc: + gfs2_alloc_put(ip); + + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = setattr_size(inode, attr); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) + error = gfs2_acl_chmod(ip, attr); + else + error = gfs2_setattr_simple(ip, attr); + + out: + gfs2_glock_dq_uninit(&i_gh); + + if (!error) + mark_inode_dirty(inode); + + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: ? + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (!error) { + generic_fillattr(inode, stat); + gfs2_glock_dq_uninit(&gh); + } + + return error; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_data = (char *)data; + er.er_name_len = strlen(er.er_name); + er.er_data_len = size; + er.er_flags = flags; + + gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); + + return gfs2_ea_set(GFS2_I(inode), &er); +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_data = data; + er.er_name_len = strlen(er.er_name); + er.er_data_len = size; + + return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); +} + +static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_data = (size) ? buffer : NULL; + er.er_data_len = size; + + return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_name_len = strlen(er.er_name); + + return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); +} + +struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_dev_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_rmdir, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_symlink_iops = { + .readlink = gfs2_readlink, + .follow_link = gfs2_follow_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + --- /dev/null +++ b/fs/gfs2/ops_inode.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_INODE_DOT_H__ +#define __OPS_INODE_DOT_H__ + +extern struct inode_operations gfs2_file_iops; +extern struct inode_operations gfs2_dir_iops; +extern struct inode_operations gfs2_symlink_iops; +extern struct inode_operations gfs2_dev_iops; + +#endif /* __OPS_INODE_DOT_H__ */ [PATCH 06/16] GFS2: dentry, export, super and vm operations The GFS2 dentry interface, NFS operations, superblock operations, VM operaions, transaction code, unlinked inode code and GFS2 utility routines. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/ops_dentry.c | 123 ++++++ fs/gfs2/ops_dentry.h | 15 fs/gfs2/ops_export.c | 293 +++++++++++++++ fs/gfs2/ops_export.h | 19 fs/gfs2/ops_super.c | 472 ++++++++++++++++++++++++ fs/gfs2/ops_super.h | 15 fs/gfs2/ops_vm.c | 188 +++++++++ fs/gfs2/ops_vm.h | 16 fs/gfs2/super.c | 979 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/super.h | 52 ++ fs/gfs2/trans.c | 184 +++++++++ fs/gfs2/trans.h | 34 + fs/gfs2/util.c | 245 ++++++++++++ fs/gfs2/util.h | 169 ++++++++ 14 files changed, 2804 insertions(+) --- /dev/null +++ b/fs/gfs2/ops_dentry.c @@ -0,0 +1,123 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "ops_dentry.h" +#include "util.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. + * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = dget_parent(dentry); + struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); + struct gfs2_inode *dip = GFS2_I(parent->d_inode); + struct inode *inode = dentry->d_inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip; + struct gfs2_inum inum; + unsigned int type; + int error; + + if (inode && is_bad_inode(inode)) + goto invalid; + + if (sdp->sd_args.ar_localcaching) + goto valid; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + + error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + + ip = GFS2_I(inode); + + if (!gfs2_inum_equal(&ip->i_num, &inum)) + goto invalid_gunlock; + + if (IF2DT(ip->i_di.di_mode) != type) { + gfs2_consist_inode(dip); + goto fail_gunlock; + } + + valid_gunlock: + gfs2_glock_dq_uninit(&d_gh); + + valid: + dput(parent); + return 1; + + invalid_gunlock: + gfs2_glock_dq_uninit(&d_gh); + + invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + + dput(parent); + return 0; + + fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); + + fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, +}; + --- /dev/null +++ b/fs/gfs2/ops_dentry.h @@ -0,0 +1,15 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_DENTRY_DOT_H__ +#define __OPS_DENTRY_DOT_H__ + +extern struct dentry_operations gfs2_dops; + +#endif /* __OPS_DENTRY_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/ops_export.c @@ -0,0 +1,293 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "ops_export.h" +#include "rgrp.h" +#include "util.h" + +static struct dentry *gfs2_decode_fh(struct super_block *sb, + __u32 *fh, + int fh_len, + int fh_type, + int (*acceptable)(void *context, + struct dentry *dentry), + void *context) +{ + struct gfs2_fh_obj fh_obj; + struct gfs2_inum *this, parent; + + if (fh_type != fh_len) + return NULL; + + this = &fh_obj.this; + fh_obj.imode = DT_UNKNOWN; + memset(&parent, 0, sizeof(struct gfs2_inum)); + + switch (fh_type) { + case 10: + parent.no_formal_ino = ((uint64_t)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((uint64_t)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + fh_obj.imode = be32_to_cpu(fh[8]); + case 4: + this->no_formal_ino = ((uint64_t)be32_to_cpu(fh[0])) << 32; + this->no_formal_ino |= be32_to_cpu(fh[1]); + this->no_addr = ((uint64_t)be32_to_cpu(fh[2])) << 32; + this->no_addr |= be32_to_cpu(fh[3]); + break; + default: + return NULL; + } + + return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent, + acceptable, context); +} + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (*len < 4 || (connectable && *len < 10)) + return 255; + + fh[0] = ip->i_num.no_formal_ino >> 32; + fh[0] = cpu_to_be32(fh[0]); + fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF; + fh[1] = cpu_to_be32(fh[1]); + fh[2] = ip->i_num.no_addr >> 32; + fh[2] = cpu_to_be32(fh[2]); + fh[3] = ip->i_num.no_addr & 0xFFFFFFFF; + fh[3] = cpu_to_be32(fh[3]); + *len = 4; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = ip->i_num.no_formal_ino >> 32; + fh[4] = cpu_to_be32(fh[4]); + fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF; + fh[5] = cpu_to_be32(fh[5]); + fh[6] = ip->i_num.no_addr >> 32; + fh[6] = cpu_to_be32(fh[6]); + fh[7] = ip->i_num.no_addr & 0xFFFFFFFF; + fh[7] = cpu_to_be32(fh[7]); + + fh[8] = cpu_to_be32(inode->i_mode); + fh[9] = 0; /* pad to double word */ + *len = 10; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, unsigned int length, + uint64_t offset, struct gfs2_inum *inum, + unsigned int type) +{ + struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque; + + if (!gfs2_inum_equal(inum, &gnfd->inum)) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + uint64_t offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum = ip->i_num; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + struct qstr dotdot; + struct inode *inode; + struct dentry *dentry; + + gfs2_str2qstr(&dotdot, ".."); + inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); + + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + + dentry = d_alloc_anon(inode); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return dentry; +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj; + struct gfs2_inum *inum = &fh_obj->this; + struct gfs2_holder i_gh, ri_gh, rgd_gh; + struct gfs2_rgrpd *rgd; + struct inode *inode; + struct dentry *dentry; + int error; + + /* System files? */ + + inode = gfs2_ilookup(sb, inum); + if (inode) { + if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL, + &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + error = -ESTALE; + if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) + goto fail_rgd; + + gfs2_glock_dq_uninit(&rgd_gh); + gfs2_glock_dq_uninit(&ri_gh); + + inode = gfs2_inode_lookup(sb, inum, fh_obj->imode); + if (!inode) + goto fail; + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto fail; + } + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) { + iput(inode); + goto fail; + } + + error = -EIO; + if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { + iput(inode); + goto fail; + } + + gfs2_glock_dq_uninit(&i_gh); + +out_inode: + dentry = d_alloc_anon(inode); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return dentry; + +fail_rgd: + gfs2_glock_dq_uninit(&rgd_gh); + +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); + +fail: + gfs2_glock_dq_uninit(&i_gh); + return ERR_PTR(error); +} + +struct export_operations gfs2_export_ops = { + .decode_fh = gfs2_decode_fh, + .encode_fh = gfs2_encode_fh, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, + .get_dentry = gfs2_get_dentry, +}; + --- /dev/null +++ b/fs/gfs2/ops_export.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_EXPORT_DOT_H__ +#define __OPS_EXPORT_DOT_H__ + +extern struct export_operations gfs2_export_ops; +struct gfs2_fh_obj { + struct gfs2_inum this; + __u32 imode; +}; + +#endif /* __OPS_EXPORT_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/ops_super.c @@ -0,0 +1,472 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "inode.h" +#include "lm.h" +#include "log.h" +#include "mount.h" +#include "ops_super.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "trans.h" +#include "dir.h" +#include "eattr.h" +#include "bmap.h" + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @sync: synchronous write flag + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, int sync) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + /* Check this is a "normal" inode */ + if (inode->u.generic_ip) { + if (current->flags & PF_MEMALLOC) + return 0; + if (sync) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + } + + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (!sdp) + return; + + if (!strncmp(sb->s_type->name, "gfs2meta", 8)) + return; /* meta fs. don't do nothin' */ + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + kthread_stop(sdp->sd_recoverd_process); + while (sdp->sd_glockd_num--) + kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); + kthread_stop(sdp->sd_scand_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_master_dir); + iput(sdp->sd_jindex); + iput(sdp->sd_inum_inode); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_ir_inode); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp, WAIT); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); + vfree(sdp); + sb->s_fs_info = NULL; +} + +/** + * gfs2_write_super - disk commit all incore transactions + * @sb: the filesystem + * + * This function is called every time sync(2) is called. + * After this exits, all dirty buffers are synced. + */ + +static void gfs2_write_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + gfs2_log_flush(sdp, NULL); +} + +/** + * gfs2_write_super_lockfs - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static void gfs2_write_super_lockfs(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } +} + +/** + * gfs2_unlockfs - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static void gfs2_unlockfs(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + gfs2_unfreeze_fs(sdp); +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + error = gfs2_mount_args(sdp, data, 1); + if (error) + return error; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + else { + if (*flags & MS_RDONLY) { + if (!(sb->s_flags & MS_RDONLY)) + error = gfs2_make_fs_ro(sdp); + } else if (!(*flags & MS_RDONLY) && + (sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + } + } + + if (*flags & (MS_NOATIME | MS_NODIRATIME)) + set_bit(SDF_NOATIME, &sdp->sd_flags); + else + clear_bit(SDF_NOATIME, &sdp->sd_flags); + + /* Don't let the VFS update atimes. GFS2 handles this itself. */ + *flags |= MS_NOATIME | MS_NODIRATIME; + + return error; +} + +/** + * gfs2_clear_inode - Deallocate an inode when VFS is done with it + * @inode: The VFS inode + * + */ + +static void gfs2_clear_inode(struct inode *inode) +{ + /* This tells us its a "real" inode and not one which only + * serves to contain an address space (see rgrp.c, meta_io.c) + * which therefore doesn't have its own glocks. + */ + if (inode->u.generic_ip) { + struct gfs2_inode *ip = GFS2_I(inode); + gfs2_glock_inode_squish(inode); + gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED); + ip->i_gl->gl_object = NULL; + gfs2_glock_schedule_for_reclaim(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_ignore_local_fs) + seq_printf(s, ",ignore_local_fs"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_localcaching) + seq_printf(s, ",localcaching"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_upgrade) + seq_printf(s, ",upgrade"); + if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) + seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + + return 0; +} + +/* + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ +static void gfs2_delete_inode(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (!inode->u.generic_ip) + goto out; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_uninit; + + if (S_ISDIR(ip->i_di.di_mode) && + (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_di.di_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + +out_unlock: + gfs2_glock_dq(&ip->i_iopen_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error) + fs_warn(sdp, "gfs2_delete_inode: %d\n", error); +out: + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + + + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default); + ip->i_last_pfault = jiffies; + } + return &ip->i_inode; +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(gfs2_inode_cachep, inode); +} + +struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .delete_inode = gfs2_delete_inode, + .put_super = gfs2_put_super, + .write_super = gfs2_write_super, + .write_super_lockfs = gfs2_write_super_lockfs, + .unlockfs = gfs2_unlockfs, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .clear_inode = gfs2_clear_inode, + .show_options = gfs2_show_options, +}; + --- /dev/null +++ b/fs/gfs2/ops_super.h @@ -0,0 +1,15 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_SUPER_DOT_H__ +#define __OPS_SUPER_DOT_H__ + +extern struct super_operations gfs2_super_ops; + +#endif /* __OPS_SUPER_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/ops_vm.c @@ -0,0 +1,188 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "ops_vm.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +static void pfault_be_greedy(struct gfs2_inode *ip) +{ + unsigned int time; + + spin_lock(&ip->i_spin); + time = ip->i_greedy; + ip->i_last_pfault = jiffies; + spin_unlock(&ip->i_spin); + + igrab(&ip->i_inode); + if (gfs2_glock_be_greedy(ip->i_gl, time)) + iput(&ip->i_inode); +} + +static struct page *gfs2_private_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host); + struct page *result; + + set_bit(GIF_PAGED, &ip->i_flags); + + result = filemap_nopage(area, address, type); + + if (result && result != NOPAGE_OOM) + pfault_be_greedy(ip); + + return result; +} + +static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned long index = page->index; + uint64_t lblock = index << (PAGE_CACHE_SHIFT - + sdp->sd_sb.sb_bsize_shift); + unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift; + struct gfs2_alloc *al; + unsigned int data_blocks, ind_blocks; + unsigned int x; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length + + ind_blocks + RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto out_trans; + } + + for (x = 0; x < blocks; ) { + uint64_t dblock; + unsigned int extlen; + int new = 1; + + error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); + if (error) + goto out_trans; + + lblock += extlen; + x += extlen; + } + + gfs2_assert_warn(sdp, al->al_alloced); + + out_trans: + gfs2_trans_end(sdp); + + out_ipres: + gfs2_inplace_release(ip); + + out_gunlock_q: + gfs2_quota_unlock(ip); + + out: + gfs2_alloc_put(ip); + + return error; +} + +static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + struct file *file = area->vm_file; + struct gfs2_file *gf = file->private_data; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + struct page *result = NULL; + unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + + area->vm_pgoff; + int alloc_required; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return NULL; + + set_bit(GIF_PAGED, &ip->i_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, &alloc_required); + if (error) + goto out; + + set_bit(GFF_EXLOCK, &gf->f_flags); + result = filemap_nopage(area, address, type); + clear_bit(GFF_EXLOCK, &gf->f_flags); + if (!result || result == NOPAGE_OOM) + goto out; + + if (alloc_required) { + error = alloc_page_backing(ip, result); + if (error) { + page_cache_release(result); + result = NULL; + goto out; + } + set_page_dirty(result); + } + + pfault_be_greedy(ip); +out: + gfs2_glock_dq_uninit(&i_gh); + + return result; +} + +struct vm_operations_struct gfs2_vm_ops_private = { + .nopage = gfs2_private_nopage, +}; + +struct vm_operations_struct gfs2_vm_ops_sharewrite = { + .nopage = gfs2_sharewrite_nopage, +}; + --- /dev/null +++ b/fs/gfs2/ops_vm.h @@ -0,0 +1,16 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __OPS_VM_DOT_H__ +#define __OPS_VM_DOT_H__ + +extern struct vm_operations_struct gfs2_vm_ops_private; +extern struct vm_operations_struct gfs2_vm_ops_sharewrite; + +#endif /* __OPS_VM_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/super.c @@ -0,0 +1,979 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "format.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_ilimit = 100; + gt->gt_ilimit_tries = 3; + gt->gt_ilimit_min = 1; + gt->gt_demote_secs = 300; + gt->gt_incore_log_blocks = 1024; + gt->gt_log_flush_secs = 60; + gt->gt_jindex_refresh_secs = 60; + gt->gt_scand_secs = 15; + gt->gt_recoverd_secs = 60; + gt->gt_logd_secs = 1; + gt->gt_quotad_secs = 5; + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_quota_cache_secs = 300; + gt->gt_quota_quantum = 60; + gt->gt_atime_quantum = 3600; + gt->gt_new_files_jdata = 0; + gt->gt_new_files_directio = 0; + gt->gt_max_atomic_write = 4 << 20; + gt->gt_max_readahead = 1 << 18; + gt->gt_lockdump_size = 131072; + gt->gt_stall_secs = 600; + gt->gt_complain_secs = 10; + gt->gt_reclaim_limit = 5000; + gt->gt_entries_per_readdir = 32; + gt->gt_prefetch_secs = 10; + gt->gt_greedy_default = HZ / 10; + gt->gt_greedy_quantum = HZ / 40; + gt->gt_greedy_max = HZ / 4; + gt->gt_statfs_quantum = 30; + gt->gt_statfs_slow = 0; +} + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent) +{ + unsigned int x; + + if (sb->sb_header.mh_magic != GFS2_MAGIC || + sb->sb_header.mh_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + if (sb->sb_fs_format != GFS2_FORMAT_FS) { + for (x = 0; gfs2_old_fs_formats[x]; x++) + if (gfs2_old_fs_formats[x] == sb->sb_fs_format) + break; + + if (!gfs2_old_fs_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + for (x = 0; gfs2_old_multihost_formats[x]; x++) + if (gfs2_old_multihost_formats[x] == + sb->sb_multihost_format) + break; + + if (!gfs2_old_multihost_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (!sdp->sd_args.ar_upgrade) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_INFO + "GFS2: Use the \"upgrade\" mount option to upgrade " + "the FS\n"); + printk(KERN_INFO "GFS2: See the manual for more details\n"); + return -EINVAL; + } + + return 0; +} + + +static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error) +{ + struct page *page = bio->bi_private; + if (bio->bi_size) + return 1; + + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); + return 0; +} + +static struct page *gfs2_read_super(struct super_block *sb, sector_t sector) +{ + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_KERNEL); + if (unlikely(!page)) + return NULL; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_KERNEL, 1); + if (unlikely(!bio)) { + __free_page(page); + return NULL; + } + + bio->bi_sector = sector; + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC, bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return NULL; + } + return page; +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @gl: the glock for the superblock (assumed to be held) + * @silent: Don't print message if mount fails + * + */ + +int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) +{ + uint32_t hash_blocks, ind_blocks, leaf_blocks; + uint32_t tmp_blocks; + unsigned int x; + int error; + struct page *page; + char *sb; + + page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (!page) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return -EIO; + } + sb = kmap(page); + gfs2_sb_in(&sdp->sd_sb, sb); + kunmap(page); + __free_page(page); + + error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); + if (error) + return error; + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(uint64_t); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(uint64_t); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(uint64_t); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(uint64_t) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + uint64_t space, d; + uint32_t m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + uint64_t space, d; + uint32_t m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * This is very similar to the gfs2_rindex_hold() function, except that + * in general we hold the jindex lock for longer periods of time and + * we grab it far less frequently (in general) then the rgrp lock. + * + * Returns: errno + */ + +int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, + GL_LOCAL_EXCL, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list; + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + while (!list_empty(&list)) { + jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + list_del(&jd->jd_list); + iput(jd->jd_inode); + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + int found = 0; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) { + found = 1; + break; + } + } + + if (!found) + jd = NULL; + + return jd; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + if (jd) + jd->jd_dirty = 1; + spin_unlock(&sdp->sd_jindex_spin); +} + +struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + int found = 0; + + spin_lock(&sdp->sd_jindex_spin); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_dirty) { + jd->jd_dirty = 0; + found = 1; + break; + } + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!found) + jd = NULL; + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + int ar; + int error; + + if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || + (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { + gfs2_consist_inode(ip); + return -EIO; + } + jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + + error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); + if (!error && ar) { + gfs2_consist_inode(ip); + error = -EIO; + } + + return error; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_holder t_gh; + struct gfs2_log_header head; + int error; + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + GL_LOCAL_EXCL, &t_gh); + if (error) + return error; + + gfs2_meta_cache_flush(ip); + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + goto fail; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + error = -EIO; + goto fail; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (error) + goto fail_unlinked; + + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_glock_dq_uninit(&t_gh); + + return 0; + + fail_unlinked: + + fail: + t_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&t_gh); + + return error; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + GL_LOCAL_EXCL | GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_m_bh; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + } + + out_m_bh: + brelse(m_bh); + + out: + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, int64_t total, int64_t free, + int64_t dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct buffer_head *l_bh; + int error; + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + return; + + mutex_lock(&sdp->sd_statfs_mutex); + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + mutex_unlock(&sdp->sd_statfs_mutex); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); +} + +int gfs2_statfs_sync(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh, *l_bh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_bh; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh2; + + mutex_lock(&sdp->sd_statfs_mutex); + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + mutex_unlock(&sdp->sd_statfs_mutex); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + + gfs2_trans_end(sdp); + + out_bh2: + brelse(l_bh); + + out_bh: + brelse(m_bh); + + out: + gfs2_glock_dq_uninit(&gh); + + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc) +{ + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_ri.ri_data; + sc->sc_free += rgd->rd_rg.rg_free; + sc->sc_dinodes += rgd->rd_rg.rg_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + + out: + kfree(gha); + + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *t_gh) +{ + struct gfs2_inode *ip; + struct gfs2_holder ji_gh; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header lh; + int error; + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) + return error; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, + LM_FLAG_PRIORITY | GL_NOCACHE, + t_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(t_gh); + + out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + gfs2_glock_dq_uninit(&ji_gh); + + return error; +} + +/** + * gfs2_freeze_fs - freezes the file system + * @sdp: the file system + * + * This function flushes data and meta data for all machines by + * aquiring the transaction log exclusively. All journals are + * ensured to be in a clean state as well. + * + * Returns: errno + */ + +int gfs2_freeze_fs(struct gfs2_sbd *sdp) +{ + int error = 0; + + mutex_lock(&sdp->sd_freeze_lock); + + if (!sdp->sd_freeze_count++) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (error) + sdp->sd_freeze_count--; + } + + mutex_unlock(&sdp->sd_freeze_lock); + + return error; +} + +/** + * gfs2_unfreeze_fs - unfreezes the file system + * @sdp: the file system + * + * This function allows the file system to proceed by unlocking + * the exclusively held transaction lock. Other GFS2 nodes are + * now free to acquire the lock shared and go on with their lives. + * + */ + +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_freeze_lock); + + if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + + mutex_unlock(&sdp->sd_freeze_lock); +} + --- /dev/null +++ b/fs/gfs2/super.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +void gfs2_tune_init(struct gfs2_tune *gt); + +int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent); +int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); +void gfs2_jindex_free(struct gfs2_sbd *sdp); + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); +struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); +int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +int gfs2_make_fs_ro(struct gfs2_sbd *sdp); + +int gfs2_statfs_init(struct gfs2_sbd *sdp); +void gfs2_statfs_change(struct gfs2_sbd *sdp, + int64_t total, int64_t free, int64_t dinodes); +int gfs2_statfs_sync(struct gfs2_sbd *sdp); +int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc); +int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc); + +int gfs2_freeze_fs(struct gfs2_sbd *sdp); +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); + +#endif /* __SUPER_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/trans.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + BUG_ON(current->journal_info); + BUG_ON(blocks == 0 && revokes == 0); + + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + if (!tr) + return -ENOMEM; + + tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = 1; + if (blocks) + tr->tr_reserved += 6 + blocks; + if (revokes) + tr->tr_reserved += gfs2_struct2blk(sdp, revokes, + sizeof(uint64_t)); + INIT_LIST_HEAD(&tr->tr_list_buf); + + gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); + + error = gfs2_glock_nq(&tr->tr_t_gh); + if (error) + goto fail_holder_uninit; + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + tr->tr_t_gh.gh_flags |= GL_NOCACHE; + error = -EROFS; + goto fail_gunlock; + } + + error = gfs2_log_reserve(sdp, tr->tr_reserved); + if (error) + goto fail_gunlock; + + current->journal_info = tr; + + return 0; + +fail_gunlock: + gfs2_glock_dq(&tr->tr_t_gh); + +fail_holder_uninit: + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + return error; +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!tr); + current->journal_info = NULL; + + if (!tr->tr_touched) { + gfs2_log_release(sdp, tr->tr_reserved); + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + return; + } + + if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { + fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", + tr->tr_num_buf, tr->tr_blocks); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { + fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", + tr->tr_num_revoke, tr->tr_revokes); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + + gfs2_log_commit(sdp, tr); + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL); +} + +void gfs2_trans_add_gl(struct gfs2_glock *gl) +{ + lops_add(gl->gl_sbd, &gl->gl_le); +} + +/** + * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction + * @gl: the glock the buffer belongs to + * @bh: The buffer to add + * @meta: True in the case of adding metadata + * + */ + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_bufdata *bd; + + bd = bh->b_private; + if (bd) + gfs2_assert(sdp, bd->bd_gl == gl); + else { + gfs2_attach_bufdata(gl, bh, meta); + bd = bh->b_private; + } + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno) +{ + struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke), + GFP_NOFS | __GFP_NOFAIL); + lops_init_le(&rv->rv_le, &gfs2_revoke_lops); + rv->rv_blkno = blkno; + lops_add(sdp, &rv->rv_le); +} + +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno) +{ + struct gfs2_revoke *rv; + int found = 0; + + gfs2_log_lock(sdp); + + list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) { + if (rv->rv_blkno == blkno) { + list_del(&rv->rv_le.le_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + found = 1; + break; + } + } + + gfs2_log_unlock(sdp); + + if (found) { + struct gfs2_trans *tr = current->journal_info; + kfree(rv); + tr->tr_num_revoke_rm++; + } +} + +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) +{ + lops_add(rgd->rd_sbd, &rgd->rd_le); +} + --- /dev/null +++ b/fs/gfs2/trans.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +int gfs2_trans_begin(struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); + +void gfs2_trans_add_gl(struct gfs2_glock *gl); +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, uint64_t blkno); +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, uint64_t blkno); +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); + +#endif /* __TRANS_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/util.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "lm.h" +#include "util.h" + +kmem_cache_t *gfs2_glock_cachep __read_mostly; +kmem_cache_t *gfs2_inode_cachep __read_mostly; +kmem_cache_t *gfs2_bufdata_cachep __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", + sdp->sd_fsname); +} + +/** + * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + dump_stack(); + return (me) ? -1 : -2; +} + +/** + * gfs2_assert_warn_i - Print a message to the console if @assertion is false + * Returns: -1 if we printed something + * -2 if we didn't + */ + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + if (time_before(jiffies, + sdp->sd_last_warning + + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) + return -2; + + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + if (sdp->sd_args.ar_debug) + BUG(); + else + dump_stack(); + + sdp->sd_last_warning = jiffies; + + return -1; +} + +/** + * gfs2_consist_i - Flag a filesystem consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, + char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_inode_i - Flag an inode consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: inode = %llu %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: RG = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, char *file, + unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (%s)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + uint16_t type, uint16_t t, const char *function, + char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_io_error_i - Flag an I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: block = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, + sdp->sd_fsname, function, file, line); + return rv; +} + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value) +{ + unsigned int c, o, b = bit; + int old_value; + + c = b / (8 * PAGE_SIZE); + b %= 8 * PAGE_SIZE; + o = b / 8; + b %= 8; + + old_value = (bitmap[c][o] & (1 << b)); + gfs2_assert_withdraw(sdp, !old_value != !new_value); + + if (new_value) + bitmap[c][o] |= 1 << b; + else + bitmap[c][o] &= ~(1 << b); +} + --- /dev/null +++ b/fs/gfs2/util.h @@ -0,0 +1,169 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + + +#define fs_printk(level, fs, fmt, arg...) \ + printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) + +#define fs_info(fs, fmt, arg...) \ + fs_printk(KERN_INFO , fs , fmt , ## arg) + +#define fs_warn(fs, fmt, arg...) \ + fs_printk(KERN_WARNING , fs , fmt , ## arg) + +#define fs_err(fs, fmt, arg...) \ + fs_printk(KERN_ERR, fs , fmt , ## arg) + + +void gfs2_assert_i(struct gfs2_sbd *sdp); + +#define gfs2_assert(sdp, assertion) \ +do { \ + if (unlikely(!(assertion))) { \ + gfs2_assert_i(sdp); \ + BUG(); \ + } \ +} while (0) + + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_withdraw(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ + __FUNCTION__, __FILE__, __LINE__)) + + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_warn(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ + __FUNCTION__, __FILE__, __LINE__)) + + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist(sdp) \ +gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_inode(ip) \ +gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_rgrpd(rgd) \ +gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, + char *file, unsigned int line); + +static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + uint32_t magic = mh->mh_magic; + magic = be32_to_cpu(magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + return 0; +} + +#define gfs2_meta_check(sdp, bh) \ +gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + uint16_t type, uint16_t t, + const char *function, + char *file, unsigned int line); + +static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + uint16_t type, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + uint32_t magic = mh->mh_magic; + uint16_t t = be32_to_cpu(mh->mh_type); + magic = be32_to_cpu(magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + if (unlikely(t != type)) + return gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return 0; +} + +#define gfs2_metatype_check(sdp, bh, type) \ +gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__) + +static inline void gfs2_metatype_set(struct buffer_head *bh, uint16_t type, + uint16_t format) +{ + struct gfs2_meta_header *mh; + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_type = cpu_to_be32(type); + mh->mh_format = cpu_to_be32(format); +} + + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); + +#define gfs2_io_error(sdp) \ +gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__); + + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line); + +#define gfs2_io_error_bh(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); + + +extern kmem_cache_t *gfs2_glock_cachep; +extern kmem_cache_t *gfs2_inode_cachep; +extern kmem_cache_t *gfs2_bufdata_cachep; + +static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, + unsigned int *p) +{ + unsigned int x; + spin_lock(>->gt_spin); + x = *p; + spin_unlock(>->gt_spin); + return x; +} + +#define gfs2_tune_get(sdp, field) \ +gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value); + +#endif /* __UTIL_DOT_H__ */ + [PATCH 07/16] GFS2: Directory handling Directory handling code for GFS2. GFS2 uses extensible hashing in the directory code which makes lookups fast. Due to the way in which the hash tables are split during directory expansion, the directory contents are sorted during readdir and it would be nice to be able to eliminate this operation. Suggestions are welcome, but note that we need to maintain backward compatibility with GFS1's on disk format. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/dir.c | 1976 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/dir.h | 73 ++ 2 files changed, 2049 insertions(+) --- /dev/null +++ b/fs/gfs2/dir.c @@ -0,0 +1,1976 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. + * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_di.di_flags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#define gfs2_disk_hash2offset(h) (((uint64_t)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((uint32_t)(((uint64_t)(p)) << 1)) + +typedef int (*leaf_call_t) (struct gfs2_inode *dip, + uint32_t index, uint32_t len, uint64_t leaf_no, + void *data); + + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, uint64_t block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_START | DIO_WAIT, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) + +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_di.di_size < offset + size) + ip->i_di.di_size = offset + size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + uint64_t offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + uint64_t lblock, dblock; + uint32_t extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && + offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 1; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + if (error) + goto fail; + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_di.di_size < offset + copied) + ip->i_di.di_size = offset + copied; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + offset += sizeof(struct gfs2_dinode); + memcpy(buf, dibh->b_data + offset, size); + brelse(dibh); + } + + return (error) ? error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @offset: File offset to begin jdata_readng from + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, + uint64_t offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + uint64_t lblock, dblock; + uint32_t extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (offset >= ip->i_di.di_size) + return 0; + + if ((offset + size) > ip->i_di.di_size) + size = ip->i_di.di_size - offset; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 0; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + } + + if (extlen > 1) + gfs2_meta_ra(ip->i_gl, dblock, extlen); + + if (dblock) { + if (new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + if (error) + goto fail; + dblock++; + extlen--; + } else + bh = NULL; + + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + if (error) + goto fail; + + buf += amount; + copied += amount; + lblock++; + + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque); + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (dent->de_inum.no_addr != 0 && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp((char *)(dent+1), name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. + */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (!dent->de_inum.no_addr) + actual = GFS2_DIRENT_SIZE(0); + if ((totlen - actual) >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (dent->de_inum.no_addr) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. + */ +static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && !dent->de_inum.no_addr)) + goto error; + msg = "name length is greater than space in dirent"; + if (dent->de_inum.no_addr && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, + first ? "first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", + be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, + void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = (struct gfs2_dirent *)(buf + offset); + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = (struct gfs2_dirent *)(buf + offset); + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + + +/** + * dirent_first - Return the first dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * return first dirent whether bh points to leaf or stuffed dinode + * + * Returns: IS_LEAF, IS_DINODE, or -errno + */ + +static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data; + + if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) { + if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh)) + return -EIO; + *dent = (struct gfs2_dirent *)(bh->b_data + + sizeof(struct gfs2_leaf)); + return IS_LEAF; + } else { + if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI)) + return -EIO; + *dent = (struct gfs2_dirent *)(bh->b_data + + sizeof(struct gfs2_dinode)); + return IS_DINODE; + } +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *tmp, *cur; + char *bh_end; + uint16_t cur_rec_len; + + cur = *dent; + bh_end = bh->b_data + bh->b_size; + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)cur + cur_rec_len >= bh_end) { + if ((char *)cur + cur_rec_len > bh_end) { + gfs2_consist_inode(dip); + return -EIO; + } + return -ENOENT; + } + + tmp = (struct gfs2_dirent *)((char *)cur + cur_rec_len); + + if ((char *)tmp + be16_to_cpu(tmp->de_rec_len) > bh_end) { + gfs2_consist_inode(dip); + return -EIO; + } + + if (cur_rec_len == 0) { + gfs2_consist_inode(dip); + return -EIO; + } + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (!tmp->de_inum.no_addr) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + uint16_t cur_rec_len, prev_rec_len; + + if (!cur->de_inum.no_addr) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; /* No endianess worries */ + return; + } + + /* Combine this dentry with the previous one. */ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. + */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned offset = 0, totlen; + + if (dent->de_inum.no_addr) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + +static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_space, name, NULL); + if (!dent || IS_ERR(dent)) + return dent; + return gfs2_init_dirent(inode, dent, name, bh); +} + +static int get_leaf(struct gfs2_inode *dip, uint64_t leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_START | DIO_WAIT, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: + * @leaf_out: + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, uint32_t index, + uint64_t *leaf_out) +{ + uint64_t leaf_no; + int error; + + error = gfs2_dir_read_data(dip, (char *)&leaf_no, + index * sizeof(uint64_t), + sizeof(uint64_t)); + if (error != sizeof(uint64_t)) + return (error < 0) ? error : -EIO; + + *leaf_out = be64_to_cpu(leaf_no); + + return 0; +} + +static int get_first_leaf(struct gfs2_inode *dip, uint32_t index, + struct buffer_head **bh_out) +{ + uint64_t leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned hsize = 1 << ip->i_di.di_depth; + unsigned index; + u64 ln; + if (hsize * sizeof(u64) != ip->i_di.di_size) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_di.di_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (unlikely(dent == NULL || IS_ERR(dent))) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + u64 bn = gfs2_alloc_meta(ip); + struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct qstr name = { .name = "", .len = 0, .hash = 0 }; + if (!bh) + return NULL; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = cpu_to_be16(0); + leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE); + leaf->lf_next = cpu_to_be64(0); + memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @dip: The GFS2 inode + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + uint32_t x; + uint64_t *lp, bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. */ + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (uint64_t *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; + dip->i_di.di_blocks++; + dip->i_di.di_flags |= GFS2_DIF_EXHASH; + dip->i_di.di_payload_format = 0; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_di.di_depth = y; + + gfs2_dinode_out(&dip->i_di, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @dip: The GFS2 inode + * @index: + * @leaf_no: + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + uint32_t start, len, half_len, divider; + uint64_t bn, *lp, leaf_no; + uint32_t index; + int x, moved = 0; + int error; + + index = name->hash >> (32 - dip->i_di.di_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_bh(dip->i_gl, obh, 1); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. */ + lp = kmalloc(half_len * sizeof(uint64_t), GFP_NOFS | __GFP_NOFAIL); + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(uint64_t), + half_len * sizeof(uint64_t)); + if (error != half_len * sizeof(uint64_t)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_di.di_depth); + + /* Copy the entries */ + dirent_first(dip, obh, &dent); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (dent->de_inum.no_addr && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_alloc(inode, nbh, &str); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); + + if (!prev) + prev = dent; + + moved = 1; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + dip->i_di.di_blocks++; + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *dibh; + uint32_t hsize; + uint64_t *buf; + uint64_t *from, *to; + uint64_t block; + int x; + int error = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(uint64_t) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + /* Allocate both the "from" and "to" buffers in one big chunk */ + + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); + + for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { + error = gfs2_dir_read_data(dip, (char *)buf, + block * sdp->sd_hash_bsize, + sdp->sd_hash_bsize); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + + from = buf; + to = (uint64_t *)((char *)buf + sdp->sd_hash_bsize); + + for (x = sdp->sd_hash_ptrs; x--; from++) { + *to++ = *from; /* No endianess worries */ + *to++ = *from; + } + + error = gfs2_dir_write_data(dip, + (char *)buf + sdp->sd_hash_bsize, + block * sdp->sd_sb.sb_bsize, + sdp->sd_sb.sb_bsize); + if (error != sdp->sd_sb.sb_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + } + + kfree(buf); + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(sdp, !error)) { + dip->i_di.di_depth++; + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + } + + return error; + + fail: + kfree(buf); + + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to @b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + struct gfs2_dirent *dent_a, *dent_b; + uint32_t hash_a, hash_b; + int ret = 0; + + dent_a = *(struct gfs2_dirent **)a; + hash_a = be32_to_cpu(dent_a->de_hash); + + dent_b = *(struct gfs2_dirent **)b; + hash_b = be32_to_cpu(dent_b->de_hash); + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp((char *)(dent_a + 1), + (char *)(dent_b + 1), + len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @offset: The offset in the file to read from + * @opaque: opaque data to pass to filldir + * @filldir: The function to pass entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @copied: pointer to int that's non-zero if a entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collsions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 on exception from filldir + */ + +static int do_filldir_main(struct gfs2_inode *dip, uint64_t *offset, + void *opaque, gfs2_filldir_t filldir, + const struct gfs2_dirent **darr, uint32_t entries, + int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + struct gfs2_inum inum; + uint64_t off, off_next; + unsigned int x, y; + int run = 0; + int error = 0; + + sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + if (off < *offset) + continue; + *offset = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < *offset) + continue; + *offset = off; + } + + gfs2_inum_in(&inum, (char *)&dent->de_inum); + + error = filldir(opaque, (char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + off, &inum, + be16_to_cpu(dent->de_type)); + if (error) + return 1; + + *copied = 1; + } + + /* Increment the *offset by one, so the next time we come into the + do_filldir fxn, we get the next entry instead of the last one in the + current leaf */ + + (*offset)++; + + return 0; +} + +static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, + gfs2_filldir_t filldir, int *copied, + unsigned *depth, u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0; + unsigned leaves = 0; + const struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct buffer_head **larr; + int leaf = 0; + int error, i; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (!entries) + return 0; + + error = -ENOMEM; + larr = vmalloc((leaves + entries) * sizeof(void*)); + if (!larr) + goto out; + darr = (const struct gfs2_dirent **)(larr + leaves); + g.pdent = darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_kfree; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) { + goto out_kfree; + } + error = 0; + larr[leaf++] = bh; + } else { + brelse(bh); + } + } while(lfn); + + error = do_filldir_main(ip, offset, opaque, filldir, darr, + entries, copied); +out_kfree: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + vfree(larr); +out: + return error; +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @dip: dinode pointer + * @offset: the hash of the last entry read shifted to the right once + * @opaque: buffer for the filldir function to fill + * @filldir: points to the filldir function to use + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, uint64_t *offset, void *opaque, + gfs2_filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + uint32_t hsize, len = 0; + uint32_t ht_offset, lp_offset, ht_offset_cur = -1; + uint32_t hash, index; + uint64_t *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(uint64_t) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + hash = gfs2_dir_offset2hash(*offset); + index = hash >> (32 - dip->i_di.di_depth); + + lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(uint64_t), + sdp->sd_hash_bsize); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + &copied, &depth, + be64_to_cpu(lp[lp_offset])); + if (error) + break; + + len = 1 << (dip->i_di.di_depth - depth); + index = (index & ~(len - 1)) + len; + } + +out: + kfree(lp); + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, uint64_t *offset, void *opaque, + gfs2_filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct dirent_gather g; + const struct gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_di.di_entries) + return 0; + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + return dir_e_read(inode, offset, opaque, filldir); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *), + GFP_KERNEL); + if (darr) { + g.pdent = darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + error = do_filldir_main(dip, offset, opaque, filldir, darr, + dip->i_di.di_entries, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dip: The GFS2 inode + * @filename: + * @inode: + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. + * + * Returns: errno + */ + +int gfs2_dir_search(struct inode *dir, const struct qstr *name, + struct gfs2_inum *inum, unsigned int *type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (inum) + gfs2_inum_in(inum, (char *)&dent->de_inum); + if (type) + *type = be16_to_cpu(dent->de_type); + brelse(bh); + return 0; + } + return -ENOENT; +} + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_di.di_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_bh(ip->i_gl, obh, 1); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_blocks++; + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); + return 0; +} + +/** + * gfs2_dir_add - Add new filename into directory + * @dip: The GFS2 inode + * @filename: The new name + * @inode: The inode number of the entry + * @type: The type of the entry + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inum *inum, unsigned type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_leaf *leaf; + int error; + + while(1) { + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, + &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(inum, (char *)&dent->de_inum); + dent->de_type = cpu_to_be16(type); + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); + } + brelse(bh); + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + break; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_entries++; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); + error = 0; + break; + } + if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @filename: The filename + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) +{ + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + int error; + + /* Returns _either_ the entry (if its first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + } + brelse(bh); + + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + + if (!dip->i_di.di_entries) + gfs2_consist_inode(dip); + gfs2_trans_add_bh(dip->i_gl, bh, 1); + dip->i_di.di_entries--; + dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&dip->i_di, bh->b_data); + brelse(bh); + mark_inode_dirty(&dip->i_inode); + + return error; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 inode + * @filename: + * @new_inode: + * + * This routine changes the inode number of a directory entry. It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + struct gfs2_inum *inum, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int error; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_inum_out(inum, (char *)&dent->de_inum); + dent->de_type = cpu_to_be16(new_type); + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + brelse(bh); + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + gfs2_trans_add_bh(dip->i_gl, bh, 1); + } + + dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&dip->i_di, bh->b_data); + brelse(bh); + return 0; +} + +/** + * foreach_leaf - call a function for each leaf in a directory + * @dip: the directory + * @lc: the function to call for each each + * @data: private data to pass to it + * + * Returns: errno + */ + +static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + struct gfs2_leaf *leaf; + uint32_t hsize, len; + uint32_t ht_offset, lp_offset, ht_offset_cur = -1; + uint32_t index = 0; + uint64_t *lp; + uint64_t leaf_no; + int error = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(uint64_t) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(uint64_t), + sdp->sd_hash_bsize); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + leaf_no = be64_to_cpu(lp[lp_offset]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); + brelse(bh); + + error = lc(dip, index, len, leaf_no, data); + if (error) + goto out; + + index = (index & ~(len - 1)) + len; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + kfree(lp); + + return error; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @data: not used + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, uint32_t index, uint32_t len, + uint64_t leaf_no, void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + uint64_t blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(uint64_t); + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_KERNEL); + if (!ht) + return -ENOMEM; + + gfs2_alloc_get(dip); + + error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh); + if (error) + goto out_qs; + + /* Count the number of leaves */ + + for (blk = leaf_no; blk; blk = nblk) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + brelse(bh); + + gfs2_rlist_add(sdp, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + if (error) + goto out_rg_gunlock; + + for (blk = leaf_no; blk; blk = nblk) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + brelse(bh); + + gfs2_free_meta(dip, blk, 1); + + if (!dip->i_di.di_blocks) + gfs2_consist_inode(dip); + dip->i_di.di_blocks--; + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(uint64_t), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + + out_end_trans: + gfs2_trans_end(sdp); + + out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); + + out_rlist: + gfs2_rlist_free(&rlist); + gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh); + + out_qs: + gfs2_quota_unhold(dip); + + out: + gfs2_alloc_put(dip); + kfree(ht); + + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + int error; + + /* Dealloc on-disk leaves to FREEMETA state */ + error = foreach_leaf(dip, leaf_dealloc, NULL); + if (error) + return error; + + /* Make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(dip, &bh); + if (!error) { + gfs2_trans_add_bh(dip->i_gl, bh, 1); + ((struct gfs2_dinode *)bh->b_data)->di_mode = + cpu_to_be32(S_IFREG); + brelse(bh); + } + + gfs2_trans_end(sdp); + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @ip: the file being written to + * @filname: the filename that's going to be added + * + * Returns: 1 if alloc required, 0 if not, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +{ + struct gfs2_dirent *dent; + struct buffer_head *bh; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + return 1; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + brelse(bh); + return 0; +} + --- /dev/null +++ b/fs/gfs2/dir.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +/** + * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read() + * @opaque: opaque data used by the function + * @name: the name of the directory entry + * @length: the length of the name + * @offset: the entry's offset in the directory + * @inum: the inode number the entry points to + * @type: the type of inode the entry points to + * + * Returns: 0 on success, 1 if buffer full + */ + +typedef int (*gfs2_filldir_t) (void *opaque, + const char *name, unsigned int length, + uint64_t offset, + struct gfs2_inum *inum, unsigned int type); + +int gfs2_dir_search(struct inode *dir, const struct qstr *filename, + struct gfs2_inum *inum, unsigned int *type); +int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inum *inum, unsigned int type); +int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); +int gfs2_dir_read(struct inode *inode, uint64_t * offset, void *opaque, + gfs2_filldir_t filldir); +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + struct gfs2_inum *new_inum, unsigned int new_type); + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, uint64_t block, + struct buffer_head **bhp); + +static inline uint32_t gfs2_disk_hash(const char *data, int len) +{ + return crc32_le(0xFFFFFFFF, data, len) ^ 0xFFFFFFFF; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy((char*)(dent+1), name->name, name->len); +} + +#endif /* __DIR_DOT_H__ */ [PATCH 08/16] GFS2: Resource group code GFS2's resource group handling code and code for manipulating bitmaps. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/rgrp.c | 1525 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/rgrp.h | 62 ++ 2 files changed, 1587 insertions(+) --- /dev/null +++ b/fs/gfs2/rgrp.c @@ -0,0 +1,1525 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "ops_file.h" +#include "util.h" + +#define BFITNOENT 0xFFFFFFFF + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to set + * @new_state: the new state of the block + * + */ + +static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, uint32_t block, + unsigned char new_state) +{ + unsigned char *byte, *end, cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + if (valid_change[new_state * 4 + cur_state]) { + *byte ^= cur_state << bit; + *byte |= new_state << bit; + } else + gfs2_consist_rgrpd(rgd); +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to read + * + */ + +static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, uint32_t block) +{ + unsigned char *byte, *end, cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + return cur_state; +} + +/** + * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing + * a block in a given allocation state. + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @goal: start search at this block's bit-pair (within @buffer) + * @old_state: GFS2_BLKST_XXX the state of the block we're looking for; + * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0) + * + * Scope of @goal and returned block number is only within this bitmap buffer, + * not entire rgrp or filesystem. @buffer will be offset from the actual + * beginning of a bitmap block buffer, skipping any header structures. + * + * Return: the block number (bitmap buffer scope) that was found + */ + +static uint32_t gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, uint32_t goal, + unsigned char old_state) +{ + unsigned char *byte, *end, alloc; + uint32_t blk = goal; + unsigned int bit; + + byte = buffer + (goal / GFS2_NBBY); + bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + alloc = (old_state & 1) ? 0 : 0x55; + + while (byte < end) { + if ((*byte & 0x55) == alloc) { + blk += (8 - bit) >> 1; + + bit = 0; + byte++; + + continue; + } + + if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) + return blk; + + bit += GFS2_BIT_SIZE; + if (bit >= 8) { + bit = 0; + byte++; + } + + blk++; + } + + return BFITNOENT; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static uint32_t gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, unsigned char state) +{ + unsigned char *byte = buffer; + unsigned char *end = buffer + buflen; + unsigned char state1 = state << 2; + unsigned char state2 = state << 4; + unsigned char state3 = state << 6; + uint32_t count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @sdp: the filesystem + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + uint32_t length = rgd->rd_ri.ri_length; + uint32_t count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(uint32_t)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_len, x); + } + + if (count[0] != rgd->rd_rg.rg_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_rg.rg_free); + return; + } + + tmp = rgd->rd_ri.ri_data - + rgd->rd_rg.rg_free - + rgd->rd_rg.rg_dinodes; + if (count[1] + count[2] != tmp) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + return; + } + + if (count[3] != rgd->rd_rg.rg_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used metadata mismatch: %u != %u\n", + count[3], rgd->rd_rg.rg_dinodes); + return; + } + + if (count[2] > count[3]) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "unlinked inodes > inodes: %u\n", + count[2]); + return; + } + +} + +static inline int rgrp_contains_block(struct gfs2_rindex *ri, uint64_t block) +{ + uint64_t first = ri->ri_data0; + uint64_t last = first + ri->ri_data; + return !!(first <= block && block < last); +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @n: The data block number + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { + if (rgrp_contains_block(&rgd->rd_ri, blk)) { + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; + } + } + + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); + return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: A RG + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + return NULL; + return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); +} + +static void clear_rgrpdi(struct gfs2_sbd *sdp) +{ + struct list_head *head; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = NULL; + head = &sdp->sd_rindex_recent_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); + list_del(&rgd->rd_recent); + } + spin_unlock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + gl = rgd->rd_gl; + + list_del(&rgd->rd_list); + list_del(&rgd->rd_list_mru); + + if (gl) { + gl->gl_object = NULL; + gfs2_glock_put(gl); + } + + kfree(rgd->rd_bits); + kfree(rgd); + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_rindex_mutex); + clear_rgrpdi(sdp); + mutex_unlock(&sdp->sd_rindex_mutex); +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */ + uint32_t bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_ri.ri_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_len = bytes; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_len = bytes; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) { + if (gfs2_consist_rgrpd(rgd)) { + gfs2_rindex_print(&rgd->rd_ri); + fs_err(sdp, "start=%u len=%u offset=%u\n", + bi->bi_start, bi->bi_len, bi->bi_offset); + } + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @gl: The glock covering the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +static int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_rgrpd *rgd; + char buf[sizeof(struct gfs2_rindex)]; + struct file_ra_state ra_state; + uint64_t junk = ip->i_di.di_size; + int error; + + if (do_div(junk, sizeof(struct gfs2_rindex))) { + gfs2_consist_inode(ip); + return -EIO; + } + + clear_rgrpdi(sdp); + + file_ra_state_init(&ra_state, inode->i_mapping); + for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + error = gfs2_internal_read(ip, &ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (!error) + break; + if (error != sizeof(struct gfs2_rindex)) { + if (error > 0) + error = -EIO; + goto fail; + } + + rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); + error = -ENOMEM; + if (!rgd) + goto fail; + + mutex_init(&rgd->rd_mutex); + lops_init_le(&rgd->rd_le, &gfs2_rg_lops); + rgd->rd_sbd = sdp; + + list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); + list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + + gfs2_rindex_in(&rgd->rd_ri, buf); + error = compute_bitstructs(rgd); + if (error) + goto fail; + + error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + goto fail; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + } + + sdp->sd_rindex_vn = ip->i_gl->gl_vn; + return 0; + +fail: + clear_rgrpdi(sdp); + return error; +} + +/** + * gfs2_rindex_hold - Grab a lock on the rindex + * @sdp: The GFS2 superblock + * @ri_gh: the glock holder + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. + * + * Returns: 0 on success, error code otherwise + */ + +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + struct gfs2_glock *gl = ip->i_gl; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); + if (error) + return error; + + /* Read new copy from disk if we don't have the latest */ + if (sdp->sd_rindex_vn != gl->gl_vn) { + mutex_lock(&sdp->sd_rindex_mutex); + if (sdp->sd_rindex_vn != gl->gl_vn) { + error = gfs2_ri_update(ip); + if (error) + gfs2_glock_dq_uninit(ri_gh); + } + mutex_unlock(&sdp->sd_rindex_mutex); + } + + return error; +} + +/** + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + * Read in all of a Resource Group's header and bitmap blocks. + * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * + * Returns: errno + */ + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl = rgd->rd_gl; + unsigned int length = rgd->rd_ri.ri_length; + struct gfs2_bitmap *bi; + unsigned int x, y; + int error; + + mutex_lock(&rgd->rd_mutex); + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_bh_count) { + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + mutex_unlock(&rgd->rd_mutex); + return 0; + } + spin_unlock(&sdp->sd_rindex_spin); + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, DIO_START, + &bi->bi_bh); + if (error) + goto fail; + } + + for (y = length; y--;) { + bi = rgd->rd_bits + y; + error = gfs2_meta_reread(sdp, bi->bi_bh, DIO_WAIT); + if (error) + goto fail; + if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB : + GFS2_METATYPE_RG)) { + error = -EIO; + goto fail; + } + } + + if (rgd->rd_rg_vn != gl->gl_vn) { + gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_rg_vn = gl->gl_vn; + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + + mutex_unlock(&rgd->rd_mutex); + + return 0; + +fail: + while (x--) { + bi = rgd->rd_bits + x; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + gfs2_assert_warn(sdp, !bi->bi_clone); + } + mutex_unlock(&rgd->rd_mutex); + + return error; +} + +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + */ + +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int x, length = rgd->rd_ri.ri_length; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + if (--rgd->rd_bh_count) { + spin_unlock(&sdp->sd_rindex_spin); + return; + } + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } + + spin_unlock(&sdp->sd_rindex_spin); +} + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int length = rgd->rd_ri.ri_length; + unsigned int x; + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + if (!bi->bi_clone) + continue; + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_rg.rg_free; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_alloc + */ + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al = &ip->i_alloc; + + /* FIXME: Should assert that the correct locks are held here... */ + memset(al, 0, sizeof(*al)); + return al; +} + +/** + * gfs2_alloc_put - throw away the struct gfs2_alloc for an inode + * @ip: the inode + * + */ + +void gfs2_alloc_put(struct gfs2_inode *ip) +{ + return; +} + +/** + * try_rgrp_fit - See if a given reservation will fit in a given RG + * @rgd: the RG data + * @al: the struct gfs2_alloc structure describing the reservation + * + * If there's room for the requested blocks to be allocated from the RG: + * Sets the $al_reserved_data field in @al. + * Sets the $al_reserved_meta field in @al. + * Sets the $al_rgd field in @al. + * + * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) + */ + +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int ret = 0; + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_free_clone >= al->al_requested) { + al->al_rgd = rgd; + ret = 1; + } + spin_unlock(&sdp->sd_rindex_spin); + + return ret; +} + +/** + * recent_rgrp_first - get first RG from "recent" list + * @sdp: The GFS2 superblock + * @rglast: address of the rgrp used last + * + * Returns: The first rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, + uint64_t rglast) +{ + struct gfs2_rgrpd *rgd = NULL; + + spin_lock(&sdp->sd_rindex_spin); + + if (list_empty(&sdp->sd_rindex_recent_list)) + goto out; + + if (!rglast) + goto first; + + list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { + if (rgd->rd_ri.ri_addr == rglast) + goto out; + } + +first: + rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, + rd_recent); +out: + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * recent_rgrp_next - get next RG from "recent" list + * @cur_rgd: current rgrp + * @remove: + * + * Returns: The next rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, + int remove) +{ + struct gfs2_sbd *sdp = cur_rgd->rd_sbd; + struct list_head *head; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_recent_list; + + list_for_each_entry(rgd, head, rd_recent) { + if (rgd == cur_rgd) { + if (cur_rgd->rd_recent.next != head) + rgd = list_entry(cur_rgd->rd_recent.next, + struct gfs2_rgrpd, rd_recent); + else + rgd = NULL; + + if (remove) + list_del(&cur_rgd->rd_recent); + + goto out; + } + } + + rgd = NULL; + if (!list_empty(head)) + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); + +out: + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * recent_rgrp_add - add an RG to tail of "recent" list + * @new_rgd: The rgrp to add + * + */ + +static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) +{ + struct gfs2_sbd *sdp = new_rgd->rd_sbd; + struct gfs2_rgrpd *rgd; + unsigned int count = 0; + unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { + if (rgd == new_rgd) + goto out; + + if (++count >= max) + goto out; + } + list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); + +out: + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * forward_rgrp_get - get an rgrp to try next from full list + * @sdp: The GFS2 superblock + * + * Returns: The rgrp to try next + */ + +static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd; + unsigned int journals = gfs2_jindex_size(sdp); + unsigned int rg = 0, x; + + spin_lock(&sdp->sd_rindex_spin); + + rgd = sdp->sd_rindex_forward; + if (!rgd) { + if (sdp->sd_rgrps >= journals) + rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; + + for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; + x++, rgd = gfs2_rgrpd_get_next(rgd)) + /* Do Nothing */; + + sdp->sd_rindex_forward = rgd; + } + + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * forward_rgrp_set - set the forward rgrp pointer + * @sdp: the filesystem + * @rgd: The new forward rgrp + * + */ + +static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) +{ + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = rgd; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * get_local_rgrp - Choose and lock a rgrp for allocation + * @ip: the inode to reserve space for + * @rgp: the chosen and locked rgrp + * + * Try to acquire rgrp in way which avoids contending with others. + * + * Returns: errno + */ + +static int get_local_rgrp(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd, *begin = NULL; + struct gfs2_alloc *al = &ip->i_alloc; + int flags = LM_FLAG_TRY; + int skipped = 0; + int loops = 0; + int error; + + /* Try recently successful rgrps */ + + rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); + + while (rgd) { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_TRY, &al->al_rgd_gh); + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + rgd = recent_rgrp_next(rgd, 1); + break; + + case GLR_TRYFAILED: + rgd = recent_rgrp_next(rgd, 0); + break; + + default: + return error; + } + } + + /* Go through full list of rgrps */ + + begin = rgd = forward_rgrp_get(sdp); + + for (;;) { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, + &al->al_rgd_gh); + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + break; + + case GLR_TRYFAILED: + skipped++; + break; + + default: + return error; + } + + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + + if (rgd == begin) { + if (++loops >= 2 || !skipped) + return -ENOSPC; + flags = 0; + } + } + +out: + ip->i_last_rg_alloc = rgd->rd_ri.ri_addr; + + if (begin) { + recent_rgrp_add(rgd); + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + forward_rgrp_set(sdp, rgd); + } + + return 0; +} + +/** + * gfs2_inplace_reserve_i - Reserve space in the filesystem + * @ip: the inode to reserve space for + * + * Returns: errno + */ + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + int error; + + if (gfs2_assert_warn(sdp, al->al_requested)) + return -EINVAL; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + return error; + + error = get_local_rgrp(ip); + if (error) { + gfs2_glock_dq_uninit(&al->al_ri_gh); + return error; + } + + al->al_file = file; + al->al_line = line; + + return 0; +} + +/** + * gfs2_inplace_release - release an inplace reservation + * @ip: the inode the reservation was taken out on + * + * Release a reservation made by gfs2_inplace_reserve(). + */ + +void gfs2_inplace_release(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + + if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) + fs_warn(sdp, "al_alloced = %u, al_requested = %u " + "al_file = %s, al_line = %u\n", + al->al_alloced, al->al_requested, al->al_file, + al->al_line); + + al->al_rgd = NULL; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&al->al_ri_gh); +} + +/** + * gfs2_get_block_type - Check a block in a RG is of given type + * @rgd: the resource group holding the block + * @block: the block number + * + * Returns: The block type (GFS2_BLKST_*) + */ + +unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block) +{ + struct gfs2_bitmap *bi = NULL; + uint32_t length, rgrp_block, buf_block; + unsigned int buf; + unsigned char type; + + length = rgd->rd_ri.ri_length; + rgrp_block = block - rgd->rd_ri.ri_data0; + + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + + type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_block); + + return type; +} + +/** + * rgblk_search - find a block in @old_state, change allocation + * state to @new_state + * @rgd: the resource group descriptor + * @goal: the goal block within the RG (start here to search for avail block) + * @old_state: GFS2_BLKST_XXX the before-allocation state to find + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Walk rgrp's bitmap to find bits that represent a block in @old_state. + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * + * This function never fails, because we wouldn't call it unless we + * know (from reservation results, etc.) that a block is available. + * + * Scope of @goal and returned block is just within rgrp, not the whole + * filesystem. + * + * Returns: the block number allocated + */ + +static uint32_t rgblk_search(struct gfs2_rgrpd *rgd, uint32_t goal, + unsigned char old_state, unsigned char new_state) +{ + struct gfs2_bitmap *bi = NULL; + uint32_t length = rgd->rd_ri.ri_length; + uint32_t blk = 0; + unsigned int buf, x; + + /* Find bitmap block that contains bits for goal block */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + goal -= bi->bi_start * GFS2_NBBY; + + /* Search (up to entire) bitmap in this rgrp for allocatable block. + "x <= length", instead of "x < length", because we typically start + the search in the middle of a bit block, but if we can't find an + allocatable block anywhere else, we want to be able wrap around and + search in the first part of our first-searched bit block. */ + for (x = 0; x <= length; x++) { + if (bi->bi_clone) + blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, + bi->bi_len, goal, old_state); + else + blk = gfs2_bitfit(rgd, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, goal, old_state); + if (blk != BFITNOENT) + break; + + /* Try next bitmap block (wrap back to rgrp header if at end) */ + buf = (buf + 1) % length; + bi = rgd->rd_bits + buf; + goal = 0; + } + + if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length)) + blk = 0; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, blk, new_state); + if (bi->bi_clone) + gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, + bi->bi_len, blk, new_state); + + return bi->bi_start * GFS2_NBBY + blk; +} + +/** + * rgblk_free - Change alloc state of given block(s) + * @sdp: the filesystem + * @bstart: the start of a run of blocks to free + * @blen: the length of the block run (all must lie within ONE RG!) + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Returns: Resource group containing the block(s) + */ + +static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, uint64_t bstart, + uint32_t blen, unsigned char new_state) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_bitmap *bi = NULL; + uint32_t length, rgrp_blk, buf_blk; + unsigned int buf; + + rgd = gfs2_blk2rgrpd(sdp, bstart); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); + return NULL; + } + + length = rgd->rd_ri.ri_length; + + rgrp_blk = bstart - rgd->rd_ri.ri_data0; + + while (blen--) { + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; + rgrp_blk++; + + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_blk, new_state); + } + + return rgd; +} + +/** + * gfs2_alloc_data - Allocate a data block + * @ip: the inode to allocate the data block for + * + * Returns: the allocated block + */ + +u64 gfs2_alloc_data(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + uint32_t goal, blk; + uint64_t block; + + if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data)) + goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0; + else + goal = rgd->rd_last_alloc_data; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); + rgd->rd_last_alloc_data = blk; + + block = rgd->rd_ri.ri_data0 + blk; + ip->i_di.di_goal_data = block; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, 0); + gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_alloc_meta - Allocate a metadata block + * @ip: the inode to allocate the metadata block for + * + * Returns: the allocated block + */ + +u64 gfs2_alloc_meta(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + uint32_t goal, blk; + uint64_t block; + + if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta)) + goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0; + else + goal = rgd->rd_last_alloc_meta; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); + rgd->rd_last_alloc_meta = blk; + + block = rgd->rd_ri.ri_data0 + blk; + ip->i_di.di_goal_meta = block; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, 0); + gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_trans_add_unrevoke(sdp, block); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_alloc_di - Allocate a dinode + * @dip: the directory that the inode is going in + * + * Returns: the block allocated + */ + +u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al = &dip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 blk; + u64 block; + + blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, + GFS2_BLKST_FREE, GFS2_BLKST_DINODE); + + rgd->rd_last_alloc_meta = blk; + + block = rgd->rd_ri.ri_data0 + blk; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + rgd->rd_rg.rg_dinodes++; + *generation = rgd->rd_rg.rg_igeneration++; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, +1); + gfs2_trans_add_unrevoke(sdp, block); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + + rgd->rd_rg.rg_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(int64_t)blen, + ip->i_di.di_uid, ip->i_di.di_gid); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + + rgd->rd_rg.rg_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(int64_t)blen, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_meta_wipe(ip, bstart, blen); +} + +void gfs2_unlink_di(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_rgrpd *rgd; + u64 blkno = ip->i_num.no_addr; + + rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + if (!rgd) + return; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_rg(rgd); +} + +static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, uint64_t blkno) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *tmp_rgd; + + tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + if (!tmp_rgd) + return; + gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + + if (!rgd->rd_rg.rg_dinodes) + gfs2_consist_rgrpd(rgd); + rgd->rd_rg.rg_dinodes--; + rgd->rd_rg.rg_free++; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_statfs_change(sdp, 0, +1, -1); + gfs2_trans_add_rg(rgd); +} + + +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +{ + gfs2_free_uninit_di(rgd, ip->i_num.no_addr); + gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_meta_wipe(ip, ip->i_num.no_addr, 1); +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @sdp: the filesystem + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + uint64_t block) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + rgd = gfs2_blk2rgrpd(sdp, block); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)block); + return; + } + + for (x = 0; x < rlist->rl_rgrps; x++) + if (rlist->rl_rgd[x] == rgd) + return; + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the lock state to acquire the RG lock in + * @flags: the modifier flags for the holder structures + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, + int flags) +{ + unsigned int x; + + rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, + state, flags, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @list: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + } +} + --- /dev/null +++ b/fs/gfs2/rgrp.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, uint64_t blk); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +void gfs2_alloc_put(struct gfs2_inode *ip); + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, + char *file, unsigned int line); +#define gfs2_inplace_reserve(ip) \ +gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) + +void gfs2_inplace_release(struct gfs2_inode *ip); + +unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, uint64_t block); + +u64 gfs2_alloc_data(struct gfs2_inode *ip); +u64 gfs2_alloc_meta(struct gfs2_inode *ip); +u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); + +void gfs2_free_data(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen); +void gfs2_free_meta(struct gfs2_inode *ip, uint64_t bstart, uint32_t blen); +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +void gfs2_unlink_di(struct inode *inode); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + uint64_t block); +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, + int flags); +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); + +#endif /* __RGRP_DOT_H__ */ [PATCH 09/16] GFS2: Extended attribute & ACL support ACL's and extended attribute for GFS2. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/acl.c | 313 +++++++++++ fs/gfs2/acl.h | 37 + fs/gfs2/eaops.c | 230 ++++++++ fs/gfs2/eaops.h | 29 + fs/gfs2/eattr.c | 1548 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/eattr.h | 97 +++ 6 files changed, 2254 insertions(+) --- /dev/null +++ b/fs/gfs2/acl.c @@ -0,0 +1,313 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +#define ACL_ACCESS 1 +#define ACL_DEFAULT 0 + +int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, + struct gfs2_ea_request *er, + int *remove, mode_t *mode) +{ + struct posix_acl *acl; + int error; + + error = gfs2_acl_validate_remove(ip, access); + if (error) + return error; + + if (!er->er_data) + return -EINVAL; + + acl = posix_acl_from_xattr(er->er_data, er->er_data_len); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) { + *remove = 1; + return 0; + } + + error = posix_acl_valid(acl); + if (error) + goto out; + + if (access) { + error = posix_acl_equiv_mode(acl, mode); + if (!error) + *remove = 1; + else if (error > 0) + error = 0; + } + + out: + posix_acl_release(acl); + + return error; +} + +int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) +{ + if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(ip->i_di.di_mode)) + return -EOPNOTSUPP; + if (!access && !S_ISDIR(ip->i_di.di_mode)) + return -EACCES; + + return 0; +} + +static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, + struct gfs2_ea_location *el, char **data, unsigned int *len) +{ + struct gfs2_ea_request er; + struct gfs2_ea_location el_this; + int error; + + if (!ip->i_di.di_eattr) + return 0; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (access) { + er.er_name = GFS2_POSIX_ACL_ACCESS; + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; + } else { + er.er_name = GFS2_POSIX_ACL_DEFAULT; + er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; + } + er.er_type = GFS2_EATYPE_SYS; + + if (!el) + el = &el_this; + + error = gfs2_ea_find(ip, &er, el); + if (error) + return error; + if (!el->el_ea) + return 0; + if (!GFS2_EA_DATA_LEN(el->el_ea)) + goto out; + + er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); + er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); + error = -ENOMEM; + if (!er.er_data) + goto out; + + error = gfs2_ea_get_copy(ip, el, er.er_data); + if (error) + goto out_kfree; + + if (acl) { + *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + if (IS_ERR(*acl)) + error = PTR_ERR(*acl); + } + + out_kfree: + if (error || !data) + kfree(er.er_data); + else { + *data = er.er_data; + *len = er.er_data_len; + } + + out: + if (error || el == &el_this) + brelse(el->el_bh); + + return error; +} + +/** + * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something + * @inode: the file we want to do something to + * @mask: what we want to do + * + * Returns: errno + */ + +int gfs2_check_acl_locked(struct inode *inode, int mask) +{ + struct posix_acl *acl = NULL; + int error; + + error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); + if (error) + return error; + + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int gfs2_check_acl(struct inode *inode, int mask) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (!error) { + error = gfs2_check_acl_locked(inode, mask); + gfs2_glock_dq_uninit(&i_gh); + } + + return error; +} + +static int munge_mode(struct gfs2_inode *ip, mode_t mode) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_assert_withdraw(sdp, + (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT)); + ip->i_di.di_mode = mode; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + + return 0; +} + +int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct posix_acl *acl = NULL, *clone; + struct gfs2_ea_request er; + mode_t mode = ip->i_di.di_mode; + int error; + + if (!sdp->sd_args.ar_posix_acl) + return 0; + if (S_ISLNK(ip->i_di.di_mode)) + return 0; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = GFS2_EATYPE_SYS; + + error = acl_get(dip, ACL_DEFAULT, &acl, NULL, + &er.er_data, &er.er_data_len); + if (error) + return error; + if (!acl) { + mode &= ~current->fs->umask; + if (mode != ip->i_di.di_mode) + error = munge_mode(ip, mode); + return error; + } + + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + if (S_ISDIR(ip->i_di.di_mode)) { + er.er_name = GFS2_POSIX_ACL_DEFAULT; + er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; + error = gfs2_system_eaops.eo_set(ip, &er); + if (error) + goto out; + } + + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out; + if (error > 0) { + er.er_name = GFS2_POSIX_ACL_ACCESS; + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; + posix_acl_to_xattr(acl, er.er_data, er.er_data_len); + er.er_mode = mode; + er.er_flags = GFS2_ERF_MODE; + error = gfs2_system_eaops.eo_set(ip, &er); + if (error) + goto out; + } else + munge_mode(ip, mode); + + out: + posix_acl_release(acl); + kfree(er.er_data); + return error; +} + +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) +{ + struct posix_acl *acl = NULL, *clone; + struct gfs2_ea_location el; + char *data; + unsigned int len; + int error; + + error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + if (error) + return error; + if (!acl) + return gfs2_setattr_simple(ip, attr); + + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_chmod_masq(acl, attr->ia_mode); + if (!error) { + posix_acl_to_xattr(acl, data, len); + error = gfs2_ea_acl_chmod(ip, &el, attr, data); + } + + out: + posix_acl_release(acl); + brelse(el.el_bh); + kfree(data); + + return error; +} + --- /dev/null +++ b/fs/gfs2/acl.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" +#define GFS2_POSIX_ACL_ACCESS_LEN 16 +#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" +#define GFS2_POSIX_ACL_DEFAULT_LEN 17 + +#define GFS2_ACL_IS_ACCESS(name, len) \ + ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ + !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) + +#define GFS2_ACL_IS_DEFAULT(name, len) \ + ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \ + !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len))) + +struct gfs2_ea_request; + +int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, + struct gfs2_ea_request *er, + int *remove, mode_t *mode); +int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access); +int gfs2_check_acl_locked(struct inode *inode, int mask); +int gfs2_check_acl(struct inode *inode, int mask); +int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); + +#endif /* __ACL_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/eattr.c @@ -0,0 +1,1548 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * ea_calc_size - returns the acutal number of bytes the request will take up + * (not counting any unstuffed data blocks) + * @sdp: + * @er: + * @size: + * + * Returns: 1 if the EA should be stuffed + */ + +static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, + unsigned int *size) +{ + *size = GFS2_EAREQ_SIZE_STUFFED(er); + if (*size <= sdp->sd_jbsize) + return 1; + + *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + + return 0; +} + +static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +{ + unsigned int size; + + if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + return -ERANGE; + + ea_calc_size(sdp, er, &size); + + /* This can only happen with 512 byte blocks */ + if (size > sdp->sd_jbsize) + return -ERANGE; + + return 0; +} + +typedef int (*ea_call_t) (struct gfs2_inode *ip, + struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, + void *private); + +static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, + ea_call_t ea_call, void *data) +{ + struct gfs2_ea_header *ea, *prev = NULL; + int error = 0; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) + return -EIO; + + for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { + if (!GFS2_EA_REC_LEN(ea)) + goto fail; + if (!(bh->b_data <= (char *)ea && + (char *)GFS2_EA2NEXT(ea) <= + bh->b_data + bh->b_size)) + goto fail; + if (!GFS2_EATYPE_VALID(ea->ea_type)) + goto fail; + + error = ea_call(ip, bh, ea, prev, data); + if (error) + return error; + + if (GFS2_EA_IS_LAST(ea)) { + if ((char *)GFS2_EA2NEXT(ea) != + bh->b_data + bh->b_size) + goto fail; + break; + } + } + + return error; + + fail: + gfs2_consist_inode(ip); + return -EIO; +} + +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +{ + struct buffer_head *bh, *eabh; + uint64_t *eablk, *end; + int error; + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, + DIO_START | DIO_WAIT, &bh); + if (error) + return error; + + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { + error = ea_foreach_i(ip, bh, ea_call, data); + goto out; + } + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (uint64_t *)(bh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; + + for (; eablk < end; eablk++) { + uint64_t bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + error = gfs2_meta_read(ip->i_gl, bn, DIO_START | DIO_WAIT, + &eabh); + if (error) + break; + error = ea_foreach_i(ip, eabh, ea_call, data); + brelse(eabh); + if (error) + break; + } + out: + brelse(bh); + + return error; +} + +struct ea_find { + struct gfs2_ea_request *ef_er; + struct gfs2_ea_location *ef_el; +}; + +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_find *ef = private; + struct gfs2_ea_request *er = ef->ef_er; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (ea->ea_type == er->er_type) { + if (ea->ea_name_len == er->er_name_len && + !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + struct gfs2_ea_location *el = ef->ef_el; + get_bh(bh); + el->el_bh = bh; + el->el_ea = ea; + el->el_prev = prev; + return 1; + } + } + +#if 0 + else if ((ip->i_di.di_flags & GFS2_DIF_EA_PACKED) && + er->er_type == GFS2_EATYPE_SYS) + return 1; +#endif + + return 0; +} + +int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, + struct gfs2_ea_location *el) +{ + struct ea_find ef; + int error; + + ef.ef_er = er; + ef.ef_el = el; + + memset(el, 0, sizeof(struct gfs2_ea_location)); + + error = ea_foreach(ip, ea_find_i, &ef); + if (error > 0) + return 0; + + return error; +} + +/** + * ea_dealloc_unstuffed - + * @ip: + * @bh: + * @ea: + * @prev: + * @private: + * + * Take advantage of the fact that all unstuffed blocks are + * allocated from the same RG. But watch, this may not always + * be true. + * + * Returns: errno + */ + +static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + int *leave = private; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder rg_gh; + struct buffer_head *dibh; + uint64_t *dataptrs, bn = 0; + uint64_t bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + if (GFS2_EA_IS_STUFFED(ea)) + return 0; + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) + if (*dataptrs) { + blks++; + bn = be64_to_cpu(*dataptrs); + } + if (!blks) + return 0; + + rgd = gfs2_blk2rgrpd(sdp, bn); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + + RES_DINODE + RES_EATTR + RES_STATFS + + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (!*dataptrs) + break; + bn = be64_to_cpu(*dataptrs); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *dataptrs = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + if (prev && !leave) { + uint32_t len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_num_ptrs = 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + + out_gunlock: + gfs2_glock_dq_uninit(&rg_gh); + + return error; +} + +static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, int leave) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_dealloc_unstuffed(ip, + bh, ea, prev, + (leave) ? &error : NULL); + + gfs2_glock_dq_uninit(&al->al_ri_gh); + + out_quota: + gfs2_quota_unhold(ip); + + out_alloc: + gfs2_alloc_put(ip); + + return error; +} + +struct ea_list { + struct gfs2_ea_request *ei_er; + unsigned int ei_size; +}; + +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_list *ei = private; + struct gfs2_ea_request *er = ei->ei_er; + unsigned int ea_size = gfs2_ea_strlen(ea); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (er->er_data_len) { + char *prefix = NULL; + unsigned int l = 0; + char c = 0; + + if (ei->ei_size + ea_size > er->er_data_len) + return -ERANGE; + + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + } + + BUG_ON(l == 0); + + memcpy(er->er_data + ei->ei_size, prefix, l); + memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), + ea->ea_name_len); + memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + } + + ei->ei_size += ea_size; + + return 0; +} + +/** + * gfs2_ea_list - + * @ip: + * @er: + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + + if (ip->i_di.di_eattr) { + struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + + error = ea_foreach(ip, ea_list_i, &ei); + if (!error) + error = ei.ei_size; + } + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_get_unstuffed - actually copies the unstuffed data into the + * request buffer + * @ip: + * @ea: + * @data: + * + * Returns: errno + */ + +static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error = 0; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bh) + return -ENOMEM; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), + DIO_START, bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto out; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto out; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto out; + } + + memcpy(data, + bh[x]->b_data + sizeof(struct gfs2_meta_header), + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + + out: + kfree(bh); + + return error; +} + +int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data) +{ + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + memcpy(data, + GFS2_EA2DATA(el->el_ea), + GFS2_EA_DATA_LEN(el->el_ea)); + return 0; + } else + return ea_get_unstuffed(ip, el->el_ea, data); +} + +/** + * gfs2_ea_get_i - + * @ip: + * @er: + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (er->er_data_len) { + if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) + error = -ERANGE; + else + error = gfs2_ea_get_copy(ip, &el, er->er_data); + } + if (!error) + error = GFS2_EA_DATA_LEN(el.el_ea); + + brelse(el.el_bh); + + return error; +} + +/** + * gfs2_ea_get - + * @ip: + * @er: + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || + er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + + error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_alloc_blk - allocates a new block for extended attributes. + * @ip: A pointer to the inode that's getting extended attributes + * @bhp: + * + * Returns: errno + */ + +static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_header *ea; + uint64_t block; + + block = gfs2_alloc_meta(ip); + + *bhp = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(*bhp); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + ea->ea_num_ptrs = 0; + + ip->i_di.di_blocks++; + + return 0; +} + +/** + * ea_write - writes the request info to an ea, creating new blocks if + * necessary + * @ip: inode that is being modified + * @ea: the location of the new ea in a block + * @er: the write request + * + * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags + * + * returns : errno + */ + +static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + struct gfs2_ea_request *er) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + ea->ea_data_len = cpu_to_be32(er->er_data_len); + ea->ea_name_len = er->er_name_len; + ea->ea_type = er->er_type; + ea->__pad = 0; + + memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); + + if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { + ea->ea_num_ptrs = 0; + memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); + } else { + uint64_t *dataptr = GFS2_EA2DATAPTRS(ea); + const char *data = er->er_data; + unsigned int data_len = er->er_data_len; + unsigned int copy; + unsigned int x; + + ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); + for (x = 0; x < ea->ea_num_ptrs; x++) { + struct buffer_head *bh; + uint64_t block; + int mh_size = sizeof(struct gfs2_meta_header); + + block = gfs2_alloc_meta(ip); + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); + + ip->i_di.di_blocks++; + + copy = (data_len > sdp->sd_jbsize) ? sdp->sd_jbsize : + data_len; + memcpy(bh->b_data + mh_size, data, copy); + if (copy < sdp->sd_jbsize) + memset(bh->b_data + mh_size + copy, 0, + sdp->sd_jbsize - copy); + + *dataptr++ = cpu_to_be64((uint64_t)bh->b_blocknr); + data += copy; + data_len -= copy; + + brelse(bh); + } + + gfs2_assert_withdraw(sdp, !data_len); + } + + return 0; +} + +typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, + struct gfs2_ea_request *er, + void *private); + +static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, + unsigned int blks, + ea_skeleton_call_t skeleton_call, + void *private) +{ + struct gfs2_alloc *al; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = blks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), + blks + al->al_rgd->rd_ri.ri_length + + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + error = skeleton_call(ip, er, private); + if (error) + goto out_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + if (er->er_flags & GFS2_ERF_MODE) { + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + (ip->i_di.di_mode & S_IFMT) == + (er->er_mode & S_IFMT)); + ip->i_di.di_mode = er->er_mode; + } + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + out_end_trans: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + out_ipres: + gfs2_inplace_release(ip); + + out_gunlock_q: + gfs2_quota_unlock(ip); + + out: + gfs2_alloc_put(ip); + + return error; +} + +static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct buffer_head *bh; + int error; + + error = ea_alloc_blk(ip, &bh); + if (error) + return error; + + ip->i_di.di_eattr = bh->b_blocknr; + error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); + + brelse(bh); + + return error; +} + +/** + * ea_init - initializes a new eattr block + * @ip: + * @er: + * + * Returns: errno + */ + +static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; + unsigned int blks = 1; + + if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) + blks += DIV_ROUND_UP(er->er_data_len, jbsize); + + return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); +} + +static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) +{ + uint32_t ea_size = GFS2_EA_SIZE(ea); + struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + + ea_size); + uint32_t new_size = GFS2_EA_REC_LEN(ea) - ea_size; + int last = ea->ea_flags & GFS2_EAFLAG_LAST; + + ea->ea_rec_len = cpu_to_be32(ea_size); + ea->ea_flags ^= last; + + new->ea_rec_len = cpu_to_be32(new_size); + new->ea_flags = last; + + return new; +} + +static void ea_set_remove_stuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + uint32_t len; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (!prev || !GFS2_EA_IS_STUFFED(ea)) { + ea->ea_type = GFS2_EATYPE_UNUSED; + return; + } else if (GFS2_EA2NEXT(prev) != ea) { + prev = GFS2_EA2NEXT(prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); + } + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; +} + +struct ea_set { + int ea_split; + + struct gfs2_ea_request *es_er; + struct gfs2_ea_location *es_el; + + struct buffer_head *es_bh; + struct gfs2_ea_header *es_ea; +}; + +static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct ea_set *es) +{ + struct gfs2_ea_request *er = es->es_er; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + ea_write(ip, ea, er); + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (er->er_flags & GFS2_ERF_MODE) { + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT)); + ip->i_di.di_mode = er->er_mode; + } + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + out: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +static int ea_set_simple_alloc(struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private) +{ + struct ea_set *es = private; + struct gfs2_ea_header *ea = es->es_ea; + int error; + + gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + error = ea_write(ip, ea, er); + if (error) + return error; + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + return 0; +} + +static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_set *es = private; + unsigned int size; + int stuffed; + int error; + + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) { + if (GFS2_EA_REC_LEN(ea) < size) + return 0; + if (!GFS2_EA_IS_STUFFED(ea)) { + error = ea_remove_unstuffed(ip, bh, ea, prev, 1); + if (error) + return error; + } + es->ea_split = 0; + } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) + es->ea_split = 1; + else + return 0; + + if (stuffed) { + error = ea_set_simple_noalloc(ip, bh, ea, es); + if (error) + return error; + } else { + unsigned int blks; + + es->es_bh = bh; + es->es_ea = ea; + blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, + GFS2_SB(&ip->i_inode)->sd_jbsize); + + error = ea_alloc_skeleton(ip, es->es_er, blks, + ea_set_simple_alloc, es); + if (error) + return error; + } + + return 1; +} + +static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *indbh, *newbh; + uint64_t *eablk; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + uint64_t *end; + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, + DIO_START | DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (uint64_t *)(indbh->b_data + mh_size); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) + if (!*eablk) + break; + + if (eablk == end) { + error = -ENOSPC; + goto out; + } + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + } else { + uint64_t blk; + + blk = gfs2_alloc_meta(ip); + + indbh = gfs2_meta_new(ip->i_gl, blk); + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(indbh, mh_size); + + eablk = (uint64_t *)(indbh->b_data + mh_size); + *eablk = cpu_to_be64(ip->i_di.di_eattr); + ip->i_di.di_eattr = blk; + ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; + ip->i_di.di_blocks++; + + eablk++; + } + + error = ea_alloc_blk(ip, &newbh); + if (error) + goto out; + + *eablk = cpu_to_be64((uint64_t)newbh->b_blocknr); + error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); + brelse(newbh); + if (error) + goto out; + + if (private) + ea_set_remove_stuffed(ip, (struct gfs2_ea_location *)private); + + out: + brelse(indbh); + + return error; +} + +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + struct gfs2_ea_location *el) +{ + struct ea_set es; + unsigned int blks = 2; + int error; + + memset(&es, 0, sizeof(struct ea_set)); + es.es_er = er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, &es); + if (error > 0) + return 0; + if (error) + return error; + + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); +} + +static int ea_set_remove_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { + el->el_prev = GFS2_EA2NEXT(el->el_prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + GFS2_EA2NEXT(el->el_prev) == el->el_ea); + } + + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); +} + +int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) { + if (er->er_flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, er); + } + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + + if (el.el_ea) { + if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } + + error = -EEXIST; + if (!(er->er_flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, er, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + } else { + error = -ENODATA; + if (!(er->er_flags & XATTR_REPLACE)) + error = ea_set_i(ip, er, NULL); + } + + return error; +} + +int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || + er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + error = ea_check_size(GFS2_SB(&ip->i_inode), er); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + if (IS_IMMUTABLE(&ip->i_inode)) + error = -EPERM; + else + error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (prev) { + uint32_t len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else + ea->ea_type = GFS2_EATYPE_UNUSED; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) + error = ea_remove_stuffed(ip, &el); + else + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, + 0); + + brelse(el.el_bh); + + return error; +} + +/** + * gfs2_ea_remove - sets (or creates or replaces) an extended attribute + * @ip: pointer to the inode of the target file + * @er: request information + * + * Returns: errno + */ + +int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + error = -EPERM; + else + error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_header *ea, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + uint64_t *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bh) + return -ENOMEM; + + error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (error) + goto out; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), + DIO_START, bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto fail; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_reread(sdp, bh[x], DIO_WAIT); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto fail; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto fail; + } + + gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + + memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), + data, + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + + out: + kfree(bh); + + return error; + + fail: + gfs2_trans_end(sdp); + kfree(bh); + + return error; +} + +int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data) +{ + struct buffer_head *dibh; + int error; + + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + memcpy(GFS2_EA2DATA(el->el_ea), + data, + GFS2_EA_DATA_LEN(el->el_ea)); + } else + error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); + + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + error = inode_setattr(&ip->i_inode, attr); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + gfs2_inode_attr_out(ip); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +static int ea_dealloc_indirect(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + struct buffer_head *indbh, *dibh; + uint64_t *eablk, *end; + unsigned int rg_blocks = 0; + uint64_t bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, + DIO_START | DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) { + uint64_t bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + bstart = bn; + blen = 1; + } + blks++; + } + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist_free; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + + eablk = (uint64_t *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + bstart = 0; + blen = 0; + + for (; eablk < end; eablk++) { + uint64_t bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *eablk = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + + out_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); + + out_rlist_free: + gfs2_rlist_free(&rlist); + + out: + brelse(indbh); + + return error; +} + +static int ea_dealloc_block(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd; + struct buffer_head *dibh; + int error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + + RES_STATFS + RES_QUOTA, 1); + if (error) + goto out_gunlock; + + gfs2_free_meta(ip, ip->i_di.di_eattr, 1); + + ip->i_di.di_eattr = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + + out_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); + + return error; +} + +/** + * gfs2_ea_dealloc - deallocate the extended attribute fork + * @ip: the inode + * + * Returns: errno + */ + +int gfs2_ea_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); + if (error) + goto out_rindex; + + if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_rindex; + } + + error = ea_dealloc_block(ip); + + out_rindex: + gfs2_glock_dq_uninit(&al->al_ri_gh); + + out_quota: + gfs2_quota_unhold(ip); + + out_alloc: + gfs2_alloc_put(ip); + + return error; +} + --- /dev/null +++ b/fs/gfs2/eattr.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __EATTR_DOT_H__ +#define __EATTR_DOT_H__ + +#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len) +#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len) + +#define GFS2_EA_SIZE(ea) \ +ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ + ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ + (sizeof(uint64_t) * (ea)->ea_num_ptrs)), 8) + +#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) +#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) + +#define GFS2_EAREQ_SIZE_STUFFED(er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) + +#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ + sizeof(uint64_t) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) + +#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) +#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) + +#define GFS2_EA2DATAPTRS(ea) \ +((uint64_t *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8))) + +#define GFS2_EA2NEXT(ea) \ +((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea))) + +#define GFS2_EA_BH2FIRST(bh) \ +((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) + +#define GFS2_ERF_MODE 0x80000000 + +struct gfs2_ea_request { + char *er_name; + char *er_data; + unsigned int er_name_len; + unsigned int er_data_len; + unsigned int er_type; /* GFS2_EATYPE_... */ + int er_flags; + mode_t er_mode; +}; + +struct gfs2_ea_location { + struct buffer_head *el_bh; + struct gfs2_ea_header *el_ea; + struct gfs2_ea_header *el_prev; +}; + +int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); + +int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); + +int gfs2_ea_dealloc(struct gfs2_inode *ip); + +/* Exported to acl.c */ + +int gfs2_ea_find(struct gfs2_inode *ip, + struct gfs2_ea_request *er, + struct gfs2_ea_location *el); +int gfs2_ea_get_copy(struct gfs2_inode *ip, + struct gfs2_ea_location *el, + char *data); +int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data); + +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return (5 + (ea->ea_name_len + 1)); + case GFS2_EATYPE_SYS: + return (7 + (ea->ea_name_len + 1)); + case GFS2_EATYPE_SECURITY: + return (9 + (ea->ea_name_len + 1)); + default: + return (0); + } +} + +#endif /* __EATTR_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/eaops.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "util.h" + +/** + * gfs2_ea_name2type - get the type of the ea, and truncate type from the name + * @namep: ea name, possibly with type appended + * + * Returns: GFS2_EATYPE_XXX + */ + +unsigned int gfs2_ea_name2type(const char *name, char **truncated_name) +{ + unsigned int type; + + if (strncmp(name, "system.", 7) == 0) { + type = GFS2_EATYPE_SYS; + if (truncated_name) + *truncated_name = strchr(name, '.') + 1; + } else if (strncmp(name, "user.", 5) == 0) { + type = GFS2_EATYPE_USR; + if (truncated_name) + *truncated_name = strchr(name, '.') + 1; + } else if (strncmp(name, "security.", 9) == 0) { + type = GFS2_EATYPE_SECURITY; + if (truncated_name) + *truncated_name = strchr(name, '.') + 1; + } else { + type = GFS2_EATYPE_UNUSED; + if (truncated_name) + *truncated_name = NULL; + } + + return type; +} + +static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_READ, NULL); + if (error) + return error; + + return gfs2_ea_get_i(ip, er); +} + +static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + + if (S_ISREG(inode->i_mode) || + (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + } else + return -EPERM; + + return gfs2_ea_set_i(ip, er); +} + +static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + + if (S_ISREG(inode->i_mode) || + (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + } else + return -EPERM; + + return gfs2_ea_remove_i(ip, er); +} + +static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && + !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && + (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || + GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) + return -EOPNOTSUPP; + + + + return gfs2_ea_get_i(ip, er); +} + +static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + int remove = 0; + int error; + + if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { + if (!(er->er_flags & GFS2_ERF_MODE)) { + er->er_mode = ip->i_di.di_mode; + er->er_flags |= GFS2_ERF_MODE; + } + error = gfs2_acl_validate_set(ip, 1, er, + &remove, &er->er_mode); + if (error) + return error; + error = gfs2_ea_set_i(ip, er); + if (error) + return error; + if (remove) + gfs2_ea_remove_i(ip, er); + return 0; + + } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { + error = gfs2_acl_validate_set(ip, 0, er, + &remove, NULL); + if (error) + return error; + if (!remove) + error = gfs2_ea_set_i(ip, er); + else { + error = gfs2_ea_remove_i(ip, er); + if (error == -ENODATA) + error = 0; + } + return error; + } + + return -EPERM; +} + +static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { + int error = gfs2_acl_validate_remove(ip, 1); + if (error) + return error; + + } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { + int error = gfs2_acl_validate_remove(ip, 0); + if (error) + return error; + + } else + return -EPERM; + + return gfs2_ea_remove_i(ip, er); +} + +static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_READ, NULL); + if (error) + return error; + + return gfs2_ea_get_i(ip, er); +} + +static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + + return gfs2_ea_set_i(ip, er); +} + +static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + + return gfs2_ea_remove_i(ip, er); +} + +static struct gfs2_eattr_operations gfs2_user_eaops = { + .eo_get = user_eo_get, + .eo_set = user_eo_set, + .eo_remove = user_eo_remove, + .eo_name = "user", +}; + +struct gfs2_eattr_operations gfs2_system_eaops = { + .eo_get = system_eo_get, + .eo_set = system_eo_set, + .eo_remove = system_eo_remove, + .eo_name = "system", +}; + +static struct gfs2_eattr_operations gfs2_security_eaops = { + .eo_get = security_eo_get, + .eo_set = security_eo_set, + .eo_remove = security_eo_remove, + .eo_name = "security", +}; + +struct gfs2_eattr_operations *gfs2_ea_ops[] = { + NULL, + &gfs2_user_eaops, + &gfs2_system_eaops, + &gfs2_security_eaops, +}; + --- /dev/null +++ b/fs/gfs2/eaops.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __EAOPS_DOT_H__ +#define __EAOPS_DOT_H__ + +struct gfs2_ea_request; + +struct gfs2_eattr_operations { + int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + char *eo_name; +}; + +unsigned int gfs2_ea_name2type(const char *name, char **truncated_name); + +extern struct gfs2_eattr_operations gfs2_system_eaops; + +extern struct gfs2_eattr_operations *gfs2_ea_ops[]; + +#endif /* __EAOPS_DOT_H__ */ + [PATCH 10/16] GFS2: Logging and recovery The journaling and recovery code for GFS2 Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/log.c | 603 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/log.h | 61 +++++ fs/gfs2/recovery.c | 573 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/recovery.h | 32 ++ 4 files changed, 1269 insertions(+) --- /dev/null +++ b/fs/gfs2/log.c @@ -0,0 +1,603 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "util.h" +#include "dir.h" + +#define PULL 1 + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. + * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / + ssize; + + if (nstruct > first) { + second = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +{ + struct list_head *head = &sdp->sd_ail1_list; + uint64_t sync_gen; + struct list_head *first, *tmp; + struct gfs2_ail *first_ai, *ai; + + gfs2_log_lock(sdp); + if (list_empty(head)) { + gfs2_log_unlock(sdp); + return; + } + sync_gen = sdp->sd_ail_sync_gen++; + + first = head->prev; + first_ai = list_entry(first, struct gfs2_ail, ai_list); + first_ai->ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, first_ai); + + if (flags & DIO_ALL) + first = NULL; + + for (;;) { + if (first && (head->prev != first || + gfs2_ail1_empty_one(sdp, first_ai, 0))) + break; + + for (tmp = head->prev; tmp != head; tmp = tmp->prev) { + ai = list_entry(tmp, struct gfs2_ail, ai_list); + if (ai->ai_sync_gen >= sync_gen) + continue; + ai->ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, ai); + break; + } + + if (tmp == head) + break; + } + + gfs2_log_unlock(sdp); +} + +int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +{ + struct gfs2_ail *ai, *s; + int ret; + + gfs2_log_lock(sdp); + + list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { + if (gfs2_ail1_empty_one(sdp, ai, flags)) + list_move(&ai->ai_list, &sdp->sd_ail2_list); + else if (!(flags & DIO_ALL)) + break; + } + + ret = list_empty(&sdp->sd_ail1_list); + + gfs2_log_unlock(sdp); + + return ret; +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct gfs2_ail *ai, *safe; + unsigned int old_tail = sdp->sd_log_tail; + int wrap = (new_tail < old_tail); + int a, b, rm; + + gfs2_log_lock(sdp); + + list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { + a = (old_tail <= ai->ai_first); + b = (ai->ai_first < new_tail); + rm = (wrap) ? (a || b) : (a && b); + if (!rm) + continue; + + gfs2_ail2_empty_one(sdp, ai); + list_del(&ai->ai_list); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); + kfree(ai); + } + + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * + * Returns: errno + */ + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +{ + unsigned int try = 0; + + if (gfs2_assert_warn(sdp, blks) || + gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; + + mutex_lock(&sdp->sd_log_reserve_mutex); + gfs2_log_lock(sdp); + while(sdp->sd_log_blks_free <= blks) { + gfs2_log_unlock(sdp); + gfs2_ail1_empty(sdp, 0); + gfs2_log_flush(sdp, NULL); + + if (try++) + gfs2_ail1_start(sdp, 0); + gfs2_log_lock(sdp); + } + sdp->sd_log_blks_free -= blks; + /* printk(KERN_INFO "reserved %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */ + gfs2_log_unlock(sdp); + mutex_unlock(&sdp->sd_log_reserve_mutex); + + down_read(&sdp->sd_log_flush_lock); + + return 0; +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + gfs2_log_lock(sdp); + sdp->sd_log_blks_free += blks; + /* printk(KERN_INFO "released %u blocks (%u left)\n", blks, sdp->sd_log_blks_free); */ + gfs2_assert_withdraw(sdp, + sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + gfs2_log_unlock(sdp); + up_read(&sdp->sd_log_flush_lock); +} + +static uint64_t log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +{ + int new = 0; + uint64_t dbn; + int error; + int bdy; + + error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, &new, &dbn, &bdy); + if (!(!error && dbn)) { + printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, (unsigned long long)dbn, lbn); + } + gfs2_assert_withdraw(sdp, !error && dbn); + + return dbn; +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, + unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +static unsigned int current_tail(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + unsigned int tail; + + gfs2_log_lock(sdp); + + if (list_empty(&sdp->sd_ail1_list)) + tail = sdp->sd_log_head; + else { + ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, + ai_list); + tail = ai->ai_first; + } + + gfs2_log_unlock(sdp); + + return tail; +} + +static inline void log_incr_head(struct gfs2_sbd *sdp) +{ + if (sdp->sd_log_flush_head == sdp->sd_log_tail) + gfs2_assert_withdraw(sdp, + sdp->sd_log_flush_head == sdp->sd_log_head); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * Returns: the buffer_head + */ + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ + uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct gfs2_log_buf *lb; + struct buffer_head *bh; + + lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); + list_add(&lb->lb_list, &sdp->sd_log_flush_list); + + bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + log_incr_head(sdp); + + return bh; +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real) +{ + uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct gfs2_log_buf *lb; + struct buffer_head *bh; + + lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); + list_add(&lb->lb_list, &sdp->sd_log_flush_list); + lb->lb_real = real; + + bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + atomic_set(&bh->b_count, 1); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate); + set_bh_page(bh, real->b_page, bh_offset(real)); + bh->b_blocknr = blkno; + bh->b_size = sdp->sd_sb.sb_bsize; + bh->b_bdev = sdp->sd_vfs->s_bdev; + + log_incr_head(sdp); + + return bh; +} + +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull) +{ + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + + ail2_empty(sdp, new_tail); + + gfs2_log_lock(sdp); + sdp->sd_log_blks_free += dist - ((pull) ? 1 : 0); + /* printk(KERN_INFO "pull tail refunding %u blocks (%u left) pull=%d\n", dist - ((pull) ? 1 : 0), sdp->sd_log_blks_free, pull); */ + gfs2_assert_withdraw(sdp, + sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + gfs2_log_unlock(sdp); + + sdp->sd_log_tail = new_tail; +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, uint32_t flags, int pull) +{ + uint64_t blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + struct gfs2_log_header *lh; + unsigned int tail; + uint32_t hash; + + /* printk(KERN_INFO "log write header start (flags=%08x, pull=%d)\n", flags, pull); */ + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + gfs2_ail1_empty(sdp, 0); + tail = current_tail(sdp); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail, pull); + else + gfs2_assert_withdraw(sdp, !pull); + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + log_incr_head(sdp); + + /* printk(KERN_INFO "log write header out\n"); */ +} + +static void log_flush_commit(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_log_flush_list; + struct gfs2_log_buf *lb; + struct buffer_head *bh; +#if 0 + unsigned int d; + + d = log_distance(sdp, sdp->sd_log_flush_head, sdp->sd_log_head); + + gfs2_assert_withdraw(sdp, d + 1 == sdp->sd_log_blks_reserved); +#endif + + while (!list_empty(head)) { + lb = list_entry(head->next, struct gfs2_log_buf, lb_list); + list_del(&lb->lb_list); + bh = lb->lb_bh; + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + if (lb->lb_real) { + while (atomic_read(&bh->b_count) != 1) /* Grrrr... */ + schedule(); + free_buffer_head(bh); + } else + brelse(bh); + kfree(lb); + } + + log_write_header(sdp, 0, 0); +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: the filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +{ + struct gfs2_ail *ai; + + down_write(&sdp->sd_log_flush_lock); + + if (gl) { + gfs2_log_lock(sdp); + if (list_empty(&gl->gl_le.le_list)) { + gfs2_log_unlock(sdp); + up_write(&sdp->sd_log_flush_lock); + return; + } + gfs2_log_unlock(sdp); + } + + ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&ai->ai_ail1_list); + INIT_LIST_HEAD(&ai->ai_ail2_list); + + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_buf == sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + ai->ai_first = sdp->sd_log_flush_head; + + lops_before_commit(sdp); + if (!list_empty(&sdp->sd_log_flush_list)) + log_flush_commit(sdp); + else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle) + log_write_header(sdp, 0, PULL); + lops_after_commit(sdp, ai); + sdp->sd_log_head = sdp->sd_log_flush_head; + + /* printk(KERN_INFO "sd_log_num_hdrs %u\n", sdp->sd_log_num_hdrs); */ + sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs; + + sdp->sd_log_blks_reserved = + sdp->sd_log_commited_buf = + sdp->sd_log_num_hdrs = + sdp->sd_log_commited_revoke = 0; + + gfs2_log_lock(sdp); + if (!list_empty(&ai->ai_ail1_list)) { + list_add(&ai->ai_list, &sdp->sd_ail1_list); + ai = NULL; + } + gfs2_log_unlock(sdp); + + sdp->sd_vfs->s_dirt = 0; + up_write(&sdp->sd_log_flush_lock); + + kfree(ai); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved = 0; + unsigned int old; + + gfs2_log_lock(sdp); + + sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0); + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; + gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); + + if (sdp->sd_log_commited_buf) + reserved += sdp->sd_log_commited_buf; + if (sdp->sd_log_commited_revoke) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(uint64_t)); + if (reserved) + reserved++; + + old = sdp->sd_log_blks_free; + sdp->sd_log_blks_free += tr->tr_reserved - + (reserved - sdp->sd_log_blks_reserved); + + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old); + gfs2_assert_withdraw(sdp, + sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks + + sdp->sd_log_num_hdrs); + + sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * Returns: errno + */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + lops_incore_commit(sdp, tr); + + sdp->sd_vfs->s_dirt = 1; + up_read(&sdp->sd_log_flush_lock); + + gfs2_log_lock(sdp); + if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) { + gfs2_log_unlock(sdp); + gfs2_log_flush(sdp, NULL); + } else + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + down_write(&sdp->sd_log_flush_lock); + + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0); + + /* printk(KERN_INFO "sd_log_blks_free %u, sd_jdesc->jd_blocks %u\n", sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); */ + gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); + + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_tail = sdp->sd_log_head; + + up_write(&sdp->sd_log_flush_lock); +} + --- /dev/null +++ b/fs/gfs2/log.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_head = sdp->sd_log_tail = value; +} + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); + +void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags); +int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags); + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real); +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); + +void gfs2_log_shutdown(struct gfs2_sbd *sdp); + +#endif /* __LOG_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/recovery.c @@ -0,0 +1,573 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "lm.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + int new = 0; + uint64_t dblock; + uint32_t extlen; + int error; + + error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + gfs2_meta_ra(gl, dblock, extlen); + error = gfs2_meta_read(gl, dblock, DIO_START | DIO_WAIT, bh); + + return error; +} + +int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + list_for_each_entry(rr, head, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr; + int wrap, a, b, revoke; + int found = 0; + + list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < sdp->sd_replay_tail); + a = (sdp->sd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? (a || b) : (a && b); + + return revoke; +} + +void gfs2_revoke_clean(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!list_empty(head)) { + rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + list_del(&rr->rr_list); + kfree(rr); + } +} + +/** + * get_log_header - read the log header for a given segment + * @jd: the journal + * @blk: the block to look at + * @lh: the log header to return + * + * Read the log header for a given segement in a given journal. Do a few + * sanity checks on it. + * + * Returns: 0 on success, + * 1 if the header was invalid or incomplete, + * errno on error + */ + +static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, + struct gfs2_log_header *head) +{ + struct buffer_head *bh; + struct gfs2_log_header lh; + uint32_t hash; + int error; + + error = gfs2_replay_read_block(jd, blk, &bh); + if (error) + return error; + + memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header)); + lh.lh_hash = 0; + hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header)); + gfs2_log_header_in(&lh, bh->b_data); + + brelse(bh); + + if (lh.lh_header.mh_magic != GFS2_MAGIC || + lh.lh_header.mh_type != GFS2_METATYPE_LH || + lh.lh_blkno != blk || lh.lh_hash != hash) + return 1; + + *head = lh; + + return 0; +} + +/** + * find_good_lh - find a good log header + * @jd: the journal + * @blk: the segment to start searching from + * @lh: the log header to fill in + * @forward: if true search forward in the log, else search backward + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, either scan forward or backward until we find a good one. + * + * Returns: errno + */ + +static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, + struct gfs2_log_header *head) +{ + unsigned int orig_blk = *blk; + int error; + + for (;;) { + error = get_log_header(jd, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd->jd_blocks) + *blk = 0; + + if (*blk == orig_blk) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @jd: the journal + * @head: this is filled in with the log descriptor of the head + * + * At this point, seg and lh should be either the head of the log or just + * before. Scan forward until we find the head. + * + * Returns: errno + */ + +static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + unsigned int blk = head->lh_blkno; + struct gfs2_log_header lh; + int error; + + for (;;) { + if (++blk == jd->jd_blocks) + blk = 0; + + error = get_log_header(jd, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: the journal + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e. the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + struct gfs2_log_header lh_1, lh_m; + uint32_t blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @jd: the journal + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. + * + * Returns: errno + */ + +static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, + unsigned int end, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + int error = 0; + u32 length; + __be64 *ptr; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + offset += (sizeof(__be64)-1); + offset &= ~(sizeof(__be64)-1); + + while (start != end) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + if (gfs2_meta_check(sdp, bh)) { + brelse(bh); + return -EIO; + } + ld = (struct gfs2_log_descriptor *)bh->b_data; + length = be32_to_cpu(ld->ld_length); + + if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { + struct gfs2_log_header lh; + error = get_log_header(jd, start, &lh); + if (!error) { + gfs2_replay_incr_blk(sdp, &start); + brelse(bh); + continue; + } + if (error == 1) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + error = -EIO; + } + brelse(bh); + return error; + } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { + brelse(bh); + return -EIO; + } + ptr = (__be64 *)(bh->b_data + offset); + error = lops_scan_elements(jd, start, ld, ptr, pass); + if (error) { + brelse(bh); + return error; + } + + while (length--) + gfs2_replay_incr_blk(sdp, &start); + + brelse(bh); + } + + return 0; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @sdp: the filesystem + * @jd: the journal + * @gl: the journal's glock + * @head: the head journal to start from + * + * Returns: errno + */ + +static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int lblock; + int new = 0; + uint64_t dblock; + struct gfs2_log_header *lh; + uint32_t hash; + struct buffer_head *bh; + int error; + int boundary; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(sdp, &lblock); + error = gfs2_block_map(&ip->i_inode, lblock, &new, &dblock, &boundary); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + bh = sb_getblk(sdp->sd_vfs, dblock); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + return error; +} + +/** + * gfs2_recover_journal - recovery a given journal + * @jd: the struct gfs2_jdesc describing the journal + * + * Acquire the journal's lock, check to see if the journal is clean, and + * do recovery if necessary. + * + * Returns: errno + */ + +int gfs2_recover_journal(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header head; + struct gfs2_holder j_gh, ji_gh, t_gh; + unsigned long t; + int ro = 0; + unsigned int pass; + int error; + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + jd->jd_jid); + + /* Aquire the journal lock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + + default: + goto fail; + }; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head); + if (error) + goto fail_gunlock_ji; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", + jd->jd_jid); + + t = jiffies; + + /* Acquire a shared hold on the transaction lock */ + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY | + GL_NOCANCEL | GL_NOCACHE, &t_gh); + if (error) + goto fail_gunlock_ji; + + if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sdp->sd_vfs->s_flags & MS_RDONLY) + ro = 1; + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n", + jd->jd_jid); + error = -EROFS; + goto fail_gunlock_tr; + } + + fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); + + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + lops_after_scan(jd, error, pass); + if (error) + goto fail_gunlock_tr; + } + + error = clean_journal(jd, &head); + if (error) + goto fail_gunlock_tr; + + gfs2_glock_dq_uninit(&t_gh); + t = DIV_ROUND_UP(jiffies - t, HZ); + fs_info(sdp, "jid=%u: Journal replayed in %lus\n", + jd->jd_jid, t); + } + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + gfs2_glock_dq_uninit(&ji_gh); + + gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + gfs2_glock_dq_uninit(&j_gh); + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + return 0; + +fail_gunlock_tr: + gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_ji: + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); + +fail: + gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); + return error; +} + +/** + * gfs2_check_journals - Recover any dirty journals + * @sdp: the filesystem + * + */ + +void gfs2_check_journals(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + + for (;;) { + jd = gfs2_jdesc_find_dirty(sdp); + if (!jd) + break; + + if (jd != sdp->sd_jdesc) + gfs2_recover_journal(jd); + } +} + --- /dev/null +++ b/fs/gfs2/recovery.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +{ + if (++*blk == sdp->sd_jdesc->jd_blocks) + *blk = 0; +} + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +int gfs2_revoke_add(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where); +int gfs2_revoke_check(struct gfs2_sbd *sdp, uint64_t blkno, unsigned int where); +void gfs2_revoke_clean(struct gfs2_sbd *sdp); + +int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header *head); +int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); +void gfs2_check_journals(struct gfs2_sbd *sdp); + +#endif /* __RECOVERY_DOT_H__ */ + [PATCH 11/16] GFS2: Quota and LVB handling Lock value block handling and quotas. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/lvb.c | 45 + fs/gfs2/lvb.h | 19 fs/gfs2/quota.c | 1286 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/quota.h | 32 + 4 files changed, 1382 insertions(+) --- /dev/null +++ b/fs/gfs2/quota.c @@ -0,0 +1,1286 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. This + * avoids the bottleneck of constantly touching the quota file, but introduces + * fuzziness in the current usage value of IDs that are being used on different + * nodes in the cluster simultaneously. So, it is possible for a user on + * multiple nodes to overrun their quota, but that overrun is controlable. + * Since quota tags are part of transactions, there is no need to a quota check + * program to be run on node crashes or anything like that. + * + * There are couple of knobs that let the administrator manage the quota + * fuzziness. "quota_quantum" sets the maximum time a quota change can be + * sitting on one node before being synced to the quota file. (The default is + * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency + * of quota file syncs increases as the user moves closer to their limit. The + * more frequent the syncs, the more accurate the quota enforcement, but that + * means that there is more contention between the nodes for the quota file. + * The default value is one. This sets the maximum theoretical quota overrun + * (with infinite node with infinite bandwidth) to twice the user's limit. (In + * practice, the maximum overrun you see should be much less.) A "quota_scale" + * number greater than one makes quota syncs more frequent and reduces the + * maximum overrun. Numbers less than one (but greater than zero) make quota + * syncs less frequent. + * + * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of + * the quota file, so it is not being constantly read. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "lvb.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "inode.h" +#include "ops_file.h" +#include "ops_address.h" +#include "util.h" + +#define QUOTA_USER 1 +#define QUOTA_GROUP 0 + +static uint64_t qd2offset(struct gfs2_quota_data *qd) +{ + uint64_t offset; + + offset = 2 * (uint64_t)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static int qd_alloc(struct gfs2_sbd *sdp, int user, uint32_t id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); + if (!qd) + return -ENOMEM; + + qd->qd_count = 1; + qd->qd_id = id; + if (user) + set_bit(QDF_USER, &qd->qd_flags); + qd->qd_slot = -1; + + error = gfs2_glock_get(sdp, 2 * (uint64_t)id + !user, + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + error = gfs2_lvb_hold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); + if (error) + goto fail; + + *qdp = qd; + + return 0; + + fail: + kfree(qd); + return error; +} + +static int qd_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *new_qd = NULL; + int error, found; + + *qdp = NULL; + + for (;;) { + found = 0; + spin_lock(&sdp->sd_quota_spin); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (qd->qd_id == id && + !test_bit(QDF_USER, &qd->qd_flags) == !user) { + qd->qd_count++; + found = 1; + break; + } + } + + if (!found) + qd = NULL; + + if (!qd && new_qd) { + qd = new_qd; + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + new_qd = NULL; + } + + spin_unlock(&sdp->sd_quota_spin); + + if (qd || !create) { + if (new_qd) { + gfs2_lvb_unhold(new_qd->qd_gl); + kfree(new_qd); + } + *qdp = qd; + return 0; + } + + error = qd_alloc(sdp, user, id, &new_qd); + if (error) + return error; + } +} + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_count); + qd->qd_count++; + spin_unlock(&sdp->sd_quota_spin); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_count); + if (!--qd->qd_count) + qd->qd_last_touched = jiffies; + spin_unlock(&sdp->sd_quota_spin); +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + unsigned int c, o = 0, b; + unsigned char byte = 0; + + spin_lock(&sdp->sd_quota_spin); + + if (qd->qd_slot_count++) { + spin_unlock(&sdp->sd_quota_spin); + return 0; + } + + for (c = 0; c < sdp->sd_quota_chunks; c++) + for (o = 0; o < PAGE_SIZE; o++) { + byte = sdp->sd_quota_bitmap[c][o]; + if (byte != 0xFF) + goto found; + } + + goto fail; + + found: + for (b = 0; b < 8; b++) + if (!(byte & (1 << b))) + break; + qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; + + if (qd->qd_slot >= sdp->sd_quota_slots) + goto fail; + + sdp->sd_quota_bitmap[c][o] |= 1 << b; + + spin_unlock(&sdp->sd_quota_spin); + + return 0; + + fail: + qd->qd_slot_count--; + spin_unlock(&sdp->sd_quota_spin); + return -ENOSPC; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + spin_unlock(&sdp->sd_quota_spin); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_slot_count); + if (!--qd->qd_slot_count) { + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + qd->qd_slot = -1; + } + spin_unlock(&sdp->sd_quota_spin); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int block, offset; + uint64_t dblock; + int new = 0; + struct buffer_head *bh; + int error; + int boundary; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block;; + + error = gfs2_block_map(&ip->i_inode, block, &new, &dblock, &boundary); + if (error) + goto fail; + error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_lock(&sdp->sd_quota_mutex); + + return 0; + + fail_brelse: + brelse(bh); + + fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL; + int error; + int found = 0; + + *qdp = NULL; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&sdp->sd_quota_spin); + + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + qd->qd_sync_gen >= sdp->sd_quota_sync_gen) + continue; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, qd->qd_count); + qd->qd_count++; + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + found = 1; + + break; + } + + if (!found) + qd = NULL; + + spin_unlock(&sdp->sd_quota_spin); + + if (qd) { + gfs2_assert_warn(sdp, qd->qd_change_sync); + error = bh_get(qd); + if (error) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; + } + } + + *qdp = qd; + + return 0; +} + +static int qd_trylock(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&sdp->sd_quota_spin); + + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) { + spin_unlock(&sdp->sd_quota_spin); + return 0; + } + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, qd->qd_count); + qd->qd_count++; + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + + spin_unlock(&sdp->sd_quota_spin); + + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return 0; + } + + return 1; +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_gl->gl_sbd, + test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, int user, uint32_t id, int create, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, user, id, create, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + + fail_slot: + slot_put(*qdp); + + fail: + qd_put(*qdp); + return error; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data **qd = al->al_qd; + int error; + + if (gfs2_assert_warn(sdp, !al->al_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) + return -EIO; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) { + error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + + if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) { + error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + + out: + if (error) + gfs2_quota_unhold(ip); + + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + unsigned int x; + + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < al->al_qd_num; x++) { + qdsb_put(al->al_qd[x]); + al->al_qd[x] = NULL; + } + al->al_qd_num = 0; +} + +static int sort_qd(const void *a, const void *b) +{ + struct gfs2_quota_data *qd_a = *(struct gfs2_quota_data **)a; + struct gfs2_quota_data *qd_b = *(struct gfs2_quota_data **)b; + int ret = 0; + + if (!test_bit(QDF_USER, &qd_a->qd_flags) != + !test_bit(QDF_USER, &qd_b->qd_flags)) { + if (test_bit(QDF_USER, &qd_a->qd_flags)) + ret = -1; + else + ret = 1; + } else { + if (qd_a->qd_id < qd_b->qd_id) + ret = -1; + else if (qd_a->qd_id > qd_b->qd_id) + ret = 1; + } + + return ret; +} + +static void do_qc(struct gfs2_quota_data *qd, int64_t change) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + int64_t x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (test_bit(QDF_USER, &qd->qd_flags)) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(qd->qd_id); + } + + x = qc->qc_change; + x = be64_to_cpu(x) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&sdp->sd_quota_spin); + qd->qd_change = x; + spin_unlock(&sdp->sd_quota_spin); + + if (!x) { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + + mutex_unlock(&sdp->sd_quota_mutex); +} + +/** + * gfs2_adjust_quota + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + */ +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + int64_t change, struct gfs2_quota_data *qd) +{ + struct inode *inode = &ip->i_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SHIFT - 1); + unsigned blocksize, iblock, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr; + __be64 *ptr; + u64 value; + int err = -EIO; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_get_block(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock; + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock; + } + + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + ptr = (__be64 *)(kaddr + offset); + value = *ptr = cpu_to_be64(be64_to_cpu(*ptr) + change); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + err = 0; + qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); +#if 0 + qd->qd_qb.qb_limit = cpu_to_be64(q.qu_limit); + qd->qd_qb.qb_warn = cpu_to_be64(q.qu_warn); +#endif + qd->qd_qb.qb_value = cpu_to_be64(value); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + loff_t offset; + unsigned int nalloc = 0; + struct gfs2_alloc *al = NULL; + int error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, + LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out; + + for (x = 0; x < num_qd; x++) { + int alloc_required; + + offset = qd2offset(qda[x]); + error = gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota), + &alloc_required); + if (error) + goto out_gunlock; + if (alloc_required) + nalloc++; + } + + if (nalloc) { + al = gfs2_alloc_get(ip); + + al->al_requested = nalloc * (data_blocks + ind_blocks); + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + + error = gfs2_trans_begin(sdp, + al->al_rgd->rd_ri.ri_length + + num_qd * data_blocks + + nalloc * ind_blocks + + RES_DINODE + num_qd + + RES_STATFS, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, + num_qd * data_blocks + + RES_DINODE + num_qd, 0); + if (error) + goto out_gunlock; + } + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, + (struct gfs2_quota_data *) + qd->qd_gl->gl_lvb); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync); + } + + error = 0; + + out_end_trans: + gfs2_trans_end(sdp); + + out_ipres: + if (nalloc) + gfs2_inplace_release(ip); + + out_alloc: + if (nalloc) + gfs2_alloc_put(ip); + + out_gunlock: + gfs2_glock_dq_uninit(&i_gh); + + out: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + + return error; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + struct gfs2_quota q; + char buf[sizeof(struct gfs2_quota)]; + struct file_ra_state ra_state; + int error; + + file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping); + restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + gfs2_quota_lvb_in(&qd->qd_qb, qd->qd_gl->gl_lvb); + + if (force_refresh || qd->qd_qb.qb_magic != GFS2_MAGIC) { + loff_t pos; + gfs2_glock_dq_uninit(q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, + LM_ST_EXCLUSIVE, GL_NOCACHE, + q_gh); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_SHARED, 0, + &i_gh); + if (error) + goto fail; + + memset(buf, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, &ra_state, buf, + &pos, sizeof(struct gfs2_quota)); + if (error < 0) + goto fail_gunlock; + + gfs2_glock_dq_uninit(&i_gh); + + gfs2_quota_in(&q, buf); + + memset(&qd->qd_qb, 0, sizeof(struct gfs2_quota_lvb)); + qd->qd_qb.qb_magic = GFS2_MAGIC; + qd->qd_qb.qb_limit = q.qu_limit; + qd->qd_qb.qb_warn = q.qu_warn; + qd->qd_qb.qb_value = q.qu_value; + + gfs2_quota_lvb_out(&qd->qd_qb, qd->qd_gl->gl_lvb); + + if (gfs2_glock_is_blocking(qd->qd_gl)) { + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; + } + } + + return 0; + + fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); + + fail: + gfs2_glock_dq_uninit(q_gh); + + return error; +} + +int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + unsigned int x; + int error = 0; + + gfs2_quota_hold(ip, uid, gid); + + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *), + sort_qd, NULL); + + for (x = 0; x < al->al_qd_num; x++) { + error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); + if (error) + break; + } + + if (!error) + set_bit(GIF_QD_LOCKED, &ip->i_flags); + else { + while (x--) + gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_quota_unhold(ip); + } + + return error; +} + +static int need_sync(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_tune *gt = &sdp->sd_tune; + int64_t value; + unsigned int num, den; + int do_sync = 1; + + if (!qd->qd_qb.qb_limit) + return 0; + + spin_lock(&sdp->sd_quota_spin); + value = qd->qd_change; + spin_unlock(&sdp->sd_quota_spin); + + spin_lock(>->gt_spin); + num = gt->gt_quota_scale_num; + den = gt->gt_quota_scale_den; + spin_unlock(>->gt_spin); + + if (value < 0) + do_sync = 0; + else if (qd->qd_qb.qb_value >= (int64_t)qd->qd_qb.qb_limit) + do_sync = 0; + else { + value *= gfs2_jindex_size(sdp) * num; + do_div(value, den); + value += qd->qd_qb.qb_value; + if (value < (int64_t)qd->qd_qb.qb_limit) + do_sync = 0; + } + + return do_sync; +} + +void gfs2_quota_unlock(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qda[4]; + unsigned int count = 0; + unsigned int x; + + if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) + goto out; + + for (x = 0; x < al->al_qd_num; x++) { + struct gfs2_quota_data *qd; + int sync; + + qd = al->al_qd[x]; + sync = need_sync(qd); + + gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + + if (sync && qd_trylock(qd)) + qda[count++] = qd; + } + + if (count) { + do_sync(count, qda); + for (x = 0; x < count; x++) + qd_unlock(qda[x]); + } + + out: + gfs2_quota_unhold(ip); +} + +#define MAX_LINE 256 + +static int print_message(struct gfs2_quota_data *qd, char *type) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", + sdp->sd_fsname, type, + (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", + qd->qd_id); + + return 0; +} + +int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qd; + int64_t value; + unsigned int x; + int error = 0; + + if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) + return 0; + + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) + continue; + + value = qd->qd_qb.qb_value; + spin_lock(&sdp->sd_quota_spin); + value += qd->qd_change; + spin_unlock(&sdp->sd_quota_spin); + + if (qd->qd_qb.qb_limit && (int64_t)qd->qd_qb.qb_limit < value) { + print_message(qd, "exceeded"); + error = -EDQUOT; + break; + } else if (qd->qd_qb.qb_warn && + (int64_t)qd->qd_qb.qb_warn < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, + gt_quota_warn_period) * HZ)) { + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, int64_t change, + uint32_t uid, uint32_t gid) +{ + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + unsigned int found = 0; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + return; + if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) + return; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + do_qc(qd, change); + found++; + } + } +} + +int gfs2_quota_sync(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data **qda; + unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int num_qd; + unsigned int x; + int error = 0; + + sdp->sd_quota_sync_gen++; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + do { + num_qd = 0; + + for (;;) { + error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + if (!error) + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = + sdp->sd_quota_sync_gen; + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + kfree(qda); + + return error; +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, user, id, CREATE, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + + return error; +} + +#if 0 +int gfs2_quota_read(struct gfs2_sbd *sdp, int user, uint32_t id, + struct gfs2_quota *q) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + if (((user) ? (id != current->fsuid) : (!in_group_p(id))) && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + error = qd_get(sdp, user, id, CREATE, &qd); + if (error) + return error; + + error = do_glock(qd, NO_FORCE, &q_gh); + if (error) + goto out; + + memset(q, 0, sizeof(struct gfs2_quota)); + q->qu_limit = qd->qd_qb.qb_limit; + q->qu_warn = qd->qd_qb.qb_warn; + q->qu_value = qd->qd_qb.qb_value; + + spin_lock(&sdp->sd_quota_spin); + q->qu_value += qd->qd_change; + spin_unlock(&sdp->sd_quota_spin); + + gfs2_glock_dq_uninit(&q_gh); + + out: + qd_put(qd); + + return error; +} +#endif /* 0 */ + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + uint64_t dblock; + uint32_t extlen = 0; + int error; + + if (!ip->i_di.di_size || + ip->i_di.di_size > (64 << 20) || + ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { + gfs2_consist_inode(ip); + return -EIO; + } + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); + + error = -ENOMEM; + + sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, + sizeof(unsigned char *), GFP_KERNEL); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < sdp->sd_quota_chunks; x++) { + sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sdp->sd_quota_bitmap[x]) + goto fail; + } + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + unsigned int y; + + if (!extlen) { + int new = 0; + error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + if (error) + goto fail; + } + gfs2_meta_ra(ip->i_gl, dblock, extlen); + error = gfs2_meta_read(ip->i_gl, dblock, DIO_START | DIO_WAIT, + &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + for (y = 0; + y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_change qc; + struct gfs2_quota_data *qd; + + gfs2_quota_change_in(&qc, bh->b_data + + sizeof(struct gfs2_meta_header) + + y * sizeof(struct gfs2_quota_change)); + if (!qc.qc_change) + continue; + + error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), + qc.qc_id, &qd); + if (error) { + brelse(bh); + goto fail; + } + + set_bit(QDF_CHANGE, &qd->qd_flags); + qd->qd_change = qc.qc_change; + qd->qd_slot = slot; + qd->qd_slot_count = 1; + qd->qd_last_touched = jiffies; + + spin_lock(&sdp->sd_quota_spin); + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + spin_unlock(&sdp->sd_quota_spin); + + found++; + } + + brelse(bh); + dblock++; + extlen--; + } + + if (found) + fs_info(sdp, "found %u quota changes\n", found); + + return 0; + + fail: + gfs2_quota_cleanup(sdp); + return error; +} + +void gfs2_quota_scan(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data *qd, *safe; + LIST_HEAD(dead); + + spin_lock(&sdp->sd_quota_spin); + list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) { + if (!qd->qd_count && + time_after_eq(jiffies, qd->qd_last_touched + + gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) { + list_move(&qd->qd_list, &dead); + gfs2_assert_warn(sdp, + atomic_read(&sdp->sd_quota_count) > 0); + atomic_dec(&sdp->sd_quota_count); + } + } + spin_unlock(&sdp->sd_quota_spin); + + while (!list_empty(&dead)) { + qd = list_entry(dead.next, struct gfs2_quota_data, qd_list); + list_del(&qd->qd_list); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_lvb_unhold(qd->qd_gl); + kfree(qd); + } +} + +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + unsigned int x; + + spin_lock(&sdp->sd_quota_spin); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + + if (qd->qd_count > 1 || + (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { + list_move(&qd->qd_list, head); + spin_unlock(&sdp->sd_quota_spin); + schedule(); + spin_lock(&sdp->sd_quota_spin); + continue; + } + + list_del(&qd->qd_list); + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&sdp->sd_quota_spin); + + if (!qd->qd_count) { + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + } else + gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_lvb_unhold(qd->qd_gl); + kfree(qd); + + spin_lock(&sdp->sd_quota_spin); + } + spin_unlock(&sdp->sd_quota_spin); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + if (sdp->sd_quota_bitmap) { + for (x = 0; x < sdp->sd_quota_chunks; x++) + kfree(sdp->sd_quota_bitmap[x]); + kfree(sdp->sd_quota_bitmap); + } +} + --- /dev/null +++ b/fs/gfs2/quota.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +#define NO_QUOTA_CHANGE ((uint32_t)-1) + +int gfs2_quota_hold(struct gfs2_inode *ip, uint32_t uid, uint32_t gid); +void gfs2_quota_unhold(struct gfs2_inode *ip); + +int gfs2_quota_lock(struct gfs2_inode *ip, uint32_t uid, uint32_t gid); +void gfs2_quota_unlock(struct gfs2_inode *ip); + +int gfs2_quota_check(struct gfs2_inode *ip, uint32_t uid, uint32_t gid); +void gfs2_quota_change(struct gfs2_inode *ip, int64_t change, + uint32_t uid, uint32_t gid); + +int gfs2_quota_sync(struct gfs2_sbd *sdp); +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, uint32_t id); + +int gfs2_quota_init(struct gfs2_sbd *sdp); +void gfs2_quota_scan(struct gfs2_sbd *sdp); +void gfs2_quota_cleanup(struct gfs2_sbd *sdp); + +#endif /* __QUOTA_DOT_H__ */ --- /dev/null +++ b/fs/gfs2/lvb.c @@ -0,0 +1,45 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "lvb.h" + +#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \ + struct->member); + +void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb) +{ + struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb; + + qb->qb_magic = be32_to_cpu(str->qb_magic); + qb->qb_limit = be64_to_cpu(str->qb_limit); + qb->qb_warn = be64_to_cpu(str->qb_warn); + qb->qb_value = be64_to_cpu(str->qb_value); +} + +void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb) +{ + struct gfs2_quota_lvb *str = (struct gfs2_quota_lvb *)lvb; + + str->qb_magic = cpu_to_be32(qb->qb_magic); + str->qb_limit = cpu_to_be64(qb->qb_limit); + str->qb_warn = cpu_to_be64(qb->qb_warn); + str->qb_value = cpu_to_be64(qb->qb_value); +} + + --- /dev/null +++ b/fs/gfs2/lvb.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LVB_DOT_H__ +#define __LVB_DOT_H__ + +#define GFS2_MIN_LVB_SIZE 32 + +void gfs2_quota_lvb_in(struct gfs2_quota_lvb *qb, char *lvb); +void gfs2_quota_lvb_out(struct gfs2_quota_lvb *qb, char *lvb); + +#endif /* __LVB_DOT_H__ */ + [PATCH 12/16] GFS2: Mounting & sysfs interface Mount/umount routines and GFS2's sysfs interface. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/mount.c | 214 ++++++++++++++++++++ fs/gfs2/mount.h | 15 + fs/gfs2/sys.c | 579 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/sys.h | 24 ++ 4 files changed, 832 insertions(+) --- /dev/null +++ b/fs/gfs2/sys.c @@ -0,0 +1,579 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "lm.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" + +char *gfs2_sys_margs; +spinlock_t gfs2_sys_margs_lock; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%s\n", sdp->sd_vfs->s_id); +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%s\n", sdp->sd_fsname); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int count; + + mutex_lock(&sdp->sd_freeze_lock); + count = sdp->sd_freeze_count; + mutex_unlock(&sdp->sd_freeze_lock); + + return sprintf(buf, "%u\n", count); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int error = 0; + int n = simple_strtol(buf, NULL, 0); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + switch (n) { + case 0: + gfs2_unfreeze_fs(sdp); + break; + case 1: + error = gfs2_freeze_fs(sdp); + break; + default: + ret = -EINVAL; + } + + if (error) + fs_warn(sdp, "freeze %d error %d", n, error); + + return ret; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags); + return sprintf(buf, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: withdrawing from cluster at user's request\n", + sdp->sd_fsname); + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp); + return len; +} + +static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_gl_hash_clear(sdp, NO_WAIT); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_quota_sync(sdp); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + uint32_t id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + gfs2_quota_refresh(sdp, 1, id); + return len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + uint32_t id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + gfs2_quota_refresh(sdp, 0, id); + return len; +} + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(shrink, 0200, NULL, shrink_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_shrink.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + NULL, +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + +static struct kobj_type gfs2_ktype = { + .default_attrs = gfs2_attrs, + .sysfs_ops = &gfs2_attr_ops, +}; + +static struct kset gfs2_kset = { + .subsys = &fs_subsys, + .kobj = {.name = "gfs2",}, + .ktype = &gfs2_ktype, +}; + +/* + * display struct lm_lockstruct fields + */ + +struct lockstruct_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define LOCKSTRUCT_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return sprintf(buf, fmt, sdp->sd_lockstruct.ls_##name); \ +} \ +static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) + +LOCKSTRUCT_ATTR(jid, "%u\n"); +LOCKSTRUCT_ATTR(first, "%u\n"); +LOCKSTRUCT_ATTR(lvb_size, "%u\n"); +LOCKSTRUCT_ATTR(flags, "%d\n"); + +static struct attribute *lockstruct_attrs[] = { + &lockstruct_attr_jid.attr, + &lockstruct_attr_first.attr, + &lockstruct_attr_lvb_size.attr, + &lockstruct_attr_flags.attr, + NULL +}; + +/* + * display struct gfs2_args fields + */ + +struct args_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define ARGS_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return sprintf(buf, fmt, sdp->sd_args.ar_##name); \ +} \ +static struct args_attr args_attr_##name = __ATTR_RO(name) + +ARGS_ATTR(lockproto, "%s\n"); +ARGS_ATTR(locktable, "%s\n"); +ARGS_ATTR(hostdata, "%s\n"); +ARGS_ATTR(spectator, "%d\n"); +ARGS_ATTR(ignore_local_fs, "%d\n"); +ARGS_ATTR(localcaching, "%d\n"); +ARGS_ATTR(localflocks, "%d\n"); +ARGS_ATTR(debug, "%d\n"); +ARGS_ATTR(upgrade, "%d\n"); +ARGS_ATTR(num_glockd, "%u\n"); +ARGS_ATTR(posix_acl, "%d\n"); +ARGS_ATTR(quota, "%u\n"); +ARGS_ATTR(suiddir, "%d\n"); +ARGS_ATTR(data, "%d\n"); + +/* one oddball doesn't fit the macro mold */ +static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", !!test_bit(SDF_NOATIME, &sdp->sd_flags)); +} +static struct args_attr args_attr_noatime = __ATTR_RO(noatime); + +static struct attribute *args_attrs[] = { + &args_attr_lockproto.attr, + &args_attr_locktable.attr, + &args_attr_hostdata.attr, + &args_attr_spectator.attr, + &args_attr_ignore_local_fs.attr, + &args_attr_localcaching.attr, + &args_attr_localflocks.attr, + &args_attr_debug.attr, + &args_attr_upgrade.attr, + &args_attr_num_glockd.attr, + &args_attr_posix_acl.attr, + &args_attr_quota.attr, + &args_attr_suiddir.attr, + &args_attr_data.attr, + &args_attr_noatime.attr, + NULL +}; + +/* + * display counters from superblock + */ + +struct counters_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define COUNTERS_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return sprintf(buf, fmt, (unsigned int)atomic_read(&sdp->sd_##name)); \ +} \ +static struct counters_attr counters_attr_##name = __ATTR_RO(name) + +COUNTERS_ATTR(glock_count, "%u\n"); +COUNTERS_ATTR(glock_held_count, "%u\n"); +COUNTERS_ATTR(inode_count, "%u\n"); +COUNTERS_ATTR(reclaimed, "%u\n"); + +static struct attribute *counters_attrs[] = { + &counters_attr_glock_count.attr, + &counters_attr_glock_held_count.attr, + &counters_attr_inode_count.attr, + &counters_attr_reclaimed.attr, + NULL +}; + +/* + * get and set struct gfs2_tune fields + */ + +static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%u %u\n", sdp->sd_tune.gt_quota_scale_num, + sdp->sd_tune.gt_quota_scale_den); +} + +static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x, y; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) + return -EINVAL; + + spin_lock(>->gt_spin); + gt->gt_quota_scale_num = x; + gt->gt_quota_scale_den = y; + spin_unlock(>->gt_spin); + return len; +} + +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, + int check_zero, const char *buf, size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + spin_lock(>->gt_spin); + *field = x; + spin_unlock(>->gt_spin); + return len; +} + +struct tune_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +#define TUNE_ATTR_3(name, show, store) \ +static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) + +#define TUNE_ATTR_2(name, store) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return sprintf(buf, "%u\n", sdp->sd_tune.gt_##name); \ +} \ +TUNE_ATTR_3(name, name##_show, store) + +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +#define TUNE_ATTR_DAEMON(name, process) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ + wake_up_process(sdp->sd_##process); \ + return r; \ +} \ +TUNE_ATTR_2(name, name##_store) + +TUNE_ATTR(ilimit, 0); +TUNE_ATTR(ilimit_tries, 0); +TUNE_ATTR(ilimit_min, 0); +TUNE_ATTR(demote_secs, 0); +TUNE_ATTR(incore_log_blocks, 0); +TUNE_ATTR(log_flush_secs, 0); +TUNE_ATTR(jindex_refresh_secs, 0); +TUNE_ATTR(quota_warn_period, 0); +TUNE_ATTR(quota_quantum, 0); +TUNE_ATTR(atime_quantum, 0); +TUNE_ATTR(max_readahead, 0); +TUNE_ATTR(complain_secs, 0); +TUNE_ATTR(reclaim_limit, 0); +TUNE_ATTR(prefetch_secs, 0); +TUNE_ATTR(statfs_slow, 0); +TUNE_ATTR(new_files_jdata, 0); +TUNE_ATTR(new_files_directio, 0); +TUNE_ATTR(quota_simul_sync, 1); +TUNE_ATTR(quota_cache_secs, 1); +TUNE_ATTR(max_atomic_write, 1); +TUNE_ATTR(stall_secs, 1); +TUNE_ATTR(entries_per_readdir, 1); +TUNE_ATTR(greedy_default, 1); +TUNE_ATTR(greedy_quantum, 1); +TUNE_ATTR(greedy_max, 1); +TUNE_ATTR(statfs_quantum, 1); +TUNE_ATTR_DAEMON(scand_secs, scand_process); +TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); +TUNE_ATTR_DAEMON(logd_secs, logd_process); +TUNE_ATTR_DAEMON(quotad_secs, quotad_process); +TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); + +static struct attribute *tune_attrs[] = { + &tune_attr_ilimit.attr, + &tune_attr_ilimit_tries.attr, + &tune_attr_ilimit_min.attr, + &tune_attr_demote_secs.attr, + &tune_attr_incore_log_blocks.attr, + &tune_attr_log_flush_secs.attr, + &tune_attr_jindex_refresh_secs.attr, + &tune_attr_quota_warn_period.attr, + &tune_attr_quota_quantum.attr, + &tune_attr_atime_quantum.attr, + &tune_attr_max_readahead.attr, + &tune_attr_complain_secs.attr, + &tune_attr_reclaim_limit.attr, + &tune_attr_prefetch_secs.attr, + &tune_attr_statfs_slow.attr, + &tune_attr_quota_simul_sync.attr, + &tune_attr_quota_cache_secs.attr, + &tune_attr_max_atomic_write.attr, + &tune_attr_stall_secs.attr, + &tune_attr_entries_per_readdir.attr, + &tune_attr_greedy_default.attr, + &tune_attr_greedy_quantum.attr, + &tune_attr_greedy_max.attr, + &tune_attr_statfs_quantum.attr, + &tune_attr_scand_secs.attr, + &tune_attr_recoverd_secs.attr, + &tune_attr_logd_secs.attr, + &tune_attr_quotad_secs.attr, + &tune_attr_quota_scale.attr, + &tune_attr_new_files_jdata.attr, + &tune_attr_new_files_directio.attr, + NULL +}; + +static struct attribute_group lockstruct_group = { + .name = "lockstruct", + .attrs = lockstruct_attrs +}; + +static struct attribute_group counters_group = { + .name = "counters", + .attrs = counters_attrs +}; + +static struct attribute_group args_group = { + .name = "args", + .attrs = args_attrs +}; + +static struct attribute_group tune_group = { + .name = "tune", + .attrs = tune_attrs +}; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp) +{ + int error; + + sdp->sd_kobj.kset = &gfs2_kset; + sdp->sd_kobj.ktype = &gfs2_ktype; + + error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name); + if (error) + goto fail; + + error = kobject_register(&sdp->sd_kobj); + if (error) + goto fail; + + error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &counters_group); + if (error) + goto fail_lockstruct; + + error = sysfs_create_group(&sdp->sd_kobj, &args_group); + if (error) + goto fail_counters; + + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_args; + + return 0; + + fail_args: + sysfs_remove_group(&sdp->sd_kobj, &args_group); + fail_counters: + sysfs_remove_group(&sdp->sd_kobj, &counters_group); + fail_lockstruct: + sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + fail_reg: + kobject_unregister(&sdp->sd_kobj); + fail: + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &args_group); + sysfs_remove_group(&sdp->sd_kobj, &counters_group); + sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + kobject_unregister(&sdp->sd_kobj); +} + +int gfs2_sys_init(void) +{ + gfs2_sys_margs = NULL; + spin_lock_init(&gfs2_sys_margs_lock); + return kset_register(&gfs2_kset); +} + +void gfs2_sys_uninit(void) +{ + kfree(gfs2_sys_margs); + kset_unregister(&gfs2_kset); +} + --- /dev/null +++ b/fs/gfs2/sys.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +/* Allow args to be passed to GFS2 when using an initial ram disk */ +extern char *gfs2_sys_margs; +extern spinlock_t gfs2_sys_margs_lock; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +#endif /* __SYS_DOT_H__ */ + --- /dev/null +++ b/fs/gfs2/mount.c @@ -0,0 +1,214 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "lm_interface.h" +#include "incore.h" +#include "mount.h" +#include "sys.h" +#include "util.h" + +/** + * gfs2_mount_args - Parse mount options + * @sdp: + * @data: + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) +{ + struct gfs2_args *args = &sdp->sd_args; + char *data = data_arg; + char *options, *o, *v; + int error = 0; + + if (!remount) { + /* If someone preloaded options, use those instead */ + spin_lock(&gfs2_sys_margs_lock); + if (gfs2_sys_margs) { + data = gfs2_sys_margs; + gfs2_sys_margs = NULL; + } + spin_unlock(&gfs2_sys_margs_lock); + + /* Set some defaults */ + args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + } + + /* Split the options into tokens with the "," character and + process them */ + + for (options = data; (o = strsep(&options, ",")); ) { + if (!*o) + continue; + + v = strchr(o, '='); + if (v) + *v++ = 0; + + if (!strcmp(o, "lockproto")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_lockproto)) + goto cant_remount; + strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); + args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "locktable")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_locktable)) + goto cant_remount; + strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); + args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "hostdata")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_hostdata)) + goto cant_remount; + strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); + args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "spectator")) { + if (remount && !args->ar_spectator) + goto cant_remount; + args->ar_spectator = 1; + sdp->sd_vfs->s_flags |= MS_RDONLY; + } + + else if (!strcmp(o, "ignore_local_fs")) { + if (remount && !args->ar_ignore_local_fs) + goto cant_remount; + args->ar_ignore_local_fs = 1; + } + + else if (!strcmp(o, "localflocks")) { + if (remount && !args->ar_localflocks) + goto cant_remount; + args->ar_localflocks = 1; + } + + else if (!strcmp(o, "localcaching")) { + if (remount && !args->ar_localcaching) + goto cant_remount; + args->ar_localcaching = 1; + } + + else if (!strcmp(o, "debug")) + args->ar_debug = 1; + + else if (!strcmp(o, "nodebug")) + args->ar_debug = 0; + + else if (!strcmp(o, "upgrade")) { + if (remount && !args->ar_upgrade) + goto cant_remount; + args->ar_upgrade = 1; + } + + else if (!strcmp(o, "num_glockd")) { + unsigned int x; + if (!v) + goto need_value; + sscanf(v, "%u", &x); + if (remount && x != args->ar_num_glockd) + goto cant_remount; + if (!x || x > GFS2_GLOCKD_MAX) { + fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", + GFS2_GLOCKD_MAX, x); + error = -EINVAL; + break; + } + args->ar_num_glockd = x; + } + + else if (!strcmp(o, "acl")) { + args->ar_posix_acl = 1; + sdp->sd_vfs->s_flags |= MS_POSIXACL; + } + + else if (!strcmp(o, "noacl")) { + args->ar_posix_acl = 0; + sdp->sd_vfs->s_flags &= ~MS_POSIXACL; + } + + else if (!strcmp(o, "quota")) { + if (!v) + goto need_value; + if (!strcmp(v, "off")) + args->ar_quota = GFS2_QUOTA_OFF; + else if (!strcmp(v, "account")) + args->ar_quota = GFS2_QUOTA_ACCOUNT; + else if (!strcmp(v, "on")) + args->ar_quota = GFS2_QUOTA_ON; + else { + fs_info(sdp, "invalid value for quota\n"); + error = -EINVAL; + break; + } + } + + else if (!strcmp(o, "suiddir")) + args->ar_suiddir = 1; + + else if (!strcmp(o, "nosuiddir")) + args->ar_suiddir = 0; + + else if (!strcmp(o, "data")) { + if (!v) + goto need_value; + if (!strcmp(v, "writeback")) + args->ar_data = GFS2_DATA_WRITEBACK; + else if (!strcmp(v, "ordered")) + args->ar_data = GFS2_DATA_ORDERED; + else { + fs_info(sdp, "invalid value for data\n"); + error = -EINVAL; + break; + } + } + + else { + fs_info(sdp, "unknown option: %s\n", o); + error = -EINVAL; + break; + } + } + + if (error) + fs_info(sdp, "invalid mount option(s)\n"); + + if (data != data_arg) + kfree(data); + + return error; + + need_value: + fs_info(sdp, "need value for option %s\n", o); + return -EINVAL; + + cant_remount: + fs_info(sdp, "can't remount with option %s\n", o); + return -EINVAL; +} + --- /dev/null +++ b/fs/gfs2/mount.h @@ -0,0 +1,15 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __MOUNT_DOT_H__ +#define __MOUNT_DOT_H__ + +int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount); + +#endif /* __MOUNT_DOT_H__ */ [PATCH 13/16] GFS2: Makefiles and Kconfig This hooks GFS2 into the kernel's build system. It also adds some documentation. Note that the dlm has been moved to be under fs/dlm as per Ingo Molnar's suggestion. This patch series doesn't include the dlm however. The DLM is already in both -mm and the git tree containing GFS2 at kernel.org. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland Documentation/filesystems/gfs2.txt | 43 ++++++++++++++++++++++++++++++++++++ fs/Kconfig | 2 + fs/Makefile | 2 + fs/gfs2/Kconfig | 44 +++++++++++++++++++++++++++++++++++++ fs/gfs2/Makefile | 10 ++++++++ 5 files changed, 101 insertions(+) --- a/fs/Kconfig +++ b/fs/Kconfig @@ -323,6 +323,7 @@ # default n source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" config OCFS2_FS tristate "OCFS2 file system support (EXPERIMENTAL)" @@ -1930,6 +1931,7 @@ source "fs/partitions/Kconfig" endmenu source "fs/nls/Kconfig" +source "fs/dlm/Kconfig" endmenu --- a/fs/Makefile +++ b/fs/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ obj-$(CONFIG_PROFILING) += dcookies.o +obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ @@ -102,3 +103,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_GFS2_FS) += gfs2/ --- /dev/null +++ b/fs/gfs2/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ + glops.o inode.o lm.o log.o lops.o locking.o lvb.o main.o meta_io.o \ + mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ +obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ + --- /dev/null +++ b/fs/gfs2/Kconfig @@ -0,0 +1,44 @@ +config GFS2_FS + tristate "GFS2 file system support" + depends on EXPERIMENTAL + select FS_POSIX_ACL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem, you will need to enable one or more of + the below locking modules. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + +config GFS2_FS_LOCKING_NOLOCK + tristate "GFS2 \"nolock\" locking module" + depends on GFS2_FS + help + Single node locking module for GFS2. + + Use this module if you want to use GFS2 on a single node without + its clustering features. You can still take advantage of the + large file support, and upgrade to running a full cluster later on + if required. + + If you will only be using GFS2 in cluster mode, you do not need this + module. + +config GFS2_FS_LOCKING_DLM + tristate "GFS2 DLM locking module" + depends on GFS2_FS + select DLM + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this module. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. + --- /dev/null +++ b/Documentation/filesystems/gfs2.txt @@ -0,0 +1,43 @@ +Global File System +------------------ + +http://sources.redhat.com/cluster/ + +GFS is a cluster file system. It allows a cluster of computers to +simultaneously use a block device that is shared between them (with FC, +iSCSI, NBD, etc). GFS reads and writes to the block device like a local +file system, but also uses a lock module to allow the computers coordinate +their I/O so file system consistency is maintained. One of the nifty +features of GFS is perfect consistency -- changes made to the file system +on one machine show up immediately on all other machines in the cluster. + +GFS uses interchangable inter-node locking mechanisms. Different lock +modules can plug into GFS and each file system selects the appropriate +lock module at mount time. Lock modules include: + + lock_nolock -- allows gfs to be used as a local file system + + lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking + The dlm is found at linux/fs/dlm/ + +In addition to interfacing with an external locking manager, a gfs lock +module is responsible for interacting with external cluster management +systems. Lock_dlm depends on user space cluster management systems found +at the URL above. + +To use gfs as a local file system, no external clustering systems are +needed, simply: + + $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device + $ mount -t gfs2 /dev/block_device /dir + +GFS2 is not on-disk compatible with previous versions of GFS. + +The following man pages can be found at the URL above: + gfs2_fsck to repair a filesystem + gfs2_grow to expand a filesystem online + gfs2_jadd to add journals to a filesystem online + gfs2_tool to manipulate, examine and tune a filesystem + gfs2_quota to examine and change quota values in a filesystem + mount.gfs2 to help mount(8) mount a filesystem + mkfs.gfs2 to make a filesystem [PATCH 14/16] GFS2: The DLM interface module The interface between GFS2 and the DLM. This is the bit which is actually part of GFS2. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/locking/dlm/Makefile | 3 fs/gfs2/locking/dlm/lock.c | 541 +++++++++++++++++++++++++++++++++++++++++ fs/gfs2/locking/dlm/lock_dlm.h | 188 ++++++++++++++ fs/gfs2/locking/dlm/main.c | 64 ++++ fs/gfs2/locking/dlm/mount.c | 256 +++++++++++++++++++ fs/gfs2/locking/dlm/plock.c | 302 ++++++++++++++++++++++ fs/gfs2/locking/dlm/sysfs.c | 225 +++++++++++++++++ fs/gfs2/locking/dlm/thread.c | 359 +++++++++++++++++++++++++++ include/linux/lock_dlm_plock.h | 41 +++ 9 files changed, 1979 insertions(+) --- /dev/null +++ b/fs/gfs2/locking/dlm/main.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include + +#include "lock_dlm.h" + +extern int gdlm_drop_count; +extern int gdlm_drop_period; + +extern struct lm_lockops gdlm_ops; + +static int __init init_lock_dlm(void) +{ + int error; + + error = gfs2_register_lockproto(&gdlm_ops); + if (error) { + printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n", + error); + return error; + } + + error = gdlm_sysfs_init(); + if (error) { + gfs2_unregister_lockproto(&gdlm_ops); + return error; + } + + error = gdlm_plock_init(); + if (error) { + gdlm_sysfs_exit(); + gfs2_unregister_lockproto(&gdlm_ops); + return error; + } + + gdlm_drop_count = GDLM_DROP_COUNT; + gdlm_drop_period = GDLM_DROP_PERIOD; + + printk(KERN_INFO + "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); + return 0; +} + +static void __exit exit_lock_dlm(void) +{ + gdlm_plock_exit(); + gdlm_sysfs_exit(); + gfs2_unregister_lockproto(&gdlm_ops); +} + +module_init(init_lock_dlm); +module_exit(exit_lock_dlm); + +MODULE_DESCRIPTION("GFS DLM Locking Module"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + --- /dev/null +++ b/fs/gfs2/locking/dlm/lock.c @@ -0,0 +1,541 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include "lock_dlm.h" + +static char junk_lvb[GDLM_LVB_SIZE]; + +static void queue_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + clear_bit(LFL_ACTIVE, &lp->flags); + + spin_lock(&ls->async_lock); + list_add_tail(&lp->clist, &ls->complete); + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +static inline void gdlm_ast(void *astarg) +{ + queue_complete(astarg); +} + +static inline void gdlm_bast(void *astarg, int mode) +{ + struct gdlm_lock *lp = astarg; + struct gdlm_ls *ls = lp->ls; + + if (!mode) { + printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + return; + } + + spin_lock(&ls->async_lock); + if (!lp->bast_mode) { + list_add_tail(&lp->blist, &ls->blocking); + lp->bast_mode = mode; + } else if (lp->bast_mode < mode) + lp->bast_mode = mode; + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +void gdlm_queue_delayed(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + list_add_tail(&lp->delay_list, &ls->delayed); + spin_unlock(&ls->async_lock); +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int16_t make_mode(int16_t lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + gdlm_assert(0, "unknown LM state %d", lmstate); + return -1; +} + +/* convert dlm lock-mode to gfs lock-state */ + +int16_t gdlm_make_lmstate(int16_t dlmmode) +{ + switch (dlmmode) { + case DLM_LOCK_IV: + case DLM_LOCK_NL: + return LM_ST_UNLOCKED; + case DLM_LOCK_EX: + return LM_ST_EXCLUSIVE; + case DLM_LOCK_CW: + return LM_ST_DEFERRED; + case DLM_LOCK_PR: + return LM_ST_SHARED; + } + gdlm_assert(0, "unknown DLM mode %d", dlmmode); + return -1; +} + +/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and + DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ + +static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) +{ + int16_t cur = make_mode(cur_state); + if (lp->cur != DLM_LOCK_IV) + gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur); +} + +static inline unsigned int make_flags(struct gdlm_lock *lp, + unsigned int gfs_flags, + int16_t cur, int16_t req) +{ + unsigned int lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + } + + if (lp->lksb.sb_lkid != 0) { + lkf |= DLM_LKF_CONVERT; + + /* Conversion deadlock avoidance by DLM */ + + if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && + !(lkf & DLM_LKF_NOQUEUE) && + cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) + lkf |= DLM_LKF_CONVDEADLK; + } + + if (lp->lvb) + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +/* make_strname - convert GFS lock numbers to a string */ + +static inline void make_strname(struct lm_lockname *lockname, + struct gdlm_strname *str) +{ + sprintf(str->name, "%8x%16llx", lockname->ln_type, + (unsigned long long)lockname->ln_number); + str->namelen = GDLM_STRNAME_BYTES; +} + +static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, + struct gdlm_lock **lpp) +{ + struct gdlm_lock *lp; + + lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); + if (!lp) + return -ENOMEM; + + lp->lockname = *name; + lp->ls = ls; + lp->cur = DLM_LOCK_IV; + lp->lvb = NULL; + lp->hold_null = NULL; + init_completion(&lp->ast_wait); + INIT_LIST_HEAD(&lp->clist); + INIT_LIST_HEAD(&lp->blist); + INIT_LIST_HEAD(&lp->delay_list); + + spin_lock(&ls->async_lock); + list_add(&lp->all_list, &ls->all_locks); + ls->all_locks_count++; + spin_unlock(&ls->async_lock); + + *lpp = lp; + return 0; +} + +void gdlm_delete_lp(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + if (!list_empty(&lp->clist)) + list_del_init(&lp->clist); + if (!list_empty(&lp->blist)) + list_del_init(&lp->blist); + if (!list_empty(&lp->delay_list)) + list_del_init(&lp->delay_list); + gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + list_del_init(&lp->all_list); + ls->all_locks_count--; + spin_unlock(&ls->async_lock); + + kfree(lp); +} + +int gdlm_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name, + lm_lock_t **lockp) +{ + struct gdlm_lock *lp; + int error; + + error = gdlm_create_lp((struct gdlm_ls *) lockspace, name, &lp); + + *lockp = (lm_lock_t *) lp; + return error; +} + +void gdlm_put_lock(lm_lock_t *lock) +{ + gdlm_delete_lp((struct gdlm_lock *) lock); +} + +unsigned int gdlm_do_lock(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct gdlm_strname str; + int error, bast = 1; + + /* + * When recovery is in progress, delay lock requests for submission + * once recovery is done. Requests for recovery (NOEXP) and unlocks + * can pass. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) { + gdlm_queue_delayed(lp); + return LM_OUT_ASYNC; + } + + /* + * Submit the actual lock request. + */ + + if (test_bit(LFL_NOBAST, &lp->flags)) + bast = 0; + + make_strname(&lp->lockname, &str); + + set_bit(LFL_ACTIVE, &lp->flags); + + log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid, + lp->cur, lp->req, lp->lkf); + + error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, + str.name, str.namelen, 0, gdlm_ast, lp, + bast ? gdlm_bast : NULL); + + if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->lksb.sb_status = -EAGAIN; + queue_complete(lp); + error = 0; + } + + if (error) { + log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " + "flags=%lx", ls->fsname, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, error, + lp->cur, lp->req, lp->lkf, lp->flags); + return LM_OUT_ERROR; + } + return LM_OUT_ASYNC; +} + +static unsigned int gdlm_do_unlock(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int lkf = 0; + int error; + + set_bit(LFL_DLM_UNLOCK, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + + if (lp->lvb) + lkf = DLM_LKF_VALBLK; + + log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lkf); + + error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); + + if (error) { + log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " + "flags=%lx", ls->fsname, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, error, + lp->cur, lp->req, lp->lkf, lp->flags); + return LM_OUT_ERROR; + } + return LM_OUT_ASYNC; +} + +unsigned int gdlm_lock(lm_lock_t *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + + clear_bit(LFL_DLM_CANCEL, &lp->flags); + if (flags & LM_FLAG_NOEXP) + set_bit(LFL_NOBLOCK, &lp->flags); + + check_cur_state(lp, cur_state); + lp->req = make_mode(req_state); + lp->lkf = make_flags(lp, flags, lp->cur, lp->req); + + return gdlm_do_lock(lp); +} + +unsigned int gdlm_unlock(lm_lock_t *lock, unsigned int cur_state) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + + clear_bit(LFL_DLM_CANCEL, &lp->flags); + if (lp->cur == DLM_LOCK_IV) + return 0; + return gdlm_do_unlock(lp); +} + +void gdlm_cancel(lm_lock_t *lock) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + struct gdlm_ls *ls = lp->ls; + int error, delay_list = 0; + + if (test_bit(LFL_DLM_CANCEL, &lp->flags)) + return; + + log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + + spin_lock(&ls->async_lock); + if (!list_empty(&lp->delay_list)) { + list_del_init(&lp->delay_list); + delay_list = 1; + } + spin_unlock(&ls->async_lock); + + if (delay_list) { + set_bit(LFL_CANCEL, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + queue_complete(lp); + return; + } + + if (!test_bit(LFL_ACTIVE, &lp->flags) || + test_bit(LFL_DLM_UNLOCK, &lp->flags)) { + log_info("gdlm_cancel skip %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + return; + } + + /* the lock is blocked in the dlm */ + + set_bit(LFL_DLM_CANCEL, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + + error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL, + NULL, lp); + + log_info("gdlm_cancel rv %d %x,%llx flags %lx", error, + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + + if (error == -EBUSY) + clear_bit(LFL_DLM_CANCEL, &lp->flags); +} + +static int gdlm_add_lvb(struct gdlm_lock *lp) +{ + char *lvb; + + lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); + if (!lvb) + return -ENOMEM; + + lp->lksb.sb_lvbptr = lvb; + lp->lvb = lvb; + return 0; +} + +static void gdlm_del_lvb(struct gdlm_lock *lp) +{ + kfree(lp->lvb); + lp->lvb = NULL; + lp->lksb.sb_lvbptr = NULL; +} + +/* This can do a synchronous dlm request (requiring a lock_dlm thread to get + the completion) because gfs won't call hold_lvb() during a callback (from + the context of a lock_dlm thread). */ + +static int hold_null_lock(struct gdlm_lock *lp) +{ + struct gdlm_lock *lpn = NULL; + int error; + + if (lp->hold_null) { + printk(KERN_INFO "lock_dlm: lvb already held\n"); + return 0; + } + + error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn); + if (error) + goto out; + + lpn->lksb.sb_lvbptr = junk_lvb; + lpn->lvb = junk_lvb; + + lpn->req = DLM_LOCK_NL; + lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; + set_bit(LFL_NOBAST, &lpn->flags); + set_bit(LFL_INLOCK, &lpn->flags); + + init_completion(&lpn->ast_wait); + gdlm_do_lock(lpn); + wait_for_completion(&lpn->ast_wait); + error = lpn->lksb.sb_status; + if (error) { + printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", + error); + gdlm_delete_lp(lpn); + lpn = NULL; + } + out: + lp->hold_null = lpn; + return error; +} + +/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get + the completion) because gfs may call unhold_lvb() during a callback (from + the context of a lock_dlm thread) which could cause a deadlock since the + other lock_dlm thread could be engaged in recovery. */ + +static void unhold_null_lock(struct gdlm_lock *lp) +{ + struct gdlm_lock *lpn = lp->hold_null; + + gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lpn->lksb.sb_lvbptr = NULL; + lpn->lvb = NULL; + set_bit(LFL_UNLOCK_DELETE, &lpn->flags); + gdlm_do_unlock(lpn); + lp->hold_null = NULL; +} + +/* Acquire a NL lock because gfs requires the value block to remain + intact on the resource while the lvb is "held" even if it's holding no locks + on the resource. */ + +int gdlm_hold_lvb(lm_lock_t *lock, char **lvbp) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + int error; + + error = gdlm_add_lvb(lp); + if (error) + return error; + + *lvbp = lp->lvb; + + error = hold_null_lock(lp); + if (error) + gdlm_del_lvb(lp); + + return error; +} + +void gdlm_unhold_lvb(lm_lock_t *lock, char *lvb) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + + unhold_null_lock(lp); + gdlm_del_lvb(lp); +} + +void gdlm_sync_lvb(lm_lock_t *lock, char *lvb) +{ + struct gdlm_lock *lp = (struct gdlm_lock *) lock; + + if (lp->cur != DLM_LOCK_EX) + return; + + init_completion(&lp->ast_wait); + set_bit(LFL_SYNC_LVB, &lp->flags); + + lp->req = DLM_LOCK_EX; + lp->lkf = make_flags(lp, 0, lp->cur, lp->req); + + gdlm_do_lock(lp); + wait_for_completion(&lp->ast_wait); +} + +void gdlm_submit_delayed(struct gdlm_ls *ls) +{ + struct gdlm_lock *lp, *safe; + + spin_lock(&ls->async_lock); + list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) { + list_del_init(&lp->delay_list); + list_add_tail(&lp->delay_list, &ls->submit); + } + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +int gdlm_release_all_locks(struct gdlm_ls *ls) +{ + struct gdlm_lock *lp, *safe; + int count = 0; + + spin_lock(&ls->async_lock); + list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { + list_del_init(&lp->all_list); + + if (lp->lvb && lp->lvb != junk_lvb) + kfree(lp->lvb); + kfree(lp); + count++; + } + spin_unlock(&ls->async_lock); + + return count; +} + --- /dev/null +++ b/fs/gfs2/locking/dlm/mount.c @@ -0,0 +1,256 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include "lock_dlm.h" + +int gdlm_drop_count; +int gdlm_drop_period; +struct lm_lockops gdlm_ops; + + +static struct gdlm_ls *init_gdlm(lm_callback_t cb, lm_fsdata_t *fsdata, + int flags, char *table_name) +{ + struct gdlm_ls *ls; + char buf[256], *p; + + ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL); + if (!ls) + return NULL; + + ls->drop_locks_count = gdlm_drop_count; + ls->drop_locks_period = gdlm_drop_period; + ls->fscb = cb; + ls->fsdata = fsdata; + ls->fsflags = flags; + spin_lock_init(&ls->async_lock); + INIT_LIST_HEAD(&ls->complete); + INIT_LIST_HEAD(&ls->blocking); + INIT_LIST_HEAD(&ls->delayed); + INIT_LIST_HEAD(&ls->submit); + INIT_LIST_HEAD(&ls->all_locks); + init_waitqueue_head(&ls->thread_wait); + init_waitqueue_head(&ls->wait_control); + ls->thread1 = NULL; + ls->thread2 = NULL; + ls->drop_time = jiffies; + ls->jid = -1; + + strncpy(buf, table_name, 256); + buf[255] = '\0'; + + p = strstr(buf, ":"); + if (!p) { + log_info("invalid table_name \"%s\"", table_name); + kfree(ls); + return NULL; + } + *p = '\0'; + p++; + + strncpy(ls->clustername, buf, GDLM_NAME_LEN); + strncpy(ls->fsname, p, GDLM_NAME_LEN); + + return ls; +} + +static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) +{ + char data[256]; + char *options, *x, *y; + int error = 0; + + memset(data, 0, 256); + strncpy(data, data_arg, 255); + + for (options = data; (x = strsep(&options, ":")); ) { + if (!*x) + continue; + + y = strchr(x, '='); + if (y) + *y++ = 0; + + if (!strcmp(x, "jid")) { + if (!y) { + log_error("need argument to jid"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->jid); + + } else if (!strcmp(x, "first")) { + if (!y) { + log_error("need argument to first"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->first); + + } else if (!strcmp(x, "id")) { + if (!y) { + log_error("need argument to id"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->id); + + } else if (!strcmp(x, "nodir")) { + if (!y) { + log_error("need argument to nodir"); + error = -EINVAL; + break; + } + sscanf(y, "%u", nodir); + + } else { + log_error("unkonwn option: %s", x); + error = -EINVAL; + break; + } + } + + return error; +} + +static int gdlm_mount(char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + struct gdlm_ls *ls; + int error = -ENOMEM, nodir = 0; + + if (min_lvb_size > GDLM_LVB_SIZE) + goto out; + + ls = init_gdlm(cb, fsdata, flags, table_name); + if (!ls) + goto out; + + error = make_args(ls, host_data, &nodir); + if (error) + goto out; + + error = gdlm_init_threads(ls); + if (error) + goto out_free; + + error = gdlm_kobject_setup(ls, fskobj); + if (error) + goto out_thread; + + error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), + &ls->dlm_lockspace, + nodir ? DLM_LSFL_NODIR : 0, + GDLM_LVB_SIZE); + if (error) { + log_error("dlm_new_lockspace error %d", error); + goto out_kobj; + } + + lockstruct->ls_jid = ls->jid; + lockstruct->ls_first = ls->first; + lockstruct->ls_lockspace = ls; + lockstruct->ls_ops = &gdlm_ops; + lockstruct->ls_flags = 0; + lockstruct->ls_lvb_size = GDLM_LVB_SIZE; + return 0; + + out_kobj: + gdlm_kobject_release(ls); + out_thread: + gdlm_release_threads(ls); + out_free: + kfree(ls); + out: + return error; +} + +static void gdlm_unmount(lm_lockspace_t *lockspace) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + int rv; + + log_debug("unmount flags %lx", ls->flags); + + /* FIXME: serialize unmount and withdraw in case they + happen at once. Also, if unmount follows withdraw, + wait for withdraw to finish. */ + + if (test_bit(DFL_WITHDRAW, &ls->flags)) + goto out; + + gdlm_kobject_release(ls); + dlm_release_lockspace(ls->dlm_lockspace, 2); + gdlm_release_threads(ls); + rv = gdlm_release_all_locks(ls); + if (rv) + log_info("gdlm_unmount: %d stray locks freed", rv); + out: + kfree(ls); +} + +static void gdlm_recovery_done(lm_lockspace_t *lockspace, unsigned int jid, + unsigned int message) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + ls->recover_jid_done = jid; + ls->recover_jid_status = message; + kobject_uevent(&ls->kobj, KOBJ_CHANGE); +} + +static void gdlm_others_may_mount(lm_lockspace_t *lockspace) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + ls->first_done = 1; + kobject_uevent(&ls->kobj, KOBJ_CHANGE); +} + +/* Userspace gets the offline uevent, blocks new gfs locks on + other mounters, and lets us know (sets WITHDRAW flag). Then, + userspace leaves the mount group while we leave the lockspace. */ + +static void gdlm_withdraw(lm_lockspace_t *lockspace) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + + kobject_uevent(&ls->kobj, KOBJ_OFFLINE); + + wait_event_interruptible(ls->wait_control, + test_bit(DFL_WITHDRAW, &ls->flags)); + + dlm_release_lockspace(ls->dlm_lockspace, 2); + gdlm_release_threads(ls); + gdlm_release_all_locks(ls); + gdlm_kobject_release(ls); +} + +struct lm_lockops gdlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_others_may_mount = gdlm_others_may_mount, + .lm_unmount = gdlm_unmount, + .lm_withdraw = gdlm_withdraw, + .lm_get_lock = gdlm_get_lock, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_unlock = gdlm_unlock, + .lm_plock = gdlm_plock, + .lm_punlock = gdlm_punlock, + .lm_plock_get = gdlm_plock_get, + .lm_cancel = gdlm_cancel, + .lm_hold_lvb = gdlm_hold_lvb, + .lm_unhold_lvb = gdlm_unhold_lvb, + .lm_sync_lvb = gdlm_sync_lvb, + .lm_recovery_done = gdlm_recovery_done, + .lm_owner = THIS_MODULE, +}; + --- /dev/null +++ b/fs/gfs2/locking/dlm/plock.c @@ -0,0 +1,302 @@ +/* + * Copyright (C) 2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include + +#include "lock_dlm.h" + + +static spinlock_t ops_lock; +static struct list_head send_list; +static struct list_head recv_list; +static wait_queue_head_t send_wq; +static wait_queue_head_t recv_wq; + +struct plock_op { + struct list_head list; + int done; + struct gdlm_plock_info info; +}; + +static inline void set_version(struct gdlm_plock_info *info) +{ + info->version[0] = GDLM_PLOCK_VERSION_MAJOR; + info->version[1] = GDLM_PLOCK_VERSION_MINOR; + info->version[2] = GDLM_PLOCK_VERSION_PATCH; +} + +static int check_version(struct gdlm_plock_info *info) +{ + if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (GDLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_error("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + GDLM_PLOCK_VERSION_MAJOR, + GDLM_PLOCK_VERSION_MINOR, + GDLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + INIT_LIST_HEAD(&op->list); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +int gdlm_plock(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->info.optype = GDLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "plock op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (!rv) { + if (posix_lock_file_wait(file, fl) < 0) + log_error("gdlm_plock: vfs lock error %x,%llx", + name->ln_type, + (unsigned long long)name->ln_number); + } + + kfree(op); + return rv; +} + +int gdlm_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + if (posix_lock_file_wait(file, fl) < 0) + log_error("gdlm_punlock: vfs unlock error %x,%llx", + name->ln_type, (unsigned long long)name->ln_number); + + op->info.optype = GDLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "punlock op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + kfree(op); + return rv; +} + +int gdlm_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->info.optype = GDLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "plock_get op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (rv == 0) + fl->fl_type = F_UNLCK; + else if (rv > 0) { + fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->fl_pid = op->info.pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + } + + kfree(op); + return rv; +} + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct gdlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_entry(send_list.next, struct plock_op, list); + list_move(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct gdlm_plock_info info; + struct plock_op *op; + int found = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + if (check_version(&info)) + return -EINVAL; + + spin_lock(&ops_lock); + list_for_each_entry(op, &recv_list, list) { + if (op->info.fsid == info.fsid && + op->info.number == info.number && + op->info.owner == info.owner) { + list_del_init(&op->list); + found = 1; + op->done = 1; + memcpy(&op->info, &info, sizeof(info)); + break; + } + } + spin_unlock(&ops_lock); + + if (found) + wake_up(&recv_wq); + else + printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid, + (unsigned long long)info.number); + return count; +} + +static unsigned int dev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + spin_unlock(&ops_lock); + return POLLIN | POLLRDNORM; + } + spin_unlock(&ops_lock); + return 0; +} + +static struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = GDLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int gdlm_plock_init(void) +{ + int rv; + + spin_lock_init(&ops_lock); + INIT_LIST_HEAD(&send_list); + INIT_LIST_HEAD(&recv_list); + init_waitqueue_head(&send_wq); + init_waitqueue_head(&recv_wq); + + rv = misc_register(&plock_dev_misc); + if (rv) + printk(KERN_INFO "gdlm_plock_init: misc_register failed %d", + rv); + return rv; +} + +void gdlm_plock_exit(void) +{ + if (misc_deregister(&plock_dev_misc) < 0) + printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed"); +} + --- /dev/null +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -0,0 +1,225 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include + +#include "lock_dlm.h" + +extern struct lm_lockops gdlm_ops; + +static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); +} + +static ssize_t block_show(struct gdlm_ls *ls, char *buf) +{ + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->flags); + gdlm_submit_delayed(ls); + } else + ret = -EINVAL; + return ret; +} + +static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf) +{ + ssize_t ret; + int val = 0; + + if (test_bit(DFL_WITHDRAW, &ls->flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_WITHDRAW, &ls->flags); + else + ret = -EINVAL; + wake_up(&ls->wait_control); + return ret; +} + +static ssize_t id_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%u\n", ls->id); +} + +static ssize_t jid_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->jid); +} + +static ssize_t first_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->first); +} + +static ssize_t first_done_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->first_done); +} + +static ssize_t recover_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid); +} + +static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ls->recover_jid = simple_strtol(buf, NULL, 0); + ls->fscb(ls->fsdata, LM_CB_NEED_RECOVERY, &ls->recover_jid); + return len; +} + +static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid_done); +} + +static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid_status); +} + +struct gdlm_attr { + struct attribute attr; + ssize_t (*show)(struct gdlm_ls *, char *); + ssize_t (*store)(struct gdlm_ls *, const char *, size_t); +}; + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, id_show, NULL); +GDLM_ATTR(jid, 0444, jid_show, NULL); +GDLM_ATTR(first, 0444, first_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0644, recover_show, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *gdlm_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_id.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, + NULL, +}; + +static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static struct sysfs_ops gdlm_attr_ops = { + .show = gdlm_attr_show, + .store = gdlm_attr_store, +}; + +static struct kobj_type gdlm_ktype = { + .default_attrs = gdlm_attrs, + .sysfs_ops = &gdlm_attr_ops, +}; + +static struct kset gdlm_kset = { + .subsys = &kernel_subsys, + .kobj = {.name = "lock_dlm",}, + .ktype = &gdlm_ktype, +}; + +int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) +{ + int error; + + error = kobject_set_name(&ls->kobj, "%s", "lock_module"); + if (error) { + log_error("can't set kobj name %d", error); + return error; + } + + ls->kobj.kset = &gdlm_kset; + ls->kobj.ktype = &gdlm_ktype; + ls->kobj.parent = fskobj; + + error = kobject_register(&ls->kobj); + if (error) + log_error("can't register kobj %d", error); + + return error; +} + +void gdlm_kobject_release(struct gdlm_ls *ls) +{ + kobject_unregister(&ls->kobj); +} + +int gdlm_sysfs_init(void) +{ + int error; + + error = kset_register(&gdlm_kset); + if (error) + printk("lock_dlm: cannot register kset %d\n", error); + + return error; +} + +void gdlm_sysfs_exit(void) +{ + kset_unregister(&gdlm_kset); +} + --- /dev/null +++ b/fs/gfs2/locking/dlm/thread.c @@ -0,0 +1,359 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include "lock_dlm.h" + +/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm + thread gets to it. */ + +static void queue_submit(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + list_add_tail(&lp->delay_list, &ls->submit); + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +static void process_blocking(struct gdlm_lock *lp, int bast_mode) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int cb = 0; + + switch (gdlm_make_lmstate(bast_mode)) { + case LM_ST_EXCLUSIVE: + cb = LM_CB_NEED_E; + break; + case LM_ST_DEFERRED: + cb = LM_CB_NEED_D; + break; + case LM_ST_SHARED: + cb = LM_CB_NEED_S; + break; + default: + gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); + } + + ls->fscb(ls->fsdata, cb, &lp->lockname); +} + +static void process_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct lm_async_cb acb; + int16_t prev_mode = lp->cur; + + memset(&acb, 0, sizeof(acb)); + + if (lp->lksb.sb_status == -DLM_ECANCEL) { + log_info("complete dlm cancel %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { + if (lp->lksb.sb_status != -DLM_EUNLOCK) { + log_info("unlock sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + lp->cur = DLM_LOCK_IV; + lp->req = DLM_LOCK_IV; + lp->lksb.sb_lkid = 0; + + if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { + gdlm_delete_lp(lp); + return; + } + goto out; + } + + if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { + if (lp->req == DLM_LOCK_PR) + lp->req = DLM_LOCK_CW; + else if (lp->req == DLM_LOCK_CW) + lp->req = DLM_LOCK_PR; + } + + /* + * A canceled lock request. The lock was just taken off the delayed + * list and was never even submitted to dlm. + */ + + if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { + log_info("complete internal cancel %x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + goto out; + } + + /* + * An error occured. + */ + + if (lp->lksb.sb_status) { + /* a "normal" error */ + if ((lp->lksb.sb_status == -EAGAIN) && + (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->req = lp->cur; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + /* this could only happen with cancels I think */ + log_info("ast sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + /* + * This is an AST for an EX->EX conversion for sync_lvb from GFS. + */ + + if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { + complete(&lp->ast_wait); + return; + } + + /* + * A lock has been demoted to NL because it initially completed during + * BLOCK_LOCKS. Now it must be requested in the originally requested + * mode. + */ + + if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { + gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + + lp->cur = DLM_LOCK_NL; + lp->req = lp->prev_req; + lp->prev_req = DLM_LOCK_IV; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + set_bit(LFL_NOCACHE, &lp->flags); + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags)) + gdlm_queue_delayed(lp); + else + queue_submit(lp); + return; + } + + /* + * A request is granted during dlm recovery. It may be granted + * because the locks of a failed node were cleared. In that case, + * there may be inconsistent data beneath this lock and we must wait + * for recovery to complete to use it. When gfs recovery is done this + * granted lock will be converted to NL and then reacquired in this + * granted state. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && + lp->req != DLM_LOCK_NL) { + + lp->cur = lp->req; + lp->prev_req = lp->req; + lp->req = DLM_LOCK_NL; + lp->lkf |= DLM_LKF_CONVERT; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + log_debug("rereq %x,%llx id %x %d,%d", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lp->req); + + set_bit(LFL_REREQUEST, &lp->flags); + queue_submit(lp); + return; + } + + /* + * DLM demoted the lock to NL before it was granted so GFS must be + * told it cannot cache data for this lock. + */ + + if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) + set_bit(LFL_NOCACHE, &lp->flags); + + out: + /* + * This is an internal lock_dlm lock + */ + + if (test_bit(LFL_INLOCK, &lp->flags)) { + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + complete(&lp->ast_wait); + return; + } + + /* + * Normal completion of a lock request. Tell GFS it now has the lock. + */ + + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + + acb.lc_name = lp->lockname; + acb.lc_ret |= gdlm_make_lmstate(lp->cur); + + if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && + (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) + acb.lc_ret |= LM_OUT_CACHEABLE; + + ls->fscb(ls->fsdata, LM_CB_ASYNC, &acb); +} + +static inline int no_work(struct gdlm_ls *ls, int blocking) +{ + int ret; + + spin_lock(&ls->async_lock); + ret = list_empty(&ls->complete) && list_empty(&ls->submit); + if (ret && blocking) + ret = list_empty(&ls->blocking); + spin_unlock(&ls->async_lock); + + return ret; +} + +static inline int check_drop(struct gdlm_ls *ls) +{ + if (!ls->drop_locks_count) + return 0; + + if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { + ls->drop_time = jiffies; + if (ls->all_locks_count >= ls->drop_locks_count) + return 1; + } + return 0; +} + +static int gdlm_thread(void *data) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) data; + struct gdlm_lock *lp = NULL; + int blist = 0; + uint8_t complete, blocking, submit, drop; + DECLARE_WAITQUEUE(wait, current); + + /* Only thread1 is allowed to do blocking callbacks since gfs + may wait for a completion callback within a blocking cb. */ + + if (current == ls->thread1) + blist = 1; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ls->thread_wait, &wait); + if (no_work(ls, blist)) + schedule(); + remove_wait_queue(&ls->thread_wait, &wait); + set_current_state(TASK_RUNNING); + + complete = blocking = submit = drop = 0; + + spin_lock(&ls->async_lock); + + if (blist && !list_empty(&ls->blocking)) { + lp = list_entry(ls->blocking.next, struct gdlm_lock, + blist); + list_del_init(&lp->blist); + blocking = lp->bast_mode; + lp->bast_mode = 0; + } else if (!list_empty(&ls->complete)) { + lp = list_entry(ls->complete.next, struct gdlm_lock, + clist); + list_del_init(&lp->clist); + complete = 1; + } else if (!list_empty(&ls->submit)) { + lp = list_entry(ls->submit.next, struct gdlm_lock, + delay_list); + list_del_init(&lp->delay_list); + submit = 1; + } + + drop = check_drop(ls); + spin_unlock(&ls->async_lock); + + if (complete) + process_complete(lp); + + else if (blocking) + process_blocking(lp, blocking); + + else if (submit) + gdlm_do_lock(lp); + + if (drop) + ls->fscb(ls->fsdata, LM_CB_DROPLOCKS, NULL); + + schedule(); + } + + return 0; +} + +int gdlm_init_threads(struct gdlm_ls *ls) +{ + struct task_struct *p; + int error; + + p = kthread_run(gdlm_thread, ls, "lock_dlm1"); + error = IS_ERR(p); + if (error) { + log_error("can't start lock_dlm1 thread %d", error); + return error; + } + ls->thread1 = p; + + p = kthread_run(gdlm_thread, ls, "lock_dlm2"); + error = IS_ERR(p); + if (error) { + log_error("can't start lock_dlm2 thread %d", error); + kthread_stop(ls->thread1); + return error; + } + ls->thread2 = p; + + return 0; +} + +void gdlm_release_threads(struct gdlm_ls *ls) +{ + kthread_stop(ls->thread1); + kthread_stop(ls->thread2); +} + --- /dev/null +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef LOCK_DLM_DOT_H +#define LOCK_DLM_DOT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../../lm_interface.h" + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module + * as "lock_dlm". + */ + +#define GDLM_STRNAME_BYTES 24 +#define GDLM_LVB_SIZE 32 +#define GDLM_DROP_COUNT 50000 +#define GDLM_DROP_PERIOD 60 +#define GDLM_NAME_LEN 128 + +/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number). + We sprintf these numbers into a 24 byte string of hex values to make them + human-readable (to make debugging simpler.) */ + +struct gdlm_strname { + unsigned char name[GDLM_STRNAME_BYTES]; + unsigned short namelen; +}; + +enum { + DFL_BLOCK_LOCKS = 0, + DFL_SPECTATOR = 1, + DFL_WITHDRAW = 2, +}; + +struct gdlm_ls { + uint32_t id; + int jid; + int first; + int first_done; + unsigned long flags; + struct kobject kobj; + char clustername[GDLM_NAME_LEN]; + char fsname[GDLM_NAME_LEN]; + int fsflags; + dlm_lockspace_t *dlm_lockspace; + lm_callback_t fscb; + lm_fsdata_t *fsdata; + int recover_jid; + int recover_jid_done; + int recover_jid_status; + spinlock_t async_lock; + struct list_head complete; + struct list_head blocking; + struct list_head delayed; + struct list_head submit; + struct list_head all_locks; + uint32_t all_locks_count; + wait_queue_head_t wait_control; + struct task_struct *thread1; + struct task_struct *thread2; + wait_queue_head_t thread_wait; + unsigned long drop_time; + int drop_locks_count; + int drop_locks_period; +}; + +enum { + LFL_NOBLOCK = 0, + LFL_NOCACHE = 1, + LFL_DLM_UNLOCK = 2, + LFL_DLM_CANCEL = 3, + LFL_SYNC_LVB = 4, + LFL_FORCE_PROMOTE = 5, + LFL_REREQUEST = 6, + LFL_ACTIVE = 7, + LFL_INLOCK = 8, + LFL_CANCEL = 9, + LFL_NOBAST = 10, + LFL_HEADQUE = 11, + LFL_UNLOCK_DELETE = 12, +}; + +struct gdlm_lock { + struct gdlm_ls *ls; + struct lm_lockname lockname; + char *lvb; + struct dlm_lksb lksb; + + int16_t cur; + int16_t req; + int16_t prev_req; + uint32_t lkf; /* dlm flags DLM_LKF_ */ + unsigned long flags; /* lock_dlm flags LFL_ */ + + int bast_mode; /* protected by async_lock */ + struct completion ast_wait; + + struct list_head clist; /* complete */ + struct list_head blist; /* blocking */ + struct list_head delay_list; /* delayed */ + struct list_head all_list; /* all locks for the fs */ + struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ +}; + +#define gdlm_assert(assertion, fmt, args...) \ +do { \ + if (unlikely(!(assertion))) { \ + printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \ + "lock_dlm: " fmt "\n", \ + #assertion, ##args); \ + BUG(); \ + } \ +} while (0) + +#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg) +#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg) +#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg) +#ifdef LOCK_DLM_LOG_DEBUG +#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg) +#else +#define log_debug(fmt, arg...) +#endif + +/* sysfs.c */ + +int gdlm_sysfs_init(void); +void gdlm_sysfs_exit(void); +int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *); +void gdlm_kobject_release(struct gdlm_ls *); + +/* thread.c */ + +int gdlm_init_threads(struct gdlm_ls *); +void gdlm_release_threads(struct gdlm_ls *); + +/* lock.c */ + +int16_t gdlm_make_lmstate(int16_t); +void gdlm_queue_delayed(struct gdlm_lock *); +void gdlm_submit_delayed(struct gdlm_ls *); +int gdlm_release_all_locks(struct gdlm_ls *); +void gdlm_delete_lp(struct gdlm_lock *); +unsigned int gdlm_do_lock(struct gdlm_lock *); + +int gdlm_get_lock(lm_lockspace_t *, struct lm_lockname *, lm_lock_t **); +void gdlm_put_lock(lm_lock_t *); +unsigned int gdlm_lock(lm_lock_t *, unsigned int, unsigned int, unsigned int); +unsigned int gdlm_unlock(lm_lock_t *, unsigned int); +void gdlm_cancel(lm_lock_t *); +int gdlm_hold_lvb(lm_lock_t *, char **); +void gdlm_unhold_lvb(lm_lock_t *, char *); +void gdlm_sync_lvb(lm_lock_t *, char *); + +/* plock.c */ + +int gdlm_plock_init(void); +void gdlm_plock_exit(void); +int gdlm_plock(lm_lockspace_t *, struct lm_lockname *, struct file *, int, + struct file_lock *); +int gdlm_plock_get(lm_lockspace_t *, struct lm_lockname *, struct file *, + struct file_lock *); +int gdlm_punlock(lm_lockspace_t *, struct lm_lockname *, struct file *, + struct file_lock *); +#endif + --- /dev/null +++ b/fs/gfs2/locking/dlm/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS) += lock_dlm.o +lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o + --- /dev/null +++ b/include/linux/lock_dlm_plock.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#ifndef __LOCK_DLM_PLOCK_DOT_H__ +#define __LOCK_DLM_PLOCK_DOT_H__ + +#define GDLM_PLOCK_MISC_NAME "lock_dlm_plock" + +#define GDLM_PLOCK_VERSION_MAJOR 1 +#define GDLM_PLOCK_VERSION_MINOR 1 +#define GDLM_PLOCK_VERSION_PATCH 0 + +enum { + GDLM_PLOCK_OP_LOCK = 1, + GDLM_PLOCK_OP_UNLOCK, + GDLM_PLOCK_OP_GET, +}; + +struct gdlm_plock_info { + __u32 version[3]; + __u8 optype; + __u8 ex; + __u8 wait; + __u8 pad; + __u32 pid; + __s32 nodeid; + __s32 rv; + __u32 fsid; + __u64 number; + __u64 start; + __u64 end; + __u64 owner; +}; + +#endif + [PATCH 15/16] GFS2: The "nolock" module The "nolock" locking module provides locking services for single node GFS2 set ups and is also useful for testing. Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland fs/gfs2/locking/nolock/Makefile | 3 fs/gfs2/locking/nolock/main.c | 259 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 262 insertions(+) --- /dev/null +++ b/fs/gfs2/locking/nolock/main.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "../../lm_interface.h" + +struct nolock_lockspace { + unsigned int nl_lvb_size; +}; + +static struct lm_lockops nolock_ops; + +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, lm_fsdata_t *fsdata, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + char *c; + unsigned int jid; + struct nolock_lockspace *nl; + + c = strstr(host_data, "jid="); + if (!c) + jid = 0; + else { + c += 4; + sscanf(c, "%u", &jid); + } + + nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); + if (!nl) + return -ENOMEM; + + nl->nl_lvb_size = min_lvb_size; + + lockstruct->ls_jid = jid; + lockstruct->ls_first = 1; + lockstruct->ls_lvb_size = min_lvb_size; + lockstruct->ls_lockspace = (lm_lockspace_t *)nl; + lockstruct->ls_ops = &nolock_ops; + lockstruct->ls_flags = LM_LSFLAG_LOCAL; + + return 0; +} + +static void nolock_others_may_mount(lm_lockspace_t *lockspace) +{ +} + +static void nolock_unmount(lm_lockspace_t *lockspace) +{ + struct nolock_lockspace *nl = (struct nolock_lockspace *)lockspace; + kfree(nl); +} + +static void nolock_withdraw(lm_lockspace_t *lockspace) +{ +} + +/** + * nolock_get_lock - get a lm_lock_t given a descripton of the lock + * @lockspace: the lockspace the lock lives in + * @name: the name of the lock + * @lockp: return the lm_lock_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_get_lock(lm_lockspace_t *lockspace, struct lm_lockname *name, + lm_lock_t **lockp) +{ + *lockp = (lm_lock_t *)lockspace; + return 0; +} + +/** + * nolock_put_lock - get rid of a lock structure + * @lock: the lock to throw away + * + */ + +static void nolock_put_lock(lm_lock_t *lock) +{ +} + +/** + * nolock_lock - acquire a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * @req_state: the requested state + * @flags: modifier flags + * + * Returns: A bitmap of LM_OUT_* + */ + +static unsigned int nolock_lock(lm_lock_t *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags) +{ + return req_state | LM_OUT_CACHEABLE; +} + +/** + * nolock_unlock - unlock a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * + * Returns: 0 + */ + +static unsigned int nolock_unlock(lm_lock_t *lock, unsigned int cur_state) +{ + return 0; +} + +static void nolock_cancel(lm_lock_t *lock) +{ +} + +/** + * nolock_hold_lvb - hold on to a lock value block + * @lock: the lock the LVB is associated with + * @lvbp: return the lm_lvb_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_hold_lvb(lm_lock_t *lock, char **lvbp) +{ + struct nolock_lockspace *nl = (struct nolock_lockspace *)lock; + int error = 0; + + *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); + if (!*lvbp) + error = -ENOMEM; + + return error; +} + +/** + * nolock_unhold_lvb - release a LVB + * @lock: the lock the LVB is associated with + * @lvb: the lock value block + * + */ + +static void nolock_unhold_lvb(lm_lock_t *lock, char *lvb) +{ + kfree(lvb); +} + +/** + * nolock_sync_lvb - sync out the value of a lvb + * @lock: the lock the LVB is associated with + * @lvb: the lock value block + * + */ + +static void nolock_sync_lvb(lm_lock_t *lock, char *lvb) +{ +} + +static int nolock_plock_get(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct file_lock tmp; + int ret; + + ret = posix_test_lock(file, fl, &tmp); + fl->fl_type = F_UNLCK; + if (ret) + memcpy(fl, &tmp, sizeof(struct file_lock)); + + return 0; +} + +static int nolock_plock(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + int error; + error = posix_lock_file_wait(file, fl); + return error; +} + +static int nolock_punlock(lm_lockspace_t *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error; + error = posix_lock_file_wait(file, fl); + return error; +} + +static void nolock_recovery_done(lm_lockspace_t *lockspace, unsigned int jid, + unsigned int message) +{ +} + +static struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_mount = nolock_mount, + .lm_others_may_mount = nolock_others_may_mount, + .lm_unmount = nolock_unmount, + .lm_withdraw = nolock_withdraw, + .lm_get_lock = nolock_get_lock, + .lm_put_lock = nolock_put_lock, + .lm_lock = nolock_lock, + .lm_unlock = nolock_unlock, + .lm_cancel = nolock_cancel, + .lm_hold_lvb = nolock_hold_lvb, + .lm_unhold_lvb = nolock_unhold_lvb, + .lm_sync_lvb = nolock_sync_lvb, + .lm_plock_get = nolock_plock_get, + .lm_plock = nolock_plock, + .lm_punlock = nolock_punlock, + .lm_recovery_done = nolock_recovery_done, + .lm_owner = THIS_MODULE, +}; + +static int __init init_nolock(void) +{ + int error; + + error = gfs2_register_lockproto(&nolock_ops); + if (error) { + printk(KERN_WARNING + "lock_nolock: can't register protocol: %d\n", error); + return error; + } + + printk(KERN_INFO + "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); + return 0; +} + +static void __exit exit_nolock(void) +{ + gfs2_unregister_lockproto(&nolock_ops); +} + +module_init(init_nolock); +module_exit(exit_nolock); + +MODULE_DESCRIPTION("GFS Nolock Locking Module"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + --- /dev/null +++ b/fs/gfs2/locking/nolock/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS) += lock_nolock.o +lock_nolock-y := main.o + [PATCH 16/16] GFS2: Exported functions Export the routine for initialising ra_state structure. This allows GFS2 to use the standard kernel readahead when reading its internal files. Also a modification to the read path for direct i/o allows a 0 return from the direct i/o function provided by the filesystem to result in a "normal" buffered i/o. GFS2 uses this in cases where direct i/o isn't possible for some reason (e.g. the file is "stuffed" and this not aligned on disk). Signed-off-by: Steven Whitehouse Signed-off-by: David Teigland mm/filemap.c | 3 ++- mm/readahead.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1181,7 +1181,8 @@ __generic_file_aio_read(struct kiocb *io *ppos = pos + retval; } file_accessed(filp); - goto out; + if (retval != 0) + goto out; } retval = 0; --- a/mm/readahead.c +++ b/mm/readahead.c @@ -38,6 +38,7 @@ file_ra_state_init(struct file_ra_state ra->ra_pages = mapping->backing_dev_info->ra_pages; ra->prev_page = -1; } +EXPORT_SYMBOL_GPL(file_ra_state_init); /* * Return max readahead size for this inode in number-of-pages.