-----Original Message----- From: Cahill, Ben M Sent: Thursday, September 23, 2004 4:12 PM To: RedHat Cluster (linux-cluster@xxxxxxxxxx) Subject: [PATCH] More comments for GFS files Hi all, Below please find a patch for more comments in some files in gfs-kernel/src/gfs: dio.c file.c gfs_ioctl.c incore.h log.c lops.c lvb.h rgrp.c The focus was on incore.h. These were diffed against Thursday's CVS, and I've built and run GFS after applying the patches, so things should hopefully apply cleanly. -- Ben -- Opinions are mine, not Intel's diff -ru cvs/cluster/gfs-kernel/src/gfs/dio.c build_092304/cluster/gfs-kernel/src/gfs/dio.c --- cvs/cluster/gfs-kernel/src/gfs/dio.c 2004-06-24 04:53:27.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/dio.c 2004-09-23 14:18:00.229937128 -0400 @@ -1078,6 +1078,9 @@ * gfs_sync_meta - sync all the buffers in a filesystem * @sdp: the filesystem * + * Flush metadata blocks to on-disk journal, then + * Flush metadata blocks (now in AIL) to on-disk in-place locations + * Periodically keep checking until done (AIL empty) */ void diff -ru cvs/cluster/gfs-kernel/src/gfs/file.c build_092304/cluster/gfs-kernel/src/gfs/file.c --- cvs/cluster/gfs-kernel/src/gfs/file.c 2004-06-24 04:53:27.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/file.c 2004-09-23 14:18:09.964457256 -0400 @@ -199,15 +199,18 @@ char **p = (char **)buf; int error = 0; + /* the dinode block always gets journaled */ if (bh->b_blocknr == ip->i_num.no_addr) { GFS_ASSERT_INODE(!new, ip,); gfs_trans_add_bh(ip->i_gl, bh); memcpy(bh->b_data + offset, *p, size); + /* data blocks get journaled only for special files */ } else if (gfs_is_jdata(ip)) { gfs_trans_add_bh(ip->i_gl, bh); memcpy(bh->b_data + offset, *p, size); if (new) gfs_buffer_clear_ends(bh, offset, size, TRUE); + /* non-journaled data blocks get written to in-place disk blocks */ } else { memcpy(bh->b_data + offset, *p, size); if (new) @@ -240,11 +243,13 @@ char **p = (char **)buf; int error = 0; + /* the 
dinode block always gets journaled */ if (bh->b_blocknr == ip->i_num.no_addr) { GFS_ASSERT_INODE(!new, ip,); gfs_trans_add_bh(ip->i_gl, bh); if (copy_from_user(bh->b_data + offset, *p, size)) error = -EFAULT; + /* data blocks get journaled only for special files */ } else if (gfs_is_jdata(ip)) { gfs_trans_add_bh(ip->i_gl, bh); if (copy_from_user(bh->b_data + offset, *p, size)) @@ -254,6 +259,7 @@ if (error) memset(bh->b_data + offset, 0, size); } + /* non-journaled data blocks get written to in-place disk blocks */ } else { if (copy_from_user(bh->b_data + offset, *p, size)) error = -EFAULT; diff -ru cvs/cluster/gfs-kernel/src/gfs/gfs_ioctl.h build_092304/cluster/gfs-kernel/src/gfs/gfs_ioctl.h --- cvs/cluster/gfs-kernel/src/gfs/gfs_ioctl.h 2004-09-13 18:48:45.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/gfs_ioctl.h 2004-09-23 13:32:21.518284584 -0400 @@ -131,18 +131,21 @@ unsigned int gt_demote_secs; unsigned int gt_incore_log_blocks; unsigned int gt_jindex_refresh_secs; + + /* how often various daemons run (seconds) */ unsigned int gt_depend_secs; - unsigned int gt_scand_secs; - unsigned int gt_recoverd_secs; - unsigned int gt_logd_secs; - unsigned int gt_quotad_secs; - unsigned int gt_inoded_secs; - unsigned int gt_quota_simul_sync; - unsigned int gt_quota_warn_period; + unsigned int gt_scand_secs; /* find unused glocks and inodes */ + unsigned int gt_recoverd_secs; /* recover journal of crashed node */ + unsigned int gt_logd_secs; /* update log tail as AIL flushes */ + unsigned int gt_quotad_secs; /* sync changes to quota file, clean*/ + unsigned int gt_inoded_secs; /* toss unused inodes */ + + unsigned int gt_quota_simul_sync; /* max # quotavals to sync at once */ + unsigned int gt_quota_warn_period; /* secs between quota warn msgs */ unsigned int gt_atime_quantum; - unsigned int gt_quota_quantum; - unsigned int gt_quota_scale_num; - unsigned int gt_quota_scale_den; + unsigned int gt_quota_quantum; /* secs between syncs to quota file */ + unsigned 
int gt_quota_scale_num; /* numerator */ + unsigned int gt_quota_scale_den; /* denominator */ unsigned int gt_quota_enforce; unsigned int gt_quota_account; unsigned int gt_new_files_jdata; diff -ru cvs/cluster/gfs-kernel/src/gfs/incore.h build_092304/cluster/gfs-kernel/src/gfs/incore.h --- cvs/cluster/gfs-kernel/src/gfs/incore.h 2004-09-13 18:48:45.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/incore.h 2004-09-23 14:58:06.330154296 -0400 @@ -11,20 +11,28 @@ ************************************************************************ ******* ************************************************************************ ******/ +/* + * In-core (memory/RAM) structures. + * These do not appear on-disk. See gfs_ondisk.h for on-disk structures. + */ + #ifndef __INCORE_DOT_H__ #define __INCORE_DOT_H__ +/* flags used in function call parameters */ + #define DIO_NEW (0x00000001) -#define DIO_FORCE (0x00000002) -#define DIO_CLEAN (0x00000004) -#define DIO_DIRTY (0x00000008) -#define DIO_START (0x00000010) -#define DIO_WAIT (0x00000020) -#define DIO_METADATA (0x00000040) -#define DIO_DATA (0x00000080) +#define DIO_FORCE (0x00000002) /* force read of block from disk */ +#define DIO_CLEAN (0x00000004) /* don't write to disk */ +#define DIO_DIRTY (0x00000008) /* data changed, must write to disk */ +#define DIO_START (0x00000010) /* start disk read or write */ +#define DIO_WAIT (0x00000020) /* wait for disk r/w to complete */ + +#define DIO_METADATA (0x00000040) /* process glock's protected metadata */ +#define DIO_DATA (0x00000080) /* process glock's protected filedata */ #define DIO_INVISIBLE (0x00000100) -#define DIO_CHECK (0x00000200) -#define DIO_ALL (0x00000400) +#define DIO_CHECK (0x00000200) /* make sure glock's AIL is empty */ +#define DIO_ALL (0x00000400) /* flush all AIL transactions to disk */ /* Structure prototypes */ @@ -98,6 +106,7 @@ void (*lo_after_scan) (struct gfs_sbd * sdp, unsigned int jid, unsigned int pass); + /* type of element 
(glock/buf/unlinked/quota) */ char *lo_name; }; @@ -107,227 +116,351 @@ */ struct gfs_log_element { - struct gfs_log_operations *le_ops; + struct gfs_log_operations *le_ops; /* vector of functions */ - struct gfs_trans *le_trans; - struct list_head le_list; + struct gfs_trans *le_trans; /* we're part of this transaction */ + struct list_head le_list; /* link to transaction's element list */ }; +/* + * Meta-header cache structure. + * One for each metadata block that we've read from disk, and are still using. + * In-core superblock structure hosts the actual cache. + * Also, each resource group keeps a list of cached blocks within its scope. + */ struct gfs_meta_header_cache { - struct list_head mc_list_hash; - struct list_head mc_list_single; - struct list_head mc_list_rgd; + /* Links to various lists */ + struct list_head mc_list_hash; /* superblock's hashed list */ + struct list_head mc_list_single; /* superblock's single list */ + struct list_head mc_list_rgd; /* resource group's list */ - uint64_t mc_block; - struct gfs_meta_header mc_mh; + uint64_t mc_block; /* block # (in-place address) */ + struct gfs_meta_header mc_mh; /* payload: the block's meta-header */ }; +/* + * Dependency cache structure. + * In-core superblock structure hosts the actual cache. + * Also, each resource group keeps a list of dependency blocks within its scope. + */ struct gfs_depend { - struct list_head gd_list_hash; - struct list_head gd_list_rgd; + /* Links to various lists */ + struct list_head gd_list_hash; /* superblock's hashed list */ + struct list_head gd_list_rgd; /* resource group's list */ - struct gfs_rgrpd *gd_rgd; - uint64_t gd_formal_ino; - unsigned long gd_time; + struct gfs_rgrpd *gd_rgd; /* resource group descriptor */ + uint64_t gd_formal_ino; /* inode ID */ + unsigned long gd_time; /* time (jiffies) when put on list */ }; /* - * Structure containing information about the allocation bitmaps. 
- * There are one of these for each fs block that the bitmap for - * the resource group header covers. + * Block allocation bitmap descriptor structure. + * One of these for each fs block that contains bitmap data + * (i.e. the resource group header blocks and their following bitmap blocks). + * Each allocatable fs data block is represented by 2 bits (4 alloc states). */ struct gfs_bitmap { - uint32_t bi_offset; /* The offset in the buffer of the first byte */ - uint32_t bi_start; /* The position of the first byte in this block */ - uint32_t bi_len; /* The number of bytes in this block */ + uint32_t bi_offset; /* Byte offset of bitmap within this bit block + (non-zero only for an rgrp header block) */ + uint32_t bi_start; /* Data block (rgrp scope, 32-bit) represented + by the first bit-pair in this bit block */ + uint32_t bi_len; /* The number of bitmap bytes in this bit block */ }; /* - * Structure containing information Resource Groups + * Resource Group (Rgrp) descriptor structure. + * There is one of these for each resource (block) group in the fs. + * The filesystem is divided into a number of resource groups to allow + * simultaneous block alloc operations by a number of nodes. */ struct gfs_rgrpd { - struct list_head rd_list; /* Link with superblock */ - struct list_head rd_list_mru; - struct list_head rd_recent; /* Recently used rgrps */ + /* Links to superblock lists */ + struct list_head rd_list; /* on-disk-order list of all rgrps */ + struct list_head rd_list_mru; /* Most Recently Used list of all rgs */ + struct list_head rd_recent; /* recently used rgrps */ - struct gfs_glock *rd_gl; /* Glock for rgrp */ + struct gfs_glock *rd_gl; /* Glock for this rgrp */ - unsigned long rd_flags; + unsigned long rd_flags; /* ?? 
*/ - struct gfs_rindex rd_ri; /* Resource Index structure */ - struct gfs_rgrp rd_rg; /* Resource Group structure */ - uint64_t rd_rg_vn; + struct gfs_rindex rd_ri; /* Resource Index (on-disk) structure */ + struct gfs_rgrp rd_rg; /* Resource Group (on-disk) structure */ + uint64_t rd_rg_vn; /* version #: if != glock's gl_vn, + we need to read rgrp fm disk */ - struct gfs_bitmap *rd_bits; - struct buffer_head **rd_bh; + /* Block alloc bitmap cache */ + struct gfs_bitmap *rd_bits; /* Array of block bitmap descriptors */ + struct buffer_head **rd_bh; /* Array of ptrs to block bitmap bh's */ - uint32_t rd_last_alloc_data; - uint32_t rd_last_alloc_meta; + /* Block allocation strategy, rgrp scope. Start at these blocks when + * searching for next data/meta block to alloc */ + uint32_t rd_last_alloc_data; /* most recent data block allocated */ + uint32_t rd_last_alloc_meta; /* most recent meta block allocated */ - struct list_head rd_mhc; - struct list_head rd_depend; + struct list_head rd_mhc; /* cached meta-headers for this rgrp */ + struct list_head rd_depend; /* dependency elements */ - struct gfs_sbd *rd_sbd; + struct gfs_sbd *rd_sbd; /* fs incore superblock (fs instance) */ }; /* * Per-buffer data + * One of these is attached as GFS private data to each fs block's buffer_head. + * These also link into the Active Items Lists (AIL) (buffers flushed to + * on-disk log, but not yet flushed to on-disk in-place locations) attached + * to transactions and glocks. 
*/ struct gfs_bufdata { - struct buffer_head *bd_bh; /* struct buffer_head which this struct belongs to */ - struct gfs_glock *bd_gl; /* Pointer to Glock struct for this bh */ + struct buffer_head *bd_bh; /* we belong to this Linux buffer_head */ + struct gfs_glock *bd_gl; /* this glock protects buffer's payload */ struct gfs_log_element bd_new_le; struct gfs_log_element bd_incore_le; - char *bd_frozen; - struct semaphore bd_lock; + char *bd_frozen; /* "frozen" copy of buffer's data */ + struct semaphore bd_lock; /* protects access to this structure */ - unsigned int bd_pinned; /* Pin count */ - struct list_head bd_ail_tr_list; /* List of buffers hanging off tr_ail_bufs */ - struct list_head bd_ail_gl_list; /* List of buffers hanging off gl_ail_bufs */ + /* "pin" means keep buffer in RAM, don't write to disk (yet) */ + unsigned int bd_pinned; /* recursive pin count */ + struct list_head bd_ail_tr_list; /* link to transaction's AIL list */ + struct list_head bd_ail_gl_list; /* link to glock's AIL list */ }; /* * Glock operations + * One set of operations for each glock, the set selected by type of glock. + * These functions get called at various points in a glock's lifetime. + * "xmote" = promote (lock) a glock at inter-node level. 
+ * "th" = top half, "bh" = bottom half */ struct gfs_glock_operations { + + /* before acquiring a lock at inter-node level */ void (*go_xmote_th) (struct gfs_glock * gl, unsigned int state, int flags); + + /* after acquiring a lock at inter-node level */ void (*go_xmote_bh) (struct gfs_glock * gl); + + /* before releasing a lock at inter-node level, calls go_sync */ void (*go_drop_th) (struct gfs_glock * gl); + + /* after releasing a lock at inter-node level, calls go_inval */ void (*go_drop_bh) (struct gfs_glock * gl); + + /* sync dirty data to disk before releasing an inter-node lock + * (another node needs to read the updated data from disk) */ void (*go_sync) (struct gfs_glock * gl, int flags); + + /* invalidate local data just after releasing an inter-node lock + * (another node may change the on-disk data, so it's no good to us) */ void (*go_inval) (struct gfs_glock * gl, int flags); + + /* lock-type-specific check to see if it's okay to unlock a glock */ int (*go_demote_ok) (struct gfs_glock * gl); + + /* after locking at local process level */ int (*go_lock) (struct gfs_glock * gl, int flags); + + /* before unlocking at local process level */ void (*go_unlock) (struct gfs_glock * gl, int flags); + + /* after receiving a callback: another node needs the lock */ void (*go_callback) (struct gfs_glock * gl, unsigned int state); + void (*go_greedy) (struct gfs_glock * gl); - int go_type; + + /* lock type: locks with same lock # (usually an fs block #), + * but different types, are different locks */ + int go_type; /* glock type */ }; -/* Actions */ -#define HIF_MUTEX (0) -#define HIF_PROMOTE (1) -#define HIF_DEMOTE (2) -#define HIF_GREEDY (3) +/* + * Glock holder structure + * These coordinate the use, within this node, of an acquired inter-node lock. + * One for each holder of a glock. A glock may be shared within a node by + * several processes, or even by several recursive requests from the same + * process. Each is a separate "holder". 
To be shared locally, the glock + * must be in "SHARED" or "DEFERRED" state at inter-node level, which means + * that processes on other nodes might also read the protected entity. + * When a process needs to manipulate a lock, it requests it via one of + * these holder structures. If the request cannot be satisfied immediately, + * the holder structure gets queued on one of these glock lists: + * 1) waiters1, for gaining exclusive access to the glock structure. + * 2) waiters2, for locking (promoting) or unlocking (demoting) a lock. + * This may require changing lock state at inter-node level. + * When holding a lock, gfs_holder struct stays on glock's holder list. + * See gfs-kernel/src/harness/lm_interface.h for gh_state (LM_ST_...) + * and gh_flags (LM_FLAG...) fields. + * Also see glock.h for gh_flags field (GL_...) flags. + */ +/* Action requests */ +#define HIF_MUTEX (0) /* exclusive access to glock struct */ +#define HIF_PROMOTE (1) /* change lock to more restrictive state */ +#define HIF_DEMOTE (2) /* change lock to less restrictive state */ +#define HIF_GREEDY (3) /* States */ -#define HIF_ALLOCED (4) -#define HIF_DEALLOC (5) -#define HIF_HOLDER (6) -#define HIF_FIRST (7) -#define HIF_WAKEUP (8) -#define HIF_RECURSE (9) +#define HIF_ALLOCED (4) /* holder structure is or was in use */ +#define HIF_DEALLOC (5) /* holder structure no longer in use */ +#define HIF_HOLDER (6) /* we have been granted a hold on the lock */ +#define HIF_FIRST (7) /* we are first on glock's holder list */ +#define HIF_WAKEUP (8) /* wake us up when request is satisfied */ +#define HIF_RECURSE (9) /* recursive locks on same glock by same process */ struct gfs_holder { - struct list_head gh_list; + struct list_head gh_list; /* link to one of glock's holder lists */ - struct gfs_glock *gh_gl; - struct task_struct *gh_owner; - unsigned int gh_state; - int gh_flags; - - int gh_error; - unsigned long gh_iflags; - struct completion gh_wait; + struct gfs_glock *gh_gl; /* glock that we're 
holding */ + struct task_struct *gh_owner; /* Linux process that is the holder */ + + /* request to change lock state */ + unsigned int gh_state; /* LM_ST_... requested lock state */ + int gh_flags; /* GL_... or LM_FLAG_... req modifiers */ + + int gh_error; /* GLR_... CANCELLED or TRYFAILED */ + unsigned long gh_iflags; /* HIF_... see above */ + struct completion gh_wait; /* wait for completion of ... */ }; /* * Glock Structure - */ - -#define GLF_PLUG (0) -#define GLF_LOCK (1) -#define GLF_STICKY (2) + * One for each inter-node lock held by this node. + * A glock is a local representation/abstraction of an inter-node lock. + * Inter-node locks are managed by a "lock module" which plugs in to the + * lock harness / glock interface (see gfs-kernel/harness). Different + * lock modules support different lock protocols (e.g. GULM, GDLM, no_lock). + * A glock may have one or more holders within a node. See gfs_holder above. + * Glocks are managed within a hash table hosted by the in-core superblock. + * After all holders have released a glock, it will stay in the hash table + * cache for a certain time (gt_prefetch_secs), during which the inter-node + * lock will not be released unless another node needs the lock. This + * provides better performance in case this node needs the glock again soon. + * Each glock has an associated vector of lock-type-specific "glops" functions + * which are called at important times during the life of a glock, and + * which define the type of lock (e.g. dinode, rgrp, non-disk, etc). + * See gfs_glock_operations above. + * A glock, at inter-node scope, is identified by the following dimensions: + * 1) lock number (usually a block # for on-disk protected entities, + * or a fixed assigned number for non-disk locks, e.g. MOUNT). + * 2) lock type (actually, the type of entity protected by the lock). + * 3) lock namespace, to support multiple GFS filesystems simultaneously. + * Namespace (usually cluster:filesystem) is specified when mounting. 
+ * See man page for gfs_mount. + * Glocks require support of Lock Value Blocks (LVBs) by the inter-node lock + * manager. LVBs are small (32-byte) chunks of data associated with a given + * lock, that can be quickly shared between cluster nodes. Used for certain + * purposes such as sharing an rgroup's block usage statistics without + * requiring the overhead of: + * -- sync-to-disk by one node, then a + * -- read from disk by another node. + * + */ + +#define GLF_PLUG (0) /* dummy */ +#define GLF_LOCK (1) /* exclusive access to glock structure */ +#define GLF_STICKY (2) /* permanent lock, used sparingly */ #define GLF_PREFETCH (3) #define GLF_SYNC (4) #define GLF_DIRTY (5) -#define GLF_LVB_INVALID (6) +#define GLF_LVB_INVALID (6) /* LVB does not contain valid data */ #define GLF_SKIP_WAITERS2 (7) #define GLF_GREEDY (8) struct gfs_glock { - struct list_head gl_list; - unsigned long gl_flags; - struct lm_lockname gl_name; - atomic_t gl_count; - - spinlock_t gl_spin; - - unsigned int gl_state; - struct list_head gl_holders; - struct list_head gl_waiters1; /* HIF_MUTEX */ - struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ - struct list_head gl_waiters3; /* HIF_PROMOTE */ + struct list_head gl_list; /* link to superblock's hash table */ + unsigned long gl_flags; /* GLF_... see above */ + struct lm_lockname gl_name; /* lock number and lock type */ + atomic_t gl_count; /* recursive access/usage count */ + + spinlock_t gl_spin; /* protects some members of this struct */ + + /* lock state reflects inter-node manager's lock state */ + unsigned int gl_state; /* LM_ST_... see harness/lm_interface.h */ + + /* lists of gfs_holders */ + struct list_head gl_holders; /* all current holders of the glock */ + struct list_head gl_waiters1; /* wait for excl. 
access to glock struct*/ + struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ + struct list_head gl_waiters3; /* HIF_PROMOTE */ - struct gfs_glock_operations *gl_ops; + struct gfs_glock_operations *gl_ops; /* function vector, defines type */ struct gfs_holder *gl_req_gh; gfs_glop_bh_t gl_req_bh; - lm_lock_t *gl_lock; - char *gl_lvb; - atomic_t gl_lvb_count; - - uint64_t gl_vn; - unsigned long gl_stamp; - void *gl_object; + lm_lock_t *gl_lock; /* lock module's private lock data */ + char *gl_lvb; /* Lock Value Block */ + atomic_t gl_lvb_count; /* LVB recursive usage (hold/unhold) count */ + + uint64_t gl_vn; /* incremented when protected data changes */ + unsigned long gl_stamp; /* glock cache retention timer */ + void *gl_object; /* the protected entity (e.g. a dinode) */ struct gfs_log_element gl_new_le; struct gfs_log_element gl_incore_le; - struct gfs_gl_hash_bucket *gl_bucket; - struct list_head gl_reclaim; + struct gfs_gl_hash_bucket *gl_bucket; /* our bucket in hash table */ + struct list_head gl_reclaim; /* link to "reclaim" list */ - struct gfs_sbd *gl_sbd; + struct gfs_sbd *gl_sbd; /* superblock (fs instance) */ - struct inode *gl_aspace; - struct list_head gl_dirty_buffers; - struct list_head gl_ail_bufs; + struct inode *gl_aspace; /* Linux VFS inode */ + struct list_head gl_dirty_buffers; /* ?? */ + struct list_head gl_ail_bufs; /* AIL buffers protected by us */ }; /* * In-Place Reservation structure + * Coordinates allocation of "in-place" (as opposed to journal) fs blocks, + * which contain persistent inode/file/directory data and metadata. + * These blocks are the allocatable blocks within resource groups (i.e. + * not including rgrp header and block alloc bitmap blocks). + * gfs_inplace_reserve() calculates a fulfillment plan for allocating blocks, + * based on block statistics in the resource group headers. + * Then, gfs_blkalloc() or gfs_metaalloc() walks the block alloc bitmaps + * to do the actual allocation. 
*/ struct gfs_alloc { - /* Quota stuff */ - - unsigned int al_qd_num; - struct gfs_quota_data *al_qd[4]; - struct gfs_holder al_qd_ghs[4]; - - /* Filled in by the caller to gfs_inplace_reserve() */ - - uint32_t al_requested_di; - uint32_t al_requested_meta; - uint32_t al_requested_data; - - /* Filled in by gfs_inplace_reserve() */ - - char *al_file; - unsigned int al_line; - struct gfs_holder al_ri_gh; - struct gfs_holder al_rgd_gh; - struct gfs_rgrpd *al_rgd; - uint32_t al_reserved_meta; - uint32_t al_reserved_data; - - /* Filled in by gfs_blkalloc() */ - - uint32_t al_alloced_di; - uint32_t al_alloced_meta; - uint32_t al_alloced_data; + /* + * Up to 4 quotas (including an inode's user and group quotas) + * can track changes in block allocation + */ + + unsigned int al_qd_num; /* # of quotas tracking changes */ + struct gfs_quota_data *al_qd[4]; /* ptrs to quota structures */ + struct gfs_holder al_qd_ghs[4]; /* holders for quota glocks */ + + /* Request, filled in by the caller to gfs_inplace_reserve() */ + + uint32_t al_requested_di; /* number of dinodes to reserve */ + uint32_t al_requested_meta; /* number of metadata blocks to reserve */ + uint32_t al_requested_data; /* number of data blocks to reserve */ + + /* Fulfillment plan, filled in by gfs_inplace_reserve() */ + + char *al_file; /* debug info, .c file making request */ + unsigned int al_line; /* debug info, line of code making req */ + struct gfs_holder al_ri_gh; /* glock holder for resource grp index */ + struct gfs_holder al_rgd_gh; /* glock holder for al_rgd rgrp */ + struct gfs_rgrpd *al_rgd; /* resource group from which to alloc */ + uint32_t al_reserved_meta; /* alloc this # meta blocks from al_rgd */ + uint32_t al_reserved_data; /* alloc this # data blocks from al_rgd */ + + /* Actual alloc, filled in by gfs_blkalloc()/gfs_metaalloc(), etc. 
*/ + + uint32_t al_alloced_di; /* # dinode blocks allocated */ + uint32_t al_alloced_meta; /* # meta blocks allocated */ + uint32_t al_alloced_data; /* # data blocks allocated */ /* Dinode allocation crap */ - struct gfs_unlinked *al_ul; + struct gfs_unlinked *al_ul; /* unlinked dinode log entry */ }; /* @@ -339,27 +472,32 @@ #define GIF_SW_PAGED (2) struct gfs_inode { - struct gfs_inum i_num; + struct gfs_inum i_num; /* formal inode # and block address */ - atomic_t i_count; - unsigned long i_flags; + atomic_t i_count; /* recursive usage (get/put) count */ + unsigned long i_flags; /* GIF_... see above */ - uint64_t i_vn; - struct gfs_dinode i_di; + uint64_t i_vn; /* version #: if different from glock's vn, + we need to read inode from disk */ + struct gfs_dinode i_di; /* dinode (on-disk) structure */ - struct gfs_glock *i_gl; - struct gfs_sbd *i_sbd; - struct inode *i_vnode; + struct gfs_glock *i_gl; /* this glock protects this inode */ + struct gfs_sbd *i_sbd; /* superblock (fs instance structure) */ + struct inode *i_vnode; /* Linux VFS inode structure */ - struct gfs_holder i_iopen_gh; + struct gfs_holder i_iopen_gh; /* glock holder for # inode opens lock */ - struct gfs_alloc *i_alloc; - uint64_t i_last_rg_alloc; + /* block allocation strategy, inode scope */ + struct gfs_alloc *i_alloc; /* in-place block reservation structure */ + uint64_t i_last_rg_alloc; /* most recnt block alloc was fm this rgrp */ - struct task_struct *i_creat_task; - pid_t i_creat_pid; + /* Linux process that originally created this inode */ + struct task_struct *i_creat_task; /* Linux "current" task struct */ + pid_t i_creat_pid; /* Linux process ID current->pid */ - spinlock_t i_lock; + spinlock_t i_lock; /* protects this structure */ + + /* cache of most-recently used buffers in indirect addressing chain */ struct buffer_head *i_cache[GFS_MAX_META_HEIGHT]; unsigned int i_greedy; @@ -378,8 +516,8 @@ struct semaphore f_fl_lock; struct gfs_holder f_fl_gh; - struct gfs_inode *f_inode; - 
struct file *f_vfile; + struct gfs_inode *f_inode; /* incore GFS inode */ + struct file *f_vfile; /* Linux file struct */ }; /* @@ -393,112 +531,143 @@ #define ULF_LOCK (4) struct gfs_unlinked { - struct list_head ul_list; - unsigned int ul_count; + struct list_head ul_list; /* link to superblock's sd_unlinked_list */ + unsigned int ul_count; /* usage count */ - struct gfs_inum ul_inum; - unsigned long ul_flags; + struct gfs_inum ul_inum; /* formal inode #, block addr */ + unsigned long ul_flags; /* ULF_... */ - struct gfs_log_element ul_new_le; - struct gfs_log_element ul_incore_le; - struct gfs_log_element ul_ondisk_le; + struct gfs_log_element ul_new_le; /* new, not yet committed */ + struct gfs_log_element ul_incore_le; /* committed to incore log */ + struct gfs_log_element ul_ondisk_le; /* committed to ondisk log */ }; /* * Quota log element + * One for each logged change in a block alloc value affecting a given quota. + * Only one of these for a given quota within a given transaction; + * multiple changes, within one transaction, for a given quota will be + * combined into one log element. */ struct gfs_quota_le { - struct gfs_log_element ql_le; + /* Log element maps us to a particular set of log operations functions, + * and to a particular transaction */ + struct gfs_log_element ql_le; /* generic log element structure */ - struct gfs_quota_data *ql_data; - struct list_head ql_data_list; + struct gfs_quota_data *ql_data; /* the quota we're changing */ + struct list_head ql_data_list; /* link to quota's log element list */ - int64_t ql_change; + int64_t ql_change; /* # of blocks alloc'd (+) or freed (-) */ }; -#define QDF_USER (0) -#define QDF_OD_LIST (1) -#define QDF_LOCK (2) +/* + * Quota structure + * One for each user or group quota. 
+ * Summarizes all block allocation activity for a given quota, and supports + * recording updates of current block alloc values in GFS' special quota + * file, including the journaling of these updates, encompassing + * multiple transactions and log dumps. + */ + +#define QDF_USER (0) /* user (1) vs. group (0) quota */ +#define QDF_OD_LIST (1) /* waiting for sync to quota file */ +#define QDF_LOCK (2) /* protects access to this structure */ struct gfs_quota_data { - struct list_head qd_list; - unsigned int qd_count; + struct list_head qd_list; /* Link to superblock's sd_quota_list */ + unsigned int qd_count; /* usage/reference count */ - uint32_t qd_id; - unsigned long qd_flags; + uint32_t qd_id; /* user or group ID number */ + unsigned long qd_flags; /* QDF_... */ - struct list_head qd_le_list; + /* this list is for non-log-dump transactions */ + struct list_head qd_le_list; /* List of gfs_quota_le log elements */ - int64_t qd_change_new; - int64_t qd_change_ic; - int64_t qd_change_od; - int64_t qd_change_sync; + /* summary of block alloc changes affecting this quota, in various + * stages of logging & syncing changes to the special quota file */ + int64_t qd_change_new; /* new, not yet committed to in-core log*/ + int64_t qd_change_ic; /* committed to in-core log */ + int64_t qd_change_od; /* committed to on-disk log */ + int64_t qd_change_sync; /* being synced to the in-place quota file */ - struct gfs_quota_le qd_ondisk_ql; - uint64_t qd_sync_gen; + struct gfs_quota_le qd_ondisk_ql; /* log element for log dump */ + uint64_t qd_sync_gen; /* sync-to-quota-file generation # */ - struct gfs_glock *qd_gl; - struct gfs_quota_lvb qd_qb; + /* glock provides protection for quota, *and* provides + * lock value block (LVB) communication, between nodes, of current + * quota values. Shared lock -> LVB read. EX lock -> LVB write. 
*/ + struct gfs_glock *qd_gl; /* glock for this quota */ + struct gfs_quota_lvb qd_qb; /* LVB (limit/warn/value) */ - unsigned long qd_last_warn; + unsigned long qd_last_warn; /* jiffies of last warning to user */ }; +/* + * Log Buffer descriptor structure + * One for each fs block buffer recorded in the log + */ struct gfs_log_buf { - struct list_head lb_list; + /* link to one of the transaction structure's lists */ + struct list_head lb_list; /* link to tr_free_bufs or tr_list */ struct buffer_head lb_bh; struct buffer_head *lb_unlock; }; /* - * Transaction structures + * Transaction structure + * One for each transaction + * This coordinates the logging and flushing of written metadata. */ #define TRF_LOG_DUMP (0x00000001) struct gfs_trans { - struct list_head tr_list; + + /* link to various lists */ + struct list_head tr_list; /* superblk's incore trans or AIL list*/ /* Initial creation stuff */ - char *tr_file; - unsigned int tr_line; + char *tr_file; /* debug info: .c file creating trans */ + unsigned int tr_line; /* debug info: codeline creating trans */ - unsigned int tr_mblks_asked; /* Number of log blocks asked to be reserved */ - unsigned int tr_eblks_asked; - unsigned int tr_seg_reserved; /* Number of segments reserved */ + /* reservations for on-disk space in journal */ + unsigned int tr_mblks_asked; /* # of meta log blocks requested */ + unsigned int tr_eblks_asked; /* # of extra log blocks requested */ + unsigned int tr_seg_reserved; /* # of segments actually reserved */ - struct gfs_holder *tr_t_gh; + struct gfs_holder *tr_t_gh; /* glock holder for this transaction */ /* Stuff filled in during creation */ - unsigned int tr_flags; - struct list_head tr_elements; + unsigned int tr_flags; /* TRF_... 
*/ + struct list_head tr_elements; /* List of this trans' log elements */ /* Stuff modified during the commit */ - unsigned int tr_num_free_bufs; + unsigned int tr_num_free_bufs; /* List of free gfs_log_buf structs */ struct list_head tr_free_bufs; - unsigned int tr_num_free_bmem; + unsigned int tr_num_free_bmem; /* List of free fs-block-size buffers */ struct list_head tr_free_bmem; - uint64_t tr_log_head; /* The current log head */ - uint64_t tr_first_head; /* First header block */ + uint64_t tr_log_head; /* The current log head */ + uint64_t tr_first_head; /* First header block */ - struct list_head tr_bufs; /* List of buffers going to the log */ + struct list_head tr_bufs; /* List of buffers going to the log */ - /* Stuff that's part of the AIL */ + /* Stuff that's part of the Active Items List (AIL) */ - struct list_head tr_ail_bufs; + struct list_head tr_ail_bufs; /* List of buffers on AIL list */ - /* Private data for different log element types */ + /* # log elements of various types on tr_elements list */ - unsigned int tr_num_gl; - unsigned int tr_num_buf; - unsigned int tr_num_iul; - unsigned int tr_num_ida; - unsigned int tr_num_q; + unsigned int tr_num_gl; /* glocks */ + unsigned int tr_num_buf; /* buffers */ + unsigned int tr_num_iul; /* unlinked inodes */ + unsigned int tr_num_ida; /* de-allocated inodes */ + unsigned int tr_num_q; /* quotas */ }; /* @@ -511,153 +680,201 @@ } __attribute__ ((__aligned__(SMP_CACHE_BYTES))); /* - * Super Block Data Structure (One per filesystem) - */ + * "Super Block" Data Structure + * One per mounted filesystem. + * This is the big instance structure that ties everything together for + * a given mounted filesystem. Each GFS mount has its own, supporting + * mounts of multiple GFS filesystems on each node. + * Pointer to this is usually seen as "sdp" throughout code. + * This is a very large structure, as structures go, in part because it + * contains arrays of hash buckets for various in-core caches. 
+ */ + +/* sd_flags */ + +#define SDF_JOURNAL_LIVE (0) /* journaling is active (fs is writeable)*/ + +/* daemon run (1) / stop (0) flags */ +#define SDF_SCAND_RUN (1) /* put unused glocks on reclaim queue */ +#define SDF_GLOCKD_RUN (2) /* reclaim (dealloc) unused glocks */ +#define SDF_RECOVERD_RUN (3) /* recover journal of a crashed node */ +#define SDF_LOGD_RUN (4) /* update log tail after AIL flushed */ +#define SDF_QUOTAD_RUN (5) /* sync quota changes to file, cleanup */ +#define SDF_INODED_RUN (6) /* deallocate unlinked inodes */ + +/* (re)mount options from Linux VFS */ +#define SDF_NOATIME (7) /* don't change access time */ +#define SDF_ROFS (8) /* read-only mode (no journal) */ -#define SDF_JOURNAL_LIVE (0) -#define SDF_SCAND_RUN (1) -#define SDF_GLOCKD_RUN (2) -#define SDF_RECOVERD_RUN (3) -#define SDF_LOGD_RUN (4) -#define SDF_QUOTAD_RUN (5) -#define SDF_INODED_RUN (6) -#define SDF_NOATIME (7) -#define SDF_ROFS (8) +/* journal log dump support */ #define SDF_NEED_LOG_DUMP (9) #define SDF_FOUND_UL_DUMP (10) #define SDF_FOUND_Q_DUMP (11) -#define SDF_IN_LOG_DUMP (12) +#define SDF_IN_LOG_DUMP (12) /* serializes log dumps */ + -#define GFS_GL_HASH_SHIFT (13) +/* constants for various in-core caches */ + +/* glock cache */ +#define GFS_GL_HASH_SHIFT (13) /* # hash buckets = 8K */ #define GFS_GL_HASH_SIZE (1 << GFS_GL_HASH_SHIFT) #define GFS_GL_HASH_MASK (GFS_GL_HASH_SIZE - 1) -#define GFS_MHC_HASH_SHIFT (10) +/* meta header cache */ +#define GFS_MHC_HASH_SHIFT (10) /* # hash buckets = 1K */ #define GFS_MHC_HASH_SIZE (1 << GFS_MHC_HASH_SHIFT) #define GFS_MHC_HASH_MASK (GFS_MHC_HASH_SIZE - 1) -#define GFS_DEPEND_HASH_SHIFT (10) +/* dependency cache */ +#define GFS_DEPEND_HASH_SHIFT (10) /* # hash buckets = 1K */ #define GFS_DEPEND_HASH_SIZE (1 << GFS_DEPEND_HASH_SHIFT) #define GFS_DEPEND_HASH_MASK (GFS_DEPEND_HASH_SIZE - 1) struct gfs_sbd { - struct gfs_sb sd_sb; /* Super Block */ + struct gfs_sb sd_sb; /* GFS on-disk Super Block image */ - struct super_block 
*sd_vfs; /* FS's device independent sb */ + struct super_block *sd_vfs; /* Linux VFS device independent sb */ - struct gfs_args sd_args; - unsigned long sd_flags; + struct gfs_args sd_args; /* Mount arguments */ + unsigned long sd_flags; /* SDF_... see above */ - struct gfs_tune sd_tune; /* FS tuning structure */ + struct gfs_tune sd_tune; /* Filesystem tuning structure */ /* Resource group stuff */ - struct gfs_inode *sd_riinode; /* rindex inode */ - uint64_t sd_riinode_vn; /* Version number of the resource index inode */ - - struct list_head sd_rglist; /* List of resource groups */ - struct semaphore sd_rindex_lock; - - struct list_head sd_rg_mru_list; /* List of resource groups in MRU order */ - spinlock_t sd_rg_mru_lock; /* Lock for MRU list */ - struct list_head sd_rg_recent; /* Recently used rgrps */ - spinlock_t sd_rg_recent_lock; - struct gfs_rgrpd *sd_rg_forward; /* Next new rgrp to try for allocation */ - spinlock_t sd_rg_forward_lock; + struct gfs_inode *sd_riinode; /* Resource Index (rindex) inode */ + uint64_t sd_riinode_vn; /* Resource Index version # (detects + whether new rgrps have been added) */ + + struct list_head sd_rglist; /* List of all resource groups, */ + struct semaphore sd_rindex_lock;/* on-disk order */ + struct list_head sd_rg_mru_list;/* List of resource groups, */ + spinlock_t sd_rg_mru_lock; /* most-recently-used (MRU) order */ + struct list_head sd_rg_recent; /* List of rgrps from which blocks */ + spinlock_t sd_rg_recent_lock; /* were recently allocated */ + struct gfs_rgrpd *sd_rg_forward;/* Next rgrp from which to attempt */ + spinlock_t sd_rg_forward_lock; /* a block alloc */ - unsigned int sd_rgcount; /* Count of resource groups */ + unsigned int sd_rgcount; /* Total # of resource groups */ /* Constants computed on mount */ - uint32_t sd_fsb2bb; - uint32_t sd_fsb2bb_shift; /* Shift FS Block numbers to the left by - this to get buffer cache blocks */ - uint32_t sd_diptrs; /* Number of pointers in a dinode */ - uint32_t 
sd_inptrs; /* Number of pointers in a indirect block */ - uint32_t sd_jbsize; /* Size of a journaled data block */ - uint32_t sd_hash_bsize; /* sizeof(exhash block) */ + /* "bb" == "basic block" == 512Byte sector */ + uint32_t sd_fsb2bb; /* # 512B basic blocks in a FS block */ + uint32_t sd_fsb2bb_shift; /* Shift sector # to the right by + this to get FileSystem block addr */ + uint32_t sd_diptrs; /* Max # of block pointers in a dinode */ + uint32_t sd_inptrs; /* Max # of block pointers in an indirect blk */ + uint32_t sd_jbsize; /* Payload size (bytes) of a journaled metadata + block (GFS journals all meta blocks) */ + uint32_t sd_hash_bsize; /* sizeof(exhash block) */ uint32_t sd_hash_bsize_shift; - uint32_t sd_hash_ptrs; /* Number of points in a hash block */ - uint32_t sd_max_dirres; /* Maximum space needed to add a directory entry */ - uint32_t sd_max_height; /* Maximum height of a file's metadata tree */ + uint32_t sd_hash_ptrs; /* Number of pointers in a hash block */ + uint32_t sd_max_dirres; /* Max blocks needed to add a directory entry */ + uint32_t sd_max_height; /* Max height of a file's indir addr tree */ uint64_t sd_heightsize[GFS_MAX_META_HEIGHT]; - uint32_t sd_max_jheight; /* Maximum height of a journaled file's metadata tree */ + uint32_t sd_max_jheight; /* Max hgt, journaled file's indir addr tree */ uint64_t sd_jheightsize[GFS_MAX_META_HEIGHT]; /* Lock Stuff */ + /* glock cache (all glocks currently held by this node for this fs) */ struct gfs_gl_hash_bucket sd_gl_hash[GFS_GL_HASH_SIZE]; - struct list_head sd_reclaim_list; + /* glock reclaim support for scand and glockd */ + struct list_head sd_reclaim_list; /* list of glocks to reclaim */ spinlock_t sd_reclaim_lock; wait_queue_head_t sd_reclaim_wchan; - atomic_t sd_reclaim_count; + atomic_t sd_reclaim_count; /* # glocks on reclaim list */ - struct lm_lockstruct sd_lockstruct; + /* lock module tells us if we're first-to-mount, + * which journal to use, etc. 
*/ + struct lm_lockstruct sd_lockstruct; /* info provided by lock module */ - struct list_head sd_mhc[GFS_MHC_HASH_SIZE]; - struct list_head sd_mhc_single; + /* Other caches */ + + /* meta-header cache (incore copies of on-disk meta headers)*/ + struct list_head sd_mhc[GFS_MHC_HASH_SIZE]; /* hash buckets */ + struct list_head sd_mhc_single; /* non-hashed list of all MHCs */ spinlock_t sd_mhc_lock; - atomic_t sd_mhc_count; + atomic_t sd_mhc_count; /* # MHCs in cache */ - struct list_head sd_depend[GFS_DEPEND_HASH_SIZE]; + /* dependency cache */ + struct list_head sd_depend[GFS_DEPEND_HASH_SIZE]; /* hash buckets */ spinlock_t sd_depend_lock; - atomic_t sd_depend_count; + atomic_t sd_depend_count; /* # dependencies in cache */ - struct gfs_holder sd_live_gh; + /* LIVE inter-node lock indicates that fs is mounted on at least + * one node */ + struct gfs_holder sd_live_gh; /* glock holder for LIVE lock */ + /* for quiescing the filesystem */ struct gfs_holder sd_freeze_gh; struct semaphore sd_freeze_lock; unsigned int sd_freeze_count; /* Inode Stuff */ - struct gfs_inode *sd_rooti; /* FS's root inode */ + struct gfs_inode *sd_rooti; /* FS's root inode */ - struct gfs_glock *sd_rename_gl; /* rename glock */ + /* only 1 node at a time may rename (e.g. 
mv) a file or dir */ + struct gfs_glock *sd_rename_gl; /* rename glock */ /* Daemon stuff */ - struct task_struct *sd_scand_process; - unsigned int sd_glockd_num; + /* scan for glocks and inodes to toss from memory */ + struct task_struct *sd_scand_process; /* scand places on reclaim list*/ + unsigned int sd_glockd_num; /* # of glockd procs to do reclaiming*/ + + /* recover journal of a crashed node */ struct task_struct *sd_recoverd_process; + + /* update log tail as AIL gets flushed to in-place on-disk blocks */ struct task_struct *sd_logd_process; + + /* sync quota updates to disk, and clean up unused quota structs */ struct task_struct *sd_quotad_process; + + /* clean up unused inode structures */ struct task_struct *sd_inoded_process; + /* support for starting/stopping daemons */ struct semaphore sd_thread_lock; struct completion sd_thread_completion; /* Log stuff */ - struct gfs_glock *sd_trans_gl; /* transaction glock */ + /* transaction lock protects journal replay (recovery) */ + struct gfs_glock *sd_trans_gl; /* transaction glock structure */ - struct gfs_inode *sd_jiinode; /* jindex inode */ - uint64_t sd_jiinode_vn; /* Version number of the journal index inode */ + struct gfs_inode *sd_jiinode; /* journal index inode */ + uint64_t sd_jiinode_vn; /* journal index version # (detects + if new journals have been added) */ unsigned int sd_journals; /* Number of journals in the FS */ - struct gfs_jindex *sd_jindex; /* Array of Jindex structures describing this FS's journals */ + struct gfs_jindex *sd_jindex; /* Array of journal descriptors */ struct semaphore sd_jindex_lock; - unsigned long sd_jindex_refresh_time; + unsigned long sd_jindex_refresh_time; /* poll for new journals (secs) */ - struct gfs_jindex sd_jdesc; /* Jindex structure describing this machine's journal */ - struct gfs_holder sd_journal_gh; /* the glock for this machine's journal */ + struct gfs_jindex sd_jdesc; /* this machine's journal descriptor */ + struct gfs_holder sd_journal_gh; /* 
this machine's journal glock */ uint64_t sd_sequence; /* Assigned to xactions in order they commit */ uint64_t sd_log_head; /* Block number of next journal write */ uint64_t sd_log_wrap; spinlock_t sd_log_seg_lock; - unsigned int sd_log_seg_free; /* Free segments in the log */ + unsigned int sd_log_seg_free; /* # of free segments in the log */ struct list_head sd_log_seg_list; wait_queue_head_t sd_log_seg_wait; - struct list_head sd_log_ail; /* struct gfs_trans structures that form the Active Items List - "next" is the head, "prev" is the tail */ - - struct list_head sd_log_incore; /* transactions that have been commited incore (but not ondisk) - "next" is the newest, "prev" is the oldest */ - unsigned int sd_log_buffers; /* Number of buffers in the incore log */ + /* "Active Items List" of transactions that have been flushed to + * on-disk log, and are waiting for flush to in-place on-disk blocks */ + struct list_head sd_log_ail; /* "next" is head, "prev" is tail */ + + /* Transactions committed incore, but not yet flushed to on-disk log */ + struct list_head sd_log_incore; /* "next" is newest, "prev" is oldest */ + unsigned int sd_log_buffers; /* # of buffers in the incore log */ struct semaphore sd_log_lock; /* Lock for access to log values */ @@ -674,16 +891,17 @@ /* quota crap */ - struct list_head sd_quota_list; + struct list_head sd_quota_list; /* list of all gfs_quota_data structs */ spinlock_t sd_quota_lock; - atomic_t sd_quota_count; - atomic_t sd_quota_od_count; + atomic_t sd_quota_count; /* # quotas on sd_quota_list */ + atomic_t sd_quota_od_count; /* # quotas waiting for sync to + special on-disk quota file */ - struct gfs_inode *sd_qinode; + struct gfs_inode *sd_qinode; /* special on-disk quota file */ - uint64_t sd_quota_sync_gen; - unsigned long sd_quota_sync_time; + uint64_t sd_quota_sync_gen; /* generation, incr when sync to file */ + unsigned long sd_quota_sync_time; /* jiffies, last sync to quota file */ /* license crap */ diff -ru 
cvs/cluster/gfs-kernel/src/gfs/log.c build_092304/cluster/gfs-kernel/src/gfs/log.c --- cvs/cluster/gfs-kernel/src/gfs/log.c 2004-07-12 15:22:44.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/log.c 2004-09-23 14:18:29.406501616 -0400 @@ -134,7 +134,8 @@ /** * gfs_ail_start - Start I/O on the AIL * @sdp: the filesystem - * @flags: + * @flags: DIO_ALL -- flush *all* AIL transactions to disk + * default -- flush first-on-list AIL transaction to disk * */ @@ -1207,7 +1208,7 @@ LO_CLEAN_DUMP(sdp, le); } - /* If there isn't anything the AIL, we won't get back the log + /* If there isn't anything in the AIL, we won't get back the log space we reserved unless we do it ourselves. */ if (list_empty(&sdp->sd_log_ail)) { diff -ru cvs/cluster/gfs-kernel/src/gfs/lops.c build_092304/cluster/gfs-kernel/src/gfs/lops.c --- cvs/cluster/gfs-kernel/src/gfs/lops.c 2004-06-24 04:53:28.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/lops.c 2004-09-23 14:18:41.725628824 -0400 @@ -442,6 +442,13 @@ * @blkno: the location of the log's copy of the block * * Returns: 0 on success, -EXXX on failure + * + * Read in-place block from disk + * Read log (journal) block from disk + * Compare generation numbers + * Copy log block to in-place block on-disk if: + * log generation # > in-place generation # + * OR generation #s are ==, but data contained in block is different (corrupt) */ static int diff -ru cvs/cluster/gfs-kernel/src/gfs/lvb.h build_092304/cluster/gfs-kernel/src/gfs/lvb.h --- cvs/cluster/gfs-kernel/src/gfs/lvb.h 2004-06-24 04:53:28.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/lvb.h 2004-09-23 14:19:09.962336192 -0400 @@ -11,26 +11,44 @@ ************************************************************************ ******* ************************************************************************ ******/ +/* + * Formats of Lock Value Blocks (LVBs) for various types of locks. 
+ * These 32-byte data chunks can be shared quickly between nodes + * via the inter-node lock manager (via LAN instead of on-disk). + */ + #ifndef __LVB_DOT_H__ #define __LVB_DOT_H__ #define GFS_MIN_LVB_SIZE (32) +/* + * Resource Group block allocation statistics + * Each resource group lock contains one of these in its LVB. + * Used for sharing approximate current statistics for statfs. + * Not used for actual block allocation. + */ struct gfs_rgrp_lvb { - uint32_t rb_magic; - uint32_t rb_free; - uint32_t rb_useddi; - uint32_t rb_freedi; - uint32_t rb_usedmeta; - uint32_t rb_freemeta; + uint32_t rb_magic; /* GFS_MAGIC sanity check value */ + uint32_t rb_free; /* # free data blocks */ + uint32_t rb_useddi; /* # used dinode blocks */ + uint32_t rb_freedi; /* # free dinode blocks */ + uint32_t rb_usedmeta; /* # used metadata blocks */ + uint32_t rb_freemeta; /* # free metadata blocks */ }; +/* + * Quota + * Each quota lock contains one of these in its LVB. + * Keeps track of block allocation limits and current block allocation + * for either a cluster-wide user or a cluster-wide group. 
+ */ struct gfs_quota_lvb { - uint32_t qb_magic; + uint32_t qb_magic; /* GFS_MAGIC sanity check value */ uint32_t qb_pad; - uint64_t qb_limit; - uint64_t qb_warn; - int64_t qb_value; + uint64_t qb_limit; /* hard limit of # blocks to alloc */ + uint64_t qb_warn; /* warn user when alloc is above this # */ + int64_t qb_value; /* current # blocks allocated */ }; /* Translation functions */ diff -ru cvs/cluster/gfs-kernel/src/gfs/rgrp.c build_092304/cluster/gfs-kernel/src/gfs/rgrp.c --- cvs/cluster/gfs-kernel/src/gfs/rgrp.c 2004-06-24 04:53:28.000000000 -0400 +++ build_092304/cluster/gfs-kernel/src/gfs/rgrp.c 2004-09-23 14:18:56.703351864 -0400 @@ -372,6 +372,7 @@ memset(count, 0, 4 * sizeof(uint32_t)); + /* count # blocks in each of 4 possible allocation states */ for (buf = 0; buf < length; buf++) { bits = &rgd->rd_bits[buf]; for (x = 0; x < 4; x++) @@ -531,6 +532,7 @@ * gfs_compute_bitstructs - Compute the bitmap sizes * @rgd: The resource group descriptor * + * Calculates bitmap descriptors, one for each block that contains bitmap data */ static void @@ -538,7 +540,7 @@ { struct gfs_sbd *sdp = rgd->rd_sbd; struct gfs_bitmap *bits; - uint32_t length = rgd->rd_ri.ri_length; + uint32_t length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */ uint32_t bytes_left, bytes; int x; @@ -550,21 +552,25 @@ for (x = 0; x < length; x++) { bits = &rgd->rd_bits[x]; + /* small rgrp; bitmap stored completely in header block */ if (length == 1) { bytes = bytes_left; bits->bi_offset = sizeof(struct gfs_rgrp); bits->bi_start = 0; bits->bi_len = bytes; + /* header block */ } else if (x == 0) { bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs_rgrp); bits->bi_offset = sizeof(struct gfs_rgrp); bits->bi_start = 0; bits->bi_len = bytes; + /* last block */ } else if (x + 1 == length) { bytes = bytes_left; bits->bi_offset = sizeof(struct gfs_meta_header); bits->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; bits->bi_len = bytes; + /* other blocks */ } else { bytes = sdp->sd_sb.sb_bsize - 
sizeof(struct gfs_meta_header); bits->bi_offset = sizeof(struct gfs_meta_header); @@ -855,10 +861,12 @@ * @rgd: the RG data * @al: the struct gfs_alloc structure describing the reservation * - * Sets the $ir_datares field in @res. - * Sets the $ir_metares field in @res. + * If there's room for the requested blocks to be allocated from the RG: + * Sets the $al_reserved_data field in @al. + * Sets the $al_reserved_meta field in @al. + * Sets the $al_rgd field in @al. * - * Returns: 1 on success, 0 on failure + * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) */ static int @@ -900,7 +908,7 @@ } /** - * recent_rgrp_first - get first RG from recent list + * recent_rgrp_first - get first RG from "recent" list * @sdp: The GFS superblock * @rglast: address of the rgrp used last * @@ -939,7 +947,7 @@ } /** - * recent_rgrp_next - get next RG from recent list + * recent_rgrp_next - get next RG from "recent" list * @cur_rgd: current rgrp * * Returns: The next rgrp in the recent list @@ -978,7 +986,7 @@ } /** - * recent_rgrp_remove - remove an RG from recent list + * recent_rgrp_remove - remove an RG from "recent" list * @rgd: The rgrp to remove * */ @@ -992,9 +1000,14 @@ } /** - * recent_rgrp_add - add an RG to recent list + * recent_rgrp_add - add an RG to tail of "recent" list * @new_rgd: The rgrp to add * + * Before adding, make sure that: + * 1) it's not already on the list + * 2) there's still room for more entries + * The capacity limit imposed on the "recent" list is basically a node's "share" + * of rgrps within a cluster, i.e. (total # rgrps) / (# nodes (journals)) */ static void