* Ingo Molnar <mingo@xxxxxxx> wrote: > > > > While trying to remove 2 small files, 2 empty dirs and 1 empty dir > > > > on xfs partition > > > > > > Probably spurious. xfs_ilock can be called on both the parent and > > > child, which wouldn't be a deadlock. > > > > Hmm... they'd be different inodes though, so different lock addresses > > in memory - is lockdep taking that into account? Would we need to go > > annotate xfs_ilock somehow to give better hints to the lockdep code? > > correct, lockdep has to be taught about relations between locks within > the same lock-class. (it detects relations between different > lock-classes automatically) It's usually a straightforward process. > > In this particular case we probably need to do something similar to > the VFS's 'enum inode_i_mutex_lock_class' subclass categorizations: we > need xfs_ilock_nested(.., subclass), where in xfs_lock_dir_and_entry() > we'd pass in ILOCK_PARENT. [normal locking calls have a default > subclass ID of 0] the patch below should solve the case mentioned above, but i'm sure there will be other cases as well. XFS locking seems quite complex all around. All this dynamic flag-passing into an opaque function (such as xfs_ilock), just to have them untangled in xfs_ilock(): if (lock_flags & XFS_IOLOCK_EXCL) { mrupdate(&ip->i_iolock); } else if (lock_flags & XFS_IOLOCK_SHARED) { mraccess(&ip->i_iolock); } if (lock_flags & XFS_ILOCK_EXCL) { mrupdate(&ip->i_lock); } else if (lock_flags & XFS_ILOCK_SHARED) { mraccess(&ip->i_lock); } is pretty inefficient too - there are 85 calls to xfs_ilock(), and 74 of them have static flags. Also, mrunlock() does: static inline void mrunlock(mrlock_t *mrp) { if (mrp->mr_writer) { mrp->mr_writer = 0; up_write(&mrp->mr_lock); } else { up_read(&mrp->mr_lock); } } while xfs_iunlock() knows whether the release is for read or for write! So ->mr_writer seems like a totally unnecessary layer. In fact basically all codepaths should be well aware of whether they locked the inode for read or for write. i'd really suggest to clean this up and to convert: xfs_ilock(ip, XFS_ILOCK_EXCL); to: down_write(&ip->i_lock); and: xfs_iunlock(ip, XFS_ILOCK_EXCL); to: up_write(&ip->i_lock); and eliminate all those layers of indirection. Ingo --- fs/xfs/linux-2.6/mrlock.h | 33 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_iget.c | 12 ++++++++---- fs/xfs/xfs_inode.h | 3 ++- fs/xfs/xfs_vnodeops.c | 8 ++++---- 4 files changed, 47 insertions(+), 9 deletions(-) Index: linux/fs/xfs/linux-2.6/mrlock.h =================================================================== --- linux.orig/fs/xfs/linux-2.6/mrlock.h +++ linux/fs/xfs/linux-2.6/mrlock.h @@ -27,12 +27,33 @@ typedef struct { int mr_writer; } mrlock_t; +/* + * ilock nesting subclasses for the lock validator: + * + * 0: the object of the current VFS operation + * 1: parent + * 2: child/target + * 3: quota file + * + * The locking order between these classes is + * parent -> child -> normal -> quota + */ +enum xfs_ilock_class +{ + ILOCK_NORMAL, + ILOCK_PARENT, + ILOCK_CHILD, + ILOCK_QUOTA +}; + #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) #define mraccess(mrp) mraccessf(mrp, 0) +#define mraccess_nested(mrp, s) mraccessf_nested(mrp, 0, s) #define mrupdate(mrp) mrupdatef(mrp, 0) +#define mrupdate_nested(mrp, s) mrupdatef_nested(mrp, 0, s) static inline void mraccessf(mrlock_t *mrp, int flags) { @@ -45,6 +66,18 @@ static inline void mrupdatef(mrlock_t *m mrp->mr_writer = 1; } +static inline void mraccessf_nested(mrlock_t *mrp, int flags, int subclass) +{ + down_read_nested(&mrp->mr_lock, subclass); +} + +static inline void mrupdatef_nested(mrlock_t *mrp, int flags, int subclass) +{ + down_write_nested(&mrp->mr_lock, subclass); + mrp->mr_writer = 1; +} + + static inline int mrtryaccess(mrlock_t *mrp) { return down_read_trylock(&mrp->mr_lock); Index: linux/fs/xfs/xfs_iget.c =================================================================== --- linux.orig/fs/xfs/xfs_iget.c +++ linux/fs/xfs/xfs_iget.c @@ -859,10 +859,14 @@ xfs_ilock(xfs_inode_t *ip, } else if (lock_flags & XFS_IOLOCK_SHARED) { mraccess(&ip->i_iolock); } - if (lock_flags & XFS_ILOCK_EXCL) { - mrupdate(&ip->i_lock); - } else if (lock_flags & XFS_ILOCK_SHARED) { - mraccess(&ip->i_lock); + if (lock_flags & XFS_ILOCK_EXCL_PARENT) + mrupdate_nested(&ip->i_lock, ILOCK_PARENT); + else { + if (lock_flags & XFS_ILOCK_EXCL) { + mrupdate(&ip->i_lock); + } else if (lock_flags & XFS_ILOCK_SHARED) { + mraccess(&ip->i_lock); + } } xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); } Index: linux/fs/xfs/xfs_inode.h =================================================================== --- linux.orig/fs/xfs/xfs_inode.h +++ linux/fs/xfs/xfs_inode.h @@ -352,11 +352,12 @@ typedef struct xfs_inode { #define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) #define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) /* XFS_SIZE_TOKEN_WANT 0x200 */ +#define XFS_ILOCK_EXCL_PARENT 0x400 #define XFS_LOCK_MASK \ (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \ XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \ - XFS_WILLLEND) + XFS_WILLLEND | XFS_ILOCK_EXCL_PARENT) /* * Flags for xfs_iflush() Index: linux/fs/xfs/xfs_vnodeops.c =================================================================== --- linux.orig/fs/xfs/xfs_vnodeops.c +++ linux/fs/xfs/xfs_vnodeops.c @@ -1942,7 +1942,7 @@ xfs_create( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL_PARENT); XFS_BMAP_INIT(&free_list, &first_block); @@ -2137,7 +2137,7 @@ xfs_lock_dir_and_entry( attempts = 0; again: - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL_PARENT); e_inum = ip->i_ino; @@ -2836,7 +2836,7 @@ xfs_mkdir( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL_PARENT); /* * Check for directory link count overflow. @@ -3385,7 +3385,7 @@ xfs_symlink( goto error_return; } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL_PARENT); /* * Check whether the directory allows new symlinks or not. - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html