On Thu, Feb 05, 2015 at 07:56:49AM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Al Viro noticed a generic set of issues to do with filehandle lookup > racing with dentry cache setup. They involve a filehandle lookup > occurring while an inode is being created and the filehandle lookup > racing with the dentry creation for the real file. This can lead to > multiple dentries for the one path being instantiated. There are a > host of other issues around this same set of paths. > > The underlying cause is that file handle lookup only waits on inode > cache instantiation rather than full dentry cache instantiation. XFS > is mostly immune to the problems discovered due to it's own internal > inode cache, but there are a couple of corner cases where races can > happen. > > We currently clear the XFS_INEW flag when the inode is fully set up > after insertion into the cache. Newly allocated inodes are inserted > locked and so aren't usable until the allocation transaction > commits. This, however, occurs before the dentry and security > information is fully initialised and hence the inode is unlocked and > available for lookups to find too early. > > To solve the problem, only clear the XFS_INEW flag for newly created > inodes once the dentry is fully instantiated. This means lookups > will retry until the XFS_INEW flag is removed from the inode and > hence avoids the race conditions in questions. > > THis also means that xfs_create(), xfs_create_tmpfile() and > xfs_symlink() need to finish the setup of the inode in their error > paths if we had allocated the inode but failed later in the creation > process. xfs_symlink(), in particular, needed a lot of help to make > it's error handling match that of xfs_create(). > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > fs/xfs/xfs_icache.c | 4 ++-- > fs/xfs/xfs_inode.c | 22 ++++++++++++-------- > fs/xfs/xfs_inode.h | 22 ++++++++++++++++++++ > fs/xfs/xfs_iops.c | 24 +++++++++------------- > fs/xfs/xfs_iops.h | 2 -- > fs/xfs/xfs_qm.c | 13 ++++++++---- > fs/xfs/xfs_symlink.c | 58 ++++++++++++++++++++++++++++++---------------------- > 7 files changed, 90 insertions(+), 55 deletions(-) > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c > index 9771b7e..76a9f27 100644 > --- a/fs/xfs/xfs_icache.c > +++ b/fs/xfs/xfs_icache.c > @@ -439,11 +439,11 @@ again: > *ipp = ip; > > /* > - * If we have a real type for an on-disk inode, we can set ops(&unlock) > + * If we have a real type for an on-disk inode, we can setup the inode > * now. If it's a new inode being created, xfs_ialloc will handle it. > */ > if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) > - xfs_setup_inode(ip); > + xfs_setup_existing_inode(ip); > return 0; > > out_error_or_again: > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index daafa1f..d0414f3 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -818,7 +818,7 @@ xfs_ialloc( > xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); > xfs_trans_log_inode(tp, ip, flags); > > - /* now that we have an i_mode we can setup inode ops and unlock */ > + /* now that we have an i_mode we can setup the inode structure */ > xfs_setup_inode(ip); > > *ipp = ip; > @@ -1235,12 +1235,14 @@ xfs_create( > xfs_trans_cancel(tp, cancel_flags); > out_release_inode: > /* > - * Wait until after the current transaction is aborted to > - * release the inode. This prevents recursive transactions > - * and deadlocks from xfs_inactive. > + * Wait until after the current transaction is aborted to finish the > + * setup of the inode and release the inode. This prevents recursive > + * transactions and deadlocks from xfs_inactive. > */ > - if (ip) > + if (ip) { > + xfs_finish_inode_setup(ip); > IRELE(ip); > + } > > xfs_qm_dqrele(udqp); > xfs_qm_dqrele(gdqp); > @@ -1345,12 +1347,14 @@ xfs_create_tmpfile( > xfs_trans_cancel(tp, cancel_flags); > out_release_inode: > /* > - * Wait until after the current transaction is aborted to > - * release the inode. This prevents recursive transactions > - * and deadlocks from xfs_inactive. > + * Wait until after the current transaction is aborted to finish the > + * setup of the inode and release the inode. This prevents recursive > + * transactions and deadlocks from xfs_inactive. > */ > - if (ip) > + if (ip) { > + xfs_finish_inode_setup(ip); > IRELE(ip); > + } > > xfs_qm_dqrele(udqp); > xfs_qm_dqrele(gdqp); > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 86cd6b3..8e82b41 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -390,6 +390,28 @@ int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); > int xfs_iozero(struct xfs_inode *, loff_t, size_t); > > > +/* from xfs_iops.c */ > +/* > + * When setting up a newly allocated inode, we need to call > + * xfs_finish_inode_setup() once the inode is fully instantiated at > + * the VFS level to prevent the rest of the world seeing the inode > + * before we've completed instantiation. Otherwise we can do it > + * the moment the inode lookup is complete. > + */ > +extern void xfs_setup_inode(struct xfs_inode *ip); > +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) > +{ > + xfs_iflags_clear(ip, XFS_INEW); > + barrier(); > + unlock_new_inode(VFS_I(ip)); > +} > + > +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) > +{ > + xfs_setup_inode(ip); > + xfs_finish_inode_setup(ip); > +} > + > #define IHOLD(ip) \ > do { \ > ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > index ce80eeb..8be5bb5 100644 > --- a/fs/xfs/xfs_iops.c > +++ b/fs/xfs/xfs_iops.c > @@ -186,6 +186,8 @@ xfs_generic_create( > else > d_instantiate(dentry, inode); > > + xfs_finish_inode_setup(ip); > + > out_free_acl: > if (default_acl) > posix_acl_release(default_acl); > @@ -194,6 +196,7 @@ xfs_generic_create( > return error; > > out_cleanup_inode: > + xfs_finish_inode_setup(ip); > if (!tmpfile) > xfs_cleanup_inode(dir, inode, dentry); > iput(inode); > @@ -366,9 +369,11 @@ xfs_vn_symlink( > goto out_cleanup_inode; > > d_instantiate(dentry, inode); > + xfs_finish_inode_setup(cip); > return 0; > > out_cleanup_inode: > + xfs_finish_inode_setup(cip); > xfs_cleanup_inode(dir, inode, dentry); > iput(inode); > out: > @@ -1231,16 +1236,12 @@ xfs_diflags_to_iflags( > } > > /* > - * Initialize the Linux inode, set up the operation vectors and > - * unlock the inode. > - * > - * When reading existing inodes from disk this is called directly > - * from xfs_iget, when creating a new inode it is called from > - * xfs_ialloc after setting up the inode. > + * Initialize the Linux inode and set up the operation vectors. > * > - * We are always called with an uninitialised linux inode here. > - * We need to initialise the necessary fields and take a reference > - * on it. > + * When reading existing inodes from disk this is called directly from xfs_iget, > + * when creating a new inode it is called from xfs_ialloc after setting up the > + * inode. These callers have different criteria for clearing XFS_INEW, so leave > + * it up to the caller to deal with unlocking the inode appropriately. > */ > void > xfs_setup_inode( > @@ -1327,9 +1328,4 @@ xfs_setup_inode( > inode_has_no_xattr(inode); > cache_no_acl(inode); > } > - > - xfs_iflags_clear(ip, XFS_INEW); > - barrier(); > - > - unlock_new_inode(inode); > } > diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h > index 1c34e43..d4bcc29 100644 > --- a/fs/xfs/xfs_iops.h > +++ b/fs/xfs/xfs_iops.h > @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; > > extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); > > -extern void xfs_setup_inode(struct xfs_inode *); > - > /* > * Internal setattr interfaces. > */ > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c > index 3e81862..1f9e23c 100644 > --- a/fs/xfs/xfs_qm.c > +++ b/fs/xfs/xfs_qm.c > @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( > xfs_trans_t *tp; > int error; > int committed; > + bool need_alloc = true; > > *ip = NULL; > /* > @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( > return error; > mp->m_sb.sb_gquotino = NULLFSINO; > mp->m_sb.sb_pquotino = NULLFSINO; > + need_alloc = false; > } > } > > @@ -758,7 +760,7 @@ xfs_qm_qino_alloc( > return error; > } > > - if (!*ip) { > + if (need_alloc) { > error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, > &committed); > if (error) { > @@ -794,11 +796,14 @@ xfs_qm_qino_alloc( > spin_unlock(&mp->m_sb_lock); > xfs_log_sb(tp); > > - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { > + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); > + if (error) { > + ASSERT(XFS_FORCED_SHUTDOWN(mp)); > xfs_alert(mp, "%s failed (error %d)!", __func__, error); > - return error; > } > - return 0; > + if (need_alloc) > + xfs_finish_inode_setup(*ip); > + return error; > } > > > diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c > index 25791df..3df411e 100644 > --- a/fs/xfs/xfs_symlink.c > +++ b/fs/xfs/xfs_symlink.c > @@ -177,7 +177,7 @@ xfs_symlink( > int pathlen; > struct xfs_bmap_free free_list; > xfs_fsblock_t first_block; > - bool unlock_dp_on_error = false; > + bool unlock_dp_on_error = false; > uint cancel_flags; > int committed; > xfs_fileoff_t first_fsb; > @@ -221,7 +221,7 @@ xfs_symlink( > XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, > &udqp, &gdqp, &pdqp); > if (error) > - goto std_return; > + return error; > > tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); > cancel_flags = XFS_TRANS_RELEASE_LOG_RES; > @@ -241,7 +241,7 @@ xfs_symlink( > } > if (error) { > cancel_flags = 0; > - goto error_return; > + goto out_trans_cancel; > } > > xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); > @@ -252,7 +252,7 @@ xfs_symlink( > */ > if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { > error = -EPERM; > - goto error_return; > + goto out_trans_cancel; > } > > /* > @@ -261,7 +261,7 @@ xfs_symlink( > error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, > pdqp, resblks, 1, 0); > if (error) > - goto error_return; > + goto out_trans_cancel; > > /* > * Check for ability to enter directory entry, if no space reserved. > @@ -269,7 +269,7 @@ xfs_symlink( > if (!resblks) { > error = xfs_dir_canenter(tp, dp, link_name); > if (error) > - goto error_return; > + goto out_trans_cancel; > } > /* > * Initialize the bmap freelist prior to calling either > @@ -282,15 +282,14 @@ xfs_symlink( > */ > error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, > prid, resblks > 0, &ip, NULL); > - if (error) { > - if (error == -ENOSPC) > - goto error_return; > - goto error1; > - } > + if (error) > + goto out_trans_cancel; > > /* > - * An error after we've joined dp to the transaction will result in the > - * transaction cancel unlocking dp so don't do it explicitly in the > + * Now we join the directory inode to the transaction. We do not do it > + * earlier because xfs_dir_ialloc might commit the previous transaction > + * (and release all the locks). An error from here on will result in > + * the transaction cancel unlocking dp so don't do it explicitly in the > * error path. > */ > xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); > @@ -330,7 +329,7 @@ xfs_symlink( > XFS_BMAPI_METADATA, &first_block, resblks, > mval, &nmaps, &free_list); > if (error) > - goto error2; > + goto out_bmap_cancel; > > if (resblks) > resblks -= fs_blocks; > @@ -348,7 +347,7 @@ xfs_symlink( > BTOBB(byte_cnt), 0); > if (!bp) { > error = -ENOMEM; > - goto error2; > + goto out_bmap_cancel; > } > bp->b_ops = &xfs_symlink_buf_ops; > > @@ -378,7 +377,7 @@ xfs_symlink( > error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, > &first_block, &free_list, resblks); > if (error) > - goto error2; > + goto out_bmap_cancel; > xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); > xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); > > @@ -392,10 +391,13 @@ xfs_symlink( > } > > error = xfs_bmap_finish(&tp, &free_list, &committed); > - if (error) { > - goto error2; > - } > + if (error) > + goto out_bmap_cancel; > + > error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); > + if (error) > + goto out_release_inode; > + > xfs_qm_dqrele(udqp); > xfs_qm_dqrele(gdqp); > xfs_qm_dqrele(pdqp); > @@ -403,20 +405,28 @@ xfs_symlink( > *ipp = ip; > return 0; > > - error2: > - IRELE(ip); > - error1: > +out_bmap_cancel: > xfs_bmap_cancel(&free_list); > cancel_flags |= XFS_TRANS_ABORT; > - error_return: > +out_trans_cancel: > xfs_trans_cancel(tp, cancel_flags); > +out_release_inode: > + /* > + * Wait until after the current transaction is aborted to finish the > + * setup of the inode and release the inode. This prevents recursive > + * transactions and deadlocks from xfs_inactive. > + */ > + if (ip) { > + xfs_finish_inode_setup(ip); > + IRELE(ip); > + } > + > xfs_qm_dqrele(udqp); > xfs_qm_dqrele(gdqp); > xfs_qm_dqrele(pdqp); > > if (unlock_dp_on_error) > xfs_iunlock(dp, XFS_ILOCK_EXCL); > - std_return: > return error; > } > > -- > 2.0.0 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs