Christoph Hellwig wrote: > New version below: > The locking in xfs_iget_cache_hit currently has numerous problems: > > - we clear the reclaim tag without i_flags_lock which protects modifications > to it > - we call inode_init_always which can sleep with pag_ici_lock held > (this is oss.sgi.com BZ #819) > - we acquire and drop i_flags_lock a lot and thus provide no consistency > between the various flags we set/clear under it > > This patch fixes all that with a major revamp of the locking in the function. > The new version acquires i_flags_lock early and only drops it once we need to > call into inode_init_always or before calling xfs_ilock. > > This patch fixes a bug seen in the wild where we race modifying the reclaim tag. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> This seems ok to me but I have to be honest, I'm having a hard time getting my head around back into the inode lifecycle. one comment, I wonder if it's worth capturing the actual error from inode_init_always() vs. turning every error into ENOMEM? True, today it's the only error we can get but why re-set it? -Eric > Index: linux-2.6/fs/xfs/xfs_iget.c > =================================================================== > --- linux-2.6.orig/fs/xfs/xfs_iget.c 2009-08-10 13:10:19.141974933 -0300 > +++ linux-2.6/fs/xfs/xfs_iget.c 2009-08-10 13:19:06.913056731 -0300 > @@ -191,80 +191,83 @@ xfs_iget_cache_hit( > int flags, > int lock_flags) __releases(pag->pag_ici_lock) > { > + struct inode *inode = VFS_I(ip); > struct xfs_mount *mp = ip->i_mount; > - int error = EAGAIN; > + int error; > + > + spin_lock(&ip->i_flags_lock); > > /* > - * If INEW is set this inode is being set up > - * If IRECLAIM is set this inode is being torn down > - * Pause and try again. > + * If we are racing with another cache hit that is currently > + * instantiating this inode or currently recycling it out of > + * reclaimabe state, wait for the initialisation to complete > + * before continuing. > + * > + * XXX(hch): eventually we should do something equivalent to > + * wait_on_inode to wait for these flags to be cleared > + * instead of polling for it. > */ > - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { > + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { > XFS_STATS_INC(xs_ig_frecycle); > + error = EAGAIN; > goto out_error; > } > > - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ > - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { > - > - /* > - * If lookup is racing with unlink, then we should return an > - * error immediately so we don't remove it from the reclaim > - * list and potentially leak the inode. > - */ > - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { > - error = ENOENT; > - goto out_error; > - } > + /* > + * If lookup is racing with unlink return an error immediately. > + */ > + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { > + error = ENOENT; > + goto out_error; > + } > > + /* > + * If IRECLAIMABLE is set, we've torn down the VFS inode already. > + * Need to carefully get it back into useable state. > + */ > + if (ip->i_flags & XFS_IRECLAIMABLE) { > xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); > > /* > - * We need to re-initialise the VFS inode as it has been > - * 'freed' by the VFS. Do this here so we can deal with > - * errors cleanly, then tag it so it can be set up correctly > - * later. > + * We need to set XFS_INEW atomically with clearing the > + * reclaimable tag so that we do have an indicator of the > + * inode still being initialized. > */ > - if (inode_init_always(mp->m_super, VFS_I(ip))) { > + ip->i_flags |= XFS_INEW; > + ip->i_flags &= ~XFS_IRECLAIMABLE; > + __xfs_inode_clear_reclaim_tag(mp, pag, ip); > + > + spin_unlock(&ip->i_flags_lock); > + read_unlock(&pag->pag_ici_lock); > + > + if (unlikely(inode_init_always(mp->m_super, inode))) { > + /* > + * Re-initializing the inode failed, and we are in deep > + * trouble. Try to re-add it to the reclaim list. > + */ > + read_lock(&pag->pag_ici_lock); > + spin_lock(&ip->i_flags_lock); > + > + ip->i_flags &= ~XFS_INEW; > + ip->i_flags |= XFS_IRECLAIMABLE; > + __xfs_inode_set_reclaim_tag(pag, ip); > + > error = ENOMEM; > goto out_error; > } > - > - /* > - * We must set the XFS_INEW flag before clearing the > - * XFS_IRECLAIMABLE flag so that if a racing lookup does > - * not find the XFS_IRECLAIMABLE above but has the igrab() > - * below succeed we can safely check XFS_INEW to detect > - * that this inode is still being initialised. > - */ > - xfs_iflags_set(ip, XFS_INEW); > - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); > - > - /* clear the radix tree reclaim flag as well. */ > - __xfs_inode_clear_reclaim_tag(mp, pag, ip); > - } else if (!igrab(VFS_I(ip))) { > + inode->i_state = I_LOCK|I_NEW; > + } else { > /* If the VFS inode is being torn down, pause and try again. */ > - XFS_STATS_INC(xs_ig_frecycle); > - goto out_error; > - } else if (xfs_iflags_test(ip, XFS_INEW)) { > - /* > - * We are racing with another cache hit that is > - * currently recycling this inode out of the XFS_IRECLAIMABLE > - * state. Wait for the initialisation to complete before > - * continuing. > - */ > - wait_on_inode(VFS_I(ip)); > - } > + if (!igrab(inode)) { > + error = EAGAIN; > + goto out_error; > + } > > - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { > - error = ENOENT; > - iput(VFS_I(ip)); > - goto out_error; > + /* We've got a live one. */ > + spin_unlock(&ip->i_flags_lock); > + read_unlock(&pag->pag_ici_lock); > } > > - /* We've got a live one. */ > - read_unlock(&pag->pag_ici_lock); > - > if (lock_flags != 0) > xfs_ilock(ip, lock_flags); > > @@ -274,6 +277,7 @@ xfs_iget_cache_hit( > return 0; > > out_error: > + spin_unlock(&ip->i_flags_lock); > read_unlock(&pag->pag_ici_lock); > return error; > } > Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c > =================================================================== > --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:19.146974522 -0300 > +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c 2009-08-10 13:10:59.958993938 -0300 > @@ -708,6 +708,16 @@ xfs_reclaim_inode( > return 0; > } > > +void > +__xfs_inode_set_reclaim_tag( > + struct xfs_perag *pag, > + struct xfs_inode *ip) > +{ > + radix_tree_tag_set(&pag->pag_ici_root, > + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), > + XFS_ICI_RECLAIM_TAG); > +} > + > /* > * We set the inode flag atomically with the radix tree tag. > * Once we get tag lookups on the radix tree, this inode flag > @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( > > read_lock(&pag->pag_ici_lock); > spin_lock(&ip->i_flags_lock); > - radix_tree_tag_set(&pag->pag_ici_root, > - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); > + __xfs_inode_set_reclaim_tag(pag, ip); > __xfs_iflags_set(ip, XFS_IRECLAIMABLE); > spin_unlock(&ip->i_flags_lock); > read_unlock(&pag->pag_ici_lock); > Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.h > =================================================================== > --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:19.153974227 -0300 > +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.h 2009-08-10 13:10:59.962994168 -0300 > @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode * > int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); > > void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); > +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); > void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); > void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, > struct xfs_inode *ip); -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html