On Wed, Aug 12, 2020 at 07:25:48PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > To move away from using the on disk inode buffers to track and log > unlinked inodes, we need pointers to track them in memory. Because > we have arbitrary remove order from the list, it needs to be a > double linked list. > > We start by noting that inodes are always in memory when they are > active on the unlinked list, and hence we can track these inodes > without needing to take references to the inodes or store them in > the list. We cannot, however, use inode locks to protect the inodes > on the list - the list needs an external lock to serialise all > inserts and removals. We can use the existing AGI buffer lock for > this right now as that already serialises all unlinked list > traversals and modifications. > > Hence we can convert the in-memory unlinked list to a simple > list_head list in the perag. We can use list_empty() to detect an > empty unlinked list, likewise we can detect the end of the list when > the inode next pointer points back to the perag list_head. This > makes insert, remove and traversal. > > The only complication here is log recovery of old filesystems that > have multiple lists. These always remove from the head of the list, > so we can easily construct just enough of the unlinked list for > recovery from any list to work correctly. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> Hm. This is orthogonal to this patch, but should we get meaner about failing the mount if the AGI read fails or the unlinked walk fails? For this patch, Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --D > --- > fs/xfs/xfs_icache.c | 1 + > fs/xfs/xfs_inode.c | 21 ++++- > fs/xfs/xfs_inode.h | 1 + > fs/xfs/xfs_log_recover.c | 179 +++++++++++++++++++++++---------------- > fs/xfs/xfs_mount.c | 1 + > fs/xfs/xfs_mount.h | 1 + > 6 files changed, 130 insertions(+), 74 deletions(-) > > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c > index 5cdded02cdc8..0c04a66bf88d 100644 > --- a/fs/xfs/xfs_icache.c > +++ b/fs/xfs/xfs_icache.c > @@ -66,6 +66,7 @@ xfs_inode_alloc( > memset(&ip->i_d, 0, sizeof(ip->i_d)); > ip->i_sick = 0; > ip->i_checked = 0; > + INIT_LIST_HEAD(&ip->i_unlink); > INIT_WORK(&ip->i_ioend_work, xfs_end_io); > INIT_LIST_HEAD(&ip->i_ioend_list); > spin_lock_init(&ip->i_ioend_lock); > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index fa92bdf6e0da..dcf80ac51267 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -2294,7 +2294,17 @@ xfs_iunlink( > if (error) > return error; > > - return xfs_iunlink_insert_inode(tp, agibp, ip); > + /* > + * Insert the inode into the on disk unlinked list, and if that > + * succeeds, then insert it into the in memory list. We do it in this > + * order so that the modifications required to the on disk list are not > + * impacted by already having this inode in the list. > + */ > + error = xfs_iunlink_insert_inode(tp, agibp, ip); > + if (!error) > + list_add(&ip->i_unlink, &agibp->b_pag->pag_ici_unlink_list); > + > + return error; > } > > /* Return the imap, dinode pointer, and buffer for an inode. */ > @@ -2516,7 +2526,14 @@ xfs_iunlink_remove( > if (error) > return error; > > - return xfs_iunlink_remove_inode(tp, agibp, ip); > + /* > + * Remove the inode from the on-disk list and then remove it from the > + * in-memory list. This order of operations ensures we can look up both > + * next and previous inode in the on-disk list via the in-memory list. > + */ > + error = xfs_iunlink_remove_inode(tp, agibp, ip); > + list_del(&ip->i_unlink); > + return error; > } > > /* > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 5ea962c6cf98..73f36908a1ce 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -56,6 +56,7 @@ typedef struct xfs_inode { > uint64_t i_delayed_blks; /* count of delay alloc blks */ > > struct xfs_icdinode i_d; /* most of ondisk inode */ > + struct list_head i_unlink; > > /* VFS inode */ > struct inode i_vnode; /* embedded VFS inode */ > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index e2ec91b2d0f4..b3481f4e2f96 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -2682,11 +2682,11 @@ xlog_recover_clear_agi_bucket( > return; > } > > -STATIC xfs_agino_t > -xlog_recover_process_one_iunlink( > +static struct xfs_inode * > +xlog_recover_get_one_iunlink( > struct xfs_mount *mp, > xfs_agnumber_t agno, > - xfs_agino_t agino, > + xfs_agino_t *agino, > int bucket) > { > struct xfs_buf *ibp; > @@ -2695,48 +2695,35 @@ xlog_recover_process_one_iunlink( > xfs_ino_t ino; > int error; > > - ino = XFS_AGINO_TO_INO(mp, agno, agino); > + ino = XFS_AGINO_TO_INO(mp, agno, *agino); > error = xfs_iget(mp, NULL, ino, 0, 0, &ip); > if (error) > - goto fail; > + return NULL; > > /* > - * Get the on disk inode to find the next inode in the bucket. > + * Get the on disk inode to find the next inode in the bucket. Should > + * not fail because we just read the inode from this buffer, but if it > + * does then we still have to allow the caller to set up and release > + * the inode we just looked up. Make sure the list walk terminates here, > + * though. > */ > error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); > - if (error) > - goto fail_iput; > + if (error) { > + ASSERT(0); > + *agino = NULLAGINO; > + return ip; > + } > + > > xfs_iflags_clear(ip, XFS_IRECOVERY); > ASSERT(VFS_I(ip)->i_nlink == 0); > ASSERT(VFS_I(ip)->i_mode != 0); > > - /* setup for the next pass */ > - agino = be32_to_cpu(dip->di_next_unlinked); > + /* Get the next inode we will be looking up. */ > + *agino = be32_to_cpu(dip->di_next_unlinked); > xfs_buf_relse(ibp); > > - /* > - * Prevent any DMAPI event from being sent when the reference on > - * the inode is dropped. > - */ > - ip->i_d.di_dmevmask = 0; > - > - xfs_irele(ip); > - return agino; > - > - fail_iput: > - xfs_irele(ip); > - fail: > - /* > - * We can't read in the inode this bucket points to, or this inode > - * is messed up. Just ditch this bucket of inodes. We will lose > - * some inodes and space, but at least we won't hang. > - * > - * Call xlog_recover_clear_agi_bucket() to perform a transaction to > - * clear the inode pointer in the bucket. > - */ > - xlog_recover_clear_agi_bucket(mp, agno, bucket); > - return NULLAGINO; > + return ip; > } > > /* > @@ -2762,58 +2749,106 @@ xlog_recover_process_one_iunlink( > * scheduled on this CPU to ensure other scheduled work can run without undue > * latency. > */ > -STATIC void > -xlog_recover_process_iunlinks( > - struct xlog *log) > +static int > +xlog_recover_iunlinks_ag( > + struct xfs_mount *mp, > + xfs_agnumber_t agno) > { > - xfs_mount_t *mp; > - xfs_agnumber_t agno; > - xfs_agi_t *agi; > - xfs_buf_t *agibp; > - xfs_agino_t agino; > - int bucket; > - int error; > + struct xfs_agi *agi; > + struct xfs_buf *agibp; > + int bucket; > + int error; > > - mp = log->l_mp; > + /* > + * Find the agi for this ag. > + */ > + error = xfs_read_agi(mp, NULL, agno, &agibp); > + if (error) { > > - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { > /* > - * Find the agi for this ag. > + * AGI is b0rked. Don't process it. > + * > + * We should probably mark the filesystem as corrupt after we've > + * recovered all the ag's we can.... > */ > - error = xfs_read_agi(mp, NULL, agno, &agibp); > + return 0; > + } > + > + /* > + * Unlock the buffer so that it can be acquired in the normal course of > + * the transaction to truncate and free each inode. Because we are not > + * racing with anyone else here for the AGI buffer, we don't even need > + * to hold it locked to read the initial unlinked bucket entries out of > + * the buffer. We keep buffer reference though, so that it stays pinned > + * in memory while we need the buffer. > + */ > + agi = agibp->b_addr; > + xfs_buf_unlock(agibp); > + > + /* > + * The unlinked inode list is maintained on incore inodes as a double > + * linked list. We don't have any of that state in memory, so we have to > + * create it as we go. This is simple as we are only removing from the > + * head of the list and that means we only need to pull the current > + * inode in and the next inode. Inodes are unlinked when their > + * reference count goes to zero, so we can overlap the xfs_iget() and > + * xfs_irele() calls so we always have the first two inodes on the list > + * in memory. Hence we can fake up the necessary in memory state for the > + * unlink to "just work". > + */ > + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { > + struct xfs_inode *ip, *prev_ip = NULL; > + xfs_agino_t agino; > + > + agino = be32_to_cpu(agi->agi_unlinked[bucket]); > + while (agino != NULLAGINO) { > + ip = xlog_recover_get_one_iunlink(mp, agno, &agino, > + bucket); > + if (!ip) { > + /* > + * something busted, but still got to release > + * prev_ip, so make it look like it's at the end > + * of the list before it gets released. > + */ > + error = -EFSCORRUPTED; > + break; > + } > + list_add_tail(&ip->i_unlink, > + &agibp->b_pag->pag_ici_unlink_list); > + if (prev_ip) > + xfs_irele(prev_ip); > + prev_ip = ip; > + cond_resched(); > + } > + if (prev_ip) > + xfs_irele(prev_ip); > if (error) { > /* > - * AGI is b0rked. Don't process it. > - * > - * We should probably mark the filesystem as corrupt > - * after we've recovered all the ag's we can.... > + * We can't read an inode this bucket points to, or an > + * inode is messed up. Just ditch this bucket of > + * inodes. We will lose some inodes and space, but at > + * least we won't hang. > */ > - continue; > - } > - /* > - * Unlock the buffer so that it can be acquired in the normal > - * course of the transaction to truncate and free each inode. > - * Because we are not racing with anyone else here for the AGI > - * buffer, we don't even need to hold it locked to read the > - * initial unlinked bucket entries out of the buffer. We keep > - * buffer reference though, so that it stays pinned in memory > - * while we need the buffer. > - */ > - agi = agibp->b_addr; > - xfs_buf_unlock(agibp); > - > - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { > - agino = be32_to_cpu(agi->agi_unlinked[bucket]); > - while (agino != NULLAGINO) { > - agino = xlog_recover_process_one_iunlink(mp, > - agno, agino, bucket); > - cond_resched(); > - } > + xlog_recover_clear_agi_bucket(mp, agno, bucket); > + break; > } > - xfs_buf_rele(agibp); > } > + xfs_buf_rele(agibp); > + return error; > } > > +void > +xlog_recover_process_iunlinks( > + struct xlog *log) > +{ > + struct xfs_mount *mp = log->l_mp; > + xfs_agnumber_t agno; > + > + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) > + xlog_recover_iunlinks_ag(mp, agno); > +} > + > + > STATIC void > xlog_unpack_data( > struct xlog_rec_header *rhead, > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c > index bbfd1d5b1c04..2def15297a5f 100644 > --- a/fs/xfs/xfs_mount.c > +++ b/fs/xfs/xfs_mount.c > @@ -200,6 +200,7 @@ xfs_initialize_perag( > pag->pag_mount = mp; > spin_lock_init(&pag->pag_ici_lock); > INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); > + INIT_LIST_HEAD(&pag->pag_ici_unlink_list); > if (xfs_buf_hash_init(pag)) > goto out_free_pag; > init_waitqueue_head(&pag->pagb_wait); > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index a72cfcaa4ad1..c35a6c463529 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -355,6 +355,7 @@ typedef struct xfs_perag { > struct radix_tree_root pag_ici_root; /* incore inode cache root */ > int pag_ici_reclaimable; /* reclaimable inodes */ > unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ > + struct list_head pag_ici_unlink_list; /* unlinked inode list */ > > /* buffer cache index */ > spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ > -- > 2.26.2.761.g0e0b3e54be >