On Thu, May 06, 2021 at 05:20:53PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Because it's a mess. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_ialloc.c | 270 +++++++++++++++++++++---------------- > 1 file changed, 153 insertions(+), 117 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c > index d749bb7c7a69..340bb95d7bc1 100644 > --- a/fs/xfs/libxfs/xfs_ialloc.c > +++ b/fs/xfs/libxfs/xfs_ialloc.c > @@ -604,9 +604,10 @@ xfs_inobt_insert_sprec( > } > > /* > - * Allocate new inodes in the allocation group specified by agbp. > - * Returns 0 if inodes were allocated in this AG; 1 if there was no space > - * in this AG; or the usual negative error code. > + * Allocate new inodes in the allocation group specified by agbp. Returns 0 if > + * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so > + * the caller knows it can try another AG, a hard -ENOSPC when over the maximum > + * inode count threshold, or the usual negative error code for other errors. > */ > STATIC int > xfs_ialloc_ag_alloc( > @@ -792,7 +793,7 @@ xfs_ialloc_ag_alloc( > } > > if (args.fsbno == NULLFSBLOCK) > - return 1; > + return -EAGAIN; > > ASSERT(args.len == args.minlen); > > @@ -1568,14 +1569,17 @@ xfs_dialloc_roll( > /* Re-attach the quota info that we detached from prev trx. */ > tp->t_dqinfo = dqinfo; > > - *tpp = tp; > - if (error) > - return error; > + /* > + * Join the buffer even on commit error so that the buffer is released > + * when the caller cancels the transaction and doesn't have to handle > + * this error case specially. > + */ > xfs_trans_bjoin(tp, agibp); > - return 0; > + *tpp = tp; > + return error; > } > > -STATIC xfs_agnumber_t > +static xfs_agnumber_t > xfs_ialloc_next_ag( > xfs_mount_t *mp) > { > @@ -1590,16 +1594,136 @@ xfs_ialloc_next_ag( > return agno; > } > > +static bool > +xfs_dialloc_good_ag( > + struct xfs_trans *tp, > + struct xfs_perag *pag, > + umode_t mode, > + int flags, > + bool ok_alloc) > +{ > + struct xfs_mount *mp = tp->t_mountp; > + xfs_extlen_t ineed; > + xfs_extlen_t longest = 0; > + int needspace; > + int error; > + > + if (!pag->pagi_inodeok) > + return false; > + > + if (!pag->pagi_init) { > + error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); > + if (error) > + return false; > + } > + > + if (pag->pagi_freecount) > + return true; > + if (!ok_alloc) > + return false; > + > + if (!pag->pagf_init) { > + error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); > + if (error) > + return false; > + } > + > + /* > + * Check that there is enough free space for the file plus a chunk of > + * inodes if we need to allocate some. If this is the first pass across > + * the AGs, take into account the potential space needed for alignment > + * of inode chunks when checking the longest contiguous free space in > + * the AG - this prevents us from getting ENOSPC because we have free > + * space larger than ialloc_blks but alignment constraints prevent us > + * from using it. > + * > + * If we can't find an AG with space for full alignment slack to be > + * taken into account, we must be near ENOSPC in all AGs. Hence we > + * don't include alignment for the second pass and so if we fail > + * allocation due to alignment issues then it is most likely a real > + * ENOSPC condition. > + * > + * XXX(dgc): this calculation is now bogus thanks to the per-ag > + * reservations that xfs_alloc_fix_freelist() now does via > + * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will > + * be more than large enough for the check below to succeed, but > + * xfs_alloc_space_available() will fail because of the non-zero > + * metadata reservation and hence we won't actually be able to allocate > + * more inodes in this AG. We do soooo much unnecessary work near ENOSPC > + * because of this. Yuck. Can this be fixed by doing: really_free = pag->pagf_freeblks - xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE); return really_free >= needspace + ineed && longest >= ineed) to account for those reservations, perhaps? > + */ > + ineed = M_IGEO(mp)->ialloc_min_blks; > + if (flags && ineed > 1) > + ineed += M_IGEO(mp)->cluster_align; > + longest = pag->pagf_longest; > + if (!longest) > + longest = pag->pagf_flcount > 0; > + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); > + > + if (pag->pagf_freeblks < needspace + ineed || longest < ineed) > + return false; > + return true; > +} > + > +static int > +xfs_dialloc_try_ag( > + struct xfs_trans **tpp, > + struct xfs_perag *pag, > + xfs_ino_t parent, > + xfs_ino_t *new_ino, > + bool ok_alloc) > +{ > + struct xfs_buf *agbp; > + xfs_ino_t ino; > + int error; > + > + /* > + * Then read in the AGI buffer and recheck with the AGI buffer > + * lock held. > + */ > + error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); > + if (error) > + return error; > + > + if (!pag->pagi_freecount) { > + if (!ok_alloc) { > + error = -EAGAIN; > + goto out_release; > + } > + > + error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); > + if (error < 0) > + goto out_release; > + > + /* > + * We successfully allocated space for an inode cluster in this > + * AG. Roll the transaction so that we can allocate one of the > + * new inodes. > + */ > + ASSERT(pag->pagi_freecount > 0); > + error = xfs_dialloc_roll(tpp, agbp); > + if (error) > + goto out_release; > + } > + > + /* Allocate an inode in the found AG */ > + error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); > + if (!error) > + *new_ino = ino; > + return error; > + > +out_release: > + xfs_trans_brelse(*tpp, agbp); > + return error; > +} > + > /* > - * Select and prepare an AG for inode allocation. > + * Allocate an on-disk inode. > * > * Mode is used to tell whether the new inode is a directory and hence where to > - * locate it. > - * > - * This function will ensure that the selected AG has free inodes available to > - * allocate from. The selected AGI will be returned locked to the caller, and it > - * will allocate more free inodes if required. If no free inodes are found or > - * can be allocated, -ENOSPC be returned. > + * locate it. The on-disk inode that is allocated will be returned in @new_ino > + * on success, otherwise an error will be set to indicate the failure (e.g. > + * -ENOSPC). > */ > int > xfs_dialloc( > @@ -1609,14 +1733,12 @@ xfs_dialloc( > xfs_ino_t *new_ino) > { > struct xfs_mount *mp = (*tpp)->t_mountp; > - struct xfs_buf *agbp; > xfs_agnumber_t agno; > int error = 0; > xfs_agnumber_t start_agno; > struct xfs_perag *pag; > struct xfs_ino_geometry *igeo = M_IGEO(mp); > - bool okalloc = true; > - int needspace; > + bool ok_alloc = true; > int flags; > xfs_ino_t ino; > > @@ -1624,7 +1746,6 @@ xfs_dialloc( > * Files of these types need at least one block if length > 0 > * (and they won't fit in the inode, but that's hard to figure out). > */ > - needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); > if (S_ISDIR(mode)) > start_agno = xfs_ialloc_next_ag(mp); > else { > @@ -1635,7 +1756,7 @@ xfs_dialloc( > > /* > * If we have already hit the ceiling of inode blocks then clear > - * okalloc so we scan all available agi structures for a free > + * ok_alloc so we scan all available agi structures for a free > * inode. > * > * Read rough value of mp->m_icount by percpu_counter_read_positive, > @@ -1644,7 +1765,7 @@ xfs_dialloc( Er... didn't this logic get split into xfs_dialloc_select_ag in 5.11? Nice cleanup, at least... --D > if (igeo->maxicount && > percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > > igeo->maxicount) { > - okalloc = false; > + ok_alloc = false; > } > > /* > @@ -1655,95 +1776,14 @@ xfs_dialloc( > agno = start_agno; > flags = XFS_ALLOC_FLAG_TRYLOCK; > for (;;) { > - xfs_extlen_t ineed; > - xfs_extlen_t longest = 0; > - > pag = xfs_perag_get(mp, agno); > - if (!pag->pagi_inodeok) > - goto nextag; > - > - if (!pag->pagi_init) { > - error = xfs_ialloc_pagi_init(mp, *tpp, agno); > - if (error) > - break; > - } > - > - if (!pag->pagi_freecount) > - goto nextag; > - if (!okalloc) > - goto nextag; > - > - if (!pag->pagf_init) { > - error = xfs_alloc_pagf_init(mp, *tpp, agno, flags); > - if (error) > - goto nextag; > - } > - > - /* > - * Check that there is enough free space for the file plus a > - * chunk of inodes if we need to allocate some. If this is the > - * first pass across the AGs, take into account the potential > - * space needed for alignment of inode chunks when checking the > - * longest contiguous free space in the AG - this prevents us > - * from getting ENOSPC because we have free space larger than > - * ialloc_blks but alignment constraints prevent us from using > - * it. > - * > - * If we can't find an AG with space for full alignment slack to > - * be taken into account, we must be near ENOSPC in all AGs. > - * Hence we don't include alignment for the second pass and so > - * if we fail allocation due to alignment issues then it is most > - * likely a real ENOSPC condition. > - */ > - ineed = M_IGEO(mp)->ialloc_min_blks; > - if (flags && ineed > 1) > - ineed += M_IGEO(mp)->cluster_align; > - longest = pag->pagf_longest; > - if (!longest) > - longest = pag->pagf_flcount > 0; > - > - if (pag->pagf_freeblks < needspace + ineed || longest < ineed) > - goto nextag; > - > - /* > - * Then read in the AGI buffer and recheck with the AGI buffer > - * lock held. > - */ > - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agbp); > - if (error) > - break; > - > - if (pag->pagi_freecount) > - goto found_ag; > - > - if (!okalloc) > - goto nextag_relse_buffer; > - > - error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); > - if (error < 0) { > - xfs_trans_brelse(*tpp, agbp); > - break; > - } > - > - if (error == 0) { > - /* > - * We successfully allocated space for an inode cluster > - * in this AG. Roll the transaction so that we can > - * allocate one of the new inodes. > - */ > - ASSERT(pag->pagi_freecount > 0); > - > - error = xfs_dialloc_roll(tpp, agbp); > - if (error) { > - xfs_buf_relse(agbp); > + if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { > + error = xfs_dialloc_try_ag(tpp, pag, parent, > + &ino, ok_alloc); > + if (error != -EAGAIN) > break; > - } > - goto found_ag; > } > > -nextag_relse_buffer: > - xfs_trans_brelse(*tpp, agbp); > -nextag: > if (XFS_FORCED_SHUTDOWN(mp)) { > error = -EFSCORRUPTED; > break; > @@ -1751,23 +1791,19 @@ xfs_dialloc( > if (++agno == mp->m_maxagi) > agno = 0; > if (agno == start_agno) { > - if (!flags) > + if (!flags) { > + error = -ENOSPC; > break; > + } > flags = 0; > } > xfs_perag_put(pag); > } > > + if (!error) > + *new_ino = ino; > xfs_perag_put(pag); > - return error ? error : -ENOSPC; > -found_ag: > - /* Allocate an inode in the found AG */ > - error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); > - xfs_perag_put(pag); > - if (error) > - return error; > - *new_ino = ino; > - return 0; > + return error; > } > > /* > -- > 2.31.1 >