On Tue, Jun 11, 2019 at 11:47:44PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > Create a new iterator function to simplify walking inodes in an XFS > filesystem. This new iterator will replace the existing open-coded > walking that goes on in various places. > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 > fs/xfs/libxfs/xfs_ialloc_btree.c | 36 +++ > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 > fs/xfs/xfs_itable.c | 5 > fs/xfs/xfs_itable.h | 8 + > fs/xfs/xfs_iwalk.c | 418 ++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_iwalk.h | 19 ++ > fs/xfs/xfs_trace.h | 40 ++++ > 8 files changed, 524 insertions(+), 6 deletions(-) > create mode 100644 fs/xfs/xfs_iwalk.c > create mode 100644 fs/xfs/xfs_iwalk.h > > ... > diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c > new file mode 100644 > index 000000000000..49289588413f > --- /dev/null > +++ b/fs/xfs/xfs_iwalk.c > @@ -0,0 +1,418 @@ ... > +/* Allocate memory for a walk. */ > +STATIC int > +xfs_iwalk_alloc( > + struct xfs_iwalk_ag *iwag) > +{ > + size_t size; > + > + ASSERT(iwag->recs == NULL); > + iwag->nr_recs = 0; > + > + /* Allocate a prefetch buffer for inobt records. */ > + size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); > + iwag->recs = kmem_alloc(size, KM_MAYFAIL); > + if (iwag->recs == NULL) > + return -ENOMEM; > + > + return 0; > +} > + > +/* Free memory we allocated for a walk. */ > +STATIC void > +xfs_iwalk_free( > + struct xfs_iwalk_ag *iwag) > +{ > + kmem_free(iwag->recs); It might be a good idea to ->recs = NULL here since the alloc call asserts for that (if any future code happens to free and realloc the recs buffer for whatever reason). > +} > + ... > +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. 
*/ > +STATIC int > +xfs_iwalk_ag( > + struct xfs_iwalk_ag *iwag) > +{ > + struct xfs_mount *mp = iwag->mp; > + struct xfs_trans *tp = iwag->tp; > + struct xfs_buf *agi_bp = NULL; > + struct xfs_btree_cur *cur = NULL; > + xfs_agnumber_t agno; > + xfs_agino_t agino; > + int has_more; > + int error = 0; > + > + /* Set up our cursor at the right place in the inode btree. */ > + agno = XFS_INO_TO_AGNO(mp, iwag->startino); > + agino = XFS_INO_TO_AGINO(mp, iwag->startino); > + error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); > + > + while (!error && has_more) { > + struct xfs_inobt_rec_incore *irec; > + > + cond_resched(); > + > + /* Fetch the inobt record. */ > + irec = &iwag->recs[iwag->nr_recs]; > + error = xfs_inobt_get_rec(cur, irec, &has_more); > + if (error || !has_more) > + break; > + > + /* No allocated inodes in this chunk; skip it. */ > + if (irec->ir_freecount == irec->ir_count) { > + error = xfs_btree_increment(cur, 0, &has_more); > + if (error) > + break; > + continue; > + } > + > + /* > + * Start readahead for this inode chunk in anticipation of > + * walking the inodes. > + */ > + xfs_bulkstat_ichunk_ra(mp, agno, irec); > + > + /* > + * If there's space in the buffer for more records, increment > + * the btree cursor and grab more. > + */ > + if (++iwag->nr_recs < iwag->sz_recs) { > + error = xfs_btree_increment(cur, 0, &has_more); > + if (error || !has_more) > + break; > + continue; > + } > + > + /* > + * Otherwise, we need to save cursor state and run the callback > + * function on the cached records. The run_callbacks function > + * is supposed to return a cursor pointing to the record where > + * we would be if we had been able to increment like above. > + */ > + has_more = true; has_more should always be true if we get here right? If so, perhaps better to replace this with ASSERT(has_more). 
> + error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, > + &has_more); > + } > + > + if (iwag->nr_recs == 0 || error) > + goto out; > + > + /* Walk the unprocessed records in the cache. */ > + error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); > + > +out: > + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); > + return error; > +} > + > +/* > + * Given the number of inodes to prefetch, set the number of inobt records that > + * we cache in memory, which controls the number of inodes we try to read > + * ahead. > + */ > +static inline void > +xfs_iwalk_set_prefetch( > + struct xfs_iwalk_ag *iwag, > + unsigned int max_prefetch) > +{ > + /* > + * Default to 4096 bytes' worth of inobt records; this should be plenty > + * of inodes to read ahead. This number was chosen so that the cache > + * is never more than a single memory page and the amount of inode > + * readahead is limited to 16k inodes regardless of CPU: > + * > + * 4096 bytes / 16 bytes per inobt record = 256 inobt records > + * 256 inobt records * 64 inodes per record = 16384 inodes > + * 16384 inodes * 512 bytes per inode(?) = 8MB of inode readahead > + */ > + iwag->sz_recs = 4096 / sizeof(struct xfs_inobt_rec_incore); > + So we decided not to preserve current readahead behavior in this patch? > + /* > + * If the caller gives us a desired prefetch amount, round it up to > + * an even inode chunk and cap it as defined previously. > + */ > + if (max_prefetch) { > + unsigned int nr; > + > + nr = round_up(max_prefetch, XFS_INODES_PER_CHUNK) / > + XFS_INODES_PER_CHUNK; > + iwag->sz_recs = min_t(unsigned int, iwag->sz_recs, nr); This is comparing the record count calculated above with max_prefetch, which the rounding just above suggests is in inodes. BTW, could we add a one line /* prefetch in inodes */ comment on the max_prefetch parameter line at the top of the function? Aside from those nits the rest looks good to me. 
Brian > + } > + > + /* > + * Allocate enough space to prefetch at least two records so that we > + * can cache both the inobt record where the iwalk started and the next > + * record. This simplifies the AG inode walk loop setup code. > + */ > + iwag->sz_recs = max_t(unsigned int, iwag->sz_recs, 2); > +} > + > +/* > + * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn > + * will be called for each allocated inode, being passed the inode's number and > + * @data. @max_prefetch controls how many inobt records' worth of inodes we > + * try to readahead. > + */ > +int > +xfs_iwalk( > + struct xfs_mount *mp, > + struct xfs_trans *tp, > + xfs_ino_t startino, > + xfs_iwalk_fn iwalk_fn, > + unsigned int max_prefetch, > + void *data) > +{ > + struct xfs_iwalk_ag iwag = { > + .mp = mp, > + .tp = tp, > + .iwalk_fn = iwalk_fn, > + .data = data, > + .startino = startino, > + }; > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); > + int error; > + > + ASSERT(agno < mp->m_sb.sb_agcount); > + > + xfs_iwalk_set_prefetch(&iwag, max_prefetch); > + error = xfs_iwalk_alloc(&iwag); > + if (error) > + return error; > + > + for (; agno < mp->m_sb.sb_agcount; agno++) { > + error = xfs_iwalk_ag(&iwag); > + if (error) > + break; > + iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); > + } > + > + xfs_iwalk_free(&iwag); > + return error; > +} > diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h > new file mode 100644 > index 000000000000..9e762e31dadc > --- /dev/null > +++ b/fs/xfs/xfs_iwalk.h > @@ -0,0 +1,19 @@ > +// SPDX-License-Identifier: GPL-2.0+ > +/* > + * Copyright (C) 2019 Oracle. All Rights Reserved. > + * Author: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > + */ > +#ifndef __XFS_IWALK_H__ > +#define __XFS_IWALK_H__ > + > +/* Walk all inodes in the filesystem starting from @startino. */ > +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, > + xfs_ino_t ino, void *data); > +/* Return values for xfs_iwalk_fn. 
*/ > +#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) > +#define XFS_IWALK_ABORT (XFS_ITER_ABORT) > + > +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, > + xfs_iwalk_fn iwalk_fn, unsigned int max_prefetch, void *data); > + > +#endif /* __XFS_IWALK_H__ */ > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 2464ea351f83..f9bb1d50bc0e 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -3516,6 +3516,46 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ > DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); > DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); > > +TRACE_EVENT(xfs_iwalk_ag, > + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, > + xfs_agino_t startino), > + TP_ARGS(mp, agno, startino), > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(xfs_agnumber_t, agno) > + __field(xfs_agino_t, startino) > + ), > + TP_fast_assign( > + __entry->dev = mp->m_super->s_dev; > + __entry->agno = agno; > + __entry->startino = startino; > + ), > + TP_printk("dev %d:%d agno %d startino %u", > + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, > + __entry->startino) > +) > + > +TRACE_EVENT(xfs_iwalk_ag_rec, > + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, > + struct xfs_inobt_rec_incore *irec), > + TP_ARGS(mp, agno, irec), > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(xfs_agnumber_t, agno) > + __field(xfs_agino_t, startino) > + __field(uint64_t, freemask) > + ), > + TP_fast_assign( > + __entry->dev = mp->m_super->s_dev; > + __entry->agno = agno; > + __entry->startino = irec->ir_startino; > + __entry->freemask = irec->ir_free; > + ), > + TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx", > + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, > + __entry->startino, __entry->freemask) > +) > + > #endif /* _TRACE_XFS_H */ > > #undef TRACE_INCLUDE_PATH >