From: Dave Chinner <dchinner@xxxxxxxxxx> Running some recent repair tests on broken filesystem meant running phase 1 and 2 repeatedly to reproduce an issue at the start of phase 3. Phase 2 was taking approximately 10 minutes to run as it processes each AG serially. Phase 2 can be trivially parallelised - it is simply scanning the per AG trees to calculate free block counts and free and used inodes counts. This can be done safely in parallel by giving each AG it's own structure to aggregate counts into, then once the AG scan is complete adding them all together. This patch uses 32-way threading which results in no noticable slowdown on single SATA drives with NCQ, but results in ~10x reduction in runtime on a 12 disk RAID-0 array. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- repair/phase2.c | 20 +--- repair/scan.c | 294 +++++++++++++++++++++++++++++---------------------- repair/scan.h | 39 +------ repair/xfs_repair.c | 10 ++- 4 files changed, 182 insertions(+), 181 deletions(-) diff --git a/repair/phase2.c b/repair/phase2.c index e81ebf0..1e9377e 100644 --- a/repair/phase2.c +++ b/repair/phase2.c @@ -24,10 +24,9 @@ #include "err_protos.h" #include "incore.h" #include "progress.h" +#include "scan.h" void set_mp(xfs_mount_t *mpp); -void scan_ag(xfs_agnumber_t agno); -void validate_sb(struct xfs_sb *sb); /* workaround craziness in the xlog routines */ int xlog_recover_do_trans(xlog_t *log, xlog_recover_t *t, int p) { return 0; } @@ -107,9 +106,10 @@ zero_log(xfs_mount_t *mp) */ void -phase2(xfs_mount_t *mp) +phase2( + struct xfs_mount *mp, + int scan_threads) { - xfs_agnumber_t i; int j; ino_tree_node_t *ino_rec; @@ -138,17 +138,7 @@ phase2(xfs_mount_t *mp) set_progress_msg(PROG_FMT_SCAN_AG, (__uint64_t) glob_agcount); - for (i = 0; i < mp->m_sb.sb_agcount; i++) { - scan_ag(i); -#ifdef XR_INODE_TRACE - print_inode_list(i); -#endif - } - - /* - * Validate that our manual counts match the superblock. - */ - validate_sb(&mp->m_sb); + scan_ags(mp, scan_threads); print_final_rpt(); diff --git a/repair/scan.c b/repair/scan.c index 85017ff..afed693 100644 --- a/repair/scan.c +++ b/repair/scan.c @@ -28,6 +28,7 @@ #include "versions.h" #include "bmap.h" #include "progress.h" +#include "threads.h" extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb, xfs_agf_t *agf, xfs_agi_t *agi, xfs_agnumber_t i); @@ -35,27 +36,31 @@ extern int verify_set_agheader(xfs_mount_t *mp, xfs_buf_t *sbuf, xfs_sb_t *sb, static xfs_mount_t *mp = NULL; /* - * Global variables to validate superblock values against the manual count + * Variables to validate AG header values against the manual count * from the btree traversal. - * - * No locking for now as phase2 is not threaded. */ -static __uint64_t fdblocks; -static __uint64_t icount; -static __uint64_t ifreecount; +struct aghdr_cnts { + xfs_agnumber_t agno; + xfs_extlen_t agffreeblks; + xfs_extlen_t agflongest; + __uint64_t agfbtreeblks; + __uint32_t agicount; + __uint32_t agifreecount; + __uint64_t fdblocks; + __uint64_t icount; + __uint64_t ifreecount; +}; -/* - * Global variables to validate AG header values against the manual count - * from the btree traversal. - * - * Note: these values must be reset when processing a new AG, and for now - * forces the AG scanning in phase2 to not be threaded. - */ -static xfs_extlen_t agffreeblks; -static xfs_extlen_t agflongest; -static __uint64_t agfbtreeblks; -static __uint32_t agicount; -static __uint32_t agifreecount; +static void +scanfunc_allocbt( + struct xfs_btree_block *block, + int level, + xfs_agblock_t bno, + xfs_agnumber_t agno, + int suspect, + int isroot, + __uint32_t magic, + struct aghdr_cnts *agcnts); void set_mp(xfs_mount_t *mpp) @@ -75,8 +80,10 @@ scan_sbtree( xfs_agblock_t bno, xfs_agnumber_t agno, int suspect, - int isroot), - int isroot) + int isroot, + void *priv), + int isroot, + void *priv) { xfs_buf_t *bp; @@ -86,7 +93,8 @@ scan_sbtree( do_error(_("can't read btree block %d/%d\n"), agno, root); return; } - (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect, isroot); + (*func)(XFS_BUF_TO_BLOCK(bp), nlevels - 1, root, agno, suspect, + isroot, priv); libxfs_putbuf(bp); } @@ -468,7 +476,35 @@ _("out-of-order bmap key (file offset) in inode %llu, %s fork, fsbno %llu\n"), return(0); } -void +static void +scanfunc_bno( + struct xfs_btree_block *block, + int level, + xfs_agblock_t bno, + xfs_agnumber_t agno, + int suspect, + int isroot, + void *agcnts) +{ + return scanfunc_allocbt(block, level, bno, agno, + suspect, isroot, XFS_ABTB_MAGIC, agcnts); +} + +static void +scanfunc_cnt( + struct xfs_btree_block *block, + int level, + xfs_agblock_t bno, + xfs_agnumber_t agno, + int suspect, + int isroot, + void *agcnts) +{ + return scanfunc_allocbt(block, level, bno, agno, + suspect, isroot, XFS_ABTC_MAGIC, agcnts); +} + +static void scanfunc_allocbt( struct xfs_btree_block *block, int level, @@ -476,7 +512,8 @@ scanfunc_allocbt( xfs_agnumber_t agno, int suspect, int isroot, - __uint32_t magic) + __uint32_t magic, + struct aghdr_cnts *agcnts) { const char *name; int i; @@ -506,8 +543,8 @@ scanfunc_allocbt( * free data block counter. */ if (!isroot) { - agfbtreeblks++; - fdblocks++; + agcnts->agfbtreeblks++; + agcnts->fdblocks++; } if (be16_to_cpu(block->bb_level) != level) { @@ -583,10 +620,10 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), lastblock = b; } } else { - fdblocks += len; - agffreeblks += len; - if (len > agflongest) - agflongest = len; + agcnts->fdblocks += len; + agcnts->agffreeblks += len; + if (len > agcnts->agflongest) + agcnts->agflongest = len; if (len < lastcount) { do_warn(_( "out-of-order cnt btree record %d (%u %u) block %u/%u\n"), @@ -670,38 +707,12 @@ _("%s freespace btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), if (bno != 0 && verify_agbno(mp, agno, bno)) { scan_sbtree(bno, level, agno, suspect, (magic == XFS_ABTB_MAGIC) ? - scanfunc_bno : scanfunc_cnt, 0); + scanfunc_bno : scanfunc_cnt, 0, + (void *)agcnts); } } } -void -scanfunc_bno( - struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot) -{ - return scanfunc_allocbt(block, level, bno, agno, - suspect, isroot, XFS_ABTB_MAGIC); -} - -void -scanfunc_cnt( - struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot - ) -{ - return scanfunc_allocbt(block, level, bno, agno, - suspect, isroot, XFS_ABTC_MAGIC); -} - static int scan_single_ino_chunk( xfs_agnumber_t agno, @@ -879,16 +890,17 @@ _("inode rec for ino %llu (%d/%d) overlaps existing rec (start %d/%d)\n"), * get the start and alignment of the inode chunks right. Those chunks * that we aren't sure about go into the uncertain list. */ -void +static void scanfunc_ino( struct xfs_btree_block *block, int level, xfs_agblock_t bno, xfs_agnumber_t agno, int suspect, - int isroot - ) + int isroot, + void *priv) { + struct aghdr_cnts *agcnts = priv; int i; int numrecs; int state; @@ -968,10 +980,10 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), * the block. skip processing of bogus records. */ for (i = 0; i < numrecs; i++) { - agicount += XFS_INODES_PER_CHUNK; - icount += XFS_INODES_PER_CHUNK; - agifreecount += be32_to_cpu(rp[i].ir_freecount); - ifreecount += be32_to_cpu(rp[i].ir_freecount); + agcnts->agicount += XFS_INODES_PER_CHUNK; + agcnts->icount += XFS_INODES_PER_CHUNK; + agcnts->agifreecount += be32_to_cpu(rp[i].ir_freecount); + agcnts->ifreecount += be32_to_cpu(rp[i].ir_freecount); suspect = scan_single_ino_chunk(agno, &rp[i], suspect); } @@ -1015,13 +1027,14 @@ _("inode btree block claimed (state %d), agno %d, bno %d, suspect %d\n"), if (be32_to_cpu(pp[i]) != 0 && verify_agbno(mp, agno, be32_to_cpu(pp[i]))) scan_sbtree(be32_to_cpu(pp[i]), level, agno, - suspect, scanfunc_ino, 0); + suspect, scanfunc_ino, 0, priv); } } -void +static void scan_freelist( - xfs_agf_t *agf) + xfs_agf_t *agf, + struct aghdr_cnts *agcnts) { xfs_agfl_t *agfl; xfs_buf_t *agflbuf; @@ -1068,7 +1081,7 @@ scan_freelist( be32_to_cpu(agf->agf_flcount), agno); } - fdblocks += count; + agcnts->fdblocks += count; libxfs_putbuf(agflbuf); } @@ -1076,14 +1089,15 @@ scan_freelist( static void validate_agf( struct xfs_agf *agf, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct aghdr_cnts *agcnts) { xfs_agblock_t bno; bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); if (bno != 0 && verify_agbno(mp, agno, bno)) { scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), - agno, 0, scanfunc_bno, 1); + agno, 0, scanfunc_bno, 1, agcnts); } else { do_warn(_("bad agbno %u for btbno root, agno %d\n"), bno, agno); @@ -1092,33 +1106,34 @@ validate_agf( bno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); if (bno != 0 && verify_agbno(mp, agno, bno)) { scan_sbtree(bno, be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), - agno, 0, scanfunc_cnt, 1); + agno, 0, scanfunc_cnt, 1, agcnts); } else { do_warn(_("bad agbno %u for btbcnt root, agno %d\n"), bno, agno); } - if (be32_to_cpu(agf->agf_freeblks) != agffreeblks) { + if (be32_to_cpu(agf->agf_freeblks) != agcnts->agffreeblks) { do_warn(_("agf_freeblks %u, counted %u in ag %u\n"), - be32_to_cpu(agf->agf_freeblks), agffreeblks, agno); + be32_to_cpu(agf->agf_freeblks), agcnts->agffreeblks, agno); } - if (be32_to_cpu(agf->agf_longest) != agflongest) { + if (be32_to_cpu(agf->agf_longest) != agcnts->agflongest) { do_warn(_("agf_longest %u, counted %u in ag %u\n"), - be32_to_cpu(agf->agf_longest), agflongest, agno); + be32_to_cpu(agf->agf_longest), agcnts->agflongest, agno); } if (xfs_sb_version_haslazysbcount(&mp->m_sb) && - be32_to_cpu(agf->agf_btreeblks) != agfbtreeblks) { + be32_to_cpu(agf->agf_btreeblks) != agcnts->agfbtreeblks) { do_warn(_("agf_btreeblks %u, counted %u in ag %u\n"), - be32_to_cpu(agf->agf_btreeblks), agfbtreeblks, agno); + be32_to_cpu(agf->agf_btreeblks), agcnts->agfbtreeblks, agno); } } static void validate_agi( struct xfs_agi *agi, - xfs_agnumber_t agno) + xfs_agnumber_t agno, + struct aghdr_cnts *agcnts) { xfs_agblock_t bno; int i; @@ -1126,20 +1141,20 @@ validate_agi( bno = be32_to_cpu(agi->agi_root); if (bno != 0 && verify_agbno(mp, agno, bno)) { scan_sbtree(bno, be32_to_cpu(agi->agi_level), - agno, 0, scanfunc_ino, 1); + agno, 0, scanfunc_ino, 1, agcnts); } else { do_warn(_("bad agbno %u for inobt root, agno %d\n"), be32_to_cpu(agi->agi_root), agno); } - if (be32_to_cpu(agi->agi_count) != agicount) { + if (be32_to_cpu(agi->agi_count) != agcnts->agicount) { do_warn(_("agi_count %u, counted %u in ag %u\n"), - be32_to_cpu(agi->agi_count), agicount, agno); + be32_to_cpu(agi->agi_count), agcnts->agicount, agno); } - if (be32_to_cpu(agi->agi_freecount) != agifreecount) { + if (be32_to_cpu(agi->agi_freecount) != agcnts->agifreecount) { do_warn(_("agi_freecount %u, counted %u in ag %u\n"), - be32_to_cpu(agi->agi_freecount), agifreecount, agno); + be32_to_cpu(agi->agi_freecount), agcnts->agifreecount, agno); } for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { @@ -1155,42 +1170,15 @@ validate_agi( } /* - * Validate block/inode counts in the superblock. - * - * Note: needs to be called after scan_ag() has been called for all - * allocation groups. - */ -void -validate_sb( - struct xfs_sb *sb) -{ - if (sb->sb_icount != icount) { - do_warn(_("sb_icount %lld, counted %lld\n"), - sb->sb_icount, icount); - } - - if (sb->sb_ifree != ifreecount) { - do_warn(_("sb_ifree %lld, counted %lld\n"), - sb->sb_ifree, ifreecount); - } - - if (sb->sb_fdblocks != fdblocks) { - do_warn(_("sb_fdblocks %lld, counted %lld\n"), - sb->sb_fdblocks, fdblocks); - } - - /* XXX: check sb_frextents */ -} - -/* * Scan an AG for obvious corruption. - * - * Note: This code is not reentrant due to the use of global variables. */ -void +static void scan_ag( - xfs_agnumber_t agno) + work_queue_t *wq, + xfs_agnumber_t agno, + void *arg) { + struct aghdr_cnts *agcnts = arg; xfs_agf_t *agf; xfs_buf_t *agfbuf; int agf_dirty = 0; @@ -1202,16 +1190,6 @@ scan_ag( int sb_dirty = 0; int status; - /* - * Reset the global variables to track the AG header validity. - * - * Because we use global variable but can get called multiple times - * we have to make sure to always reset these variables. - */ - agicount = agifreecount = 0; - agffreeblks = agfbtreeblks = 0; - agflongest = 0; - sbbuf = libxfs_readbuf(mp->m_dev, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), 0); if (!sbbuf) { @@ -1301,10 +1279,10 @@ scan_ag( return; } - scan_freelist(agf); + scan_freelist(agf, agcnts); - validate_agf(agf, agno); - validate_agi(agi, agno); + validate_agf(agf, agno, agcnts); + validate_agi(agi, agno, agcnts); ASSERT(agi_dirty == 0 || (agi_dirty && !no_modify)); @@ -1331,4 +1309,64 @@ scan_ag( libxfs_putbuf(sbbuf); free(sb); PROG_RPT_INC(prog_rpt_done[agno], 1); + +#ifdef XR_INODE_TRACE + print_inode_list(i); +#endif + return; } + +#define SCAN_THREADS 32 + +void +scan_ags( + struct xfs_mount *mp, + int scan_threads) +{ + struct aghdr_cnts *agcnts; + __uint64_t fdblocks = 0; + __uint64_t icount = 0; + __uint64_t ifreecount = 0; + xfs_agnumber_t i; + work_queue_t wq; + + agcnts = malloc(mp->m_sb.sb_agcount * sizeof(*agcnts)); + if (!agcnts) { + do_abort(_("no memory for ag header counts\n")); + return; + } + memset(agcnts, 0, mp->m_sb.sb_agcount * sizeof(*agcnts)); + + create_work_queue(&wq, mp, scan_threads); + + for (i = 0; i < mp->m_sb.sb_agcount; i++) + queue_work(&wq, scan_ag, i, &agcnts[i]); + + destroy_work_queue(&wq); + + /* tally up the counts */ + for (i = 0; i < mp->m_sb.sb_agcount; i++) { + fdblocks += agcnts[i].fdblocks; + icount += agcnts[i].icount; + ifreecount += agcnts[i].ifreecount; + } + + /* + * Validate that our manual counts match the superblock. + */ + if (mp->m_sb.sb_icount != icount) { + do_warn(_("sb_icount %lld, counted %lld\n"), + mp->m_sb.sb_icount, icount); + } + + if (mp->m_sb.sb_ifree != ifreecount) { + do_warn(_("sb_ifree %lld, counted %lld\n"), + mp->m_sb.sb_ifree, ifreecount); + } + + if (mp->m_sb.sb_fdblocks != fdblocks) { + do_warn(_("sb_fdblocks %lld, counted %lld\n"), + mp->m_sb.sb_fdblocks, fdblocks); + } +} + diff --git a/repair/scan.h b/repair/scan.h index 20567fb..9f945cf 100644 --- a/repair/scan.h +++ b/repair/scan.h @@ -20,19 +20,6 @@ struct blkmap; -void scan_sbtree( - xfs_agblock_t root, - int nlevels, - xfs_agnumber_t agno, - int suspect, - void (*func)(struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot), - int isroot); - int scan_lbtree( xfs_dfsbno_t root, int nlevels, @@ -74,29 +61,9 @@ int scanfunc_bmap( int check_dups, int *dirty); -void scanfunc_bno( - struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot); - -void scanfunc_cnt( - struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot); - void -scanfunc_ino( - struct xfs_btree_block *block, - int level, - xfs_agblock_t bno, - xfs_agnumber_t agno, - int suspect, - int isroot); +scan_ags( + struct xfs_mount *mp, + int scan_threads); #endif /* _XR_SCAN_H */ diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c index 1d9ad46..4707b83 100644 --- a/repair/xfs_repair.c +++ b/repair/xfs_repair.c @@ -33,7 +33,7 @@ #define rounddown(x, y) (((x)/(y))*(y)) extern void phase1(xfs_mount_t *); -extern void phase2(xfs_mount_t *); +extern void phase2(xfs_mount_t *, int); extern void phase3(xfs_mount_t *); extern void phase4(xfs_mount_t *); extern void phase5(xfs_mount_t *); @@ -63,6 +63,8 @@ char *o_opts[] = { "ag_stride", #define FORCE_GEO 5 "force_geometry", +#define PHASE2_THREADS 6 + "phase2_threads", NULL }; @@ -80,6 +82,7 @@ char *c_opts[] = { static int ihash_option_used; static int bhash_option_used; static long max_mem_specified; /* in megabytes */ +static int phase2_threads = 32; static void usage(void) @@ -266,6 +269,9 @@ process_args(int argc, char **argv) respec('o', o_opts, FORCE_GEO); force_geo = 1; break; + case PHASE2_THREADS: + phase2_threads = (int)strtol(val, NULL, 0); + break; default: unknown('o', val); break; @@ -709,7 +715,7 @@ main(int argc, char **argv) } /* make sure the per-ag freespace maps are ok so we can mount the fs */ - phase2(mp); + phase2(mp, phase2_threads); timestamp(PHASE_END, 2, NULL); if (do_prefetch) -- 1.7.2.3 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs