xfs_ialloc_ag_alloc() makes several attempts to allocate a full inode
chunk. If all else fails, reduce the allocation to the minimum sparse
granularity and attempt to allocate a sparse inode chunk.

If sparse chunk allocation succeeds, check whether an inobt record
already exists that can track the chunk. If so, inherit and update the
existing record. Otherwise, insert a new record for the sparse chunk.

Update xfs_inobt_insert_rec() to take the holemask as a parameter and
set the associated field on disk. Create the xfs_inobt_update_insert()
helper to handle the sparse chunk allocation case - insert or update an
existing record depending on whether it already exists.

Signed-off-by: Brian Foster <bfoster@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_ialloc.c | 397 +++++++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_trace.h         |  47 ++++++
 2 files changed, 426 insertions(+), 18 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index fc001d9..090d114 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -122,12 +122,16 @@ xfs_inobt_get_rec(
 STATIC int
 xfs_inobt_insert_rec(
 	struct xfs_btree_cur	*cur,
+	__uint16_t		holemask,
+	__uint8_t		count,
 	__int32_t		freecount,
 	xfs_inofree_t		free,
 	int			*stat)
 {
-	cur->bc_rec.i.ir_holemask = 0;
-	cur->bc_rec.i.ir_count = 0; /* zero for backwards compatibility */
+	ASSERT(count == 0 || xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb));
+
+	cur->bc_rec.i.ir_holemask = holemask;
+	cur->bc_rec.i.ir_count = count;
 	cur->bc_rec.i.ir_freecount = freecount;
 	cur->bc_rec.i.ir_free = free;
 	return xfs_btree_insert(cur, stat);
@@ -151,6 +155,19 @@ xfs_inobt_insert(
 	xfs_agino_t		thisino;
 	int			i;
 	int			error;
+	uint8_t			count;
+
+	/*
+	 * Only set ir_count in the inobt record if the sparse inodes feature is
+	 * enabled. If disabled, we must maintain backwards compatibility with
+	 * the older inobt record format where the current count and holemask
+	 * fields map to the higher order bytes of freecount and thus must be
+	 * zeroed.
+	 */
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+		count = XFS_INODES_PER_CHUNK;
+	else
+		count = 0;
 
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
 
@@ -164,7 +181,7 @@ xfs_inobt_insert(
 		}
 		ASSERT(i == 0);
 
-		error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+		error = xfs_inobt_insert_rec(cur, 0, count, XFS_INODES_PER_CHUNK,
 					     XFS_INOBT_ALL_FREE, &i);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -174,8 +191,58 @@ xfs_inobt_insert(
 	}
 
 	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+}
 
+/*
+ * Update or insert a new record based on a sparse inode chunk allocation.
+ *
+ * If a record already exists, the new record is an updated version of that
+ * record based on a merge of sparse inode chunks. Update the record in place.
+ * Otherwise, insert a new record in the tree. Note that the record to insert
+ * must already have been aligned and merged, if necessary.
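+ *
+ * For example, the first sparse allocation in an aligned chunk range inserts
+ * a new record, while a later sparse allocation in the same range arrives
+ * here as a merged record and updates the existing record in place.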
+ */
+STATIC int
+xfs_inobt_update_insert(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	struct xfs_inobt_rec_incore *rec,
+	xfs_btnum_t		btnum)
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	int			i;
+	int			error;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	error = xfs_inobt_lookup(cur, rec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	if (i == 1) {
+		/* found a record, update it with the merged record */
+		error = xfs_inobt_update(cur, rec);
+		if (error)
+			goto error;
+		goto out;
+	}
+
+	/* no existing record, insert a new one */
+	error = xfs_inobt_insert_rec(cur, rec->ir_holemask, rec->ir_count,
+				     rec->ir_freecount, rec->ir_free, &i);
+	if (error)
+		goto error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	return 0;
+
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
 }
 
 /*
@@ -215,8 +282,36 @@ xfs_check_agi_freecount(
 	}
 	return 0;
 }
+
+/*
+ * Verify that an inode record has a valid inode count. With sparse inode
+ * chunk support enabled, the count must be consistent with the holemask.
+ * Otherwise, the count must be zero.
+ */
+STATIC int
+xfs_inobt_rec_check_count(
+	struct xfs_mount		*mp,
+	struct xfs_inobt_rec_incore	*rec)
+{
+	int				inocount;
+	DECLARE_BITMAP(allocbmap, XFS_INODES_PER_CHUNK);
+
+	if (!xfs_sb_version_hassparseinodes(&mp->m_sb)) {
+		if (rec->ir_count)
+			return -EFSCORRUPTED;
+		return 0;
+	}
+
+	xfs_inobt_ialloc_bitmap(allocbmap, rec);
+	inocount = bitmap_weight(allocbmap, XFS_INODES_PER_CHUNK);
+	if (inocount != rec->ir_count)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
 #else
 #define xfs_check_agi_freecount(cur, agi)	0
+#define xfs_inobt_rec_check_count(mp, rec)	0
 #endif
 
 /*
@@ -358,6 +453,183 @@ xfs_ialloc_inode_init(
 }
 
 /*
+ * Align a record for a recently allocated sparse chunk. The input is a record
+ * that describes the unaligned chunk. The record is aligned such that it is
+ * fit for insertion (or merge) into the on-disk inode btrees.
+ */
+STATIC void
+xfs_align_sparse_rec(
+	struct xfs_mount	*mp,
+	struct xfs_inobt_rec_incore *rec)
+{
+	xfs_agblock_t		agbno;
+	xfs_agblock_t		mod;
+	int			offset;
+	uint16_t		allocmask;
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
+	mod = agbno % mp->m_sb.sb_inoalignmt;
+	if (!mod)
+		return;
+
+	/* calculate the inode offset and align startino */
+	offset = mod << mp->m_sb.sb_inopblog;
+	rec->ir_startino -= offset;
+
+	/*
+	 * Since startino has been aligned down, we have to left shift
+	 * ir_holemask such that it continues to represent the same physical
+	 * inodes as the unaligned record. The unaligned record by definition
+	 * tracks the allocated inodes with the lowest order bits.
+	 *
+	 * ir_holemask is inverted before the shift such that set bits
+	 * represent allocated inodes. This makes it safe for the bit-shift to
+	 * introduce zeroes in the lower order bits without corrupting the
+	 * record.
+	 *
+	 * Note that no change is required for ir_count, ir_freecount or
+	 * ir_free. The count values are not affected by alignment and ir_free
+	 * is initialized to 1s for all inodes, sparse or otherwise.
+	 */
+	allocmask = ~rec->ir_holemask;
+	allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
+	rec->ir_holemask = ~allocmask;
+}
+
+/*
+ * Determine whether two sparse inode records can be merged. The inode ranges
+ * must match and there must be no allocation overlap between the records.
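+ *
+ * For example, since set holemask bits represent holes (four inodes per bit),
+ * records with holemasks 0xff00 and 0x00ff describe disjoint halves of the
+ * same chunk and can merge, while two records that both carry holemask 0x00ff
+ * allocate the same inodes and cannot.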
+ */
+STATIC bool
+__xfs_inobt_can_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
+	struct xfs_inobt_rec_incore	*srec)	/* src record */
+{
+	DECLARE_BITMAP(talloc, 64);
+	DECLARE_BITMAP(salloc, 64);
+	DECLARE_BITMAP(tmp, 64);
+
+	/* records must cover the same inode range */
+	if (trec->ir_startino != srec->ir_startino)
+		return false;
+
+	/* both records must be sparse */
+	if (!xfs_inobt_issparse(trec->ir_holemask) ||
+	    !xfs_inobt_issparse(srec->ir_holemask))
+		return false;
+
+	/* can't exceed capacity of a full record */
+	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
+		return false;
+
+	/* verify there is no allocation overlap */
+	xfs_inobt_ialloc_bitmap(talloc, trec);
+	xfs_inobt_ialloc_bitmap(salloc, srec);
+
+	bitmap_and(tmp, salloc, talloc, 64);
+	if (!bitmap_empty(tmp, 64))
+		return false;
+
+	return true;
+}
+
+/*
+ * Merge two sparse inode records. The caller must call __xfs_inobt_can_merge()
+ * to ensure the merge is valid.
+ */
+STATIC void
+__xfs_inobt_rec_merge(
+	struct xfs_inobt_rec_incore	*trec,	/* target */
+	struct xfs_inobt_rec_incore	*srec)	/* src */
+{
+	ASSERT(trec->ir_startino == srec->ir_startino);
+
+	/* combine the counts */
+	trec->ir_count += srec->ir_count;
+	trec->ir_freecount += srec->ir_freecount;
+
+	/* merge the holemask */
+	trec->ir_holemask &= srec->ir_holemask;
+
+	/* merge the free mask */
+	trec->ir_free &= srec->ir_free;
+}
+
+/*
+ * Determine whether a newly allocated sparse inode chunk record overlaps with
+ * an existing sparse record in the inobt. When sparse inode chunks are
+ * enabled, all inode chunk alignment is increased from cluster size to
+ * physical inode chunk size. This means that the smallest, non-zero gap
+ * between two inode chunks is at least one full inode chunk. When a sparse
+ * inode chunk is allocated, the containing record is also aligned in this
+ * manner such that future sparse allocations within that same range all align
+ * to the same record startino. This alignment policy supports the ability to
+ * merge sparse chunks into complete chunks over time.
+ *
+ * Given a newly allocated/aligned sparse inode record, look up whether a
+ * sparse record already exists at this startino. If so, merge the two records
+ * and return the merged record in nrec.
+ *
+ * An error is returned if records overlap but a merge is not possible. Given
+ * the alignment constraints described above, this should never happen and
+ * thus is treated as fs corruption.
+ */
+STATIC int
+xfs_inobt_rec_merge(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_btnum_t		btnum,
+	struct xfs_inobt_rec_incore *nrec)	/* in/out: new/merged rec. */
+{
+	struct xfs_btree_cur	*cur;
+	struct xfs_agi		*agi = XFS_BUF_TO_AGI(agbp);
+	xfs_agnumber_t		agno = be32_to_cpu(agi->agi_seqno);
+	int			error;
+	int			i;
+	struct xfs_inobt_rec_incore rec;
+
+	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, we're done */
+	if (i == 0)
+		goto out;
+
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		goto error;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+	ASSERT(rec.ir_startino == nrec->ir_startino);
+
+	/*
+	 * This should never happen. If we have coexisting records that cannot
+	 * merge, something is seriously wrong.
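+	 * Both records align to the same startino and sparse allocations
+	 * cannot overlap inodes that are already allocated on disk, so
+	 * records that coexist yet fail __xfs_inobt_can_merge() imply
+	 * on-disk corruption.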
+	 */
+	if (!__xfs_inobt_can_merge(nrec, &rec)) {
+		error = -EFSCORRUPTED;
+		goto error;
+	}
+
+	trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, rec.ir_holemask,
+				 nrec->ir_startino, nrec->ir_holemask);
+
+	__xfs_inobt_rec_merge(nrec, &rec);
+
+	trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
+				  nrec->ir_holemask);
+
+out:
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
  */
@@ -375,6 +647,9 @@ xfs_ialloc_ag_alloc(
 	xfs_agino_t	newlen;		/* new number of inodes */
 	int		isaligned = 0;	/* inode allocation at stripe unit */
 					/* boundary */
+	uint16_t	allocmask = (uint16_t) -1; /* init. to full chunk */
+	struct xfs_inobt_rec_incore rec;
+	struct xfs_perag *pag;
 
 	memset(&args, 0, sizeof(args));
 
@@ -490,6 +765,45 @@ xfs_ialloc_ag_alloc(
 			return error;
 	}
 
+	/*
+	 * Finally, try a sparse allocation if the filesystem supports it and
+	 * the sparse allocation length is smaller than a full chunk.
+	 */
+	if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
+	    args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+	    args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = be32_to_cpu(agi->agi_root);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
+		args.alignment = args.mp->m_sb.sb_spinoalignmt;
+		args.prod = 1;
+
+		args.minlen = args.mp->m_ialloc_min_blks;
+		args.maxlen = args.minlen;
+
+		/*
+		 * The inode record will be aligned to full chunk size. We must
+		 * prevent sparse allocations near AG boundaries that would
+		 * result in invalid inode records, such as records that start
+		 * at agbno 0 or extend beyond the AG.
+		 *
+		 * Set min agbno to the first aligned, non-zero agbno and max
+		 * to the last aligned agbno that is at least one full chunk
+		 * from the end of the AG.
+		 */
+		args.min_agbno = args.mp->m_sb.sb_inoalignmt;
+		args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
+					    args.mp->m_sb.sb_inoalignmt) -
+				 args.mp->m_ialloc_blks;
+
+		error = xfs_alloc_vextent(&args);
+		if (error)
+			return error;
+
+		newlen = args.len << args.mp->m_sb.sb_inopblog;
+		allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
+	}
+
 	if (args.fsbno == NULLFSBLOCK) {
 		*alloc = 0;
 		return 0;
@@ -514,6 +828,65 @@ xfs_ialloc_ag_alloc(
 	 * Convert the results.
 	 */
 	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+
+	if (xfs_inobt_issparse(~allocmask)) {
+		/*
+		 * We've allocated a sparse chunk...
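+		 * Set up the record for the unaligned allocation, align it
+		 * to the full chunk size, merge it with any existing record
+		 * for this range, verify the result and then insert or update
+		 * the inode btrees accordingly.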
+		 */
+		rec.ir_startino = newino;
+		rec.ir_holemask = ~allocmask;
+		rec.ir_count = newlen;
+		rec.ir_freecount = newlen;
+		rec.ir_free = XFS_INOBT_ALL_FREE;
+
+		/* align record and update newino for agi_newino */
+		xfs_align_sparse_rec(args.mp, &rec);
+		newino = rec.ir_startino;
+
+		error = xfs_inobt_rec_merge(args.mp, tp, agbp, XFS_BTNUM_INO,
+					    &rec);
+		if (!error)
+			error = xfs_inobt_rec_check_count(args.mp, &rec);
+		if (error == -EFSCORRUPTED) {
+			xfs_alert(args.mp,
+	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
+				  XFS_AGINO_TO_INO(args.mp, agno,
+						   rec.ir_startino),
+				  rec.ir_holemask, rec.ir_count);
+			xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
+		}
+		if (error)
+			return error;
+
+		error = xfs_inobt_update_insert(args.mp, tp, agbp, &rec,
+						XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_update_insert(args.mp, tp, agbp,
+							&rec, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	} else {
+		/* full chunk - insert new records to both btrees */
+		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+					 XFS_BTNUM_INO);
+		if (error)
+			return error;
+
+		if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+			error = xfs_inobt_insert(args.mp, tp, agbp, newino,
+						 newlen, XFS_BTNUM_FINO);
+			if (error)
+				return error;
+		}
+	}
+
+	/*
+	 * Update AGI counts and newino.
+	 */
 	be32_add_cpu(&agi->agi_count, newlen);
 	be32_add_cpu(&agi->agi_freecount, newlen);
 	pag = xfs_perag_get(args.mp, agno);
@@ -522,20 +895,6 @@ xfs_ialloc_ag_alloc(
 	agi->agi_newino = cpu_to_be32(newino);
 
 	/*
-	 * Insert records describing the new inode chunk into the btrees.
-	 */
-	error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-				 XFS_BTNUM_INO);
-	if (error)
-		return error;
-
-	if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
-		error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
-					 XFS_BTNUM_FINO);
-		if (error)
-			return error;
-	}
-	/*
 	 * Log allocation group header fields
 	 */
 	xfs_ialloc_log_agi(tp, agbp,
@@ -1672,7 +2031,9 @@ xfs_difree_finobt(
 		 */
 		XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
 
-		error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+		error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
+					     ibtrec->ir_count,
+					     ibtrec->ir_freecount,
 					     ibtrec->ir_free, &i);
 		if (error)
 			goto error;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e3..12a4bf4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -734,6 +734,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size,
 		  __entry->blocks, __entry->shift, __entry->writeio_blocks)
 )
 
+TRACE_EVENT(xfs_irec_merge_pre,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask),
+	TP_ARGS(mp, agno, agino, holemask, nagino, nholemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+		__field(xfs_agino_t, nagino)
+		__field(uint16_t, nholemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+		__entry->nagino = nagino;
+		__entry->nholemask = nholemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+		  __entry->agino, __entry->holemask, __entry->nagino,
+		  __entry->nholemask)
+)
+
+TRACE_EVENT(xfs_irec_merge_post,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+		 uint16_t holemask),
+	TP_ARGS(mp, agno, agino, holemask),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agino_t, agino)
+		__field(uint16_t, holemask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agino = agino;
+		__entry->holemask = holemask;
+	),
+	TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->agno, __entry->agino,
+		  __entry->holemask)
+)
+
 #define DEFINE_IREF_EVENT(name) \
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
--
1.8.3.1

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs