On Tue, Mar 03, 2020 at 07:28:28PM -0800, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > Create an in-core fake root for AG-rooted btree types so that callers > can generate a whole new btree using the upcoming btree bulk load > function without making the new tree accessible from the rest of the > filesystem. It is up to the individual btree type to provide a function > to create a staged cursor (presumably with the appropriate callouts to > update the fakeroot) and then commit the staged root back into the > filesystem. > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- The code all seems reasonable, mostly infrastructure. Just a few high-level comments. It would be helpful if the commit log (or code comments) explained more about the callouts that are replaced for a staging tree (and why). > fs/xfs/libxfs/xfs_btree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/libxfs/xfs_btree.h | 42 ++++++++++++++-- > fs/xfs/xfs_trace.h | 28 +++++++++++ > 3 files changed, 182 insertions(+), 5 deletions(-) > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c > index e6f898bf3174..9a7c1a4d0423 100644 > --- a/fs/xfs/libxfs/xfs_btree.c > +++ b/fs/xfs/libxfs/xfs_btree.c > @@ -382,6 +382,8 @@ xfs_btree_del_cursor( > /* > * Free the cursor. > */ > + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) > + kmem_free((void *)cur->bc_ops); > kmem_cache_free(xfs_btree_cur_zone, cur); > } > > @@ -4908,3 +4910,118 @@ xfs_btree_has_more_records( > else > return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); > } > + > +/* We don't allow staging cursors to be duplicated. */ > +STATIC struct xfs_btree_cur * > +xfs_btree_fakeroot_dup_cursor( > + struct xfs_btree_cur *cur) > +{ > + ASSERT(0); > + return NULL; > +} > + > +/* Refuse to allow regular block allocation for a staging cursor. 
*/ > +STATIC int > +xfs_btree_fakeroot_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start_bno, > + union xfs_btree_ptr *new_bno, > + int *stat) > +{ > + ASSERT(0); > + return -EFSCORRUPTED; Calling these is a runtime bug as opposed to corruption, right? > +} > + > +/* Refuse to allow block freeing for a staging cursor. */ > +STATIC int > +xfs_btree_fakeroot_free_block( > + struct xfs_btree_cur *cur, > + struct xfs_buf *bp) > +{ > + ASSERT(0); > + return -EFSCORRUPTED; > +} > + For example, why do we not allow alloc/frees of blocks into a staging tree? Is this something related to how staging trees will be constructed vs. normal trees, or is this just stubbed in and to be implemented later? > +/* Initialize a pointer to the root block from the fakeroot. */ > +STATIC void > +xfs_btree_fakeroot_init_ptr_from_cur( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *ptr) > +{ > + struct xbtree_afakeroot *afake; > + > + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); > + > + afake = cur->bc_private.a.afake; > + ptr->s = cpu_to_be32(afake->af_root); > +} > + > +/* Set the root block when our tree has a fakeroot. */ > +STATIC void > +xfs_btree_afakeroot_set_root( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *ptr, > + int inc) > +{ > + struct xbtree_afakeroot *afake = cur->bc_private.a.afake; > + > + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); > + afake->af_root = be32_to_cpu(ptr->s); > + afake->af_levels += inc; > +} > + > +/* > + * Initialize an AG-rooted btree cursor with the given AG btree fake root. The > + * btree cursor's @bc_ops will be overridden as needed to make the staging > + * functionality work. If @new_ops is not NULL, these new ops will be passed > + * out to the caller for further overriding. 
> + */ > +void > +xfs_btree_stage_afakeroot( > + struct xfs_btree_cur *cur, > + struct xbtree_afakeroot *afake, > + struct xfs_btree_ops **new_ops) > +{ > + struct xfs_btree_ops *nops; > + > + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); > + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); > + > + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); > + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); > + nops->alloc_block = xfs_btree_fakeroot_alloc_block; > + nops->free_block = xfs_btree_fakeroot_free_block; > + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; > + nops->set_root = xfs_btree_afakeroot_set_root; > + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; > + > + cur->bc_private.a.afake = afake; > + cur->bc_nlevels = afake->af_levels; > + cur->bc_ops = nops; > + cur->bc_flags |= XFS_BTREE_STAGING; > + > + if (new_ops) > + *new_ops = nops; > +} > + > +/* > + * Transform an AG-rooted staging btree cursor back into a regular cursor by > + * substituting a real btree root for the fake one and restoring normal btree > + * cursor ops. The caller must log the btree root change prior to calling > + * this. > + */ > +void > +xfs_btree_commit_afakeroot( > + struct xfs_btree_cur *cur, > + struct xfs_buf *agbp, > + const struct xfs_btree_ops *ops) > +{ > + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); > + > + trace_xfs_btree_commit_afakeroot(cur); > + > + kmem_free((void *)cur->bc_ops); > + cur->bc_private.a.agbp = agbp; > + cur->bc_ops = ops; > + cur->bc_flags &= ~XFS_BTREE_STAGING; > +} Any reason this new code isn't off in a new xfs_staging_btree.c or some such instead of xfs_btree.c? > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h > index 3eff7c321d43..3ada085609a8 100644 > --- a/fs/xfs/libxfs/xfs_btree.h > +++ b/fs/xfs/libxfs/xfs_btree.h > @@ -188,6 +188,16 @@ union xfs_btree_cur_private { > } abt; > }; > > +/* Private information for an AG-rooted btree. 
*/ > +struct xfs_btree_priv_ag { /* needed for BNO, CNT, INO */ > + union { > + struct xfs_buf *agbp; /* agf/agi buffer pointer */ > + struct xbtree_afakeroot *afake; /* fake ag header root */ > + }; > + xfs_agnumber_t agno; /* ag number */ > + union xfs_btree_cur_private priv; > +}; > + Ideally refactoring this would be a separate patch from adding a new field. Brian > /* > * Btree cursor structure. > * This collects all information needed by the btree code in one place. > @@ -209,11 +219,7 @@ typedef struct xfs_btree_cur > xfs_btnum_t bc_btnum; /* identifies which btree type */ > int bc_statoff; /* offset of btre stats array */ > union { > - struct { /* needed for BNO, CNT, INO */ > - struct xfs_buf *agbp; /* agf/agi buffer pointer */ > - xfs_agnumber_t agno; /* ag number */ > - union xfs_btree_cur_private priv; > - } a; > + struct xfs_btree_priv_ag a; > struct { /* needed for BMAP */ > struct xfs_inode *ip; /* pointer to our inode */ > int allocated; /* count of alloced */ > @@ -232,6 +238,12 @@ typedef struct xfs_btree_cur > #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ > #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ > #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ > +/* > + * The root of this btree is a fakeroot structure so that we can stage a btree > + * rebuild without leaving it accessible via primary metadata. The ops struct > + * is dynamically allocated and must be freed when the cursor is deleted. > + */ > +#define XFS_BTREE_STAGING (1<<5) > > > #define XFS_BTREE_NOERROR 0 > @@ -512,4 +524,24 @@ xfs_btree_islastblock( > return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); > } > > +/* Fake root for an AG-rooted btree. */ > +struct xbtree_afakeroot { > + /* AG block number of the new btree root. */ > + xfs_agblock_t af_root; > + > + /* Height of the new btree. */ > + unsigned int af_levels; > + > + /* Number of blocks used by the btree. 
*/ > + unsigned int af_blocks; > +}; > + > +/* Cursor interactions with fake roots for AG-rooted btrees. */ > +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, > + struct xbtree_afakeroot *afake, > + struct xfs_btree_ops **new_ops); > +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, > + struct xfs_buf *agbp, > + const struct xfs_btree_ops *ops); > + > #endif /* __XFS_BTREE_H__ */ > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index e242988f57fb..57ff9f583b5f 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -3594,6 +3594,34 @@ TRACE_EVENT(xfs_check_new_dalign, > __entry->calc_rootino) > ) > > +TRACE_EVENT(xfs_btree_commit_afakeroot, > + TP_PROTO(struct xfs_btree_cur *cur), > + TP_ARGS(cur), > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(xfs_btnum_t, btnum) > + __field(xfs_agnumber_t, agno) > + __field(xfs_agblock_t, agbno) > + __field(unsigned int, levels) > + __field(unsigned int, blocks) > + ), > + TP_fast_assign( > + __entry->dev = cur->bc_mp->m_super->s_dev; > + __entry->btnum = cur->bc_btnum; > + __entry->agno = cur->bc_private.a.agno; > + __entry->agbno = cur->bc_private.a.afake->af_root; > + __entry->levels = cur->bc_private.a.afake->af_levels; > + __entry->blocks = cur->bc_private.a.afake->af_blocks; > + ), > + TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), > + __entry->agno, > + __entry->levels, > + __entry->blocks, > + __entry->agbno) > +) > + > #endif /* _TRACE_XFS_H */ > > #undef TRACE_INCLUDE_PATH >