On Thu, Jun 25, 2015 at 04:39:16PM -0700, Darrick J. Wong wrote: > Create a per-AG btree to track the reference counts of physical blocks > to support reflink. Few things from a quick glance... > --- a/fs/xfs/libxfs/xfs_alloc.c > +++ b/fs/xfs/libxfs/xfs_alloc.c > @@ -54,6 +54,8 @@ xfs_extlen_t > xfs_prealloc_blocks( > struct xfs_mount *mp) > { > + if (xfs_sb_version_hasreflink(&mp->m_sb)) > + return XFS_RL_BLOCK(mp) + 1; Should introduce the sb version stuff as a separate patch perhaps with the basic infrastructure defines (see how I did the first rmap btree patch). > @@ -1117,6 +1118,9 @@ xfs_btree_set_refs( > case XFS_BTNUM_RMAP: > xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); > break; > + case XFS_BTNUM_RL: Probably better to call it XFS_BTNUM_REFLINK > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h > index 48ab2b1..a3f8661 100644 > --- a/fs/xfs/libxfs/xfs_btree.h > +++ b/fs/xfs/libxfs/xfs_btree.h > @@ -43,6 +43,7 @@ union xfs_btree_key { > xfs_alloc_key_t alloc; > struct xfs_inobt_key inobt; > struct xfs_rmap_key rmap; > + xfs_reflink_key_t reflink; No new typedefs. struct xfs_reflink_key... (only say this once, but applies many times ;) > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > index 9cff517..e4954ab 100644 > --- a/fs/xfs/libxfs/xfs_format.h > +++ b/fs/xfs/libxfs/xfs_format.h > @@ -446,9 +446,11 @@ xfs_sb_has_compat_feature( > > #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ > #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ > +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflink btree */ > #define XFS_SB_FEAT_RO_COMPAT_ALL \ > (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ > - XFS_SB_FEAT_RO_COMPAT_RMAPBT) > + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ > + XFS_SB_FEAT_RO_COMPAT_REFLINK) The XFS_SB_FEAT_RO_COMPAT_REFLINK flag shoul dbe added as a separate patch and put last in the series so it is only enabled once everything is complete. > #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL > static inline bool > xfs_sb_has_ro_compat_feature( > @@ -522,6 +524,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) > (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); > } > > +static inline int xfs_sb_version_hasreflink(xfs_sb_t *sbp) bool. > @@ -1338,6 +1349,50 @@ typedef __be32 xfs_rmap_ptr_t; > XFS_IBT_BLOCK(mp) + 1) > > /* > + * reflink Btree format definitions > + * > + */ > +#define XFS_RLBT_CRC_MAGIC 0x524C4233 /* 'RLB3' */ #define XFS_RMAP_CRC_MAGIC 0x524d4233 /* 'RMB3' */ Only one bit difference in the magic numbers, which means they are too similar. "RFL3" might be better or maybe "R3FL"... > +/* > + * Data record/key structure > + */ > +typedef struct xfs_reflink_rec { > + __be32 rr_startblock; /* starting block number */ > + __be32 rr_blockcount; /* count of blocks */ > + __be32 rr_nlinks; /* number of inodes linked here */ > +} xfs_reflink_rec_t; > + > +typedef struct xfs_reflink_key { > + __be32 rr_startblock; /* starting block number */ > +} xfs_reflink_key_t; > + > +typedef struct xfs_reflink_rec_incore { > + xfs_agblock_t rr_startblock; /* starting block number */ > + xfs_extlen_t rr_blockcount; /* count of free blocks */ > + xfs_nlink_t rr_nlinks; /* number of inodes linked here */ > +} xfs_reflink_rec_incore_t; We have being using "irec" as shorthand for "in-core record". i.e: struct xfs_reflink_irec. (kill typedefs) > + > +/* > + * When a block hits MAXRLCOUNT references, it becomes permanently > + * stuck in CoW mode, because who knows how many times it's really > + * referenced. > + */ > +#define MAXRLCOUNT ((xfs_nlink_t)~0U) > +#define MAXRLEXTLEN ((xfs_extlen_t)~0U) I'd suggest that if we hit the maximum count, we just abort the reflink operation. > +/* btree pointer type */ > +typedef __be32 xfs_reflink_ptr_t; > + > +#define XFS_RL_BLOCK(mp) \ > + (xfs_sb_version_hasrmapbt(&((mp)->m_sb)) ? \ > + XFS_RMAP_BLOCK(mp) + 1 : \ > + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ > + XFS_FIBT_BLOCK(mp) + 1 : \ > + XFS_IBT_BLOCK(mp) + 1)) That's getting unwieldy. It's large enough for a function.... > +#ifdef REFLINK_DEBUG > +# define dbg_printk(f, a...) do {printk(KERN_ERR f, ## a); } while (0) > +#else > +# define dbg_printk(f, a...) > +#endif xfs_debug() is your friend. > +#define CHECK_AG_NUMBER(mp, agno) \ > + do { \ > + ASSERT((agno) != NULLAGNUMBER); \ > + ASSERT((agno) < (mp)->m_sb.sb_agcount); \ > + } while(0); Ugh. Used once, just open code. > +#define CHECK_AG_EXTENT(mp, agbno, len) \ > + do { \ > + ASSERT((agbno) != NULLAGBLOCK); \ > + ASSERT((len) > 0); \ > + ASSERT((unsigned long long)(agbno) + (len) <= \ > + (mp)->m_sb.sb_agblocks); \ > + } while(0); These are really used in places where corruption checks are warranted, or the extent has already been checked.... > +#define XFS_WANT_CORRUPTED_RLEXT_GOTO(mp, have, agbno, len, nr, label) \ > + do { \ > + XFS_WANT_CORRUPTED_GOTO((mp), (have) == 1, label); \ > + XFS_WANT_CORRUPTED_GOTO((mp), (len) > 0, label); \ > + XFS_WANT_CORRUPTED_GOTO((mp), (nr) >= 2, label); \ > + XFS_WANT_CORRUPTED_GOTO((mp), (unsigned long long)(agbno) + \ > + (len) <= (mp)->m_sb.sb_agblocks, label); \ > + } while(0); Unused. > + > +STATIC int > +xfs_reflinkbt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + int error; > + xfs_agblock_t bno; > + > + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); > + > + /* Allocate the new block from the freelist. If we can't, give up. */ > + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, > + &bno, 1); > + if (error) { > + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); > + return error; > + } Why does the reflink btree use the free list? Why can't it use block allocation like the BMBT tree? > +/* > + * Allocate a new reflink btree cursor. > + */ > +struct xfs_btree_cur * /* new reflink btree cursor */ > +xfs_reflinkbt_init_cursor( > + struct xfs_mount *mp, /* file system mount point */ > + struct xfs_trans *tp, /* transaction pointer */ > + struct xfs_buf *agbp, /* buffer for agf structure */ > + xfs_agnumber_t agno) /* allocation group number */ No real need for these comments on the variables. They are redundant as the code documents what they are just fine. > +{ > + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); > + struct xfs_btree_cur *cur; > + > + CHECK_AG_NUMBER(mp, agno); > + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); > + > + cur->bc_tp = tp; > + cur->bc_mp = mp; > + cur->bc_btnum = XFS_BTNUM_RL; > + cur->bc_blocklog = mp->m_sb.sb_blocklog; > + cur->bc_ops = &xfs_reflinkbt_ops; > + > + cur->bc_nlevels = be32_to_cpu(agf->agf_reflink_level); > + > + cur->bc_private.a.agbp = agbp; > + cur->bc_private.a.agno = agno; > + > + if (xfs_sb_version_hascrc(&mp->m_sb)) > + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; Can be set unconditionally. The next set of functions normally go into a different file. i.e the "xfs_foo_btree.c" file contains the functions required by the generic btree abstraction to implement the "foo" btree format. The file "xfs_foo.c" then contains the code/logic that provides the external foo API, manages the information inthe foo btree, and calls the generic btree functions to manage the btree. This logic isn't present in the patch, so really it shoul dbe added by the patch that starts implementing the reflink API.... > +/* > + * Get the data from the pointed-to record. > + */ > +int /* error */ > +xfs_reflink_get_rec( > + struct xfs_btree_cur *cur, /* btree cursor */ > + xfs_agblock_t *bno, /* output: starting block of extent */ > + xfs_extlen_t *len, /* output: length of extent */ > + xfs_nlink_t *nlink, /* output: number of links */ > + int *stat) /* output: success/failure */ > +{ > + union xfs_btree_rec *rec; > + int error; > + > + error = xfs_btree_get_rec(cur, &rec, stat); > + if (!error && *stat == 1) { > + CHECK_AG_EXTENT(cur->bc_mp, > + be32_to_cpu(rec->reflink.rr_startblock), > + be32_to_cpu(rec->reflink.rr_blockcount)); > + *bno = be32_to_cpu(rec->reflink.rr_startblock); > + *len = be32_to_cpu(rec->reflink.rr_blockcount); > + *nlink = be32_to_cpu(rec->reflink.rr_nlinks); > + } > + return error; if (error || !*stat) return error; ..... return 0; > + error = xfs_reflink_get_rec(cur, &bno, &len, &nr, &x); > + if (error) > + return error; > + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, x == 1, error0); > + error = xfs_btree_delete(cur, i); > + if (error) > + return error; > + error = xfs_reflink_lookup_ge(cur, bno, &x); > +error0: New code should use sane jump labels. e.g. "out_error" is a pretty standard jump label name for this... > diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h > index 88efbb4..d1de74e 100644 > --- a/fs/xfs/libxfs/xfs_shared.h > +++ b/fs/xfs/libxfs/xfs_shared.h > @@ -216,6 +216,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); > #define XFS_INO_REF 2 > #define XFS_ATTR_BTREE_REF 1 > #define XFS_DQUOT_REF 1 > +#define XFS_REFLINK_BTREE_REF 1 whitespace. > @@ -315,6 +317,9 @@ typedef struct xfs_perag { > /* for rcu-safe freeing */ > struct rcu_head rcu_head; > int pagb_count; /* pagb slots in use */ > + > + /* reflink */ > + __uint8_t pagf_reflink_level; May as well just make it the same as what is on disk (i.e. uint32_t). > +++ b/fs/xfs/xfs_stats.c > @@ -61,6 +61,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v) > { "ibt2", XFSSTAT_END_IBT_V2 }, > { "fibt2", XFSSTAT_END_FIBT_V2 }, > { "rmapbt", XFSSTAT_END_RMAP_V2 }, > + { "rlbt2", XFSSTAT_END_RLBT_V2 }, "reflinkbt". No need for the "2", as there is only one set of reflink btree stats. Cheers, Dave. -- Dave Chinner david@xxxxxxxxxxxxx _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs