On Thu, Aug 22, 2024 at 05:17:31PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > Create an incore object that will contain information about a realtime > allocation group. This will eventually enable us to shard the realtime > section in a similar manner to how we shard the data section, but for > now just a single object for the entire RT subvolume is created. > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 > fs/xfs/libxfs/xfs_format.h | 3 + > fs/xfs/libxfs/xfs_rtgroup.c | 196 ++++++++++++++++++++++++++++++++++++++++ > fs/xfs/libxfs/xfs_rtgroup.h | 212 +++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/libxfs/xfs_sb.c | 7 + > fs/xfs/libxfs/xfs_types.h | 4 + > fs/xfs/xfs_log_recover.c | 20 ++++ > fs/xfs/xfs_mount.c | 16 +++ > fs/xfs/xfs_mount.h | 14 +++ > fs/xfs/xfs_rtalloc.c | 6 + > fs/xfs/xfs_super.c | 1 > fs/xfs/xfs_trace.c | 1 > fs/xfs/xfs_trace.h | 38 ++++++++ > 13 files changed, 517 insertions(+), 2 deletions(-) > create mode 100644 fs/xfs/libxfs/xfs_rtgroup.c > create mode 100644 fs/xfs/libxfs/xfs_rtgroup.h Ok, how is the global address space for real time extents laid out across rt groups? i.e. is it sparse similar to how fsbnos and inode numbers are created for the data device like so? fsbno = (agno << agblklog) | agbno Or is it something different? I can't find that defined anywhere in this patch, so I can't determine if the unit conversion code and validation is correct or not... 
> diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > index 4d8ca08cdd0ec..388b5cef48ca5 100644 > --- a/fs/xfs/Makefile > +++ b/fs/xfs/Makefile > @@ -60,6 +60,7 @@ xfs-y += $(addprefix libxfs/, \ > # xfs_rtbitmap is shared with libxfs > xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ > xfs_rtbitmap.o \ > + xfs_rtgroup.o \ > ) > > # highlevel code > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > index 16a7bc02aa5f5..fa5cfc8265d92 100644 > --- a/fs/xfs/libxfs/xfs_format.h > +++ b/fs/xfs/libxfs/xfs_format.h > @@ -176,6 +176,9 @@ typedef struct xfs_sb { > > xfs_ino_t sb_metadirino; /* metadata directory tree root */ > > + xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ > + xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ So min/max rtgroup size is defined by the sb_rextsize field? What redundant metadata do we end up with that allows us to validate the sb_rextsize field is still valid w.r.t. rtgroups geometry? Also, rtgroup lengths are defined by "rtx counts", but the definitions in the xfs_mount later on are "m_rtblklog" and "m_rgblocks" and we use xfs_rgblock_t and rgbno all over the place. Just from the context of this patch, it is somewhat confusing trying to work out what the difference is... > /* must be padded to 64 bit alignment */ > } xfs_sb_t; > > diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c > new file mode 100644 > index 0000000000000..2bad1ecb811eb > --- /dev/null > +++ b/fs/xfs/libxfs/xfs_rtgroup.c > @@ -0,0 +1,196 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. > + * Author: Darrick J. 
Wong <djwong@xxxxxxxxxx> > + */ > +#include "xfs.h" > +#include "xfs_fs.h" > +#include "xfs_shared.h" > +#include "xfs_format.h" > +#include "xfs_trans_resv.h" > +#include "xfs_bit.h" > +#include "xfs_sb.h" > +#include "xfs_mount.h" > +#include "xfs_btree.h" > +#include "xfs_alloc_btree.h" > +#include "xfs_rmap_btree.h" > +#include "xfs_alloc.h" > +#include "xfs_ialloc.h" > +#include "xfs_rmap.h" > +#include "xfs_ag.h" > +#include "xfs_ag_resv.h" > +#include "xfs_health.h" > +#include "xfs_error.h" > +#include "xfs_bmap.h" > +#include "xfs_defer.h" > +#include "xfs_log_format.h" > +#include "xfs_trans.h" > +#include "xfs_trace.h" > +#include "xfs_inode.h" > +#include "xfs_icache.h" > +#include "xfs_rtgroup.h" > +#include "xfs_rtbitmap.h" > + > +/* > + * Passive reference counting access wrappers to the rtgroup structures. If > + * the rtgroup structure is to be freed, the freeing code is responsible for > + * cleaning up objects with passive references before freeing the structure. > + */ > +struct xfs_rtgroup * > +xfs_rtgroup_get( > + struct xfs_mount *mp, > + xfs_rgnumber_t rgno) > +{ > + struct xfs_rtgroup *rtg; > + > + rcu_read_lock(); > + rtg = xa_load(&mp->m_rtgroups, rgno); > + if (rtg) { > + trace_xfs_rtgroup_get(rtg, _RET_IP_); > + ASSERT(atomic_read(&rtg->rtg_ref) >= 0); > + atomic_inc(&rtg->rtg_ref); > + } > + rcu_read_unlock(); > + return rtg; > +} > + > +/* Get a passive reference to the given rtgroup. */ > +struct xfs_rtgroup * > +xfs_rtgroup_hold( > + struct xfs_rtgroup *rtg) > +{ > + ASSERT(atomic_read(&rtg->rtg_ref) > 0 || > + atomic_read(&rtg->rtg_active_ref) > 0); > + > + trace_xfs_rtgroup_hold(rtg, _RET_IP_); > + atomic_inc(&rtg->rtg_ref); > + return rtg; > +} > + > +void > +xfs_rtgroup_put( > + struct xfs_rtgroup *rtg) > +{ > + trace_xfs_rtgroup_put(rtg, _RET_IP_); > + ASSERT(atomic_read(&rtg->rtg_ref) > 0); > + atomic_dec(&rtg->rtg_ref); > +} > + > +/* > + * Active references for rtgroup structures. 
This is for short term access to > + * the rtgroup structures for walking trees or accessing state. If an rtgroup > + * is being shrunk or is offline, then this will fail to find that group and > + * return NULL instead. > + */ > +struct xfs_rtgroup * > +xfs_rtgroup_grab( > + struct xfs_mount *mp, > + xfs_agnumber_t agno) > +{ > + struct xfs_rtgroup *rtg; > + > + rcu_read_lock(); > + rtg = xa_load(&mp->m_rtgroups, agno); > + if (rtg) { > + trace_xfs_rtgroup_grab(rtg, _RET_IP_); > + if (!atomic_inc_not_zero(&rtg->rtg_active_ref)) > + rtg = NULL; > + } > + rcu_read_unlock(); > + return rtg; > +} > + > +void > +xfs_rtgroup_rele( > + struct xfs_rtgroup *rtg) > +{ > + trace_xfs_rtgroup_rele(rtg, _RET_IP_); > + if (atomic_dec_and_test(&rtg->rtg_active_ref)) > + wake_up(&rtg->rtg_active_wq); > +} This is all duplicates of the xfs_perag code. Can you put together a patchset to abstract this into a "xfs_group" and embed them in both the perag and rtgroup structures? That way we only need one set of lookup and iterator infrastructure, and it will work for both data and rt groups... > + > +/* Compute the number of rt extents in this realtime group. */ > +xfs_rtxnum_t > +xfs_rtgroup_extents( > + struct xfs_mount *mp, > + xfs_rgnumber_t rgno) > +{ > + xfs_rgnumber_t rgcount = mp->m_sb.sb_rgcount; > + > + ASSERT(rgno < rgcount); > + if (rgno == rgcount - 1) > + return mp->m_sb.sb_rextents - > + ((xfs_rtxnum_t)rgno * mp->m_sb.sb_rgextents); Urk. So this relies on a non-rtgroup filesystem doing a multiplication by zero of a field that the on-disk format does not understand to get the right result. I think this is copying a bad pattern we've been slowly trying to remove from the normal allocation group code. > + > + ASSERT(xfs_has_rtgroups(mp)); > + return mp->m_sb.sb_rgextents; > +} We already embed the length of the rtgroup in the rtgroup structure. This should be looking up the rtgroup (or being passed the rtgroup the caller already has) and doing the right thing. i.e. 
if (!rtg || !xfs_has_rtgroups(rtg->rtg_mount)) return mp->m_sb.sb_rextents; return rtg->rtg_extents; > diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h > new file mode 100644 > index 0000000000000..2c09ecfc50328 > --- /dev/null > +++ b/fs/xfs/libxfs/xfs_rtgroup.h > @@ -0,0 +1,212 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +/* > + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. > + * Author: Darrick J. Wong <djwong@xxxxxxxxxx> > + */ > +#ifndef __LIBXFS_RTGROUP_H > +#define __LIBXFS_RTGROUP_H 1 > + > +struct xfs_mount; > +struct xfs_trans; > + > +/* > + * Realtime group incore structure, similar to the per-AG structure. > + */ > +struct xfs_rtgroup { > + struct xfs_mount *rtg_mount; > + xfs_rgnumber_t rtg_rgno; > + atomic_t rtg_ref; /* passive reference count */ > + atomic_t rtg_active_ref; /* active reference count */ > + wait_queue_head_t rtg_active_wq;/* woken active_ref falls to zero */ Yeah, that's all common with xfs_perag.... .... > +/* > + * rt group iteration APIs > + */ > +static inline struct xfs_rtgroup * > +xfs_rtgroup_next( > + struct xfs_rtgroup *rtg, > + xfs_rgnumber_t *rgno, > + xfs_rgnumber_t end_rgno) > +{ > + struct xfs_mount *mp = rtg->rtg_mount; > + > + *rgno = rtg->rtg_rgno + 1; > + xfs_rtgroup_rele(rtg); > + if (*rgno > end_rgno) > + return NULL; > + return xfs_rtgroup_grab(mp, *rgno); > +} > + > +#define for_each_rtgroup_range(mp, rgno, end_rgno, rtg) \ > + for ((rtg) = xfs_rtgroup_grab((mp), (rgno)); \ > + (rtg) != NULL; \ > + (rtg) = xfs_rtgroup_next((rtg), &(rgno), (end_rgno))) > + > +#define for_each_rtgroup_from(mp, rgno, rtg) \ > + for_each_rtgroup_range((mp), (rgno), (mp)->m_sb.sb_rgcount - 1, (rtg)) > + > + > +#define for_each_rtgroup(mp, rgno, rtg) \ > + (rgno) = 0; \ > + for_each_rtgroup_from((mp), (rgno), (rtg)) Yup, that's all common with xfs_perag iteration, too. Can you put together a patchset to unify these, please? 
> +static inline bool > +xfs_verify_rgbno( > + struct xfs_rtgroup *rtg, > + xfs_rgblock_t rgbno) Ok, what's the difference between an xfs_rgblock_t and a "rtx"? OH.... The penny just dropped - it's another "single letter difference that's really, really hard to spot" problem. You've defined "xfs_r*g*block_t" as the analogue of an a*g*bno, but we have xfs_r*t*block_t for the global 64bit block number instead of a xfs_fsbno_t. We just had a bug caused by exactly this sort of confusion with a patch that mixed xfs_[f]inobt changes together and one of the conversions was incorrect. Nobody spotted the single incorrect letter in the bigger patch, and I can see -exactly- the same sort of confusion happening with rtblock vs rgblock causing implicit 32/64 bit integer promotion bugs... > +{ > + struct xfs_mount *mp = rtg->rtg_mount; > + > + if (rgbno >= rtg->rtg_extents * mp->m_sb.sb_rextsize) > + return false; Why isn't the max valid "rgbno" stored in the rtgroup instead of having to multiply the extent count by extent size every time we have to verify a rgbno? (i.e. same as pag->block_count). We know from the agbno verification this will be a -very- hot path, and so precalculating all the constants and storing them in the rtg should be done right from the start here. > + if (xfs_has_rtsb(mp) && rtg->rtg_rgno == 0 && > + rgbno < mp->m_sb.sb_rextsize) > + return false; Same here - this value is stored in pag->min_block... > + return true; > +} And then, if we put the max_bno and min_bno in the generic "xfs_group" structure, we suddenly have a generic "group bno" verification mechanism that is independent of whether the group is a perag or an rtgroup: static inline bool xfs_verify_gbno( struct xfs_group *g, xfs_gblock_t gbno) { struct xfs_mount *mp = g->g_mount; if (gbno >= g->block_count) return false; if (gbno < g->min_block) return false; return true; } And the rest of these functions fall out the same way.... 
> +static inline xfs_rtblock_t > +xfs_rgno_start_rtb( > + struct xfs_mount *mp, > + xfs_rgnumber_t rgno) > +{ > + if (mp->m_rgblklog >= 0) > + return ((xfs_rtblock_t)rgno << mp->m_rgblklog); > + return ((xfs_rtblock_t)rgno * mp->m_rgblocks); > +} Where does mp->m_rgblklog come from? That wasn't added to the on-disk superblock structure and it is always initialised to zero in this patch. When will m_rgblklog be zero and when will it be non-zero? If it's only going to be zero for existing non-rtg realtime systems, then this code makes little sense (again, relying on multiplication by zero to get the right result). If it's not always used for rtg enabled filesystems, then the reason for that has not been explained and I can't work out why this would ever need to be done. > +static inline xfs_rtblock_t > +xfs_rgbno_to_rtb( > + struct xfs_mount *mp, > + xfs_rgnumber_t rgno, > + xfs_rgblock_t rgbno) > +{ > + return xfs_rgno_start_rtb(mp, rgno) + rgbno; > +} > + > +static inline xfs_rgnumber_t > +xfs_rtb_to_rgno( > + struct xfs_mount *mp, > + xfs_rtblock_t rtbno) > +{ > + if (!xfs_has_rtgroups(mp)) > + return 0; > + > + if (mp->m_rgblklog >= 0) > + return rtbno >> mp->m_rgblklog; > + > + return div_u64(rtbno, mp->m_rgblocks); > +} Ah, now I'm really confused, because m_rgblklog is completely bypassed for legacy rt filesystems. And I just realised, this "if (mp->m_rgblklog >= 0)" implies that m_rgblklog can have negative values and there's no comments anywhere about why that can happen and what would trigger it. We validate sb_agblklog during the superblock verifier, and so once the filesystem is mounted we never, ever need to check whether sb_agblklog is in range. Why is the rtblklog being handled so differently here? 
> + > +static inline uint64_t > +__xfs_rtb_to_rgbno( > + struct xfs_mount *mp, > + xfs_rtblock_t rtbno) > +{ > + uint32_t rem; > + > + if (!xfs_has_rtgroups(mp)) > + return rtbno; > + > + if (mp->m_rgblklog >= 0) > + return rtbno & mp->m_rgblkmask; > + > + div_u64_rem(rtbno, mp->m_rgblocks, &rem); > + return rem; > +} Why is this function returning a uint64_t - a xfs_rgblock_t is only a 32 bit type... > + > +static inline xfs_rgblock_t > +xfs_rtb_to_rgbno( > + struct xfs_mount *mp, > + xfs_rtblock_t rtbno) > +{ > + return __xfs_rtb_to_rgbno(mp, rtbno); > +} > + > +static inline xfs_daddr_t > +xfs_rtb_to_daddr( > + struct xfs_mount *mp, > + xfs_rtblock_t rtbno) > +{ > + return rtbno << mp->m_blkbb_log; > +} > + > +static inline xfs_rtblock_t > +xfs_daddr_to_rtb( > + struct xfs_mount *mp, > + xfs_daddr_t daddr) > +{ > + return daddr >> mp->m_blkbb_log; > +} Ah. This code doesn't sparsify the xfs_rtblock_t address space for rtgroups. xfs_rtblock_t is still direct physical encoding of the location on disk. I really think that needs to be changed to match how xfs_fsbno_t is a sparse encoding before these changes get merged. It shouldn't affect any of the other code in the patch set - the existing rt code has a rtgno of 0, so it will always be a direct physical encoding even when using a sparse xfs_rtblock_t address space. All that moving to a sparse encoding means is that the addresses stored in the BMBT are logical addresses rather than physical addresses. It should not affect any of the other code, just what ends up stored on disk for global 64-bit rt extent addresses... In doing this, I think we can greatly simplify all this group management stuff as most of the verification, type conversion and iteration infrastructure can then be shared between the existing perag and the new rtg infrastructure.... 
> diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h > index a8cd44d03ef64..1ce4b9eb16f47 100644 > --- a/fs/xfs/libxfs/xfs_types.h > +++ b/fs/xfs/libxfs/xfs_types.h > @@ -9,10 +9,12 @@ > typedef uint32_t prid_t; /* project ID */ > > typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ > +typedef uint32_t xfs_rgblock_t; /* blockno in realtime group */ Is that right? The rtg length is 2^32 * rtextsize, and rtextsize can be 2^30 bytes: #define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) Hence for a 4kB fsbno filesystem, the actual maximum size of an rtg in filesystem blocks far exceeds what we can address with a 32 bit variable. If xfs_rgblock_t is actually indexing multi-fsbno rtextents, then it is an extent number index, not a "block" index. An extent number index won't overflow 32 bits (because the rtg has a max of 2^32 - 1 rtextents). IOWs, shouldn't this be named something like: typedef uint32_t xfs_rgext_t; /* extent number in realtime group */ > typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ > typedef uint32_t xfs_extlen_t; /* extent length in blocks */ > typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ > typedef uint32_t xfs_agnumber_t; /* allocation group number */ > +typedef uint32_t xfs_rgnumber_t; /* realtime group number */ > typedef uint64_t xfs_extnum_t; /* # of extents in a file */ > typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ > typedef int64_t xfs_fsize_t; /* bytes in a file */ > @@ -53,7 +55,9 @@ typedef void * xfs_failaddr_t; > #define NULLFILEOFF ((xfs_fileoff_t)-1) > > #define NULLAGBLOCK ((xfs_agblock_t)-1) > +#define NULLRGBLOCK ((xfs_rgblock_t)-1) > #define NULLAGNUMBER ((xfs_agnumber_t)-1) > +#define NULLRGNUMBER ((xfs_rgnumber_t)-1) What's the maximum valid rtg number? We're not ever going to be supporting 2^32 - 2 rtgs, so what is a realistic maximum we can cap this at and validate it at? 
> #define NULLCOMMITLSN ((xfs_lsn_t)-1) > > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index 4423dd344239b..c627cde3bb1e0 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -28,6 +28,7 @@ > #include "xfs_ag.h" > #include "xfs_quota.h" > #include "xfs_reflink.h" > +#include "xfs_rtgroup.h" > > #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) > > @@ -3346,6 +3347,7 @@ xlog_do_recover( > struct xfs_mount *mp = log->l_mp; > struct xfs_buf *bp = mp->m_sb_bp; > struct xfs_sb *sbp = &mp->m_sb; > + xfs_rgnumber_t old_rgcount = sbp->sb_rgcount; > int error; > > trace_xfs_log_recover(log, head_blk, tail_blk); > @@ -3399,6 +3401,24 @@ xlog_do_recover( > xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); > return error; > } > + > + if (sbp->sb_rgcount < old_rgcount) { > + xfs_warn(mp, "rgcount shrink not supported"); > + return -EINVAL; > + } > + if (sbp->sb_rgcount > old_rgcount) { > + xfs_rgnumber_t rgno; > + > + for (rgno = old_rgcount; rgno < sbp->sb_rgcount; rgno++) { > + error = xfs_rtgroup_alloc(mp, rgno); > + if (error) { > + xfs_warn(mp, > + "Failed post-recovery rtgroup init: %d", > + error); > + return error; > + } > + } > + } Please factor this out into a separate function with all the other rtgroup init/teardown code. That means we don't have to care about how rtgrowfs functions in recovery code, similar to the xfs_initialize_perag() already in this function for handling recovery of data device growing... 
> mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); > > /* Normal transactions can now occur */ > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c > index b0ea88acdb618..e1e849101cdd4 100644 > --- a/fs/xfs/xfs_mount.c > +++ b/fs/xfs/xfs_mount.c > @@ -36,6 +36,7 @@ > #include "xfs_ag.h" > #include "xfs_rtbitmap.h" > #include "xfs_metafile.h" > +#include "xfs_rtgroup.h" > #include "scrub/stats.h" > > static DEFINE_MUTEX(xfs_uuid_table_mutex); > @@ -664,6 +665,7 @@ xfs_mountfs( > struct xfs_ino_geometry *igeo = M_IGEO(mp); > uint quotamount = 0; > uint quotaflags = 0; > + xfs_rgnumber_t rgno; > int error = 0; > > xfs_sb_mount_common(mp, sbp); > @@ -830,10 +832,18 @@ xfs_mountfs( > goto out_free_dir; > } > > + for (rgno = 0; rgno < mp->m_sb.sb_rgcount; rgno++) { > + error = xfs_rtgroup_alloc(mp, rgno); > + if (error) { > + xfs_warn(mp, "Failed rtgroup init: %d", error); > + goto out_free_rtgroup; > + } > + } Same - factor this to a xfs_rtgroup_init() function located with the rest of the rtgroup infrastructure... > + > if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) { > xfs_warn(mp, "no log defined"); > error = -EFSCORRUPTED; > - goto out_free_perag; > + goto out_free_rtgroup; > } > > error = xfs_inodegc_register_shrinker(mp); > @@ -1068,7 +1078,8 @@ xfs_mountfs( > if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) > xfs_buftarg_drain(mp->m_logdev_targp); > xfs_buftarg_drain(mp->m_ddev_targp); > - out_free_perag: > + out_free_rtgroup: > + xfs_free_rtgroups(mp, rgno); > xfs_free_perag(mp); > out_free_dir: > xfs_da_unmount(mp); > @@ -1152,6 +1163,7 @@ xfs_unmountfs( > xfs_errortag_clearall(mp); > #endif > shrinker_free(mp->m_inodegc_shrinker); > + xfs_free_rtgroups(mp, mp->m_sb.sb_rgcount); ... like you've already done for the cleanup side ;) .... 
> @@ -1166,6 +1169,9 @@ xfs_rtmount_inodes( > if (error) > goto out_rele_summary; > > + for_each_rtgroup(mp, rgno, rtg) > + rtg->rtg_extents = xfs_rtgroup_extents(mp, rtg->rtg_rgno); > + This also needs to be done after recovery has initialised new rtgs as a result of replaying a sb growfs modification, right? Which leads to the next question: if there are thousands of rtgs, this requires walking every rtg at mount time, right? We know that walking thousands of static structures at mount time is a scalability issue, so can we please avoid this if at all possible? i.e. do demand loading of per-rtg metadata when it is first required (like we do with agf/agi information) rather than doing it all at mount time... -Dave. -- Dave Chinner david@xxxxxxxxxxxxx