Re: [PATCH 5/9] xfs_db: add inobtcnt upgrade path

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Wed, 28 Oct 2020 17:03:32 -0700

On Wed, Oct 28, 2020 at 01:29:25PM -0400, Brian Foster wrote:
> On Mon, Oct 26, 2020 at 04:33:38PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > 
> > Enable users to upgrade their filesystems to support inode btree block
> > counters.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > ---
> >  db/sb.c              |   76 +++++++++++++++++++++++++++++++++++++++++++++++++-
> >  db/xfs_admin.sh      |    4 ++-
> >  man/man8/xfs_admin.8 |   16 +++++++++++
> >  3 files changed, 94 insertions(+), 2 deletions(-)
> > 
> > 
> > diff --git a/db/sb.c b/db/sb.c
> > index e3b1fe0b2e6e..b1033e5ef7f0 100644
> > --- a/db/sb.c
> > +++ b/db/sb.c
> > @@ -620,6 +620,44 @@ do_version(xfs_agnumber_t agno, uint16_t version, uint32_t features)
> >  	return 1;
> >  }
> >  
> > +/* Add new V5 features to the filesystem. */
> > +static bool
> > +add_v5_features(
> > +	struct xfs_mount	*mp,
> > +	uint32_t		compat,
> > +	uint32_t		ro_compat,
> > +	uint32_t		incompat,
> > +	uint32_t		log_incompat)
> > +{
> > +	struct xfs_sb		tsb;
> > +	xfs_agnumber_t		agno;
> > +
> > +	dbprintf(_("Upgrading V5 filesystem\n"));
> > +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> > +		if (!get_sb(agno, &tsb))
> > +			break;
> > +
> > +		tsb.sb_features_compat |= compat;
> > +		tsb.sb_features_ro_compat |= ro_compat;
> > +		tsb.sb_features_incompat |= incompat;
> > +		tsb.sb_features_log_incompat |= log_incompat;
> > +		libxfs_sb_to_disk(iocur_top->data, &tsb);
> > +		write_cur();
> > +	}
> > +
> > +	if (agno != mp->m_sb.sb_agcount) {
> > +		dbprintf(
> > +_("Failed to upgrade V5 filesystem AG %d\n"), agno);
> > +		return false;
> 
> Do we need to undo changes if this somehow occurs?

Not sure.  The superblocks are inconsistent now, but I guess we should
try to put them back the way they were.

> > +	}
> > +
> > +	mp->m_sb.sb_features_compat |= compat;
> > +	mp->m_sb.sb_features_ro_compat |= ro_compat;
> > +	mp->m_sb.sb_features_incompat |= incompat;
> > +	mp->m_sb.sb_features_log_incompat |= log_incompat;
> > +	return true;
> > +}
> > +
> >  static char *
> >  version_string(
> >  	xfs_sb_t	*sbp)
> > @@ -705,6 +743,10 @@ version_f(
> 
> The comment above version_f() needs an update if we start to support v5
> features.

Ok.

> >  {
> >  	uint16_t	version = 0;
> >  	uint32_t	features = 0;
> > +	uint32_t	upgrade_compat = 0;
> > +	uint32_t	upgrade_ro_compat = 0;
> > +	uint32_t	upgrade_incompat = 0;
> > +	uint32_t	upgrade_log_incompat = 0;
> >  	xfs_agnumber_t	ag;
> >  
> >  	if (argc == 2) {	/* WRITE VERSION */
> > @@ -716,7 +758,28 @@ version_f(
> >  		}
> >  
> >  		/* Logic here derived from the IRIX xfs_chver(1M) script. */
> > -		if (!strcasecmp(argv[1], "extflg")) {
> > +		if (!strcasecmp(argv[1], "inobtcount")) {
> > +			if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) {
> > +				dbprintf(
> > +		_("inode btree counter feature is already enabled\n"));
> > +				exitcode = 1;
> > +				return 1;
> > +			}
> > +			if (!xfs_sb_version_hasfinobt(&mp->m_sb)) {
> > +				dbprintf(
> > +		_("inode btree counter feature cannot be enabled on filesystems lacking free inode btrees\n"));
> > +				exitcode = 1;
> > +				return 1;
> > +			}
> > +			if (!xfs_sb_version_hascrc(&mp->m_sb)) {
> > +				dbprintf(
> > +		_("inode btree counter feature cannot be enabled on pre-V5 filesystems\n"));
> > +				exitcode = 1;
> > +				return 1;
> > +			}
> > +
> > +			upgrade_ro_compat |= XFS_SB_FEAT_RO_COMPAT_INOBTCNT;
> > +		} else if (!strcasecmp(argv[1], "extflg")) {
> >  			switch (XFS_SB_VERSION_NUM(&mp->m_sb)) {
> >  			case XFS_SB_VERSION_1:
> >  				version = 0x0004 | XFS_SB_VERSION_EXTFLGBIT;
> > @@ -807,6 +870,17 @@ version_f(
> >  			mp->m_sb.sb_versionnum = version;
> >  			mp->m_sb.sb_features2 = features;
> >  		}
> > +
> > +		if (upgrade_compat || upgrade_ro_compat || upgrade_incompat ||
> > +		    upgrade_log_incompat) {
> > +			if (!add_v5_features(mp, upgrade_compat,
> > +					upgrade_ro_compat,
> > +					upgrade_incompat,
> > +					upgrade_log_incompat)) {
> > +				exitcode = 1;
> > +				return 1;
> > +			}
> > +		}
> 
> What's the purpose of the unused upgrade variables?

I'm laying the groundwork for adding more features later, such as
bigtime and atomic log swapping.

> Also, it looks like we just update the feature bits here. What about the
> counters? Is the user expected to run xfs_repair?

Yes.  Come to think of it, I could set sb_inprogress and force the user
to run repair.

> 
> >  	}
> >  
> >  	if (argc == 3) {	/* VERSIONNUM + FEATURES2 */
> > diff --git a/db/xfs_admin.sh b/db/xfs_admin.sh
> > index bd325da2f776..0f0c8d18d6cb 100755
> > --- a/db/xfs_admin.sh
> > +++ b/db/xfs_admin.sh
> > @@ -9,7 +9,7 @@ DB_OPTS=""
> >  REPAIR_OPTS=""
> >  USAGE="Usage: xfs_admin [-efjlpuV] [-c 0|1] [-L label] [-U uuid] device [logdev]"
> >  
> > -while getopts "efjlpuc:L:U:V" c
> > +while getopts "efjlpuc:L:O:U:V" c
> >  do
> >  	case $c in
> >  	c)	REPAIR_OPTS=$REPAIR_OPTS" -c lazycount="$OPTARG;;
> > @@ -19,6 +19,8 @@ do
> >  	l)	DB_OPTS=$DB_OPTS" -r -c label";;
> >  	L)	DB_OPTS=$DB_OPTS" -c 'label "$OPTARG"'";;
> >  	p)	DB_OPTS=$DB_OPTS" -c 'version projid32bit'";;
> > +	O)	DB_OPTS=$DB_OPTS" -c 'version "$OPTARG"'";
> > +		REPAIR_OPTS="$REPAIR_OPTS ";;
> 
> Ah, I see.. xfs_admin runs repair if options are specified, hence this
> little whitespace hack. It might be worth a comment here so somebody
> doesn't fly by and "clean" that up. ;)

Ok.

> BTW, does this also address the error scenario I asked about above...?

Yes, but I think we should revert the primary super if the upgrade
fails.

> >  	u)	DB_OPTS=$DB_OPTS" -r -c uuid";;
> >  	U)	DB_OPTS=$DB_OPTS" -c 'uuid "$OPTARG"'";;
> >  	V)	xfs_db -p xfs_admin -V
> > diff --git a/man/man8/xfs_admin.8 b/man/man8/xfs_admin.8
> > index 8afc873fb50a..65ca6afc1e12 100644
> > --- a/man/man8/xfs_admin.8
> > +++ b/man/man8/xfs_admin.8
> > @@ -6,6 +6,8 @@ xfs_admin \- change parameters of an XFS filesystem
> >  [
> >  .B \-eflpu
> >  ] [
> > +.BR \-O " feature"
> > +] [
> >  .BR "\-c 0" | 1
> >  ] [
> >  .B \-L
> > @@ -103,6 +105,20 @@ The filesystem label can be cleared using the special "\c
> >  " value for
> >  .IR label .
> >  .TP
> > +.BI \-O " feature"
> > +Add a new feature to the filesystem.
> > +Only one feature can be specified at a time.
> > +Features are as follows:
> > +.RS 0.7i
> > +.TP
> > +.B inobtcount
> > +Upgrade the filesystem to support the inode btree counters feature.
> > +This reduces mount time by caching the size of the inode btrees in the
> > +allocation group metadata.
> > +Once enabled, the filesystem will not be writable by older kernels.
> > +The filesystem cannot be downgraded after this feature is enabled.
> 
> Any reason for not allowing the downgrade path? It seems like we're
> mostly there implementation wise and that might facilitate enabling the
> feature by default.

Downgrading will not be easy for bigtime, since we'd have to scan the
whole fs to see if there are any timestamps past 2038.  For other
features it might not be such a big deal, but in general I don't want to
increase our testing burden even further.

I'll ask the ext4 folks if they know of people downgrading filesystems
with tune2fs, but AFAIK it's not generally done.

--D

> Brian
> 
> > +.RE
> > +.TP
> >  .BI \-U " uuid"
> >  Set the UUID of the filesystem to
> >  .IR uuid .
> > 
>