On Fri, Jan 21, 2022 at 10:48:53AM +0530, Chandan Babu R wrote: > This commit introduces new fields in the on-disk inode format to support > 64-bit data fork extent counters and 32-bit attribute fork extent > counters. The new fields will be used only when an inode has > XFS_DIFLAG2_NREXT64 flag set. Otherwise we continue to use the regular 32-bit > data fork extent counters and 16-bit attribute fork extent counters. > > Signed-off-by: Chandan Babu R <chandan.babu@xxxxxxxxxx> > Suggested-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_format.h | 22 +++++++-- > fs/xfs/libxfs/xfs_inode_buf.c | 49 ++++++++++++++++++-- > fs/xfs/libxfs/xfs_inode_fork.h | 6 +++ > fs/xfs/libxfs/xfs_log_format.h | 22 +++++++-- > fs/xfs/xfs_inode_item.c | 23 ++++++++-- > fs/xfs/xfs_inode_item_recover.c | 79 ++++++++++++++++++++++++++++----- > 6 files changed, 174 insertions(+), 27 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h > index d3dfd45c39e0..df1d6ec39c45 100644 > --- a/fs/xfs/libxfs/xfs_format.h > +++ b/fs/xfs/libxfs/xfs_format.h > @@ -792,16 +792,30 @@ struct xfs_dinode { > __be32 di_nlink; /* number of links to file */ > __be16 di_projid_lo; /* lower part of owner's project id */ > __be16 di_projid_hi; /* higher part owner's project id */ > - __u8 di_pad[6]; /* unused, zeroed space */ > - __be16 di_flushiter; /* incremented on flush */ > + union { > + __be64 di_big_nextents;/* NREXT64 data extents */ > + __u8 di_v3_pad[8]; /* !NREXT64 V3 inode zeroed space */ > + struct { > + __u8 di_v2_pad[6]; /* V2 inode zeroed space */ > + __be16 di_flushiter; /* V2 inode incremented on flush */ > + }; > + }; I think it might be time to reflow part of the comments for these fields away from inline... union { /* Number of data fork extents if NREXT64 is set */ __be64 di_big_nextents; /* Padding for V3 inodes without NREXT64 set. */ __be64 di_v3_pad; /* Padding and inode flush counter for V2 inodes. */ struct { __u8 di_v2_pad[6]; __be16 di_flushiter; }; }; > xfs_timestamp_t di_atime; /* time last accessed */ > xfs_timestamp_t di_mtime; /* time last modified */ > xfs_timestamp_t di_ctime; /* time created/inode modified */ > __be64 di_size; /* number of bytes in file */ > __be64 di_nblocks; /* # of direct & btree blocks used */ > __be32 di_extsize; /* basic/minimum extent size for file */ > - __be32 di_nextents; /* number of extents in data fork */ > - __be16 di_anextents; /* number of extents in attribute fork*/ > + union { > + struct { > + __be32 di_big_anextents; /* NREXT64 attr extents */ > + __be16 di_nrext64_pad; /* NREXT64 unused, zero */ > + } __packed; > + struct { > + __be32 di_nextents; /* !NREXT64 data extents */ > + __be16 di_anextents; /* !NREXT64 attr extents */ > + } __packed; > + }; union { /* * For V2 inodes and V3 inodes without NREXT64 set, this * is the number of data and attr fork extents. */ struct { __be32 di_nextents; __be16 di_anextents; } __packed; /* Number of attr fork extents if NREXT64 is set. */ struct { __be32 di_big_anextents; __be16 di_nrext64_pad; } __packed; } __packed; > __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ > __s8 di_aformat; /* format of attr fork's data */ > __be32 di_dmevmask; /* DMIG event mask */ > diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c > index 34f360a38603..2200526bcee0 100644 > --- a/fs/xfs/libxfs/xfs_inode_buf.c > +++ b/fs/xfs/libxfs/xfs_inode_buf.c > @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( > return ts; > } > > +static inline void > +xfs_inode_to_disk_iext_counters( > + struct xfs_inode *ip, > + struct xfs_dinode *to) > +{ > + if (xfs_inode_has_nrext64(ip)) { > + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); > + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); > + /* > + * We might be upgrading the inode to use larger extent counters > + * than was previously used. Hence zero the unused field. > + */ > + to->di_nrext64_pad = cpu_to_be16(0); > + } else { > + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); > + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); > + } > +} > + > void > xfs_inode_to_disk( > struct xfs_inode *ip, > @@ -296,7 +315,6 @@ xfs_inode_to_disk( > to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); > to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); > > - memset(to->di_pad, 0, sizeof(to->di_pad)); > to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); > to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); > to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); > @@ -307,8 +325,6 @@ xfs_inode_to_disk( > to->di_size = cpu_to_be64(ip->i_disk_size); > to->di_nblocks = cpu_to_be64(ip->i_nblocks); > to->di_extsize = cpu_to_be32(ip->i_extsize); > - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); > - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); > to->di_forkoff = ip->i_forkoff; > to->di_aformat = xfs_ifork_format(ip->i_afp); > to->di_flags = cpu_to_be16(ip->i_diflags); > @@ -323,11 +339,14 @@ xfs_inode_to_disk( > to->di_lsn = cpu_to_be64(lsn); > memset(to->di_pad2, 0, sizeof(to->di_pad2)); > uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); > - to->di_flushiter = 0; > + memset(to->di_v3_pad, 0, sizeof(to->di_v3_pad)); > } else { > to->di_version = 2; > to->di_flushiter = cpu_to_be16(ip->i_flushiter); > + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); > } > + > + xfs_inode_to_disk_iext_counters(ip, to); > } > > static xfs_failaddr_t > @@ -397,6 +416,24 @@ xfs_dinode_verify_forkoff( > return NULL; > } > > +static xfs_failaddr_t > +xfs_dinode_verify_nextents( > + struct xfs_mount *mp, > + struct xfs_dinode *dip) > +{ > + if (xfs_dinode_has_nrext64(dip)) { > + if (!xfs_has_nrext64(mp)) > + return __this_address; > + if (dip->di_nrext64_pad != 0) > + return __this_address; Don't we need to check that: if (xfs_dfork_data_extents(dip) > XFS_MAX_EXTCNT_DATA_FORK) return __this_address; if (xfs_dfork_attr_extents(dip) > XFS_MAX_EXTCNT_ATTR_FORK) return __this_address; here? OH, the actual checking of the extent count fields is in xfs_dinode_verify_fork, isn't it? I think that means this function exists to check the consistency of the nrext64 inode flag vs. the superblock nrext64 flag and the padding fields, right? In which case... perhaps this should be xfs_dinode_verify_nrext64() ? > + } else { > + if (dip->di_version == 3 && dip->di_big_nextents != 0) > + return __this_address; We're using tagged unions in xfs_dinode, then "di_big_nextents" is meaningless on an inode that doesn't have NREXT64 set. IOWs, } else if (dip->di_version >= 3) { if (dip->di_v3_pad != 0) return __this_address; } (Note that I changed the type of di_v3_pad above.) > + } > + > + return NULL; > +} > + > xfs_failaddr_t > xfs_dinode_verify( > struct xfs_mount *mp, > @@ -440,6 +477,10 @@ xfs_dinode_verify( > if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) > return __this_address; > > + fa = xfs_dinode_verify_nextents(mp, dip); > + if (fa) > + return fa; > + > nextents = xfs_dfork_data_extents(dip); > nextents += xfs_dfork_attr_extents(dip); > nblocks = be64_to_cpu(dip->di_nblocks); > diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h > index e56803436c61..8e6221e32660 100644 > --- a/fs/xfs/libxfs/xfs_inode_fork.h > +++ b/fs/xfs/libxfs/xfs_inode_fork.h > @@ -156,6 +156,9 @@ static inline xfs_extnum_t > xfs_dfork_data_extents( > struct xfs_dinode *dip) > { > + if (xfs_dinode_has_nrext64(dip)) > + return be64_to_cpu(dip->di_big_nextents); > + > return be32_to_cpu(dip->di_nextents); > } > > @@ -163,6 +166,9 @@ static inline xfs_extnum_t > xfs_dfork_attr_extents( > struct xfs_dinode *dip) > { > + if (xfs_dinode_has_nrext64(dip)) > + return be32_to_cpu(dip->di_big_anextents); > + > return be16_to_cpu(dip->di_anextents); > } > > diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h > index fd66e70248f7..7f4ebf112a3c 100644 > --- a/fs/xfs/libxfs/xfs_log_format.h > +++ b/fs/xfs/libxfs/xfs_log_format.h > @@ -388,16 +388,30 @@ struct xfs_log_dinode { > uint32_t di_nlink; /* number of links to file */ > uint16_t di_projid_lo; /* lower part of owner's project id */ > uint16_t di_projid_hi; /* higher part of owner's project id */ > - uint8_t di_pad[6]; /* unused, zeroed space */ > - uint16_t di_flushiter; /* incremented on flush */ > + union { > + uint64_t di_big_nextents;/* NREXT64 data extents */ > + uint8_t di_v3_pad[8]; /* !NREXT64 V3 inode zeroed space */ > + struct { > + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ > + uint16_t di_flushiter; /* V2 inode incremented on flush */ > + }; > + }; > xfs_log_timestamp_t di_atime; /* time last accessed */ > xfs_log_timestamp_t di_mtime; /* time last modified */ > xfs_log_timestamp_t di_ctime; /* time created/inode modified */ > xfs_fsize_t di_size; /* number of bytes in file */ > xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ > xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ > - uint32_t di_nextents; /* number of extents in data fork */ > - uint16_t di_anextents; /* number of extents in attribute fork*/ > + union { > + struct { > + uint32_t di_big_anextents; /* NREXT64 attr extents */ > + uint16_t di_nrext64_pad; /* NREXT64 unused, zero */ > + } __packed; > + struct { > + uint32_t di_nextents; /* !NREXT64 data extents */ > + uint16_t di_anextents; /* !NREXT64 attr extents */ > + } __packed; > + }; I think you could apply the same transformations as I did to xfs_dinode above. --D > uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ > int8_t di_aformat; /* format of attr fork's data */ > uint32_t di_dmevmask; /* DMIG event mask */ > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c > index 90d8e591baf8..8304ce062e43 100644 > --- a/fs/xfs/xfs_inode_item.c > +++ b/fs/xfs/xfs_inode_item.c > @@ -358,6 +358,21 @@ xfs_copy_dm_fields_to_log_dinode( > } > } > > +static inline void > +xfs_inode_to_log_dinode_iext_counters( > + struct xfs_inode *ip, > + struct xfs_log_dinode *to) > +{ > + if (xfs_inode_has_nrext64(ip)) { > + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); > + to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); > + to->di_nrext64_pad = 0; > + } else { > + to->di_nextents = xfs_ifork_nextents(&ip->i_df); > + to->di_anextents = xfs_ifork_nextents(ip->i_afp); > + } > +} > + > static void > xfs_inode_to_log_dinode( > struct xfs_inode *ip, > @@ -373,7 +388,6 @@ xfs_inode_to_log_dinode( > to->di_projid_lo = ip->i_projid & 0xffff; > to->di_projid_hi = ip->i_projid >> 16; > > - memset(to->di_pad, 0, sizeof(to->di_pad)); > memset(to->di_pad3, 0, sizeof(to->di_pad3)); > to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); > to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); > @@ -385,8 +399,6 @@ xfs_inode_to_log_dinode( > to->di_size = ip->i_disk_size; > to->di_nblocks = ip->i_nblocks; > to->di_extsize = ip->i_extsize; > - to->di_nextents = xfs_ifork_nextents(&ip->i_df); > - to->di_anextents = xfs_ifork_nextents(ip->i_afp); > to->di_forkoff = ip->i_forkoff; > to->di_aformat = xfs_ifork_format(ip->i_afp); > to->di_flags = ip->i_diflags; > @@ -406,11 +418,14 @@ xfs_inode_to_log_dinode( > to->di_lsn = lsn; > memset(to->di_pad2, 0, sizeof(to->di_pad2)); > uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); > - to->di_flushiter = 0; > + memset(to->di_v3_pad, 0, sizeof(to->di_v3_pad)); > } else { > to->di_version = 2; > to->di_flushiter = ip->i_flushiter; > + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); > } > + > + xfs_inode_to_log_dinode_iext_counters(ip, to); > } > > /* > diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c > index 767a551816a0..fa3556633ca9 100644 > --- a/fs/xfs/xfs_inode_item_recover.c > +++ b/fs/xfs/xfs_inode_item_recover.c > @@ -148,6 +148,22 @@ static inline bool xfs_log_dinode_has_nrext64(const struct xfs_log_dinode *ld) > (ld->di_flags2 & XFS_DIFLAG2_NREXT64); > } > > +static inline void > +xfs_log_dinode_to_disk_iext_counters( > + struct xfs_log_dinode *from, > + struct xfs_dinode *to) > +{ > + if (xfs_log_dinode_has_nrext64(from)) { > + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); > + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); > + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); > + } else { > + to->di_nextents = cpu_to_be32(from->di_nextents); > + to->di_anextents = cpu_to_be16(from->di_anextents); > + } > + > +} > + > STATIC void > xfs_log_dinode_to_disk( > struct xfs_log_dinode *from, > @@ -164,7 +180,6 @@ xfs_log_dinode_to_disk( > to->di_nlink = cpu_to_be32(from->di_nlink); > to->di_projid_lo = cpu_to_be16(from->di_projid_lo); > to->di_projid_hi = cpu_to_be16(from->di_projid_hi); > - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); > > to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); > to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); > @@ -173,8 +188,6 @@ xfs_log_dinode_to_disk( > to->di_size = cpu_to_be64(from->di_size); > to->di_nblocks = cpu_to_be64(from->di_nblocks); > to->di_extsize = cpu_to_be32(from->di_extsize); > - to->di_nextents = cpu_to_be32(from->di_nextents); > - to->di_anextents = cpu_to_be16(from->di_anextents); > to->di_forkoff = from->di_forkoff; > to->di_aformat = from->di_aformat; > to->di_dmevmask = cpu_to_be32(from->di_dmevmask); > @@ -192,10 +205,13 @@ xfs_log_dinode_to_disk( > to->di_lsn = cpu_to_be64(lsn); > memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); > uuid_copy(&to->di_uuid, &from->di_uuid); > - to->di_flushiter = 0; > + memcpy(to->di_v3_pad, from->di_v3_pad, sizeof(to->di_v3_pad)); > } else { > to->di_flushiter = cpu_to_be16(from->di_flushiter); > + memcpy(to->di_v2_pad, from->di_v2_pad, sizeof(to->di_v2_pad)); > } > + > + xfs_log_dinode_to_disk_iext_counters(from, to); > } > > STATIC int > @@ -209,6 +225,8 @@ xlog_recover_inode_commit_pass2( > struct xfs_mount *mp = log->l_mp; > struct xfs_buf *bp; > struct xfs_dinode *dip; > + xfs_extnum_t nextents; > + xfs_aextnum_t anextents; > int len; > char *src; > char *dest; > @@ -348,21 +366,60 @@ xlog_recover_inode_commit_pass2( > goto out_release; > } > } > - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ > - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", > + > + if (xfs_log_dinode_has_nrext64(ldip)) { > + if (!xfs_has_nrext64(mp) || (ldip->di_nrext64_pad != 0)) { > + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", > + XFS_ERRLEVEL_LOW, mp, ldip, > + sizeof(*ldip)); > + xfs_alert(mp, > + "%s: Bad inode log record, rec ptr "PTR_FMT", " > + "dino ptr "PTR_FMT", dino bp "PTR_FMT", " > + "ino %Ld, xfs_has_nrext64(mp) = %d, " > + "ldip->di_nrext64_pad = %u", > + __func__, item, dip, bp, in_f->ilf_ino, > + xfs_has_nrext64(mp), ldip->di_nrext64_pad); > + error = -EFSCORRUPTED; > + goto out_release; > + } > + } else { > + if (ldip->di_version == 3 && ldip->di_big_nextents != 0) { > + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", > + XFS_ERRLEVEL_LOW, mp, ldip, > + sizeof(*ldip)); > + xfs_alert(mp, > + "%s: Bad inode log record, rec ptr "PTR_FMT", " > + "dino ptr "PTR_FMT", dino bp "PTR_FMT", " > + "ino %Ld, ldip->di_big_dextcnt = %llu", > + __func__, item, dip, bp, in_f->ilf_ino, > + ldip->di_big_nextents); > + error = -EFSCORRUPTED; > + goto out_release; > + } > + } > + > + if (xfs_log_dinode_has_nrext64(ldip)) { > + nextents = ldip->di_big_nextents; > + anextents = ldip->di_big_anextents; > + } else { > + nextents = ldip->di_nextents; > + anextents = ldip->di_anextents; > + } > + > + if (unlikely(nextents + anextents > ldip->di_nblocks)) { > + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", > XFS_ERRLEVEL_LOW, mp, ldip, > sizeof(*ldip)); > xfs_alert(mp, > "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " > - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", > + "dino bp "PTR_FMT", ino %Ld, total extents = %llu, nblocks = %Ld", > __func__, item, dip, bp, in_f->ilf_ino, > - ldip->di_nextents + ldip->di_anextents, > - ldip->di_nblocks); > + nextents + anextents, ldip->di_nblocks); > error = -EFSCORRUPTED; > goto out_release; > } > if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { > - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", > + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(8)", > XFS_ERRLEVEL_LOW, mp, ldip, > sizeof(*ldip)); > xfs_alert(mp, > @@ -374,7 +431,7 @@ xlog_recover_inode_commit_pass2( > } > isize = xfs_log_dinode_size(mp); > if (unlikely(item->ri_buf[1].i_len > isize)) { > - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", > + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(9)", > XFS_ERRLEVEL_LOW, mp, ldip, > sizeof(*ldip)); > xfs_alert(mp, > -- > 2.30.2 >