On 02 Feb 2022 at 00:40, Darrick J. Wong wrote: > On Fri, Jan 21, 2022 at 10:48:53AM +0530, Chandan Babu R wrote: >> This commit introduces new fields in the on-disk inode format to support >> 64-bit data fork extent counters and 32-bit attribute fork extent >> counters. The new fields will be used only when an inode has >> XFS_DIFLAG2_NREXT64 flag set. Otherwise we continue to use the regular 32-bit >> data fork extent counters and 16-bit attribute fork extent counters. >> >> Signed-off-by: Chandan Babu R <chandan.babu@xxxxxxxxxx> >> Suggested-by: Dave Chinner <dchinner@xxxxxxxxxx> >> --- >> fs/xfs/libxfs/xfs_format.h | 22 +++++++-- >> fs/xfs/libxfs/xfs_inode_buf.c | 49 ++++++++++++++++++-- >> fs/xfs/libxfs/xfs_inode_fork.h | 6 +++ >> fs/xfs/libxfs/xfs_log_format.h | 22 +++++++-- >> fs/xfs/xfs_inode_item.c | 23 ++++++++-- >> fs/xfs/xfs_inode_item_recover.c | 79 ++++++++++++++++++++++++++++----- >> 6 files changed, 174 insertions(+), 27 deletions(-) >> >> diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h >> index d3dfd45c39e0..df1d6ec39c45 100644 >> --- a/fs/xfs/libxfs/xfs_format.h >> +++ b/fs/xfs/libxfs/xfs_format.h >> @@ -792,16 +792,30 @@ struct xfs_dinode { >> __be32 di_nlink; /* number of links to file */ >> __be16 di_projid_lo; /* lower part of owner's project id */ >> __be16 di_projid_hi; /* higher part owner's project id */ >> - __u8 di_pad[6]; /* unused, zeroed space */ >> - __be16 di_flushiter; /* incremented on flush */ >> + union { >> + __be64 di_big_nextents;/* NREXT64 data extents */ >> + __u8 di_v3_pad[8]; /* !NREXT64 V3 inode zeroed space */ >> + struct { >> + __u8 di_v2_pad[6]; /* V2 inode zeroed space */ >> + __be16 di_flushiter; /* V2 inode incremented on flush */ >> + }; >> + }; > > I think it might be time to reflow part of the comments for these fields > away from inline... > > union { > /* Number of data fork extents if NREXT64 is set */ > __be64 di_big_nextents; > > /* Padding for V3 inodes without NREXT64 set. */ > __be64 di_v3_pad; > > /* Padding and inode flush counter for V2 inodes. */ > struct { > __u8 di_v2_pad[6]; > __be16 di_flushiter; > }; > }; > Ok. I will include these changes in the next version. >> xfs_timestamp_t di_atime; /* time last accessed */ >> xfs_timestamp_t di_mtime; /* time last modified */ >> xfs_timestamp_t di_ctime; /* time created/inode modified */ >> __be64 di_size; /* number of bytes in file */ >> __be64 di_nblocks; /* # of direct & btree blocks used */ >> __be32 di_extsize; /* basic/minimum extent size for file */ >> - __be32 di_nextents; /* number of extents in data fork */ >> - __be16 di_anextents; /* number of extents in attribute fork*/ >> + union { >> + struct { >> + __be32 di_big_anextents; /* NREXT64 attr extents */ >> + __be16 di_nrext64_pad; /* NREXT64 unused, zero */ >> + } __packed; >> + struct { >> + __be32 di_nextents; /* !NREXT64 data extents */ >> + __be16 di_anextents; /* !NREXT64 attr extents */ >> + } __packed; >> + }; > > > union { > /* > * For V2 inodes and V3 inodes without NREXT64 set, this > * is the number of data and attr fork extents. > */ > struct { > __be32 di_nextents; > __be16 di_anextents; > } __packed; > > /* Number of attr fork extents if NREXT64 is set. */ > struct { > __be32 di_big_anextents; > __be16 di_nrext64_pad; > } __packed; > } __packed; > >> __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ >> __s8 di_aformat; /* format of attr fork's data */ >> __be32 di_dmevmask; /* DMIG event mask */ >> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c >> index 34f360a38603..2200526bcee0 100644 >> --- a/fs/xfs/libxfs/xfs_inode_buf.c >> +++ b/fs/xfs/libxfs/xfs_inode_buf.c >> @@ -279,6 +279,25 @@ xfs_inode_to_disk_ts( >> return ts; >> } >> >> +static inline void >> +xfs_inode_to_disk_iext_counters( >> + struct xfs_inode *ip, >> + struct xfs_dinode *to) >> +{ >> + if (xfs_inode_has_nrext64(ip)) { >> + to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); >> + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); >> + /* >> + * We might be upgrading the inode to use larger extent counters >> + * than was previously used. Hence zero the unused field. >> + */ >> + to->di_nrext64_pad = cpu_to_be16(0); >> + } else { >> + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); >> + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); >> + } >> +} >> + >> void >> xfs_inode_to_disk( >> struct xfs_inode *ip, >> @@ -296,7 +315,6 @@ xfs_inode_to_disk( >> to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); >> to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); >> >> - memset(to->di_pad, 0, sizeof(to->di_pad)); >> to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); >> to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); >> to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); >> @@ -307,8 +325,6 @@ xfs_inode_to_disk( >> to->di_size = cpu_to_be64(ip->i_disk_size); >> to->di_nblocks = cpu_to_be64(ip->i_nblocks); >> to->di_extsize = cpu_to_be32(ip->i_extsize); >> - to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); >> - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); >> to->di_forkoff = ip->i_forkoff; >> to->di_aformat = xfs_ifork_format(ip->i_afp); >> to->di_flags = cpu_to_be16(ip->i_diflags); >> @@ -323,11 +339,14 @@ xfs_inode_to_disk( >> to->di_lsn = cpu_to_be64(lsn); >> memset(to->di_pad2, 0, sizeof(to->di_pad2)); >> uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); >> - to->di_flushiter = 0; >> + memset(to->di_v3_pad, 0, sizeof(to->di_v3_pad)); >> } else { >> to->di_version = 2; >> to->di_flushiter = cpu_to_be16(ip->i_flushiter); >> + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); >> } >> + >> + xfs_inode_to_disk_iext_counters(ip, to); >> } >> >> static xfs_failaddr_t >> @@ -397,6 +416,24 @@ xfs_dinode_verify_forkoff( >> return NULL; >> } >> >> +static xfs_failaddr_t >> +xfs_dinode_verify_nextents( >> + struct xfs_mount *mp, >> + struct xfs_dinode *dip) >> +{ >> + if (xfs_dinode_has_nrext64(dip)) { >> + if (!xfs_has_nrext64(mp)) >> + return __this_address; >> + if (dip->di_nrext64_pad != 0) >> + return __this_address; > > Don't we need to check that: > > if (xfs_dfork_data_extents(dip) > XFS_MAX_EXTCNT_DATA_FORK) > return __this_address; > if (xfs_dfork_attr_extents(dip) > XFS_MAX_EXTCNT_ATTR_FORK) > return __this_address; > > here? > > OH, the actual checking of the extent count fields is in > xfs_dinode_verify_fork, isn't it? > > I think that means this function exists to check the consistency of the > nrext64 inode flag vs. the superblock nrext64 flag and the padding > fields, right? > > In which case... perhaps this should be xfs_dinode_verify_nrext64() ? > You are right. The name xfs_dinode_verify_nextents implies that we are verifying the values of extent counters. I will fix this. >> + } else { >> + if (dip->di_version == 3 && dip->di_big_nextents != 0) >> + return __this_address; > > We're using tagged unions in xfs_dinode, then "di_big_nextents" is > meaningless on an inode that doesn't have NREXT64 set. IOWs, > > } else if (dip->di_version >= 3) { > if (dip->di_v3_pad != 0) > return __this_address; > } > > (Note that I changed the type of di_v3_pad above.) > Thanks for the suggestion. I will fix this as well. >> + } >> + >> + return NULL; >> +} >> + >> xfs_failaddr_t >> xfs_dinode_verify( >> struct xfs_mount *mp, >> @@ -440,6 +477,10 @@ xfs_dinode_verify( >> if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) >> return __this_address; >> >> + fa = xfs_dinode_verify_nextents(mp, dip); >> + if (fa) >> + return fa; >> + >> nextents = xfs_dfork_data_extents(dip); >> nextents += xfs_dfork_attr_extents(dip); >> nblocks = be64_to_cpu(dip->di_nblocks); >> diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h >> index e56803436c61..8e6221e32660 100644 >> --- a/fs/xfs/libxfs/xfs_inode_fork.h >> +++ b/fs/xfs/libxfs/xfs_inode_fork.h >> @@ -156,6 +156,9 @@ static inline xfs_extnum_t >> xfs_dfork_data_extents( >> struct xfs_dinode *dip) >> { >> + if (xfs_dinode_has_nrext64(dip)) >> + return be64_to_cpu(dip->di_big_nextents); >> + >> return be32_to_cpu(dip->di_nextents); >> } >> >> @@ -163,6 +166,9 @@ static inline xfs_extnum_t >> xfs_dfork_attr_extents( >> struct xfs_dinode *dip) >> { >> + if (xfs_dinode_has_nrext64(dip)) >> + return be32_to_cpu(dip->di_big_anextents); >> + >> return be16_to_cpu(dip->di_anextents); >> } >> >> diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h >> index fd66e70248f7..7f4ebf112a3c 100644 >> --- a/fs/xfs/libxfs/xfs_log_format.h >> +++ b/fs/xfs/libxfs/xfs_log_format.h >> @@ -388,16 +388,30 @@ struct xfs_log_dinode { >> uint32_t di_nlink; /* number of links to file */ >> uint16_t di_projid_lo; /* lower part of owner's project id */ >> uint16_t di_projid_hi; /* higher part of owner's project id */ >> - uint8_t di_pad[6]; /* unused, zeroed space */ >> - uint16_t di_flushiter; /* incremented on flush */ >> + union { >> + uint64_t di_big_nextents;/* NREXT64 data extents */ >> + uint8_t di_v3_pad[8]; /* !NREXT64 V3 inode zeroed space */ >> + struct { >> + uint8_t di_v2_pad[6]; /* V2 inode zeroed space */ >> + uint16_t di_flushiter; /* V2 inode incremented on flush */ >> + }; >> + }; >> xfs_log_timestamp_t di_atime; /* time last accessed */ >> xfs_log_timestamp_t di_mtime; /* time last modified */ >> xfs_log_timestamp_t di_ctime; /* time created/inode modified */ >> xfs_fsize_t di_size; /* number of bytes in file */ >> xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ >> xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ >> - uint32_t di_nextents; /* number of extents in data fork */ >> - uint16_t di_anextents; /* number of extents in attribute fork*/ >> + union { >> + struct { >> + uint32_t di_big_anextents; /* NREXT64 attr extents */ >> + uint16_t di_nrext64_pad; /* NREXT64 unused, zero */ >> + } __packed; >> + struct { >> + uint32_t di_nextents; /* !NREXT64 data extents */ >> + uint16_t di_anextents; /* !NREXT64 attr extents */ >> + } __packed; >> + }; > > I think you could apply the same transformations as I did to xfs_dinode > above. > Sure. I will make relevant changes to xfs_log_dinode. > --D > >> uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ >> int8_t di_aformat; /* format of attr fork's data */ >> uint32_t di_dmevmask; /* DMIG event mask */ >> diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c >> index 90d8e591baf8..8304ce062e43 100644 >> --- a/fs/xfs/xfs_inode_item.c >> +++ b/fs/xfs/xfs_inode_item.c >> @@ -358,6 +358,21 @@ xfs_copy_dm_fields_to_log_dinode( >> } >> } >> >> +static inline void >> +xfs_inode_to_log_dinode_iext_counters( >> + struct xfs_inode *ip, >> + struct xfs_log_dinode *to) >> +{ >> + if (xfs_inode_has_nrext64(ip)) { >> + to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); >> + to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); >> + to->di_nrext64_pad = 0; >> + } else { >> + to->di_nextents = xfs_ifork_nextents(&ip->i_df); >> + to->di_anextents = xfs_ifork_nextents(ip->i_afp); >> + } >> +} >> + >> static void >> xfs_inode_to_log_dinode( >> struct xfs_inode *ip, >> @@ -373,7 +388,6 @@ xfs_inode_to_log_dinode( >> to->di_projid_lo = ip->i_projid & 0xffff; >> to->di_projid_hi = ip->i_projid >> 16; >> >> - memset(to->di_pad, 0, sizeof(to->di_pad)); >> memset(to->di_pad3, 0, sizeof(to->di_pad3)); >> to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); >> to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); >> @@ -385,8 +399,6 @@ xfs_inode_to_log_dinode( >> to->di_size = ip->i_disk_size; >> to->di_nblocks = ip->i_nblocks; >> to->di_extsize = ip->i_extsize; >> - to->di_nextents = xfs_ifork_nextents(&ip->i_df); >> - to->di_anextents = xfs_ifork_nextents(ip->i_afp); >> to->di_forkoff = ip->i_forkoff; >> to->di_aformat = xfs_ifork_format(ip->i_afp); >> to->di_flags = ip->i_diflags; >> @@ -406,11 +418,14 @@ xfs_inode_to_log_dinode( >> to->di_lsn = lsn; >> memset(to->di_pad2, 0, sizeof(to->di_pad2)); >> uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); >> - to->di_flushiter = 0; >> + memset(to->di_v3_pad, 0, sizeof(to->di_v3_pad)); >> } else { >> to->di_version = 2; >> to->di_flushiter = ip->i_flushiter; >> + memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); >> } >> + >> + xfs_inode_to_log_dinode_iext_counters(ip, to); >> } >> >> /* >> diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c >> index 767a551816a0..fa3556633ca9 100644 >> --- a/fs/xfs/xfs_inode_item_recover.c >> +++ b/fs/xfs/xfs_inode_item_recover.c >> @@ -148,6 +148,22 @@ static inline bool xfs_log_dinode_has_nrext64(const struct xfs_log_dinode *ld) >> (ld->di_flags2 & XFS_DIFLAG2_NREXT64); >> } >> >> +static inline void >> +xfs_log_dinode_to_disk_iext_counters( >> + struct xfs_log_dinode *from, >> + struct xfs_dinode *to) >> +{ >> + if (xfs_log_dinode_has_nrext64(from)) { >> + to->di_big_nextents = cpu_to_be64(from->di_big_nextents); >> + to->di_big_anextents = cpu_to_be32(from->di_big_anextents); >> + to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); >> + } else { >> + to->di_nextents = cpu_to_be32(from->di_nextents); >> + to->di_anextents = cpu_to_be16(from->di_anextents); >> + } >> + >> +} >> + >> STATIC void >> xfs_log_dinode_to_disk( >> struct xfs_log_dinode *from, >> @@ -164,7 +180,6 @@ xfs_log_dinode_to_disk( >> to->di_nlink = cpu_to_be32(from->di_nlink); >> to->di_projid_lo = cpu_to_be16(from->di_projid_lo); >> to->di_projid_hi = cpu_to_be16(from->di_projid_hi); >> - memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); >> >> to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); >> to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); >> @@ -173,8 +188,6 @@ xfs_log_dinode_to_disk( >> to->di_size = cpu_to_be64(from->di_size); >> to->di_nblocks = cpu_to_be64(from->di_nblocks); >> to->di_extsize = cpu_to_be32(from->di_extsize); >> - to->di_nextents = cpu_to_be32(from->di_nextents); >> - to->di_anextents = cpu_to_be16(from->di_anextents); >> to->di_forkoff = from->di_forkoff; >> to->di_aformat = from->di_aformat; >> to->di_dmevmask = cpu_to_be32(from->di_dmevmask); >> @@ -192,10 +205,13 @@ xfs_log_dinode_to_disk( >> to->di_lsn = cpu_to_be64(lsn); >> memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); >> uuid_copy(&to->di_uuid, &from->di_uuid); >> - to->di_flushiter = 0; >> + memcpy(to->di_v3_pad, from->di_v3_pad, sizeof(to->di_v3_pad)); >> } else { >> to->di_flushiter = cpu_to_be16(from->di_flushiter); >> + memcpy(to->di_v2_pad, from->di_v2_pad, sizeof(to->di_v2_pad)); >> } >> + >> + xfs_log_dinode_to_disk_iext_counters(from, to); >> } >> >> STATIC int >> @@ -209,6 +225,8 @@ xlog_recover_inode_commit_pass2( >> struct xfs_mount *mp = log->l_mp; >> struct xfs_buf *bp; >> struct xfs_dinode *dip; >> + xfs_extnum_t nextents; >> + xfs_aextnum_t anextents; >> int len; >> char *src; >> char *dest; >> @@ -348,21 +366,60 @@ xlog_recover_inode_commit_pass2( >> goto out_release; >> } >> } >> - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ >> - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", >> + >> + if (xfs_log_dinode_has_nrext64(ldip)) { >> + if (!xfs_has_nrext64(mp) || (ldip->di_nrext64_pad != 0)) { >> + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", >> + XFS_ERRLEVEL_LOW, mp, ldip, >> + sizeof(*ldip)); >> + xfs_alert(mp, >> + "%s: Bad inode log record, rec ptr "PTR_FMT", " >> + "dino ptr "PTR_FMT", dino bp "PTR_FMT", " >> + "ino %Ld, xfs_has_nrext64(mp) = %d, " >> + "ldip->di_nrext64_pad = %u", >> + __func__, item, dip, bp, in_f->ilf_ino, >> + xfs_has_nrext64(mp), ldip->di_nrext64_pad); >> + error = -EFSCORRUPTED; >> + goto out_release; >> + } >> + } else { >> + if (ldip->di_version == 3 && ldip->di_big_nextents != 0) { >> + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", >> + XFS_ERRLEVEL_LOW, mp, ldip, >> + sizeof(*ldip)); >> + xfs_alert(mp, >> + "%s: Bad inode log record, rec ptr "PTR_FMT", " >> + "dino ptr "PTR_FMT", dino bp "PTR_FMT", " >> + "ino %Ld, ldip->di_big_dextcnt = %llu", >> + __func__, item, dip, bp, in_f->ilf_ino, >> + ldip->di_big_nextents); >> + error = -EFSCORRUPTED; >> + goto out_release; >> + } >> + } >> + >> + if (xfs_log_dinode_has_nrext64(ldip)) { >> + nextents = ldip->di_big_nextents; >> + anextents = ldip->di_big_anextents; >> + } else { >> + nextents = ldip->di_nextents; >> + anextents = ldip->di_anextents; >> + } >> + >> + if (unlikely(nextents + anextents > ldip->di_nblocks)) { >> + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", >> XFS_ERRLEVEL_LOW, mp, ldip, >> sizeof(*ldip)); >> xfs_alert(mp, >> "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " >> - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", >> + "dino bp "PTR_FMT", ino %Ld, total extents = %llu, nblocks = %Ld", >> __func__, item, dip, bp, in_f->ilf_ino, >> - ldip->di_nextents + ldip->di_anextents, >> - ldip->di_nblocks); >> + nextents + anextents, ldip->di_nblocks); >> error = -EFSCORRUPTED; >> goto out_release; >> } >> if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { >> - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", >> + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(8)", >> XFS_ERRLEVEL_LOW, mp, ldip, >> sizeof(*ldip)); >> xfs_alert(mp, >> @@ -374,7 +431,7 @@ xlog_recover_inode_commit_pass2( >> } >> isize = xfs_log_dinode_size(mp); >> if (unlikely(item->ri_buf[1].i_len > isize)) { >> - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", >> + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(9)", >> XFS_ERRLEVEL_LOW, mp, ldip, >> sizeof(*ldip)); >> xfs_alert(mp, >> -- >> 2.30.2 >> -- chandan