On Thu, Nov 02, 2017 at 12:24:55AM +0300, Artem Blagodarenko wrote: > Use dirdata to store high bits of 64bit inode > number. > > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@xxxxxxxxxx> > --- > fs/ext2/super.c | 6 ++--- > fs/ext4/dir.c | 4 +-- > fs/ext4/ext4.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++---------- > fs/ext4/ialloc.c | 19 ++++++++----- > fs/ext4/namei.c | 53 +++++++++++++++++++++++++++++++----- > fs/ext4/resize.c | 8 +++--- > fs/ext4/super.c | 14 +++++++--- > 7 files changed, 145 insertions(+), 41 deletions(-) > > diff --git a/fs/ext2/super.c b/fs/ext2/super.c > index 1458706bd2ec..f6437f559d0a 100644 > --- a/fs/ext2/super.c > +++ b/fs/ext2/super.c > @@ -1232,7 +1232,7 @@ void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, > ext2_clear_super_error(sb); > spin_lock(&EXT2_SB(sb)->s_lock); > es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); > - es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); > + ext4_set_free_inodes_count(es, ext4_get_free_inodes_count(sb)); > es->s_wtime = cpu_to_le32(get_seconds()); > /* unlock before we do IO */ > spin_unlock(&EXT2_SB(sb)->s_lock); > @@ -1459,9 +1459,9 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) > buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); > if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) > buf->f_bavail = 0; > - buf->f_files = le32_to_cpu(es->s_inodes_count); > + buf->f_files = ext4_get_inodes_count(sb); > buf->f_ffree = ext2_count_free_inodes(sb); > - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); > + ext4_set_free_inodes_count(sb, buf->f_ffree); > buf->f_namelen = EXT2_NAME_LEN; > fsid = le64_to_cpup((void *)es->s_uuid) ^ > le64_to_cpup((void *)es->s_uuid + sizeof(u64)); > diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c > index 46fcb8ec47a6..1ed3b2ae1f88 100644 > --- a/fs/ext4/dir.c > +++ b/fs/ext4/dir.c > @@ -76,7 +76,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, > else if (unlikely(((char *) de - buf) + rlen > size)) > error_msg = "directory entry across range"; > else if (unlikely(le32_to_cpu(de->inode) > > - le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) > + ext4_get_inodes_count(dir->i_sb))) > error_msg = "inode out of bounds"; > else > return 0; > @@ -382,7 +382,7 @@ struct fname { > __u32 minor_hash; > struct rb_node rb_hash; > struct fname *next; > - __u32 inode; > + __u64 inode; > __u8 name_len; > __u8 file_type; > char name[0]; > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 9a9b01b0956a..a6b385dac61e 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1331,7 +1331,12 @@ struct ext4_super_block { > __le32 s_lpf_ino; /* Location of the lost+found inode */ > __le32 s_prj_quota_inum; /* inode for tracking project quota */ > __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ > - __le32 s_reserved[98]; /* Padding to the end of the block */ > + __le32 s_inodes_count_hi; /* higth part of inode count */ > + __le32 s_free_inodes_count_hi; /* Free inodes count */ > + __le32 s_usr_quota_inum_hi; Comment? > + __le32 s_grp_quota_inum_hi; Comment? > + __le32 s_prj_quota_inum_hi; Comment? > + __le32 s_reserved[93]; /* Padding to the end of the block */ > __le32 s_checksum; /* crc32c(superblock) */ > }; > > @@ -1539,18 +1544,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) > return container_of(inode, struct ext4_inode_info, vfs_inode); > } > > -static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) > -{ > - return ino == EXT4_ROOT_INO || > - ino == EXT4_USR_QUOTA_INO || > - ino == EXT4_GRP_QUOTA_INO || > - ino == EXT4_BOOT_LOADER_INO || > - ino == EXT4_JOURNAL_INO || > - ino == EXT4_RESIZE_INO || > - (ino >= EXT4_FIRST_INO(sb) && > - ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); > -} > - > /* > * Inode dynamic state flags > */ > @@ -1689,6 +1682,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) > #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ > #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ > #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 > +#define EXT4_FEATURE_INCOMPAT_INODE64 0x20000 > > #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ > static inline bool ext4_has_feature_##name(struct super_block *sb) \ > @@ -1777,6 +1771,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) > EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) > EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) > EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) > +EXT4_FEATURE_INCOMPAT_FUNCS(inode64, INODE64) > + > > #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR > #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ > @@ -1805,6 +1801,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) > EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ > EXT4_FEATURE_INCOMPAT_ENCRYPT | \ > EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ > + EXT4_FEATURE_INCOMPAT_INODE64 | \ > EXT4_FEATURE_INCOMPAT_LARGEDIR | \ > EXT4_FEATURE_INCOMPAT_DIRDATA) > #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ > @@ -2462,7 +2459,7 @@ extern int ext4fs_dirhash(const char *name, int len, struct > > /* ialloc.c */ > extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, > - const struct qstr *qstr, __u32 goal, > + const struct qstr *qstr, __u64 goal, > uid_t *owner, __u32 i_flags, > int handle_type, unsigned int line_no, > int nblocks); > @@ -2889,6 +2886,63 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) > return 1 << sbi->s_log_groups_per_flex; > } > > +static inline unsigned long ext4_get_inodes_count(struct super_block *sb) > +{ > + struct ext4_super_block *es = EXT4_SB(sb)->s_es; > + unsigned long inodes_count = le32_to_cpu(es->s_inodes_count); > + > + if (ext4_has_feature_inode64(sb)) > + inodes_count |= > + (unsigned long)le32_to_cpu(es->s_inodes_count_hi) > + << 32; > + return inodes_count; > +} > + > +static inline void ext4_set_inodes_count(struct super_block *sb, > + unsigned long val) > +{ > + struct ext4_super_block *es = EXT4_SB(sb)->s_es; > + > + if (ext4_has_feature_inode64(sb)) > + es->s_inodes_count_hi = cpu_to_le32(val >> 32); > + > + es->s_inodes_count = cpu_to_le32(val); > +} > + > +static inline unsigned long ext4_get_free_inodes_count(struct super_block *sb) > +{ > + struct ext4_super_block *es = EXT4_SB(sb)->s_es; > + unsigned long inodes_count = le32_to_cpu(es->s_free_inodes_count); > + > + if (ext4_has_feature_inode64(sb)) > + inodes_count |= > + (unsigned long)le32_to_cpu(es->s_free_inodes_count_hi) > + << 32; > + return inodes_count; > +} > + > +static inline void ext4_set_free_inodes_count(struct super_block *sb, > + unsigned long val) > +{ > + struct ext4_super_block *es = EXT4_SB(sb)->s_es; > + > + if (ext4_has_feature_inode64(sb)) > + es->s_free_inodes_count_hi = cpu_to_le32(val >> 32); > + > + es->s_free_inodes_count = cpu_to_le32(val); > +} > + > +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) > +{ > + return ino == EXT4_ROOT_INO || > + ino == EXT4_USR_QUOTA_INO || > + ino == EXT4_GRP_QUOTA_INO || > + ino == EXT4_JOURNAL_INO || > + ino == EXT4_RESIZE_INO || > + (ino >= EXT4_FIRST_INO(sb) && > + ino <= ext4_get_inodes_count(sb)); > +} > + > #define ext4_std_error(sb, errno) \ > do { \ > if ((errno)) \ > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index ee823022aa34..e23dc4133e84 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -303,7 +303,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) > ext4_clear_inode(inode); > > es = EXT4_SB(sb)->s_es; > - if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { > + if (ino < EXT4_FIRST_INO(sb) || ino > ext4_get_inodes_count(sb)) { > ext4_error(sb, "reserved or nonexistent inode %lu", ino); > goto error_return; > } > @@ -770,7 +770,7 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group, > */ > struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > umode_t mode, const struct qstr *qstr, > - __u32 goal, uid_t *owner, __u32 i_flags, > + __u64 goal, uid_t *owner, __u32 i_flags, > int handle_type, unsigned int line_no, > int nblocks) > { > @@ -887,7 +887,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > if (!goal) > goal = sbi->s_inode_goal; > > - if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { > + if (goal && goal <= ext4_get_inodes_count(sb)) { > group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); > ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); > ret2 = 0; > @@ -1149,6 +1149,11 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > __le32 gen = cpu_to_le32(inode->i_generation); > csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, > sizeof(inum)); > + if (inode->i_ino >> 32) { > + inum = cpu_to_le32(inode->i_ino >> 32); > + csum = ext4_chksum(sbi, sbi->s_csum_seed, > + (__u8 *)&inum, sizeof(inum)); There's a similar clause in ext4_iget that needs to have this piece added. Though TBH the whole seed precomputation clause should be refactored into a helper and called from both places. (Apologies from 2011 me who didn't do that...) > + } > ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, > sizeof(gen)); > } > @@ -1226,7 +1231,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > /* Verify that we are loading a valid orphan from disk */ > struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) > { > - unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); > + unsigned long max_ino = ext4_get_inodes_count(sb); > ext4_group_t block_group; > int bit; > struct buffer_head *bitmap_bh = NULL; > @@ -1330,9 +1335,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) > bitmap_count += x; > } > brelse(bitmap_bh); > - printk(KERN_DEBUG "ext4_count_free_inodes: " > - "stored = %u, computed = %lu, %lu\n", > - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); > + printk(KERN_DEBUG "ext4_count_free_inodes:\n" > + "stored = %lu, computed = %lu, %lu\n", > + ext4_get_inodes_count(sb), desc_count, bitmap_count); > return desc_count; > #else > desc_count = 0; > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index b09e73100e14..9671732e478f 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -1574,11 +1574,39 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi > return (struct dentry *) bh; > inode = NULL; > if (bh) { > - __u32 ino = le32_to_cpu(de->inode); > + unsigned long ino; > + > + ino = le32_to_cpu(de->inode); > + if (ext4_has_feature_inode64(dir->i_sb) && > + (de->file_type & EXT4_DIRENT_INODE)) { Where is this defined? (Oh, right, the previous patch) > + char *data = &de->name[de->name_len]; struct ext4_dirent_data_header *ddh = &de->name[de->name_len]; name_len + 1, since we added the null byte? > + > + if (de->file_type & EXT4_DIRENT_LUFID) { > + /* skip LUFID record if present */ It's a little strange that the LUFID thing isn't checked against the feature being enabled. I didn't notice until I started reading this patch that I don't know where we copy the LUFID stuff out of the dirent and back into the dentry. > + data += *data; ddh = &de->name[de->name_len + 1 + ddh->ddh_length]; (Or maybe just a ext4_dirent_data_next() that does this for us.) > + } > + > + if (data > &de->name[de->rec_len]) { > + EXT4_ERROR_INODE(dir, > + "corrupted dirdata entry\n"); > + return ERR_PTR(-EFSCORRUPTED); > + } > + > + if (*data == sizeof(__u32)) { Don't we set this to 5 in add_dirent_to_buf? > + __le32 ino_hi; > + > + memcpy(&ino_hi, ++data, sizeof(__u32)); > + ino |= (__u64)le32_to_cpu(ino_hi) << 32; Use struct ext4_dirent_inohi as mentioned in the previous patch, instead of fiddling with raw byte access... > + } else { > + EXT4_ERROR_INODE(dir, > + "corrupted dirdata inode number\n"); > + return ERR_PTR(-EFSCORRUPTED); > + } > + } > > brelse(bh); > if (!ext4_valid_inum(dir->i_sb, ino)) { > - EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); > + EXT4_ERROR_INODE(dir, "bad inode number: %lu", ino); > return ERR_PTR(-EFSCORRUPTED); > } > if (unlikely(ino == dir->i_ino)) { > @@ -1589,7 +1617,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi > inode = ext4_iget_normal(dir->i_sb, ino); > if (inode == ERR_PTR(-ESTALE)) { > EXT4_ERROR_INODE(dir, > - "deleted inode referenced: %u", > + "deleted inode referenced: %lu", > ino); > return ERR_PTR(-EFSCORRUPTED); > } > @@ -1893,7 +1921,7 @@ static int add_dirent_to_buf(handle_t *handle, > unsigned int blocksize = dir->i_sb->s_blocksize; > int csum_size = 0; > unsigned short reclen, dotdot_reclen = 0; > - int err, dlen = 0; > + int err, dlen = 0, data_offset = 0; > bool is_dotdot = false, write_short_dotdot = false; > unsigned char *data; > int namelen = dentry->d_name.len; > @@ -1939,8 +1967,19 @@ static int add_dirent_to_buf(handle_t *handle, > /* If we're writing short form of "dotdot", don't add data section */ > if (data && !write_short_dotdot) { > de->name[namelen] = 0; > - memcpy(&de->name[namelen + 1], data, *(char *)data); > + memcpy(&de->name[namelen + 1], data, *data); > de->file_type |= EXT4_DIRENT_LUFID; > + data_offset = *data; > + } > + > + if (inode) { > + __u32 *i_ino_hi; > + > + de->name[namelen + 1 + data_offset] = 5; The comparison above was against 4, why is this 5? struct ext4_dirent_inohi *di = &de->name[namelen + 1 + data_offset]; di->di_header.ddh_length = sizeof(*di); di->di_inohi = cpu_to_le32(inode->i_ino >> 32); de->inode = cpu_to_le32(inode->i_ino & 0xFFFFFFFF); de->file_type |= EXT4_DIRENT_INODE; > + i_ino_hi = (__u32 *)&de->name[namelen + 1 + data_offset + 1]; > + *i_ino_hi = cpu_to_le32((__u32)(inode->i_ino >> 32)); > + de->file_type |= EXT4_DIRENT_INODE; > + de->inode = cpu_to_le32(inode->i_ino & 0xFFFFFFFF); > } > > /* > @@ -2045,8 +2084,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, > entries = root->entries; > dx_set_block(entries, 1); > dx_set_count(entries, 1); > - dx_set_limit(entries, dx_root_limit(dir, > - fde, sizeof(root->info))); > + dx_set_limit(entries, dx_root_limit(dir, (struct ext4_dir_entry_2 *)fde, > + sizeof(root->info))); > > /* Initialize as for dx_probe */ > fname->hinfo.hash_version = root->info.hash_version; > diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c > index 035cd3f4785e..d0d5acd1a70d 100644 > --- a/fs/ext4/resize.c > +++ b/fs/ext4/resize.c > @@ -1337,10 +1337,10 @@ static void ext4_update_super(struct super_block *sb, > > ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); > ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); > - le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * > - flex_gd->count); > - le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * > - flex_gd->count); > + ext4_set_inodes_count(sb, ext4_get_inodes_count(sb) + > + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); > + ext4_set_free_inodes_count(sb, ext4_get_free_inodes_count(sb) + > + EXT4_INODES_PER_GROUP(sb) * flex_gd->count); > > ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); > /* > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index ead9406d9cff..a06252f9aada 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -3489,6 +3489,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > goto cantfind_ext4; > } > > + if (ext4_has_feature_inode64(sb) && > + (sizeof(u64) != sizeof(unsigned long))) { > + ext4_msg(sb, KERN_ERR, "64 bit inodes need 64 bit kernel."); Is there an architectural reason in ext4 for not supporting 64-bit inodes on 32-bit kernels, or is this simply a statement that we don't intend to support that configuration? Particularly since 32-bit userspace never got updated to know about 64-bit inode numbers. Also, I don't know if this has already been talked about, but have you checked all the places where we export file handles to make sure that we refuse to give out a handle if the handle isn't big enough to fit a large inode? (I think that only affects pre-2000 nfs setups? XFS has some weird warts to handle those safely, but perhaps in 2017 we don't have to care.) --D > + goto failed_mount; > + } > + > /* Load the checksum driver */ > if (ext4_has_feature_metadata_csum(sb) || > ext4_has_feature_ea_inode(sb)) { > @@ -4248,7 +4254,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > GFP_KERNEL); > if (!err) { > unsigned long freei = ext4_count_free_inodes(sb); > - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); > + ext4_set_free_inodes_count(sb, freei); > err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, > GFP_KERNEL); > } > @@ -4705,9 +4711,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) > EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( > &EXT4_SB(sb)->s_freeclusters_counter))); > if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) > - es->s_free_inodes_count = > - cpu_to_le32(percpu_counter_sum_positive( > - &EXT4_SB(sb)->s_freeinodes_counter)); > + ext4_set_free_inodes_count(sb, > + cpu_to_le32(percpu_counter_sum_positive( > + &EXT4_SB(sb)->s_freeinodes_counter))); > BUFFER_TRACE(sbh, "marking dirty"); > ext4_superblock_csum_set(sb); > if (sync) > -- > 2.13.5 (Apple Git-94) >