On Oct 27, 2017, at 11:22 AM, Artem Blagodarenko <artem.blagodarenko@xxxxxxxxx> wrote: > > Use dirdata to store high bits of 64bit inode > number. > > Signed-off-by: Artem Blagodarenko <artem.blagodarenko@xxxxxxxxxxx> > --- > fs/ext4/dir.c | 4 +-- > fs/ext4/ext4.h | 77 +++++++++++++++++++++++++++++++++++++++++++++----------- > fs/ext4/ialloc.c | 8 +++--- > fs/ext4/namei.c | 29 ++++++++++++++++++--- > fs/ext4/super.c | 8 +++--- > 5 files changed, 97 insertions(+), 29 deletions(-) > > diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c > index 282279b66385..d2a17af0185d 100644 > --- a/fs/ext4/dir.c > +++ b/fs/ext4/dir.c > @@ -381,7 +381,7 @@ struct fname { > __u32 minor_hash; > struct rb_node rb_hash; > struct fname *next; > - __u32 inode; > + __u64 inode; > __u8 name_len; > __u8 file_type; > char name[0]; > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index aaaed29caa67..5965c54976e3 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1331,8 +1331,13 @@ struct ext4_super_block { > __le32 s_lpf_ino; /* Location of the lost+found inode */ > __le32 s_prj_quota_inum; /* inode for tracking project quota */ > __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ > - __le32 s_reserved[98]; /* Padding to the end of the block */ > - __le32 s_checksum; /* crc32c(superblock) */ > + __u32 s_inodes_count_hi; /* higth part of inode count */ > + __u32 s_free_inodes_count_hi; /* Free inodes count */ > + __u32 s_usr_quota_inum_hi; > + __u32 s_grp_quota_inum_hi; > + __u32 s_prj_quota_inum_hi; > + __u32 s_reserved[93]; /* Padding to the end of the block */ > + __u32 s_checksum; /* crc32c(superblock) */ Why use "__u32" instead of "__le32" like the other fields? __le32 helps ensure that the fields are swabbed properly upon access. > @@ -1805,6 +1799,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) > EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ > EXT4_FEATURE_INCOMPAT_ENCRYPT | \ > EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ > + EXT4_FEATURE_INCOMPAT_64INODE | \ > EXT4_FEATURE_INCOMPAT_LARGEDIR | \ > EXT4_FEATURE_INCOMPAT_DIRDATA) IMHO, this feature would be better named "INCOMPAT_INODE64" so that the helper function is named ext4_feature_inode64() instead of ext4_feature_64inode(). > @@ -2889,6 +2884,58 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) > return 1 << sbi->s_log_groups_per_flex; > } > > +static inline unsigned long ext4_get_inodes_count(struct ext4_super_block *sb) > +{ > + unsigned long inodes_count = sb->s_inodes_count; This needs to be u64 instead of "unsigned long", or it will break on 32-bit CPUs. > + if (sb->s_feature_incompat & EXT4_FEATURE_INCOMPAT_64INODE) This should use ext4_has_feature_inode64(sb), as with all these other helpers. > + inodes_count |= (unsigned long)sb->s_inodes_count_hi << 32; > + return inodes_count; Also, these functions need to swab s_inodes_count and s_inodes_count_hi before combining them into the unsigned long, since it isn't possible to do it correctly in the caller: static inline unsigned long ext4_get_inodes_count(struct ext4_super_block *sb) { unsigned long inodes_count = le32_to_cpu(sb->s_inodes_count); if (ext4_has_feature_inode64(sb)) inodes_count |= (unsigned long)le32_to_cpu(sb->s_inodes_count_hi) << 32; return inodes_count; } > +static inline void ext4_set_inodes_count(struct ext4_super_block *sb, unsigned long val) Also needs to be u64 instead of "unsigned long", and wrap at 80 columns. Hmm, I see struct inode "i_ino" is declared as unsigned long, which is a bit of a problem. Does that mean we just refuse to mount ext4 filesystems with 64-bit inode numbers on 32-bit systems? It wouldn't be the end of the world. > +{ > + if (sb->s_feature_incompat & EXT4_FEATURE_INCOMPAT_64INODE) > + sb->s_inodes_count_hi = val >> 32; > + > + sb->s_inodes_count = val; > +} > + > +static inline unsigned long ext4_get_free_inodes_count(struct ext4_super_block *sb) > +{ > + unsigned long inodes_count = sb->s_free_inodes_count; > + > + if (sb->s_feature_incompat & EXT4_FEATURE_INCOMPAT_64INODE); > + inodes_count |= (unsigned long)sb->s_free_inodes_count_hi << 32; > + return inodes_count; > +} > + > +static inline void ext4_set_free_inodes_count(struct ext4_super_block *sb, unsigned long val) > +{ > + if (sb->s_feature_incompat & EXT4_FEATURE_INCOMPAT_64INODE) > + sb->s_free_inodes_count_hi = (__u32)(val >> 32); > + > + sb->s_free_inodes_count = val; > +} > + > +static inline void ext4_inc_free_inodes_count(struct ext4_super_block *sb) > +{ > + __u64 val = ext4_get_free_inodes_count(sb); > + > + ext4_set_free_inodes_count(sb, val); This isn't actually incrementing the free inodes count? Also, rather than having separate inc/dec operations it is probably better to just pass an argument with +/-1 (or +/- N). > +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) > +{ > + return ino == EXT4_ROOT_INO || > + ino == EXT4_USR_QUOTA_INO || > + ino == EXT4_GRP_QUOTA_INO || > + ino == EXT4_JOURNAL_INO || > + ino == EXT4_RESIZE_INO || > + (ino >= EXT4_FIRST_INO(sb) && > + ino <= le64_to_cpu(ext4_get_inodes_count(EXT4_SB(sb)->s_es))); (defect) swabbing should not be done here. > +} > + > #define ext4_std_error(sb, errno) \ > do { \ > if ((errno)) \ > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index ee823022aa34..99bcced744c8 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -303,7 +303,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) > ext4_clear_inode(inode); > > es = EXT4_SB(sb)->s_es; > - if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { > + if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(ext4_get_inodes_count(es))) { Swabbing needs to be done inside ext4_get_inodes_count() before returning it to the caller. Also, le32_to_cpu() would be wrong, since it is a 64-bit value. > ext4_error(sb, "reserved or nonexistent inode %lu", ino); > goto error_return; > } > @@ -770,7 +770,7 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group, > */ > struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > umode_t mode, const struct qstr *qstr, > - __u32 goal, uid_t *owner, __u32 i_flags, > + __u64 goal, uid_t *owner, __u32 i_flags, > int handle_type, unsigned int line_no, > int nblocks) > { > @@ -887,7 +887,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > if (!goal) > goal = sbi->s_inode_goal; > > - if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { > + if (goal && goal <= le32_to_cpu(ext4_get_inodes_count(sbi->s_es))) { > group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); > ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); > ret2 = 0; There also needs to be a change to the metadata checksum code. It is using only i_ino (low 32-bit value) to compute the checksum seed, but that means the seed is the same for all inode numbers mod 2^32. If there is a bug in the code somewhere that drops the high bits of the inode number then it may write it to the wrong location on disk, but it would have a "valid" checksum. It would be better to use the full 64-bit inode number for the checksum seed so that this does not happen: /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); if (inode->i_ino >> 32) { inum = cpu_to_le32(inode->i_ino >> 32); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); } ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); > @@ -1226,7 +1226,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, > /* Verify that we are loading a valid orphan from disk */ > struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) > { > - unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); > + unsigned long max_ino = le32_to_cpu(ext4_get_inodes_count(EXT4_SB(sb)->s_es)); Swabbing needs to be done in ext4_get_inodes_count(). > ext4_group_t block_group; > int bit; > struct buffer_head *bitmap_bh = NULL; > diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c > index ccb6ca477bab..fbcfb8b193ad 100644 > --- a/fs/ext4/namei.c > +++ b/fs/ext4/namei.c > @@ -1573,11 +1573,22 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi > return (struct dentry *) bh; > inode = NULL; > if (bh) { > - __u32 ino = le32_to_cpu(de->inode); > + __u32 ino_hi; > + __u32 ino_lo; > + unsigned long ino; > + unsigned char *data; > + data = ext4_dentry_get_data(dir->i_sb, (struct ext4_dentry_param *)dentry->d_fsdata); > + > + if (data) { > + ino_hi = le32_to_cpu(de->name[dentry->d_name.len + 1 + *data + 1]); (style) Need to run this through checkpatch.pl to catch issues like 80-char wrap. That said, I don't think this is correct. This should be getting the high 32 bits of the inode number from "de", and it shouldn't have anything to do with "dentry->d_fsdata". This should be something like: ino = le32_to_cpu(de->inode); if (de->file_type & EXT4_DIRENT_INODE) { /* XXX also check if FEATURE_INODE64 is set? */ /* skip LUFID record if present */ char *data = de->name[de->name_len]; if (de->file_type & EXT4_DIRENT_LUFID) { /* first byte is record length */ data += *(char *)data; /* XXX verify we are still inside rec_len? */ } if (*(char *)data) == sizeof(__u32)) { __le32 ino_hi; memcpy(ino_hi, data, sizeof(__u32)); ino |= (u64)le32_to_cpu(ino_hi) << 32; } /* XXX else error? */ } > + ino_lo = le32_to_cpu(de->inode); > + ino = (__u64)ino_hi << 32 & ino_lo; > + } else > + ino = le32_to_cpu(de->inode); (style) braces around both parts of if/else block. > @@ -1890,9 +1901,10 @@ static int add_dirent_to_buf(handle_t *handle, > unsigned int blocksize = dir->i_sb->s_blocksize; > int csum_size = 0; > unsigned short reclen, dotdot_reclen = 0; > - int err, dlen = 0; > + int err, dlen = 0, data_offset = 0; > bool is_dotdot = false, write_short_dotdot = false; > unsigned char *data; > + __u32 *i_ino_hi; This can be declared local to the the "if (inode) {" block. > int namelen = dentry->d_name.len; > > if (ext4_has_metadata_csum(inode->i_sb)) > @@ -1937,6 +1949,15 @@ static int add_dirent_to_buf(handle_t *handle, > de->name[namelen] = 0; > memcpy(&de->name[namelen + 1], data, *(char *) data); > de->file_type |= EXT4_DIRENT_LUFID; > + data_offset = *data; > + } > + > + if (inode) { > + de->name[namelen + 1 + data_offset] = 3; "3" is what, the length of the extra data? I'd think to store the high 32 bits this should be "5" (length byte + sizeof(__u32)). > + i_ino_hi = (__u32 *)&de->name[namelen + 1 + data_offset + 1]; This unaligned __u32 access may cause problems on some architectures. Better to use memcpy() to copy the value rather than direct assignment. That said, since this is the same as in ext4_lookup() it might make sense to have a helper function like "ext4_dirent_get_inode(de)" or similar that returns the inode number (including 64-bit value if present). > + *i_ino_hi = cpu_to_le32((__u32)(inode->i_ino >> 32)); > + de->file_type |= EXT4_DIRENT_INODE; > + de->inode = cpu_to_le32(inode->i_ino & 0x0fffffff); (defect) this is only storing the low 2^28 bits of the inode number? > } > > /* > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index ead9406d9cff..bcea0b84b668 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -4248,7 +4248,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > GFP_KERNEL); > if (!err) { > unsigned long freei = ext4_count_free_inodes(sb); > - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); > + ext4_set_free_inodes_count(sbi->s_es, cpu_to_le64(freei)); > err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, > GFP_KERNEL); > } At a minimum, there needs to be a mount-time check to prevent mounting a filesystem with 64-bit inodes on a 32-bit kernel. Better would be to fix the code to handle this even on 32-bit kernels, but the support for this looks patchy. The VFS i_ino is unsigned long, so we'd need to store the high bits in ext4_inode_info for 32-bit kernels to use in ext4_getattr(). This would need a wrapper function for all i_ino uses to get i_ino_hi from ext4_inode_info, like ext4_get_ino(inode) or similar (it would just return i_ino for 64-bit kernels). The struct linux_dirent also has unsigned long for d_ino, but that looks like it is only for 32-bit syscalls, there is also linux_dirent64 that has u64 for d_ino. > @@ -4705,9 +4705,9 @@ static int ext4_commit_super(struct super_block *sb, int sync) > EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( > &EXT4_SB(sb)->s_freeclusters_counter))); > if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) > - es->s_free_inodes_count = > - cpu_to_le32(percpu_counter_sum_positive( > - &EXT4_SB(sb)->s_freeinodes_counter)); > + ext4_set_free_inodes_count(es, > + cpu_to_le32(percpu_counter_sum_positive( > + &EXT4_SB(sb)->s_freeinodes_counter))); > BUFFER_TRACE(sbh, "marking dirty"); > ext4_superblock_csum_set(sb); > if (sync) > -- > 2.13.5 (Apple Git-94) > Cheers, Andreas
Attachment:
signature.asc
Description: Message signed with OpenPGP