Andreas, I not strongly against it patch, but my testing say - 3 level h-tree need a more than 20mil files in directory, and creation rate was dropped dramatically, a specially without parallel dir operations, Did we need do some other research changes with landing it patch as disk format will changed anyway by incompatible way ? > 17 марта 2017 г., в 0:44, Andreas Dilger <adilger@xxxxxxxxx> написал(а): > > On Mar 16, 2017, at 3:51 AM, Artem Blagodarenko <artem.blagodarenko@xxxxxxxxx> wrote: >> >> From: Artem Blagodarenko <artem.blagodarenko@xxxxxxxxx> >> >> This INCOMPAT_LARGEDIR feature allows larger directories to be created >> in ldiskfs, both with directory sizes over 2GB and and a maximum htree >> depth of 3 instead of the current limit of 2. These features are needed >> in order to exceed the current limit of approximately 10M entries in a >> single directory. > > Artem, > many thanks for updating and submitting this, and creating the e2fsck patches. > > Ted, > can you please add the original author for the largedir ext4 patch when > landing this patch: Liang Zhen <liang.zhen@xxxxxxxxx> > > and these tags which contain more background information on this feature: > > Lustre-change: https://review.hpdd.intel.com/375 > Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-50 > >> Signed-off-by: Yang Sheng <yang.sheng@xxxxxxxxx> >> Signed-off-by: Artem Blagodarenko <artem.blagodarenko@xxxxxxxxxxx> > > Reviewed-by: Andreas Dilger <andreas.dilger@xxxxxxxxx> > >> --- >> fs/ext4/ext4.h | 23 ++++++++--- >> fs/ext4/inode.c | 4 +- >> fs/ext4/namei.c | 118 +++++++++++++++++++++++++++++++++++++++----------------- >> 3 files changed, 102 insertions(+), 43 deletions(-) >> >> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h >> index 01d52b9..0bbbd9b 100644 >> --- a/fs/ext4/ext4.h >> +++ b/fs/ext4/ext4.h >> @@ -1799,7 +1799,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) >> EXT4_FEATURE_INCOMPAT_MMP | \ >> EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ >> EXT4_FEATURE_INCOMPAT_ENCRYPT | \ >> - EXT4_FEATURE_INCOMPAT_CSUM_SEED) >> + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ >> + EXT4_FEATURE_INCOMPAT_LARGEDIR) >> #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ >> EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ >> EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ >> @@ -2125,6 +2126,16 @@ struct dir_private_info { >> */ >> #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) >> >> +/* htree levels for ext4 */ >> +#define EXT4_HTREE_LEVEL_COMPAT 2 >> +#define EXT4_HTREE_LEVEL 3 >> + >> +static inline int ext4_dir_htree_level(struct super_block *sb) >> +{ >> + return ext4_has_feature_largedir(sb) ? >> + EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; >> +} >> + >> /* >> * Timeout and state flag for lazy initialization inode thread. >> */ >> @@ -2758,13 +2769,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, >> es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); >> } >> >> -static inline loff_t ext4_isize(struct ext4_inode *raw_inode) >> +static inline loff_t ext4_isize(struct super_block *sb, >> + struct ext4_inode *raw_inode) >> { >> - if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) >> + if (ext4_has_feature_largedir(sb) || >> + S_ISREG(le16_to_cpu(raw_inode->i_mode))) >> return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | >> le32_to_cpu(raw_inode->i_size_lo); >> - else >> - return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >> + >> + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); >> } >> >> static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c >> index f622d4a..5787f3d 100644 >> --- a/fs/ext4/inode.c >> +++ b/fs/ext4/inode.c >> @@ -4682,7 +4682,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) >> if (ext4_has_feature_64bit(sb)) >> ei->i_file_acl |= >> ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; >> - inode->i_size = ext4_isize(raw_inode); >> + inode->i_size = ext4_isize(sb, raw_inode); >> if ((size = i_size_read(inode)) < 0) { >> EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); >> ret = -EFSCORRUPTED; >> @@ -5008,7 +5008,7 @@ static int ext4_do_update_inode(handle_t *handle, >> raw_inode->i_file_acl_high = >> cpu_to_le16(ei->i_file_acl >> 32); >> raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); >> - if (ei->i_disksize != ext4_isize(raw_inode)) { >> + if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { >> ext4_isize_set(raw_inode, ei->i_disksize); >> need_datasync = 1; >> } >> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c >> index 6ad612c..3298fe3 100644 >> --- a/fs/ext4/namei.c >> +++ b/fs/ext4/namei.c >> @@ -513,7 +513,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle, >> >> static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) >> { >> - return le32_to_cpu(entry->block) & 0x00ffffff; >> + return le32_to_cpu(entry->block) & 0x0fffffff; >> } > > It wouldn't be terrible to add a comment to this function explaining why > the block numbers are masked off, which is to allow the high bits of the > logical block number to hold a "fullness" fraction so that the kernel > could start shrinking directories if many files are deleted. That said, > this isn't a shortcoming of this patch, but a suggestion if the patch is > being resubmitted for some other reason. > >> static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) >> @@ -739,6 +739,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); >> u32 hash; >> >> + memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); >> frame->bh = ext4_read_dirblock(dir, 0, INDEX); >> if (IS_ERR(frame->bh)) >> return (struct dx_frame *) frame->bh; >> @@ -768,9 +769,15 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> } >> >> indirect = root->info.indirect_levels; >> - if (indirect > 1) { >> - ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", >> - root->info.indirect_levels); >> + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { >> + ext4_warning(dir->i_sb, >> + "Directory (ino: %lu) htree depth %#06x exceed" >> + "supported value", dir->i_ino, >> + ext4_dir_htree_level(dir->i_sb)); >> + if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { >> + ext4_warning(dir->i_sb, "Enable large directory " >> + "feature to access it"); >> + } >> goto fail; >> } >> >> @@ -859,12 +866,19 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, >> >> static void dx_release(struct dx_frame *frames) >> { >> + struct dx_root_info *info; >> + int i; >> + >> if (frames[0].bh == NULL) >> return; >> >> - if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) >> - brelse(frames[1].bh); >> - brelse(frames[0].bh); >> + info = &((struct dx_root *)frames[0].bh->b_data)->info; >> + for (i = 0; i <= info->indirect_levels; i++) { >> + if (frames[i].bh == NULL) >> + break; >> + brelse(frames[i].bh); >> + frames[i].bh = NULL; >> + } >> } >> >> /* >> @@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, >> { >> struct dx_hash_info hinfo; >> struct ext4_dir_entry_2 *de; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct inode *dir; >> ext4_lblk_t block; >> int count = 0; >> @@ -1517,7 +1531,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, >> struct ext4_dir_entry_2 **res_dir) >> { >> struct super_block * sb = dir->i_sb; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> const struct qstr *d_name = fname->usr_fname; >> struct buffer_head *bh; >> ext4_lblk_t block; >> @@ -1947,7 +1961,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, >> */ >> dir->i_mtime = dir->i_ctime = current_time(dir); >> ext4_update_dx_flag(dir); >> - dir->i_version++; >> + inode_inc_iversion(dir); >> ext4_mark_inode_dirty(handle, dir); >> BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); >> err = ext4_handle_dirty_dirent_node(handle, dir, bh); >> @@ -1966,7 +1980,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, >> { >> struct buffer_head *bh2; >> struct dx_root *root; >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct dx_entry *entries; >> struct ext4_dir_entry_2 *de, *de2; >> struct ext4_dir_entry_tail *t; >> @@ -2185,13 +2199,16 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, >> static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> struct inode *dir, struct inode *inode) >> { >> - struct dx_frame frames[2], *frame; >> + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; >> struct dx_entry *entries, *at; >> struct buffer_head *bh; >> struct super_block *sb = dir->i_sb; >> struct ext4_dir_entry_2 *de; >> + int restart; >> int err; >> >> +again: >> + restart = 0; >> frame = dx_probe(fname, dir, NULL, frames); >> if (IS_ERR(frame)) >> return PTR_ERR(frame); >> @@ -2213,24 +2230,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> if (err != -ENOSPC) >> goto cleanup; >> >> + err = 0; >> /* Block full, should compress but for now just split */ >> dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", >> dx_get_count(entries), dx_get_limit(entries))); >> /* Need to split index? */ >> if (dx_get_count(entries) == dx_get_limit(entries)) { >> ext4_lblk_t newblock; >> - unsigned icount = dx_get_count(entries); >> - int levels = frame - frames; >> + int levels = frame - frames + 1; >> + unsigned int icount; >> + int add_level = 1; >> struct dx_entry *entries2; >> struct dx_node *node2; >> struct buffer_head *bh2; >> >> - if (levels && (dx_get_count(frames->entries) == >> - dx_get_limit(frames->entries))) { >> - ext4_warning_inode(dir, "Directory index full!"); >> + while (frame > frames) { >> + if (dx_get_count((frame - 1)->entries) < >> + dx_get_limit((frame - 1)->entries)) { >> + add_level = 0; >> + break; >> + } >> + frame--; /* split higher index block */ >> + at = frame->at; >> + entries = frame->entries; >> + restart = 1; >> + } >> + if (add_level && levels == ext4_dir_htree_level(sb)) { >> + ext4_warning(sb, "Directory (ino: %lu) index full, " >> + "reach max htree level :%d", >> + dir->i_ino, levels); >> + if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { >> + ext4_warning(sb, "Large directory feature is " >> + "not enabled on this " >> + "filesystem"); >> + } >> err = -ENOSPC; >> goto cleanup; >> } >> + icount = dx_get_count(entries); >> bh2 = ext4_append(handle, dir, &newblock); >> if (IS_ERR(bh2)) { >> err = PTR_ERR(bh2); >> @@ -2245,7 +2282,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> err = ext4_journal_get_write_access(handle, frame->bh); >> if (err) >> goto journal_error; >> - if (levels) { >> + if (!add_level) { >> unsigned icount1 = icount/2, icount2 = icount - icount1; >> unsigned hash2 = dx_get_hash(entries + icount1); >> dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", >> @@ -2253,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> >> BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ >> err = ext4_journal_get_write_access(handle, >> - frames[0].bh); >> + (frame - 1)->bh); >> if (err) >> goto journal_error; >> >> @@ -2269,17 +2306,23 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> frame->entries = entries = entries2; >> swap(frame->bh, bh2); >> } >> - dx_insert_block(frames + 0, hash2, newblock); >> - dxtrace(dx_show_index("node", frames[1].entries)); >> + dx_insert_block((frame - 1), hash2, newblock); >> + dxtrace(dx_show_index("node", frame->entries)); >> dxtrace(dx_show_index("node", >> ((struct dx_node *) bh2->b_data)->entries)); >> err = ext4_handle_dirty_dx_node(handle, dir, bh2); >> if (err) >> goto journal_error; >> brelse (bh2); >> + ext4_handle_dirty_metadata(handle, dir, >> + (frame - 1)->bh); >> + if (restart) { >> + ext4_handle_dirty_metadata(handle, dir, >> + frame->bh); >> + goto cleanup; >> + } >> } else { >> - dxtrace(printk(KERN_DEBUG >> - "Creating second level index...\n")); >> + struct dx_root *dxroot; >> memcpy((char *) entries2, (char *) entries, >> icount * sizeof(struct dx_entry)); >> dx_set_limit(entries2, dx_node_limit(dir)); >> @@ -2287,19 +2330,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> /* Set up root */ >> dx_set_count(entries, 1); >> dx_set_block(entries + 0, newblock); >> - ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; >> - >> - /* Add new access path frame */ >> - frame = frames + 1; >> - frame->at = at = at - entries + entries2; >> - frame->entries = entries = entries2; >> - frame->bh = bh2; >> - err = ext4_journal_get_write_access(handle, >> - frame->bh); >> - if (err) >> - goto journal_error; >> + dxroot = (struct dx_root *)frames[0].bh->b_data; >> + dxroot->info.indirect_levels += 1; >> + dxtrace(printk(KERN_DEBUG >> + "Creating %d level index...\n", >> + info->indirect_levels)); >> + ext4_handle_dirty_metadata(handle, dir, frame->bh); >> + ext4_handle_dirty_metadata(handle, dir, bh2); >> + brelse(bh2); >> + restart = 1; >> + goto cleanup; >> } >> - err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); >> if (err) { >> ext4_std_error(inode->i_sb, err); >> goto cleanup; >> @@ -2318,6 +2359,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, >> cleanup: >> brelse(bh); >> dx_release(frames); >> + /* @restart is true means htree-path has been changed, we need to >> + * repeat dx_probe() to find out valid htree-path >> + */ >> + if (restart && err == 0) >> + goto again; >> return err; >> } >> >> @@ -2354,7 +2400,7 @@ int ext4_generic_delete_entry(handle_t *handle, >> blocksize); >> else >> de->inode = 0; >> - dir->i_version++; >> + inode_inc_iversion(dir); >> return 0; >> } >> i += ext4_rec_len_from_disk(de->rec_len, blocksize); >> -- >> 1.8.3.1 >> > > > Cheers, Andreas > > > > >