From 9e09fef78b2fa552c883bf8124af873abfde0805 Mon Sep 17 00:00:00 2001
From: Lokesh Nagappa Jaliminche <lokesh.jaliminche@xxxxxxxxx>
Date: Sat, 19 Dec 2015 00:33:06 +0530
Subject: [PATCH] ext4: optimizing group search for inode allocation

Added a check at the start of the group search loop to avoid looping
unnecessarily when a group is completely empty. This also allows the
group search to jump directly to "found_flex_bg" with "stats" and
"group" already set, so there is no need to go through the extra steps
of setting "best_desc" and "best_group" and then breaking out of the
loop just to set "stats" and "group" again.

Signed-off-by: Lokesh Nagappa Jaliminche <lokesh.jaliminche@xxxxxxxxx>
---
 fs/ext4/ialloc.c | 8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1b8024d..588bf8e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -446,6 +446,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         ext4_group_t real_ngroups = ext4_get_groups_count(sb);
         int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
+        unsigned int inodes_per_flex_group;
+        long unsigned int blocks_per_clustre;
         unsigned int freei, avefreei, grp_free;
         ext4_fsblk_t freeb, avefreec;
         unsigned int ndirs;
@@ -470,6 +472,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
         avefreec = freeb;
         do_div(avefreec, ngroups);
+        inodes_per_flex_group = inodes_per_group * flex_size;
+        blocks_per_clustre = sbi->s_blocks_per_group * flex_size;
         ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

         if (S_ISDIR(mode) &&
@@ -489,6 +493,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                 for (i = 0; i < ngroups; i++) {
                         g = (parent_group + i) % ngroups;
                         get_orlov_stats(sb, g, flex_size, &stats);
+                        /* the group can't get any better than empty */
+                        if (inodes_per_flex_group == stats.free_inodes &&
+                            blocks_per_clustre == stats.free_clusters)
+                                goto found_flex_bg;
                         if (!stats.free_inodes)
                                 continue;
                         if (stats.used_dirs >= best_ndir)
-- 
1.7.1


On Tue, Dec 15, 2015 at 02:32:52PM -0700, Andreas Dilger wrote:
> On Dec 15, 2015, at 3:33 AM, lokesh jaliminche <lokesh.jaliminche@xxxxxxxxx> wrote:
> >
> > "No, this isn't correct. This loop is looking for the *best* group it
> > can find, and your "break" would have it exit the loop as soon as the
> > *first* group that matches the conditions is found."
> >
> > But as we are checking all the groups with the same conditions, how
> > does that guarantee better group selection? As per my understanding we
> > are just wasting time in looping.
>
> The important part of the loop is ensuring that the selected group is always
> improving over the previous one:
>
>         if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
>                 continue;
>
> The "best_ndir" value tracks the number of used directories in the best
> group found so far, and if the new group has fewer used directories than
> the previous "best" group and still meets all the other criteria (fewer
> than average inodes allocated, etc.) then the new group will be chosen.
>
> That said, you are correct that the loop can spend a lot of time searching
> needlessly.
> It would be trivial to add a check at the start of the loop:
>
>         /* the group can't get any better than empty */
>         if (desc->bg_free_inodes_count == inodes_per_group &&
>             desc->bg_free_blocks_count == EXT4_BLOCKS_PER_GROUP(sb))
>                 goto found;
>
> This jumps directly to "found" with "desc" and "group" already set, so
> there is no need to go through the extra steps of setting "best_desc" and
> "best_group" and then breaking out of the loop just to set "desc" and
> "group" again.
>
> Since you are the one who found this optimization, could you please submit
> a patch to this effect so you get the credit.
>
> Cheers, Andreas
>
> > On Sat, Dec 5, 2015 at 3:24 AM, Andreas Dilger <adilger@xxxxxxxxx> wrote:
> >>
> >>> On Dec 3, 2015, at 12:13 PM, lokesh jaliminche <lokesh.jaliminche@xxxxxxxxx> wrote:
> >>>
> >>> Ohh, thanks for the clarification. There is one more thing I would
> >>> like to point out here.
> >>> In the code there is a loop that scans the groups for inode
> >>> allocation (inside the find_group_orlov function).
> >>> There are some policies for group selection. While scanning the
> >>> groups, it checks that these policies are satisfied.
> >>> If a particular group satisfies these policies it should get selected
> >>> for inode allocation, but instead the code keeps looking in the next
> >>> groups.
> >>> I think there is a missing break condition. I have added a break there,
> >>> and here is the patch for that. Any reason for not having a break
> >>> condition here?
> >>>
> >>> diff -Nur linux-2.6.32-431.17.1.el6.x86_64/fs/ext4/ialloc.c
> >>> linux-2.6.32-431.17.1.el6.x86_64/fs/ext4/ialloc.c
> >>> --- linux-2.6.32-431.17.1.el6.x86_64/fs/ext4/ialloc.c  2014-04-12 01:20:31.000000000 +0530
> >>> +++ linux-2.6.32-431.17.1.el6.x86_64/fs/ext4/ialloc.c  2015-11-29 21:36:51.805542209 +0530
> >>> @@ -529,6 +529,7 @@
> >>>                          grp = g;
> >>>                          ret = 0;
> >>>                          best_ndir = stats.used_dirs;
> >>> +                        break;
> >>>                  }
> >>>          if (ret)
> >>>                  goto fallback;
> >>
> >> No, this isn't correct. This loop is looking for the *best* group it can find,
> >> and your "break" would have it exit the loop as soon as the *first* group
> >> that matches the conditions is found. Since those conditions are fairly weak,
> >> for example that the group actually has free inodes, and that it has better
> >> than average free inodes and blocks, it makes sense to search beyond just the
> >> first matching group.
> >>
> >> That said, it also doesn't make sense to search beyond a "perfect" group that
> >> has no allocated inodes and no allocated blocks, so a break condition could be
> >> added to this loop to make it more efficient, especially for very large
> >> filesystems that have 128k+ groups.
> >>
> >> It should be noted that this part of the algorithm only applies to "top level"
> >> directories (those below the root inode, or with the EXT4_INODE_TOPDIR flag
> >> set), so searching a bit longer for a good group is not a bad idea in this
> >> case.
> >>
> >> Cheers, Andreas.
> >>
> >>> Thanks & Regards,
> >>> Lokesh
> >>>
> >>> On Thu, Dec 3, 2015 at 11:28 PM, Andreas Dilger <adilger@xxxxxxxxx> wrote:
> >>>> On Dec 3, 2015, at 01:07, lokesh jaliminche <lokesh.jaliminche@xxxxxxxxx> wrote:
> >>>>>
> >>>>> Thought of giving more clarification on my question:
> >>>>> why is the group search start random? Because we could also start
> >>>>> the search for valid groups for inode allocation from the beginning.
> >>>>> Since this group search is random, the inode selection might end up
> >>>>> near the end of the groups, which might affect I/O performance.
> >>>>
> >>>> Starting the inode search at the beginning of the disk each time
> >>>> means that inode allocation will be inefficient, because it will
> >>>> search over groups that are mostly or entirely full already.
> >>>>
> >>>> Allocating the new directory in a semi-random group, one that is
> >>>> relatively unused, ensures that new inode and block allocations are
> >>>> relatively efficient afterward.
> >>>>
> >>>> Cheers, Andreas
> >>>>
> >>>>> On Thu, Dec 3, 2015 at 1:14 PM, lokesh jaliminche
> >>>>> <lokesh.jaliminche@xxxxxxxxx> wrote:
> >>>>>> hello folks,
> >>>>>>              I am new to the ext4 code. I was going through the
> >>>>>> ext4 source for allocation of inodes.
> >>>>>> There is one thing that I did not understand about the selection of
> >>>>>> groups for inode allocation. I came across this code snippet, which
> >>>>>> is part of the find_group_orlov function. The question is, why does
> >>>>>> the group search start at a random group?
> >>>>>>
> >>>>>> Code snippet:
> >>>>>> ==========
> >>>>>>     if (qstr) {
> >>>>>>             hinfo.hash_version = LDISKFS_DX_HASH_HALF_MD4;
> >>>>>>             hinfo.seed = sbi->s_hash_seed;
> >>>>>>             ldiskfsfs_dirhash(qstr->name, qstr->len, &hinfo);
> >>>>>>             grp = hinfo.hash;
> >>>>>>     } else
> >>>>>>             get_random_bytes(&grp, sizeof(grp));
> >>>>>>     parent_group = (unsigned)grp % ngroups;
> >>>>>>     for (i = 0; i < ngroups; i++) {
> >>>>>>             g = (parent_group + i) % ngroups;
> >>>>>>             get_orlov_stats(sb, g, flex_size, &stats);
> >>>>>>             if (!stats.free_inodes)
> >>>>>>                     continue;
> >>>>>>             if (stats.used_dirs >= best_ndir)
> >>>>>>                     continue;
> >>>>>>             if (stats.free_inodes < avefreei)
> >>>>>>                     continue;
> >>>>>>             if (stats.free_blocks < avefreeb)
> >>>>>>                     continue;
> >>>>>>             grp = g;
> >>>>>>             ret = 0;
> >>>>>>             best_ndir = stats.used_dirs;
> >>>>>>     }
> >>>>>>
> >>>>>> Thanks & Regards,
> >>>>>> Lokesh
> >>
> >> Cheers, Andreas
> >
> Cheers, Andreas
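For reference, here is a minimal, self-contained user-space sketch of the
search strategy discussed above, including the proposed "can't get any
better than empty" early exit. The struct, function name, and parameters
are simplified assumptions for illustration only; the kernel's
find_group_orlov() works on live group descriptors and flex_bg counters
rather than a plain in-memory array.

        #include <stddef.h>

        /* simplified per-group statistics, stand-in for get_orlov_stats() output */
        struct orlov_stats {
                unsigned int free_inodes;
                unsigned int free_clusters;
                unsigned int used_dirs;
        };

        /*
         * Pick a group for a new top-level directory, starting at parent_group
         * and wrapping around.  Returns the chosen group, or (size_t)-1 if no
         * group satisfies the criteria.
         */
        static size_t find_group(const struct orlov_stats *stats, size_t ngroups,
                                 size_t parent_group,
                                 unsigned int inodes_per_group,
                                 unsigned int clusters_per_group,
                                 unsigned int avefreei, unsigned int avefreec)
        {
                size_t best = (size_t)-1;
                unsigned int best_ndir = ~0u;
                size_t i;

                for (i = 0; i < ngroups; i++) {
                        size_t g = (parent_group + i) % ngroups;
                        const struct orlov_stats *s = &stats[g];

                        /* the group can't get any better than empty: stop searching */
                        if (s->free_inodes == inodes_per_group &&
                            s->free_clusters == clusters_per_group)
                                return g;

                        if (!s->free_inodes)
                                continue;       /* no room for the inode itself */
                        if (s->used_dirs >= best_ndir)
                                continue;       /* not better than the current best */
                        if (s->free_inodes < avefreei)
                                continue;       /* below-average free inodes */
                        if (s->free_clusters < avefreec)
                                continue;       /* below-average free clusters */

                        /* better than anything seen so far, but keep looking */
                        best = g;
                        best_ndir = s->used_dirs;
                }
                return best;    /* (size_t)-1 if nothing qualified */
        }

A caller would fill the stats array from whatever accounting it has and
pass the (randomly or hash-chosen) parent group as the starting point,
mirroring the parent_group selection shown in the thread; the early
return is what saves the full scan on large filesystems when a
completely empty group is found.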