> Ext4's on-line resizing adds a new block group and then, only at the > last step adjusts s_groups_count. However, it's possible on SMP > systems that another CPU could see the updated the s_group_count and > not see the newly initialized data structures for the just-added block > group. For this reason, it's important to insert a SMP read barrier > after reading s_groups_count and before reading any, say, block group > descriptors allowed by the block group count. > > Unfortunately, we rather blatently violate this locking protocol as > documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes > happen relatively rarely, and (2) it seems rare that the filesystem > code will immediately try to use just-added block group before any > memory ordering issues resolve themselves. So apparently problems > here are relatively hard to hit, since ext3 also is vulnerable to this > race and no one has apparently complained. Ouch... Hmm, smp_rmb() isn't completely free and mainly it's a bit ugly and prone to errors (I'm afraid next time someone changes the allocation code, we miss some barriers again)... so.. Maybe a stupid idea but wouldn't it be easier to solve the online resize like: freeze the filesystem, do all the changes required for extend, unfreeze the filesystem? I guess the resize code might get simpler as well with this. Honza > Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx> > --- > fs/ext4/balloc.c | 6 ++++-- > fs/ext4/ialloc.c | 34 +++++++++++++++++++++------------- > fs/ext4/mballoc.c | 49 ++++++++++++++++++++++++++++++++----------------- > 3 files changed, 57 insertions(+), 32 deletions(-) > > diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c > index 53c72ad..d1615f2 100644 > --- a/fs/ext4/balloc.c > +++ b/fs/ext4/balloc.c > @@ -88,9 +88,11 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, > ext4_group_t block_group, struct ext4_group_desc *gdp) > { > int bit, bit_max; > + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > unsigned free_blocks, group_blocks; > struct ext4_sb_info *sbi = EXT4_SB(sb); > > + smp_rmb(); /* after reading s_groups_count first */ > if (bh) { > J_ASSERT_BH(bh, buffer_locked(bh)); > > @@ -123,7 +125,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, > bit_max += ext4_bg_num_gdb(sb, block_group); > } > > - if (block_group == sbi->s_groups_count - 1) { > + if (block_group == ngroups - 1) { > /* > * Even though mke2fs always initialize first and last group > * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need > @@ -131,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, > */ > group_blocks = ext4_blocks_count(sbi->s_es) - > le32_to_cpu(sbi->s_es->s_first_data_block) - > - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); > + (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); > } else { > group_blocks = EXT4_BLOCKS_PER_GROUP(sb); > } > diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c > index f18e0a0..52ce274 100644 > --- a/fs/ext4/ialloc.c > +++ b/fs/ext4/ialloc.c > @@ -322,6 +322,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, > ext4_group_t group; > int ret = -1; > > + smp_rmb(); /* after reading s_groups_count first */ > freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter); > avefreei = freei / ngroups; > > @@ -362,7 +363,8 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, > ext4_group_t n_fbg_groups; > ext4_group_t i; > > - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> > + smp_rmb(); /* after reading s_groups_count first */ > + n_fbg_groups = (ngroups + flex_size - 1) >> > sbi->s_log_groups_per_flex; > > find_close_to_parent: > @@ -478,18 +480,18 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > struct ext4_sb_info *sbi = EXT4_SB(sb); > - ext4_group_t ngroups = sbi->s_groups_count; > int inodes_per_group = EXT4_INODES_PER_GROUP(sb); > unsigned int freei, avefreei; > ext4_fsblk_t freeb, avefreeb; > unsigned int ndirs; > int max_dirs, min_inodes; > ext4_grpblk_t min_blocks; > - ext4_group_t i, grp, g; > + ext4_group_t i, grp, g, ngroups = sbi->s_groups_count;; > struct ext4_group_desc *desc; > struct orlov_stats stats; > int flex_size = ext4_flex_bg_size(sbi); > > + smp_rmb(); /* after reading s_groups_count first */ > if (flex_size > 1) { > ngroups = (ngroups + flex_size - 1) >> > sbi->s_log_groups_per_flex; > @@ -585,6 +587,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, > fallback: > ngroups = sbi->s_groups_count; > avefreei = freei / ngroups; > + smp_rmb(); > fallback_retry: > parent_group = EXT4_I(parent)->i_block_group; > for (i = 0; i < ngroups; i++) { > @@ -613,11 +616,11 @@ static int find_group_other(struct super_block *sb, struct inode *parent, > ext4_group_t *group, int mode) > { > ext4_group_t parent_group = EXT4_I(parent)->i_block_group; > - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > + ext4_group_t i, last, ngroups = EXT4_SB(sb)->s_groups_count; > struct ext4_group_desc *desc; > - ext4_group_t i, last; > int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); > > + smp_rmb(); /* after reading s_groups_count first */ > /* > * Try to place the inode is the same flex group as its > * parent. If we can't find space, use the Orlov algorithm to > @@ -799,7 +802,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) > struct super_block *sb; > struct buffer_head *inode_bitmap_bh = NULL; > struct buffer_head *group_desc_bh; > - ext4_group_t group = 0; > + ext4_group_t ngroups, group = 0; > unsigned long ino = 0; > struct inode *inode; > struct ext4_group_desc *gdp = NULL; > @@ -851,12 +854,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) > ret2 = find_group_other(sb, dir, &group, mode); > > got_group: > + ngroups = sbi->s_groups_count; > + smp_rmb(); > EXT4_I(dir)->i_last_alloc_group = group; > err = -ENOSPC; > if (ret2 == -1) > goto out; > > - for (i = 0; i < sbi->s_groups_count; i++) { > + for (i = 0; i < ngroups; i++) { > err = -EIO; > > gdp = ext4_get_group_desc(sb, group, &group_desc_bh); > @@ -917,7 +922,7 @@ repeat_in_this_group: > * group descriptor metadata has not yet been updated. > * So we just go onto the next blockgroup. > */ > - if (++group == sbi->s_groups_count) > + if (++group == ngroups) > group = 0; > } > err = -ENOSPC; > @@ -1158,17 +1163,18 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) > { > unsigned long desc_count; > struct ext4_group_desc *gdp; > - ext4_group_t i; > + ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count; > #ifdef EXT4FS_DEBUG > struct ext4_super_block *es; > unsigned long bitmap_count, x; > struct buffer_head *bitmap_bh = NULL; > > + smp_rmb(); /* after reading s_groups_count first */ > es = EXT4_SB(sb)->s_es; > desc_count = 0; > bitmap_count = 0; > gdp = NULL; > - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { > + for (i = 0; i < ngroups; i++) { > gdp = ext4_get_group_desc(sb, i, NULL); > if (!gdp) > continue; > @@ -1189,8 +1195,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) > le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); > return desc_count; > #else > + smp_rmb(); /* after reading s_groups_count first */ > desc_count = 0; > - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { > + for (i = 0; i < ngroups; i++) { > gdp = ext4_get_group_desc(sb, i, NULL); > if (!gdp) > continue; > @@ -1205,9 +1212,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) > unsigned long ext4_count_dirs(struct super_block * sb) > { > unsigned long count = 0; > - ext4_group_t i; > + ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count; > > - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { > + smp_rmb(); /* after reading s_groups_count first */ > + for (i = 0; i < ngroups; i++) { > struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); > if (!gdp) > continue; > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c > index f871677..ecc2d49 100644 > --- a/fs/ext4/mballoc.c > +++ b/fs/ext4/mballoc.c > @@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, > > static int ext4_mb_init_cache(struct page *page, char *incore) > { > + ext4_group_t ngroups; > int blocksize; > int blocks_per_page; > int groups_per_page; > @@ -757,6 +758,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) > > inode = page->mapping->host; > sb = inode->i_sb; > + ngroups = EXT4_SB(sb)->s_groups_count; > + smp_rmb(); > blocksize = 1 << inode->i_blkbits; > blocks_per_page = PAGE_CACHE_SIZE / blocksize; > > @@ -780,7 +783,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) > for (i = 0; i < groups_per_page; i++) { > struct ext4_group_desc *desc; > > - if (first_group + i >= EXT4_SB(sb)->s_groups_count) > + if (first_group + i >= ngroups) > break; > > err = -EIO; > @@ -852,7 +855,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) > struct ext4_group_info *grinfo; > > group = (first_block + i) >> 1; > - if (group >= EXT4_SB(sb)->s_groups_count) > + if (group >= ngroups) > break; > > /* > @@ -1788,9 +1791,11 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) > int block, pnum; > int blocks_per_page; > int groups_per_page; > + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > ext4_group_t first_group; > struct ext4_group_info *grp; > > + smp_rmb(); /* after reading s_groups_count first */ > blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; > /* > * the buddy cache inode stores the block bitmap > @@ -1807,7 +1812,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) > /* read all groups the page covers into the cache */ > for (i = 0; i < groups_per_page; i++) { > > - if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) > + if ((first_group + i) >= ngroups) > break; > grp = ext4_get_group_info(sb, first_group + i); > /* take all groups write allocation > @@ -1945,8 +1950,7 @@ err: > static noinline_for_stack int > ext4_mb_regular_allocator(struct ext4_allocation_context *ac) > { > - ext4_group_t group; > - ext4_group_t i; > + ext4_group_t ngroups, group, i; > int cr; > int err = 0; > int bsbits; > @@ -1957,6 +1961,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) > > sb = ac->ac_sb; > sbi = EXT4_SB(sb); > + ngroups = EXT4_SB(sb)->s_groups_count; > + smp_rmb(); > BUG_ON(ac->ac_status == AC_STATUS_FOUND); > > /* first, try the goal */ > @@ -2017,11 +2023,11 @@ repeat: > */ > group = ac->ac_g_ex.fe_group; > > - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { > + for (i = 0; i < ngroups; group++, i++) { > struct ext4_group_info *grp; > struct ext4_group_desc *desc; > > - if (group == EXT4_SB(sb)->s_groups_count) > + if (group == ngroups) > group = 0; > > /* quick check to skip empty groups */ > @@ -2320,7 +2326,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) > > if (*pos < 0 || *pos >= sbi->s_groups_count) > return NULL; > - > + smp_rmb(); > group = *pos + 1; > return (void *) ((unsigned long) group); > } > @@ -2334,6 +2340,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) > ++*pos; > if (*pos < 0 || *pos >= sbi->s_groups_count) > return NULL; > + smp_rmb(); > group = *pos + 1; > return (void *) ((unsigned long) group); > } > @@ -2587,6 +2594,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) > > static int ext4_mb_init_backend(struct super_block *sb) > { > + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > ext4_group_t i; > int metalen; > struct ext4_sb_info *sbi = EXT4_SB(sb); > @@ -2597,8 +2605,10 @@ static int ext4_mb_init_backend(struct super_block *sb) > struct ext4_group_info **meta_group_info; > struct ext4_group_desc *desc; > > + smp_rmb(); /* after reading s_groups_count first */ > + > /* This is the number of blocks used by GDT */ > - num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - > + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - > 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); > > /* > @@ -2644,7 +2654,7 @@ static int ext4_mb_init_backend(struct super_block *sb) > for (i = 0; i < num_meta_group_infos; i++) { > if ((i + 1) == num_meta_group_infos) > metalen = sizeof(*meta_group_info) * > - (sbi->s_groups_count - > + (ngroups - > (i << EXT4_DESC_PER_BLOCK_BITS(sb))); > meta_group_info = kmalloc(metalen, GFP_KERNEL); > if (meta_group_info == NULL) { > @@ -2655,7 +2665,7 @@ static int ext4_mb_init_backend(struct super_block *sb) > sbi->s_group_info[i] = meta_group_info; > } > > - for (i = 0; i < sbi->s_groups_count; i++) { > + for (i = 0; i < ngroups; i++) { > desc = ext4_get_group_desc(sb, i, NULL); > if (desc == NULL) { > printk(KERN_ERR > @@ -2781,13 +2791,15 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) > > int ext4_mb_release(struct super_block *sb) > { > + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; > ext4_group_t i; > int num_meta_group_infos; > struct ext4_group_info *grinfo; > struct ext4_sb_info *sbi = EXT4_SB(sb); > > + smp_rmb(); /* after reading s_groups_count first */ > if (sbi->s_group_info) { > - for (i = 0; i < sbi->s_groups_count; i++) { > + for (i = 0; i < ngroups; i++) { > grinfo = ext4_get_group_info(sb, i); > #ifdef DOUBLE_CHECK > kfree(grinfo->bb_bitmap); > @@ -2797,7 +2809,7 @@ int ext4_mb_release(struct super_block *sb) > ext4_unlock_group(sb, i); > kfree(grinfo); > } > - num_meta_group_infos = (sbi->s_groups_count + > + num_meta_group_infos = (ngroups + > EXT4_DESC_PER_BLOCK(sb) - 1) >> > EXT4_DESC_PER_BLOCK_BITS(sb); > for (i = 0; i < num_meta_group_infos; i++) > @@ -4121,7 +4133,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, > static void ext4_mb_show_ac(struct ext4_allocation_context *ac) > { > struct super_block *sb = ac->ac_sb; > - ext4_group_t i; > + ext4_group_t ngroups, i; > > printk(KERN_ERR "EXT4-fs: Can't allocate:" > " Allocation context details:\n"); > @@ -4145,7 +4157,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) > printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, > ac->ac_found); > printk(KERN_ERR "EXT4-fs: groups: \n"); > - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { > + ngroups = EXT4_SB(ac->ac_sb)->s_groups_count; > + smp_rmb(); > + for (i = 0; i < EXT4_SB(sb)->ngroups; i++) { > struct ext4_group_info *grp = ext4_get_group_info(sb, i); > struct ext4_prealloc_space *pa; > ext4_grpblk_t start; > @@ -4469,13 +4483,14 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) > > static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) > { > - ext4_group_t i; > + ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count; > int ret; > int freed = 0; > > + smp_rmb(); /* after reading s_groups_count first */ > trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", > sb->s_id, needed); > - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { > + for (i = 0; i < ngroups && needed > 0; i++) { > ret = ext4_mb_discard_group_preallocations(sb, i, needed); > freed += ret; > needed -= ret; > -- > 1.5.6.3 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Jan Kara <jack@xxxxxxx> SuSE CR Labs -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html