Re: [PATCH 1/5] ext4: Avoid races caused by on-line resizing and SMP memory reordering

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> Ext4's on-line resizing adds a new block group and then, only at the
> last step adjusts s_groups_count.  However, it's possible on SMP
> systems that another CPU could see the updated the s_group_count and
> not see the newly initialized data structures for the just-added block
> group.  For this reason, it's important to insert a SMP read barrier
> after reading s_groups_count and before reading any, say, block group
> descriptors allowed by the block group count.
> 
> Unfortunately, we rather blatently violate this locking protocol as
> documented in fs/ext4/resize.c.  Fortunately, (1) on-line resizes
> happen relatively rarely, and (2) it seems rare that the filesystem
> code will immediately try to use just-added block group before any
> memory ordering issues resolve themselves.  So apparently problems
> here are relatively hard to hit, since ext3 also is vulnerable to this
> race and no one has apparently complained.
  Ouch... Hmm, smp_rmb() isn't completely free and mainly it's a bit
ugly and prone to errors (I'm afraid next time someone changes the
allocation code, we miss some barriers again)... so.. Maybe a stupid
idea but wouldn't it be easier to solve the online resize like: freeze
the filesystem, do all the changes required for extend, unfreeze the
filesystem?
  I guess the resize code might get simpler as well with this.

								Honza
> Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx>
> ---
>  fs/ext4/balloc.c  |    6 ++++--
>  fs/ext4/ialloc.c  |   34 +++++++++++++++++++++-------------
>  fs/ext4/mballoc.c |   49 ++++++++++++++++++++++++++++++++-----------------
>  3 files changed, 57 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 53c72ad..d1615f2 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -88,9 +88,11 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
>  		 ext4_group_t block_group, struct ext4_group_desc *gdp)
>  {
>  	int bit, bit_max;
> +	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
>  	unsigned free_blocks, group_blocks;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	if (bh) {
>  		J_ASSERT_BH(bh, buffer_locked(bh));
>  
> @@ -123,7 +125,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
>  		bit_max += ext4_bg_num_gdb(sb, block_group);
>  	}
>  
> -	if (block_group == sbi->s_groups_count - 1) {
> +	if (block_group == ngroups - 1) {
>  		/*
>  		 * Even though mke2fs always initialize first and last group
>  		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
> @@ -131,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
>  		 */
>  		group_blocks = ext4_blocks_count(sbi->s_es) -
>  			le32_to_cpu(sbi->s_es->s_first_data_block) -
> -			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
> +			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
>  	} else {
>  		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
>  	}
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index f18e0a0..52ce274 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -322,6 +322,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
>  	ext4_group_t group;
>  	int ret = -1;
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
>  	avefreei = freei / ngroups;
>  
> @@ -362,7 +363,8 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
>  	ext4_group_t n_fbg_groups;
>  	ext4_group_t i;
>  
> -	n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
> +	smp_rmb();		/* after reading s_groups_count first */
> +	n_fbg_groups = (ngroups + flex_size - 1) >>
>  		sbi->s_log_groups_per_flex;
>  
>  find_close_to_parent:
> @@ -478,18 +480,18 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> -	ext4_group_t ngroups = sbi->s_groups_count;
>  	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
>  	unsigned int freei, avefreei;
>  	ext4_fsblk_t freeb, avefreeb;
>  	unsigned int ndirs;
>  	int max_dirs, min_inodes;
>  	ext4_grpblk_t min_blocks;
> -	ext4_group_t i, grp, g;
> +	ext4_group_t i, grp, g, ngroups = sbi->s_groups_count;;
>  	struct ext4_group_desc *desc;
>  	struct orlov_stats stats;
>  	int flex_size = ext4_flex_bg_size(sbi);
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	if (flex_size > 1) {
>  		ngroups = (ngroups + flex_size - 1) >>
>  			sbi->s_log_groups_per_flex;
> @@ -585,6 +587,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
>  fallback:
>  	ngroups = sbi->s_groups_count;
>  	avefreei = freei / ngroups;
> +	smp_rmb();
>  fallback_retry:
>  	parent_group = EXT4_I(parent)->i_block_group;
>  	for (i = 0; i < ngroups; i++) {
> @@ -613,11 +616,11 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
>  			    ext4_group_t *group, int mode)
>  {
>  	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
> -	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
> +	ext4_group_t i, last, ngroups = EXT4_SB(sb)->s_groups_count;
>  	struct ext4_group_desc *desc;
> -	ext4_group_t i, last;
>  	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	/*
>  	 * Try to place the inode is the same flex group as its
>  	 * parent.  If we can't find space, use the Orlov algorithm to
> @@ -799,7 +802,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
>  	struct super_block *sb;
>  	struct buffer_head *inode_bitmap_bh = NULL;
>  	struct buffer_head *group_desc_bh;
> -	ext4_group_t group = 0;
> +	ext4_group_t ngroups, group = 0;
>  	unsigned long ino = 0;
>  	struct inode *inode;
>  	struct ext4_group_desc *gdp = NULL;
> @@ -851,12 +854,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
>  		ret2 = find_group_other(sb, dir, &group, mode);
>  
>  got_group:
> +	ngroups = sbi->s_groups_count;
> +	smp_rmb();
>  	EXT4_I(dir)->i_last_alloc_group = group;
>  	err = -ENOSPC;
>  	if (ret2 == -1)
>  		goto out;
>  
> -	for (i = 0; i < sbi->s_groups_count; i++) {
> +	for (i = 0; i < ngroups; i++) {
>  		err = -EIO;
>  
>  		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
> @@ -917,7 +922,7 @@ repeat_in_this_group:
>  		 * group descriptor metadata has not yet been updated.
>  		 * So we just go onto the next blockgroup.
>  		 */
> -		if (++group == sbi->s_groups_count)
> +		if (++group == ngroups)
>  			group = 0;
>  	}
>  	err = -ENOSPC;
> @@ -1158,17 +1163,18 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
>  {
>  	unsigned long desc_count;
>  	struct ext4_group_desc *gdp;
> -	ext4_group_t i;
> +	ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;
>  #ifdef EXT4FS_DEBUG
>  	struct ext4_super_block *es;
>  	unsigned long bitmap_count, x;
>  	struct buffer_head *bitmap_bh = NULL;
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	es = EXT4_SB(sb)->s_es;
>  	desc_count = 0;
>  	bitmap_count = 0;
>  	gdp = NULL;
> -	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
> +	for (i = 0; i < ngroups; i++) {
>  		gdp = ext4_get_group_desc(sb, i, NULL);
>  		if (!gdp)
>  			continue;
> @@ -1189,8 +1195,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
>  	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
>  	return desc_count;
>  #else
> +	smp_rmb();		/* after reading s_groups_count first */
>  	desc_count = 0;
> -	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
> +	for (i = 0; i < ngroups; i++) {
>  		gdp = ext4_get_group_desc(sb, i, NULL);
>  		if (!gdp)
>  			continue;
> @@ -1205,9 +1212,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
>  unsigned long ext4_count_dirs(struct super_block * sb)
>  {
>  	unsigned long count = 0;
> -	ext4_group_t i;
> +	ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;
>  
> -	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
> +	smp_rmb();		/* after reading s_groups_count first */
> +	for (i = 0; i < ngroups; i++) {
>  		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
>  		if (!gdp)
>  			continue;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index f871677..ecc2d49 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
>  
>  static int ext4_mb_init_cache(struct page *page, char *incore)
>  {
> +	ext4_group_t ngroups;
>  	int blocksize;
>  	int blocks_per_page;
>  	int groups_per_page;
> @@ -757,6 +758,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
>  
>  	inode = page->mapping->host;
>  	sb = inode->i_sb;
> +	ngroups = EXT4_SB(sb)->s_groups_count;
> +	smp_rmb();
>  	blocksize = 1 << inode->i_blkbits;
>  	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
>  
> @@ -780,7 +783,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
>  	for (i = 0; i < groups_per_page; i++) {
>  		struct ext4_group_desc *desc;
>  
> -		if (first_group + i >= EXT4_SB(sb)->s_groups_count)
> +		if (first_group + i >= ngroups)
>  			break;
>  
>  		err = -EIO;
> @@ -852,7 +855,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
>  		struct ext4_group_info *grinfo;
>  
>  		group = (first_block + i) >> 1;
> -		if (group >= EXT4_SB(sb)->s_groups_count)
> +		if (group >= ngroups)
>  			break;
>  
>  		/*
> @@ -1788,9 +1791,11 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
>  	int block, pnum;
>  	int blocks_per_page;
>  	int groups_per_page;
> +	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
>  	ext4_group_t first_group;
>  	struct ext4_group_info *grp;
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
>  	/*
>  	 * the buddy cache inode stores the block bitmap
> @@ -1807,7 +1812,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
>  	/* read all groups the page covers into the cache */
>  	for (i = 0; i < groups_per_page; i++) {
>  
> -		if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
> +		if ((first_group + i) >= ngroups)
>  			break;
>  		grp = ext4_get_group_info(sb, first_group + i);
>  		/* take all groups write allocation
> @@ -1945,8 +1950,7 @@ err:
>  static noinline_for_stack int
>  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>  {
> -	ext4_group_t group;
> -	ext4_group_t i;
> +	ext4_group_t ngroups, group, i;
>  	int cr;
>  	int err = 0;
>  	int bsbits;
> @@ -1957,6 +1961,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>  
>  	sb = ac->ac_sb;
>  	sbi = EXT4_SB(sb);
> +	ngroups = EXT4_SB(sb)->s_groups_count;
> +	smp_rmb();
>  	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
>  
>  	/* first, try the goal */
> @@ -2017,11 +2023,11 @@ repeat:
>  		 */
>  		group = ac->ac_g_ex.fe_group;
>  
> -		for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
> +		for (i = 0; i < ngroups; group++, i++) {
>  			struct ext4_group_info *grp;
>  			struct ext4_group_desc *desc;
>  
> -			if (group == EXT4_SB(sb)->s_groups_count)
> +			if (group == ngroups)
>  				group = 0;
>  
>  			/* quick check to skip empty groups */
> @@ -2320,7 +2326,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
>  
>  	if (*pos < 0 || *pos >= sbi->s_groups_count)
>  		return NULL;
> -
> +	smp_rmb();
>  	group = *pos + 1;
>  	return (void *) ((unsigned long) group);
>  }
> @@ -2334,6 +2340,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
>  	++*pos;
>  	if (*pos < 0 || *pos >= sbi->s_groups_count)
>  		return NULL;
> +	smp_rmb();
>  	group = *pos + 1;
>  	return (void *) ((unsigned long) group);
>  }
> @@ -2587,6 +2594,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
>  
>  static int ext4_mb_init_backend(struct super_block *sb)
>  {
> +	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
>  	ext4_group_t i;
>  	int metalen;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -2597,8 +2605,10 @@ static int ext4_mb_init_backend(struct super_block *sb)
>  	struct ext4_group_info **meta_group_info;
>  	struct ext4_group_desc *desc;
>  
> +	smp_rmb();		/* after reading s_groups_count first */
> +
>  	/* This is the number of blocks used by GDT */
> -	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
> +	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
>  				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
>  
>  	/*
> @@ -2644,7 +2654,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
>  	for (i = 0; i < num_meta_group_infos; i++) {
>  		if ((i + 1) == num_meta_group_infos)
>  			metalen = sizeof(*meta_group_info) *
> -				(sbi->s_groups_count -
> +				(ngroups -
>  					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
>  		meta_group_info = kmalloc(metalen, GFP_KERNEL);
>  		if (meta_group_info == NULL) {
> @@ -2655,7 +2665,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
>  		sbi->s_group_info[i] = meta_group_info;
>  	}
>  
> -	for (i = 0; i < sbi->s_groups_count; i++) {
> +	for (i = 0; i < ngroups; i++) {
>  		desc = ext4_get_group_desc(sb, i, NULL);
>  		if (desc == NULL) {
>  			printk(KERN_ERR
> @@ -2781,13 +2791,15 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
>  
>  int ext4_mb_release(struct super_block *sb)
>  {
> +	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
>  	ext4_group_t i;
>  	int num_meta_group_infos;
>  	struct ext4_group_info *grinfo;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	if (sbi->s_group_info) {
> -		for (i = 0; i < sbi->s_groups_count; i++) {
> +		for (i = 0; i < ngroups; i++) {
>  			grinfo = ext4_get_group_info(sb, i);
>  #ifdef DOUBLE_CHECK
>  			kfree(grinfo->bb_bitmap);
> @@ -2797,7 +2809,7 @@ int ext4_mb_release(struct super_block *sb)
>  			ext4_unlock_group(sb, i);
>  			kfree(grinfo);
>  		}
> -		num_meta_group_infos = (sbi->s_groups_count +
> +		num_meta_group_infos = (ngroups +
>  				EXT4_DESC_PER_BLOCK(sb) - 1) >>
>  			EXT4_DESC_PER_BLOCK_BITS(sb);
>  		for (i = 0; i < num_meta_group_infos; i++)
> @@ -4121,7 +4133,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
>  static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
>  {
>  	struct super_block *sb = ac->ac_sb;
> -	ext4_group_t i;
> +	ext4_group_t ngroups, i;
>  
>  	printk(KERN_ERR "EXT4-fs: Can't allocate:"
>  			" Allocation context details:\n");
> @@ -4145,7 +4157,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
>  	printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
>  		ac->ac_found);
>  	printk(KERN_ERR "EXT4-fs: groups: \n");
> -	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
> +	ngroups = EXT4_SB(ac->ac_sb)->s_groups_count;
> +	smp_rmb();
> +	for (i = 0; i < EXT4_SB(sb)->ngroups; i++) {
>  		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
>  		struct ext4_prealloc_space *pa;
>  		ext4_grpblk_t start;
> @@ -4469,13 +4483,14 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
>  
>  static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
>  {
> -	ext4_group_t i;
> +	ext4_group_t i, ngroups = EXT4_SB(sb)->s_groups_count;
>  	int ret;
>  	int freed = 0;
>  
> +	smp_rmb();		/* after reading s_groups_count first */
>  	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
>  		   sb->s_id, needed);
> -	for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
> +	for (i = 0; i < ngroups && needed > 0; i++) {
>  		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
>  		freed += ret;
>  		needed -= ret;
> -- 
> 1.5.6.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@xxxxxxx>
SuSE CR Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux