Re: [PATCH] ext4: always pre-allocate max depth for path

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Jun 16, 2016, at 10:53 AM, Faccini, Bruno <bruno.faccini@xxxxxxxxx> wrote:
> 
> From: Bruno Faccini <bruno.faccini@xxxxxxxxx>
> 
> I have first found this way to fix holes in previous ext4 layers versions
> where an array of struct ext4_ext_path had been allocated with an arbitrary
> evaluated size and finally could overrun upon ext_depth() growth outside
> i_data_sem protection. But it seems it can still help with recent ext4
> version, to avoid re-allocation need and overhead when it can be allocated
> to max possible extent depth (ie, 5 presently) at first time and for a low
> cost regarding its memory foot-print, it should also avoid further invalid
> dereference by underlying callers sharing same ppath (with present
> inter-routines path re-use scheme), and also upon re-allocation error.
> 
> Signed-off-by: Bruno Faccini <bruno.faccini@xxxxxxxxx>
> ---
> fs/ext4/ext4.h        |    3 +
> fs/ext4/extents.c     |  113 ++++++++++++++++++++++++++--------------
> fs/ext4/move_extent.c |   10 +--
> fs/ext4/super.c       |    3 -
> 4 files changed, 85 insertions(+), 44 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 1e20fa9..73d5866 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1340,6 +1340,9 @@ struct ext4_sb_info {
> 	unsigned long s_ext_extents;
> #endif
> 
> +	/* maximum possible extents tree depth, to be computed at mount time */
> +	unsigned int s_max_ext_tree_depth;
> +
> 	/* for buddy allocator */
> 	struct ext4_group_info ***s_group_info;
> 	struct inode *s_buddy_cache;
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index b52fea3..aee676d 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -871,19 +871,18 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
> 
> 	if (path) {
> 		ext4_ext_drop_refs(path);
> -		if (depth > path[0].p_maxdepth) {
> -			kfree(path);
> -			*orig_path = path = NULL;
> -		}
> -	}
> -	if (!path) {
> +		/* path has been allocated for the max possible depth
> +		 * so we can simply reuse it */
> +		/* XXX need to clear/zero old path content? */
> +	} else {
> 		/* account possible depth increase */
> -		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
> +		path = kzalloc(sizeof(struct ext4_ext_path) *
> +				EXT4_SB(inode->i_sb)->s_max_ext_tree_depth,
> 				GFP_NOFS);
> 		if (unlikely(!path))
> 			return ERR_PTR(-ENOMEM);
> -		path[0].p_maxdepth = depth + 1;
> 	}
> +	path[0].p_maxdepth = depth + 1;
> 	path[0].p_hdr = eh;
> 	path[0].p_bh = NULL;
> 
> @@ -934,9 +933,11 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
> 
> err:
> 	ext4_ext_drop_refs(path);
> -	kfree(path);
> -	if (orig_path)
> -		*orig_path = NULL;
> +
> +	/* do not free *orig_path, it is likely to be referenced by callers */
> +	if (!orig_path)
> +		kfree(path);
> +
> 	return ERR_PTR(ret);
> }
> 
> @@ -2147,7 +2148,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> 				    ext4_lblk_t block, ext4_lblk_t num,
> 				    struct fiemap_extent_info *fieinfo)
> {
> -	struct ext4_ext_path *path = NULL;
> +	struct ext4_ext_path *path = NULL, *orig_path = NULL;
> +	struct ext4_ext_path **orig_ppath = &orig_path;
> 	struct ext4_extent *ex;
> 	struct extent_status es;
> 	ext4_lblk_t next, next_del, start = 0, end = 0;
> @@ -2161,13 +2163,16 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> 		/* find extent for this block */
> 		down_read(&EXT4_I(inode)->i_data_sem);
> 
> -		path = ext4_find_extent(inode, block, &path, 0);
> +		path = ext4_find_extent(inode, block, orig_ppath, 0);
> 		if (IS_ERR(path)) {
> 			up_read(&EXT4_I(inode)->i_data_sem);
> 			err = PTR_ERR(path);
> 			path = NULL;
> 			break;
> 		}
> +		if (!orig_path)
> +			orig_path = path;
> +		/* XXX else BUG_ON(path != orig_path); */
> 
> 		depth = ext_depth(inode);
> 		if (unlikely(path[depth].p_hdr == NULL)) {
> @@ -2287,8 +2292,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
> 		block = es.es_lblk + es.es_len;
> 	}
> 
> -	ext4_ext_drop_refs(path);
> -	kfree(path);
> +	ext4_ext_drop_refs(orig_path);
> +	kfree(orig_path);
> 	return err;
> }
> 
> @@ -2910,7 +2915,8 @@ again:
> 			path[k].p_block =
> 				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
> 	} else {
> -		path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
> +		path = kzalloc(sizeof(struct ext4_ext_path) *
> +			       EXT4_SB(inode->i_sb)->s_max_ext_tree_depth,
> 			       GFP_NOFS);
> 		if (path == NULL) {
> 			ext4_journal_stop(handle);
> @@ -3051,13 +3057,14 @@ out:
>  */
> void ext4_ext_init(struct super_block *sb)
> {
> +	ext4_fsblk_t maxblocks;
> +
> 	/*
> 	 * possible initialization would be here
> 	 */
> 
> 	if (ext4_has_feature_extents(sb)) {
> -#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)

This debugging info shouldn't be printed for every mount.  This should be
kept under the original #ifdef:

#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || \
    defined(EXTENTS_STATS) || defined(EXT_DEBUG)


> -		printk(KERN_INFO "EXT4-fs: file extents enabled"
> +		printk(KERN_INFO "EXT4-fs (%s): file extents enabled"
> #ifdef AGGRESSIVE_TEST
> 		       ", aggressive tests"
> #endif
> @@ -3067,8 +3074,36 @@ void ext4_ext_init(struct super_block *sb)
> #ifdef EXTENTS_STATS
> 		       ", stats"
> #endif
> -		       "\n");
> +		       , sb->s_id);
> +
> +
> +		EXT4_SB(sb)->s_max_ext_tree_depth = 1;
> +
> +		maxblocks = sb->s_maxbytes / sb->s_blocksize;

To avoid the 64-bit divide on 32-bit platforms this can be:

		maxblocks = sb->s_maxbytes >> sb->s_blocksize_bits;

> +		/* 1st/root level/node of extents tree stands in i_data and
> +		 * entries stored in tree nodes can be of type ext4_extent
> +		 * (leaf node) or ext4_extent_idx (internal node) */
> +		maxblocks /= (sizeof(((struct ext4_inode_info *)0x0)->i_data) -
> +			      sizeof(struct ext4_extent_header)) /
> +			     max(sizeof(struct ext4_extent),
> +				 sizeof(struct ext4_extent_idx));
> +		/* compute maximum extents tree depth for a fully populated
> +		 * file of max size made of only minimal/1-block extents */
> +		while (maxblocks > 0) {
> +			maxblocks /= (sb->s_blocksize -
> +				      sizeof(struct ext4_extent_header)) /
> +				     max(sizeof(struct ext4_extent),
> +					 sizeof(struct ext4_extent_idx));

These need to be changed to avoid a 64-bit divide, and possibly simplified:

		unsigned ext_size = max(sizeof(struct ext4_extent),
					sizeof(struct ext4_extent_idx));
		unsigned ext_per_leaf = (sb->s_blocksize -
					 sizeof(struct ext4_extent_header)) /
					ext_size);

		do_div(maxblocks,
                       (sizeof(((struct ext4_inode_info *)NULL)->i_data) -
			sizeof(struct ext4_extent_header)) / ext_size);

		while (maxblocks > 0) {
			do_div(maxblocks, ext_per_leaf);
			EXT4_SB(sb)->s_max_ext_tree_depth++;
		}

> +#ifdef EXT_DEBUG
> +		printk(", maximum tree depth=%u",
> +		       EXT4_SB(sb)->s_max_ext_tree_depth);
> #endif
> +		printk("\n");

It isn't great to "continue" printk() statements, since they can get mixed
up if there are multiple messages being printed to the console at once.
Better to calculate the s_max_ext_tree_depth first, then print out the
debugging info in a single printk() call.  Alternately, just make this a
completely separate printk() under EXT_DEBUG instead of trying to continue
the previous one.

Cheers, Andreas

> #ifdef EXTENTS_STATS
> 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
> 		EXT4_SB(sb)->s_ext_min = 1 << 30;
> @@ -5348,18 +5383,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> 		       ext4_lblk_t start, ext4_lblk_t shift,
> 		       enum SHIFT_DIRECTION SHIFT)
> {
> -	struct ext4_ext_path *path;
> +	struct ext4_ext_path *path, *orig_path;
> 	int ret = 0, depth;
> 	struct ext4_extent *extent;
> 	ext4_lblk_t stop, *iterator, ex_start, ex_end;
> 
> 	/* Let path point to the last extent */
> -	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
> -	if (IS_ERR(path))
> -		return PTR_ERR(path);
> +	orig_path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
> +	if (IS_ERR(orig_path))
> +		return PTR_ERR(orig_path);
> 
> -	depth = path->p_depth;
> -	extent = path[depth].p_ext;
> +	depth = orig_path->p_depth;
> +	extent = orig_path[depth].p_ext;
> 	if (!extent)
> 		goto out;
> 
> @@ -5371,9 +5406,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> 	 * sure the hole is big enough to accommodate the shift.
> 	*/
> 	if (SHIFT == SHIFT_LEFT) {
> -		path = ext4_find_extent(inode, start - 1, &path, 0);
> -		if (IS_ERR(path))
> -			return PTR_ERR(path);
> +		path = ext4_find_extent(inode, start - 1, &orig_path, 0);
> +		if (IS_ERR(path)) {
> +			ret = PTR_ERR(path);
> +			goto out;
> +		}
> 		depth = path->p_depth;
> 		extent =  path[depth].p_ext;
> 		if (extent) {
> @@ -5387,9 +5424,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> 
> 		if ((start == ex_start && shift > ex_start) ||
> 		    (shift > start - ex_end)) {
> -			ext4_ext_drop_refs(path);
> -			kfree(path);
> -			return -EINVAL;
> +			ret = -EINVAL;
> +			goto out;
> 		}
> 	}
> 
> @@ -5405,15 +5441,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> 
> 	/* Its safe to start updating extents */
> 	while (start < stop) {
> -		path = ext4_find_extent(inode, *iterator, &path, 0);
> -		if (IS_ERR(path))
> -			return PTR_ERR(path);
> +		path = ext4_find_extent(inode, *iterator, &orig_path, 0);
> +		if (IS_ERR(path)) {
> +			ret = PTR_ERR(path);
> +			goto out;
> +		}
> 		depth = path->p_depth;
> 		extent = path[depth].p_ext;
> 		if (!extent) {
> 			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
> 					 (unsigned long) *iterator);
> -			return -EFSCORRUPTED;
> +			ret =  -EFSCORRUPTED;
> +			goto out;
> 		}
> 		if (SHIFT == SHIFT_LEFT && *iterator >
> 		    le32_to_cpu(extent->ee_block)) {
> @@ -5445,8 +5484,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
> 			break;
> 	}
> out:
> -	ext4_ext_drop_refs(path);
> -	kfree(path);
> +	ext4_ext_drop_refs(orig_path);
> +	kfree(orig_path);
> 	return ret;
> }
> 
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index fb6f117..7c319c5 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -39,13 +39,11 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
> 	path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
> 	if (IS_ERR(path))
> 		return PTR_ERR(path);
> -	if (path[ext_depth(inode)].p_ext == NULL) {
> -		ext4_ext_drop_refs(path);
> -		kfree(path);
> -		*ppath = NULL;
> +	if (path[ext_depth(inode)].p_ext == NULL)
> 		return -ENODATA;
> -	}
> -	*ppath = path;
> +	if (*ppath == NULL)
> +		*ppath = path;
> +	/* XXX else BUG_ON(path != *ppath); */
> 	return 0;
> }
> 
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 486e869..199157a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3701,6 +3701,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
> 		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
> 			goto failed_mount3a;
> 
> +	ext4_ext_init(sb); /* needed before using extent-mapped journal */
> +
> 	/*
> 	 * The first inode we look at is the journal inode.  Don't try
> 	 * root first: it may be modified in the journal!
> @@ -3894,7 +3896,6 @@ no_journal:
> 		goto failed_mount4a;
> 	}
> 
> -	ext4_ext_init(sb);
> 	err = ext4_mb_init(sb);
> 	if (err) {
> 		ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
> --


Cheers, Andreas





Attachment: signature.asc
Description: Message signed with OpenPGP using GPGMail


[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux