Re: [RFC][PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full

Mingming Cao <cmm@xxxxxxxxxx> · Thu, 21 Feb 2008 13:07:17 -0800

Hi Aneesh,

It's a good start, a few comments below..

On Fri, 2008-02-22 at 00:47 +0530, Aneesh Kumar K.V wrote:
> From 6a73edd4dbb32344e6a83ebdc07edd0e96d376bd Mon Sep 17 00:00:00 2001
> From: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> Date: Thu, 21 Feb 2008 23:57:38 +0530
> Subject: [PATCH] ext4: Convert uninitialized extent to initialized extent in case of file system full
> 
> A write to the prealloc area causes the split of an uninitialized extent into an
> initialized and an uninitialized extent. If we don't have space to add the new extent
> information, instead of returning an error, convert the existing uninitialized extent
> to an initialized one. We need to zero out the blocks corresponding to the extent to
> prevent wrong data from reaching userspace.
> 

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx>
> ---
>  fs/ext4/extents.c |  135 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 133 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index b179b03..d37c14e 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -2137,6 +2137,103 @@ void ext4_ext_release(struct super_block *sb)
>  #endif
>  }
> 
> +static int ext4_ext_zero_out(handle_t *handle, struct inode *inode,
> +				ext4_lblk_t iblock, struct ext4_extent *ex)
> +{
> +	ext4_lblk_t ee_block;
> +	unsigned int ee_len, blkcount, blocksize;
> +	loff_t pos;
> +	pgoff_t index, skip_index;
> +	unsigned long offset;
> +	struct page *page;
> +	struct address_space *mapping = inode->i_mapping;
> +	struct buffer_head *head, *bh;
> +	int err = 0;
> +
> +	ee_block = le32_to_cpu(ex->ee_block);
> +	ee_len = blkcount = ext4_ext_get_actual_len(ex);
> +	blocksize = inode->i_sb->s_blocksize;
> +
> +	/*
> +	 * find the skip index. We can't call __grab_cache_page for this
> +	 * because we are in the writeout of this page and we already have
> +	 * taken the lock on this page
> +	 */
> +	pos = iblock <<  inode->i_blkbits;
> +	skip_index = pos >> PAGE_CACHE_SHIFT;
> +

We should not need to look up the page cache to do the zero-out. The
approach I had in mind is to just zero the blocks out on disk.

> +	while (blkcount) {
> +		pos = (ee_block  + ee_len - blkcount) << inode->i_blkbits;
> +		index = pos >> PAGE_CACHE_SHIFT;
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +		if (index == skip_index) {
> +			/* Page will already be locked in the writepage */
> +			read_lock_irq(&mapping->tree_lock);
> +			page = radix_tree_lookup(&mapping->page_tree, index);
> +			read_unlock_irq(&mapping->tree_lock);
> +			if (page)
> +				page_cache_get(page);
> +			else
> +				return -ENOMEM;
> +		} else {
> +			page = __grab_cache_page(mapping, index);
> +			if (!page)
> +				return -ENOMEM;
> +		}
> +

I think the page is already locked before get_block() is called via
writepage(), isn't it? And the journal transaction has already started...

> +		if (!page_has_buffers(page))
> +			create_empty_buffers(page, blocksize, 0);
> +
> +		head = page_buffers(page);
> +		/* Look for the buffer_head which map the block */
> +		bh = head;
> +		while (offset > 0) {
> +			bh = bh->b_this_page;
> +			offset -= blocksize;
> +		}
> +		offset = (pos & (PAGE_CACHE_SIZE - 1));
> +
> +		/* Now write all the buffer_heads in the page */
> +		do {
> +			set_buffer_uptodate(bh);
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_get_write_access(handle, bh);
> +				/* do we have that many credits ??*/
> +				if (err)
> +					goto err_out;
> +			}
> +			zero_user(page, offset, blocksize);

Uh oh, you are trying to zero out the pages in the page cache; that
seems wrong to me. By the time get_block() is called from writepages(),
the pages should have meaningful content that needs to be flushed to disk,
so zeroing the pages out will lose that data.

> +			offset += blocksize;
> +			if (ext4_should_journal_data(inode)) {
> +				err = ext4_journal_dirty_metadata(handle, bh);
> +				if (err)
> +					goto err_out;
> +			} else {
> +				if (ext4_should_order_data(inode)) {
> +					err = ext4_journal_dirty_data(handle,
> +									bh);
> +					if (err)
> +						goto err_out;
> +				}
> +				mark_buffer_dirty(bh);
> +			}
> +
> +			bh = bh->b_this_page;
> +			blkcount--;
> +		} while ((bh != head) && (blkcount > 0));
> +		/* only unlock if we have locked */
> +		if (index != skip_index)
> +			unlock_page(page);
> +		page_cache_release(page);
> +	}
> +
> +	return 0;
> +err_out:
> +	unlock_page(page);
> +	page_cache_release(page);
> +	return err;
> +}
> +

I was thinking of simply creating a new bh, zeroing out the bh, then mapping
the bh to the block number to be zeroed out, and lastly submitting an IO via
ll_rw_block. It may be more efficient to do this via a bio (perhaps cooking
a bio with zeroed-out pages and calling submit_bio), but I have not looked
very closely at it. Just throwing out my thoughts.

Mingming
>  /* 
>   * This function is called by ext4_ext_get_blocks() if someone tries to write
>   * to an uninitialized extent. It may result in splitting the uninitialized
> @@ -2153,7 +2250,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  						ext4_lblk_t iblock,
>  						unsigned long max_blocks)
>  {
> -	struct ext4_extent *ex, newex;
> +	struct ext4_extent *ex, newex, zeroout_ex;
>  	struct ext4_extent *ex1 = NULL;
>  	struct ext4_extent *ex2 = NULL;
>  	struct ext4_extent *ex3 = NULL;
> @@ -2172,6 +2269,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	allocated = ee_len - (iblock - ee_block);
>  	newblock = iblock - ee_block + ext_pblock(ex);
>  	ex2 = ex;
> +	zeroout_ex.ee_block = ex->ee_block;
> +	zeroout_ex.ee_len   = cpu_to_le16(ee_len);
> +	ext4_ext_store_pblock(&zeroout_ex, ext_pblock(ex));
> 
>  	err = ext4_ext_get_access(handle, inode, path + depth);
>  	if (err)
> @@ -2200,13 +2300,32 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
>  		ext4_ext_mark_uninitialized(ex3);
>  		err = ext4_ext_insert_extent(handle, inode, path, ex3);
> -		if (err)
> +		if (err == -ENOSPC) {
> +			err =  ext4_ext_zero_out(handle, inode,
> +							iblock, &zeroout_ex);
> +			if (err)
> +				goto out;
> +			/* update the extent length and mark as initialized */
> +			ex->ee_block = zeroout_ex.ee_block;
> +			ex->ee_len   = zeroout_ex.ee_len;
> +			ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
> +			ext4_ext_dirty(handle, inode, path + depth);
> +			return le16_to_cpu(ex->ee_len);
> +
> +		} else if (err)
>  			goto out;
> +
>  		/*
>  		 * The depth, and hence eh & ex might change
>  		 * as part of the insert above.
>  		 */
>  		newdepth = ext_depth(inode);
> +		/*
> +		 * update the extent length after successfull insert of the
> +		 * split extent
> +		 */
> +		zeroout_ex.ee_len = cpu_to_le16(ee_len -
> +						ext4_ext_get_actual_len(ex3));
>  		if (newdepth != depth) {
>  			depth = newdepth;
>  			ext4_ext_drop_refs(path);
> @@ -2281,6 +2400,18 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
>  	goto out;
>  insert:
>  	err = ext4_ext_insert_extent(handle, inode, path, &newex);
> +	if (err == -ENOSPC) {
> +		err =  ext4_ext_zero_out(handle, inode, iblock, &zeroout_ex);
> +		if (err)
> +			goto out;
> +		/* update the extent length and mark as initialized */
> +		ex->ee_block = zeroout_ex.ee_block;
> +		ex->ee_len   = zeroout_ex.ee_len;
> +		ext4_ext_store_pblock(ex, ext_pblock(&zeroout_ex));
> +		ext4_ext_dirty(handle, inode, path + depth);
> +		return le16_to_cpu(ex->ee_len);
> +	}
> +
>  out:
>  	return err ? err : allocated;
>  }

-
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html