[PATCH 3/3] ext4: Add EXT4_IOC_TRANSFER_BLOCK_RANGE ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>

The EXT4_IOC_TRANSFER_BLOCK_RANGE ioctl transfers the data blocks lying
between [start, "start + length") form source file and append them
to destination file (represented by dest_fd).
This operation leaves a hole in the source file from where data blocks
are transfrered.
If there is any fallocated area beyond isize of destination it will
be truncated.

Signed-off-by: Namjae Jeon <namjae.jeon@xxxxxxxxxxx>
Signed-off-by: Ashish Sangwan <a.sangwan@xxxxxxxxxxx>
---
 fs/ext4/ext4.h    |   10 +-
 fs/ext4/extents.c |  471 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c   |   47 ++++++
 3 files changed, 527 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 246a03a..8f01855 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -591,6 +591,7 @@ enum {
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 #define EXT4_IOC_TRUNCATE_BLOCK_RANGE	_IOW('f', 18, struct truncate_range)
+#define EXT4_IOC_TRANSFER_BLOCK_RANGE	_IOW('f', 19, struct transfer_range)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -688,6 +689,12 @@ struct truncate_range {
 	__u32 length;
 };
 
+struct transfer_range {
+	__u32 dest_fd;
+	__u32 start_block;
+	__u32 length;
+};
+
 #define EXT4_EPOCH_BITS 2
 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
 #define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)
@@ -2700,7 +2707,8 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_truncate_range(struct inode *inode, ext4_lblk_t start,
 				   ext4_lblk_t end, ext4_lblk_t last_block);
-
+extern int ext4_ext_transfer_range(struct inode *sinode, struct inode *dinode,
+				   __u32 start_block, __u32 end_block);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ed85e34..f95d43f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5002,3 +5002,474 @@ out:
 	return ret;
 }
 
+/**
+ * ext4_ext_prepare_extent_transfer
+ *
+ * If start lies between extent, extent is split such that start
+ * is the first block of new extent.
+ * If start lies in a hole, start is adjusted to point to the starting
+ * block of next extent.
+ * If end lies between extent, extent is split such that end is the
+ * last block of old extent.
+ *
+ * @inode: The inode of the file from which extents are to be removed
+ * @start: The starting block for removing extent
+ * @orig_end : The end block for removing extent
+ * @handle: journal handle
+ *
+ * Returns 0 on success, 1 if no transfer is needed, error otherwise
+ */
+int ext4_ext_prepare_extent_transfer(struct inode *inode, ext4_lblk_t *start,
+				     ext4_lblk_t orig_end, handle_t *handle)
+{
+	int err, depth;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent *ex;
+	ext4_lblk_t orig_start = *start;
+
+	err = get_ext_path(inode, orig_start, &path);
+	if (err)
+		return err;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+
+	/* if start lies between extent, split the extent */
+	if (orig_start > le32_to_cpu(ex->ee_block) && orig_start <=
+	    le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) - 1) {
+		int split_flag = 0;
+		if (ext4_ext_is_uninitialized(ex))
+			split_flag = EXT4_EXT_MARK_UNINIT1 |
+				     EXT4_EXT_MARK_UNINIT2;
+		err = ext4_split_extent_at(handle, inode, path, orig_start,
+				split_flag, EXT4_GET_BLOCKS_METADATA_NOFAIL |
+				EXT4_GET_BLOCKS_PRE_IO);
+		if (err < 0)
+			goto out;
+	} else if (le32_to_cpu(ex->ee_block) +
+		   ext4_ext_get_actual_len(ex) - 1 < orig_start) {
+		/*
+		 * start lies in a hole, adjust start to point to
+		 * the start of next extent
+		 */
+		err = mext_next_extent(inode, path, &ex);
+		if (err < 0 || err == 1)
+			goto out;
+		*start = le32_to_cpu(ex->ee_block);
+	} else
+		/* start lies in a hole which is at the begining of block */
+		*start = le32_to_cpu(ex->ee_block);
+
+	/* Both start and end lies in same hole */
+	if (orig_end < *start) {
+		err = 1;
+		goto out;
+	}
+
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	path = NULL;
+
+	err = get_ext_path(inode, orig_end, &path);
+	if (err)
+		return err;
+
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	if (orig_end >= le32_to_cpu(ex->ee_block) && orig_end <
+	    le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex) - 1) {
+		int split_flag = 0;
+
+		if (ext4_ext_is_uninitialized(ex))
+			split_flag = EXT4_EXT_MARK_UNINIT1 |
+					EXT4_EXT_MARK_UNINIT2;
+		/*
+		 * Split the extent in two so that 'end' is the last
+		 * block in the first new extent
+		 */
+		err = ext4_split_extent_at(handle, inode, path,
+					   orig_end + 1, split_flag,
+					   EXT4_GET_BLOCKS_PRE_IO |
+					   EXT4_GET_BLOCKS_METADATA_NOFAIL);
+		if (err < 0)
+			goto out;
+	}
+
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	return err;
+}
+
+/*
+ * ext4_ext_transfer_extents
+ *
+ * Function to transfer extents from source inode to destination inode
+ * which lies between start and end. Unlike truncate, which start
+ * removing extents from end, we transfer from start.
+ *
+ * @sinode: The source inode for extent transfer
+ * @dinode: The destination inode for extent transfer
+ * @start: The starting block number for extent transfer. start should be
+ * be the first block in an extent.
+ * @end: The ending block number for extent transfer. end could lie inside
+ * hole or it sholud be the last block in an extent.
+ *
+ * Returns number of blocks successfully transfered or error
+ */
+loff_t ext4_ext_transfer_extents(struct inode *sinode, struct inode *dinode,
+			      ext4_lblk_t start, ext4_lblk_t  end,
+			      handle_t *handle)
+{
+	int i, depth = ext_depth(sinode), err, erase_index = 0;
+	struct ext4_extent *ex, *last_ex;
+	struct ext4_ext_path *path = NULL, *d_path = NULL;
+	ext4_lblk_t move_index;
+	loff_t blocks_moved = 0;
+	struct ext4_extent_header *hdr = ext_inode_hdr(sinode);
+
+	move_index = dinode->i_size >> dinode->i_blkbits;
+	err = get_ext_path(sinode, start, &path);
+	if (err)
+		return err;
+	i = depth;
+	ex = path[i].p_ext;
+
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			int extent_count = 0;
+			hdr = path[i].p_hdr;
+			if (!ex)
+				ex = EXT_FIRST_EXTENT(hdr);
+			last_ex =  EXT_LAST_EXTENT(hdr);
+			err = ext4_trange_dirty_path(handle, sinode, path + i,
+						     1, dinode);
+			if (err)
+				goto out;
+
+			while (ex != NULL &&
+			       (le32_to_cpu(ex->ee_block) <= end)) {
+				int ext_length = ext4_ext_get_actual_len(ex);
+
+				d_path = ext4_ext_find_extent(dinode,
+							      move_index,
+							      NULL);
+				if (IS_ERR(d_path)) {
+					err = PTR_ERR(d_path);
+					goto out;
+				}
+				ex->ee_block = cpu_to_le32(move_index);
+				err = ext4_ext_insert_extent(handle, dinode,
+							     d_path, ex, 0);
+				if (err)
+					goto out;
+
+				extent_count++;
+				blocks_moved += ext_length;
+				move_index += ext_length;
+				memset(ex, 0, sizeof(struct ext4_extent));
+				le16_add_cpu(&(hdr->eh_entries), -1);
+				ext4_ext_drop_refs(d_path);
+				kfree(d_path);
+				d_path = NULL;
+
+				/* Check if all the extents in this block have
+				 * transfered
+				 */
+				if (++ex > last_ex)
+					ex = NULL;
+			}
+
+			ext4_ext_dirty(handle, sinode, path + i);
+
+			if (!ex) {
+				brelse(path[i].p_bh);
+				path[i].p_bh = NULL;
+				/*move level down */
+				i--;
+				if (!le16_to_cpu(hdr->eh_entries))
+					erase_index = 1;
+				else
+					erase_index = 0;
+				continue;
+			} else {
+				/* All the required extents are transfered */
+				last_ex++;
+				if (extent_count) {
+					memmove(ex - extent_count, ex,
+						(last_ex - ex) *
+						sizeof(struct ext4_extent));
+					memset(last_ex - extent_count, 0,
+						extent_count *
+						sizeof(struct ext4_extent));
+					ext4_ext_dirty(handle, sinode,
+						       path + i);
+					path[i].p_ext = EXT_FIRST_EXTENT(hdr);
+					err = ext4_ext_correct_indexes(handle,
+								sinode, path);
+				}
+				break;
+			}
+		}
+
+		/* Now we are at leaf node */
+		if (erase_index) {
+			struct ext4_extent_idx *idx = path[i].p_idx;
+			struct ext4_extent_idx *last_idx =
+						EXT_LAST_INDEX(path[i].p_hdr);
+			int k = i - 1;
+			ext4_fsblk_t leaf;
+
+			leaf = ext4_idx_pblock(path[i].p_idx);
+			err = ext4_trange_dirty_path(handle, sinode, path + i,
+						     1, dinode);
+			if (err)
+				goto out;
+
+			if (idx != last_idx)
+				memmove(idx, idx + 1, (last_idx - idx) *
+					   sizeof(struct ext4_extent_idx));
+
+			memset(last_idx, 0, sizeof(struct ext4_extent_idx));
+			le16_add_cpu(&(path[i].p_hdr->eh_entries), -1);
+			ext4_ext_dirty(handle, sinode, path + i);
+
+			ext4_free_blocks(handle, sinode, NULL, leaf, 1,
+					 EXT4_FREE_BLOCKS_METADATA |
+					 EXT4_FREE_BLOCKS_FORGET);
+			erase_index = 0;
+			/* Adjust all the indexes to the top */
+			if (path[i].p_hdr->eh_entries &&
+			    idx == EXT_FIRST_INDEX(path[i].p_hdr))
+				while (k >= 0) {
+					if (path[k].p_idx !=
+						EXT_FIRST_INDEX(path[k].p_hdr))
+						break;
+					err = ext4_ext_get_access(handle,
+							sinode, path + k);
+					if (err)
+						break;
+					path[k].p_idx->ei_block = idx->ei_block;
+					err = ext4_ext_dirty(handle, sinode,
+							     path + k);
+					if (err)
+						break;
+					k--;
+				}
+		} else {
+			if (!path[i].p_idx)
+				path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
+			else
+				path[i].p_idx++;
+		   }
+
+		if (path[i].p_idx <= EXT_LAST_INDEX(path[i].p_hdr)) {
+			struct buffer_head *bh = NULL;
+
+			memset(path + i + 1, 0, sizeof(struct ext4_ext_path));
+			bh = sb_bread(sinode->i_sb,
+				      ext4_idx_pblock(path[i].p_idx));
+			if (!bh) {
+				err = -EIO;
+				goto out;
+			}
+			if (ext4_ext_check(sinode, ext_block_hdr(bh),
+			    depth - i - 1)) {
+				err = -EIO;
+				put_bh(bh);
+				goto out;
+			}
+			path[i + 1].p_bh = bh;
+			path[i + 1].p_hdr = ext_block_hdr(path[i+1].p_bh);
+			i++;
+		} else {
+			erase_index = 0;
+			if (!le16_to_cpu(path[i].p_hdr->eh_entries)) {
+				erase_index = 1;
+				path[i].p_hdr->eh_depth = 0;
+			}
+
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+		}
+	}
+out:
+	ext4_ext_drop_refs(path);
+	kfree(path);
+
+	if (d_path)
+		ext4_ext_drop_refs(d_path);
+	kfree(d_path);
+	if (err)
+		return err;
+	else
+		return blocks_moved;
+}
+
+/*
+ * ext4_ext_can_transfer_range: Check if transfer range
+ * can be performed
+ *
+ * @sinode: Source file inode
+ * @dinode: Destination file inode
+ *
+ * This function returns 0 on success, error otherwise
+ */
+static int ext4_ext_can_transfer_range(struct inode *sinode,
+				       struct inode *dinode)
+{
+	/* source file could not be empty */
+	if (!i_size_read(sinode))
+		return -EINVAL;
+
+	/* source and destination inode should be from same fs */
+	if (sinode->i_sb != dinode->i_sb)
+		return -EINVAL;
+
+	/* source and destination should be different inodes */
+	if (sinode == dinode)
+		return -EINVAL;
+
+	/* Regular file check */
+	if (!S_ISREG(sinode->i_mode) || !S_ISREG(dinode->i_mode))
+		return -EINVAL;
+
+	/* cannot move blocks for immutable files */
+	if (IS_IMMUTABLE(sinode) || IS_APPEND(dinode))
+		return -EPERM;
+
+	/* Ignore swap files */
+	if (IS_SWAPFILE(sinode) || IS_SWAPFILE(dinode))
+		return -EINVAL;
+
+	/* Ext4 move block range supports only extent based file */
+	if (!(ext4_test_inode_flag(sinode, EXT4_INODE_EXTENTS)) ||
+	    !(ext4_test_inode_flag(dinode, EXT4_INODE_EXTENTS)))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/**
+ * ext4_ext_transfer_range
+ *
+ * @sinode: source inode from which blocks are to be moved
+ * @dinode: destination inode to which blocks are added
+ * @start_block: The starting block number from which the
+ * block movement starts
+ * @end_block: The last block number which is to be moved
+ *
+ * This function returns 0 on success or error otherwise
+ */
+int ext4_ext_transfer_range(struct inode *sinode, struct inode *dinode,
+			    __u32 start_block, __u32 end_block)
+{
+	ext4_lblk_t s_last_block;
+	int ret, credits, blkbits = EXT4_BLOCK_SIZE_BITS(sinode->i_sb);
+	handle_t *handle;
+	struct address_space *mapping = sinode->i_mapping;
+	loff_t daligned_size, blocks_moved;
+	loff_t first_page_offset, last_page_offset;
+
+	ret = ext4_ext_can_transfer_range(sinode, dinode);
+	if (ret)
+		return ret;
+
+	ext4_inode_double_lock(sinode, dinode);
+	ext4_inode_block_unlocked_dio(sinode);
+	ext4_inode_block_unlocked_dio(dinode);
+	inode_dio_wait(sinode);
+	inode_dio_wait(dinode);
+
+	s_last_block = ((round_up(sinode->i_size,
+			 EXT4_BLOCK_SIZE(sinode->i_sb))) >> blkbits) - 1;
+
+	/* start_block cannot be greater than source end_block or last_block */
+	if (start_block > end_block || start_block > s_last_block) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If end_block is greater than source last_block, adjust it */
+	if (end_block > s_last_block)
+		end_block = s_last_block;
+
+	/* sync dirty pages for transfer */
+	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		ret = filemap_write_and_wait_range(mapping,
+				(loff_t)start_block << blkbits,
+				((loff_t)(end_block + 1) << blkbits) - 1);
+		if (ret)
+			goto out;
+	}
+
+	first_page_offset = round_down((loff_t)start_block << blkbits,
+				       PAGE_SIZE);
+	last_page_offset = round_up((loff_t)end_block << blkbits, PAGE_SIZE);
+	truncate_pagecache_range(sinode, first_page_offset,
+				 last_page_offset - 1);
+
+	/* Protect extent tree against block allocations via delalloc */
+	down_write(&EXT4_I(sinode)->i_data_sem);
+
+	/* we need to update 2 inodes */
+	credits = ext4_writepage_trans_blocks(sinode) +
+		  ext4_writepage_trans_blocks(dinode);
+	handle = ext4_journal_start(sinode, EXT4_HT_TRUNCATE, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out2;
+	}
+
+	ret = ext4_ext_prepare_extent_transfer(sinode, &start_block,
+					       end_block, handle);
+	if (ret != 0) {
+		if (ret == 1)
+			/* No need to move blocks */
+			ret = 0;
+		goto stop_journal;
+	}
+
+	daligned_size = (loff_t)(round_up(dinode->i_size,
+					  EXT4_BLOCK_SIZE(dinode->i_sb)));
+	/* if dest inode isize is not block aligned, make it block aligned */
+	if (dinode->i_size != daligned_size)
+		i_size_write(dinode, daligned_size);
+
+	/* Discard any falloacted area beyond i_size for dest inode */
+	ext4_truncate(dinode);
+
+	down_write(&EXT4_I(dinode)->i_data_sem);
+	blocks_moved = ext4_ext_transfer_extents(sinode, dinode, start_block,
+						 end_block, handle);
+	if (blocks_moved <= 0) {
+		ret = blocks_moved;
+		goto out3;
+	}
+
+	/* Update size and disksize here */
+	i_size_write(dinode,
+		     (dinode->i_size + (blocks_moved << blkbits)));
+	EXT4_I(dinode)->i_disksize += (blocks_moved << blkbits);
+	sinode->i_blocks -= (blocks_moved << (blkbits - 9));
+	dinode->i_blocks += (blocks_moved << (blkbits - 9));
+
+	sinode->i_mtime = sinode->i_ctime = ext4_current_time(sinode);
+	ext4_mark_inode_dirty(handle, sinode);
+
+	dinode->i_mtime = dinode->i_ctime = ext4_current_time(dinode);
+	ext4_mark_inode_dirty(handle, dinode);
+out3:
+	up_write(&EXT4_I(dinode)->i_data_sem);
+stop_journal:
+	ext4_journal_stop(handle);
+out2:
+	up_write(&EXT4_I(sinode)->i_data_sem);
+out:
+	ext4_inode_resume_unlocked_dio(sinode);
+	ext4_inode_resume_unlocked_dio(dinode);
+	ext4_inode_double_unlock(sinode, dinode);
+
+	return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0530daf..f2240f6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -685,6 +685,53 @@ resizefs_out:
 		return error;
 	}
 
+	case EXT4_IOC_TRANSFER_BLOCK_RANGE:
+	{
+		struct transfer_range tr;
+		struct fd dest_fd;
+		int err;
+		ext4_lblk_t end_block;
+
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+			ext4_msg(sb, KERN_ERR,
+			    "Move block range not supported with bigalloc");
+			return -EOPNOTSUPP;
+		}
+
+		if (copy_from_user(&tr, (struct transfer_range __user *)arg,
+				   sizeof(tr)))
+			return -EFAULT;
+
+		if (tr.length == 0)
+			return -EINVAL;
+		end_block = tr.start_block + tr.length - 1;
+
+		dest_fd = fdget(tr.dest_fd);
+		if (!dest_fd.file)
+			return -EBADF;
+
+		if (!(dest_fd.file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			goto fput_out;
+		}
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			goto fput_out;
+
+		err = ext4_ext_transfer_range(inode, file_inode(dest_fd.file),
+					      tr.start_block, end_block);
+		mnt_drop_write_file(filp);
+
+fput_out:
+		fdput(dest_fd);
+		return err;
+	}
+
 	default:
 		return -ENOTTY;
 	}
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux