[RFC][PATCH 1/8]ext4: main function of defrag and ioctl implementation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



ext4: online defrag-- Main function of defrag and ioctl implementation

From: Akira Fujita <a-fujita@xxxxxxxxxxxxx>

Create the temporary inode and do defrag per
defrag_size (defalut 64MB).

Signed-off-by: Akira Fujita <a-fujita@xxxxxxxxxxxxx>
Signed-off-by: Takashi Sato <t-sato@xxxxxxxxxxxxx>
---
 fs/ext4/Makefile       |    2 +-
 fs/ext4/defrag.c       |  448 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h         |   18 ++
 fs/ext4/ext4_extents.h |    2 +
 fs/ext4/extents.c      |    2 +-
 fs/ext4/ioctl.c        |    3 +
 6 files changed, 473 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8c..8028102 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o

 ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o migrate.o mballoc.o
+		   ext4_jbd2.o migrate.o mballoc.o defrag.o

 ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
 ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/defrag.c b/fs/ext4/defrag.c
index e69de29..a591e11 100644
--- a/fs/ext4/defrag.c
+++ b/fs/ext4/defrag.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2008, NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@xxxxxxxxxxxxx>
+ *            Akira Fujita <a-fujita@xxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/* Online defragmentation for EXT4 */
+
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "group.h"
+
+/**
+ * ext4_defrag_next_extent - Search for the next extent and set it to "extent"
+ *
+ * @inode:	inode which is searched
+ * @path:	this will obtain data for the next extent
+ * @extent:	pointer to the next extent we have just gotten
+ *
+ * This function returns 0 or 1(last entry) if succeed, otherwise
+ * returns -EIO.
+ */
+static int
+ext4_defrag_next_extent(struct inode *inode, struct ext4_ext_path *path,
+			struct ext4_extent **extent)
+{
+	return 0;
+}
+
+int ext4_defrag_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+			unsigned long arg)
+{
+	int err = 0;
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		printk(KERN_ERR "ext4 defrag: ino[%lu] is not extents "
+					"based file\n", inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if (cmd == EXT4_IOC_DEFRAG) {
+		struct ext4_ext_defrag_data defrag;
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+
+		if (!capable(CAP_DAC_OVERRIDE)) {
+			if ((inode->i_mode & S_IRUSR) != S_IRUSR)
+				return -EACCES;
+			if (current->fsuid != inode->i_uid)
+				return -EACCES;
+		}
+
+		if (copy_from_user(&defrag,
+			(struct ext4_ext_defrag_data __user *)arg,
+						sizeof(defrag)))
+			return -EFAULT;
+
+		/* Check goal offset if goal offset was given from userspace */
+		if (defrag.goal != -1 &&
+				ext4_blocks_count(es) <= defrag.goal) {
+			printk(KERN_ERR "ext4 defrag: Invalid goal offset"
+				" %llu, you can set goal offset up to %llu\n",
+				defrag.goal, ext4_blocks_count(es) - 1);
+			return -EINVAL;
+		}
+
+		err = ext4_defrag(filp, defrag.start_offset,
+				defrag.defrag_size);
+	}
+
+	return err;
+}
+
+/**
+ * ext4_defrag_partial - Defrag a file per page
+ *
+ * @tmp_inode:		temporary inode
+ * @filp:		pointer to file
+ * @org_offset:		page index on original file
+ * @dest_offset:	page index on temporary file
+ *
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_partial(struct inode *tmp_inode, struct file *filp,
+			pgoff_t org_offset, pgoff_t dest_offset)
+{
+	return 0;
+}
+
+/**
+ * ext4_defrag_new_extent_tree - Get contiguous blocks and build an extent tree
+ *
+ * @org_inode:		original inode
+ * @tmp_inode:		temporary inode
+ * @org_path:		indicating the original inode's extent
+ * @tar_start:		starting offset to allocate in blocks
+ * @tar_blocks:		the number of blocks to allocate
+ * @iblock:		file related offset
+ *
+ *
+ * This function returns the value as below:
+ *	0 (succeed)
+ *	1 (not improved)
+ *	negative value (error case)
+ */
+static int
+ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode,
+			struct ext4_ext_path *org_path, ext4_lblk_t tar_start,
+			ext4_lblk_t tar_blocks, ext4_lblk_t iblock)
+{
+	return 0;
+}
+
+/**
+ * ext4_defrag_check - Check the enviroment whether a defrag can be done
+ *
+ * @org_inode:		original inode
+ * @defrag_size:	size of defrag in blocks
+ *
+ * This function returns 0 if succeed, otherwise returns error value.
+ */
+static int
+ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size)
+{
+
+	/* ext4 online defrag supports only 4KB block size */
+	if (org_inode->i_sb->s_blocksize != DEFRAG_BLOCK_SIZE) {
+		printk(KERN_ERR "ext4 defrag: ext4 online defrag supports "
+				"only 4KB block size for the moment.\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* ext4 online defrag needs mballoc mount option. */
+	if (!test_opt(org_inode->i_sb, MBALLOC)) {
+		printk(KERN_ERR "ext4 defrag: multiblock allocation "
+				"is disabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/**
+ * ext4_defrag_init_tmp_inode - Create a temporary inode
+ *
+ * @org_inode:		original inode
+ *
+ * This function returns pointer to the struct inode if succeed,
+ * otherwise returns error value.
+ */
+static struct inode *
+ext4_defrag_init_tmp_inode(struct inode *org_inode)
+{
+	handle_t *handle;
+	struct inode *tmp_inode;
+
+	handle = ext4_journal_start(org_inode,
+		EXT4_DATA_TRANS_BLOCKS(org_inode->i_sb) +
+		EXT4_INDEX_EXTRA_TRANS_BLOCKS + 4 +
+		2 * EXT4_QUOTA_INIT_BLOCKS(org_inode->i_sb));
+	if (IS_ERR(handle))
+		/* Return error code */
+		return (struct inode *)handle;
+
+	tmp_inode = ext4_new_inode(handle,
+		org_inode->i_sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode))
+		goto out;
+
+	i_size_write(tmp_inode, i_size_read(org_inode));
+	tmp_inode->i_nlink = 0;
+	ext4_ext_tree_init(handle, tmp_inode);
+	ext4_orphan_add(handle, tmp_inode);
+
+out:
+	ext4_journal_stop(handle);
+
+	return tmp_inode;
+}
+
+/**
+ * ext4_defrag - Defrag the specified range of a file
+ *
+ * If no-option is specified, ext4_defrag() proceeds the following order.
+ * 1.ext4_defrag() calculates the block number where defrag terminates
+ *   by the start block number(defrag_start) and the size of defraged data
+ *   (defrag_size) specified as arguments.
+ *   If the defrag_start points a hole, the extent's start offset pointed by
+ *   ext_cur(current extent), holecheck_path, org_path are set after
+ *   hole behind.
+ * 2.Continue step 3 to step 5, until the holecheck_path points to last_extent
+ *   or the ext_cur exceeds the block_end which is last logical block number.
+ * 3.To get a length of continues area, call ext4_defrag_next_extent()
+ *   specified with the ext_cur(initial value is holecheck_path) re-cursive,
+ *   until find un-continuous extent, the start logical block number exceeds
+ *   the block_end or the extent points to the last extent.
+ * 4.After determining the length of continuous block,
+ *   allocates continuous blocks to a temporary inode
+ *   by ext4_defrag_new_extent_tree().
+ * 5.Exchange the original inode data with temporary inode data
+ *   from page_offset to seq_end_page by page unit.
+ *   The start page index of data are specified as arguments:
+ *   the original inode is page_offset, the temporary inode is dest_offset.
+ * 6.Update holecheck_path and org_path to points a next proceeding extent,
+ *   and release the temporary inode holding the original fragmented data.
+ *   Then, returns to step 2.
+ * 7.Release holecheck_path, org_path and temporary inode,
+ *    and returns the defrag_size which is the size of defraged data.
+ *    The defrag_size is used for the command to calculate the file offset
+ *    where a next defrag processing start.
+ *    (Since the defrag command calls defrag_ioctl() by 64MB unit,
+ *     a file bigger than 64MB calls defrag_ioctl many times.)
+ *
+ * @filp:		pointer to file
+ * @block_start:	starting offset to defrag in blocks
+ * @defrag_size:	size of defrag in blocks
+ *
+ * This function returns the number of blocks if succeed, otherwise
+ * returns error value.
+ */
+int
+ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+		ext4_lblk_t defrag_size)
+{
+	struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL;
+	struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL;
+	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+	pgoff_t page_offset, seq_end_page, dest_offset;
+	int ret, depth, seq_extents, last_extent = 0;
+
+	/* Check the filesystem enviroment whether defrag can be done */
+	ret = ext4_defrag_check(org_inode, defrag_size);
+	if (ret < 0)
+		return ret;
+
+	file_end = (org_inode->i_size - 1) >> org_inode->i_blkbits;
+	block_end = block_start + defrag_size - 1;
+	if (file_end < block_end)
+		defrag_size -= block_end - file_end;
+
+	mutex_lock(&org_inode->i_mutex);
+	down_write(&EXT4_I(org_inode)->i_data_sem);
+
+	org_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(org_path)) {
+		ret = PTR_ERR(org_path);
+		org_path = NULL;
+		goto out;
+	}
+
+	/* Get path structure to check the hole */
+	holecheck_path = ext4_ext_find_extent(org_inode, block_start, NULL);
+	if (IS_ERR(holecheck_path)) {
+		ret = PTR_ERR(holecheck_path);
+		holecheck_path = NULL;
+		goto out;
+	}
+
+	depth = ext_depth(org_inode);
+	ext_cur = holecheck_path[depth].p_ext;
+	if (ext_cur == NULL)
+		goto out;
+
+	/*
+	 * Get proper extent whose ee_block is beyond block_start
+	 * if block_start was within the hole.
+	 */
+	if (le32_to_cpu(ext_cur->ee_block) +
+		le16_to_cpu(ext_cur->ee_len) - 1 < block_start) {
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+		last_extent = ext4_defrag_next_extent(org_inode, org_path,
+							&ext_dummy);
+		if (last_extent < 0) {
+			ret = last_extent;
+			goto out;
+		}
+	}
+	seq_extents = 1;
+	seq_start = le32_to_cpu(ext_cur->ee_block);
+
+	/* No blocks within the specified range. */
+	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+		printk(KERN_INFO "ext4 defrag: The specified range of file"
+				" may be the hole\n");
+		goto out;
+	}
+
+	/* Adjust start blocks */
+	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+			 le16_to_cpu(ext_cur->ee_len), block_end + 1) -
+		     max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+		seq_blocks += add_blocks;
+
+		/* Create a temporary inode to be exchanged data block */
+		tmp_inode = ext4_defrag_init_tmp_inode(org_inode);
+		if (IS_ERR(tmp_inode)) {
+			ret = PTR_ERR(tmp_inode);
+			tmp_inode = NULL;
+			goto out;
+		}
+
+		/* Adjust tail blocks */
+		if (seq_start + seq_blocks - 1 > block_end)
+			seq_blocks = block_end - seq_start + 1;
+
+		ext_prev = ext_cur;
+		last_extent = ext4_defrag_next_extent(org_inode,
+					holecheck_path, &ext_cur);
+		if (last_extent < 0) {
+			ret = last_extent;
+			break;
+		}
+		if (!last_extent)
+			seq_extents++;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+
+		/*
+		 * Extend the length of contiguous block (seq_blocks)
+		 * if extents are contiguous.
+		 */
+		if (le32_to_cpu(ext_prev->ee_block) +
+				le16_to_cpu(ext_prev->ee_len) ==
+				le32_to_cpu(ext_cur->ee_block) &&
+				block_end >= le32_to_cpu(ext_cur->ee_block) &&
+				!last_extent) {
+			if (tmp_inode) {
+				iput(tmp_inode);
+				tmp_inode = NULL;
+			}
+			continue;
+		}
+
+		/* Found an isolated block */
+		if (seq_extents == 1) {
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode,
+					org_path, seq_start, seq_blocks,
+					block_start);
+
+		if (ret < 0) {
+			break;
+		} else if (ret == 1) {
+			ret = 0;
+			seq_start = le32_to_cpu(ext_cur->ee_block);
+			goto CLEANUP;
+		}
+
+		page_offset = seq_start >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		dest_offset = 0;
+		seq_end_page = (seq_start + seq_blocks - 1) >>
+				(PAGE_CACHE_SHIFT - org_inode->i_blkbits);
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+
+		/*
+		 * Discard all preallocations.
+		 * This is provisional solution.
+		 * When true ext4_mb_return_to_preallocation() is
+		 * implemented, this will be removed.
+		 */
+		ext4_mb_discard_inode_preallocations(org_inode);
+
+		while (page_offset <= seq_end_page) {
+			/* Swap original branches with new branches */
+			ret = ext4_defrag_partial(tmp_inode, filp,
+					page_offset, dest_offset);
+			if (ret < 0)
+				goto out;
+
+			page_offset++;
+			dest_offset++;
+		}
+
+		/* Decrease buffer counter */
+		if (holecheck_path)
+			ext4_ext_drop_refs(holecheck_path);
+		holecheck_path = ext4_ext_find_extent(org_inode,
+						seq_start, holecheck_path);
+		if (IS_ERR(holecheck_path)) {
+			ret = PTR_ERR(holecheck_path);
+			holecheck_path = NULL;
+			break;
+		}
+		depth = holecheck_path->p_depth;
+
+CLEANUP:
+		/* Decrease buffer counter */
+		if (org_path)
+			ext4_ext_drop_refs(org_path);
+		org_path = ext4_ext_find_extent(org_inode, seq_start, org_path);
+		if (IS_ERR(org_path)) {
+			ret = PTR_ERR(org_path);
+			org_path = NULL;
+			break;
+		}
+
+		ext_cur = holecheck_path[depth].p_ext;
+		add_blocks = le16_to_cpu(ext_cur->ee_len);
+		seq_blocks = 0;
+		dest_offset = 0;
+		seq_extents = 1;
+
+		if (tmp_inode) {
+			iput(tmp_inode);
+			tmp_inode = NULL;
+		}
+	}
+
+out:
+	if (org_path) {
+		ext4_ext_drop_refs(org_path);
+		kfree(org_path);
+	}
+	if (holecheck_path) {
+		ext4_ext_drop_refs(holecheck_path);
+		kfree(holecheck_path);
+	}
+
+	up_write(&EXT4_I(org_inode)->i_data_sem);
+	mutex_unlock(&org_inode->i_mutex);
+
+	if (tmp_inode)
+		iput(tmp_inode);
+
+	return (ret ? ret : defrag_size);
+}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a7a541d..67281ad 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -298,6 +298,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT4_IOC_SETRSVSZ		_IOW('f', 6, long)
 #define EXT4_IOC_MIGRATE		_IO('f', 7)
+#define EXT4_IOC_DEFRAG		_IOW('f', 10, struct ext4_ext_defrag_data)

 /*
  * ioctl commands in 32 bit emulation
@@ -315,6 +316,18 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION

+/*
+ * Will go away.
+ * ext4 online defrag supports only 4KB block size.
+ */
+#define DEFRAG_BLOCK_SIZE	4096
+
+struct ext4_ext_defrag_data {
+	ext4_lblk_t start_offset;	/* start offset to defrag in blocks */
+	ext4_lblk_t defrag_size;	/* size of defrag in blocks */
+	ext4_fsblk_t goal;		/* block offset for allocation */
+};
+

 /*
  *  Mount options
@@ -1113,6 +1126,11 @@ extern void ext4_inode_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
 				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+/* defrag.c */
+extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start,
+			ext4_lblk_t defrag_size);
+extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int,
+				unsigned long);

 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b5..9868c02 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -228,5 +228,7 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
 						ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *path);
 #endif /* _EXT4_EXTENTS */

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5ba81b3..ffced61 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -48,7 +48,7 @@
  * ext_pblock:
  * combine low and high parts of physical block number into ext4_fsblk_t
  */
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
 {
 	ext4_fsblk_t block;

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1..4c7fca1 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -241,6 +241,9 @@ setversion_out:

 		return err;
 	}
+	case EXT4_IOC_DEFRAG: {
+		return ext4_defrag_ioctl(inode, filp, cmd, arg);
+	}
 	case EXT4_IOC_GROUP_ADD: {
 		struct ext4_new_group_data input;
 		struct super_block *sb = inode->i_sb;
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux