[RFC] Ext3 online defrag

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



  Hello,

  I've written a simple patch implementing an ext3 ioctl for file
relocation. Basically you call the ioctl on a file, give it a list of blocks
and it relocates the file into the given blocks (provided they are still
free). The idea is to use it as the kernel part of an ext3 online
defragmenter (or, more generally, a disk access optimizer). I don't yet have
the user-space part that finds larger runs of free blocks and so on, which is
needed before it can really be used as a defragmenter. I'm just sending this
as a kind of proof of concept to get some comments. Also attached is a simple
program that demonstrates the use of the ioctl.
  Thanks for suggestions/comments in advance.

								Honza

-- 
Jan Kara <jack@xxxxxxx>
SuSE CR Labs
Implement an ext3 ioctl for relocating a file into a given set of blocks. The
function also allocates those blocks (provided they are still free). We simply
build a new indirect tree in the given blocks, copy the data into it, and
finally swap the block pointers in the inode.

Signed-off-by: Jan Kara <jack@xxxxxxx>

diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/inode.c linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c
--- linux-2.6.18/fs/ext3/inode.c	2006-09-27 13:08:35.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c	2006-10-20 17:54:57.000000000 +0200
@@ -3219,3 +3219,360 @@ int ext3_change_inode_journal_flag(struc
 
 	return err;
 }
+
+static inline int get_next_reloc_extent(struct ext3_reloc_extent *ext,
+	struct ext3_reloc_extent __user *ext_user, int *act_ext, int extents)
+{
+	/* Current extent (*ext) still has blocks left: keep using it */
+	if (ext->len)
+		return 0;
+	/* Step to the next user extent; -ENOSPC once the array is used up */
+	if (++(*act_ext) >= extents)
+		return -ENOSPC;
+	if (copy_from_user(ext, ext_user+*act_ext, sizeof(*ext)))
+		return -EFAULT;
+	/* len is a signed value from user space: reject empty AND negative */
+	if (ext->len <= 0)
+		return -EINVAL;
+	return 0;
+}
+
+static ext3_fsblk_t alloc_reloc_extent(handle_t *handle, struct inode *inode,
+	struct ext3_reloc_extent *ext, unsigned long *blocks, int *err)
+{
+	ext3_fsblk_t ret;
+	/* Allocate up to *blocks blocks at ext->start; returns 0 on failure */
+	if (*blocks > ext->len)
+		*blocks = ext->len;	/* never ask for more than the extent has */
+	ret = ext3_new_blocks(handle, inode, ext->start, blocks, err);
+	if (!ret)
+		return 0;
+	/* Only an allocation starting exactly at ext->start is acceptable;
+	 * anything else is freed again and reported as -ENOSPC via *err. */
+	if (ret != ext->start) {
+		ext3_free_blocks(handle, inode, ret, *blocks);
+		*err = -ENOSPC;
+		return 0;
+	}
+	ext->start += *blocks;	/* consume the allocated part of the extent */
+	ext->len -= *blocks;
+	return ret;
+}
+
+/* Mirror the block tree under oblk in newly allocated blocks; root -> *nblk */
+static int reloc_tree(struct inode *inode, int depth, loff_t pos,
+	struct ext3_reloc_extent *ext, struct ext3_reloc_extent __user *ext_user,
+	int *act_ext, int extents, __le32 oblk, __le32 *nblk,
+	struct buffer_head *nblk_bh)
+{
+	struct buffer_head *obh, *nbh = NULL;
+	handle_t *handle;
+	ext3_fsblk_t newblock;
+	unsigned long count = 1;
+	int ret, i, j;
+	loff_t blocks = (i_size_read(inode) + inode->i_sb->s_blocksize-1) >>
+		inode->i_sb->s_blocksize_bits;
+
+	if (!oblk) {
+		*nblk = 0;
+		return 0;
+	}
+	obh = sb_bread(inode->i_sb, le32_to_cpu(oblk));
+	if (!obh)
+		return -EIO;
+	ret = get_next_reloc_extent(ext, ext_user, act_ext, extents);
+	if (ret < 0)
+		goto out_bh;
+	/* We modify nblk, bitmap, sb, descriptor, buffer */
+	handle = ext3_journal_start(inode, 4);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_bh;
+	}
+	/* First we allocate indirect block */
+	newblock = alloc_reloc_extent(handle, inode, ext, &count, &ret);
+	if (!newblock)
+		goto out_trans;
+	nbh = sb_getblk(inode->i_sb, newblock);
+	if (!nbh) {
+		ret = -EIO;
+		goto out_alloc;
+	}
+	lock_buffer(nbh);
+	ret = ext3_journal_get_create_access(handle, nbh);
+	if (ret) {
+		unlock_buffer(nbh);
+		goto out_alloc;
+	}
+	memset(nbh->b_data, 0, nbh->b_size);
+	set_buffer_uptodate(nbh);
+	unlock_buffer(nbh);
+	/* If *nblk lives in nblk_bh, take write access under this handle */
+	ret = nblk_bh ? ext3_journal_get_write_access(handle, nblk_bh) : 0;
+	if (ret)
+		goto out_alloc;
+	*nblk = cpu_to_le32(newblock);
+	ret = nblk_bh ? ext3_journal_dirty_metadata(handle, nblk_bh) :
+			ext3_mark_inode_dirty(handle, inode);
+	if (ret) {
+		*nblk = 0;
+		goto out_alloc;
+	}
+	ext3_journal_stop(handle);
+	/* Now it's time to allocate further data/indirect blocks */
+	if (depth) {
+		for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+			pos < blocks; i++, pos +=
+			1 << depth*EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb)) {
+			if (!((__le32 *)obh->b_data)[i])
+				continue;
+			ret = reloc_tree(inode, depth-1, pos, ext, ext_user,
+				act_ext, extents, ((__le32 *)obh->b_data)[i],
+				((__le32 *)nbh->b_data)+i, nbh);
+			if (ret)
+				goto out_bh;
+		}
+	} else {
+		for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+			pos < blocks; i++, pos++) {
+			if (!((__le32 *)obh->b_data)[i])
+				continue;
+			ret = get_next_reloc_extent(ext, ext_user, act_ext,
+				extents);
+			if (ret < 0)
+				goto out_bh;
+			/* sb+descriptor+bitmap+nbh; nbh access is per handle */
+			handle = ext3_journal_start(inode, 4);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				goto out_bh;
+			}
+			ret = ext3_journal_get_write_access(handle, nbh);
+			if (ret)
+				goto out_trans;
+			/* Compute size of continuous extent we may have */
+			for (count = 0;
+				i+count < EXT3_ADDR_PER_BLOCK(inode->i_sb) &&
+				pos+count < blocks &&
+				((__le32 *)obh->b_data)[i+count];
+				count++);
+			/* Allocate extent and store block pointers */
+			newblock = alloc_reloc_extent(handle, inode, ext,
+				&count, &ret);
+			if (!newblock)
+				goto out_trans;
+			for (j = 0; j < count; j++)
+				((__le32 *)nbh->b_data)[i+j] =
+					cpu_to_le32(newblock+j);
+			ret = ext3_journal_dirty_metadata(handle, nbh);
+			if (ret) {
+				memset(((__le32 *)nbh->b_data)+i, 0,
+					sizeof(__le32)*count);
+				goto out_alloc;
+			}
+			ext3_journal_stop(handle);
+			i += count-1;
+			pos += count-1;
+		}
+	}
+	ret = 0;
+	goto out_bh;
+out_alloc:
+	ext3_free_blocks(handle, inode, newblock, count);
+out_trans:
+	ext3_journal_stop(handle);
+out_bh:
+	brelse(obh);
+	brelse(nbh);	/* brelse() ignores NULL */
+	return ret;
+}
+
+/*
+ * Move file data into the user-supplied extents; returns 0 or -errno.
+ */
+int ext3_move_file_blocks(struct inode *inode, int extents,
+	struct ext3_reloc_extent __user *ext_user)
+{
+	int ret, act_ext = 0, j;
+	unsigned long count = 1;
+	struct ext3_reloc_extent ext;
+	struct inode *tmp_inode = NULL;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	struct ext3_inode_info *tmp_ei;
+	struct super_block *sb = inode->i_sb;
+	loff_t blocks = (i_size_read(inode) + sb->s_blocksize-1) >>
+			sb->s_blocksize_bits;
+	loff_t i;
+	handle_t *handle;
+	__le32 tmp_i_data[EXT3_N_BLOCKS];
+
+	/* FIXME: Maybe rewrite like in the style of direct IO? */
+	if (inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			/* We don't need to wait for data, but we need to wait for I_LOCK */
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0,
+		};
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			return ret;
+	}
+	if (copy_from_user(&ext, ext_user, sizeof(ext)))
+		return -EFAULT;
+	if (ext.len <= 0)	/* len is signed: reject negatives too */
+		return -EINVAL;
+
+	/* We modify sb+inode+bitmap+descriptor */
+	handle = ext3_journal_start(inode, 4);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	tmp_inode = ext3_new_inode(handle, sb->s_root->d_inode, S_IFREG);
+	if (IS_ERR(tmp_inode)) {
+		ext3_journal_stop(handle);
+		return PTR_ERR(tmp_inode);
+	}
+	i_size_write(tmp_inode, i_size_read(inode));
+	tmp_inode->i_nlink = 0;
+	/* Orphan the temporary inode so that journal replay cleans it up
+	 * after a crash. NOTE(review): the result in ret is never checked. */
+	ret = ext3_orphan_add(handle, tmp_inode);
+	ext3_journal_stop(handle);
+
+	tmp_ei = EXT3_I(tmp_inode);
+	mutex_lock(&inode->i_mutex);
+	for (i = 0; i < EXT3_NDIR_BLOCKS && i < blocks; i++) {
+		if (ei->i_data[i] == 0) {
+			tmp_ei->i_data[i] = 0;
+			continue;
+		}
+		ret = get_next_reloc_extent(&ext, ext_user, &act_ext, extents);
+		if (ret < 0)
+			goto out;
+		/* We modify inode, bitmap, sb, descriptor */
+		handle = ext3_journal_start(inode, 4);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		tmp_ei->i_data[i] = cpu_to_le32(
+			alloc_reloc_extent(handle, tmp_inode, &ext, &count, &ret));
+		ext3_mark_inode_dirty(handle, tmp_inode);
+		ext3_journal_stop(handle);
+		if (!tmp_ei->i_data[i])
+			goto out;
+	}
+	if (i >= blocks)
+		goto copy_data;
+
+	ret = reloc_tree(tmp_inode, 0, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_IND_BLOCK], tmp_ei->i_data+EXT3_IND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+	i += EXT3_ADDR_PER_BLOCK(sb);
+	if (blocks <= i)
+		goto copy_data;
+	ret = reloc_tree(tmp_inode, 1, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_DIND_BLOCK], tmp_ei->i_data+EXT3_DIND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+	i += 1 << 2*EXT3_ADDR_PER_BLOCK_BITS(sb);
+	if (blocks <= i)
+		goto copy_data;
+	ret = reloc_tree(tmp_inode, 2, i, &ext, ext_user, &act_ext, extents,
+		ei->i_data[EXT3_TIND_BLOCK], tmp_ei->i_data+EXT3_TIND_BLOCK,
+		NULL);
+	if (ret < 0)
+		goto out;
+copy_data:
+	/* Currently simple, later we may do something more clever */
+	for (i = 0; i < blocks; i += EXT3_MAX_TRANS_DATA) {
+		struct buffer_head *in_bh[EXT3_MAX_TRANS_DATA];
+		struct buffer_head *out_bh[EXT3_MAX_TRANS_DATA];
+
+		/* Prepare all buffers for copying */
+		count = min(EXT3_MAX_TRANS_DATA, (unsigned)(blocks-i));
+		for (j = 0; j < count; j++) {
+			in_bh[j] = ext3_bread(NULL, inode, i+j, 0, &ret);
+			if (!in_bh[j]) {
+				journal_brelse_array(in_bh, j);
+				goto out;
+			}
+		}
+		for (j = 0; j < count; j++) {
+			out_bh[j] = ext3_getblk(NULL, tmp_inode, i+j, 0, &ret);
+			if (!out_bh[j]) {
+				journal_brelse_array(out_bh, j);
+				journal_brelse_array(in_bh, count);
+				goto out;
+			}
+		}
+		/* Copy data */
+		/* Only journalled data needs a credit per copied block */
+		handle = ext3_journal_start(inode,
+			ext3_should_journal_data(inode) ? count : 1);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			journal_brelse_array(in_bh, count);
+			journal_brelse_array(out_bh, count);
+			goto out;
+		}
+		for (j = 0; j < count; j++) {
+			if (ext3_should_journal_data(inode)) {
+				ret = ext3_journal_get_write_access(handle,
+					out_bh[j]);
+				if (ret)
+					goto release_buffers;
+			}
+			lock_buffer(out_bh[j]);
+			memcpy(out_bh[j]->b_data, in_bh[j]->b_data,
+				out_bh[j]->b_size);
+			set_buffer_uptodate(out_bh[j]);
+			unlock_buffer(out_bh[j]);
+			ret = 0;
+			if (ext3_should_journal_data(inode)) {
+				ret = ext3_journal_dirty_metadata(handle,
+					out_bh[j]);
+			} else {
+				if (ext3_should_order_data(inode))
+					ret = ext3_journal_dirty_data(handle,
+						out_bh[j]);
+				mark_buffer_dirty(out_bh[j]);
+			}
+			if (ret)
+				goto release_buffers;
+		}
+release_buffers:
+		ext3_journal_stop(handle);
+		journal_brelse_array(in_bh, count);
+		journal_brelse_array(out_bh, count);
+		if (ret)
+			goto out;
+	}
+
+	/* Need to modify 2 inodes + superblock */
+	handle = ext3_journal_start(inode, 3);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	/* Finally file is duplicated. Just swap blocks... */
+	memcpy(tmp_i_data, ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	memcpy(ei->i_data, tmp_ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	memcpy(tmp_ei->i_data, tmp_i_data, sizeof(__le32)*EXT3_N_BLOCKS);
+	ext3_mark_inode_dirty(handle, inode);
+	ext3_mark_inode_dirty(handle, tmp_inode);
+	ext3_orphan_del(handle, tmp_inode);
+	ext3_journal_stop(handle);
+	/* We know that there are no writers and all data has been written */
+	ret = invalidate_inode_pages2(inode->i_mapping);
+	if (ret)
+		printk(KERN_WARNING "Cannot invalidate inode pages in ext3_move_file_blocks!\n");
+
+out:
+	mutex_unlock(&inode->i_mutex);
+	if (tmp_inode)
+		iput(tmp_inode);
+	return ret;
+}
diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/ioctl.c linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c
--- linux-2.6.18/fs/ext3/ioctl.c	2006-09-27 13:08:35.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c	2006-10-20 01:51:59.000000000 +0200
@@ -246,7 +246,33 @@ flags_err:
 
 		return err;
 	}
+	/*
+	 * Relocate the file into the blocks listed in the caller's extent
+	 * array. deny_write_access() is held across the move.
+	 */
+	case EXT3_IOC_FILE_RELOC: {
+		struct ext3_file_move_data input;
+		int err;
 
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+		if (IS_RDONLY(inode))
+			return -EROFS;
+		if (copy_from_user(&input, (struct ext3_file_move_data __user *)arg, sizeof(input)))
+			return -EFAULT;
+		/* extents is a signed, user-supplied count: insist on >= 1 */
+		if (input.extents <= 0)
+			return -EINVAL;
+		if (!access_ok(VERIFY_READ, input.ext_array, input.extents * sizeof(struct ext3_reloc_extent)))
+			return -EFAULT;
+
+		err = deny_write_access(filp);
+		if (err)
+			return err;
+		err = ext3_move_file_blocks(inode, input.extents, input.ext_array);
+		allow_write_access(filp);
+		return err;
+	}
 
 	default:
 		return -ENOTTY;
diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/include/linux/ext3_fs.h linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h
--- linux-2.6.18/include/linux/ext3_fs.h	2006-09-27 13:09:04.000000000 +0200
+++ linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h	2006-10-20 01:51:47.000000000 +0200
@@ -233,6 +233,7 @@ struct ext3_new_group_data {
 #endif
 #define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
+#define EXT3_IOC_FILE_RELOC		_IOR('f', 9, struct ext3_file_move_data)
 
 /*
  *  Mount options
@@ -598,6 +599,22 @@ static inline int ext3_valid_inum(struct
 #define EXT3_DEFM_JMODE_WBACK	0x0060
 
 /*
+ * File relocation structures
+ */
+
+struct ext3_reloc_extent {
+	ext3_fsblk_t start;	/* first block of the extent to allocate from */
+	loff_t len;		/* blocks left in the extent */
+};
+
+struct ext3_file_move_data {
+	int extents;		/* number of entries in ext_array */
+	struct ext3_reloc_extent __user *ext_array;	/* consumed in order */
+};
+
+int ext3_move_file_blocks(struct inode *inode, int extents, struct ext3_reloc_extent __user *ext_user);
+
+/*
  * Structure of a directory entry
  */
 #define EXT3_NAME_LEN 255
#include <stdio.h>
#include <sys/ioctl.h>
#include <asm/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <linux/fs.h>
#include <unistd.h>

#define BLOCKSIZE 1024
#define BLOCKS 1024
#define FILENAME "reloc_test_file"
#define BUFSIZE 64

/*
 * User-space mirror of the kernel's struct ext3_reloc_extent. The kernel
 * declares start as ext3_fsblk_t and prints such values with %lu, i.e. it is
 * an unsigned long there; "unsigned int" here would leave the upper half of
 * start as uninitialized padding on 64-bit systems.
 */
struct ext3_reloc_extent {
        unsigned long start;    /* first block of the target extent */
        loff_t len;             /* number of blocks in the extent */
};

/* Argument block passed to the relocation ioctl issued from main() */
struct ext3_file_move_data {
        int extents;                            /* number of entries in ext_array */
        struct ext3_reloc_extent *ext_array;    /* extents to relocate into */
};

/*
 * Demonstrate the ext3 relocation ioctl on the file named by argv[1].
 *
 * A scratch file of BLOCKS blocks is written and mapped with FIBMAP; the runs
 * of consecutive block numbers become the extent list. The scratch file is
 * already unlinked, so closing it is expected to free those blocks again, and
 * the target file is then asked to move into them.
 *
 * Returns 0 on success, 1 on any failure.
 */
int main(int argc, char **argv)
{
	char buf[BUFSIZE*BLOCKSIZE];
	int fd;
	int i, extents;
	unsigned int blockmap[BLOCKS];
	struct ext3_reloc_extent ext[BLOCKS];
	struct ext3_file_move_data data;

	/* Validate usage before creating (and unlinking) the scratch file */
	if (argc != 2) {
		puts("Usage: ext3_reloc_test filename");
		return 1;
	}
	fd = open(FILENAME, O_CREAT | O_TRUNC | O_WRONLY, S_IRWXU);
	if (fd < 0) {
		perror("Cannot open file for block allocation");
		return 1;
	}
	unlink(FILENAME);
	/*
	 * Zero the structures handed to the kernel, padding included, so that
	 * no uninitialized stack bytes are passed along.
	 */
	memset(ext, 0, sizeof(ext));
	memset(&data, 0, sizeof(data));
	memset(buf, 1, sizeof(buf));
	for (i = 0; i < BLOCKS; i += BUFSIZE)
		if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
			perror("Cannot write data");
			goto fail;
		}
	for (i = 0, extents = 0; i < BLOCKS; i++) {
		blockmap[i] = i;
		if (ioctl(fd, FIBMAP, &(blockmap[i])) < 0) {
			perror("ioctl");
			goto fail;
		}
		/* FIBMAP reports 0 for a block that is not mapped */
		if (!blockmap[i]) {
			fprintf(stderr, "Block %d of %s is not mapped "
				"(filesystem block size != %d?)\n",
				i, FILENAME, BLOCKSIZE);
			goto fail;
		}
		if (!i)
			ext[0].start = blockmap[0];
		else if (blockmap[i-1]+1 != blockmap[i]) {
			/* Discontinuity: close this extent and open the next */
			ext[extents].len = blockmap[i-1]-ext[extents].start+1;
			extents++;
			ext[extents].start = blockmap[i];
		}
	}
	ext[extents].len = blockmap[BLOCKS-1]-ext[extents].start+1;
	extents++;
	close(fd);
	sync();
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("Cannot open file to relocate");
		return 1;
	}
	data.extents = extents;
	data.ext_array = ext;
	if (ioctl(fd, _IOR('f', 9, struct ext3_file_move_data), &data) < 0) {
		perror("Move ioctl");
		goto fail;
	}
	close(fd);
	return 0;
fail:
	close(fd);
	return 1;
}

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux