Hello, I've written a simple patch implementing an ext3 ioctl for file relocation. Basically, you call the ioctl on a file and give it a list of blocks, and it relocates the file into the given blocks (provided they are still free). The idea is to use it as the kernel part of an ext3 online defragmenter (or, more generally, a disk access optimizer). I don't yet have the user-space part that finds larger runs of free blocks and so on, so it cannot really be used as a defragmenter yet. I'm sending this as a kind of proof of concept to hear some comments. Attached is also a simple program that demonstrates the use of the ioctl. Thanks in advance for suggestions/comments. Honza -- Jan Kara <jack@xxxxxxx> SuSE CR Labs
Implement ext3 ioctl for relocation of file into a given set of blocks. The function also allocates those blocks (provided they are still free). We simply build new indirect-tree in given blocks, copy data to it and in the end we swap pointers to blocks from the inode. Signed-off-by: Jan Kara <jack@xxxxxxx> diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/inode.c linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c --- linux-2.6.18/fs/ext3/inode.c 2006-09-27 13:08:35.000000000 +0200 +++ linux-2.6.18-1-defragment-ext3/fs/ext3/inode.c 2006-10-20 17:54:57.000000000 +0200 @@ -3219,3 +3219,360 @@ int ext3_change_inode_journal_flag(struc return err; } + /* Ensure *ext still has blocks to hand out. If the current extent is used up, step *act_ext to the next index and copy that entry from the user array ext_user into *ext. Returns 0 on success, -ENOSPC once the index reaches extents, -EFAULT if copy_from_user() fails, -EINVAL if the fetched extent has zero length. */ +static inline int get_next_reloc_extent(struct ext3_reloc_extent *ext, + struct ext3_reloc_extent __user *ext_user, int *act_ext, int extents) +{ + /* Still some blocks in the current extent? */ + if (ext->len) + return 0; + /* Not enough extents? */ + if (++(*act_ext) >= extents) + return -ENOSPC; + if (copy_from_user(ext, ext_user+*act_ext, sizeof(*ext))) + return -EFAULT; + /* Invalid extent? */ + if (!ext->len) + return -EINVAL; + return 0; +} + /* Try to allocate *blocks blocks starting exactly at ext->start. *blocks is first clamped to ext->len, then ext3_new_blocks() is called with ext->start as the goal. On success the extent is advanced past the allocated range and the first block number is returned. Returns 0 on failure: either ext3_new_blocks() returned 0 (error code presumably left in *err by that call; verify) or it returned a block other than ext->start, in which case the blocks are freed again and *err is set to -ENOSPC. NOTE(review): the code relies on *blocks holding the number of blocks really allocated after ext3_new_blocks() returns; confirm against that function. */ +static ext3_fsblk_t alloc_reloc_extent(handle_t *handle, struct inode *inode, + struct ext3_reloc_extent *ext, unsigned long *blocks, int *err) +{ + ext3_fsblk_t ret; + + if (*blocks > ext->len) + *blocks = ext->len; + ret = ext3_new_blocks(handle, inode, ext->start, blocks, err); + if (!ret) + return 0; + + /* Required block not free? 
*/ + if (ret != ext->start) { + ext3_free_blocks(handle, inode, ret, *blocks); + *err = -ENOSPC; + return 0; + } + ext->start += *blocks; + ext->len -= *blocks; + return ret; +} + /* Duplicate one indirect block of the old block tree into blocks taken from the relocation extents. oblk is the (little-endian) number of the old indirect block; the number of its newly allocated copy is stored through nblk. nblk_bh is the buffer containing *nblk, or NULL when *nblk lives in the inode, and decides whether the buffer or the inode is marked dirty. depth == 0 means the entries of oblk point to data blocks, for which new blocks are allocated in runs; for depth > 0 every non-zero entry is handled by a recursive call with depth-1. pos is the logical file block covered by the first entry; entries at or beyond the block count derived from i_size_read(inode) are not visited. Only block pointers are set up here, no file data is copied. Returns 0 on success or a negative error code. */ +static int reloc_tree(struct inode *inode, int depth, loff_t pos, + struct ext3_reloc_extent *ext, struct ext3_reloc_extent __user *ext_user, + int *act_ext, int extents, __le32 oblk, __le32 *nblk, + struct buffer_head *nblk_bh) +{ + struct buffer_head *obh, *nbh = NULL; + handle_t *handle; + ext3_fsblk_t newblock; + unsigned long count = 1; + int ret, i, j; + loff_t blocks = (i_size_read(inode) + inode->i_sb->s_blocksize-1) >> + inode->i_sb->s_blocksize_bits; + + if (!oblk) { + *nblk = 0; + return 0; + } + obh = sb_bread(inode->i_sb, le32_to_cpu(oblk)); + if (!obh) + return -EIO; + ret = get_next_reloc_extent(ext, ext_user, act_ext, extents); + if (ret < 0) + goto out_bh; + /* We modify nblk, bitmap, sb, descriptor, buffer */ /* NOTE(review): five objects are listed above but only 4 credits are requested; confirm the credit count. */ + handle = ext3_journal_start(inode, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_bh; + } + /* First we allocate indirect block */ + newblock = alloc_reloc_extent(handle, inode, ext, &count, &ret); + if (!newblock) + goto out_trans; + nbh = sb_getblk(inode->i_sb, newblock); + if (!nbh) { + ret = -EIO; + goto out_alloc; + } + lock_buffer(nbh); + ret = ext3_journal_get_create_access(handle, nbh); + if (ret) { + unlock_buffer(nbh); + goto out_alloc; + } + memset(nbh->b_data, 0, nbh->b_size); + set_buffer_uptodate(nbh); + unlock_buffer(nbh); + *nblk = cpu_to_le32(newblock); + if (nblk_bh) + ret = ext3_journal_dirty_metadata(handle, nblk_bh); + else + ret = ext3_mark_inode_dirty(handle, inode); + if (ret) { + *nblk = 0; + goto out_alloc; + } + ext3_journal_stop(handle); + + printk("Indirect block allocated.\n"); + /* Now it's time to allocate further data/indirect blocks */ + if (depth) { /* interior level: recurse into every non-zero entry; pos advances by 1 << (depth * EXT3_ADDR_PER_BLOCK_BITS) file blocks per entry */ + for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) && + pos < blocks; i++, pos += + 1 << depth*EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb)) { + if (!((__le32 
*)obh->b_data)[i]) + continue; + ret = reloc_tree(inode, depth-1, pos, ext, ext_user, + act_ext, extents, ((__le32 *)obh->b_data)[i], + ((__le32 *)nbh->b_data)+i, nbh); + if (ret) + goto out_bh; + } + } else { /* leaf level: entries of the old block point to data blocks */ + for (i = 0; i < EXT3_ADDR_PER_BLOCK(inode->i_sb) && + pos < blocks; i++, pos++) { + if (!((__le32 *)obh->b_data)[i]) + continue; + ret = get_next_reloc_extent(ext, ext_user, act_ext, + extents); + if (ret < 0) + goto out_bh; + /* We modify sb+descriptor+bitmap+nbh */ + handle = ext3_journal_start(inode, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_bh; + } + /* Compute size of continuous extent we may have */ + for (count = 0; + i+count < EXT3_ADDR_PER_BLOCK(inode->i_sb) && + pos+count < blocks && + ((__le32 *)obh->b_data)[i+count]; + count++); /* empty body: the loop only counts consecutive non-zero entries */ + printk("Going to allocated extent of lenght %lu\n", count); + /* Allocate extent and store block pointers */ + newblock = alloc_reloc_extent(handle, inode, ext, + &count, &ret); + printk("Got extent from block %lu, lenght %lu\n", newblock, count); + if (!newblock) + goto out_trans; + for (j = 0; j < count; j++) + ((__le32 *)nbh->b_data)[i+j] = + cpu_to_le32(newblock+j); + printk("Pointers stored, going to dirty metadata.\n"); /* NOTE(review): nbh was given create access under the first handle, which has already been stopped, and no write access is requested for nbh under this handle. Confirm the journal accepts dirtying it here. */ + ret = ext3_journal_dirty_metadata(handle, nbh); + if (ret) { + memset(((__le32 *)nbh->b_data)+i, 0, + sizeof(__le32)*count); + goto out_alloc; + } + ext3_journal_stop(handle); /* Skip the entries just filled; the loop increment accounts for the last one. */ + i += count-1; + pos += count-1; + } + } + ret = 0; + goto out_bh; /* Error paths below: free the last allocation, stop the handle, drop the buffer references. */ +out_alloc: + ext3_free_blocks(handle, inode, newblock, count); +out_trans: + ext3_journal_stop(handle); +out_bh: + brelse(obh); + if (nbh) + brelse(nbh); + return ret; +} + +/* + * Move file into new blocks. Outline as implemented below: write back the inode if it is dirty, create a temporary inode with i_nlink 0 on the orphan list, allocate new direct blocks and indirect trees for it from the user supplied extents (extents entries at ext_user), copy the data block by block, swap the i_data arrays of the two inodes and invalidate the page cache of inode. Returns 0 on success or a negative error code. + */ +int ext3_move_file_blocks(struct inode *inode, int extents, + struct ext3_reloc_extent __user *ext_user) +{ + int ret, act_ext = 0, j; + unsigned long count = 1; + struct ext3_reloc_extent ext; + struct inode *tmp_inode = NULL; + struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_inode_info *tmp_ei; + struct 
super_block *sb = inode->i_sb; + loff_t blocks = (i_size_read(inode) + sb->s_blocksize-1) >> + sb->s_blocksize_bits; + loff_t i; + handle_t *handle; + __le32 tmp_i_data[EXT3_N_BLOCKS]; + + /* FIXME: Maybe rewrite like in the style of direct IO? */ + if (inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + struct writeback_control wbc = { + /* We don't need to wait for data, but we need to wait for I_LOCK */ + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; + ret = sync_inode(inode, &wbc); + if (ret) + return ret; + } /* The first extent is fetched unconditionally, before extents is consulted. */ + if (copy_from_user(&ext, ext_user, sizeof(ext))) + return -EFAULT; + if (!ext.len) + return -EINVAL; + + /* We modify sb+inode+bitmap+descriptor */ + handle = ext3_journal_start(inode, 4); + if (IS_ERR(handle)) + return PTR_ERR(handle); + tmp_inode = ext3_new_inode(handle, sb->s_root->d_inode, S_IFREG); /* NOTE(review): the result is used right away without an IS_ERR() check; verify how ext3_new_inode() reports failure. */ + i_size_write(tmp_inode, i_size_read(inode)); + tmp_inode->i_nlink = 0; + /* Add inode to the orphan list in case we crash so that replay + * takes care after it */ + ret = ext3_orphan_add(handle, tmp_inode); /* NOTE(review): ret is not checked here and is overwritten later. */ + ext3_journal_stop(handle); + + tmp_ei = EXT3_I(tmp_inode); /* NOTE(review): blocks (from i_size) and the dirty-state check above were evaluated before i_mutex is taken. */ + mutex_lock(&inode->i_mutex); /* Direct blocks: count was initialised to 1 and is not reset inside this loop. */ + for (i = 0; i < EXT3_NDIR_BLOCKS && i < blocks; i++) { + if (ei->i_data[i] == 0) { + tmp_ei->i_data[i] = 0; + continue; + } + ret = get_next_reloc_extent(&ext, ext_user, &act_ext, extents); + if (ret < 0) + goto out; + /* We modify inode, bitmap, sb, descriptor */ + handle = ext3_journal_start(inode, 4); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + tmp_ei->i_data[i] = cpu_to_le32( + alloc_reloc_extent(handle, tmp_inode, &ext, &count, &ret)); + ext3_mark_inode_dirty(handle, tmp_inode); + ext3_journal_stop(handle); + if (!tmp_ei->i_data[i]) + goto out; + } + if (i >= blocks) + goto copy_data; + /* Indirect trees: single, double, then triple; i tracks the first logical block covered by the next tree. */ + ret = reloc_tree(tmp_inode, 0, i, &ext, ext_user, &act_ext, extents, + ei->i_data[EXT3_IND_BLOCK], tmp_ei->i_data+EXT3_IND_BLOCK, + NULL); + if (ret < 0) + goto out; + i += EXT3_ADDR_PER_BLOCK(sb); + if (blocks <= i) + goto copy_data; 
+ ret = reloc_tree(tmp_inode, 1, i, &ext, ext_user, &act_ext, extents, + ei->i_data[EXT3_DIND_BLOCK], tmp_ei->i_data+EXT3_DIND_BLOCK, + NULL); + if (ret < 0) + goto out; + i += 1 << 2*EXT3_ADDR_PER_BLOCK_BITS(sb); + if (blocks <= i) + goto copy_data; + ret = reloc_tree(tmp_inode, 2, i, &ext, ext_user, &act_ext, extents, + ei->i_data[EXT3_TIND_BLOCK], tmp_ei->i_data+EXT3_TIND_BLOCK, + NULL); + if (ret < 0) + goto out; +copy_data: + /* Currently simple, later we may do something more clever */ + for (i = 0; i < blocks; i += EXT3_MAX_TRANS_DATA) { + struct buffer_head *in_bh[EXT3_MAX_TRANS_DATA]; + struct buffer_head *out_bh[EXT3_MAX_TRANS_DATA]; + + /* Prepare all buffers for copying */ + count = min(EXT3_MAX_TRANS_DATA, (unsigned)(blocks-i)); + for (j = 0; j < count; j++) { + in_bh[j] = ext3_bread(NULL, inode, i+j, 0, &ret); /* NOTE(review): if ext3_bread() can return NULL with ret == 0 (for example for a hole; confirm), the exit below reports success without the blocks having been swapped. */ + if (!in_bh[j]) { + journal_brelse_array(in_bh, j); + goto out; + } + } + for (j = 0; j < count; j++) { + out_bh[j] = ext3_getblk(NULL, tmp_inode, i+j, 0, &ret); + if (!out_bh[j]) { + journal_brelse_array(out_bh, j); + journal_brelse_array(in_bh, count); + goto out; + } + } + /* Copy data */ + if (ext3_should_journal_data(inode)) + handle = ext3_journal_start(inode, count); + else + /* No metadata, just data => no credits needed */ + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) { /* NOTE(review): this exit skips release_buffers, so the in_bh and out_bh references taken above are not dropped. */ + ret = PTR_ERR(handle); + goto out; + } + for (j = 0; j < count; j++) { + if (ext3_should_journal_data(inode)) { + ret = ext3_journal_get_write_access(handle, + out_bh[j]); + if (ret) + goto release_buffers; + } + lock_buffer(out_bh[j]); + memcpy(out_bh[j]->b_data, in_bh[j]->b_data, + out_bh[j]->b_size); + set_buffer_uptodate(out_bh[j]); + unlock_buffer(out_bh[j]); + ret = 0; + if (ext3_should_journal_data(inode)) { + ret = ext3_journal_dirty_metadata(handle, + out_bh[j]); + } + else { + if (ext3_should_order_data(inode)) + ret = ext3_journal_dirty_data(handle, + out_bh[j]); + mark_buffer_dirty(out_bh[j]); + } + if (ret) + goto release_buffers; + + } 
+release_buffers: /* Reached on both success and failure of the batch; ret tells which. */ + ext3_journal_stop(handle); + journal_brelse_array(in_bh, count); + journal_brelse_array(out_bh, count); + if (ret) + goto out; + } + + /* Need to modify 2 inodes + superblock */ + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + /* Finally file is duplicated. Just swap blocks... */ + memcpy(tmp_i_data, ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS); + memcpy(ei->i_data, tmp_ei->i_data, sizeof(__le32)*EXT3_N_BLOCKS); + memcpy(tmp_ei->i_data, tmp_i_data, sizeof(__le32)*EXT3_N_BLOCKS); + ext3_mark_inode_dirty(handle, inode); + ext3_mark_inode_dirty(handle, tmp_inode); + ext3_orphan_del(handle, tmp_inode); + ext3_journal_stop(handle); + /* We know that there are no writers and all data has been written */ + ret = invalidate_inode_pages2(inode->i_mapping); + if (ret) { + printk(KERN_WARNING "Cannot invalidate inode pages in ext3_move_file_blocks!\n"); + goto out; + } + +out: /* After a successful swap tmp_inode references the old blocks. NOTE(review): presumably the final iput() of this i_nlink == 0 inode releases them; confirm. */ + mutex_unlock(&inode->i_mutex); + if (tmp_inode) + iput(tmp_inode); + return ret; +} diff -rupX /home/jack/.kerndiffexclude linux-2.6.18/fs/ext3/ioctl.c linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c --- linux-2.6.18/fs/ext3/ioctl.c 2006-09-27 13:08:35.000000000 +0200 +++ linux-2.6.18-1-defragment-ext3/fs/ext3/ioctl.c 2006-10-20 01:51:59.000000000 +0200 @@ -246,7 +246,26 @@ flags_err: return err; } + case EXT3_IOC_FILE_RELOC: { /* Relocate the blocks of this file into the extents described by the user argument. Needs CAP_SYS_RESOURCE, rejects read-only inodes with -EROFS, and denies write access to the file while the move runs. */ + struct ext3_file_move_data input; + int err; + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (copy_from_user(&input, (struct ext3_file_move_data __user *)arg, sizeof(input))) + return -EFAULT; /* NOTE(review): input.extents is a signed int taken from user space and is multiplied into the size below without any range check. */ + if (!access_ok(VERIFY_READ, input.ext_array, input.extents * sizeof(struct ext3_reloc_extent))) + return -EFAULT; + + err = deny_write_access(filp); + if (err) + return err; + err = ext3_move_file_blocks(inode, input.extents, input.ext_array); + allow_write_access(filp); + return err; + } default: return -ENOTTY; diff -rupX /home/jack/.kerndiffexclude 
linux-2.6.18/include/linux/ext3_fs.h linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h --- linux-2.6.18/include/linux/ext3_fs.h 2006-09-27 13:09:04.000000000 +0200 +++ linux-2.6.18-1-defragment-ext3/include/linux/ext3_fs.h 2006-10-20 01:51:47.000000000 +0200 @@ -233,6 +233,7 @@ struct ext3_new_group_data { #endif #define EXT3_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT3_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT3_IOC_FILE_RELOC _IOR('f', 9, struct ext3_file_move_data) /* NOTE(review): the handler only copies this argument in from user space; check whether _IOW is the intended direction macro. */ /* * Mount options @@ -598,6 +599,22 @@ static inline int ext3_valid_inum(struct #define EXT3_DEFM_JMODE_WBACK 0x0060 /* + * File relocation structures + */ + +struct ext3_reloc_extent { + ext3_fsblk_t start; /* first target block of the extent; advanced as blocks are consumed */ + loff_t len; /* blocks left in the extent. NOTE(review): signed type, and the visible code only rejects len == 0 */ +}; + +struct ext3_file_move_data { + int extents; /* number of entries in ext_array */ + struct ext3_reloc_extent __user *ext_array; /* user-space array of target extents */ +}; + +int ext3_move_file_blocks(struct inode *inode, int extents, struct ext3_reloc_extent __user *ext_user); + +/* * Structure of a directory entry */ #define EXT3_NAME_LEN 255
#include <stdio.h> #include <sys/ioctl.h> #include <asm/ioctl.h> #include <fcntl.h> #include <string.h> #include <sys/stat.h> #include <linux/fs.h> #include <unistd.h> #define BLOCKSIZE 1024 #define BLOCKS 1024 #define FILENAME "reloc_test_file" #define BUFSIZE 64 struct ext3_reloc_extent { unsigned int start; loff_t len; }; struct ext3_file_move_data { int extents; struct ext3_reloc_extent *ext_array; }; int main(int argc, char **argv) { char buf[BUFSIZE*BLOCKSIZE]; int fd = open(FILENAME, O_CREAT | O_TRUNC | O_WRONLY, S_IRWXU); int i, extents; unsigned int blockmap[BLOCKS]; struct ext3_reloc_extent ext[BLOCKS]; struct ext3_file_move_data data; if (fd < 0) { perror("Cannot open file for block allocation"); return 1; } unlink(FILENAME); if (argc != 2) { puts("Usage: ext3_reloc_test filename"); return 1; } memset(buf, 1, sizeof(buf)); for (i = 0; i < BLOCKS; i += BUFSIZE) if (write(fd, buf, sizeof(buf)) != sizeof(buf)) { perror("Cannot write data"); return 1; } for (i = 0, extents = 0; i < BLOCKS; i++) { blockmap[i] = i; if (ioctl(fd, FIBMAP, &(blockmap[i])) < 0) { perror("ioctl"); return 1; } if (!i) ext[0].start = blockmap[0]; else if (i && blockmap[i-1]+1 != blockmap[i]) { ext[extents].len = blockmap[i-1]-ext[extents].start+1; extents++; ext[extents].start = blockmap[i]; } } ext[extents].len = blockmap[BLOCKS-1]-ext[extents].start+1; extents++; close(fd); sync(); fd = open(argv[1], O_RDONLY); if (fd < 0) { perror("Cannot open file to relocate"); return 1; } data.extents = extents; data.ext_array = ext; if (ioctl(fd, _IOR('f', 9, struct ext3_file_move_data), &data) < 0) { perror("Move ioctl"); return 1; } close(fd); return 0; }