[RFC] basic delayed allocation in ext4

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Good day,

please review ...

thanks, Alex

Basic delayed allocation in ext4

Two special ->get_block() methods are introduced:

 * ext4_da_get_block_prep()
   to be used with ->prepare_write(), defers allocation till flush
 * ext4_da_get_block_write()
   to be used with mpage_da_writepages(), allocate blocks and correct on-disk size

Current implementation works with data=writeback only, you should
mount filesystem with delalloc,data=writeback options.

TODO:
 * reservation
 * data=ordered
 * quota
 * bmap

Signed-off-by: Alex Tomas <alex@xxxxxxxxxxxxx>


Index: linux-2.6.22/include/linux/ext4_fs.h
===================================================================
--- linux-2.6.22.orig/include/linux/ext4_fs.h	2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/include/linux/ext4_fs.h	2007-07-26 12:32:04.000000000 +0400
@@ -488,6 +488,7 @@ do {									       \
 #define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
+#define EXT4_MOUNT_DELALLOC		0x2000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
Index: linux-2.6.22/fs/ext4/super.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/super.c	2007-07-26 12:30:25.000000000 +0400
+++ linux-2.6.22/fs/ext4/super.c	2007-07-26 12:32:04.000000000 +0400
@@ -728,7 +728,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents, Opt_noextents,
+	Opt_grpquota, Opt_extents, Opt_noextents, Opt_delalloc,
 };

 static match_table_t tokens = {
@@ -782,6 +782,7 @@ static match_table_t tokens = {
 	{Opt_barrier, "barrier=%u"},
 	{Opt_extents, "extents"},
 	{Opt_noextents, "noextents"},
+	{Opt_delalloc, "delalloc"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
 };
@@ -1127,6 +1128,9 @@ clear_qf_name:
 		case Opt_noextents:
 			clear_opt (sbi->s_mount_opt, EXTENTS);
 			break;
+		case Opt_delalloc:
+			set_opt (sbi->s_mount_opt, DELALLOC);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT4-fs: Unrecognized mount option \"%s\" "
Index: linux-2.6.22/fs/ext4/inode.c
===================================================================
--- linux-2.6.22.orig/fs/ext4/inode.c	2007-07-26 12:30:22.000000000 +0400
+++ linux-2.6.22/fs/ext4/inode.c	2007-07-26 12:32:04.000000000 +0400
@@ -39,6 +39,8 @@
 #include "xattr.h"
 #include "acl.h"

+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -1291,6 +1293,142 @@ static int ext4_journalled_commit_write(
 }

 /*
+ * this is a special callback for ->prepare_write() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create)
+{
+	int ret = 0;
+
+	BUG_ON(create == 0);
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+	/* first, we need to know whether the block is allocated already
+	 * XXX: when the filesystem has a lot of free blocks, we could
+	 * reserve even allocated blocks to save this lookup */
+	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0);
+	if (ret >= 0) {
+		if (buffer_mapped(bh_result)) {
+			bh_result->b_size = (ret << inode->i_blkbits);
+		} else {
+			/* OK, the block isn't allocated yet, let's reserve space */
+			/* XXX: call reservation here */
+			/* XXX: __block_prepare_write() unmaps passed block, is it OK? */
+			map_bh(bh_result, inode->i_sb, 0);
+			set_buffer_new(bh_result);
+			set_buffer_delay(bh_result);
+		}
+		ret = 0;
+	}
+
+	return ret;
+}
+
+
+static int ext4_da_prepare_write(struct file *file, struct page *page,
+			      		unsigned from, unsigned to)
+{
+	return block_prepare_write(page, from, to, ext4_da_get_block_prep);
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+				struct buffer_head *bh_result, int create)
+{
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	loff_t disksize = EXT4_I(inode)->i_disksize;
+	handle_t *handle = NULL;
+
+	if (create) {
+		handle = ext4_journal_start(inode, needed_blocks);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+	}
+
+	ret = ext4_get_blocks_wrap(handle, inode, iblock,
+				max_blocks, bh_result, create, 0);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+
+		/*
+		 * Update on-disk size along with block allocation
+		 * we don't use 'extend_disksize' as size may change
+		 * within already allocated block -bzzz
+		 */
+		disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+		if (disksize > i_size_read(inode))
+			disksize = i_size_read(inode);
+		if (disksize > EXT4_I(inode)->i_disksize) {
+			/*
+			 * XXX: replace with spinlock if seen contended -bzzz
+			 */
+			mutex_lock(&EXT4_I(inode)->truncate_mutex);
+			if (disksize > EXT4_I(inode)->i_disksize)
+				EXT4_I(inode)->i_disksize = disksize;
+			mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+			if (EXT4_I(inode)->i_disksize == disksize) {
+				if (handle == NULL)
+					handle = ext4_journal_start(inode, 1);
+				if (!IS_ERR(handle))
+					ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (handle && !IS_ERR(handle))
+		ext4_journal_stop(handle);
+
+	return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+				struct writeback_control *wbc)
+{
+	return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh;
+	unsigned int curr_off = 0;
+
+	/*
+	 * Drop reserved blocks
+	 */
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	bh = head = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off && buffer_delay(bh)) {
+			clear_buffer_delay(bh);
+			/* XXX: add real stuff here */
+		}
+		curr_off = next_off;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+out:
+	ext4_invalidatepage(page, offset);
+
+	return;
+}
+
+
+/*
  * bmap() is special.  It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1741,10 +1879,28 @@ static const struct address_space_operat
 	.releasepage	= ext4_releasepage,
 };

+static const struct address_space_operations ext4_da_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_writeback_writepage,
+	.writepages	= ext4_da_writepages,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_da_prepare_write,
+	.commit_write	= generic_commit_write,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_da_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+			test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux