[PATCH] [RFC] Asynchronous unlink/truncate patch for ext3

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Per discussion earlier today, included here is a copy of the old
asynchronous delete thread patch that we previously used in Lustre
to speed up unlink() and truncate(0) before we had extents.  This
patch is also available from the Lustre git repositories:

https://git.whamcloud.com/?p=fs/lustre-release.git;f=lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch;hb=9b6f9d17a35188f5f4dbfae840164b999a7a78a2

https://github.com/lustre/lustre-release/blob/1.4.10/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch

This patch is based on ext3 and the 2.4.29 kernel, so it will need
updating, but I think the general idea is reasonably solid, and the
code worked for years without issues before it was deprecated.  The
"asyncdel" mount option enables this behavior on a filesystem.

There is no support for extent-mapped files (which would only need
the extent flag copied over to the temporary inode), nor is there
support for metadata_csum: moving blocks to a new inode would leave
the checksums of the extent blocks invalid, similar to the
EXT4_BOOT_LOADER_INO issue that was recently hit.


When a file is truncated to zero, a temporary inode is allocated and
the file's blocks are moved to that temporary inode; when a file is
unlinked, the inode itself is queued instead.  In both cases a
per-superblock background thread is then woken to free the blocks.
The queued inode is added to the orphan list in case of a crash, and
is processed normally at mount time like other orphan inodes.

This not only defers the iteration over the inode's indirect blocks,
it also avoids increasing the foreground transaction size for the
truncate.

The blocks of files temporarily in the delete queue are tracked in
an in-memory counter (s_delete_blocks), and there is a wait queue
(s_delete_waiter_queue) for callers needing space to wait on.
However, the blocks are not actually subtracted from the statfs
counters in this patch, nor does block or inode allocation actually
wait on the thread if it runs out of space.

Signed-off-by: Andreas Dilger <adilger@xxxxxxxxx>
Signed-off-by: Alex Zhuravlev <alex@xxxxxxxxxxxxx>
---
 fs/ext3/file.c             |    4 +
 fs/ext3/inode.c            |  112 +++++++++++++++++++++++++++++++++++
 fs/ext3/namei.c            |   38 +++++++++++-
 fs/ext3/super.c            |  142 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/ext3_fs.h    |    5 +
 include/linux/ext3_fs_sb.h |   10 +++
 6 files changed, 309 insertions(+), 2 deletions(-)


Index: linux-2.4.29/fs/ext3/super.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/super.c	2005-05-03 15:53:33.047533872 +0300
+++ linux-2.4.29/fs/ext3/super.c	2005-05-03 15:54:47.192262160 +0300
@@ -400,6 +400,127 @@
 	}
 }

+#ifdef EXT3_DELETE_THREAD
+/*
+ * Delete inodes in a loop until there are no more to be deleted.
+ * Normally, we run in the background doing the deletes and sleeping again,
+ * and clients just add new inodes to be deleted onto the end of the list.
+ * If someone is concerned about free space (e.g. block allocation or similar)
+ * then they can sleep on s_delete_waiter_queue and be woken up when space
+ * has been freed.
+ */
+int ext3_delete_thread(void *data)
+{
+	struct super_block *sb = data;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct task_struct *tsk = current;
+
+	/* Almost like daemonize, but not quite */
+	exit_mm(current);
+	tsk->session = 1;
+	tsk->pgrp = 1;
+	tsk->tty = NULL;
+	exit_files(current);
+	reparent_to_init();
+
+	sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
+	sigfillset(&tsk->blocked);
+
+	/*tsk->flags |= PF_KERNTHREAD;*/
+
+	INIT_LIST_HEAD(&sbi->s_delete_list);
+	wake_up(&sbi->s_delete_waiter_queue);
+	ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
+
+	/* main loop */
+	for (;;) {
+		wait_event_interruptible(sbi->s_delete_thread_queue,
+					 !list_empty(&sbi->s_delete_list) ||
+					 !test_opt(sb, ASYNCDEL));
+		ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
+			   tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
+
+		spin_lock(&sbi->s_delete_lock);
+		if (list_empty(&sbi->s_delete_list)) {
+			clear_opt(sbi->s_mount_opt, ASYNCDEL);
+			memset(&sbi->s_delete_list, 0,
+			       sizeof(sbi->s_delete_list));
+			spin_unlock(&sbi->s_delete_lock);
+			ext3_debug("delete thread on %s exiting\n",
+				   kdevname(sb->s_dev));
+			wake_up(&sbi->s_delete_waiter_queue);
+			break;
+		}
+
+		while (!list_empty(&sbi->s_delete_list)) {
+			struct inode *inode=list_entry(sbi->s_delete_list.next,
+						       struct inode, i_devices);
+			unsigned long blocks = inode->i_blocks >>
+							(inode->i_blkbits - 9);
+
+			list_del_init(&inode->i_devices);
+			spin_unlock(&sbi->s_delete_lock);
+			ext3_debug("%s delete ino %lu blk %lu\n",
+				   tsk->comm, inode->i_ino, blocks);
+
+			J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE);
+			J_ASSERT(inode->i_nlink == 1);
+			inode->i_nlink = 0;
+			iput(inode);
+
+			spin_lock(&sbi->s_delete_lock);
+			sbi->s_delete_blocks -= blocks;
+			sbi->s_delete_inodes--;
+		}
+		if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
+			ext3_warning(sb, __FUNCTION__,
+				     "%lu blocks, %lu inodes on list?\n",
+				     sbi->s_delete_blocks,sbi->s_delete_inodes);
+			sbi->s_delete_blocks = 0;
+			sbi->s_delete_inodes = 0;
+		}
+		spin_unlock(&sbi->s_delete_lock);
+		wake_up(&sbi->s_delete_waiter_queue);
+	}
+
+	return 0;
+}
+
+static void ext3_start_delete_thread(struct super_block *sb)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	int rc;
+
+	spin_lock_init(&sbi->s_delete_lock);
+	init_waitqueue_head(&sbi->s_delete_thread_queue);
+	init_waitqueue_head(&sbi->s_delete_waiter_queue);
+
+	if (!test_opt(sb, ASYNCDEL))
+		return;
+
+	rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
+	if (rc < 0)
+		printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
+		       rc);
+	else
+		wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
+}
+
+static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
+{
+	if (sbi->s_delete_list.next == 0)	/* thread never started */
+		return;
+
+	clear_opt(sbi->s_mount_opt, ASYNCDEL);
+	wake_up(&sbi->s_delete_thread_queue);
+	wait_event(sbi->s_delete_waiter_queue,
+			sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
+}
+#else
+#define ext3_start_delete_thread(sbi) do {} while(0)
+#define ext3_stop_delete_thread(sbi) do {} while(0)
+#endif /* EXT3_DELETE_THREAD */
+
 void ext3_put_super (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -407,6 +528,9 @@
 	kdev_t j_dev = sbi->s_journal->j_dev;
 	int i;

+#ifdef EXT3_DELETE_THREAD
+	J_ASSERT(sbi->s_delete_inodes == 0);
+#endif
 	ext3_xattr_put_super(sb);
 	journal_destroy(sbi->s_journal);
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -526,6 +650,13 @@
 			clear_opt (*mount_options, XATTR_USER);
 		else
 #endif
+#ifdef EXT3_DELETE_THREAD
+		if (!strcmp(this_char, "asyncdel"))
+			set_opt(*mount_options, ASYNCDEL);
+		else if (!strcmp(this_char, "noasyncdel"))
+			clear_opt(*mount_options, ASYNCDEL);
+		else
+#endif
 		if (!strcmp (this_char, "bsddf"))
 			clear_opt (*mount_options, MINIX_DF);
 		else if (!strcmp (this_char, "nouid32")) {
@@ -1244,6 +1375,7 @@
 	}

 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	ext3_start_delete_thread(sb);
 	EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
 	ext3_orphan_cleanup(sb, es);
 	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
@@ -1626,7 +1758,12 @@
 static int ext3_sync_fs(struct super_block *sb)
 {
 	tid_t target;
-
+
+	if (atomic_read(&sb->s_active) == 0) {
+		/* fs is being umounted: time to stop delete thread */
+		ext3_stop_delete_thread(EXT3_SB(sb));
+	}
+
 	sb->s_dirt = 0;
 	target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
 	log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -1690,6 +1827,9 @@
 	if (!parse_options(data, &tmp, sbi, &tmp, 1))
 		return -EINVAL;

+	if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
+		ext3_stop_delete_thread(sbi);
+
 	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
 		ext3_abort(sb, __FUNCTION__, "Abort forced by user");

Index: linux-2.4.29/fs/ext3/inode.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/inode.c	2005-05-03 15:53:36.555000656 +0300
+++ linux-2.4.29/fs/ext3/inode.c	2005-05-03 15:53:56.901907456 +0300
@@ -2562,6 +2562,118 @@
 	return err;
 }

+#ifdef EXT3_DELETE_THREAD
+/* Move blocks from to-be-truncated inode over to a new inode, and delete
+ * that one from the delete thread instead.  This avoids a lot of latency
+ * when truncating large files.
+ *
+ * If we have any problem deferring the truncate, just truncate it right away.
+ * If we defer it, we also mark how many blocks it would free, so that we
+ * can keep the statfs data correct, and we know if we should sleep on the
+ * delete thread when we run out of space.
+ */
+void ext3_truncate_thread(struct inode *old_inode)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
+	struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
+	struct inode *new_inode;
+	handle_t *handle;
+	unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
+
+	if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
+		goto out_truncate;
+
+	/* XXX This is a temporary limitation for code simplicity.
+	 *     We could truncate to arbitrary sizes at some later time.
+	 */
+	if (old_inode->i_size != 0)
+		goto out_truncate;
+
+	/* We may want to truncate the inode immediately and not defer it */
+	if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
+	    old_inode->i_size > oei->i_disksize)
+		goto out_truncate;
+
+	/* We can't use the delete thread as-is during real orphan recovery,
+	 * as we add to the orphan list here, causing ext3_orphan_cleanup()
+	 * to loop endlessly.  It would be nice to do so, but needs work.
+	 */
+	if (oei->i_state & EXT3_STATE_DELETE ||
+	    sbi->s_mount_state & EXT3_ORPHAN_FS) {
+		ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
+			   old_inode->i_ino, blocks);
+		goto out_truncate;
+	}
+
+	ext3_discard_prealloc(old_inode);
+
+	/* old_inode   = 1
+	 * new_inode   = sb + GDT + ibitmap
+	 * orphan list = 1 inode/superblock for add, 2 inodes for del
+	 * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+	 */
+	handle = ext3_journal_start(old_inode, 7);
+	if (IS_ERR(handle))
+		goto out_truncate;
+
+	new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
+	if (IS_ERR(new_inode)) {
+		ext3_debug("truncate inode %lu directly (no new inodes)\n",
+			   old_inode->i_ino);
+		goto out_journal;
+	}
+
+	nei = EXT3_I(new_inode);
+
+	down_write(&oei->truncate_sem);
+	new_inode->i_size = old_inode->i_size;
+	new_inode->i_blocks = old_inode->i_blocks;
+	new_inode->i_uid = old_inode->i_uid;
+	new_inode->i_gid = old_inode->i_gid;
+	new_inode->i_nlink = 1;
+
+	/* FIXME when we do arbitrary truncates */
+	old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
+	old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
+
+	memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
+	memset(oei->i_data, 0, sizeof(oei->i_data));
+
+	nei->i_disksize = oei->i_disksize;
+	nei->i_state |= EXT3_STATE_DELETE;
+	up_write(&oei->truncate_sem);
+
+	if (ext3_orphan_add(handle, new_inode) < 0)
+		goto out_journal;
+
+	if (ext3_orphan_del(handle, old_inode) < 0) {
+		ext3_orphan_del(handle, new_inode);
+		iput(new_inode);
+		goto out_journal;
+	}
+
+	ext3_journal_stop(handle, old_inode);
+
+	spin_lock(&sbi->s_delete_lock);
+	J_ASSERT(list_empty(&new_inode->i_devices));
+	list_add_tail(&new_inode->i_devices, &sbi->s_delete_list);
+	sbi->s_delete_blocks += blocks;
+	sbi->s_delete_inodes++;
+	spin_unlock(&sbi->s_delete_lock);
+
+	ext3_debug("delete inode %lu (%lu blocks) by thread\n",
+		   new_inode->i_ino, blocks);
+
+	wake_up(&sbi->s_delete_thread_queue);
+	return;
+
+out_journal:
+	ext3_journal_stop(handle, old_inode);
+out_truncate:
+	ext3_truncate(old_inode);
+}
+#endif /* EXT3_DELETE_THREAD */
+
 /*
  * On success, We end up with an outstanding reference count against
  * iloc->bh.  This _must_ be cleaned up later.
Index: linux-2.4.29/fs/ext3/file.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/file.c	2005-04-07 19:31:00.000000000 +0300
+++ linux-2.4.29/fs/ext3/file.c	2005-05-03 15:53:56.902907304 +0300
@@ -123,7 +123,11 @@
 };

 struct inode_operations ext3_file_inode_operations = {
+#ifdef EXT3_DELETE_THREAD
+	truncate:	ext3_truncate_thread,	/* BKL held */
+#else
 	truncate:	ext3_truncate,		/* BKL held */
+#endif
 	setattr:	ext3_setattr,		/* BKL held */
 	setxattr:	ext3_setxattr,		/* BKL held */
 	getxattr:	ext3_getxattr,		/* BKL held */
Index: linux-2.4.29/fs/ext3/namei.c
===================================================================
--- linux-2.4.29.orig/fs/ext3/namei.c	2005-05-03 15:53:33.044534328 +0300
+++ linux-2.4.29/fs/ext3/namei.c	2005-05-03 15:53:56.905906848 +0300
@@ -838,6 +838,40 @@
 	return retval;
 }

+#ifdef EXT3_DELETE_THREAD
+static int ext3_try_to_delay_deletion(struct inode *inode)
+{
+	struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	unsigned long blocks;
+
+	if (!test_opt(inode->i_sb, ASYNCDEL))
+		return 0;
+
+	/* We may want to delete the inode immediately and not defer it */
+	blocks = inode->i_blocks >> (inode->i_blkbits - 9);
+	if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS)
+		return 0;
+
+	inode->i_nlink = 1;
+	atomic_inc(&inode->i_count);
+	ei->i_state |= EXT3_STATE_DELETE;
+
+	spin_lock(&sbi->s_delete_lock);
+	J_ASSERT(list_empty(&inode->i_devices));
+	list_add_tail(&inode->i_devices, &sbi->s_delete_list);
+	sbi->s_delete_blocks += blocks;
+	sbi->s_delete_inodes++;
+	spin_unlock(&sbi->s_delete_lock);
+
+	wake_up(&sbi->s_delete_thread_queue);
+
+	return 0;
+}
+#else
+#define ext3_try_to_delay_deletion(inode) do {} while (0)
+#endif
+
 static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 {
 	int retval;
@@ -878,8 +912,10 @@
 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
 	ext3_mark_inode_dirty(handle, dir);
 	inode->i_nlink--;
-	if (!inode->i_nlink)
+	if (!inode->i_nlink) {
+		ext3_try_to_delay_deletion(inode);
 		ext3_orphan_add(handle, inode);
+	}
 	inode->i_ctime = dir->i_ctime;
 	ext3_mark_inode_dirty(handle, inode);
 	retval = 0;
Index: linux-2.4.29/include/linux/ext3_fs.h
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_fs.h	2005-05-03 15:53:37.124914016 +0300
+++ linux-2.4.29/include/linux/ext3_fs.h	2005-05-03 15:53:56.907906544 +0300
@@ -188,6 +188,7 @@
  */
 #define EXT3_STATE_JDATA		0x00000001 /* journaled data exists */
 #define EXT3_STATE_NEW			0x00000002 /* inode is newly created */
+#define EXT3_STATE_DELETE		0x00000010 /* deferred delete inode */

 /*
  * ioctl commands
@@ -315,6 +316,7 @@
 #define EXT3_MOUNT_UPDATE_JOURNAL	0x1000	/* Update the journal format */
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
+#define EXT3_MOUNT_ASYNCDEL		0x20000 /* Delayed deletion */

 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -639,6 +641,9 @@
 extern void ext3_dirty_inode(struct inode *);
 extern int ext3_change_inode_journal_flag(struct inode *, int);
 extern void ext3_truncate (struct inode *);
+#ifdef EXT3_DELETE_THREAD
+extern void ext3_truncate_thread(struct inode *inode);
+#endif
 extern void ext3_set_inode_flags(struct inode *);

 /* ioctl.c */
Index: linux-2.4.29/include/linux/ext3_fs_sb.h
===================================================================
--- linux-2.4.29.orig/include/linux/ext3_fs_sb.h	2005-05-03 15:53:33.048533720 +0300
+++ linux-2.4.29/include/linux/ext3_fs_sb.h	2005-05-03 15:53:56.909906240 +0300
@@ -29,6 +29,8 @@

 #define EXT3_MAX_GROUP_LOADED	8

+#define EXT3_DELETE_THREAD
+
 /*
  * third extended-fs super-block data in memory
  */
@@ -74,6 +76,14 @@
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
 #endif
+#ifdef EXT3_DELETE_THREAD
+	spinlock_t s_delete_lock;
+	struct list_head s_delete_list;
+	unsigned long s_delete_blocks;
+	unsigned long s_delete_inodes;
+	wait_queue_head_t s_delete_thread_queue;
+	wait_queue_head_t s_delete_waiter_queue;
+#endif
 };

 #endif	/* _LINUX_EXT3_FS_SB */


Cheers, Andreas

Attachment: signature.asc
Description: Message signed with OpenPGP


[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux