Eric had asked me about the CFS delete thread patch, in the context of speeding up unlinking. While this doesn't actually speed up unlinking, it does reduce the latency of unlink() or truncate(0), and unless the application is permanently unlink bound it will be fine. The 2.4.29 kernel version is below. This is the last one we did before we started supporting 2.6 kernels, when extents were added to our code. The "Almost like daemonize()" code is there to keep a reference on the "mount" fs context before daemonize() is called (which reparents the context to that of init), and then to reinstate the fs context afterwards. I think there was a reason for this, but I'm not sure if it is still relevant because the thread does not actually do any path operations (which is where kernel threads sharing a fs context get in trouble). The check in ext3_sync_fs() to see if the fs is being unmounted can likely be done in ext3_put_super() for 2.6. Signed-off-by: Andreas Dilger <adilger@xxxxxxxxxxxxx> Index: linux-2.4.29/fs/ext3/super.c =================================================================== --- linux-2.4.29.orig/fs/ext3/super.c 2005-05-03 15:53:33.047533872 +0300 +++ linux-2.4.29/fs/ext3/super.c 2005-05-03 15:54:47.192262160 +0300 @@ -400,6 +400,127 @@ } } +#ifdef EXT3_DELETE_THREAD +/* + * Delete inodes in a loop until there are no more to be deleted. + * Normally, we run in the background doing the deletes and sleeping again, + * and clients just add new inodes to be deleted onto the end of the list. + * If someone is concerned about free space (e.g. block allocation or similar) + * then they can sleep on s_delete_waiter_queue and be woken up when space + * has been freed. 
+ */ +int ext3_delete_thread(void *data) +{ + struct super_block *sb = data; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct task_struct *tsk = current; + struct fs_struct *fs = current->fs; + + /* Almost like daemonize, but keep a separate filesystem context */ + atomic_inc(&fs->count); + daemonize(name); + exit_fs(current); + current->fs = fs; + set_fs_pwd(current->fs, init_task.fs->pwdmnt, init_task.fs->pwd); + + snprintf(tsk->comm,sizeof(tsk->comm),"kdelext3-%s",kdevname(sb->s_dev)); + sigfillset(&tsk->blocked); + + /*tsk->flags |= PF_KERNTHREAD;*/ + + INIT_LIST_HEAD(&sbi->s_delete_list); + wake_up(&sbi->s_delete_waiter_queue); + ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); + + /* main loop */ + for (;;) { + wait_event_interruptible(sbi->s_delete_thread_queue, + !list_empty(&sbi->s_delete_list) || + !test_opt(sb, ASYNCDEL)); + ext3_debug("%s woken up: %lu inodes, %lu blocks\n", + tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); + + spin_lock(&sbi->s_delete_lock); + if (list_empty(&sbi->s_delete_list)) { + clear_opt(sbi->s_mount_opt, ASYNCDEL); + memset(&sbi->s_delete_list, 0, + sizeof(sbi->s_delete_list)); + spin_unlock(&sbi->s_delete_lock); + ext3_debug("delete thread on %s exiting\n", + kdevname(sb->s_dev)); + wake_up(&sbi->s_delete_waiter_queue); + break; + } + + while (!list_empty(&sbi->s_delete_list)) { + struct inode *inode=list_entry(sbi->s_delete_list.next, + struct inode, i_devices); + unsigned long blocks = inode->i_blocks >> + (inode->i_blkbits - 9); + + list_del_init(&inode->i_devices); + spin_unlock(&sbi->s_delete_lock); + ext3_debug("%s delete ino %lu blk %lu\n", + tsk->comm, inode->i_ino, blocks); + + J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE); + J_ASSERT(inode->i_nlink == 1); + inode->i_nlink = 0; + iput(inode); + + spin_lock(&sbi->s_delete_lock); + sbi->s_delete_blocks -= blocks; + sbi->s_delete_inodes--; + } + if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { + ext3_warning(sb, __FUNCTION__, + 
"%lu blocks, %lu inodes on list?\n", + sbi->s_delete_blocks,sbi->s_delete_inodes); + sbi->s_delete_blocks = 0; + sbi->s_delete_inodes = 0; + } + spin_unlock(&sbi->s_delete_lock); + wake_up(&sbi->s_delete_waiter_queue); + } + + return 0; +} + +static void ext3_start_delete_thread(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int rc; + + spin_lock_init(&sbi->s_delete_lock); + init_waitqueue_head(&sbi->s_delete_thread_queue); + init_waitqueue_head(&sbi->s_delete_waiter_queue); + + if (!test_opt(sb, ASYNCDEL)) + return; + + rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); + if (rc < 0) + printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", + rc); + else + wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); +} + +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) +{ + if (sbi->s_delete_list.next == 0) /* thread never started */ + return; + + clear_opt(sbi->s_mount_opt, ASYNCDEL); + wake_up(&sbi->s_delete_thread_queue); + wait_event(sbi->s_delete_waiter_queue, + sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); +} +#else +#define ext3_start_delete_thread(sbi) do {} while(0) +#define ext3_stop_delete_thread(sbi) do {} while(0) +#endif /* EXT3_DELETE_THREAD */ + void ext3_put_super (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -407,6 +528,9 @@ kdev_t j_dev = sbi->s_journal->j_dev; int i; +#ifdef EXT3_DELETE_THREAD + J_ASSERT(sbi->s_delete_inodes == 0); +#endif ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { @@ -526,6 +650,13 @@ clear_opt (*mount_options, XATTR_USER); else #endif +#ifdef EXT3_DELETE_THREAD + if (!strcmp(this_char, "asyncdel")) + set_opt(*mount_options, ASYNCDEL); + else if (!strcmp(this_char, "noasyncdel")) + clear_opt(*mount_options, ASYNCDEL); + else +#endif if (!strcmp (this_char, "bsddf")) clear_opt (*mount_options, MINIX_DF); else if (!strcmp (this_char, "nouid32")) { @@ -1244,6 +1375,7 @@ } 
ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + ext3_start_delete_thread(sb); EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; @@ -1626,7 +1758,12 @@ static int ext3_sync_fs(struct super_block *sb) { tid_t target; - + + if (atomic_read(&sb->s_active) == 0) { + /* fs is being umounted: time to stop delete thread */ + ext3_stop_delete_thread(EXT3_SB(sb)); + } + sb->s_dirt = 0; target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -1690,6 +1827,9 @@ if (!parse_options(data, &tmp, sbi, &tmp, 1)) return -EINVAL; + if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) + ext3_stop_delete_thread(sbi); + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ext3_abort(sb, __FUNCTION__, "Abort forced by user"); Index: linux-2.4.29/fs/ext3/inode.c =================================================================== --- linux-2.4.29.orig/fs/ext3/inode.c 2005-05-03 15:53:36.555000656 +0300 +++ linux-2.4.29/fs/ext3/inode.c 2005-05-03 15:53:56.901907456 +0300 @@ -2562,6 +2562,116 @@ return err; } +#ifdef EXT3_DELETE_THREAD +/* Move blocks from to-be-truncated inode over to a new inode, and delete + * that one from the delete thread instead. This avoids a lot of latency + * when truncating large files. + * + * If we have any problem deferring the truncate, just truncate it right away. + * If we defer it, we also mark how many blocks it would free, so that we + * can keep the statfs data correct, and we know if we should sleep on the + * delete thread when we run out of space. 
*/ +void ext3_truncate_thread(struct inode *old_inode) +{ + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); + struct inode *new_inode; + handle_t *handle; + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); + + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) + goto out_truncate; + + /* XXX This is a temporary limitation for code simplicity. + * We could truncate to arbitrary sizes at some later time. */ + if (old_inode->i_size != 0) + goto out_truncate; + + /* We may want to truncate the inode immediately and not defer it */ + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || + old_inode->i_size > oei->i_disksize) + goto out_truncate; + + /* We can't use the delete thread as-is during real orphan recovery, + * as we add to the orphan list here, causing ext3_orphan_cleanup() + * to loop endlessly. It would be nice to do so, but needs work. */ + if (oei->i_state & EXT3_STATE_DELETE || + sbi->s_mount_state & EXT3_ORPHAN_FS) { + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", + old_inode->i_ino, blocks); + goto out_truncate; + } + + ext3_discard_prealloc(old_inode); + + /* old_inode = 1 + * new_inode = sb + GDT + ibitmap + * orphan list = 1 inode/superblock for add, 2 inodes for del + * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + */ + handle = ext3_journal_start(old_inode, 7); + if (IS_ERR(handle)) + goto out_truncate; + + new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); + if (IS_ERR(new_inode)) { + ext3_debug("truncate inode %lu directly (no new inodes)\n", + old_inode->i_ino); + goto out_journal; + } + + nei = EXT3_I(new_inode); + + down_write(&oei->truncate_sem); + new_inode->i_size = old_inode->i_size; + new_inode->i_blocks = old_inode->i_blocks; + new_inode->i_uid = old_inode->i_uid; + new_inode->i_gid = old_inode->i_gid; + new_inode->i_flags = old_inode->i_flags; + new_inode->i_nlink = 1; + + /* FIXME when we do 
arbitrary truncates */ + old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0; + old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; + + memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); + memset(oei->i_data, 0, sizeof(oei->i_data)); + + nei->i_disksize = oei->i_disksize; + nei->i_state |= EXT3_STATE_DELETE; + up_write(&oei->truncate_sem); + + if (ext3_orphan_add(handle, new_inode) < 0) + goto out_journal; + + if (ext3_orphan_del(handle, old_inode) < 0) { + ext3_orphan_del(handle, new_inode); + iput(new_inode); + goto out_journal; + } + + ext3_journal_stop(handle, old_inode); + + spin_lock(&sbi->s_delete_lock); + J_ASSERT(list_empty(&new_inode->i_devices)); + list_add_tail(&new_inode->i_devices, &sbi->s_delete_list); + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); + + ext3_debug("delete inode %lu (%lu blocks) by thread\n", + new_inode->i_ino, blocks); + + wake_up(&sbi->s_delete_thread_queue); + return; + +out_journal: + ext3_journal_stop(handle, old_inode); +out_truncate: + ext3_truncate(old_inode); +} +#endif /* EXT3_DELETE_THREAD */ + /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. 
Index: linux-2.4.29/fs/ext3/file.c =================================================================== --- linux-2.4.29.orig/fs/ext3/file.c 2005-04-07 19:31:00.000000000 +0300 +++ linux-2.4.29/fs/ext3/file.c 2005-05-03 15:53:56.902907304 +0300 @@ -123,7 +123,11 @@ }; struct inode_operations ext3_file_inode_operations = { +#ifdef EXT3_DELETE_THREAD + truncate: ext3_truncate_thread, /* BKL held */ +#else truncate: ext3_truncate, /* BKL held */ +#endif setattr: ext3_setattr, /* BKL held */ setxattr: ext3_setxattr, /* BKL held */ getxattr: ext3_getxattr, /* BKL held */ Index: linux-2.4.29/fs/ext3/namei.c =================================================================== --- linux-2.4.29.orig/fs/ext3/namei.c 2005-05-03 15:53:33.044534328 +0300 +++ linux-2.4.29/fs/ext3/namei.c 2005-05-03 15:53:56.905906848 +0300 @@ -838,6 +838,40 @@ return retval; } +#ifdef EXT3_DELETE_THREAD +static int ext3_try_to_delay_deletion(struct inode *inode) +{ + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long blocks; + + if (!test_opt(inode->i_sb, ASYNCDEL)) + return 0; + + /* We may want to delete the inode immediately and not defer it */ + blocks = inode->i_blocks >> (inode->i_blkbits - 9); + if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS + !!inode->i_file_acl) + return 0; + + inode->i_nlink = 1; + atomic_inc(&inode->i_count); + ei->i_state |= EXT3_STATE_DELETE; + + spin_lock(&sbi->s_delete_lock); + J_ASSERT(list_empty(&inode->i_devices)); + list_add_tail(&inode->i_devices, &sbi->s_delete_list); + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); + + wake_up(&sbi->s_delete_thread_queue); + + return 0; +} +#else +#define ext3_try_to_delay_deletion(inode) do {} while (0) +#endif + static int ext3_unlink(struct inode * dir, struct dentry *dentry) { int retval; @@ -878,8 +912,10 @@ dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ext3_mark_inode_dirty(handle, dir); inode->i_nlink--; - if 
(!inode->i_nlink) + if (!inode->i_nlink) { + ext3_try_to_delay_deletion(inode); ext3_orphan_add(handle, inode); + } inode->i_ctime = dir->i_ctime; ext3_mark_inode_dirty(handle, inode); retval = 0; Index: linux-2.4.29/include/linux/ext3_fs.h =================================================================== --- linux-2.4.29.orig/include/linux/ext3_fs.h 2005-05-03 15:53:37.124914016 +0300 +++ linux-2.4.29/include/linux/ext3_fs.h 2005-05-03 15:53:56.907906544 +0300 @@ -188,6 +188,7 @@ */ #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ /* * ioctl commands @@ -315,6 +316,7 @@ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ +#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -639,6 +641,9 @@ extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); +#ifdef EXT3_DELETE_THREAD +extern void ext3_truncate_thread(struct inode *inode); +#endif extern void ext3_set_inode_flags(struct inode *); /* ioctl.c */ Index: linux-2.4.29/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.4.29.orig/include/linux/ext3_fs_sb.h 2005-05-03 15:53:33.048533720 +0300 +++ linux-2.4.29/include/linux/ext3_fs_sb.h 2005-05-03 15:53:56.909906240 +0300 @@ -29,6 +29,8 @@ #define EXT3_MAX_GROUP_LOADED 8 +#define EXT3_DELETE_THREAD + /* * third extended-fs super-block data in memory */ @@ -74,6 +76,14 @@ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to 
go read-only */ #endif +#ifdef EXT3_DELETE_THREAD + spinlock_t s_delete_lock; + struct list_head s_delete_list; + unsigned long s_delete_blocks; + unsigned long s_delete_inodes; + wait_queue_head_t s_delete_thread_queue; + wait_queue_head_t s_delete_waiter_queue; +#endif }; #endif /* _LINUX_EXT3_FS_SB */ Cheers, Andreas -- Andreas Dilger Sr. Staff Engineer, Lustre Group Sun Microsystems of Canada, Inc. -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html