[PATCH] fs: make sure the timestamps for lazytime inodes eventually get written

"Theodore Ts'o" <tytso@xxxxxxx> · Sat, 7 Mar 2015 00:34:08 -0500

Hi Jan,

I believe the following should address all of the issues that you
raised.  Could you take a look and let me know what you think?

Thanks!!

						- Ted

commit e42f32d0ed65059e5cb689cc0ac28099a729ba9a
Author: Theodore Ts'o <tytso@xxxxxxx>
Date:   Sat Mar 7 00:22:37 2015 -0500

    fs: make sure the timestamps for lazytime inodes eventually get written
    
    Jan Kara pointed out that if there is an inode which is constantly
    getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp
    will never be written since inode->dirtied_when is constantly getting
    updated.  We fix this by adding an extra field to the inode,
    dirtied_time_when, so inodes with a stale dirtytime can get detected
    and handled.
    
    Also add a sysctl tunable, dirtytime_expire_seconds so we can properly
    debug this code and make sure it all works.
    
    Finally, if we have a dirtytime inode caused by an atime update, and
    there is no write activity on the file system, we need to have a
    secondary system to make sure these inodes get written out.  We do
    this by setting up a second delayed work structure which wakes up the
    CPU much more rarely compared to writeback_expire_centisecs.
    
    Signed-off-by: Theodore Ts'o <tytso@xxxxxxx>

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e907052..260d7e7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,6 +53,9 @@ struct wb_writeback_work {
 	struct completion *done;	/* set if the caller waits */
 };
 
+/* 12 hours in seconds */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -275,8 +278,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 
 	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
 		older_than_this = work->older_than_this;
-	else if ((work->reason == WB_REASON_SYNC) == 0) {
-		expire_time = jiffies - (HZ * 86400);
+	else if (!work->for_sync) {
+		expire_time = jiffies - (dirtytime_expire_interval * HZ);
 		older_than_this = &expire_time;
 	}
 	while (!list_empty(delaying_queue)) {
@@ -458,6 +461,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 */
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
+		inode->dirtied_when = jiffies;
 		list_move(&inode->i_wb_list, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
@@ -741,6 +745,13 @@ static long writeback_sb_inodes(struct super_block *sb,
 		spin_lock(&inode->i_lock);
 		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
+		if ((inode->i_state & I_DIRTY_TIME) &&
+		    ((start_time - inode->dirtied_time_when) >
+		     (dirtytime_expire_interval * HZ))) {
+			inode->i_state &= ~I_DIRTY_TIME;
+			inode->i_state |= I_DIRTY_SYNC;
+			trace_writeback_lazytime(inode);
+		}
 		requeue_inode(inode, wb, &wbc);
 		inode_sync_complete(inode);
 		spin_unlock(&inode->i_lock);
@@ -1131,6 +1142,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	rcu_read_unlock();
 }
 
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes gets
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirtytime list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (list_empty(&bdi->wb.b_dirty_time))
+			continue;
+		bdi_wakeup_thread(bdi);
+	}
+	rcu_read_unlock();
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+	return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
+}
+
+
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 {
 	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1269,6 +1330,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
+			if (dirtytime)
+				inode->dirtied_time_when = jiffies;
+			if (flags & I_DIRTY_PAGES)
+				dirtytime = 0;
 			list_move(&inode->i_wb_list, dirtytime ?
 				  &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
 			spin_unlock(&bdi->wb.list_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b4d71b5..f4131e8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -604,6 +604,7 @@ struct inode {
 	struct mutex		i_mutex;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
+	unsigned long		dirtied_time_when;
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0004833..b2dd371e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -130,6 +130,7 @@ extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -146,6 +147,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 88ea2d6..ce410bb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1228,6 +1228,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "dirtytime_expire_seconds",
+		.data		= &dirtytime_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
+		.mode		= 0644,
+		.proc_handler	= dirtytime_interval_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname       = "nr_pdflush_threads",
 		.mode           = 0444 /* read-only */,
 		.proc_handler   = pdflush_proc_obsolete,
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html