Re: regression in page writeback

Shaohua,

On Tue, Sep 22, 2009 at 01:49:13PM +0800, Li, Shaohua wrote:
> Hi,
> Commit d7831a0bdf06b9f722b947bb0c205ff7d77cebd8 causes a disk io regression
> in my test.
> My system has 12 disks, each with two partitions. The system runs fio
> sequential writes on all partitions, with 8 jobs per partition.
> 2.6.31-rc1: fio gives 460MB/s of disk io.
> 2.6.31-rc2: fio gives about 400MB/s. Reverting the patch brings the speed
> back to 460MB/s.
> 
> Under the latest git, fio gives 450MB/s; with the patch reverted, the
> speed is 484MB/s.
> 
> With the patch, fio reports fewer io merges and more interrupts. My naive
> analysis is that the patch makes balance_dirty_pages_ratelimited_nr() limit
> the write chunk to 8 pages, and the task then soon goes to sleep in
> balance_dirty_pages(), because most of the time bdi_nr_reclaimable < bdi_thresh;
> so when the pages are written out, the chunk is 8 pages long instead of 4MB.
> Without the patch, a thread can write 8 pages, move some pages to writeback,
> and then continue writing. The patch seems to break this.

Do you have traces/numbers backing up the above description?
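
For context, the logic in question looks roughly like this (paraphrased from
memory of the 2.6.31-era mm/page-writeback.c, so treat it as a sketch rather
than a verbatim quote):

	/*
	 * Sketch of the balance_dirty_pages() throttle loop around the
	 * disputed test.  Before d7831a0bdf06 the condition was just
	 * "if (bdi_nr_reclaimable)", so an over-limit dirtier would push
	 * out a full write_chunk by itself; afterwards it only does so
	 * when the bdi is over its own threshold, otherwise it falls
	 * through to congestion_wait() and sleeps.
	 */
	if (bdi_nr_reclaimable > bdi_thresh) {	/* was: if (bdi_nr_reclaimable) */
		writeback_inodes(&wbc);
		pages_written += write_chunk - wbc.nr_to_write;
		get_dirty_limits(&background_thresh, &dirty_thresh,
				 &bdi_thresh, bdi);
	}
	...
	congestion_wait(BLK_RW_ASYNC, HZ/10);

	/*
	 * And in balance_dirty_pages_ratelimited_nr(): once dirty_exceeded
	 * is set on the bdi, a dirtying task re-enters balance_dirty_pages()
	 * every 8 pages instead of every ratelimit_pages -- the "8 pages"
	 * you mention.
	 */
	ratelimit = ratelimit_pages;
	if (mapping->backing_dev_info->dirty_exceeded)
		ratelimit = 8;

If that reading is right, it would explain the smaller requests: an over-limit
dirtier now mostly sleeps instead of pushing out its own 4MB chunk.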

> Unfortunately I can't figure out a fix for this issue; hopefully
> you have more ideas.

Attached is a very verbose writeback debug patch; hope it helps and
doesn't disturb the workload too much :)
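
With the patch applied, the redirty_tail/requeue_io and writeback_debug_report
messages are gated on a new sysctl: echo 1 > /proc/sys/fs/dirty_debug turns
them on and echo 0 turns them off again; the bdi_nr_reclaimable/bdi_thresh
line in balance_dirty_pages() and the throttle_vm_writeout message are printed
unconditionally. Everything goes through printk, so capturing dmesg around the
fio run should be enough to collect it.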

Thanks,
Fengguang

--- linux-2.6.orig/fs/fs-writeback.c	2009-08-23 14:44:22.000000000 +0800
+++ linux-2.6/fs/fs-writeback.c	2009-09-22 18:45:06.000000000 +0800
@@ -26,6 +26,9 @@
 #include "internal.h"
 
 
+int sysctl_dirty_debug __read_mostly;
+
+
 /**
  * writeback_acquire - attempt to get exclusive writeback access to a device
  * @bdi: the device's backing_dev_info structure
@@ -186,6 +189,11 @@ static int write_inode(struct inode *ino
 	return 0;
 }
 
+#define redirty_tail(inode)						\
+	do {								\
+		__redirty_tail(inode, __LINE__);			\
+	} while (0)
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -195,10 +203,15 @@ static int write_inode(struct inode *ino
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
-static void redirty_tail(struct inode *inode)
+static void __redirty_tail(struct inode *inode, int line)
 {
 	struct super_block *sb = inode->i_sb;
 
+	if (sysctl_dirty_debug) {
+		printk(KERN_DEBUG "redirty_tail +%d: inode %lu\n",
+				line, inode->i_ino);
+	}
+
 	if (!list_empty(&sb->s_dirty)) {
 		struct inode *tail_inode;
 
@@ -210,12 +223,22 @@ static void redirty_tail(struct inode *i
 	list_move(&inode->i_list, &sb->s_dirty);
 }
 
+#define requeue_io(inode)						\
+	do {								\
+		__requeue_io(inode, __LINE__);				\
+	} while (0)
+
 /*
  * requeue inode for re-scanning after sb->s_io list is exhausted.
  */
-static void requeue_io(struct inode *inode)
+static void __requeue_io(struct inode *inode, int line)
 {
 	list_move(&inode->i_list, &inode->i_sb->s_more_io);
+
+	if (sysctl_dirty_debug) {
+		printk(KERN_DEBUG "requeue_io +%d: inode %lu\n",
+				line, inode->i_ino);
+	}
 }
 
 static void inode_sync_complete(struct inode *inode)
--- linux-2.6.orig/include/linux/writeback.h	2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/include/linux/writeback.h	2009-09-22 18:29:05.000000000 +0800
@@ -168,5 +168,6 @@ void writeback_set_ratelimit(void);
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
 				   read-only. */
 
+extern int sysctl_dirty_debug;
 
 #endif		/* WRITEBACK_H */
--- linux-2.6.orig/kernel/sysctl.c	2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/kernel/sysctl.c	2009-09-22 18:29:05.000000000 +0800
@@ -1516,6 +1516,14 @@ static struct ctl_table fs_table[] = {
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "dirty_debug",
+		.data		= &sysctl_dirty_debug,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 	{
 		.ctl_name	= CTL_UNNUMBERED,
--- linux-2.6.orig/mm/page-writeback.c	2009-08-23 14:44:23.000000000 +0800
+++ linux-2.6/mm/page-writeback.c	2009-09-22 18:45:50.000000000 +0800
@@ -116,6 +116,35 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
+#define writeback_debug_report(n, wbc) do {                             \
+	if (sysctl_dirty_debug)					\
+		__writeback_debug_report(n, wbc,		\
+				__FILE__, __LINE__, __func__);	\
+} while (0)
+
+void print_writeback_control(struct writeback_control *wbc)
+{
+	printk(KERN_DEBUG
+			"global dirty=%lu writeback=%lu nfs=%lu "
+			"flags=%c%c towrite=%ld skipped=%ld\n",
+			global_page_state(NR_FILE_DIRTY),
+			global_page_state(NR_WRITEBACK),
+			global_page_state(NR_UNSTABLE_NFS),
+			wbc->encountered_congestion ? 'C':'_',
+			wbc->more_io ? 'M':'_',
+			wbc->nr_to_write,
+			wbc->pages_skipped);
+}
+
+void __writeback_debug_report(long n, struct writeback_control *wbc,
+		const char *file, int line, const char *func)
+{
+	printk(KERN_DEBUG "%s %d %s: comm=%s pid=%d n=%ld\n",
+			file, line, func,
+			current->comm, current->pid,
+			n);
+	print_writeback_control(wbc);
+}
 
 static void background_writeout(unsigned long _min_pages);
 
@@ -550,7 +579,12 @@ static void balance_dirty_pages(struct a
 			pages_written += write_chunk - wbc.nr_to_write;
 			get_dirty_limits(&background_thresh, &dirty_thresh,
 				       &bdi_thresh, bdi);
+			writeback_debug_report(pages_written, &wbc);
 		}
+		printk("bdi_nr_reclaimable=%lu, bdi_thresh=%lu, "
+		       "background_thresh=%lu, dirty_thresh=%lu\n",
+		       bdi_nr_reclaimable, bdi_thresh,
+		       background_thresh, dirty_thresh);
 
 		/*
 		 * In order to avoid the stacked BDI deadlock we need
@@ -670,6 +704,11 @@ void throttle_vm_writeout(gfp_t gfp_mask
 			global_page_state(NR_WRITEBACK) <= dirty_thresh)
                         	break;
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
+		printk(KERN_DEBUG "throttle_vm_writeout: "
+				"congestion_wait on %lu+%lu > %lu\n",
+				global_page_state(NR_UNSTABLE_NFS),
+				global_page_state(NR_WRITEBACK),
+				dirty_thresh);
 
 		/*
 		 * The caller might hold locks which can prevent IO completion
@@ -719,7 +758,9 @@ static void background_writeout(unsigned
 			else
 				break;
 		}
+		writeback_debug_report(min_pages, &wbc);
 	}
+	writeback_debug_report(min_pages, &wbc);
 }
 
 /*
@@ -792,7 +833,9 @@ static void wb_kupdate(unsigned long arg
 				break;	/* All the old data is written */
 		}
 		nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		writeback_debug_report(nr_to_write, &wbc);
 	}
+	writeback_debug_report(nr_to_write, &wbc);
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
 	if (dirty_writeback_interval)
