Shaohua, On Tue, Sep 22, 2009 at 01:49:13PM +0800, Li, Shaohua wrote: > Hi, > Commit d7831a0bdf06b9f722b947bb0c205ff7d77cebd8 causes disk io regression > in my test. > My system has 12 disks, each disk has two partitions. System runs fio sequence > write on all partitions, each partion has 8 jobs. > 2.6.31-rc1, fio gives 460m/s disk io > 2.6.31-rc2, fio gives about 400m/s disk io. Revert the patch, speed back to > 460m/s > > Under latest git: fio gives 450m/s disk io; If reverting the patch, the speed > is 484m/s. > > With the patch, fio reports less io merge and more interrupts. My naive > analysis is the patch makes balance_dirty_pages_ratelimited_nr() limits > write chunk to 8 pages and then soon go to sleep in balance_dirty_pages(), > because most time the bdi_nr_reclaimable < bdi_thresh, and so when write > the pages out, the chunk is 8 pages long instead of 4M long. Without the patch, > thread can write 8 pages and then move some pages to writeback, and then > continue doing write. The patch seems to break this. Do you have trace/numbers for above descriptions? > Unfortunatelly I can't figure out a fix for this issue, hopefully > you have more ideas. Attached is a very verbose writeback debug patch, hope it helps and won't disturb the workload a lot :) Thanks, Fengguang
--- linux-2.6.orig/fs/fs-writeback.c 2009-08-23 14:44:22.000000000 +0800 +++ linux-2.6/fs/fs-writeback.c 2009-09-22 18:45:06.000000000 +0800 @@ -26,6 +26,9 @@ #include "internal.h" +int sysctl_dirty_debug __read_mostly; + + /** * writeback_acquire - attempt to get exclusive writeback access to a device * @bdi: the device's backing_dev_info structure @@ -186,6 +189,11 @@ static int write_inode(struct inode *ino return 0; } +#define redirty_tail(inode) \ + do { \ + __redirty_tail(inode, __LINE__); \ + } while (0) + /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. @@ -195,10 +203,15 @@ static int write_inode(struct inode *ino * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ -static void redirty_tail(struct inode *inode) +static void __redirty_tail(struct inode *inode, int line) { struct super_block *sb = inode->i_sb; + if (sysctl_dirty_debug) { + printk(KERN_DEBUG "redirty_tail +%d: inode %lu\n", + line, inode->i_ino); + } + if (!list_empty(&sb->s_dirty)) { struct inode *tail_inode; @@ -210,12 +223,22 @@ static void redirty_tail(struct inode *i list_move(&inode->i_list, &sb->s_dirty); } +#define requeue_io(inode) \ + do { \ + __requeue_io(inode, __LINE__); \ + } while (0) + /* * requeue inode for re-scanning after sb->s_io list is exhausted. */ -static void requeue_io(struct inode *inode) +static void __requeue_io(struct inode *inode, int line) { list_move(&inode->i_list, &inode->i_sb->s_more_io); + + if (sysctl_dirty_debug) { + printk(KERN_DEBUG "requeue_io +%d: inode %lu\n", + line, inode->i_ino); + } } static void inode_sync_complete(struct inode *inode) --- linux-2.6.orig/include/linux/writeback.h 2009-08-23 14:44:23.000000000 +0800 +++ linux-2.6/include/linux/writeback.h 2009-09-22 18:29:05.000000000 +0800 @@ -168,5 +168,6 @@ void writeback_set_ratelimit(void); extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. */ +extern int sysctl_dirty_debug; #endif /* WRITEBACK_H */ --- linux-2.6.orig/kernel/sysctl.c 2009-08-23 14:44:23.000000000 +0800 +++ linux-2.6/kernel/sysctl.c 2009-09-22 18:29:05.000000000 +0800 @@ -1516,6 +1516,14 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "dirty_debug", + .data = &sysctl_dirty_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { .ctl_name = CTL_UNNUMBERED, --- linux-2.6.orig/mm/page-writeback.c 2009-08-23 14:44:23.000000000 +0800 +++ linux-2.6/mm/page-writeback.c 2009-09-22 18:45:50.000000000 +0800 @@ -116,6 +116,35 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ +#define writeback_debug_report(n, wbc) do { \ + if(sysctl_dirty_debug) \ + __writeback_debug_report(n, wbc, \ + __FILE__, __LINE__, __FUNCTION__); \ +} while (0) + +void print_writeback_control(struct writeback_control *wbc) +{ + printk(KERN_DEBUG + "global dirty=%lu writeback=%lu nfs=%lu " + "flags=%c%c towrite=%ld skipped=%ld\n", + global_page_state(NR_FILE_DIRTY), + global_page_state(NR_WRITEBACK), + global_page_state(NR_UNSTABLE_NFS), + wbc->encountered_congestion ? 'C':'_', + wbc->more_io ? 'M':'_', + wbc->nr_to_write, + wbc->pages_skipped); +} + +void __writeback_debug_report(long n, struct writeback_control *wbc, + const char *file, int line, const char *func) +{ + printk(KERN_DEBUG "%s %d %s: comm=%s pid=%d n=%ld\n", + file, line, func, + current->comm, current->pid, + n); + print_writeback_control(wbc); +} static void background_writeout(unsigned long _min_pages); @@ -550,7 +579,12 @@ static void balance_dirty_pages(struct a pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + writeback_debug_report(pages_written, &wbc); } + printk("bdi_nr_reclaimable=%lu, bdi_thresh=%lu, " + "background_thresh=%lu, dirty_thresh=%lu\n", + bdi_nr_reclaimable, bdi_thresh, + background_thresh, dirty_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -670,6 +704,11 @@ void throttle_vm_writeout(gfp_t gfp_mask global_page_state(NR_WRITEBACK) <= dirty_thresh) break; congestion_wait(BLK_RW_ASYNC, HZ/10); + printk(KERN_DEBUG "throttle_vm_writeout: " + "congestion_wait on %lu+%lu > %lu\n", + global_page_state(NR_UNSTABLE_NFS), + global_page_state(NR_WRITEBACK), + dirty_thresh); /* * The caller might hold locks which can prevent IO completion @@ -719,7 +758,9 @@ static void background_writeout(unsigned else break; } + writeback_debug_report(min_pages, &wbc); } + writeback_debug_report(min_pages, &wbc); } /* @@ -792,7 +833,9 @@ static void wb_kupdate(unsigned long arg break; /* All the old data is written */ } nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + writeback_debug_report(nr_to_write, &wbc); } + writeback_debug_report(nr_to_write, &wbc); if (time_before(next_jif, jiffies + HZ)) next_jif = jiffies + HZ; if (dirty_writeback_interval)