[known bug: this patch will block sync(1) in schecule() if dirty threshold is set to as low as 1MB.] The generic writeback routines are departing from congestion_wait() in preference of get_request_wait(), aka. waiting on the block queues. Introduce the missing writeback wait queue for NFS, otherwise its writeback pages will grow out of control, exhausting all PG_dirty pages. CC: Jens Axboe <axboe@xxxxxxxxx> CC: Chris Mason <chris.mason@xxxxxxxxxx> CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> CC: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx> Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx> --- fs/nfs/client.c | 2 fs/nfs/write.c | 89 +++++++++++++++++++++++++++++++----- include/linux/nfs_fs_sb.h | 1 3 files changed, 81 insertions(+), 11 deletions(-) --- linux-next.orig/fs/nfs/write.c 2011-09-29 20:23:44.000000000 +0800 +++ linux-next/fs/nfs/write.c 2011-10-05 10:43:28.000000000 +0800 @@ -189,11 +189,64 @@ static int wb_priority(struct writeback_ * NFS congestion control */ +#define NFS_WAIT_PAGES (1024L >> (PAGE_SHIFT - 10)) int nfs_congestion_kb; -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) +/* + * SYNC requests will block on (2*limit) and wakeup on (2*limit-NFS_WAIT_PAGES) + * ASYNC requests will block on (limit) and wakeup on (limit - NFS_WAIT_PAGES) + * In this way SYNC writes will never be blocked by ASYNC ones. + */ + +static void nfs_set_congested(long nr, struct backing_dev_info *bdi) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr > limit && !test_bit(BDI_async_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_ASYNC); + else if (nr > 2 * limit && !test_bit(BDI_sync_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_SYNC); +} + +static void nfs_wait_congested(int is_sync, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + int waitbit = is_sync ? BDI_sync_congested : BDI_async_congested; + DEFINE_WAIT(wait); + + if (!test_bit(waitbit, &bdi->state)) + return; + + for (;;) { + prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(waitbit, &bdi->state)) + break; + + io_schedule(); + } + finish_wait(&wqh[is_sync], &wait); +} + +static void nfs_wakeup_congested(long nr, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + long limit = nfs_congestion_kb >> (PAGE_SHIFT - 10); + + if (nr < 2 * limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_sync_congested, &bdi->state)) + clear_bdi_congested(bdi, BLK_RW_SYNC); + if (waitqueue_active(&wqh[BLK_RW_SYNC])) + wake_up(&wqh[BLK_RW_SYNC]); + } + if (nr < limit - min(limit / 8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_async_congested, &bdi->state)) + clear_bdi_congested(bdi, BLK_RW_ASYNC); + if (waitqueue_active(&wqh[BLK_RW_ASYNC])) + wake_up(&wqh[BLK_RW_ASYNC]); + } +} static int nfs_set_page_writeback(struct page *page) { @@ -204,11 +257,8 @@ static int nfs_set_page_writeback(struct struct nfs_server *nfss = NFS_SERVER(inode); page_cache_get(page); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + nfs_set_congested(atomic_long_inc_return(&nfss->writeback), + &nfss->backing_dev_info); } return ret; } @@ -220,8 +270,10 @@ static void nfs_end_page_writeback(struc end_page_writeback(page); page_cache_release(page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_wakeup_congested(atomic_long_dec_return(&nfss->writeback), + &nfss->backing_dev_info, + nfss->writeback_wait); } static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) @@ -322,19 +374,34 @@ static int nfs_writepage_locked(struct p int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_writepage_locked(page, wbc); unlock_page(page); + + nfs_wait_congested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_do_writepage(page, wbc, data); unlock_page(page); + + nfs_wait_congested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } --- linux-next.orig/include/linux/nfs_fs_sb.h 2011-09-02 09:02:07.000000000 +0800 +++ linux-next/include/linux/nfs_fs_sb.h 2011-10-04 20:26:03.000000000 +0800 @@ -102,6 +102,7 @@ struct nfs_server { struct nfs_iostats __percpu *io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ + wait_queue_head_t writeback_wait[2]; int flags; /* various flags */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ --- linux-next.orig/fs/nfs/client.c 2011-08-22 13:59:52.000000000 +0800 +++ linux-next/fs/nfs/client.c 2011-10-04 20:26:03.000000000 +0800 @@ -1066,6 +1066,8 @@ static struct nfs_server *nfs_alloc_serv INIT_LIST_HEAD(&server->layouts); atomic_set(&server->active, 0); + init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]); + init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html