Re: [PATCH 1/2] writeback: Improve busyloop prevention

Wu Fengguang <fengguang.wu@xxxxxxxxx> · Thu, 20 Oct 2011 20:09:09 +0800

Jan,

I tried the below combined patch over the ioless one, and find some
minor regressions. I studied the thresh=1G/ext3-1dd case in particular
and find that nr_writeback and the iostat avgrq-sz drops from time to time.

I'll try to bisect the changeset.

3.1.0-rc9-ioless-full-next-20111014+  3.1.0-rc9-ioless-full-more_io_wait-next-20111014+
------------------------  ------------------------
                   56.47        -0.6%        56.13  thresh=100M/btrfs-10dd-4k-8p-4096M-100M:10-X
                   56.28        -0.4%        56.07  thresh=100M/btrfs-1dd-4k-8p-4096M-100M:10-X
                   56.11        -0.1%        56.05  thresh=100M/btrfs-2dd-4k-8p-4096M-100M:10-X
                   37.86        +1.8%        38.54  thresh=100M/ext3-10dd-4k-8p-4096M-100M:10-X
                   45.91        +0.7%        46.22  thresh=100M/ext3-1dd-4k-8p-4096M-100M:10-X
                   41.87        +0.8%        42.19  thresh=100M/ext3-2dd-4k-8p-4096M-100M:10-X
                   45.68        -0.4%        45.50  thresh=100M/ext4-10dd-4k-8p-4096M-100M:10-X
                   55.74        -2.2%        54.51  thresh=100M/ext4-1dd-4k-8p-4096M-100M:10-X
                   46.20        -4.8%        43.98  thresh=100M/xfs-10dd-4k-8p-4096M-100M:10-X
                   55.72        +0.1%        55.76  thresh=100M/xfs-1dd-4k-8p-4096M-100M:10-X
                   54.01        -2.0%        52.94  thresh=100M/xfs-2dd-4k-8p-4096M-100M:10-X
                   55.08        -1.0%        54.52  thresh=1G/btrfs-100dd-4k-8p-4096M-1024M:10-X
                   55.49        -1.0%        54.94  thresh=1G/btrfs-10dd-4k-8p-4096M-1024M:10-X
                   55.38        -2.7%        53.91  thresh=1G/btrfs-1dd-4k-8p-4096M-1024M:10-X
                   36.70        -1.5%        36.15  thresh=1G/ext3-100dd-4k-8p-4096M-1024M:10-X
                   40.64        -5.9%        38.25  thresh=1G/ext3-10dd-4k-8p-4096M-1024M:10-X
                   48.65        -6.9%        45.30  thresh=1G/ext3-1dd-4k-8p-4096M-1024M:10-X
                   49.84        -3.2%        48.23  thresh=1G/ext4-100dd-4k-8p-4096M-1024M:10-X
                   56.03        -3.3%        54.21  thresh=1G/ext4-10dd-4k-8p-4096M-1024M:10-X
                   57.42        -2.3%        56.07  thresh=1G/ext4-1dd-4k-8p-4096M-1024M:10-X
                   45.74        -1.4%        45.12  thresh=1G/xfs-100dd-4k-8p-4096M-1024M:10-X
                   54.19        -0.5%        53.94  thresh=1G/xfs-10dd-4k-8p-4096M-1024M:10-X
                   55.93        -0.5%        55.66  thresh=1G/xfs-1dd-4k-8p-4096M-1024M:10-X
                    2.77       +27.8%         3.54  thresh=1M/btrfs-10dd-4k-8p-4096M-1M:10-X
                    2.20       -15.5%         1.86  thresh=1M/btrfs-1dd-4k-8p-4096M-1M:10-X
                    2.42        -1.3%         2.39  thresh=1M/btrfs-2dd-4k-8p-4096M-1M:10-X
                   28.91        +1.9%        29.47  thresh=1M/ext3-10dd-4k-8p-4096M-1M:10-X
                   45.02        +1.1%        45.50  thresh=1M/ext3-1dd-4k-8p-4096M-1M:10-X
                   40.91        +0.4%        41.09  thresh=1M/ext3-2dd-4k-8p-4096M-1M:10-X
                   31.82        +2.3%        32.56  thresh=1M/ext4-10dd-4k-8p-4096M-1M:10-X
                   52.33        -0.9%        51.85  thresh=1M/ext4-1dd-4k-8p-4096M-1M:10-X
                   28.43        +1.2%        28.77  thresh=1M/xfs-10dd-4k-8p-4096M-1M:10-X
                   52.93        -3.8%        50.90  thresh=1M/xfs-1dd-4k-8p-4096M-1M:10-X
                   46.87        -0.0%        46.85  thresh=1M/xfs-2dd-4k-8p-4096M-1M:10-X
                   54.54        -1.3%        53.82  thresh=8M/btrfs-10dd-4k-8p-4096M-8M:10-X
                   56.60        -1.4%        55.80  thresh=8M/btrfs-1dd-4k-8p-4096M-8M:10-X
                   56.21        -0.4%        55.96  thresh=8M/btrfs-2dd-4k-8p-4096M-8M:10-X
                   32.54        +0.2%        32.62  thresh=8M/ext3-10dd-4k-8p-4096M-8M:10-X
                   46.01        -1.0%        45.55  thresh=8M/ext3-1dd-4k-8p-4096M-8M:10-X
                   44.13        -0.6%        43.87  thresh=8M/ext3-2dd-4k-8p-4096M-8M:10-X
                   35.78        -0.4%        35.63  thresh=8M/ext4-10dd-4k-8p-4096M-8M:10-X
                   55.29        +0.2%        55.38  thresh=8M/ext4-1dd-4k-8p-4096M-8M:10-X
                   31.21        -0.8%        30.96  thresh=8M/xfs-10dd-4k-8p-4096M-8M:10-X
                   54.10        -0.3%        53.95  thresh=8M/xfs-1dd-4k-8p-4096M-8M:10-X
                   46.97        +0.5%        47.20  thresh=8M/xfs-2dd-4k-8p-4096M-8M:10-X
                 2010.92        -1.1%      1989.70  TOTAL write_bw

--- linux-next.orig/fs/fs-writeback.c	2011-10-20 19:26:37.000000000 +0800
+++ linux-next/fs/fs-writeback.c	2011-10-20 20:00:22.000000000 +0800
@@ -234,6 +234,15 @@ static void requeue_io(struct inode *ino
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
+/*
+ * The inode should be retried in an opportunistic way.
+ */
+static void requeue_io_wait(struct inode *inode, struct bdi_writeback *wb)
+{
+	assert_spin_locked(&wb->list_lock);
+	list_move(&inode->i_wb_list, &wb->b_more_io_wait);
+}
+
 static void inode_sync_complete(struct inode *inode)
 {
 	/*
@@ -321,6 +330,7 @@ static void queue_io(struct bdi_writebac
 	int moved;
 	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
+	list_splice_init(&wb->b_more_io_wait, &wb->b_io);
 	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
 	trace_writeback_queue_io(wb, work, moved);
 }
@@ -470,7 +480,7 @@ writeback_single_inode(struct inode *ino
 				 * retrying writeback of the dirty page/inode
 				 * that cannot be performed immediately.
 				 */
-				redirty_tail(inode, wb);
+				requeue_io_wait(inode, wb);
 			}
 		} else if (inode->i_state & I_DIRTY) {
 			/*
@@ -478,8 +488,18 @@ writeback_single_inode(struct inode *ino
 			 * operations, such as delayed allocation during
 			 * submission or metadata updates after data IO
 			 * completion.
+			 *
+			 * For the latter case it is very important to give
+			 * the inode another turn on b_more_io instead of
+			 * redirtying it.  Constantly moving dirtied_when
+			 * forward will prevent us from ever writing out
+			 * the metadata dirtied in the I/O completion handler.
+			 *
+			 * For files on XFS that constantly get appended to
+			 * calling redirty_tail means they will never get
+			 * their updated i_size written out.
 			 */
-			redirty_tail(inode, wb);
+			requeue_io_wait(inode, wb);
 		} else {
 			/*
 			 * The inode is clean.  At this point we either have
@@ -600,7 +620,7 @@ static long writeback_sb_inodes(struct s
 			 * writeback is not making progress due to locked
 			 * buffers.  Skip this inode for now.
 			 */
-			redirty_tail(inode, wb);
+			requeue_io_wait(inode, wb);
 		}
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&wb->list_lock);
@@ -637,7 +657,7 @@ static long __writeback_inodes_wb(struct
 			 * s_umount being grabbed by someone else. Don't use
 			 * requeue_io() to avoid busy retrying the inode/sb.
 			 */
-			redirty_tail(inode, wb);
+			requeue_io_wait(inode, wb);
 			continue;
 		}
 		wrote += writeback_sb_inodes(sb, wb, work);
@@ -720,10 +740,10 @@ static long wb_writeback(struct bdi_writ
 			 struct wb_writeback_work *work)
 {
 	unsigned long wb_start = jiffies;
-	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
 	struct inode *inode;
 	long progress;
+	long total_progress = 0;
 
 	oldest_jif = jiffies;
 	work->older_than_this = &oldest_jif;
@@ -753,11 +773,17 @@ static long wb_writeback(struct bdi_writ
 		if (work->for_background && !over_bground_thresh(wb->bdi))
 			break;
 
+		/*
+		 * Kupdate and background works are special and we want to
+		 * include all inodes that need writing. Livelock avoidance is
+		 * handled by these works yielding to any other work so we are
+		 * safe.
+		 */
 		if (work->for_kupdate) {
 			oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
-			work->older_than_this = &oldest_jif;
-		}
+		} else if (work->for_background)
+			oldest_jif = jiffies;
 
 		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
@@ -767,6 +793,7 @@ static long wb_writeback(struct bdi_writ
 		else
 			progress = __writeback_inodes_wb(wb, work);
 		trace_writeback_written(wb->bdi, work);
+		total_progress += progress;
 
 		wb_update_bandwidth(wb, wb_start);
 
@@ -800,7 +827,7 @@ static long wb_writeback(struct bdi_writ
 	}
 	spin_unlock(&wb->list_lock);
 
-	return nr_pages - work->nr_pages;
+	return total_progress;
 }
 
 /*
@@ -863,7 +890,7 @@ static long wb_check_old_data_flush(stru
 
 	expired = wb->last_old_flush +
 			msecs_to_jiffies(dirty_writeback_interval * 10);
-	if (time_before(jiffies, expired))
+	if (time_before(jiffies, expired) && list_empty(&wb->b_more_io_wait))
 		return 0;
 
 	wb->last_old_flush = jiffies;
@@ -934,7 +961,11 @@ int bdi_writeback_thread(void *data)
 {
 	struct bdi_writeback *wb = data;
 	struct backing_dev_info *bdi = wb->bdi;
-	long pages_written;
+	long progress;
+	unsigned int pause = 1;
+	unsigned int max_pause = dirty_writeback_interval ?
+			msecs_to_jiffies(dirty_writeback_interval * 10) :
+			HZ;
 
 	current->flags |= PF_SWAPWRITE;
 	set_freezable();
@@ -954,12 +985,12 @@ int bdi_writeback_thread(void *data)
 		 */
 		del_timer(&wb->wakeup_timer);
 
-		pages_written = wb_do_writeback(wb, 0);
+		progress = wb_do_writeback(wb, 0);
 
-		trace_writeback_pages_written(pages_written);
-
-		if (pages_written)
+		if (progress) {
 			wb->last_active = jiffies;
+			pause = 1;
+		}
 
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
@@ -967,8 +998,11 @@ int bdi_writeback_thread(void *data)
 			continue;
 		}
 
-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
+		if (!list_empty(&wb->b_more_io_wait) && pause < max_pause) {
+			schedule_timeout(pause);
+			pause <<= 1;
+		} else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+			schedule_timeout(max_pause);
 		else {
 			/*
 			 * We have nothing to do, so can go sleep without any
--- linux-next.orig/include/linux/backing-dev.h	2011-10-20 19:26:37.000000000 +0800
+++ linux-next/include/linux/backing-dev.h	2011-10-20 19:29:39.000000000 +0800
@@ -59,6 +59,7 @@ struct bdi_writeback {
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_more_io_wait;/* opportunistic retry io */
 	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
@@ -133,9 +134,10 @@ extern struct list_head bdi_pending_list
 
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
-	return !list_empty(&wb->b_dirty) ||
-	       !list_empty(&wb->b_io) ||
-	       !list_empty(&wb->b_more_io);
+	return !list_empty(&wb->b_dirty)	||
+	       !list_empty(&wb->b_io)		||
+	       !list_empty(&wb->b_more_io)	||
+	       !list_empty(&wb->b_more_io_wait);
 }
 
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
--- linux-next.orig/mm/backing-dev.c	2011-10-20 19:26:37.000000000 +0800
+++ linux-next/mm/backing-dev.c	2011-10-20 19:29:39.000000000 +0800
@@ -74,10 +74,10 @@ static int bdi_debug_stats_show(struct s
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_more_io_wait;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = 0;
+	nr_dirty = nr_io = nr_more_io = nr_more_io_wait = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
@@ -85,6 +85,8 @@ static int bdi_debug_stats_show(struct s
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
+	list_for_each_entry(inode, &wb->b_more_io_wait, i_wb_list)
+		nr_more_io_wait++;
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -103,6 +105,7 @@ static int bdi_debug_stats_show(struct s
 		   "b_dirty:            %10lu\n"
 		   "b_io:               %10lu\n"
 		   "b_more_io:          %10lu\n"
+		   "b_more_io_wait:     %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -116,6 +119,7 @@ static int bdi_debug_stats_show(struct s
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
+		   nr_more_io_wait,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -651,6 +655,7 @@ static void bdi_wb_init(struct bdi_write
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	INIT_LIST_HEAD(&wb->b_more_io_wait);
 	spin_lock_init(&wb->list_lock);
 	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
@@ -718,6 +723,7 @@ void bdi_destroy(struct backing_dev_info
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
+		list_splice(&bdi->wb.b_more_io_wait, &dst->b_more_io_wait);
 		spin_unlock(&bdi->wb.list_lock);
 		spin_unlock(&dst->list_lock);
 	}
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html