[RFC v3] writeback: add elastic bdi in cgwb bdp

The elastic bdi (ebdi), a mirror of the bdi backing a spinning disk,
SSD or USB key on the market, is introduced for balancing dirty
pages (bdp).

The risk arises that the system runs out of free memory when too many
dirty pages are produced too soon, so bdp is needed in the field.

Ebdi facilitates bdp over elastic time intervals, e.g. from one jiffy
up to HZ jiffies (one second), depending on the time it takes to
increase dirty pages by the amount defined by the variable
ratelimit_pages.
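
In code terms, the elastic interval is a wait capped at HZ jiffies
that is cut short as soon as the flusher catches up, and it is only
attempted once a CPU has dirtied roughly ratelimit_pages pages. A
minimal sketch of the idea (the helper name bdp_wait_elastic is made
up for illustration and the trigger is simplified; the actual code is
in the patch below):

	/* Sketch: throttle the dirtier for an elastic interval. */
	static void bdp_wait_elastic(struct bdi_writeback *wb)
	{
		/* act only once this CPU has dirtied ~ratelimit_pages pages */
		if (this_cpu_read(bdp_ratelimits) < ratelimit_pages)
			return;
		/*
		 * Sleep until the throttle condition clears; wb_workfn()
		 * wakes bdp_waitq, and the wait never exceeds one HZ.
		 */
		wait_event_interruptible_timeout(wb->bdp_waitq,
				!cgwb_bdp_should_throttle(wb), HZ);
	}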

During cgroup writeback (cgwb) bdp, ebdi helps observe the changes
both in the cgwb's dirtied pages (dirty speed) and in its written-out
pages (laundry speed) over elastic time intervals, until a balance is
established between the two, i.e. the two speeds become statistically
equal.
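
Concretely, the equilibrium test in this series compares the pages
dirtied against a wb with the pages written out on its behalf, using
the per-cpu counter error as slack; a sketch (the helper name is made
up, and the real check in the patch first requires the global dirty
thresholds to be exceeded):

	/* Sketch: does dirty speed still outrun laundry speed? */
	static bool wb_dirty_outruns_laundry(struct bdi_writeback *wb)
	{
		return wb_stat(wb, WB_DIRTIED) >
		       wb_stat(wb, WB_WRITTEN) + wb_stat_error();
	}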

The above mechanism of elastic equilibrium effectively prevents dirty
page hogs, as dirty pages are left no chance to pile up, and thus cuts
the risk that system free memory falls to an unsafe level.

Thanks to Rong Chen for testing.

V3 is based on next-20191108.

Changes since v2
- add code document and comments
- adapt balance_dirty_pages_ratelimited() to ebdi

Changes since v1
- drop CGWB_BDP_WITH_EBDI 

Changes since v0
- add CGWB_BDP_WITH_EBDI in mm/Kconfig
- drop wakeup in wbc_detach_inode()
- add wakeup in wb_workfn()

Signed-off-by: Hillf Danton <hdanton@xxxxxxxx>
---

--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -170,6 +170,8 @@ struct bdi_writeback {
 
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
+	struct wait_queue_head bdp_waitq; /* used for bdp, balancing dirty pages */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct percpu_ref refcnt;	/* used only for !root wb's */
 	struct fprop_local_percpu memcg_completions;
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -324,6 +324,8 @@ static int wb_init(struct bdi_writeback
 			goto out_destroy_stat;
 	}
 
+	init_waitqueue_head(&wb->bdp_waitq);
+
 	return 0;
 
 out_destroy_stat:
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -811,6 +811,8 @@ static long wb_split_bdi_pages(struct bd
 	if (nr_pages == LONG_MAX)
 		return LONG_MAX;
 
+	return nr_pages;
+
 	/*
 	 * This may be called on clean wb's and proportional distribution
 	 * may not make sense, just use the original @nr_pages in those
@@ -1604,6 +1606,7 @@ static long writeback_chunk_size(struct
 		pages = min(pages, work->nr_pages);
 		pages = round_down(pages + MIN_WRITEBACK_PAGES,
 				   MIN_WRITEBACK_PAGES);
+		pages = work->nr_pages;
 	}
 
 	return pages;
@@ -2092,6 +2095,9 @@ void wb_workfn(struct work_struct *work)
 		wb_wakeup_delayed(wb);
 
 	current->flags &= ~PF_SWAPWRITE;
+
+	if (waitqueue_active(&wb->bdp_waitq))
+		wake_up_all(&wb->bdp_waitq);
 }
 
 /*
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1830,6 +1830,67 @@ pause:
 		wb_start_background_writeback(wb);
 }
 
+/**
+ * cgwb_bdp_should_throttle - tell whether a wb should be throttled
+ * @wb: bdi_writeback to throttle
+ *
+ * To avoid the risk of exhausting the system's free memory, check and
+ * try hard to prevent too many dirty pages from being produced too
+ * soon.
+ *
+ * For cgroup writeback, this essentially means keeping an equilibrium
+ * between its dirty speed and its laundry speed, i.e. dirty pages are
+ * written out as fast as they are produced in the ideal state.
+ */
+static bool cgwb_bdp_should_throttle(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+	if (fatal_signal_pending(current))
+		return false;
+
+	gdtc.avail = global_dirtyable_memory();
+
+	domain_dirty_limits(&gdtc);
+
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+		     global_node_page_state(NR_UNSTABLE_NFS) +
+		     global_node_page_state(NR_WRITEBACK);
+
+	if (gdtc.dirty < gdtc.bg_thresh)
+		return false;
+
+	if (!writeback_in_progress(wb))
+		wb_start_background_writeback(wb);
+
+	if (gdtc.dirty < gdtc.thresh)
+		return false;
+
+	/*
+	 * Throttle wb if there is a risk that wb's dirty speed is
+	 * running away from its laundry speed, with the per-cpu
+	 * counters' statistical error taken into account.
+	 */
+	return  wb_stat(wb, WB_DIRTIED) >
+		wb_stat(wb, WB_WRITTEN) + wb_stat_error();
+}
+
+/**
+ * cgwb_bdp - cgroup writeback tries to balance dirty pages
+ * @wb: bdi_writeback in question
+ *
+ * If no balance exists at the moment, @wb is throttled until one is
+ * established or a timeout of HZ elapses.
+ */
+static inline void cgwb_bdp(struct bdi_writeback *wb)
+{
+	wait_event_interruptible_timeout(wb->bdp_waitq,
+			!cgwb_bdp_should_throttle(wb), HZ);
+}
+
+/* for detecting dirty page hogs */
+static DEFINE_PER_CPU(int, bdp_in_flight);
+
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /*
@@ -1866,8 +1927,8 @@ void balance_dirty_pages_ratelimited(str
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct bdi_writeback *wb = NULL;
-	int ratelimit;
-	int *p;
+	bool try_bdp;
+	int *dirty, *leak, *flights;
 
 	if (!bdi_cap_account_dirty(bdi))
 		return;
@@ -1877,10 +1938,6 @@ void balance_dirty_pages_ratelimited(str
 	if (!wb)
 		wb = &bdi->wb;
 
-	ratelimit = current->nr_dirtied_pause;
-	if (wb->dirty_exceeded)
-		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
-
 	preempt_disable();
 	/*
 	 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1888,29 +1945,38 @@ void balance_dirty_pages_ratelimited(str
 	 * 1000+ tasks, all of them start dirtying pages at exactly the same
 	 * time, hence all honoured too large initial task->nr_dirtied_pause.
 	 */
-	p =  this_cpu_ptr(&bdp_ratelimits);
-	if (unlikely(current->nr_dirtied >= ratelimit))
-		*p = 0;
-	else if (unlikely(*p >= ratelimit_pages)) {
-		*p = 0;
-		ratelimit = 0;
-	}
+	dirty = this_cpu_ptr(&bdp_ratelimits);
+
 	/*
 	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
 	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
 	 * the dirty throttling and livelock other long-run dirtiers.
 	 */
-	p = this_cpu_ptr(&dirty_throttle_leaks);
-	if (*p > 0 && current->nr_dirtied < ratelimit) {
-		unsigned long nr_pages_dirtied;
-		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
-		*p -= nr_pages_dirtied;
-		current->nr_dirtied += nr_pages_dirtied;
+	leak = this_cpu_ptr(&dirty_throttle_leaks);
+
+	if (*dirty + *leak < ratelimit_pages) {
+		/*
+		 * Nothing to do, as it will take some more time to
+		 * use up ratelimit_pages.
+		 */
+		try_bdp = false;
+	} else {
+		try_bdp = true;
+
+		/*
+		 * A bdp already in flight helps detect dirty page hogs early.
+		 */
+		flights = this_cpu_ptr(&bdp_in_flight);
+
+		if ((*flights)++ & 1) {
+			*dirty = *dirty + *leak - ratelimit_pages;
+			*leak = 0;
+		}
 	}
 	preempt_enable();
 
-	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+	if (try_bdp)
+		cgwb_bdp(wb);
 
 	wb_put(wb);
 }
--




