From: Mingzhe Zou <mingzhe.zou@xxxxxxxxxxxx>

When we use a new cache device for performance testing, the write
results with (bs=4k, iodepth=1) are abnormal: with a cycle of 30
seconds, IOPS drop to 0 within 10 seconds and then recover after
30 seconds.

After debugging, we found that the journal is full and that
btree_node_write_work() runs at least 30 seconds apart. However, when
the journal is full, we expect btree_flush_write() to be called to
release the oldest journal entry. Obviously, the flush write failed to
release the journal.

Looking at the code, we found that btree_flush_write() only selects
btree nodes to flush from the c->btree_cache list. However,
list_del_init(&b->list) is called in bch_btree_set_root(), so c->root
is not on the c->btree_cache list. For a new cache, there is only one
btree node (c->root) before the first btree split.

This patch also flushes the c->root write when the journal is full.

Fixes: 91be66e1 ("bcache: performance improvement for btree_flush_write()")
Signed-off-by: Mingzhe Zou <mingzhe.zou@xxxxxxxxxxxx>
---
 drivers/md/bcache/journal.c | 86 ++++++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 40 deletions(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7ff14bd2feb8..837073fe5e5a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -413,13 +413,53 @@ void bch_journal_space_reserve(struct journal *j)
 
 /* Journalling */
 
+static inline bool btree_need_flush_write(struct btree *b, atomic_t *front)
+{
+	if (btree_node_journal_flush(b))
+		pr_err("BUG: flush_write bit should not be set here!\n");
+
+	mutex_lock(&b->write_lock);
+
+	if (!btree_node_dirty(b)) {
+		mutex_unlock(&b->write_lock);
+		return false;
+	}
+
+	if (!btree_current_write(b)->journal) {
+		mutex_unlock(&b->write_lock);
+		return false;
+	}
+
+	/*
+	 * Only select the btree node which exactly references
+	 * the oldest journal entry.
+	 *
+	 * If the journal entry pointed by fifo_front_p is
+	 * reclaimed in parallel, don't worry:
+	 * - the list_for_each_xxx loop will quit when checking
+	 *   next now_fifo_front_p.
+	 * - If there are matched nodes recorded in btree_nodes[],
+	 *   they are clean now (this is why and how the oldest
+	 *   journal entry can be reclaimed). These selected nodes
+	 *   will be ignored and skipped in the following for-loop.
+	 */
+	if ((btree_current_write(b)->journal - front) & b->c->journal.pin.mask) {
+		mutex_unlock(&b->write_lock);
+		return false;
+	}
+
+	set_btree_node_journal_flush(b);
+
+	mutex_unlock(&b->write_lock);
+	return true;
+}
+
 static void btree_flush_write(struct cache_set *c)
 {
 	struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR];
 	unsigned int i, nr;
 	int ref_nr;
 	atomic_t *fifo_front_p, *now_fifo_front_p;
-	size_t mask;
 
 	if (c->journal.btree_flushing)
 		return;
@@ -446,12 +486,14 @@ static void btree_flush_write(struct cache_set *c)
 	}
 	spin_unlock(&c->journal.lock);
 
-	mask = c->journal.pin.mask;
 	nr = 0;
 	atomic_long_inc(&c->flush_write);
 	memset(btree_nodes, 0, sizeof(btree_nodes));
 
 	mutex_lock(&c->bucket_lock);
+	if (btree_need_flush_write(c->root, fifo_front_p))
+		btree_nodes[nr++] = c->root;
+
 	list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
 		/*
 		 * It is safe to get now_fifo_front_p without holding
@@ -476,45 +518,9 @@ static void btree_flush_write(struct cache_set *c)
 		if (nr >= ref_nr)
 			break;
 
-		if (btree_node_journal_flush(b))
-			pr_err("BUG: flush_write bit should not be set here!\n");
-
-		mutex_lock(&b->write_lock);
-
-		if (!btree_node_dirty(b)) {
-			mutex_unlock(&b->write_lock);
-			continue;
-		}
-
-		if (!btree_current_write(b)->journal) {
-			mutex_unlock(&b->write_lock);
-			continue;
-		}
-
-		/*
-		 * Only select the btree node which exactly references
-		 * the oldest journal entry.
-		 *
-		 * If the journal entry pointed by fifo_front_p is
-		 * reclaimed in parallel, don't worry:
-		 * - the list_for_each_xxx loop will quit when checking
-		 *   next now_fifo_front_p.
-		 * - If there are matched nodes recorded in btree_nodes[],
-		 *   they are clean now (this is why and how the oldest
-		 *   journal entry can be reclaimed). These selected nodes
-		 *   will be ignored and skipped in the following for-loop.
-		 */
-		if (((btree_current_write(b)->journal - fifo_front_p) &
-		     mask) != 0) {
-			mutex_unlock(&b->write_lock);
-			continue;
-		}
-
-		set_btree_node_journal_flush(b);
-
-		mutex_unlock(&b->write_lock);
+		if (btree_need_flush_write(b, fifo_front_p))
+			btree_nodes[nr++] = b;
 
-		btree_nodes[nr++] = b;
 		/*
 		 * To avoid holding c->bucket_lock too long time,
 		 * only scan for BTREE_FLUSH_NR matched btree nodes
-- 
2.34.1
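
Not part of the patch, just for illustration: the failure mode is easy to
reproduce in a standalone userspace sketch. The names below (btree_stub,
attach(), detach(), flush_scan()) are hypothetical stand-ins for struct
btree, c->btree_cache, list_del_init() and btree_flush_write(), not the
real bcache code. Once the root is unlinked from the cache list, a scan
that only walks the list can never select it:

#include <stdbool.h>
#include <stdio.h>

struct btree_stub {
	const char *name;
	bool dirty;			/* stands in for a pinned journal entry */
	struct btree_stub *prev, *next;
};

/* A circular list head standing in for c->btree_cache. */
static struct btree_stub cache = { "head", false, &cache, &cache };

static void attach(struct btree_stub *b)
{
	b->next = cache.next;
	b->prev = &cache;
	cache.next->prev = b;
	cache.next = b;
}

/* Mimics list_del_init(): unlink the node and point it at itself. */
static void detach(struct btree_stub *b)
{
	b->prev->next = b->next;
	b->next->prev = b->prev;
	b->prev = b->next = b;
}

static void flush_scan(struct btree_stub *root)
{
	struct btree_stub *b;

	/* The fix: check the root explicitly, it is never on the list. */
	if (root->dirty) {
		printf("flushing %s (checked explicitly)\n", root->name);
		root->dirty = false;
	}

	/* The pre-patch scan: only nodes still on the list are seen. */
	for (b = cache.next; b != &cache; b = b->next) {
		if (b->dirty) {
			printf("flushing %s (from cache list)\n", b->name);
			b->dirty = false;
		}
	}
}

int main(void)
{
	struct btree_stub root = { "root", true, NULL, NULL };

	attach(&root);
	detach(&root);	/* what bch_btree_set_root() does to c->root */

	/*
	 * Without the explicit root check above, this scan flushes
	 * nothing and the oldest journal entry stays pinned.
	 */
	flush_scan(&root);
	return 0;
}

With the explicit root check the dirty root is flushed; comment it out and
nothing is flushed at all, which is the stall the patch fixes.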