On Wed, 7 Dec 2011 15:37:30 -0800 Yucong Sun (叶雨飞) <sunyucong@xxxxxxxxx> wrote: > Neil, I can't compile latest MD against 2.6.32, and that commit can't > be patched into 2.6.32 directly either, can you help me on this? > This should do it. NeilBrown commit ef54b7cf955dc3b7d33248e8591b1a00b4fa998c Author: NeilBrown <neilb@xxxxxxx> Date: Tue Oct 11 16:50:01 2011 +1100 md: add proper write-congestion reporting to RAID1 and RAID10. RAID1 and RAID10 handle write requests by queuing them for handling by a separate thread. This is because when a write-intent-bitmap is active we might need to update the bitmap first, so it is good to queue a lot of writes, then do one big bitmap update for them all. However writeback requires devices to appear to be congested after a while so it can make some guesstimate of throughput. The infinite queue defeats that (note that RAID5 already has a finite queue so it doesn't suffer from this problem). So impose a limit on the number of pending write requests. By default it is 1024 which seems to be generally suitable. Make it configurable via module option just in case someone finds a regression. Signed-off-by: NeilBrown <neilb@xxxxxxx> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e07ce2e..fe7ae3c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,6 +50,11 @@ */ #define NR_RAID1_BIOS 256 +/* When there are this many requests queued to be written by + * the raid1 thread, we become 'congested' to provide back-pressure + * for writeback. 
+ */ +static int max_queued_requests = 1024; static void unplug_slaves(mddev_t *mddev); @@ -576,7 +581,8 @@ static int raid1_congested(void *data, int bits) conf_t *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) + if (mddev_congested(mddev, bits) || + conf->pending_count >= max_queued_requests) return 1; rcu_read_lock(); @@ -613,10 +619,12 @@ static int flush_pending_writes(conf_t *conf) struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); blk_remove_plug(conf->mddev->queue); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to * disk before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -789,6 +797,7 @@ static int make_request(struct request_queue *q, struct bio * bio) int cpu; bool do_barriers; mdk_rdev_t *blocked_rdev; + int cnt = 0; /* * Register the new request and wait if the reconstruction @@ -864,6 +873,11 @@ static int make_request(struct request_queue *q, struct bio * bio) /* * WRITE: */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } /* first select target devices under spinlock and * inc refcount on their rdev. 
Record them by setting * bios[x] to bio @@ -970,6 +984,7 @@ static int make_request(struct request_queue *q, struct bio * bio) atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); + cnt++; } kfree(behind_pages); /* the behind pages are attached to the bios now */ @@ -978,6 +993,7 @@ static int make_request(struct request_queue *q, struct bio * bio) spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); + conf->pending_count += cnt; blk_plug_device(mddev->queue); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -2021,7 +2037,7 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - + conf->pending_count = 0; mddev->degraded = 0; for (i = 0; i < conf->raid_disks; i++) { @@ -2317,3 +2333,5 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e87b84d..520288c 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -38,6 +38,7 @@ struct r1_private_data_s { /* queue of writes that have been unplugged */ struct bio_list flushing_bio_list; + int pending_count; /* for use when syncing mirrors: */ spinlock_t resync_lock; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2cb7b8..4c7d9b5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -59,6 +59,11 @@ static void unplug_slaves(mddev_t *mddev); static void allow_barrier(conf_t *conf); static void lower_barrier(conf_t *conf); +/* When there are this many requests queued to be written by + * the raid10 thread, we become 'congested' to provide back-pressure * for writeback. 
+ */ +static int max_queued_requests = 1024; static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -631,6 +636,10 @@ static int raid10_congested(void *data, int bits) conf_t *conf = mddev->private; int i, ret = 0; + if ((bits & (1 << BDI_async_congested)) && + conf->pending_count >= max_queued_requests) + return 1; + if (mddev_congested(mddev, bits)) return 1; rcu_read_lock(); @@ -660,10 +669,12 @@ static int flush_pending_writes(conf_t *conf) struct bio *bio; bio = bio_list_get(&conf->pending_bio_list); blk_remove_plug(conf->mddev->queue); + conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ bitmap_unplug(conf->mddev->bitmap); + wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; @@ -802,6 +813,7 @@ static int make_request(struct request_queue *q, struct bio * bio) struct bio_list bl; unsigned long flags; mdk_rdev_t *blocked_rdev; + int cnt = 0; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { bio_endio(bio, -EOPNOTSUPP); @@ -894,6 +906,11 @@ static int make_request(struct request_queue *q, struct bio * bio) /* * WRITE: */ + if (conf->pending_count >= max_queued_requests) { + md_wakeup_thread(mddev->thread); + wait_event(conf->wait_barrier, + conf->pending_count < max_queued_requests); + } /* first select target devices under rcu_lock and * inc refcount on their rdev. 
Record them by setting * bios[x] to bio @@ -957,6 +974,7 @@ static int make_request(struct request_queue *q, struct bio * bio) atomic_inc(&r10_bio->remaining); bio_list_add(&bl, mbio); + cnt++; } if (unlikely(!atomic_read(&r10_bio->remaining))) { @@ -970,6 +988,7 @@ static int make_request(struct request_queue *q, struct bio * bio) spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); blk_plug_device(mddev->queue); + conf->pending_count += cnt; spin_unlock_irqrestore(&conf->device_lock, flags); /* In case raid10d snuck in to freeze_array */ @@ -2318,3 +2337,5 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); + +module_param(max_queued_requests, int, S_IRUGO|S_IWUSR); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 59cd1ef..e6e1613 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -39,7 +39,7 @@ struct r10_private_data_s { struct list_head retry_list; /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; - + int pending_count; spinlock_t resync_lock; int nr_pending;
Attachment:
signature.asc
Description: PGP signature