[PATCH md 6 of 6] Fix handling of overlapping requests in raid5

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If we detect an overlap, we set a flag and wait for a wakeup.
When requests are handled, if the flag was set, we perform the
wakeup.

Note that the code currently in -mm is badly broken.  With this
patch applied, it passes tests the use O_DIRECT to cause lots
of overlapping requests.

Signed-off-by: Neil Brown <neilb@xxxxxxxxxxxxxxx>

### Diffstat output
 ./drivers/md/raid5.c         |   52 ++++++++++++++++++++++++++++++++---------
 ./drivers/md/raid6main.c     |   54 +++++++++++++++++++++++++++++++++++--------
 ./include/linux/raid/raid5.h |    2 +
 3 files changed, 87 insertions(+), 21 deletions(-)

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~	2005-02-07 13:34:09.000000000 +1100
+++ ./drivers/md/raid5.c	2005-02-07 13:34:23.000000000 +1100
@@ -49,7 +49,7 @@
  * This macro is used to determine the 'next' bio in the list, given the sector
  * of the current stripe+device
  */
-#define r5_next_bio(bio, sect) ( ( bio->bi_sector + (bio->bi_size>>9) < sect + STRIPE_SECTORS) ? bio->bi_next : NULL)
+#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
 /*
  * The following can be used to debug the driver
  */
@@ -722,6 +722,10 @@ static void compute_parity(struct stripe
 				ptr[count++] = page_address(sh->dev[i].page);
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
+
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 				check_xor();
@@ -734,6 +738,10 @@ static void compute_parity(struct stripe
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
+
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 			}
@@ -790,7 +798,7 @@ static void compute_parity(struct stripe
  * toread/towrite point to the first in a chain. 
  * The bi_next chain must be in order.
  */
-static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 {
 	struct bio **bip;
 	raid5_conf_t *conf = sh->raid_conf;
@@ -808,9 +816,12 @@ static int add_stripe_bio (struct stripe
 		bip = &sh->dev[dd_idx].toread;
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
 		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
-			return 0; /* cannot add just now due to overlap */
+			goto overlap;
 		bip = & (*bip)->bi_next;
 	}
+	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 
+		goto overlap;
+
 	if (*bip && bi->bi_next && (*bip) != bi->bi_next)
 		BUG();
 	if (*bip)
@@ -825,7 +836,7 @@ static int add_stripe_bio (struct stripe
 		(unsigned long long)sh->sector, dd_idx);
 
 	if (forwrite) {
-		/* check if page is coverred */
+		/* check if page is covered */
 		sector_t sector = sh->dev[dd_idx].sector;
 		for (bi=sh->dev[dd_idx].towrite;
 		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
@@ -838,6 +849,12 @@ static int add_stripe_bio (struct stripe
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
 	return 1;
+
+ overlap:
+	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+	spin_unlock_irq(&conf->device_lock);
+	spin_unlock(&sh->lock);
+	return 0;
 }
 
 
@@ -898,6 +915,8 @@ static void handle_stripe(struct stripe_
 			spin_lock_irq(&conf->device_lock);
 			rbi = dev->toread;
 			dev->toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&conf->wait_for_overlap);
 			spin_unlock_irq(&conf->device_lock);
 			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 				copy_data(0, rbi, dev->page, dev->sector);
@@ -945,6 +964,9 @@ static void handle_stripe(struct stripe_
 			sh->dev[i].towrite = NULL;
 			if (bi) to_write--;
 
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -973,6 +995,8 @@ static void handle_stripe(struct stripe_
 			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
 				bi = sh->dev[i].toread;
 				sh->dev[i].toread = NULL;
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
 				if (bi) to_read--;
 				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
@@ -1400,6 +1424,7 @@ static int make_request (request_queue_t
 	if ( bio_data_dir(bi) == WRITE )
 		md_write_start(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
 		
 		new_sector = raid5_compute_sector(logical_sector,
 						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
@@ -1408,25 +1433,29 @@ static int make_request (request_queue_t
 			(unsigned long long)new_sector, 
 			(unsigned long long)logical_sector);
 
+		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+
+	retry:
 		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
-
-			while (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-				/* add failed due to overlap.  Flush everything
+			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				/* Add failed due to overlap.  Flush everything
 				 * and wait a while
-				 * FIXME - overlapping requests should be handled better
 				 */
 				raid5_unplug_device(mddev->queue);
-				set_current_state(TASK_UNINTERRUPTIBLE);
-				schedule_timeout(1);
+				release_stripe(sh);
+				schedule();
+				goto retry;
 			}
-
+			finish_wait(&conf->wait_for_overlap, &w);
 			raid5_plug_device(conf);
 			handle_stripe(sh);
 			release_stripe(sh);
+			
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			finish_wait(&conf->wait_for_overlap, &w);
 			break;
 		}
 			
@@ -1574,6 +1603,7 @@ static int run (mddev_t *mddev)
 
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
+	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->inactive_list);

diff ./drivers/md/raid6main.c~current~ ./drivers/md/raid6main.c
--- ./drivers/md/raid6main.c~current~	2005-02-07 13:34:09.000000000 +1100
+++ ./drivers/md/raid6main.c	2005-02-07 13:34:23.000000000 +1100
@@ -54,7 +54,7 @@
  * This macro is used to determine the 'next' bio in the list, given the sector
  * of the current stripe+device
  */
-#define r5_next_bio(bio, sect) ( ( bio->bi_sector + (bio->bi_size>>9) < sect + STRIPE_SECTORS) ? bio->bi_next : NULL)
+#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
 /*
  * The following can be used to debug the driver
  */
@@ -690,7 +690,7 @@ static void copy_data(int frombio, struc
 		if (len > 0 && page_offset + len > STRIPE_SIZE)
 			clen = STRIPE_SIZE - page_offset;	
 		else clen = len;
-			
+		
 		if (clen > 0) {
 			char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
 			if (frombio)
@@ -735,6 +735,10 @@ static void compute_parity(struct stripe
 			if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
+
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
+
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 			}
@@ -897,7 +901,7 @@ static void compute_block_2(struct strip
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
 {
 	struct bio **bip;
 	raid6_conf_t *conf = sh->raid_conf;
@@ -914,10 +918,13 @@ static void add_stripe_bio (struct strip
 	else
 		bip = &sh->dev[dd_idx].toread;
 	while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-		BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
-		bip = & (*bip)->bi_next;
+		if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+			goto overlap;
+		bip = &(*bip)->bi_next;
 	}
-/* FIXME do I need to worry about overlapping bion */
+	if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 
+		goto overlap;
+
 	if (*bip && bi->bi_next && (*bip) != bi->bi_next)
 		BUG();
 	if (*bip)
@@ -932,7 +939,7 @@ static void add_stripe_bio (struct strip
 		(unsigned long long)sh->sector, dd_idx);
 
 	if (forwrite) {
-		/* check if page is coverred */
+		/* check if page is covered */
 		sector_t sector = sh->dev[dd_idx].sector;
 		for (bi=sh->dev[dd_idx].towrite;
 		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
@@ -944,6 +951,13 @@ static void add_stripe_bio (struct strip
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
+	return 1;
+
+ overlap:
+	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
+	spin_unlock_irq(&conf->device_lock);
+	spin_unlock(&sh->lock);
+	return 0;
 }
 
 
@@ -1007,6 +1021,8 @@ static void handle_stripe(struct stripe_
 			spin_lock_irq(&conf->device_lock);
 			rbi = dev->toread;
 			dev->toread = NULL;
+			if (test_and_clear_bit(R5_Overlap, &dev->flags))
+				wake_up(&conf->wait_for_overlap);
 			spin_unlock_irq(&conf->device_lock);
 			while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 				copy_data(0, rbi, dev->page, dev->sector);
@@ -1056,6 +1072,9 @@ static void handle_stripe(struct stripe_
 			sh->dev[i].towrite = NULL;
 			if (bi) to_write--;
 
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wake_up(&conf->wait_for_overlap);
+
 			while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 				struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
@@ -1084,6 +1103,8 @@ static void handle_stripe(struct stripe_
 			if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
 				bi = sh->dev[i].toread;
 				sh->dev[i].toread = NULL;
+				if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+					wake_up(&conf->wait_for_overlap);
 				if (bi) to_read--;
 				while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
 					struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
@@ -1563,6 +1584,7 @@ static int make_request (request_queue_t
 	if ( bio_data_dir(bi) == WRITE )
 		md_write_start(mddev);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+		DEFINE_WAIT(w);
 
 		new_sector = raid6_compute_sector(logical_sector,
 						  raid_disks, data_disks, &dd_idx, &pd_idx, conf);
@@ -1571,17 +1593,28 @@ static int make_request (request_queue_t
 		       (unsigned long long)new_sector,
 		       (unsigned long long)logical_sector);
 
+		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+
+	retry:
 		sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
 		if (sh) {
-
-			add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
-
+			if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+				/* Add failed due to overlap.  Flush everything
+				 * and wait a while
+				 */
+				raid6_unplug_device(mddev->queue);
+				release_stripe(sh);
+				schedule();
+				goto retry;
+			}
+			finish_wait(&conf->wait_for_overlap, &w);
 			raid6_plug_device(conf);
 			handle_stripe(sh);
 			release_stripe(sh);
 		} else {
 			/* cannot get stripe for read-ahead, just give-up */
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
+			finish_wait(&conf->wait_for_overlap, &w);
 			break;
 		}
 
@@ -1729,6 +1762,7 @@ static int run (mddev_t *mddev)
 
 	spin_lock_init(&conf->device_lock);
 	init_waitqueue_head(&conf->wait_for_stripe);
+	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->inactive_list);

diff ./include/linux/raid/raid5.h~current~ ./include/linux/raid/raid5.h
--- ./include/linux/raid/raid5.h~current~	2005-02-07 13:31:05.000000000 +1100
+++ ./include/linux/raid/raid5.h	2005-02-07 13:34:23.000000000 +1100
@@ -152,6 +152,7 @@ struct stripe_head {
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
+#define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 
 /*
  * Write method
@@ -219,6 +220,7 @@ struct raid5_private_data {
 	atomic_t		active_stripes;
 	struct list_head	inactive_list;
 	wait_queue_head_t	wait_for_stripe;
+	wait_queue_head_t	wait_for_overlap;
 	int			inactive_blocked;	/* release of inactive stripes blocked,
 							 * waiting for 25% to be free
 							 */        
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux