[PATCH md 005 of 6] Support write-mostly device in raid1

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This allows a device in a raid1 to be marked as "write mostly".
Read requests will only be sent if there is no other option.

Signed-off-by: Neil Brown <neilb@xxxxxxxxxxxxxxx>

### Diffstat output
 ./drivers/md/md.c           |   18 ++++++++++
 ./drivers/md/raid1.c        |   76 ++++++++++++++++++++++++++++++--------------
 ./include/linux/raid/md_k.h |    3 +
 ./include/linux/raid/md_p.h |   11 +++++-
 4 files changed, 82 insertions(+), 26 deletions(-)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~	2005-08-11 15:35:33.000000000 +1000
+++ ./drivers/md/md.c	2005-08-11 16:03:03.000000000 +1000
@@ -671,6 +671,7 @@ static int super_90_validate(mddev_t *md
 
 	if (mddev->level != LEVEL_MULTIPATH) {
 		rdev->faulty = 0;
+		rdev->flags = 0;
 		desc = sb->disks + rdev->desc_nr;
 
 		if (desc->state & (1<<MD_DISK_FAULTY))
@@ -680,6 +681,8 @@ static int super_90_validate(mddev_t *md
 			rdev->in_sync = 1;
 			rdev->raid_disk = desc->raid_disk;
 		}
+		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		rdev->in_sync = 1;
 	return 0;
@@ -778,6 +781,8 @@ static void super_90_sync(mddev_t *mddev
 			spare++;
 			working++;
 		}
+		if (test_bit(WriteMostly, &rdev2->flags))
+			d->state |= (1<<MD_DISK_WRITEMOSTLY);
 	}
 	
 	/* now set the "removed" and "faulty" bits on any missing devices */
@@ -991,6 +996,9 @@ static int super_1_validate(mddev_t *mdd
 			rdev->raid_disk = role;
 			break;
 		}
+		rdev->flags = 0;
+		if (sb->devflags & WriteMostly1)
+			set_bit(WriteMostly, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		rdev->in_sync = 1;
 
@@ -2151,6 +2159,8 @@ static int get_disk_info(mddev_t * mddev
 			info.state |= (1<<MD_DISK_ACTIVE);
 			info.state |= (1<<MD_DISK_SYNC);
 		}
+		if (test_bit(WriteMostly, &rdev->flags))
+			info.state |= (1<<MD_DISK_WRITEMOSTLY);
 	} else {
 		info.major = info.minor = 0;
 		info.raid_disk = -1;
@@ -2236,6 +2246,9 @@ static int add_new_disk(mddev_t * mddev,
 		rdev->saved_raid_disk = rdev->raid_disk;
 
 		rdev->in_sync = 0; /* just to be sure */
+		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
+
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err)
@@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev,
 		else
 			rdev->in_sync = 0;
 
+		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+			set_bit(WriteMostly, &rdev->flags);
+
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
 			export_rdev(rdev);
@@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *
 			char b[BDEVNAME_SIZE];
 			seq_printf(seq, " %s[%d]",
 				bdevname(rdev->bdev,b), rdev->desc_nr);
+			if (test_bit(WriteMostly, &rdev->flags))
+				seq_printf(seq, "(W)");
 			if (rdev->faulty) {
 				seq_printf(seq, "(F)");
 				continue;

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2005-08-11 15:14:05.000000000 +1000
+++ ./drivers/md/raid1.c	2005-08-11 16:03:03.000000000 +1000
@@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1
 {
 	const unsigned long this_sector = r1_bio->sector;
 	int new_disk = conf->last_used, disk = new_disk;
+	int wonly_disk = -1;
 	const int sectors = r1_bio->sectors;
 	sector_t new_distance, current_distance;
-	mdk_rdev_t *new_rdev, *rdev;
+	mdk_rdev_t *rdev;
 
 	rcu_read_lock();
 	/*
-	 * Check if it if we can balance. We can balance on the whole
+	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on, or below the resync window.
 	 * We take the first readable disk when above the resync window.
 	 */
@@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1
 		/* Choose the first operation device, for consistancy */
 		new_disk = 0;
 
-		while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-		       !new_rdev->in_sync) {
-			new_disk++;
-			if (new_disk == conf->raid_disks) {
-				new_disk = -1;
+		for (rdev = conf->mirrors[new_disk].rdev;
+		     !rdev || !rdev->in_sync
+			     || test_bit(WriteMostly, &rdev->flags);
+		     rdev = conf->mirrors[++new_disk].rdev) {
+		     
+			if (rdev && rdev->in_sync)
+				wonly_disk = new_disk;
+
+			if (new_disk == conf->raid_disks - 1) {
+				new_disk = wonly_disk;
 				break;
 			}
 		}
@@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1
 
 
 	/* make sure the disk is operational */
-	while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
-	       !new_rdev->in_sync) {
+	for (rdev = conf->mirrors[new_disk].rdev;
+	     !rdev || !rdev->in_sync ||
+		     test_bit(WriteMostly, &rdev->flags);
+	     rdev = conf->mirrors[new_disk].rdev) {
+
+		if (rdev && rdev->in_sync)
+			wonly_disk = new_disk;
+
 		if (new_disk <= 0)
 			new_disk = conf->raid_disks;
 		new_disk--;
 		if (new_disk == disk) {
-			new_disk = -1;
-			goto rb_out;
+			new_disk = wonly_disk;
+			break;
 		}
 	}
+
+	if (new_disk < 0) 
+		goto rb_out;
+
 	disk = new_disk;
 	/* now disk == new_disk == starting point for search */
 
@@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1
 			disk = conf->raid_disks;
 		disk--;
 
-		if ((rdev=conf->mirrors[disk].rdev) == NULL ||
-		    !rdev->in_sync)
+		rdev = conf->mirrors[disk].rdev;
+
+		if (!rdev ||
+		    !rdev->in_sync ||
+		    test_bit(WriteMostly, &rdev->flags))
 			continue;
 
 		if (!atomic_read(&rdev->nr_pending)) {
 			new_disk = disk;
-			new_rdev = rdev;
 			break;
 		}
 		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
 		if (new_distance < current_distance) {
 			current_distance = new_distance;
 			new_disk = disk;
-			new_rdev = rdev;
 		}
 	} while (disk != conf->last_used);
 
-rb_out:
+ rb_out:
 
 
 	if (new_disk >= 0) {
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
-		atomic_inc(&new_rdev->nr_pending);
-		if (!new_rdev->in_sync) {
+		rdev = conf->mirrors[new_disk].rdev;
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (!rdev->in_sync) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
-			atomic_dec(&new_rdev->nr_pending);
+			atomic_dec(&rdev->nr_pending);
 			goto retry;
 		}
+		conf->next_seq_sect = this_sector + sectors;
+		conf->last_used = new_disk;
 	}
 	rcu_read_unlock();
 
@@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *md
 	sector_t max_sector, nr_sectors;
 	int disk;
 	int i;
+	int wonly;
 	int write_targets = 0;
 	int sync_blocks;
 	int still_degraded = 0;
@@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *md
 	 */
 	disk = conf->last_used;
 	/* make sure disk is operational */
-
+	wonly = disk;
 	while (conf->mirrors[disk].rdev == NULL ||
-	       !conf->mirrors[disk].rdev->in_sync) {
+	       !conf->mirrors[disk].rdev->in_sync ||
+	       test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
+		) {
+		if (conf->mirrors[disk].rdev  &&
+		    conf->mirrors[disk].rdev->in_sync)
+			wonly = disk;
 		if (disk <= 0)
 			disk = conf->raid_disks;
 		disk--;
-		if (disk == conf->last_used)
+		if (disk == conf->last_used) {
+			disk = wonly;
 			break;
+		}
 	}
 	conf->last_used = disk;
 	atomic_inc(&conf->mirrors[disk].rdev->nr_pending);

diff ./include/linux/raid/md_k.h~current~ ./include/linux/raid/md_k.h
--- ./include/linux/raid/md_k.h~current~	2005-08-11 15:28:56.000000000 +1000
+++ ./include/linux/raid/md_k.h	2005-08-11 16:03:03.000000000 +1000
@@ -181,6 +181,9 @@ struct mdk_rdev_s
 	int faulty;			/* if faulty do not issue IO requests */
 	int in_sync;			/* device is a full member of the array */
 
+	unsigned long	flags;		/* Should include faulty and in_sync here. */
+#define	WriteMostly	4		/* Avoid reading if at all possible */
+
 	int desc_nr;			/* descriptor index in the superblock */
 	int raid_disk;			/* role of device in array */
 	int saved_raid_disk;		/* role that device used to have in the

diff ./include/linux/raid/md_p.h~current~ ./include/linux/raid/md_p.h
--- ./include/linux/raid/md_p.h~current~	2005-08-11 16:03:03.000000000 +1000
+++ ./include/linux/raid/md_p.h	2005-08-11 16:03:03.000000000 +1000
@@ -79,6 +79,11 @@
 #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 #define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */
 
+#define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
+				   * read requests will only be sent here in 
+				   * dire need
+				   */
+
 typedef struct mdp_device_descriptor_s {
 	__u32 number;		/* 0 Device number in the entire set	      */
 	__u32 major;		/* 1 Device major number		      */
@@ -193,7 +198,7 @@ struct mdp_superblock_1 {
 
 	__u64	ctime;		/* lo 40 bits are seconds, top 24 are microseconds or 0*/
 	__u32	level;		/* -4 (multipath), -1 (linear), 0,1,4,5 */
-	__u32	layout;		/* only for raid5 currently */
+	__u32	layout;		/* only for raid5 and raid10 currently */
 	__u64	size;		/* used size of component devices, in 512byte sectors */
 
 	__u32	chunksize;	/* in 512byte sectors */
@@ -212,7 +217,9 @@ struct mdp_superblock_1 {
 	__u32	dev_number;	/* permanent identifier of this  device - not role in raid */
 	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
-	__u8	pad2[64-56];	/* set to 0 when writing */
+	__u8	devflags;	/* per-device flags.  Only one defined...*/
+#define	WriteMostly1	1	/* mask for writemostly flag in above */
+	__u8	pad2[64-57];	/* set to 0 when writing */
 
 	/* array state information - 64 bytes */
 	__u64	utime;		/* 40 bits second, 24 btes microseconds */
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux