Subject: [001/002] raid0 reshape

Hello Neil,

Below is the raid0 grow code. I have decided to fix raid0 itself rather than
perform a raid0 -> raid4 -> raid0 transformation, for two reasons:
1. raid0 zones: this patch supports any zone transformation (a small sketch
   of the remapping idea follows this list).
2. It avoids an undesired dependency of raid0 on the raid4 re-striping code.
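
To make point 1 concrete, below is a tiny userspace model of the remapping the
grow relies on. It is only my illustration, not code from the patch, and it
assumes a single zone of equal-sized disks (the patch itself walks every zone):
a raid0 mapping turns an array sector into a (disk, sector-on-disk) pair, and
growing means reading each chunk through the old mapping and writing it back
through the new one.

/* sketch only: single-zone raid0 remapping model, not taken from the patch */
#include <stdio.h>

struct map {
	unsigned int ndisks;
	unsigned int chunk_sects;	/* chunk size in 512-byte sectors */
};

/* translate an array sector into (disk index, sector on that disk) */
static void remap(const struct map *m, unsigned long long raid_sector,
		  unsigned int *disk, unsigned long long *dev_sector)
{
	unsigned long long chunk = raid_sector / m->chunk_sects;
	unsigned long long offset = raid_sector % m->chunk_sects;

	*disk = chunk % m->ndisks;	/* round-robin striping */
	*dev_sector = (chunk / m->ndisks) * m->chunk_sects + offset;
}

int main(void)
{
	struct map old_map = { .ndisks = 2, .chunk_sects = 128 };	/* 64K chunks */
	struct map new_map = { .ndisks = 3, .chunk_sects = 128 };
	unsigned long long s;

	/* a grow walks the array chunk by chunk: read via old, write via new */
	for (s = 0; s < 1024; s += old_map.chunk_sects) {
		unsigned int od, nd;
		unsigned long long os, ns;

		remap(&old_map, s, &od, &os);
		remap(&new_map, s, &nd, &ns);
		printf("sector %4llu: disk%u:%llu -> disk%u:%llu\n",
		       s, od, os, nd, ns);
	}
	return 0;
}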

The following tests were conducted:
1. Various chunk sizes, 4K to 512K (mainly on 2.6.27 and 2.6.18).
2. Regrow (tested on 2.6.27 and 2.6.18); an example run is sketched below the list.
3. Various superblock versions: 0.90, 1.0, 1.1 and 1.2 (mainly on 2.6.27 and 2.6.18).
4. Assembling and mounting with older raid0 code (kernels without this patch) after the array was grown.
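
For reference, a typical regrow run looked roughly like the following. The
device names, chunk size and filesystem here are only an illustration of mine,
not taken from the actual test setup, and they assume an mdadm new enough to
issue the grow (as noted in the Kconfig help text below):

   mdadm --create /dev/md0 --level=0 --chunk=64 --raid-devices=2 /dev/loop0 /dev/loop1
   mkfs.ext3 /dev/md0
   mdadm --add /dev/md0 /dev/loop2
   mdadm --grow /dev/md0 --raid-disks=3
   cat /proc/mdstat          # watch the reshape progress
   fsck -n /dev/md0          # check the filesystem once the reshape has finished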

The patch passes checkpatch.pl. Apart from the reshape code, I also cleaned up
the existing code.
I am about to hand this code to our testing team for further tests.
Other things to do:
1. Speed up the reshape process; it is currently too slow.
2. Support chunk sizes that are not a power-of-two multiple of the page size.

I will be grateful for your criticism.

Raz


 drivers/md/Kconfig |   13 
 drivers/md/md.c    |    6 
 drivers/md/raid0.c |  711 ++++++++++++++++++++++++++++++++++---------
 drivers/md/raid0.h |    5 
 4 files changed, 590 insertions(+), 145 deletions(-)

Signed-off-by:  Neil Brown <neilb@xxxxxxx>
---
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675..a9f0ff6 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -77,6 +77,19 @@ config MD_RAID0
 
 	  If unsure, say Y.
 
+config MD_RAID0_RESHAPE
+	bool "Support adding drives to a raid-0 array (EXPERIMENTAL)"
+	depends on MD_RAID0 && EXPERIMENTAL
+	default n
+	---help---
+	  A RAID-0 set can be expanded by adding extra drives. This
+	  requires "restriping" the array.
+	  You will need mdadm version 2.4.x or later to use this.
+	  The mdadm usage is e.g.
+	       mdadm --grow /dev/md0 --raid-disks=6
+	  Note: The array can only be expanded.
+	  If unsure, say N.
+
 config MD_RAID1
 	tristate "RAID-1 (mirroring) mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ed5727c..82f57ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5707,6 +5707,8 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
 		max_blocks = mddev->resync_max_sectors >> 1;
 	else
 		max_blocks = mddev->dev_sectors / 2;
+	if (mddev->level == 0)
+		max_blocks = mddev->array_sectors>>1;
 
 	/*
 	 * Should not happen.
@@ -5915,7 +5917,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
 		if (mddev->pers) {
 			mddev->pers->status(seq, mddev);
 	 		seq_printf(seq, "\n      ");
-			if (mddev->pers->sync_request) {
+			if (mddev->pers->sync_request || !mddev->level) {
 				if (mddev->curr_resync > 2) {
 					status_resync(seq, mddev);
 					seq_printf(seq, "\n      ");
@@ -6146,7 +6148,7 @@ int md_allow_write(mddev_t *mddev)
 		return 0;
 	if (mddev->ro)
 		return 0;
-	if (!mddev->pers->sync_request)
+	if (!mddev->pers->sync_request && mddev->level != 0)
 		return 0;
 
 	spin_lock_irq(&mddev->write_lock);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d755..9e2b6de 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -18,11 +18,14 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
 */
 
+#include <linux/kthread.h>
 #include <linux/blkdev.h>
 #include <linux/seq_file.h>
 #include "md.h"
 #include "raid0.h"
 
+static int raid0_create_reshape_thread(mddev_t *mddev);
+
 static void raid0_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -53,27 +56,46 @@ static int raid0_congested(void *data, int bits)
 }
 

-static int create_strip_zones (mddev_t *mddev)
+static void raid0_dump_zones(mddev_t *mddev)
 {
-	int i, c, j;
-	sector_t current_start, curr_zone_start;
-	sector_t min_spacing;
+	int j, k, h;
+	char b[BDEVNAME_SIZE];
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
-	struct strip_zone *zone;
-	int cnt;
+	printk(KERN_INFO "***** %s configuration ******\n\n",
+		mdname(mddev));
+	h = 0;
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		printk(KERN_INFO "zone%d", j);
+		if (conf->hash_table[h] == conf->strip_zone+j)
+			printk(KERN_CONT "(h%d)", h++);
+		printk(KERN_CONT "=[");
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			printk(KERN_CONT "%s/", bdevname(
+				conf->strip_zone[j].dev[k]->bdev, b));
+		printk(KERN_CONT "]\n\t zone offset=%llu device offset=%llu size=%llukb\n",
+			(unsigned long long)conf->strip_zone[j].zone_start,
+			(unsigned long long)conf->strip_zone[j].dev_start,
+			(unsigned long long)conf->strip_zone[j].sectors>>1);
+	}
+	printk(KERN_INFO "**********************************\n\n");
+}
+
+
+static void raid0_count_zones(mddev_t *mddev, struct list_head *disks)
+{
+	int c = 0;
 	char b[BDEVNAME_SIZE];
- 
+	mdk_rdev_t  *rdev1, *rdev2;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 	/*
 	 * The number of 'same size groups'
 	 */
 	conf->nr_strip_zones = 0;
- 
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+	list_for_each_entry(rdev1, disks, same_set) {
 		printk(KERN_INFO "raid0: looking at %s\n",
 			bdevname(rdev1->bdev,b));
 		c = 0;
-		list_for_each_entry(rdev2, &mddev->disks, same_set) {
+		list_for_each_entry(rdev2, disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
 			       (unsigned long long)rdev1->sectors);
@@ -103,78 +125,72 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 	}
 	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
+}
 
-	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
-				conf->nr_strip_zones, GFP_KERNEL);
-	if (!conf->strip_zone)
-		return 1;
-	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
-				conf->nr_strip_zones*mddev->raid_disks,
-				GFP_KERNEL);
-	if (!conf->devlist)
-		return 1;
 
-	/* The first zone must contain all devices, so here we check that
-	 * there is a proper alignment of slots to devices and find them all
-	 */
-	zone = &conf->strip_zone[0];
-	cnt = 0;
-	smallest = NULL;
-	zone->dev = conf->devlist;
-	list_for_each_entry(rdev1, &mddev->disks, same_set) {
-		int j = rdev1->raid_disk;
+/*
+ * The first zone must contain all devices, so here we check that
+ * there is a proper alignment of slots to devices and find them all
+ */
+static int raid0_create_first_zone(mddev_t *mddev, struct list_head *disks)
+{
+	mdk_rdev_t *smallest = NULL;
+	mdk_rdev_t  *rdev;
+	int cnt = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	struct strip_zone *zone0 = &conf->strip_zone[0];
 
+	zone0->dev = conf->devlist;
+	list_for_each_entry(rdev, disks, same_set) {
+		int j = rdev->raid_disk;
 		if (j < 0 || j >= mddev->raid_disks) {
 			printk(KERN_ERR "raid0: bad disk number %d - "
 				"aborting!\n", j);
-			goto abort;
+			return -1;
 		}
-		if (zone->dev[j]) {
+		if (zone0->dev[j]) {
 			printk(KERN_ERR "raid0: multiple devices for %d - "
 				"aborting!\n", j);
-			goto abort;
+			return -1;
 		}
-		zone->dev[j] = rdev1;
-
-		blk_queue_stack_limits(mddev->queue,
-				       rdev1->bdev->bd_disk->queue);
-		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
-		 */
-
-		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
-		if (!smallest || (rdev1->sectors < smallest->sectors))
-			smallest = rdev1;
+		zone0->dev[j] = rdev;
+		if (!smallest || (rdev->sectors < smallest->sectors))
+			smallest = rdev;
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
 		printk(KERN_ERR "raid0: too few disks (%d of %d) - "
 			"aborting!\n", cnt, mddev->raid_disks);
-		goto abort;
+		return -1;
 	}
-	zone->nb_dev = cnt;
-	zone->sectors = smallest->sectors * cnt;
-	zone->zone_start = 0;
+	zone0->nb_dev = cnt;
+	zone0->sectors = smallest->sectors * cnt;
+	zone0->zone_start = 0;
+	return 0;
+}
+
+
 
-	current_start = smallest->sectors;
-	curr_zone_start = zone->sectors;
+static void raid0_set_higher_zones(mddev_t *mddev)
+{
+	int i, j, c;
+	mdk_rdev_t *rdev;
+	struct strip_zone *zone;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t *smallest;
+	sector_t current_start =
+		conf->strip_zone[0].sectors/conf->strip_zone[0].nb_dev;
+	sector_t curr_zone_start = conf->strip_zone[0].sectors;
 
 	/* now do the other zones */
-	for (i = 1; i < conf->nr_strip_zones; i++)
-	{
+	for (i = 1; i < conf->nr_strip_zones; i++) {
 		zone = conf->strip_zone + i;
 		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
-
 		printk(KERN_INFO "raid0: zone %d\n", i);
 		zone->dev_start = current_start;
 		smallest = NULL;
 		c = 0;
-
-		for (j=0; j<cnt; j++) {
+		for (j = 0; j < conf->strip_zone[0].nb_dev; j++) {
 			char b[BDEVNAME_SIZE];
 			rdev = conf->strip_zone[0].dev[j];
 			printk(KERN_INFO "raid0: checking %s ...",
@@ -197,25 +213,33 @@ static int create_strip_zones (mddev_t *mddev)
 		zone->sectors = (smallest->sectors - current_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
-
 		zone->zone_start = curr_zone_start;
 		curr_zone_start += zone->sectors;
-
 		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)current_start);
 	}
+}
 
-	/* Now find appropriate hash spacing.
-	 * We want a number which causes most hash entries to cover
-	 * at most two strips, but the hash table must be at most
-	 * 1 PAGE.  We choose the smallest strip, or contiguous collection
-	 * of strips, that has big enough size.  We never consider the last
-	 * strip though as it's size has no bearing on the efficacy of the hash
-	 * table.
-	 */
-	conf->spacing = curr_zone_start;
-	min_spacing = curr_zone_start;
+
+/* Now find appropriate hash spacing.
+ * We want a number which causes most hash entries to cover
+ * at most two strips, but the hash table must be at most
+ * 1 PAGE.  We choose the smallest strip, or contiguous collection
+ * of strips, that has big enough size.  We never consider the last
+ * strip though as it's size has no bearing on the efficacy of the hash
+ * table.
+ */
+static void raid0_find_hash_spacing(mddev_t *mddev)
+{
+	int i, j;
+	sector_t min_spacing;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	conf->spacing = 0;
+	for (i = 0; i < conf->nr_strip_zones; i++)
+		conf->spacing += conf->strip_zone[i].sectors;
+	min_spacing = conf->spacing;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
 	for (i=0; i < conf->nr_strip_zones-1; i++) {
 		sector_t s = 0;
@@ -225,16 +249,31 @@ static int create_strip_zones (mddev_t *mddev)
 		if (s >= min_spacing && s < conf->spacing)
 			conf->spacing = s;
 	}
+}
 
-	mddev->queue->unplug_fn = raid0_unplug;
+static int raid0_create_strip_zones(mddev_t *mddev, struct list_head *disks)
+{
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
+	raid0_count_zones(mddev, disks);
+	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
+				conf->nr_strip_zones, GFP_KERNEL);
+	if (!conf->strip_zone)
+		return 1;
+	conf->devlist = kzalloc(sizeof(mdk_rdev_t *)*
+				conf->nr_strip_zones*mddev->raid_disks,
+				GFP_KERNEL);
+	if (!conf->devlist)
+		return 1;
+	if (raid0_create_first_zone(mddev, disks))
+		return 1;
+	raid0_set_higher_zones(mddev);
+	raid0_find_hash_spacing(mddev);
+	mddev->queue->unplug_fn = raid0_unplug;
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
-
 	printk(KERN_INFO "raid0: done.\n");
 	return 0;
- abort:
-	return 1;
 }
 
 /**
@@ -265,79 +304,73 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 
 static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	sector_t array_sectors = 0;
+	int i;
 	mdk_rdev_t *rdev;
-
-	WARN_ONCE(sectors || raid_disks,
-		  "%s does not support generic reshape\n", __func__);
-
-	list_for_each_entry(rdev, &mddev->disks, same_set)
-		array_sectors += rdev->sectors;
-
+	sector_t array_sectors = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	for (i = 0; i < mddev->raid_disks; i++) {
+		rdev = devlist[i];
+		if (test_bit(In_sync, &rdev->flags))
+			array_sectors += rdev->sectors;
+	}
 	return array_sectors;
 }
 
-static int raid0_run (mddev_t *mddev)
+static void raid0_set_queue_limits(mddev_t *mddev)
 {
-	unsigned  cur=0, i=0, nb_zone;
-	s64 sectors;
-	raid0_conf_t *conf;
+	mdk_rdev_t  *rdev;
 
-	if (mddev->chunk_size == 0) {
-		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
-		return -EINVAL;
-	}
-	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
-	       mdname(mddev),
-	       mddev->chunk_size >> 9,
-	       (mddev->chunk_size>>1)-1);
-	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
-	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
-	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		blk_queue_stack_limits(mddev->queue,
+			       rdev->bdev->bd_disk->queue);
+		/* as we don't honour merge_bvec_fn, we must never risk
+		 * violating it, so limit ->max_sector to one PAGE, as
+		 * a one page request is never in violation.
+		 */
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
+		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
-	if (!conf)
-		goto out;
-	mddev->private = (void *)conf;
- 
-	conf->strip_zone = NULL;
-	conf->devlist = NULL;
-	if (create_strip_zones (mddev)) 
-		goto out_free_conf;
+	}
+}
 
-	/* calculate array device size */
-	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
+static int raid0_set_array_hash(mddev_t *mddev)
+{
+	int nb_zone = 0;
+	sector_t space;
+	int round;
+	sector_t s, sectors;
+	int  cur = 0, i = 0;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
 
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
 	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
 		(unsigned long long)conf->spacing);
-	{
-		sector_t s = raid0_size(mddev, 0, 0);
-		sector_t space = conf->spacing;
-		int round;
-		conf->sector_shift = 0;
-		if (sizeof(sector_t) > sizeof(u32)) {
-			/*shift down space and s so that sector_div will work */
-			while (space > (sector_t) (~(u32)0)) {
-				s >>= 1;
-				space >>= 1;
-				s += 1; /* force round-up */
-				conf->sector_shift++;
-			}
+
+	s = raid0_size(mddev, 0, mddev->raid_disks);
+	space = conf->spacing;
+	conf->sector_shift = 0;
+	if (sizeof(sector_t) > sizeof(u32)) {
+		/*shift down space and s so that sector_div will work */
+		while (space > (sector_t) (~(u32)0)) {
+			s >>= 1;
+			space >>= 1;
+			s += 1; /* force round-up */
+			conf->sector_shift++;
 		}
-		round = sector_div(s, (u32)space) ? 1 : 0;
-		nb_zone = s + round;
 	}
+	round = sector_div(s, (u32)space) ? 1 : 0;
+	nb_zone = s + round;
 	printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
 
 	printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
 				nb_zone*sizeof(struct strip_zone*));
 	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
 	if (!conf->hash_table)
-		goto out_free_conf;
+		return -1;
 	sectors = conf->strip_zone[cur].sectors;
-
 	conf->hash_table[0] = conf->strip_zone + cur;
 	for (i=1; i< nb_zone; i++) {
 		while (sectors <= conf->spacing) {
@@ -354,24 +387,59 @@ static int raid0_run (mddev_t *mddev)
 		 */
 		conf->spacing++;
 	}
+	return 0;
+}
 
-	/* calculate the max read-ahead size.
-	 * For read-ahead of large files to be effective, we need to
-	 * readahead at least twice a whole stripe. i.e. number of devices
-	 * multiplied by chunk size times 2.
-	 * If an individual device has an ra_pages greater than the
-	 * chunk size, then we will not drive that device as hard as it
-	 * wants.  We consider this a configuration error: a larger
-	 * chunksize should be used in that case.
-	 */
-	{
-		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
-		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
-	}
+/* calculate the max read-ahead size.
+ * For read-ahead of large files to be effective, we need to
+ * readahead at least twice a whole stripe. i.e. number of devices
+ * multiplied by chunk size times 2.
+ * If an individual device has an ra_pages greater than the
+ * chunk size, then we will not drive that device as hard as it
+ * wants.  We consider this a configuration error: a larger
+ * chunksize should be used in that case.
+ */
+static void raid0_set_max_ra(mddev_t *mddev)
+{
+	int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
+	if (mddev->queue->backing_dev_info.ra_pages < 2*stripe)
+		mddev->queue->backing_dev_info.ra_pages = 2*stripe;
 
+}
+
+static int raid0_run(mddev_t *mddev)
+{
+	raid0_conf_t *conf;
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+		return -EINVAL;
+	}
+	printk(KERN_INFO "%s: setting max_sectors"
+			" to %d, segment boundary to %d\n",
+	       mdname(mddev),
+	       mddev->chunk_size >> 9,
+	       (mddev->chunk_size>>1)-1);
+	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
+	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
+	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
+	conf = kmalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+	if (!conf)
+		goto out;
+	mddev->private = (void *)conf;
+	conf->strip_zone = NULL;
+	conf->devlist = NULL;
+	if (raid0_create_strip_zones(mddev, &mddev->disks))
+		goto out_free_conf;
+	/* calculate array device size */
+	md_set_array_sectors(mddev, raid0_size(mddev, 0, mddev->raid_disks));
+	raid0_set_array_hash(mddev);
+	raid0_set_queue_limits(mddev);
+	raid0_set_max_ra(mddev);
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+	raid0_dump_zones(mddev);
+	raid0_create_reshape_thread(mddev);
+	init_completion(&conf->wait_reshape);
 	return 0;
 
 out_free_conf:
@@ -386,7 +454,10 @@ out:
 static int raid0_stop (mddev_t *mddev)
 {
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-
+	if (mddev->thread) {
+		md_unregister_thread(mddev->thread);
+		mddev->thread = NULL;
+	}
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->hash_table);
 	conf->hash_table = NULL;
@@ -414,7 +485,10 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_endio(bio, -EOPNOTSUPP);
 		return 0;
 	}
-
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
+		bio_endio(bio, -EBUSY);
+		return 0;
+	}
 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
 	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
@@ -513,6 +587,357 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	return;
 }
 
+#ifdef CONFIG_MD_RAID0_RESHAPE
+
+#define DEBUG 0
+#define r0_dprintk(x...) ((void)(DEBUG && printk(x)))
+
+static void raid0_reshape_endio(struct bio *bi, int error)
+{
+	struct completion* w = (struct completion *)bi->bi_private;
+	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+	r0_dprintk("raid0: endio: sec=%lld:size=%d "
+		"bvlen=%d bvoff=%d \n",
+			(unsigned long long)bi->bi_sector,
+			bi->bi_size,
+			bi->bi_io_vec[0].bv_len,
+			bi->bi_io_vec[0].bv_offset);
+	if (error && !uptodate)
+		printk(KERN_ERR "raid0: end reshape: io error sector=%llu\n",
+			(unsigned long long)bi->bi_sector);
+	complete(w);
+}
+
+static int raid0_reshape_rw(struct bio *bi, int dir, int size)
+{
+	char b[BDEVNAME_SIZE];
+	bi->bi_rw  	= dir;
+	bi->bi_size     = size;
+	bi->bi_idx      = 0;
+	r0_dprintk("%s %c %llu sec size=%d\n",
+			bdevname(bi->bi_bdev, b),
+			dir == 0 ? 'R' : 'W',
+			(unsigned long long)bi->bi_sector, bi->bi_size);
+	generic_make_request(bi);
+	wait_for_completion((struct completion *)(bi->bi_private));
+	return 0;
+}
+
+static struct strip_zone *raid0_point_to_zone(mddev_t *mddev,
+					sector_t sector)
+{
+	sector_t x;
+	struct strip_zone *zone;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	x = sector >> conf->sector_shift;
+	sector_div(x, (u32)conf->spacing);
+	zone = conf->hash_table[x];
+	while (sector >= zone->zone_start + zone->sectors)
+		zone++;
+	return zone;
+}
+
+
+static int raid0_point_bio_to_disk(struct bio *bio, sector_t raid_sector,
+				mddev_t *mddev)
+{
+	int chunksect_bits;
+	mdk_rdev_t *tmp_dev;
+	sector_t x, chunk_sects, chunk, rsect;
+	sector_t sect_in_chunk;
+	struct strip_zone *zone;
+
+	chunk_sects = mddev->chunk_size >> 9;
+	chunksect_bits = ffz(~chunk_sects);
+
+	zone = raid0_point_to_zone(mddev, raid_sector);
+	sect_in_chunk = raid_sector & (chunk_sects - 1);
+	x = (raid_sector - zone->zone_start) >> chunksect_bits;
+	sector_div(x, zone->nb_dev);
+	chunk = x;
+	x = raid_sector >> chunksect_bits;
+	tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
+	rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
+
+	bio->bi_bdev   = tmp_dev->bdev;
+	bio->bi_sector = rsect + tmp_dev->data_offset;
+	return 0;
+}
+
+
+static void raid0_take_speed(mddev_t *mddev, sector_t raid_sector)
+{
+	if ((jiffies-mddev->resync_mark) < 1000)
+		return;
+	mddev->resync_mark = jiffies;
+	mddev->resync_mark_cnt = raid_sector;
+}
+
+
+static sector_t raid0_reshape_move_blocks(mddev_t *mddev,
+					mddev_t *mddev_target,
+					struct strip_zone *zone)
+{
+	raid0_conf_t *conf 	= mddev_to_conf(mddev);
+	struct bio  *bi 	= conf->reshape_bi;
+	int io_size 		= bi->bi_size;
+	sector_t raid_sector    = zone->zone_start;
+	sector_t last_sector 	= (zone->zone_start + zone->sectors);
+	mddev->curr_mark_cnt    = io_size>>10;
+
+	while (raid_sector < last_sector && !kthread_should_stop()) {
+		raid0_take_speed(mddev, raid_sector);
+		if (raid0_point_bio_to_disk(bi, raid_sector, mddev)) {
+			printk(KERN_ERR "raid0:reshape point"
+					" read to bio failed\n");
+			break;
+		}
+		raid0_reshape_rw(bi, READ, io_size);
+		if (raid0_point_bio_to_disk(bi, raid_sector, mddev_target)) {
+			printk(KERN_ERR "raid0: point write to bio failed\n");
+			break;
+		}
+		raid0_reshape_rw(bi, WRITE, io_size);
+		raid_sector += io_size>>9;
+		mddev->curr_mark_cnt = raid_sector;
+		mddev->curr_resync = raid_sector;
+	}
+	bi->bi_size = io_size;
+	return raid_sector - zone->zone_start;
+}
+
+
+static void raid0_reshape_move_zones(mddev_t *mddev, mddev_t *mddev_target)
+{
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	sector_t raid_sector = 0;
+	int i = 0;
+	for (; i < conf->nr_strip_zones && !kthread_should_stop() ; i++)
+		raid_sector += raid0_reshape_move_blocks(mddev,
+						mddev_target,
+						&conf->strip_zone[i]);
+	if (raid_sector == mddev->array_sectors) {
+		printk(KERN_INFO "raid0: reshape ended %llu sectors moved OK\n",
+			(unsigned long long)raid_sector);
+	} else {
+		printk(KERN_INFO "raid0: reshape ended %llu sectors moved BAD\n",
+			(unsigned long long)raid_sector);
+	}
+}
+
+
+static int raid0_reshape_prepare(mddev_t *mddev, mddev_t *mddev_target)
+{
+	raid0_conf_t *conf;
+	mddev_target->private = NULL;
+	conf = kzalloc(sizeof(raid0_conf_t), GFP_KERNEL);
+	if (!conf)
+		return -1;
+	mddev_target->private = (void *)conf;
+	conf->strip_zone = NULL;
+	conf->devlist = NULL;
+	if (raid0_create_strip_zones(mddev_target, &mddev->disks))
+		return -1;
+	return raid0_set_array_hash(mddev_target);
+}
+
+
+static	mddev_t *raid0_clone_mddev(mddev_t *mddev)
+{
+	void *m = kmalloc(sizeof(*mddev), GFP_NOIO);
+	if (!m)
+		return NULL;
+	memcpy(m, mddev, sizeof(*mddev));
+	return (mddev_t *)m;
+}
+
+static int raid0_reshape_iosize(mddev_t *mddev)
+{
+	int chunk_size_sectors = (mddev->chunk_size / PAGE_SIZE)*8;
+
+	if (mddev->queue->max_hw_sectors >= chunk_size_sectors)
+		return chunk_size_sectors;
+	if ((chunk_size_sectors % mddev->queue->max_hw_sectors) == 0)
+		return mddev->queue->max_hw_sectors;
+	return chunk_size_sectors /
+		((chunk_size_sectors / mddev->queue->max_hw_sectors)*2);
+}
+
+
+static mddev_t *raid0_reshape_init(mddev_t *mddev)
+{
+	int i;
+	mddev_t *mddev_target = NULL;
+	mdk_rdev_t *rdev = NULL;
+	int nraid_disks = 0;
+	struct bio *bi = NULL;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	int pages = raid0_reshape_iosize(mddev)/8;
+	if (pages == 0) {
+		printk(KERN_INFO "raid0: failed to "
+				"determine transfer size\n");
+		return NULL;
+	}
+	printk(KERN_INFO "raid0: using transfer size %u sectors\n", pages*8);
+	bi = bio_alloc(GFP_NOIO, pages);
+	if (!bi) {
+		printk(KERN_INFO "raid0: failed to alloc bio for"
+			" reshaping, rejecting\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	mddev_target = raid0_clone_mddev(mddev);
+	bi->bi_vcnt = 0;
+	if (!mddev_target) {
+		printk(KERN_INFO "raid0: failed to clone mddev\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	mddev->reshape_position = 0;
+	mddev->delta_disks = 0;
+	atomic_set(&mddev->recovery_active, 0);
+	nraid_disks = mddev->raid_disks;
+
+	list_for_each_entry(rdev, &mddev->disks, same_set) {
+		if (!test_bit(In_sync, &rdev->flags)) {
+			rdev->raid_disk = nraid_disks++;
+			rdev->desc_nr = rdev->raid_disk;
+			set_bit(In_sync, &rdev->flags);
+		}
+	}
+	mddev_target->raid_disks = nraid_disks;
+	if (raid0_reshape_prepare(mddev, mddev_target)) {
+		printk(KERN_INFO "raid0: failed to"
+			" setup temporary mappings\n");
+		goto RAID0_RESHAPE_INIT_EXIT_BAD;
+	}
+	bi->bi_vcnt = pages;
+	for (i = 0; i < bi->bi_vcnt; i++) {
+		bi->bi_io_vec[i].bv_len    = PAGE_SIZE;
+		bi->bi_io_vec[i].bv_offset = 0;
+		bi->bi_io_vec[i].bv_page   = alloc_page(GFP_NOIO);
+		get_page(bi->bi_io_vec[i].bv_page);
+	}
+	bi->bi_next    		   = NULL;
+	bi->bi_end_io 		   = raid0_reshape_endio;
+	bi->bi_size     	   = PAGE_SIZE * bi->bi_vcnt;
+	bi->bi_private  	   = &conf->wait_reshape;
+	bi->bi_idx  		   = 0;
+	conf->reshape_bi 	   = bi;
+	return mddev_target;
+
+RAID0_RESHAPE_INIT_EXIT_BAD:
+	kfree(mddev_target);
+	for (i = 0; bi && i < bi->bi_vcnt; i++)
+		safe_put_page(bi->bi_io_vec[i].bv_page);
+	if (bi)
+		bio_put(bi);
+	return NULL;
+}
+
+
+static void raid0_reshape_thread(mddev_t *mddev)
+{
+	int i = 0;
+	mddev_t *mddev_target = NULL;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return;
+	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+	clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	mddev_target = raid0_reshape_init(mddev);
+	if (!mddev_target)
+		return;
+	raid0_reshape_move_zones(mddev, mddev_target);
+	if (kthread_should_stop())
+		goto RAID0_RELEASE_PSEUDO_RAID;
+	for (i = 0; i < conf->reshape_bi->bi_vcnt; i++)
+		safe_put_page(conf->reshape_bi->bi_io_vec[i].bv_page);
+	bio_put(conf->reshape_bi);
+	mddev->resync_mark = 0L;
+	mddev->resync_mark_cnt = 0L;
+	mddev->curr_resync = 0;
+	mddev->recovery_cp = MaxSector;
+	mddev->reshape_position = MaxSector;
+	mddev->raid_disks = mddev_target->raid_disks;
+	kfree(conf->hash_table);
+	kfree(conf);
+	mutex_lock(&mddev->reconfig_mutex);
+	raid0_run(mddev);
+RAID0_RELEASE_PSEUDO_RAID:
+	if (!mutex_is_locked(&mddev->reconfig_mutex))
+		mutex_lock(&mddev->reconfig_mutex);
+	mddev->in_sync = 1;
+	if (md_allow_write(mddev)) {
+		printk(KERN_ERR "raid0: could not write sb,"
+				" critical error\n");
+	}
+	mutex_unlock(&mddev->reconfig_mutex);
+	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	conf = mddev_target->private;
+	kfree(conf->hash_table);
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(mddev_target);
+}
+
+
+static int raid0_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *rdev1;
+	if (rdev->sectors < (mddev->chunk_size>>9)) {
+		printk(KERN_INFO "raid0: device smaller than "
+			"chunk size, %llu sectors < %llu sectors\n",
+				(unsigned long long)rdev->sectors,
+				((unsigned long long)mddev->chunk_size)>>9);
+		return -1;
+	}
+	if (rdev->bdev->bd_disk->queue->max_hw_sectors <
+				mddev->queue->max_hw_sectors) {
+		printk(KERN_INFO "raid0: device transfer"
+			" size %u sectors is smaller than the other"
+			" raid components' %u sectors, rejecting\n",
+			 rdev->bdev->bd_disk->queue->max_hw_sectors,
+			 mddev->queue->max_hw_sectors);
+		return -1;
+	}
+	list_for_each_entry(rdev1, &mddev->disks, same_set) {
+		if (rdev1 == rdev) {
+			clear_bit(In_sync, &rdev->flags);
+			return 0;
+		}
+	}
+	return -1;
+}
+
+
+static int raid0_create_reshape_thread(mddev_t *mddev)
+{
+	if (mddev->thread)
+		return 0;
+	mddev->thread = md_register_thread(
+			raid0_reshape_thread,
+				mddev, "%s_raid0");
+	if (!mddev->thread) {
+		printk(KERN_ERR
+			"raid0: couldn't allocate thread for %s\n",
+			mdname(mddev));
+		return -1;
+	}
+	mddev->recovery_cp = MaxSector;
+	return 0;
+}
+
+
+static int raid0_reshape(mddev_t *mddev)
+{
+	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	return 0;
+}
+
+#endif
+
 static struct mdk_personality raid0_personality=
 {
 	.name		= "raid0",
@@ -523,6 +948,10 @@ static struct mdk_personality raid0_personality=
 	.stop		= raid0_stop,
 	.status		= raid0_status,
 	.size		= raid0_size,
+#ifdef CONFIG_MD_RAID0_RESHAPE
+	.check_reshape	= raid0_reshape,
+	.hot_add_disk	= raid0_add_disk,
+#endif
 };
 
 static int __init raid0_init (void)
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12e..ff2dca9 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -14,9 +14,10 @@ struct raid0_private_data
 {
 	struct strip_zone **hash_table; /* Table of indexes into strip_zone */
 	struct strip_zone *strip_zone;
-	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
+	mdk_rdev_t **devlist;/* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-
+	struct bio *reshape_bi;
+	struct completion wait_reshape;
 	sector_t spacing;
 	int sector_shift; /* shift this before divide by spacing */
 };


