[dm-devel] mirroring: [patch 4 of 8] device failure tolerance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds detection of device failure for writes. It also contains the read balancing code as a byproduct.

 brassow

diff -urN linux-2.6.12-004/drivers/md/dm-raid1.c linux-2.6.12-005/drivers/md/dm-raid1.c --- linux-2.6.12-004/drivers/md/dm-raid1.c 2005-06-28 16:46:37.000000000 -0500 +++ linux-2.6.12-005/drivers/md/dm-raid1.c 2005-06-29 10:48:36.137827465 -0500
@@ -28,6 +28,8 @@
 	queue_work(_kmirrord_wq, &_kmirrord_work);
 }

+static struct workqueue_struct *_mir_mond_wq;
+
 /*-----------------------------------------------------------------
  * Region hash
  *
@@ -553,7 +555,8 @@
  * Mirror set structures.
  *---------------------------------------------------------------*/
 struct mirror {
-	atomic_t error_count;
+	atomic_t error_count;  /* Error counter to flag mirror failure */
+	struct mirror_set *ms;	/* Back-pointer to the owning mirror set */
 	struct dm_dev *dev;
 	sector_t offset;
 };
@@ -564,16 +567,23 @@
 	struct region_hash rh;
 	struct kcopyd_client *kcopyd_client;

-	spinlock_t lock;	/* protects the next two lists */
+	spinlock_t lock;	/* protects the lists */
 	struct bio_list reads;
 	struct bio_list writes;
+	struct bio_list failures;
+	struct work_struct failure_work;

 	/* recovery */
+	atomic_t suspended;
 	region_t nr_regions;
 	int in_sync;

 	unsigned int nr_mirrors;
-	struct mirror mirror[0];
+	spinlock_t choose_lock; /* protects select in choose_mirror(). */
+	atomic_t read_count;    /* Read counter for read balancing. */
+	unsigned int read_mirror;       /* Last mirror read. */
+	struct mirror *default_mirror;  /* Default mirror. */
+ 	struct mirror mirror[0];
 };

 /*
@@ -621,7 +631,7 @@
 	unsigned long flags = 0;

 	/* fill in the source */
-	m = ms->mirror + DEFAULT_MIRROR;
+	m = ms->default_mirror;
 	from.bdev = m->dev->bdev;
 	from.sector = m->offset + region_to_sector(reg->rh, reg->key);
 	if (reg->key == (ms->nr_regions - 1)) {
@@ -637,7 +647,7 @@

 	/* fill in the destinations */
 	for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
-		if (i == DEFAULT_MIRROR)
+		if (&ms->mirror[i] == ms->default_mirror)
 			continue;

 		m = ms->mirror + i;
@@ -687,12 +697,74 @@
 }

 /*-----------------------------------------------------------------
- * Reads
+ * Misc Functions
  *---------------------------------------------------------------*/
-static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+#define MIN_READS       128	/* picks on one mirror before advancing */
+/*
+ * choose_mirror - pick an error-free mirror; rotate every MIN_READS picks
+ * @ms: the mirror set
+ * @m: mirror that has failed, or NULL if just choosing
+ * If @m is the default mirror, ms->default_mirror is moved as well.
+ * Returns: chosen mirror, or NULL if every mirror has a non-zero error_count
+ */
+static struct mirror *choose_mirror(struct mirror_set *ms, struct mirror *m)
+{
+	int i, retry;
+	unsigned long flags;
+	struct mirror *ret = NULL;
+
+	spin_lock_irqsave(&ms->choose_lock, flags);
+
+	if (unlikely(m == ms->default_mirror)) {
+		i = DEFAULT_MIRROR;
+		atomic_set(&ms->read_count, MIN_READS);
+	} else {
+		i = ms->read_mirror;
+	}
+
+	for (retry = 0; retry < ms->nr_mirrors; ) {
+		i %= ms->nr_mirrors;
+		ret = ms->mirror + i;
+
+		if (unlikely(atomic_read(&ret->error_count))) {
+			retry++;
+			i++;
+		} else {
+			/*
+			 * Stay on this mirror until read_count runs out,
+			 * then start the next pick at the following mirror.
+			 */
+			if (atomic_dec_and_test(&ms->read_count)) {
+				atomic_set(&ms->read_count, MIN_READS);
+				i++;
+			}
+
+			ms->read_mirror = i;
+			break;
+		}
+	}
+
+	/* Default mirror failed: move it to ret (dead too if none are left) */
+	if (unlikely(m == ms->default_mirror)) {
+		ms->default_mirror = ret;
+	}
+
+	spin_unlock_irqrestore(&ms->choose_lock, flags);
+
+	if (unlikely(atomic_read(&ret->error_count))) {
+		DMERR("All mirror devices are dead. Unable to choose mirror.");
+		return NULL;
+	}
+
+	return ret;
+}
+
+static void fail_mirror(struct mirror *m)
 {
-	/* FIXME: add read balancing */
-	return ms->mirror + DEFAULT_MIRROR;
+	DMINFO("incrementing error_count on %s", m->dev->name);
+	atomic_inc(&m->error_count);
+
+	choose_mirror(m->ms, m);	/* moves default_mirror if m was it */
 }

 /*
@@ -704,6 +776,9 @@
 	bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
 }

+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 {
 	region_t region;
@@ -717,9 +792,9 @@
 		 * We can only read balance if the region is in sync.
 		 */
 		if (rh_in_sync(&ms->rh, region, 0) == RH_CLEAN)
-			m = choose_mirror(ms, bio->bi_sector);
+			m = choose_mirror(ms, NULL);
 		else
-			m = ms->mirror + DEFAULT_MIRROR;
+			m = ms->default_mirror;;

 		map_bio(ms, m, bio);
 		generic_make_request(bio);
@@ -736,35 +811,87 @@
  * RECOVERING:	delay the io until recovery completes
  * NOSYNC:	increment pending, just write to the default mirror
  *---------------------------------------------------------------*/
+static void write_failure_handler(void *data)
+{
+	struct mirror_set *ms = data;
+	struct bio_list pending;
+	struct bio *failed;
+
+	/* Raise the table event here, from the work item, as it may block. */
+	dm_table_event(ms->ti->table);
+
+	/* Detach the whole queue under the lock; complete it unlocked. */
+	spin_lock(&ms->lock);
+	pending = ms->failures;
+	bio_list_init(&ms->failures);
+	spin_unlock(&ms->lock);
+
+	while ((failed = bio_list_pop(&pending)))
+		bio_endio(failed, failed->bi_size, 0);
+}
+
 static void write_callback(unsigned long error, void *context)
 {
-	unsigned int i;
-	int uptodate = 1;
+	unsigned int i, ret = 0;
 	struct bio *bio = (struct bio *) context;
 	struct mirror_set *ms;
-
+
 	ms = bio_get_ms(bio);
 	bio_set_ms(bio, NULL);
-
+
 	/*
 	 * NOTE: We don't decrement the pending count here,
 	 * instead it is done by the targets endio function.
 	 * This way we handle both writes to SYNC and NOSYNC
 	 * regions with the same code.
 	 */
+	if (unlikely(error)) {
+		int uptodate = 0, run;
+
+		DMERR("Error during write occurred.");

-	if (error) {
 		/*
-		 * only error the io if all mirrors failed.
-		 * FIXME: bogus
+		 * Test all bits - if all failed, fail io.
+		 * Otherwise, go through hassle of failing a device...
 		 */
-		uptodate = 0;
-		for (i = 0; i < ms->nr_mirrors; i++)
-			if (!test_bit(i, &error)) {
+		for (i = 0; i < ms->nr_mirrors; i++) {
+			if (test_bit(i, &error))
+				fail_mirror(ms->mirror + i);
+			else
 				uptodate = 1;
-				break;
+		}
+
+		if (likely(uptodate)) {
+			spin_lock(&ms->lock);
+			if (atomic_read(&ms->suspended)) {
+				/*
+				 * The device is suspended, it is
+				 * safe to complete I/O.
+				 */
+				spin_unlock(&ms->lock);
+			} else {
+				/*
+				 * Need to raise event.  Since raising
+				 * events can block, we need to do it in
+				 * seperate thread.
+				 */
+				run = !ms->failures.head;
+				bio_list_add(&ms->failures, bio);
+				spin_unlock(&ms->lock);
+			
+				if (run) {
+					queue_work(_mir_mond_wq,
+						   &ms->failure_work);
+				}
+				return;
 			}
+		} else {
+			DMERR("All replicated volumes dead, failing I/O");
+			/* None of the writes succeeded, fail the I/O. */
+			ret = -EIO;
+		}
 	}
+
 	bio_endio(bio, bio->bi_size, 0);
 }

@@ -843,7 +970,7 @@
 		rh_delay(&ms->rh, bio);

 	while ((bio = bio_list_pop(&nosync))) {
-		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
+		map_bio(ms, ms->default_mirror, bio);
 		generic_make_request(bio);
 	}
 }
@@ -905,11 +1032,15 @@

 	memset(ms, 0, len);
 	spin_lock_init(&ms->lock);
+	spin_lock_init(&ms->choose_lock);

 	ms->ti = ti;
 	ms->nr_mirrors = nr_mirrors;
 	ms->nr_regions = dm_sector_div_up(ti->len, region_size);
 	ms->in_sync = 0;
+	ms->default_mirror = &ms->mirror[DEFAULT_MIRROR];
+
+	atomic_set(&ms->suspended, 0);

 	if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
 		ti->error = "dm-mirror: Error creating dirty region hash";
@@ -917,6 +1048,11 @@
 		return NULL;
 	}

+	atomic_set(&ms->read_count, MIN_READS);
+
+	bio_list_init(&ms->failures);
+	INIT_WORK(&ms->failure_work, write_failure_handler, ms);
+	
 	return ms;
 }

@@ -954,6 +1090,8 @@
 	}

 	ms->mirror[mirror].offset = offset;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
+	ms->mirror[mirror].ms = ms;

 	return 0;
 }
@@ -1148,7 +1286,7 @@
 		return 0;
 	}

-	m = choose_mirror(ms, bio->bi_sector);
+	m = choose_mirror(ms, NULL);
 	if (!m)
 		return -EIO;

@@ -1172,6 +1310,13 @@
 	return 0;
 }

+static void mirror_presuspend(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti->private;
+
+	atomic_set(&ms->suspended, 1);	/* tested by write_callback() */
+}
+
 static void mirror_postsuspend(struct dm_target *ti)
 {
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
@@ -1191,6 +1336,7 @@
 		/* FIXME: need better error handling */
 		DMWARN("log resume failed");
 	rh_start_recovery(&ms->rh);
+	atomic_set(&ms->suspended, 0);
 }

 static int mirror_status(struct dm_target *ti, status_type_t type,
@@ -1233,6 +1379,7 @@
 	.dtr	 = mirror_dtr,
 	.map	 = mirror_map,
 	.end_io	 = mirror_end_io,
+	.presuspend = mirror_presuspend,
 	.postsuspend = mirror_postsuspend,
 	.resume	 = mirror_resume,
 	.status	 = mirror_status,
@@ -1250,16 +1397,25 @@
 	if (!_kmirrord_wq) {
 		DMERR("couldn't start kmirrord");
 		dm_dirty_log_exit();
-		return r;
+		return -ENOMEM;
 	}
 	INIT_WORK(&_kmirrord_work, do_work, NULL);

+	_mir_mond_wq = create_workqueue("mir_mond");
+	if (!_mir_mond_wq) {
+		DMERR("couldn't start mir_mond");
+		dm_dirty_log_exit();
+		destroy_workqueue(_kmirrord_wq);
+		return -ENOMEM;
+	}
+
 	r = dm_register_target(&mirror_target);
 	if (r < 0) {
 		DMERR("%s: Failed to register mirror target",
 		      mirror_target.name);
 		dm_dirty_log_exit();
 		destroy_workqueue(_kmirrord_wq);
+		destroy_workqueue(_mir_mond_wq);
 	}

 	return r;


[Index of Archives]     [DM Crypt]     [Fedora Desktop]     [ATA RAID]     [Fedora Marketing]     [Fedora Packaging]     [Fedora SELinux]     [Yosemite Discussion]     [KDE Users]     [Fedora Docs]

  Powered by Linux