[PATCH 5/5] Fix adding of new disk with new reload code

rgoldwyn@xxxxxxx · Mon, 5 Oct 2015 10:09:54 -0500

From: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>

Adding a disk did not work correctly with the new reload code. Fix it:

 - No operation should be performed on an rdev marked as Candidate.
 - After a metadata update operation, kick the disk if its role is
   0xfffe; otherwise clear the Candidate bit and continue with the
   regular change check.
 - On the initiating node, cancel the metadata_update (i.e. unlock the
   token) if an error occurs while adding the disk.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
---
 drivers/md/md.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 75d14e3..e324544 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3246,14 +3246,6 @@ static void analyze_sbs(struct mddev *mddev)
 				md_kick_rdev_from_array(rdev);
 				continue;
 			}
-			/* No device should have a Candidate flag
-			 * when reading devices
-			 */
-			if (test_bit(Candidate, &rdev->flags)) {
-				pr_info("md: kicking Cluster Candidate %s from array!\n",
-					bdevname(rdev->bdev, b));
-				md_kick_rdev_from_array(rdev);
-			}
 		}
 		if (mddev->level == LEVEL_MULTIPATH) {
 			rdev->desc_nr = i++;
@@ -5962,7 +5954,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 				/* --add initiated by this node */
 				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
 				if (err) {
-					md_cluster_ops->add_new_disk_finish(mddev);
+					md_cluster_ops->metadata_update_cancel(mddev);
 					export_rdev(rdev);
 					return err;
 				}
@@ -5973,11 +5965,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err)
 			export_rdev(rdev);
-		else
+		else if (!(info->state & (1 << MD_DISK_CANDIDATE)))
 			err = add_bound_rdev(rdev);
-		if (mddev_is_clustered(mddev) &&
+		if (err && mddev_is_clustered(mddev) &&
 				(info->state & (1 << MD_DISK_CLUSTER_ADD)))
-			md_cluster_ops->add_new_disk_finish(mddev);
+			md_cluster_ops->metadata_update_cancel(mddev);
 		return err;
 	}
 
@@ -8038,6 +8030,8 @@ static int remove_and_add_spares(struct mddev *mddev,
 	rdev_for_each(rdev, mddev) {
 		if (this && this != rdev)
 			continue;
+		if (test_bit(Candidate, &rdev->flags))
+			continue;
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags))
@@ -8955,6 +8949,19 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 
 		/* Check if the roles changed */
 		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
+
+		if (test_bit(Candidate, &rdev2->flags)) {
+			if (role == 0xfffe) {
+				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
+				md_kick_rdev_from_array(rdev2);
+				continue;
+			}
+			else {
+				clear_bit(Candidate, &rdev2->flags);
+				rdev2->raid_disk = -1;
+			}
+		}
+
 		if (role != rdev2->raid_disk) {
 			/* got activated */
 			if (rdev2->raid_disk == -1 && role != 0xffff) {
-- 
1.8.5.6

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html