From: Czarnowska, Anna Sent: Monday, July 05, 2010 11:38 AM To: Neil Brown Cc: linux-raid@xxxxxxxxxxxxxxx; Czarnowska, Anna; Hawrylewicz Czarnowski, Przemyslaw; Labun, Marcin; Neubauer, Wojciech; Williams, Dan J; Ciechanowski, Ed; dledford@xxxxxxxxxx Subject: [PATCH 23/33] Monitor: Spare sharing with domain/subset support Works for both native and external metadata. Moves spares between arrays/containers with matching domain and subset to use for rebuild. Signed-off-by: Anna Czarnowska <anna.czarnowska@xxxxxxxxx> --- Monitor.c | 209 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 202 insertions(+), 7 deletions(-) diff --git a/Monitor.c b/Monitor.c index ed57af0..11f3758 100644 --- a/Monitor.c +++ b/Monitor.c @@ -49,6 +49,7 @@ struct state { int devstate[MaxDisks]; int devid[MaxDisks]; int percent; + unsigned long long min_size; char *metadata_version; struct state *next; struct state *volumes; @@ -56,6 +57,9 @@ struct state { struct state *missing; }; +static void spare_sharing(struct state *statelist, char *mailaddr, + char *mailfrom, char *alert_cmd, int dosyslog); + int Monitor(mddev_dev_t devlist, char *mailaddr, char *alert_cmd, int period, int daemonise, int scan, int oneshot, @@ -92,11 +96,10 @@ int Monitor(mddev_dev_t devlist, * DeviceDisappeared * Couldn't access a device which was previously visible * - * if we detect an array with active<raid and spare==0 - * we look at other arrays that have same spare-group - * If we find one with active==raid and spare>0, - * and if we can get_disk_info and find a name - * Then we hot-remove and hot-add to the other array + * If we detect an array with active<raid and spare==0 + * we look at other arrays that have a spare + * and are in the same domain and subset + * Then we hot-remove and hot-add to the other array * * If devlist is NULL, then we can monitor everything because --scan * was given. 
We get an initial list from config file and add anything @@ -185,6 +188,9 @@ int Monitor(mddev_dev_t devlist, st->parent = NULL; st->volumes = NULL; st->total = 0; + st->min_size = 0; + memset(st->devid, 0, MaxDisks*sizeof(int)); + memset(st->devstate, 0, MaxDisks*sizeof(int)); if (mdlist->spare_group) st->spare_group = strdup(mdlist->spare_group); else @@ -211,6 +217,9 @@ int Monitor(mddev_dev_t devlist, st->parent = NULL; st->volumes = NULL; st->total = 0; + st->min_size = 0; + memset(st->devid, 0, MaxDisks*sizeof(int)); + memset(st->devstate, 0, MaxDisks*sizeof(int)); if (mdlist) { st->expected_spares = mdlist->spare_disks; if (mdlist->spare_group) @@ -371,6 +380,7 @@ int Monitor(mddev_dev_t devlist, int newstate=0; int change; char *dv = NULL; + unsigned long long dsize; disc.number = i; if (i > array.raid_disks + array.nr_disks) { newstate = 0; @@ -415,6 +425,17 @@ int Monitor(mddev_dev_t devlist, } st->devstate[i] = newstate; st->devid[i] = makedev(disc.major, disc.minor); + + /* for volumes only we get minimum disk size + * (only active disks) */ + fd = open(dv, O_RDONLY); + if (dv && newstate & (1<<MD_DISK_ACTIVE) && + array.raid_disks && fd >= 0 && + get_dev_size(fd, dv, &dsize) && + (st->min_size == 0 || dsize < st->min_size)) + st->min_size = dsize; + if (fd >= 0) + close(fd); } st->active = array.active_disks; st->working = array.working_disks; @@ -506,6 +527,7 @@ int Monitor(mddev_dev_t devlist, new_found = 1; } } + spare_sharing(statelist, mailaddr, mailfrom, alert_cmd, dosyslog); /* If an array has active < raid && spare == 0 && spare_group != NULL * Look for another array with spare > 0 and active == raid and same spare_group * if found, choose a device and hotremove/hotadd @@ -577,8 +599,63 @@ int Monitor(mddev_dev_t devlist, return 0; } +static int move_spare(struct state *st2, struct state *st1, int i, char *mailaddr, + char *mailfrom, char *alert_cmd, int dosyslog) + +{ + struct mddev_dev_s devlist; + char devname[20]; + int from_fd, to_fd; + + 
if (!st1 || !st2 || st2->devid[i] == 0) + return 0; + + from_fd = open(st2->devname, O_RDONLY); + if (from_fd < 0) + return 0; + + to_fd = open(st1->devname, O_RDONLY); + if (to_fd < 0) { + close(from_fd); + return 0; + } + + devlist.next = NULL; + devlist.used = 0; + devlist.re_add = 0; + devlist.writemostly = 0; + devlist.devname = devname; + char *dv = map_dev(major(st2->devid[i]), minor(st2->devid[i]), 1); + if (!dv) { + close(from_fd); + close(to_fd); + return 0; + } + snprintf(devname, 20, "%s", dv); + + devlist.disposition = 'r'; + if (Manage_subdevs(st2->devname, from_fd, &devlist, -1) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(st1->devname, to_fd, &devlist, -1) == 0) { + st2->devid[i] = 0; + ping_manager(st2->devname); + ping_manager(st1->devname); + alert("MoveSpare", st1->devname, st2->devname, mailaddr, + mailfrom, alert_cmd, dosyslog); + close(from_fd); + close(to_fd); + return 1; + } else { + Manage_subdevs(st2->devname, from_fd, &devlist, -1); + } + } + close(from_fd); + close(to_fd); + return 0; +} + /* check if disk is used in donor array (native) or any volume in donor container (external)*/ -int check_disk_is_free(struct state *donor, int disk_idx, int ext) +static int check_disk_is_free(struct state *donor, int disk_idx, int +ext) { struct state *vol = NULL; int vol_disk; @@ -601,7 +678,7 @@ int check_disk_is_free(struct state *donor, int disk_idx, int ext) return disk_idx; } -int get_disk_domain_and_subset(int devid, char *metadata_version, struct domain_ent **domain, +static int get_disk_domain_and_subset(int devid, char +*metadata_version, struct domain_ent **domain, struct subset **subset, int fcheck) { struct supertype *sty; @@ -650,6 +727,124 @@ fail: return 0; } +/* If an array has active < raid && spare == 0 + * Look for another array/container with unused, unfailed spare + * and the same domain and subset + * if found, hotremove/hotadd the spare (to parent container in +external) */ static void spare_sharing(struct state 
*statelist, char +*mailaddr, + char *mailfrom, char *alert_cmd, int dosyslog) { + struct state *st, *stp, *vol, *st2 = NULL; + struct domain_ent *domain, *spare_domain; + struct subset *subset, *spare_subset; + int i, ext, found; + unsigned long long ssize; + + for (st = statelist; st; st = st->next) { + if (st->err || st->active == st->raid || st->spare > 0) + continue; + + found = 0; + ext = is_external(st->metadata_version); + + /* + * for external metadata spare will be moved to parent container + */ + if (ext) { + stp = st->parent; + if (!stp) + continue; + } else { + stp = st; + } + /* check if there is a spare in this array/parent container */ + for (i = 0; i < stp->total; i++) + if ((check_disk_is_free(stp, i, ext) == i) && + (!ext || !disk_faulty_from_id(stp->devid[i]))) + break; + if (i < stp->total) + /* there is a spare in array/parent container, + * if it is big enough it was probably just added + * but mdmon has not started recovery yet + * we will not add any more spares for now */ + if (dev_size_from_id(stp->devid[i], &ssize) && + ssize >= st->min_size) + continue; + + /* get any good disk from array to check domain and subset */ + for (i = 0; i < st->total; i++) + if (st->devid[i] > 0 && + (st->devstate[i] & 1<<MD_DISK_ACTIVE)) + break; + + if (i == st->total) + continue; + + if (!get_disk_domain_and_subset(st->devid[i], + stp->metadata_version, &domain, &subset, 0)) + continue; + + /* search for an array/container with unused spare */ + for (st2 = statelist; st2; st2 = st2->next) { + if (st2->err || st2 == stp) + continue; + + if ((ext && st2->parent != NULL) || + (strcmp(stp->metadata_version, st2->metadata_version) + != 0)) + continue; + + if (ext) { + /* if container has degraded volume + * we can't remove spares */ + for (vol = st2->volumes; vol; vol = vol->volumes) + if (vol->active < vol->raid) + break; + if (vol) + continue; + } else { + if (st2->active < st2->raid) + continue; + } + + for (i = 0; i < st2->total; i++) { /* find a spare */ + 
if (check_disk_is_free(st2, i, ext) == INT_MAX) + continue; + + if (!get_disk_domain_and_subset(st2->devid[i], + st2->metadata_version, + &spare_domain, &spare_subset, ext)) + continue; + + /* check if domain and subset + * are the same as for st */ + if ((domain != spare_domain) || + (subset != spare_subset)) { + /* no point looking + * in that array/container */ + break; + } else { + if (!dev_size_from_id(st2->devid[i], &ssize) || + ssize < st->min_size) + continue; + + if (move_spare(st2, stp, i, mailaddr, + mailfrom, alert_cmd, + dosyslog)) { + found = 1; + /* stop searching disks */ + break; + } + } + } + if (found) + break; /* stop searching arrays */ + } + } + return; +} + static void alert(char *event, char *dev, char *disc, char *mailaddr, char *mailfrom, char *cmd, int dosyslog) { -- 1.6.4.2 -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html