On Friday October 16, Rob.Becker@xxxxxxxxxxxx wrote: > We've noticed severe lasting performance degradation of our raid > arrays when we have drives that yield large amounts of media errors. > The raid10 module will queue each failed read for retry, and also > will attempt to call fix_read_error() to perform the read recovery. > Read recovery is performed while the array is frozen, so repeated > recovery attempts can degrade the performance of the array for > extended periods of time. > > With this patch I propose adding a per md device max number of > corrected read attempts. Each rdev will maintain a count of > read correction attempts in the rdev->read_errors field (not > used currently for raid10). When we enter fix_read_error() > we'll check to see when the last read error occurred, and > divide the read error count by 2 for every hour since the > last read error. If at that point our read error count > exceeds the read error threshold, we'll fail the raid device. > > In addition in this patch I add sysfs nodes (get/set) for > the per md max_read_errors attribute, the rdev->read_errors > attribute, and added some printk's to indicate when > fix_read_error fails to repair an rdev. Thanks for this. I've split the "added some printk's" out into a separate patch as it really is an independent change. I've also removed the code that provides sysfs access to rdev->read_errors because I'm not convinced that it is needed (certainly not setting it) and the name isn't really an accurate description of the attribute (maybe 'recent_read_errors' .... or something). The rest I have added to my queue for 2.6.33. You can see this in my 'md-scratch' branch at git://neil.brown.name/md, or at http://neil.brown.name/git?p=md;a=shortlog;h=refs/heads/md-scratch I'm happy to listen to arguments for 'rdev->read_errors' to be exported via sysfs, with a more suitable name. 
Thanks, NeilBrown > > For testing I used debugfs->fail_make_request to inject > IO errors to the rdev while doing IO to the raid array. > > Signed-off-by: Robert Becker <Rob.Becker@xxxxxxxxxxxx> > --- > drivers/md/md.c | 57 +++++++++++++++++++++++++++ > drivers/md/md.h | 5 ++- > drivers/md/raid10.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 164 insertions(+), 4 deletions(-) > > diff --git a/drivers/md/md.c b/drivers/md/md.c > index 641b211..1a837fa 100644 > --- a/drivers/md/md.c > +++ b/drivers/md/md.c > @@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); > #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } > > /* > + * Default number of read corrections we'll attempt on an rdev > + * before ejecting it from the array. We divide the read error > + * count by 2 for every hour elapsed between read errors. > + */ > +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 > +/* > * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' > * is 1000 KB/sec, so the extra system load does not show up that much. > * Increase it if you want to have more _guaranteed_ speed. 
Note that > @@ -2156,6 +2162,28 @@ static struct rdev_sysfs_entry rdev_errors = > __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); > > static ssize_t > +read_errors_show(mdk_rdev_t *rdev, char *page) > +{ > + return sprintf(page, "%d\n", atomic_read(&rdev->read_errors)); > +} > + > +static ssize_t > +read_errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) > +{ > + char *e; > + unsigned long n = simple_strtoul(buf, &e, 10); > + > + if (*buf && (*e == 0 || *e == '\n')) { > + atomic_set(&rdev->read_errors, n); > + return len; > + } > + return -EINVAL; > +} > + > +static struct rdev_sysfs_entry rdev_read_errors = > +__ATTR(read_errors, S_IRUGO, read_errors_show, read_errors_store); > + > +static ssize_t > slot_show(mdk_rdev_t *rdev, char *page) > { > if (rdev->raid_disk < 0) > @@ -2388,6 +2416,7 @@ static struct attribute *rdev_default_attrs[] = { > &rdev_slot.attr, > &rdev_offset.attr, > &rdev_size.attr, > + &rdev_read_errors.attr, > NULL, > }; > static ssize_t > @@ -2489,6 +2518,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi > rdev->flags = 0; > rdev->data_offset = 0; > rdev->sb_events = 0; > + rdev->last_read_error.tv_sec = 0; > + rdev->last_read_error.tv_nsec = 0; > atomic_set(&rdev->nr_pending, 0); > atomic_set(&rdev->read_errors, 0); > atomic_set(&rdev->corrected_errors, 0); > @@ -3101,6 +3132,29 @@ static struct md_sysfs_entry md_array_state = > __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); > > static ssize_t > +max_corrected_read_errors_show(mddev_t *mddev, char *page) { > + return sprintf(page, "%d\n", > + atomic_read(&mddev->max_corr_read_errors)); > +} > + > +static ssize_t > +max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) > +{ > + char *e; > + unsigned long n = simple_strtoul(buf, &e, 10); > + > + if (*buf && (*e == 0 || *e == '\n')) { > + atomic_set(&mddev->max_corr_read_errors, n); > + return len; > + } > + return -EINVAL; > +} > + > 
+static struct md_sysfs_entry max_corr_read_errors = > +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, > + max_corrected_read_errors_store); > + > +static ssize_t > null_show(mddev_t *mddev, char *page) > { > return -EINVAL; > @@ -3729,6 +3783,7 @@ static struct attribute *md_default_attrs[] = { > &md_array_state.attr, > &md_reshape_position.attr, > &md_array_size.attr, > + &max_corr_read_errors.attr, > NULL, > }; > > @@ -4183,6 +4238,8 @@ static int do_md_run(mddev_t * mddev) > mddev->ro = 0; > > atomic_set(&mddev->writes_pending,0); > + atomic_set(&mddev->max_corr_read_errors, > + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); > mddev->safemode = 0; > mddev->safemode_timer.function = md_safemode_timeout; > mddev->safemode_timer.data = (unsigned long) mddev; > diff --git a/drivers/md/md.h b/drivers/md/md.h > index 8227ab9..9dcfc67 100644 > --- a/drivers/md/md.h > +++ b/drivers/md/md.h > @@ -104,6 +104,9 @@ struct mdk_rdev_s > atomic_t read_errors; /* number of consecutive read errors that > * we have tried to ignore. > */ > + struct timespec last_read_error; /* monotonic time since our > + * last read error > + */ > atomic_t corrected_errors; /* number of corrected read errors, > * for reporting to userspace and storing > * in superblock. > @@ -285,7 +288,7 @@ struct mddev_s > * hot-adding a bitmap. It should > * eventually be settable by sysfs. > */ > - > + atomic_t max_corr_read_errors; /* max read retries */ > struct list_head all_mddevs; > }; > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c > index 499620a..791edb8 100644 > --- a/drivers/md/raid10.c > +++ b/drivers/md/raid10.c > @@ -1427,6 +1427,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) > > > /* > + * Used by fix_read_error() to decay the per rdev read_errors. > + * We halve the read error count for every hour that has elapsed > + * since the last recorded read error. 
> + * > + */ > +static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) > +{ > + struct timespec cur_time_mon; > + unsigned long hours_since_last; > + unsigned int read_errors = atomic_read(&rdev->read_errors); > + > + ktime_get_ts(&cur_time_mon); > + > + if (rdev->last_read_error.tv_sec == 0 && > + rdev->last_read_error.tv_nsec == 0) { > + /* first time we've seen a read error */ > + rdev->last_read_error = cur_time_mon; > + return; > + } > + > + hours_since_last = (cur_time_mon.tv_sec - > + rdev->last_read_error.tv_sec) / 3600; > + > + rdev->last_read_error = cur_time_mon; > + > + /* > + * if hours_since_last is > the number of bits in read_errors > + * just set read errors to 0. We do this to avoid > + * overflowing the shift of read_errors by hours_since_last. > + */ > + if (hours_since_last >= 8 * sizeof(read_errors)) > + atomic_set(&rdev->read_errors, 0); > + else > + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); > +} > + > +/* > * This is a kernel thread which: > * > * 1. Retries failed read operations on working mirrors. 
> @@ -1439,6 +1476,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) > int sect = 0; /* Offset from r10_bio->sector */ > int sectors = r10_bio->sectors; > mdk_rdev_t*rdev; > + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); > + > + rcu_read_lock(); > + { > + int d = r10_bio->devs[r10_bio->read_slot].devnum; > + char b[BDEVNAME_SIZE]; > + int cur_read_error_count = 0; > + > + rdev = rcu_dereference(conf->mirrors[d].rdev); > + bdevname(rdev->bdev, b); > + > + if (test_bit(Faulty, &rdev->flags)) { > + rcu_read_unlock(); > + /* drive has already been failed, just ignore any > + more fix_read_error() attempts */ > + return; > + } > + > + check_decay_read_errors(mddev, rdev); > + atomic_inc(&rdev->read_errors); > + cur_read_error_count = atomic_read(&rdev->read_errors); > + if (cur_read_error_count > max_read_errors) { > + rcu_read_unlock(); > + printk(KERN_NOTICE > + "raid10: %s: Raid device exceeded " > + "read_error threshold " > + "[cur %d:max %d]\n", > + b, cur_read_error_count, max_read_errors); > + printk(KERN_NOTICE > + "raid10: %s: Failing raid " > + "device\n", b); > + md_error(mddev, conf->mirrors[d].rdev); > + return; > + } > + } > + rcu_read_unlock(); > + > while(sectors) { > int s = sectors; > int sl = r10_bio->read_slot; > @@ -1483,6 +1557,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) > /* write it back and re-read */ > rcu_read_lock(); > while (sl != r10_bio->read_slot) { > + char b[BDEVNAME_SIZE]; > int d; > if (sl==0) > sl = conf->copies; > @@ -1498,9 +1573,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) > r10_bio->devs[sl].addr + > sect + rdev->data_offset, > s<<9, conf->tmppage, WRITE) > - == 0) > + == 0) { > /* Well, this device is dead */ > + printk(KERN_NOTICE > + "raid10:%s: read correction " > + "write failed" > + " (%d sectors at %llu on %s)\n", > + mdname(mddev), s, > + (unsigned long long)(sect+ > + rdev->data_offset), > + 
bdevname(rdev->bdev, b)); > + printk(KERN_NOTICE "raid10:%s: failing " > + "drive\n", > + bdevname(rdev->bdev, b)); > md_error(mddev, rdev); > + } > rdev_dec_pending(rdev, mddev); > rcu_read_lock(); > } > @@ -1521,10 +1608,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) > if (sync_page_io(rdev->bdev, > r10_bio->devs[sl].addr + > sect + rdev->data_offset, > - s<<9, conf->tmppage, READ) == 0) > + s<<9, conf->tmppage, > + READ) == 0) { > /* Well, this device is dead */ > + printk(KERN_NOTICE > + "raid10:%s: unable to read back " > + "corrected sectors" > + " (%d sectors at %llu on %s)\n", > + mdname(mddev), s, > + (unsigned long long)(sect+ > + rdev->data_offset), > + bdevname(rdev->bdev, b)); > + printk(KERN_NOTICE "raid10:%s: failing drive\n", > + bdevname(rdev->bdev, b)); > + > md_error(mddev, rdev); > - else > + } else { > printk(KERN_INFO > "raid10:%s: read error corrected" > " (%d sectors at %llu on %s)\n", > @@ -1532,6 +1631,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) > (unsigned long long)(sect+ > rdev->data_offset), > bdevname(rdev->bdev, b)); > + } > > rdev_dec_pending(rdev, mddev); > rcu_read_lock(); > -- > 1.5.6.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-raid" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html