Hi Neil,

Thanks for the comments. Hopefully we now have the complete fix. I am
posting what we have applied on top of 3.8.13 (we are now moving to
3.18.19, which already has part of the fix, so I will need to apply a
delta until both of your latest patches reach Mr. Stable). Taking the
spinlock in the "error" function is not there yet (but I will apply it
to 3.18.19).

The patch below is a bit ugly:
- CONFIG_MD_ZADARA is a define that we add so that every engineer can
  clearly distinguish our changes from the original code. I know it's
  ugly.
- zklog is a macro that eventually ends up in printk. It only adds a
  few more prints and is not functionally needed. zklog also prints
  current->pid, the function name etc., to give more context.

Hopefully, gmail will not make the patch even uglier.

Thanks for your help!
Alex.
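In case it helps reading the patch: zklog/zklog_rl boil down to roughly
the following (a simplified sketch, not our exact code; the real macros
also map the Z_KERR/Z_KWARN level to the matching printk level):

/* Illustrative sketch only -- not the real Zadara macros. */
#define zklog(md, level, fmt, ...)					\
	printk(KERN_ERR "%s: [%d] %s:%d: " fmt "\n",			\
	       (md), current->pid, __func__, __LINE__, ##__VA_ARGS__)

#define zklog_rl(md, level, fmt, ...)					\
	do {								\
		static DEFINE_RATELIMIT_STATE(_zrs,			\
			DEFAULT_RATELIMIT_INTERVAL,			\
			DEFAULT_RATELIMIT_BURST);			\
		if (__ratelimit(&_zrs))					\
			zklog(md, level, fmt, ##__VA_ARGS__);		\
	} while (0)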
diff --git a/md/3.8.13-030813-generic/drivers/md/raid1.c b/md/3.8.13-030813-generic/drivers/md/raid1.c
index c864a6e..32170e9 100644
--- a/md/3.8.13-030813-generic/drivers/md/raid1.c
+++ b/md/3.8.13-030813-generic/drivers/md/raid1.c
@@ -322,24 +322,36 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	if (uptodate)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
 	else {
 		/* If all other devices have failed, we want to return
 		 * the error upwards rather than fail the last device.
 		 * Here we redefine "uptodate" to mean "Don't want to retry"
 		 */
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
+#ifndef CONFIG_MD_ZADARA
 		if (r1_bio->mddev->degraded == conf->raid_disks ||
 		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
 		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
 			uptodate = 1;
+#else/*CONFIG_MD_ZADARA*/
+		/* Make sure we retry read error from a recovering device */
+		if (r1_bio->mddev->degraded == conf->raid_disks ||
+		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+		     test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) {
+			char b[BDEVNAME_SIZE];
+			zklog_rl(mdname(conf->mddev), Z_KERR, "READ rdev=%s[%llu:%u] ERROR - not retrying (last good drive)",
+				bdevname(conf->mirrors[mirror].rdev->bdev, b), (unsigned long long)r1_bio->sector, r1_bio->sectors);
+			uptodate = 1;
+		}
+#endif/*CONFIG_MD_ZADARA*/
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}

 	if (uptodate) {
 		raid_end_bio_io(r1_bio);
 		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error:
 		 */
@@ -1418,20 +1430,31 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 	rcu_read_unlock();
 	seq_printf(seq, "]");
 }

 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];
 	struct r1conf *conf = mddev->private;

+#ifdef CONFIG_MD_ZADARA
+	{
+		static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+		if (__ratelimit(&_rs)) {
+			zklog(mdname(mddev), Z_KERR, "rdev=%s failure InSync=%u Failed=%u STACK:",
+				bdevname(rdev->bdev, b), test_bit(In_sync, &rdev->flags), test_bit(Faulty, &rdev->flags));
+			dump_stack();
+		}
+	}
+#endif/*CONFIG_MD_ZADARA*/
+
 	/*
 	 * If it is not operational, then we have already marked it as dead
 	 * else if it is the last working disks, ignore the error, let the
 	 * next level up know.
 	 * else mark the drive as failed
 	 */
 	if (test_bit(In_sync, &rdev->flags)
 	    && (conf->raid_disks - mddev->degraded) == 1) {
 		/*
 		 * Don't fail the drive, act as though we were just a
@@ -1497,20 +1520,25 @@ static void close_sync(struct r1conf *conf)
 	conf->r1buf_pool = NULL;
 }

 static int raid1_spare_active(struct mddev *mddev)
 {
 	int i;
 	struct r1conf *conf = mddev->private;
 	int count = 0;
 	unsigned long flags;

+#ifdef CONFIG_MD_ZADARA
+	/* we lock the whole function */
+	spin_lock_irqsave(&conf->device_lock, flags);
+#endif /*CONFIG_MD_ZADARA*/
+
 	/*
 	 * Find all failed disks within the RAID1 configuration
 	 * and mark them readable.
 	 * Called under mddev lock, so rcu protection not needed.
 	 */
 	for (i = 0; i < conf->raid_disks; i++) {
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
 		if (repl && repl->recovery_offset == MaxSector
@@ -1530,21 +1558,24 @@ static int raid1_spare_active(struct mddev *mddev)
 					rdev->sysfs_state);
 			}
 		}
 		if (rdev
 		    && !test_bit(Faulty, &rdev->flags)
 		    && !test_and_set_bit(In_sync, &rdev->flags)) {
 			count++;
 			sysfs_notify_dirent_safe(rdev->sysfs_state);
 		}
 	}
+#ifndef CONFIG_MD_ZADARA
+	/* we lock the whole function */
 	spin_lock_irqsave(&conf->device_lock, flags);
+#endif /*CONFIG_MD_ZADARA*/
 	mddev->degraded -= count;
 	spin_unlock_irqrestore(&conf->device_lock, flags);

 	print_conf(conf);
 	return count;
 }

 static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
@@ -2027,20 +2058,27 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
  *
  *	1.	Retries failed read operations on working mirrors.
  *	2.	Updates the raid superblock when problems encounter.
  *	3.	Performs writes following reads for array synchronising.
  */

 static void fix_read_error(struct r1conf *conf, int read_disk,
 			   sector_t sect, int sectors)
 {
 	struct mddev *mddev = conf->mddev;
+#ifdef CONFIG_MD_ZADARA
+	{
+		char b[BDEVNAME_SIZE] = {'\0'};
+		zklog_rl(mdname(mddev), Z_KWARN, "attempting to fix READ rdev=%s[%llu:%u]",
+			bdevname(conf->mirrors[read_disk].rdev->bdev, b), (unsigned long long)sect, sectors);
+	}
+#endif /*CONFIG_MD_ZADARA*/
 	while(sectors) {
 		int s = sectors;
 		int d = read_disk;
 		int success = 0;
 		int start;
 		struct md_rdev *rdev;

 		if (s > (PAGE_SIZE>>9))
 			s = PAGE_SIZE >> 9;
@@ -2078,33 +2116,41 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			break;
 		}
 		/* write it back and re-read */
 		start = d;
 		while (d != read_disk) {
 			if (d==0)
 				d = conf->raid_disks * 2;
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
+#ifndef CONFIG_MD_ZADARA
 			    test_bit(In_sync, &rdev->flags))
+#else /*CONFIG_MD_ZADARA*/
+			    !test_bit(Faulty, &rdev->flags))
+#endif /*CONFIG_MD_ZADARA*/
 				r1_sync_page_io(rdev, sect, s,
 						conf->tmppage, WRITE);
 		}
 		d = start;
 		while (d != read_disk) {
 			char b[BDEVNAME_SIZE];
 			if (d==0)
 				d = conf->raid_disks * 2;
 			d--;
 			rdev = conf->mirrors[d].rdev;
 			if (rdev &&
+#ifndef CONFIG_MD_ZADARA
 			    test_bit(In_sync, &rdev->flags)) {
+#else /*CONFIG_MD_ZADARA*/
+			    !test_bit(Faulty, &rdev->flags)) {
+#endif /*CONFIG_MD_ZADARA*/
 				if (r1_sync_page_io(rdev, sect, s,
 						    conf->tmppage, READ)) {
 					atomic_add(s, &rdev->corrected_errors);
 					printk(KERN_INFO
 					       "md/raid1:%s: read error corrected "
 					       "(%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
 					       (unsigned long long)(sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));

On Mon, Jul 27, 2015 at 3:56 AM, NeilBrown <neilb@xxxxxxxx> wrote:
> On Sun, 26 Jul 2015 10:15:05 +0200 Alexander Lyakas
> <alex.bolshoy@xxxxxxxxx> wrote:
>
>> Hi Neil,
>>
>> On Fri, Jul 24, 2015 at 1:24 AM, NeilBrown <neilb@xxxxxxxx> wrote:
>> > Hi Alex
>> >  thanks for noticing!
>> > Just to be sure we mean the same thing: this is the patch which is
>> > missing - correct?
>> >
>> > Thanks,
>> > NeilBrown
>> >
>> > From: NeilBrown <neilb@xxxxxxxx>
>> > Date: Fri, 24 Jul 2015 09:22:16 +1000
>> > Subject: [PATCH] md/raid1: fix test for 'was read error from last working
>> >  device'.
>> >
>> > When we get a read error from the last working device, we don't
>> > try to repair it, and don't fail the device. We simply report a
>> > read error to the caller.
>> >
>> > However the current test for 'is this the last working device' is
>> > wrong.
>> > When there is only one fully working device, it assumes that a
>> > non-faulty device is that device. However a spare which is rebuilding
>> > would be non-faulty but still not the only working device.
>> >
>> > So change the test from "!Faulty" to "In_sync". If ->degraded says
>> > there is only one fully working device and this device is in_sync,
>> > this must be the one.
>> >
>> > This bug has existed since we allowed read_balance to read from
>> > a recovering spare in v3.0.
>> >
>> > Reported-and-tested-by: Alexander Lyakas <alex.bolshoy@xxxxxxxxx>
>> > Fixes: 76073054c95b ("md/raid1: clean up read_balance.")
>> > Cc: stable@xxxxxxxxxxxxxxx (v3.0+)
>> > Signed-off-by: NeilBrown <neilb@xxxxxxxx>
>> >
>> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> > index 166616411215..b368307a9651 100644
>> > --- a/drivers/md/raid1.c
>> > +++ b/drivers/md/raid1.c
>> > @@ -336,7 +336,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
>> >  		spin_lock_irqsave(&conf->device_lock, flags);
>> >  		if (r1_bio->mddev->degraded == conf->raid_disks ||
>> >  		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
>> > -		     !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
>> > +		     test_bit(In_sync, &conf->mirrors[mirror].rdev->flags)))
>> >  			uptodate = 1;
>> >  		spin_unlock_irqrestore(&conf->device_lock, flags);
>> >  	}
>>
>> Yes, this was the missing piece.
>>
>> But you also advised to put the whole of "raid1_spare_active" under
>> the spinlock:
>> > 2/ extend the spinlock in raid1_spare_active to cover the whole function.
>> So we did this bit too. Can you pls comment if this is needed?
>
> When you say "we did this bit" it would help a lot to actually show the
> patch - helps avoid misunderstanding.  In fact I was probably hoping
> that you would post a patch once you had it fixed.... no matter.
> See below for the patch I have just queued.  There is an extra place
> where a spinlock is probably helpful.
>
>>
>> Also, will you be sending this patch to "stable"? We are moving to
>> kernel 3.18, so we should get this then.
>
> Putting "cc: stable@xxxxxxxxxxxxxxx" means that it will automatically
> be migrated to the stable kernels once it has appeared in Linus' kernel.
> So yes: it will go to -stable.
>
>>
>> Thanks!
>> Alex.
>
> Thanks,
> NeilBrown
>
> From 04f58ef505b9d354dd06c477a94a7e314a38cb72 Mon Sep 17 00:00:00 2001
> From: NeilBrown <neilb@xxxxxxxx>
> Date: Mon, 27 Jul 2015 11:48:52 +1000
> Subject: [PATCH] md/raid1: extend spinlock to protect raid1_end_read_request
>  against inconsistencies
>
> raid1_end_read_request() assumes that the In_sync bits are consistent
> with the ->degraded count.
> raid1_spare_active updates the In_sync bit before the ->degraded count
> and so exposes an inconsistency, as does error().
> So extend the spinlock in raid1_spare_active() and error() to hide those
> inconsistencies.
>
> This should probably be part of
> Commit: 34cab6f42003 ("md/raid1: fix test for 'was read error from
>  last working device'.")
> as it addresses the same issue.  It fixes the same bug and should go
> to -stable for the same reasons.
>
> Fixes: 76073054c95b ("md/raid1: clean up read_balance.")
> Cc: stable@xxxxxxxxxxxxxxx (v3.0+)
> Signed-off-by: NeilBrown <neilb@xxxxxxxx>
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index b368307a9651..742b50794dfd 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1476,6 +1476,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
>  {
>  	char b[BDEVNAME_SIZE];
>  	struct r1conf *conf = mddev->private;
> +	unsigned long flags;
>
>  	/*
>  	 * If it is not operational, then we have already marked it as dead
> @@ -1495,14 +1496,13 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
>  		return;
>  	}
>  	set_bit(Blocked, &rdev->flags);
> +	spin_lock_irqsave(&conf->device_lock, flags);
>  	if (test_and_clear_bit(In_sync, &rdev->flags)) {
> -		unsigned long flags;
> -		spin_lock_irqsave(&conf->device_lock, flags);
>  		mddev->degraded++;
>  		set_bit(Faulty, &rdev->flags);
> -		spin_unlock_irqrestore(&conf->device_lock, flags);
>  	} else
>  		set_bit(Faulty, &rdev->flags);
> +	spin_unlock_irqrestore(&conf->device_lock, flags);
>  	/*
>  	 * if recovery is running, make sure it aborts.
>  	 */
> @@ -1568,7 +1568,10 @@ static int raid1_spare_active(struct mddev *mddev)
>  	 * Find all failed disks within the RAID1 configuration
>  	 * and mark them readable.
>  	 * Called under mddev lock, so rcu protection not needed.
> +	 * device_lock used to avoid races with raid1_end_read_request
> +	 * which expects 'In_sync' flags and ->degraded to be consistent.
>  	 */
> +	spin_lock_irqsave(&conf->device_lock, flags);
>  	for (i = 0; i < conf->raid_disks; i++) {
>  		struct md_rdev *rdev = conf->mirrors[i].rdev;
>  		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
> @@ -1599,7 +1602,6 @@ static int raid1_spare_active(struct mddev *mddev)
>  			sysfs_notify_dirent_safe(rdev->sysfs_state);
>  		}
>  	}
> -	spin_lock_irqsave(&conf->device_lock, flags);
>  	mddev->degraded -= count;
>  	spin_unlock_irqrestore(&conf->device_lock, flags);
>
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html