This is a note to let you know that I've just added the patch titled md/raid5: ensure sync and DISCARD don't happen at the same time. to the 3.8-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: md-raid5-ensure-sync-and-discard-don-t-happen-at-the-same-time.patch and it can be found in the queue-3.8 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. >From f8dfcffd0472a0f353f34a567ad3f53568914d04 Mon Sep 17 00:00:00 2001 From: NeilBrown <neilb@xxxxxxx> Date: Tue, 12 Mar 2013 12:18:06 +1100 Subject: md/raid5: ensure sync and DISCARD don't happen at the same time. From: NeilBrown <neilb@xxxxxxx> commit f8dfcffd0472a0f353f34a567ad3f53568914d04 upstream. A number of problems can occur due to races between resync/recovery and discard. - if sync_request calls handle_stripe() while a discard is happening on the stripe, it might call handle_stripe_clean_event before all of the individual discard requests have completed (so some devices are still locked, but not all). Since commit ca64cae96037de16e4af92678814f5d4bf0c1c65 md/raid5: Make sure we clear R5_Discard when discard is finished. this will cause R5_Discard to be cleared for the parity device, so handle_stripe_clean_event() will not be called when the other devices do become unlocked, so their ->written will not be cleared. This ultimately leads to a WARN_ON in init_stripe and a lock-up. - If handle_stripe_clean_event() does clear R5_UPTODATE at an awkward time for resync, it can lead to s->uptodate being less than disks in handle_parity_checks5(), which triggers a BUG (because it is one). So: - keep R5_Discard on the parity device until all other devices have completed their discard request - make sure we don't try to have a 'discard' and a 'sync' action at the same time. This involves a new stripe flag to we know when a 'discard' is happening, and the use of R5_Overlap on the parity disk so when a discard is wanted while a sync is active, so we know to wake up the discard at the appropriate time. Discard support for RAID5 was added in 3.7, so this is suitable for any -stable kernel since 3.7. Reported-by: Jes Sorensen <Jes.Sorensen@xxxxxxxxxx> Tested-by: Jes Sorensen <Jes.Sorensen@xxxxxxxxxx> Signed-off-by: NeilBrown <neilb@xxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/md/raid5.c | 45 +++++++++++++++++++++++++++++++++++++++------ drivers/md/raid5.h | 1 + 2 files changed, 40 insertions(+), 6 deletions(-) --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2612,6 +2612,8 @@ handle_failed_sync(struct r5conf *conf, int i; clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. @@ -2785,6 +2787,7 @@ static void handle_stripe_clean_event(st { int i; struct r5dev *dev; + int discard_pending = 0; for (i = disks; i--; ) if (sh->dev[i].written) { @@ -2813,9 +2816,23 @@ static void handle_stripe_clean_event(st STRIPE_SECTORS, !test_bit(STRIPE_DEGRADED, &sh->state), 0); - } - } else if (test_bit(R5_Discard, &sh->dev[i].flags)) - clear_bit(R5_Discard, &sh->dev[i].flags); + } else if (test_bit(R5_Discard, &dev->flags)) + discard_pending = 1; + } + if (!discard_pending && + test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { + clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + if (sh->qd_idx >= 0) { + clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags); + } + /* now that discard is done we can proceed with any sync */ + clear_bit(STRIPE_DISCARD, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); + + } if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -3467,9 +3484,15 @@ static void handle_stripe(struct stripe_ return; } - if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { - set_bit(STRIPE_SYNCING, &sh->state); - clear_bit(STRIPE_INSYNC, &sh->state); + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + spin_lock(&sh->stripe_lock); + /* Cannot process 'sync' concurrently with 'discard' */ + if (!test_bit(STRIPE_DISCARD, &sh->state) && + test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, &sh->state); + } + spin_unlock(&sh->stripe_lock); } clear_bit(STRIPE_DELAYED, &sh->state); @@ -3629,6 +3652,8 @@ static void handle_stripe(struct stripe_ test_bit(STRIPE_INSYNC, &sh->state)) { md_done_sync(conf->mddev, STRIPE_SECTORS, 1); clear_bit(STRIPE_SYNCING, &sh->state); + if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) + wake_up(&conf->wait_for_overlap); } /* If the failed drives are just a ReadError, then we might need @@ -4195,6 +4220,13 @@ static void make_discard_request(struct sh = get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); + if (test_bit(STRIPE_SYNCING, &sh->state)) { + release_stripe(sh); + schedule(); + goto again; + } + clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); spin_lock_irq(&sh->stripe_lock); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) @@ -4207,6 +4239,7 @@ static void make_discard_request(struct goto again; } } + set_bit(STRIPE_DISCARD, &sh->state); finish_wait(&conf->wait_for_overlap, &w); for (d = 0; d < conf->raid_disks; d++) { if (d == sh->pd_idx || d == sh->qd_idx) --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -323,6 +323,7 @@ enum { STRIPE_COMPUTE_RUN, STRIPE_OPS_REQ_PENDING, STRIPE_ON_UNPLUG_LIST, + STRIPE_DISCARD, }; /* Patches currently in stable-queue which might be from neilb@xxxxxxx are queue-3.8/md-raid5-ensure-sync-and-discard-don-t-happen-at-the-same-time.patch queue-3.8/md-raid5-schedule_construction-should-abort-if-nothing-to-do.patch queue-3.8/md-raid5-avoid-accessing-gendisk-or-queue-structs-when-not-available.patch -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html