Re: [PATCH V2] md/raid1: don't clear bitmap bits on interrupted recovery.

Nate Dailey <nate.dailey@xxxxxxxxxxx> · Mon, 11 Feb 2019 12:25:49 -0500

Hi Jack, I've attached my repro script. The keys to reproducing this seem to be 
not doing IO during the recovery (because that might cover up a mismatch), and 
having a small bitmap chunk size (smaller/more chunks mean more chances to hit).

I haven't used blktest or mdadm test, so I'm not sure how well this fits in...

Nate

On 2/11/19 4:10 AM, Jinpu Wang wrote:
>
> sync_request_write no longer submits writes to a Faulty device. This has
> the unfortunate side effect that bitmap bits can be incorrectly cleared
> if a recovery is interrupted (previously, end_sync_write would have
> prevented this). This means the next recovery may not copy everything
> it should, potentially corrupting data.
>
> Add a function for doing the proper md_bitmap_end_sync, called from
> end_sync_write and the Faulty case in sync_request_write.
>
> Fixes: 0c9d5b127f69 ("md/raid1: avoid reusing a resync bio after error
> handling.")
> Signed-off-by: Nate Dailey <nate.dailey@xxxxxxxxxxx>
> ---
> drivers/md/raid1.c | 28 ++++++++++++++++++----------
> 1 file changed, 18 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 1d54109071cc..fa47249fa3e4 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1863,6 +1863,20 @@ static void end_sync_read(struct bio *bio)
> reschedule_retry(r1_bio);
> }
>
> +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
> +{
> + sector_t sync_blocks = 0;
> + sector_t s = r1_bio->sector;
> + long sectors_to_go = r1_bio->sectors;
> +
> + /* make sure these bits don't get cleared. */
> + do {
> + md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
> + s += sync_blocks;
> + sectors_to_go -= sync_blocks;
> + } while (sectors_to_go > 0);
> +}
> +
> static void end_sync_write(struct bio *bio)
> {
> int uptodate = !bio->bi_status;
> @@ -1874,15 +1888,7 @@ static void end_sync_write(struct bio *bio)
> struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
>
> if (!uptodate) {
> - sector_t sync_blocks = 0;
> - sector_t s = r1_bio->sector;
> - long sectors_to_go = r1_bio->sectors;
> - /* make sure these bits doesn't get cleared. */
> - do {
> - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
> - s += sync_blocks;
> - sectors_to_go -= sync_blocks;
> - } while (sectors_to_go > 0);
> + abort_sync_write(mddev, r1_bio);
> set_bit(WriteErrorSeen, &rdev->flags);
> if (!test_and_set_bit(WantReplacement, &rdev->flags))
> set_bit(MD_RECOVERY_NEEDED, &
> @@ -2172,8 +2178,10 @@ static void sync_request_write(struct mddev
> *mddev, struct r1bio *r1_bio)
> (i == r1_bio->read_disk ||
> !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
> continue;
> - if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
> + if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
> + abort_sync_write(mddev, r1_bio);
> continue;
> + }
>
> bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
> if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
> --
> 2.19.1

Thanks Nate, the patch looks good to me!

Do you have a reproducer for the bug here? It would good to have a
test case in blktest or mdadm test.

As this could lead to data corruption, it should go to stable 4.14+, I
guess Liu Song will mark it for stable :)

Thanks.

--
Jack Wang
Linux Kernel Developer

1&1 IONOS Cloud GmbH | Greifswalder Str. 207 | 10405 Berlin | Germany
Phone: +49 30 57700-8042 | Fax: +49 30 57700-8598
E-mail: jinpu.wang@xxxxxxxxxxxxxxx | Web: www.ionos.de 
<http://www.ionos.de>

Head Office: Berlin, Germany
District Court Berlin Charlottenburg, Registration number: HRB 125506 B
Executive Management: Christoph Steffens, Matthias Steinberg, Achim Weiss

Member of United Internet

This e-mail may contain confidential and/or privileged information. If
you are not the intended recipient of this e-mail, you are hereby
notified that saving, distribution or use of the content of this
e-mail in any way is prohibited. If you have received this e-mail in
error, please notify the sender and delete the e-mail.

#!/bin/bash

# NOTES:
#
# Run this on a raid1 with a mounted file system.
#
# Works best with a small bitmap chunk size (like 32K) and small partition
# size (like 10G).
#
# Timings might have to be adjusted to make sure recovery is actually
# interrupted.

[ $# -ne 1 ] && echo "usage: $0 <md>" && exit 1
sysmd=/sys/block/$1
[ ! -d "$sysmd" ] && echo "failed to find md" && exit 1

devmd="/dev/$(udevadm info -q name --path=$sysmd)"
[ ! -b "$devmd" ] && echo "failed to find md devnode" && exit 1

mountpoint=$(awk -v devmd="$devmd" 'match($0, devmd) {print $2}' /proc/mounts)
[ ! -e "$mountpoint" ] && echo "failed to find mountpoint" && exit 1

for m in ${sysmd}/md/dev-*
{
    syssd=$(cd $m/block && pwd -P)
    break
}
[ ! -d "$syssd" ] && echo "failed to find md member disk" && exit 1

devsd="/dev/$(udevadm info -q name --path=$syssd)"
[ ! -b "$devsd" ] && echo "failed to find md member disk devnode" && exit 1

[ -f "${syssd}/partition" ] && syssd=$(dirname $syssd)
syssd=/sys/block/${syssd##*block/}
[ ! -d "$syssd" ] && echo "failed to find md member disk" && exit 1

wait_idle() {
    while [ "$(cat $sysmd/md/sync_action)" != "idle" ] ; do sleep 1s; done
}

wait_duplex() {
    while [ "$(ls -d $sysmd/md/dev-sd*/ | wc -l)" != "2" ] ; do sleep 1s; done
}

wait_simplex() {
    while [ "$(ls -d $sysmd/md/dev-sd*/ | wc -l)" == "2" ] ; do	sleep 1s; done
}

iter() {
    echo
    echo " - remove $devsd from $devmd..."
    mdadm -f $devmd $devsd
    sleep 5s
    mdadm -r $devmd $devsd
    wait_simplex

    echo
    echo " - write to ${mountpoint}..."
    dd if=/dev/urandom of=${mountpoint}/TESTFILE oflag=direct bs=1M count=1000
    sleep 2s

    echo
    echo " - add $devsd to $devmd..."
    mdadm -a $devmd $devsd

    echo
    echo " - wait for $sysmd recovery to start..."
    wait_duplex

    echo
    echo " - offline $syssd (interrupting $sysmd recovery)"
    sleep 2s
    echo "offline" > ${syssd}/device/state
    sleep 5s

    echo
    echo " - remove $devsd from $devmd..."
    mdadm -f $devmd $devsd
    sleep 5s
    mdadm -r $devmd $devsd
    wait_simplex

    echo
    echo " - restore $sysd"
    echo "running" > ${syssd}/device/state
    sleep 5s

    echo
    echo " - add $devsd to $devmd..."
    mdadm -a $devmd $devsd

    echo
    echo " - wait for $sysmd recovery to complete..."
    wait_duplex
    sleep 10s
    wait_idle

    echo
    echo " - repair $sysmd..."
    echo repair > $sysmd/md/sync_action
    wait_idle

    cnt=$(cat $sysmd/md/mismatch_cnt)

    echo
    echo " - $sysmd mismatch count: $cnt"

    echo
    if [ $cnt -ne 0 ] ; then
	echo " - FAILED!"
	exit 1
    else
	echo " - passed!"
    fi
}

i=0
while [ true ]
do
    i=$(($i + 1))
    echo
    echo
    echo "$(date) ********** Iteration $i **********"
    iter
    sleep 5s
done