Re: [PATCH 2/3] md: Set MD_BROKEN for RAID1 and RAID10

Mariusz Tkaczyk <mariusz.tkaczyk@xxxxxxxxxxxxxxx> · Mon, 14 Feb 2022 09:55:38 +0100

Hi Guoqing,
Thanks for review. Song already merged that, so I will adress issues in
the new patch.

On Sat, 12 Feb 2022 09:17:57 +0800
Guoqing Jiang <guoqing.jiang@xxxxxxxxx> wrote:

> On 1/27/22 11:39 PM, Mariusz Tkaczyk wrote:
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index f888ef197765..fda8473f96b8 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -2983,10 +2983,11 @@ state_store(struct md_rdev *rdev, const
> > char *buf, size_t len) 
> >   	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
> >   		md_error(rdev->mddev, rdev);
> > -		if (test_bit(Faulty, &rdev->flags))
> > -			err = 0;
> > -		else
> > +
> > +		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
> >   			err = -EBUSY;
> > +		else
> > +			err = 0;  
> 
> Again, it is weird to check MD_BROKEN which is for mddev in rdev 
> specific function from
> my understanding.

As I wrote in description:

"-EBUSY expectation cannot be removed without breaking compatibility
with userspace."

I agree with your opinion but I would like to not introduce regression
here.
> 
> >   	} else if (cmd_match(buf, "remove")) {
> >   		if (rdev->mddev->pers) {
> >   			clear_bit(Blocked, &rdev->flags);
> > @@ -7441,7 +7442,7 @@ static int set_disk_faulty(struct mddev
> > *mddev, dev_t dev) err =  -ENODEV;
> >   	else {
> >   		md_error(mddev, rdev);
> > -		if (!test_bit(Faulty, &rdev->flags))
> > +		if (test_bit(MD_BROKEN, &mddev->flags))
> >   			err = -EBUSY;  
> 
> Ditto.
> 
> >   	}
> >   	rcu_read_unlock();
> > @@ -7987,12 +7988,14 @@ void md_error(struct mddev *mddev, struct
> > md_rdev *rdev) if (!mddev->pers->sync_request)
> >   		return;
> >   
> > -	if (mddev->degraded)
> > +	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
> >   		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
> >   	sysfs_notify_dirent_safe(rdev->sysfs_state);
> >   	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
> > -	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > -	md_wakeup_thread(mddev->thread);
> > +	if (!test_bit(MD_BROKEN, &mddev->flags)) {
> > +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > +		md_wakeup_thread(mddev->thread);
> > +	}
> >   	if (mddev->event_work.func)
> >   		queue_work(md_misc_wq, &mddev->event_work);
> >   	md_new_event();
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index bc3f2094d0b6..1eb7d0e88cb2 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct
> > md_rdev *rdev, sector_t s, int sectors, int is_new);
> >   struct md_cluster_info;
> >   
> > -/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag
> > is added */ +/**
> > + * enum mddev_flags - md device flags.
> > + * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
> > + * @MD_CLOSING: If set, we are closing the array, do not open it
> > then.
> > + * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
> > + * @MD_HAS_JOURNAL: The raid array has journal feature set.
> > + * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node,
> > already took
> > + *			       resync lock, need to release the
> > lock.
> > + * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is
> > supported as
> > + *			    calls to md_error() will never cause
> > the array to
> > + *			    become failed.
> > + * @MD_HAS_PPL:  The raid array has PPL feature set.
> > + * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature
> > set.
> > + * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the
> > metadata
> > + *			 without taking reconfig_mutex.
> > + * @MD_UPDATING_SB: md_check_recovery is updating the metadata
> > without
> > + *		     explicitly holding reconfig_mutex.
> > + * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not
> > report that
> > + *		   array is ready yet.
> > + * @MD_BROKEN: This is used to stop writes and mark array as
> > failed.
> > + *
> > + * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag
> > is added
> > + */
> >   enum mddev_flags {
> > -	MD_ARRAY_FIRST_USE,	/* First use of array, needs
> > initialization */
> > -	MD_CLOSING,		/* If set, we are closing the
> > array, do not open
> > -				 * it then */
> > -	MD_JOURNAL_CLEAN,	/* A raid with journal is already
> > clean */
> > -	MD_HAS_JOURNAL,		/* The raid array has
> > journal feature set */
> > -	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which
> > means node
> > -				   * already took resync lock,
> > need to
> > -				   * release the lock */
> > -	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on
> > metadata writes is
> > -				 * supported as calls to
> > md_error() will
> > -				 * never cause the array to become
> > failed.
> > -				 */
> > -	MD_HAS_PPL,		/* The raid array has PPL
> > feature set */
> > -	MD_HAS_MULTIPLE_PPLS,	/* The raid array has
> > multiple PPLs feature set */
> > -	MD_ALLOW_SB_UPDATE,	/* md_check_recovery is allowed
> > to update
> > -				 * the metadata without taking
> > reconfig_mutex.
> > -				 */
> > -	MD_UPDATING_SB,		/* md_check_recovery is
> > updating the metadata
> > -				 * without explicitly holding
> > reconfig_mutex.
> > -				 */
> > -	MD_NOT_READY,		/* do_md_run() is active, so
> > 'array_state'
> > -				 * must not report that array is
> > ready yet
> > -				 */
> > -	MD_BROKEN,              /* This is used in RAID-0/LINEAR
> > only, to stop
> > -				 * I/O in case an array member is
> > gone/failed.
> > -				 */  
> 
> People have to scroll up to see the meaning of each flag with the 
> change, I would
> suggest move these comments on top of each flag if you really hate 
> previous style,
> but just my personal flavor.

Kernel has well-defined definition and comment rules so I followed
them:
https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html
Personally, I think that we should follow. It gives as a change to
generate documentation.

> 
> > +	MD_ARRAY_FIRST_USE,
> > +	MD_CLOSING,
> > +	MD_JOURNAL_CLEAN,
> > +	MD_HAS_JOURNAL,
> > +	MD_CLUSTER_RESYNC_LOCKED,
> > +	MD_FAILFAST_SUPPORTED,
> > +	MD_HAS_PPL,
> > +	MD_HAS_MULTIPLE_PPLS,
> > +	MD_ALLOW_SB_UPDATE,
> > +	MD_UPDATING_SB,
> > +	MD_NOT_READY,
> > +	MD_BROKEN,
> >   };
> >   
> >   enum mddev_sb_flags {
> > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > index 7dc8026cf6ee..b222bafa1196 100644
> > --- a/drivers/md/raid1.c
> > +++ b/drivers/md/raid1.c
> > @@ -1615,30 +1615,37 @@ static void raid1_status(struct seq_file
> > *seq, struct mddev *mddev) seq_printf(seq, "]");
> >   }
> >   
> > +/**
> > + * raid1_error() - RAID1 error handler.
> > + * @mddev: affected md device.
> > + **@rdev: member device to remove.*  
> 
> It is confusing, rdev is removed in raid1_remove_disk only.
> 

Yes, that make sense. I will address it.

> > + *
> > + * Error on @rdev is raised and it is needed to be removed from
> > @mddev.
> > + * If there are more than one active member, @rdev is always
> > removed.
> > + *
> > + * If it is the last active member, it depends on
> > &mddev->fail_last_dev:
> > + * - if is on @rdev is removed.
> > + * - if is off, @rdev is not removed, but recovery from it is
> > disabled (@rdev is
> > + *   very likely to fail).
> > + * In both cases, &MD_BROKEN will be set in &mddev->flags.
> > + */
> >   static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
> >   {
> >   	char b[BDEVNAME_SIZE];
> >   	struct r1conf *conf = mddev->private;
> >   	unsigned long flags;
> >   
> > -	/*
> > -	 * If it is not operational, then we have already marked
> > it as dead
> > -	 * else if it is the last working disks with
> > "fail_last_dev == false",
> > -	 * ignore the error, let the next level up know.
> > -	 * else mark the drive as failed
> > -	 */
> >   	spin_lock_irqsave(&conf->device_lock, flags);
> > -	if (test_bit(In_sync, &rdev->flags) &&
> > !mddev->fail_last_dev
> > -	    && (conf->raid_disks - mddev->degraded) == 1) {
> > -		/*
> > -		 * Don't fail the drive, act as though we were
> > just a
> > -		 * normal single drive.
> > -		 * However don't try a recovery from this drive as
> > -		 * it is very likely to fail.
> > -		 */
> > -		conf->recovery_disabled = mddev->recovery_disabled;
> > -		spin_unlock_irqrestore(&conf->device_lock, flags);
> > -		return;
> > +
> > +	if (test_bit(In_sync, &rdev->flags) &&
> > +	    (conf->raid_disks - mddev->degraded) == 1) {
> > +		set_bit(MD_BROKEN, &mddev->flags);
> > +
> > +		if (!mddev->fail_last_dev) {
> > +			conf->recovery_disabled =
> > mddev->recovery_disabled;
> > +			spin_unlock_irqrestore(&conf->device_lock,
> > flags);
> > +			return;
> > +		}
> >   	}
> >   	set_bit(Blocked, &rdev->flags);
> >   	if (test_and_clear_bit(In_sync, &rdev->flags))
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index dde98f65bd04..7471e20d7cd6 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -1945,26 +1945,37 @@ static int enough(struct r10conf *conf, int
> > ignore) _enough(conf, 1, ignore);
> >   }
> >   
> > +/**
> > + * raid10_error() - RAID10 error handler.
> > + * @mddev: affected md device.
> > + * @rdev: member device to remove.  
> 
> Ditto.
>
Noted.

Thanks,
Mariusz