Re: [PATCH 14/14] md-cluster: add the support for resize

Shaohua Li <shli@xxxxxxxxxx> · Tue, 28 Feb 2017 11:25:36 -0800

On Fri, Feb 24, 2017 at 11:15:24AM +0800, Guoqing Jiang wrote:
> To update size for cluster raid, we need to make
> sure all nodes can perform the change successfully.
> However, it is possible that some of them can't do
> it due to failure (bitmap_resize could fail). So
> we need to consider the issue before we set the
> capacity unconditionally, and we use below steps
> to perform sanity check.
> 
> 1. A changes the size, then broadcasts a METADATA_UPDATED
>    msg.
> 2. B and C receive METADATA_UPDATED and change the size,
>    except that they do not call set_capacity; sync_size is
>    not updated if the change failed. They also call
>    bitmap_update_sb to sync the sb to disk.
> 3. A checks the other nodes' sync_size; if sync_size has
>    been updated on all nodes, it sends a CHANGE_CAPACITY
>    msg, otherwise it sends a msg to revert the previous change.
> 4. B and C call set_capacity if they receive a CHANGE_CAPACITY
>    msg; otherwise pers->resize is called to restore
>    the old value.
> 
> Reviewed-by: NeilBrown <neilb@xxxxxxxx>
> Signed-off-by: Guoqing Jiang <gqjiang@xxxxxxxx>
> ---
>  Documentation/md/md-cluster.txt |  2 +-
>  drivers/md/md-cluster.c         | 75 +++++++++++++++++++++++++++++++++++++++++
>  drivers/md/md-cluster.h         |  1 +
>  drivers/md/md.c                 | 21 +++++++++---
>  4 files changed, 93 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
> index 38883276d31c..2663d49dd8a0 100644
> --- a/Documentation/md/md-cluster.txt
> +++ b/Documentation/md/md-cluster.txt
> @@ -321,4 +321,4 @@ The algorithm is:
>  
>  There are somethings which are not supported by cluster MD yet.
>  
> -- update size and change array_sectors.
> +- change array_sectors.
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index d3c024e6bfcf..75da83187c31 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -1147,6 +1147,80 @@ int cluster_check_sync_size(struct mddev *mddev)
>  	return (my_sync_size == sync_size) ? 0 : -1;
>  }
>  
> +/*
> + * Update the size for cluster raid is a little more complex, we perform it
> + * by the steps:
> + * 1. hold token lock and update superblock in initiator node.
> + * 2. send METADATA_UPDATED msg to other nodes.
> + * 3. The initiator node continues to check each bitmap's sync_size, if all
> + *    bitmaps have the same value of sync_size, then we can set capacity and
> + *    let other nodes to perform it. If one node can't update sync_size
> + *    accordingly, we need to revert to previous value.
> + */
> +static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
> +{
> +	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	struct cluster_msg cmsg;
> +	struct md_rdev *rdev;
> +	int ret = 0;
> +	int raid_slot = -1;
> +
> +	md_update_sb(mddev, 1);
> +	lock_comm(cinfo, 1);
> +
> +	memset(&cmsg, 0, sizeof(cmsg));
> +	cmsg.type = cpu_to_le32(METADATA_UPDATED);
> +	rdev_for_each(rdev, mddev)
> +		if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
> +			raid_slot = rdev->desc_nr;
> +			break;
> +		}
> +	if (raid_slot >= 0) {
> +		cmsg.raid_slot = cpu_to_le32(raid_slot);
> +		/*
> +		 * We can only change capiticy after all the nodes can do it,
> +		 * so need to wait after other nodes already received the msg
> +		 * and handled the change
> +		 */
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret) {
> +			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
> +			       __func__, __LINE__);
> +			unlock_comm(cinfo);
> +			return;
> +		}
> +	} else {
> +		pr_err("md-cluster: No good device id found to send\n");
> +		unlock_comm(cinfo);
> +		return;
> +	}
> +
> +	/*
> +	 * check the sync_size from other node's bitmap, if sync_size
> +	 * have already updated in other nodes as expected, send an
> +	 * empty metadata msg to permit the change of capacity
> +	 */
> +	if (cluster_check_sync_size(mddev) == 0) {
> +		memset(&cmsg, 0, sizeof(cmsg));
> +		cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret)
> +			pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
> +			       __func__, __LINE__);
> +		set_capacity(mddev->gendisk, mddev->array_sectors);

Shouldn't revalidate_disk() be called here as well? And why not move the gendisk-related code to md.c?

> +	} else {
> +		/* revert to previous sectors */
> +		ret = mddev->pers->resize(mddev, old_dev_sectors);
> +		if (!ret)
> +			revalidate_disk(mddev->gendisk);
> +		ret = __sendmsg(cinfo, &cmsg);
> +		if (ret)
> +			pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
> +			       __func__, __LINE__);
> +	}
> +	unlock_comm(cinfo);
> +}
> +
>  static int resync_start(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> @@ -1392,6 +1466,7 @@ static struct md_cluster_operations cluster_ops = {
>  	.gather_bitmaps = gather_bitmaps,
>  	.lock_all_bitmaps = lock_all_bitmaps,
>  	.unlock_all_bitmaps = unlock_all_bitmaps,
> +	.update_size = update_size,
>  };
>  
>  static int __init cluster_init(void)
> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
> index 8f26a5e80810..e939c14222ba 100644
> --- a/drivers/md/md-cluster.h
> +++ b/drivers/md/md-cluster.h
> @@ -25,6 +25,7 @@ struct md_cluster_operations {
>  	int (*gather_bitmaps)(struct md_rdev *rdev);
>  	int (*lock_all_bitmaps)(struct mddev *mddev);
>  	void (*unlock_all_bitmaps)(struct mddev *mddev);
> +	void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
>  };
>  
>  #endif /* _MD_CLUSTER_H */
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 975c9dd60c05..2bc64059ff57 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6503,10 +6503,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>  	struct md_rdev *rdev;
>  	int rv;
>  	int fit = (num_sectors == 0);
> -
> -	/* cluster raid doesn't support update size */
> -	if (mddev_is_clustered(mddev))
> -		return -EINVAL;
> +	sector_t old_dev_sectors = mddev->dev_sectors;
>  
>  	if (mddev->pers->resize == NULL)
>  		return -EINVAL;
> @@ -6535,7 +6532,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>  	}
>  	rv = mddev->pers->resize(mddev, num_sectors);
>  	if (!rv) {
> -		if (mddev->queue) {
> +		if (mddev_is_clustered(mddev))
> +			md_cluster_ops->update_size(mddev, old_dev_sectors);
> +		else if (mddev->queue) {
>  			set_capacity(mddev->gendisk, mddev->array_sectors);
>  			revalidate_disk(mddev->gendisk);
>  		}
> @@ -8753,6 +8752,18 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
>  	int role, ret;
>  	char b[BDEVNAME_SIZE];
>  
> +	/*
> +	 * If size is changed in another node then we need to
> +	 * do resize as well.
> +	 */
> +	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
> +		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
> +		if (ret)
> +			pr_info("md-cluster: resize failed\n");
> +		else
> +			bitmap_update_sb(mddev->bitmap);
> +	}

I'm confused: who triggers this? Patch 10 only calls set_capacity.
Please describe in detail what happens on each node. Also, please add this to
the md-cluster.txt document.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html