Re: [PATCH 1/6] md-cluster: Protect communication with mutexes

NeilBrown <neilb@xxxxxxxx> · Tue, 10 Nov 2015 10:31:06 +1100

On Fri, Nov 06 2015, rgoldwyn@xxxxxxx wrote:

> From: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
>
> Communication can happen through multiple threads. It is possible that
> one thread steps over another threads sequence. So, we use mutexes to
> protect both the send and receive sequences.
>
> We use a new flag CLUSTER_MD_COMM_LOCKED which is used to detect
> if communication is already locked. This is useful for cases where we need to
> take and hold the token DLM lock for a while. This bit is set only
> after locking communication.

I don't understand what this flag is trying to achieve, but I'm fairly
sure it doesn't achieve it.

Maybe if it was test_and_set_bit in metadata_update_start, it might make
sense.  But then I would suggest that clearing the bit be moved to
unlock_comm()

Do you just want to use mutex_try_lock() to detect if communication is
already locked?

>
> Also, Remove stray/ununsed sb_mutex.

I already removed that it mainline - I should have mentioned.

Thanks,
NeilBrown

>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
> ---
>  drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index b73729b..a93734e 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -47,6 +47,7 @@ struct resync_info {
>  #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
>  #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
>  #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
> +#define		MD_CLUSTER_COMM_LOCKED			4
>  
>  
>  struct md_cluster_info {
> @@ -54,7 +55,8 @@ struct md_cluster_info {
>  	dlm_lockspace_t *lockspace;
>  	int slot_number;
>  	struct completion completion;
> -	struct mutex sb_mutex;
> +	struct mutex recv_mutex;
> +	struct mutex send_mutex;
>  	struct dlm_lock_resource *bitmap_lockres;
>  	struct dlm_lock_resource *resync_lockres;
>  	struct list_head suspend_list;
> @@ -503,6 +505,7 @@ static void recv_daemon(struct md_thread *thread)
>  	struct cluster_msg msg;
>  	int ret;
>  
> +	mutex_lock(&cinfo->recv_mutex);
>  	/*get CR on Message*/
>  	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
>  		pr_err("md/raid1:failed to get CR on MESSAGE\n");
> @@ -529,6 +532,7 @@ static void recv_daemon(struct md_thread *thread)
>  	ret = dlm_unlock_sync(message_lockres);
>  	if (unlikely(ret != 0))
>  		pr_info("unlock msg failed return %d\n", ret);
> +	mutex_unlock(&cinfo->recv_mutex);
>  }
>  
>  /* lock_comm()
> @@ -542,20 +546,22 @@ static int lock_comm(struct md_cluster_info *cinfo)
>  {
>  	int error;
>  
> -	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
> -		return 0;
> +	mutex_lock(&cinfo->send_mutex);
>  
>  	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
>  	if (error)
>  		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
>  				__func__, __LINE__, error);
> +	mutex_lock(&cinfo->recv_mutex);
>  	return error;
>  }
>  
>  static void unlock_comm(struct md_cluster_info *cinfo)
>  {
>  	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
> +	mutex_unlock(&cinfo->recv_mutex);
>  	dlm_unlock_sync(cinfo->token_lockres);
> +	mutex_unlock(&cinfo->send_mutex);
>  }
>  
>  /* __sendmsg()
> @@ -709,7 +715,8 @@ static int join(struct mddev *mddev, int nodes)
>  	init_completion(&cinfo->completion);
>  	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
>  
> -	mutex_init(&cinfo->sb_mutex);
> +	mutex_init(&cinfo->send_mutex);
> +	mutex_init(&cinfo->recv_mutex);
>  	mddev->cluster_info = cinfo;
>  
>  	memset(str, 0, 64);
> @@ -839,7 +846,12 @@ static int slot_number(struct mddev *mddev)
>  
>  static int metadata_update_start(struct mddev *mddev)
>  {
> -	return lock_comm(mddev->cluster_info);
> +	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	int ret;
> +	if (test_and_clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state))
> +		return 0;
> +	ret = lock_comm(cinfo);
> +	return ret;
>  }
>  
>  static int metadata_update_finish(struct mddev *mddev)
> @@ -864,6 +876,7 @@ static int metadata_update_finish(struct mddev *mddev)
>  		ret = __sendmsg(cinfo, &cmsg);
>  	} else
>  		pr_warn("md-cluster: No good device id found to send\n");
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  	return ret;
>  }
> @@ -871,6 +884,7 @@ static int metadata_update_finish(struct mddev *mddev)
>  static void metadata_update_cancel(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  }
>  
> @@ -945,6 +959,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	memcpy(cmsg.uuid, uuid, 16);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
>  	lock_comm(cinfo);
> +	set_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	ret = __sendmsg(cinfo, &cmsg);
>  	if (ret)
>  		return ret;
> @@ -964,6 +979,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>  static void add_new_disk_cancel(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  }
>  
> -- 
> 1.8.5.6
Attachment:
signature.asc

Description: PGP signature