Re: [PATCH net-next 01/10] net/smc: remove locks smc_client_lgr_pending and smc_server_lgr_pending

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Aug 11, 2022 at 01:47:32AM +0800, D. Wythe wrote:
> From: "D. Wythe" <alibuda@xxxxxxxxxxxxxxxxx>
> 
> This patch attempts to remove locks named smc_client_lgr_pending and
> smc_server_lgr_pending, which aim to serialize the creation of link
> group. However, once link group existed already, those locks are
> meaningless, worse still, they make incoming connections have to be
> queued one after the other.
> 
> Now, the creation of link group is no longer generated by competition,
> but allocated through following strategy.
> 
> 1. Try to find a suitable link group, if successd, current connection
> is considered as NON first contact connection. ends.
> 
> 2. Check the number of connections currently waiting for a suitable
> link group to be created, if it is not less that the number of link
> groups to be created multiplied by (SMC_RMBS_PER_LGR_MAX - 1), then
> increase the number of link groups to be created, current connection
> is considered as the first contact connection. ends.
> 
> 3. Increase the number of connections currently waiting, and wait
> for woken up.
> 
> 4. Decrease the number of connections currently waiting, goto 1.
> 
> We wake up the connection that was put to sleep in stage 3 through
> the SMC link state change event. Once the link moves out of the
> SMC_LNK_ACTIVATING state, decrease the number of link groups to
> be created, and then wake up at most (SMC_RMBS_PER_LGR_MAX - 1)
> connections.
> 
> In the iplementation, we introduce the concept of lnk cluster, which is
> a collection of links with the same characteristics (see
> smcr_lnk_cluster_cmpfn() with more details), which makes it possible to
> wake up efficiently in the scenario of N v.s 1.
> 
> Signed-off-by: D. Wythe <alibuda@xxxxxxxxxxxxxxxxx>
> ---
>  net/smc/af_smc.c   |  11 +-
>  net/smc/smc_core.c | 356 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  net/smc/smc_core.h |  48 ++++++++
>  net/smc/smc_llc.c  |   9 +-
>  4 files changed, 411 insertions(+), 13 deletions(-)
> 
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 79c1318..af4b0aa 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -1194,10 +1194,8 @@ static int smc_connect_rdma(struct smc_sock *smc,
>  	if (reason_code)
>  		return reason_code;
>  
> -	mutex_lock(&smc_client_lgr_pending);
>  	reason_code = smc_conn_create(smc, ini);
>  	if (reason_code) {
> -		mutex_unlock(&smc_client_lgr_pending);
>  		return reason_code;
>  	}
>  
> @@ -1289,7 +1287,6 @@ static int smc_connect_rdma(struct smc_sock *smc,
>  		if (reason_code)
>  			goto connect_abort;
>  	}
> -	mutex_unlock(&smc_client_lgr_pending);
>  
>  	smc_copy_sock_settings_to_clc(smc);
>  	smc->connect_nonblock = 0;
> @@ -1299,7 +1296,6 @@ static int smc_connect_rdma(struct smc_sock *smc,
>  	return 0;
>  connect_abort:
>  	smc_conn_abort(smc, ini->first_contact_local);
> -	mutex_unlock(&smc_client_lgr_pending);
>  	smc->connect_nonblock = 0;
>  
>  	return reason_code;
> @@ -2377,7 +2373,8 @@ static void smc_listen_work(struct work_struct *work)
>  	if (rc)
>  		goto out_decl;
>  
> -	mutex_lock(&smc_server_lgr_pending);
> +	if (ini->is_smcd)
> +		mutex_lock(&smc_server_lgr_pending);
>  	smc_close_init(new_smc);
>  	smc_rx_init(new_smc);
>  	smc_tx_init(new_smc);
> @@ -2415,7 +2412,6 @@ static void smc_listen_work(struct work_struct *work)
>  					    ini->first_contact_local, ini);
>  		if (rc)
>  			goto out_unlock;
> -		mutex_unlock(&smc_server_lgr_pending);
>  	}
>  	smc_conn_save_peer_info(new_smc, cclc);
>  	smc_listen_out_connected(new_smc);
> @@ -2423,7 +2419,8 @@ static void smc_listen_work(struct work_struct *work)
>  	goto out_free;
>  
>  out_unlock:
> -	mutex_unlock(&smc_server_lgr_pending);
> +	if (ini->is_smcd)
> +		mutex_unlock(&smc_server_lgr_pending);
>  out_decl:
>  	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
>  			   proposal_version);
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index ff49a11..a3338cc 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -46,6 +46,10 @@ struct smc_lgr_list smc_lgr_list = {	/* established link groups */
>  	.num = 0,
>  };
>  
> +struct smc_lgr_manager smc_lgr_manager = {
> +	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_manager.lock),
> +};
> +
>  static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */
>  static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted);
>  
> @@ -55,6 +59,282 @@ static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb,
>  
>  static void smc_link_down_work(struct work_struct *work);
>  
> +/* SMC-R lnk cluster compare func
> + * All lnks that meet the description conditions of this function
> + * are logically aggregated, called lnk cluster.
> + * For the server side, lnk cluster is used to determine whether
> + * a new group needs to be created when processing new imcoming connections.
> + * For the client side, lnk cluster is used to determine whether
> + * to wait for link ready (in other words, first contact ready).
> + */
> +static int smcr_lnk_cluster_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
> +{
> +	const struct smc_lnk_cluster_compare_arg *key = arg->key;
> +	const struct smc_lnk_cluster *lnkc = obj;
> +
> +	if (memcmp(key->peer_systemid, lnkc->peer_systemid, SMC_SYSTEMID_LEN))
> +		return 1;
> +
> +	if (memcmp(key->peer_gid, lnkc->peer_gid, SMC_GID_SIZE))
> +		return 1;
> +
> +	if ((key->role == SMC_SERV || key->clcqpn == lnkc->clcqpn) &&
> +	    (key->smcr_version == SMC_V2 ||
> +	    !memcmp(key->peer_mac, lnkc->peer_mac, ETH_ALEN)))
> +		return 0;
> +
> +	return 1;
> +}

Also, SMC prefers to use *link* for function name and struct name. It's
okay to use *lnk* for local variables.

> +
> +/* SMC-R lnk cluster hash func */
> +static u32 smcr_lnk_cluster_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct smc_lnk_cluster *lnkc = data;
> +
> +	return jhash2((u32 *)lnkc->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed)
> +		+ (lnkc->role == SMC_SERV) ? 0 : lnkc->clcqpn;
> +}
> +
> +/* SMC-R lnk cluster compare arg hash func */
> +static u32 smcr_lnk_cluster_compare_arg_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct smc_lnk_cluster_compare_arg *key = data;
> +
> +	return jhash2((u32 *)key->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed)
> +		+ (key->role == SMC_SERV) ? 0 : key->clcqpn;
> +}
> +
> +static const struct rhashtable_params smcr_lnk_cluster_rhl_params = {
> +	.head_offset = offsetof(struct smc_lnk_cluster, rnode),
> +	.key_len = sizeof(struct smc_lnk_cluster_compare_arg),
> +	.obj_cmpfn = smcr_lnk_cluster_cmpfn,
> +	.obj_hashfn = smcr_lnk_cluster_hashfn,
> +	.hashfn = smcr_lnk_cluster_compare_arg_hashfn,
> +	.automatic_shrinking = true,
> +};
> +
> +/* hold a reference for smc_lnk_cluster */
> +static inline void smc_lnk_cluster_hold(struct smc_lnk_cluster *lnkc)
> +{
> +	if (likely(lnkc))
> +		refcount_inc(&lnkc->ref);
> +}
> +
> +/* release a reference for smc_lnk_cluster */
> +static inline void smc_lnk_cluster_put(struct smc_lnk_cluster *lnkc)
> +{
> +	bool do_free = false;
> +
> +	if (!lnkc)
> +		return;
> +
> +	if (refcount_dec_not_one(&lnkc->ref))
> +		return;
> +
> +	spin_lock_bh(&smc_lgr_manager.lock);
> +	/* last ref */
> +	if (refcount_dec_and_test(&lnkc->ref)) {
> +		do_free = true;
> +		rhashtable_remove_fast(&smc_lgr_manager.lnk_cluster_maps, &lnkc->rnode,
> +				       smcr_lnk_cluster_rhl_params);
> +	}
> +	spin_unlock_bh(&smc_lgr_manager.lock);
> +	if (do_free)
> +		kfree(lnkc);
> +}
> +
> +/* Get or create smc_lnk_cluster by key
> + * This function will hold a reference of returned smc_lnk_cluster
> + * or create a new smc_lnk_cluster with the reference initialized to 1。
> + * caller MUST call smc_lnk_cluster_put after this.
> + */
> +static inline struct smc_lnk_cluster *
> +smcr_lnk_get_or_create_cluster(struct smc_lnk_cluster_compare_arg *key)
> +{
> +	struct smc_lnk_cluster *lnkc, *tmp_lnkc;
> +	bool busy_retry;
> +	int err;
> +
> +	/* serving a hardware or software interrupt, or preemption is disabled */
> +	busy_retry = !in_interrupt();
> +
> +	spin_lock_bh(&smc_lgr_manager.lock);
> +	lnkc = rhashtable_lookup_fast(&smc_lgr_manager.lnk_cluster_maps, key,
> +				      smcr_lnk_cluster_rhl_params);
> +	if (!lnkc) {
> +		lnkc = kzalloc(sizeof(*lnkc), GFP_ATOMIC);
> +		if (unlikely(!lnkc))
> +			goto fail;
> +
> +		/* init cluster */
> +		spin_lock_init(&lnkc->lock);
> +		lnkc->role = key->role;
> +		if (key->role == SMC_CLNT)
> +			lnkc->clcqpn = key->clcqpn;
> +		init_waitqueue_head(&lnkc->first_contact_waitqueue);
> +		memcpy(lnkc->peer_systemid, key->peer_systemid, SMC_SYSTEMID_LEN);
> +		memcpy(lnkc->peer_gid, key->peer_gid, SMC_GID_SIZE);
> +		memcpy(lnkc->peer_mac, key->peer_mac, ETH_ALEN);
> +		refcount_set(&lnkc->ref, 1);
> +
> +		do {
> +			err = rhashtable_insert_fast(&smc_lgr_manager.lnk_cluster_maps,
> +						     &lnkc->rnode, smcr_lnk_cluster_rhl_params);
> +
> +			/* success or fatal error */
> +			if (err != -EBUSY)
> +				break;
> +
> +			/* impossible in fact right now */
> +			if (unlikely(!busy_retry)) {
> +				pr_warn_ratelimited("smc: create lnk cluster in softirq\n");
> +				break;
> +			}
> +
> +			spin_unlock_bh(&smc_lgr_manager.lock);
> +			/* yeild */
> +			cond_resched();
> +			spin_lock_bh(&smc_lgr_manager.lock);
> +
> +			/* after spin_unlock_bh(), lnk_cluster_maps may be changed */
> +			tmp_lnkc = rhashtable_lookup_fast(&smc_lgr_manager.lnk_cluster_maps, key,
> +							  smcr_lnk_cluster_rhl_params);
> +
> +			if (unlikely(tmp_lnkc)) {
> +				pr_warn_ratelimited("smc: create cluster failed dues to duplicat key");
> +				kfree(lnkc);
> +				lnkc = NULL;
> +				goto fail;
> +			}
> +		} while (1);
> +
> +		if (unlikely(err)) {
> +			pr_warn_ratelimited("smc: rhashtable_insert_fast failed (%d)", err);
> +			kfree(lnkc);
> +			lnkc = NULL;
> +		}
> +	} else {
> +		smc_lnk_cluster_hold(lnkc);
> +	}
> +fail:
> +	spin_unlock_bh(&smc_lgr_manager.lock);
> +	return lnkc;
> +}
> +
> +/* Get or create a smc_lnk_cluster by lnk
> + * caller MUST call smc_lnk_cluster_put after this.
> + */
> +static inline struct smc_lnk_cluster *smcr_lnk_get_cluster(struct smc_link *lnk)
> +{
> +	struct smc_lnk_cluster_compare_arg key;
> +	struct smc_link_group *lgr;
> +
> +	lgr = lnk->lgr;
> +	if (!lgr || lgr->is_smcd)
> +		return NULL;
> +
> +	key.smcr_version = lgr->smc_version;
> +	key.peer_systemid = lgr->peer_systemid;
> +	key.peer_gid = lnk->peer_gid;
> +	key.peer_mac = lnk->peer_mac;
> +	key.role	 = lgr->role;
> +	if (key.role == SMC_CLNT)
> +		key.clcqpn = lnk->peer_qpn;
> +
> +	return smcr_lnk_get_or_create_cluster(&key);
> +}
> +
> +/* Get or create a smc_lnk_cluster by ini
> + * caller MUST call smc_lnk_cluster_put after this.
> + */
> +static inline struct smc_lnk_cluster *
> +smcr_lnk_get_cluster_by_ini(struct smc_init_info *ini, int role)
> +{
> +	struct smc_lnk_cluster_compare_arg key;
> +
> +	if (ini->is_smcd)
> +		return NULL;
> +
> +	key.smcr_version = ini->smcr_version;
> +	key.peer_systemid = ini->peer_systemid;
> +	key.peer_gid = ini->peer_gid;
> +	key.peer_mac = ini->peer_mac;
> +	key.role	= role;
> +	if (role == SMC_CLNT)
> +		key.clcqpn	= ini->ib_clcqpn;
> +
> +	return smcr_lnk_get_or_create_cluster(&key);
> +}
> +
> +/* callback when smc link state change */
> +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk)
> +{
> +	struct smc_lnk_cluster *lnkc;
> +	int nr = 0;
> +
> +	/* barrier for lnk->state */
> +	smp_mb();
> +
> +	/* only first link can made connections block on
> +	 * first_contact_waitqueue
> +	 */
> +	if (lnk->link_idx != SMC_SINGLE_LINK)
> +		return;
> +
> +	/* state already seen  */
> +	if (lnk->state_record & SMC_LNK_STATE_BIT(lnk->state))
> +		return;
> +
> +	lnkc = smcr_lnk_get_cluster(lnk);
> +
> +	if (unlikely(!lnkc))
> +		return;
> +
> +	spin_lock_bh(&lnkc->lock);
> +
> +	/* all lnk state change should be
> +	 * 1. SMC_LNK_UNUSED -> SMC_LNK_TEAR_DWON (link init failed)
> +	 * 2. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_TEAR_DWON
> +	 * 3. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON
> +	 * 4. SMC_LNK_UNUSED -> SMC_LNK_ACTIVATING -> SMC_LNK_INACTIVE -> SMC_LNK_TEAR_DWON
> +	 * 5. SMC_LNK_UNUSED -> SMC_LNK_ATIVATING -> SMC_LNK_ACTIVE ->SMC_LNK_INACTIVE
> +	 * -> SMC_LNK_TEAR_DWON
> +	 */
> +	switch (lnk->state) {
> +	case SMC_LNK_ACTIVATING:
> +		/* It's safe to hold a reference without lock
> +		 * dues to the smcr_lnk_get_cluster already hold one
> +		 */
> +		smc_lnk_cluster_hold(lnkc);
> +		break;
> +	case SMC_LNK_TEAR_DWON:
> +		if (lnk->state_record & SMC_LNK_STATE_BIT(SMC_LNK_ACTIVATING))
> +			/* smc_lnk_cluster_hold in SMC_LNK_ACTIVATING */
> +			smc_lnk_cluster_put(lnkc);
> +		fallthrough;
> +	case SMC_LNK_ACTIVE:
> +	case SMC_LNK_INACTIVE:
> +		if (!(lnk->state_record &
> +			(SMC_LNK_STATE_BIT(SMC_LNK_ACTIVE)
> +			| SMC_LNK_STATE_BIT(SMC_LNK_INACTIVE)))) {
> +			lnkc->pending_capability -= (SMC_RMBS_PER_LGR_MAX - 1);
> +			/* TODO: wakeup just one to perfrom first contact
> +			 * if record state has no SMC_LNK_ACTIVE
> +			 */
> +			nr = SMC_RMBS_PER_LGR_MAX - 1;
> +		}
> +		break;
> +	case SMC_LNK_UNUSED:
> +		pr_warn_ratelimited("net/smc: invalid lnk state. ");
> +		break;
> +	}
> +	SMC_LNK_STATE_RECORD(lnk, lnk->state);
> +	spin_unlock_bh(&lnkc->lock);
> +	if (nr)
> +		wake_up_nr(&lnkc->first_contact_waitqueue, nr);
> +	smc_lnk_cluster_put(lnkc);	/* smc_lnk_cluster_hold in smcr_lnk_get_cluster */
> +}
> +
>  /* return head of link group list and its lock for a given link group */
>  static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr,
>  						  spinlock_t **lgr_lock)
> @@ -651,8 +931,10 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
>  	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
>  		struct smc_link *lnk = &lgr->lnk[i];
>  
> -		if (smc_link_sendable(lnk))
> +		if (smc_link_sendable(lnk)) {
>  			lnk->state = SMC_LNK_INACTIVE;
> +			smcr_lnk_cluster_on_lnk_state(lnk);
> +		}
>  	}
>  	wake_up_all(&lgr->llc_msg_waiter);
>  	wake_up_all(&lgr->llc_flow_waiter);
> @@ -762,6 +1044,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	atomic_set(&lnk->conn_cnt, 0);
>  	smc_llc_link_set_uid(lnk);
>  	INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
> +	lnk->peer_qpn = ini->ib_clcqpn;
> +	memcpy(lnk->peer_gid, ini->peer_gid, SMC_GID_SIZE);
> +	memcpy(lnk->peer_mac, ini->peer_mac, sizeof(lnk->peer_mac));
>  	if (!lnk->smcibdev->initialized) {
>  		rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
>  		if (rc)
> @@ -792,6 +1077,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	if (rc)
>  		goto destroy_qp;
>  	lnk->state = SMC_LNK_ACTIVATING;
> +	smcr_lnk_cluster_on_lnk_state(lnk);
>  	return 0;
>  
>  destroy_qp:
> @@ -806,6 +1092,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
>  	smc_ibdev_cnt_dec(lnk);
>  	put_device(&lnk->smcibdev->ibdev->dev);
>  	smcibdev = lnk->smcibdev;
> +	lnk->state = SMC_LNK_TEAR_DWON;
> +	smcr_lnk_cluster_on_lnk_state(lnk);
>  	memset(lnk, 0, sizeof(struct smc_link));
>  	lnk->state = SMC_LNK_UNUSED;
>  	if (!atomic_dec_return(&smcibdev->lnk_cnt))
> @@ -1263,6 +1551,8 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
>  	if (!lnk->lgr || lnk->clearing ||
>  	    lnk->state == SMC_LNK_UNUSED)
>  		return;
> +	lnk->state = SMC_LNK_TEAR_DWON;
> +	smcr_lnk_cluster_on_lnk_state(lnk);
>  	lnk->clearing = 1;
>  	lnk->peer_qpn = 0;
>  	smc_llc_link_clear(lnk, log);
> @@ -1712,6 +2002,7 @@ void smcr_link_down_cond(struct smc_link *lnk)
>  {
>  	if (smc_link_downing(&lnk->state)) {
>  		trace_smcr_link_down(lnk, __builtin_return_address(0));
> +		smcr_lnk_cluster_on_lnk_state(lnk);
>  		smcr_link_down(lnk);
>  	}
>  }
> @@ -1721,6 +2012,7 @@ void smcr_link_down_cond_sched(struct smc_link *lnk)
>  {
>  	if (smc_link_downing(&lnk->state)) {
>  		trace_smcr_link_down(lnk, __builtin_return_address(0));
> +		smcr_lnk_cluster_on_lnk_state(lnk);
>  		schedule_work(&lnk->link_down_wrk);
>  	}
>  }
> @@ -1850,11 +2142,13 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
>  {
>  	struct smc_connection *conn = &smc->conn;
>  	struct net *net = sock_net(&smc->sk);
> +	DECLARE_WAITQUEUE(wait, current);
> +	struct smc_lnk_cluster *lnkc = NULL;
>  	struct list_head *lgr_list;
>  	struct smc_link_group *lgr;
>  	enum smc_lgr_role role;
>  	spinlock_t *lgr_lock;
> -	int rc = 0;
> +	int rc = 0, timeo = CLC_WAIT_TIME;
>  
>  	lgr_list = ini->is_smcd ? &ini->ism_dev[ini->ism_selected]->lgr_list :
>  				  &smc_lgr_list.list;
> @@ -1862,12 +2156,26 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
>  				  &smc_lgr_list.lock;
>  	ini->first_contact_local = 1;
>  	role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
> -	if (role == SMC_CLNT && ini->first_contact_peer)
> +
> +	if (!ini->is_smcd) {
> +		lnkc = smcr_lnk_get_cluster_by_ini(ini, role);
> +		if (unlikely(!lnkc))
> +			return SMC_CLC_DECL_INTERR;
> +	}
> +
> +	if (role == SMC_CLNT && ini->first_contact_peer) {
> +		/* first_contact */
> +		spin_lock_bh(&lnkc->lock);
> +		lnkc->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1);
> +		spin_unlock_bh(&lnkc->lock);
>  		/* create new link group as well */
>  		goto create;
> +	}
>  
>  	/* determine if an existing link group can be reused */
>  	spin_lock_bh(lgr_lock);
> +	spin_lock(&lnkc->lock);
> +again:
>  	list_for_each_entry(lgr, lgr_list, list) {
>  		write_lock_bh(&lgr->conns_lock);
>  		if ((ini->is_smcd ?
> @@ -1894,9 +2202,33 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
>  		}
>  		write_unlock_bh(&lgr->conns_lock);
>  	}
> +	if (lnkc && ini->first_contact_local) {
> +		if (lnkc->pending_capability > lnkc->conns_pending) {
> +			lnkc->conns_pending++;
> +			add_wait_queue(&lnkc->first_contact_waitqueue, &wait);
> +			spin_unlock(&lnkc->lock);
> +			spin_unlock_bh(lgr_lock);
> +			set_current_state(TASK_INTERRUPTIBLE);
> +			/* need to wait at least once first contact done */
> +			timeo = schedule_timeout(timeo);
> +			set_current_state(TASK_RUNNING);
> +			remove_wait_queue(&lnkc->first_contact_waitqueue, &wait);
> +			spin_lock_bh(lgr_lock);
> +			spin_lock(&lnkc->lock);
> +
> +			lnkc->conns_pending--;
> +			if (timeo)
> +				goto again;
> +		}
> +		if (role == SMC_SERV) {
> +			/* first_contact */
> +			lnkc->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1);
> +		}
> +	}
> +	spin_unlock(&lnkc->lock);
>  	spin_unlock_bh(lgr_lock);
>  	if (rc)
> -		return rc;
> +		goto out;
>  
>  	if (role == SMC_CLNT && !ini->first_contact_peer &&
>  	    ini->first_contact_local) {
> @@ -1904,7 +2236,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
>  		 * a new one
>  		 * send out_of_sync decline, reason synchr. error
>  		 */
> -		return SMC_CLC_DECL_SYNCERR;
> +		rc = SMC_CLC_DECL_SYNCERR;
> +		goto out;
>  	}
>  
>  create:
> @@ -1941,6 +2274,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
>  #endif
>  
>  out:
> +	/* smc_lnk_cluster_hold in smcr_lnk_get_or_create_cluster */
> +	smc_lnk_cluster_put(lnkc);
>  	return rc;
>  }
>  
> @@ -2599,12 +2934,23 @@ static int smc_core_reboot_event(struct notifier_block *this,
>  
>  int __init smc_core_init(void)
>  {
> +	/* init smc lnk cluster maps */
> +	rhashtable_init(&smc_lgr_manager.lnk_cluster_maps, &smcr_lnk_cluster_rhl_params);
>  	return register_reboot_notifier(&smc_reboot_notifier);
>  }
>  
> +static void smc_lnk_cluster_free_cb(void *ptr, void *arg)
> +{
> +	pr_warn("smc: smc lnk cluster refcnt leak.\n");
> +	kfree(ptr);
> +}
> +
>  /* Called (from smc_exit) when module is removed */
>  void smc_core_exit(void)
>  {
>  	unregister_reboot_notifier(&smc_reboot_notifier);
>  	smc_lgrs_shutdown();
> +	/* destroy smc lnk cluster maps */
> +	rhashtable_free_and_destroy(&smc_lgr_manager.lnk_cluster_maps, smc_lnk_cluster_free_cb,
> +				    NULL);
>  }
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index fe8b524..199f533 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -15,6 +15,7 @@
>  #include <linux/atomic.h>
>  #include <linux/smc.h>
>  #include <linux/pci.h>
> +#include <linux/rhashtable.h>
>  #include <rdma/ib_verbs.h>
>  #include <net/genetlink.h>
>  
> @@ -29,18 +30,62 @@ struct smc_lgr_list {			/* list of link group definition */
>  	u32			num;	/* unique link group number */
>  };
>  
> +struct smc_lgr_manager {		/* manager for link group */
> +	struct rhashtable	lnk_cluster_maps;	/* maps of smc_lnk_cluster */
> +	spinlock_t		lock;	/* lock for lgr_cm_maps */
> +};
> +
> +struct smc_lnk_cluster {
> +	struct rhash_head	rnode;	/* node for rhashtable */
> +	struct wait_queue_head	first_contact_waitqueue;
> +					/* queue for non first contact to wait
> +					 * first contact to be established.
> +					 */
> +	spinlock_t		lock;	/* protection for link group */
> +	refcount_t		ref;	/* refcount for cluster */
> +	unsigned long		pending_capability;
> +					/* maximum pending number of connections that
> +					 * need wait first contact complete.
> +					 */
> +	unsigned long		conns_pending;
> +					/* connections that are waiting for first contact
> +					 * complete
> +					 */
> +	u8		peer_systemid[SMC_SYSTEMID_LEN];
> +	u8		peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
> +	u8		peer_gid[SMC_GID_SIZE];	/* gid of peer*/
> +	int		clcqpn;
> +	int		role;
> +};
> +
>  enum smc_lgr_role {		/* possible roles of a link group */
>  	SMC_CLNT,	/* client */
>  	SMC_SERV	/* server */
>  };
>  
> +struct smc_lnk_cluster_compare_arg	/* key for smc_lnk_cluster */
> +{
> +	int	smcr_version;
> +	enum smc_lgr_role role;
> +	u8	*peer_systemid;
> +	u8	*peer_gid;
> +	u8	*peer_mac;
> +	int clcqpn;
> +};
> +
>  enum smc_link_state {			/* possible states of a link */
>  	SMC_LNK_UNUSED,		/* link is unused */
>  	SMC_LNK_INACTIVE,	/* link is inactive */
>  	SMC_LNK_ACTIVATING,	/* link is being activated */
>  	SMC_LNK_ACTIVE,		/* link is active */
> +	SMC_LNK_TEAR_DWON,	/* link is tear down */
>  };
>  
> +#define SMC_LNK_STATE_BIT(state)	(1 << (state))
> +
> +#define	SMC_LNK_STATE_RECORD(lnk, state)	\
> +	((lnk)->state_record |= SMC_LNK_STATE_BIT(state))
> +
>  #define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
>  #define SMC_WR_BUF_V2_SIZE	8192	/* size of v2 work request buffer */
>  
> @@ -145,6 +190,7 @@ struct smc_link {
>  	int			ndev_ifidx; /* network device ifindex */
>  
>  	enum smc_link_state	state;		/* state of link */
> +	int			state_record;		/* record of previous state */
>  	struct delayed_work	llc_testlink_wrk; /* testlink worker */
>  	struct completion	llc_testlink_resp; /* wait for rx of testlink */
>  	int			llc_testlink_time; /* testlink interval */
> @@ -557,6 +603,8 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
>  int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb);
>  int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb);
>  
> +void smcr_lnk_cluster_on_lnk_state(struct smc_link *lnk);
> +
>  static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
>  {
>  	return link->lgr;
> diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
> index 175026a..8134c15 100644
> --- a/net/smc/smc_llc.c
> +++ b/net/smc/smc_llc.c
> @@ -1099,6 +1099,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
>  		goto out;
>  out_clear_lnk:
>  	lnk_new->state = SMC_LNK_INACTIVE;
> +	smcr_lnk_cluster_on_lnk_state(lnk_new);
>  	smcr_link_clear(lnk_new, false);
>  out_reject:
>  	smc_llc_cli_add_link_reject(qentry);
> @@ -1278,6 +1279,7 @@ static void smc_llc_delete_asym_link(struct smc_link_group *lgr)
>  		return; /* no asymmetric link */
>  	if (!smc_link_downing(&lnk_asym->state))
>  		return;
> +	smcr_lnk_cluster_on_lnk_state(lnk_asym);
>  	lnk_new = smc_switch_conns(lgr, lnk_asym, false);
>  	smc_wr_tx_wait_no_pending_sends(lnk_asym);
>  	if (!lnk_new)
> @@ -1492,6 +1494,7 @@ int smc_llc_srv_add_link(struct smc_link *link,
>  out_err:
>  	if (link_new) {
>  		link_new->state = SMC_LNK_INACTIVE;
> +		smcr_lnk_cluster_on_lnk_state(link_new);
>  		smcr_link_clear(link_new, false);
>  	}
>  out:
> @@ -1602,8 +1605,10 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
>  	del_llc->reason = 0;
>  	smc_llc_send_message(lnk, &qentry->msg); /* response */
>  
> -	if (smc_link_downing(&lnk_del->state))
> +	if (smc_link_downing(&lnk_del->state)) {
> +		smcr_lnk_cluster_on_lnk_state(lnk);
>  		smc_switch_conns(lgr, lnk_del, false);
> +	}
>  	smcr_link_clear(lnk_del, true);
>  
>  	active_links = smc_llc_active_link_count(lgr);
> @@ -1676,6 +1681,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
>  		goto out; /* asymmetric link already deleted */
>  
>  	if (smc_link_downing(&lnk_del->state)) {
> +		smcr_lnk_cluster_on_lnk_state(lnk);
>  		if (smc_switch_conns(lgr, lnk_del, false))
>  			smc_wr_tx_wait_no_pending_sends(lnk_del);
>  	}
> @@ -2167,6 +2173,7 @@ void smc_llc_link_active(struct smc_link *link)
>  		schedule_delayed_work(&link->llc_testlink_wrk,
>  				      link->llc_testlink_time);
>  	}
> +	smcr_lnk_cluster_on_lnk_state(link);
>  }
>  
>  /* called in worker context */
> -- 
> 1.8.3.1



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux