Re: [PATCH V2 1/3] target: iscsi: fix hang in the iSCSI login code

Mike Christie <michael.christie@xxxxxxxxxx> · Wed, 26 Apr 2023 10:35:09 -0500

On 4/17/23 12:17 PM, Maurizio Lombardi wrote:
>  static void iscsi_target_do_login_rx(struct work_struct *work)
>  {
>  	struct iscsit_conn *conn = container_of(work,
> @@ -562,12 +543,15 @@ static void iscsi_target_do_login_rx(struct work_struct *work)
>  	struct iscsi_np *np = login->np;
>  	struct iscsi_portal_group *tpg = conn->tpg;
>  	struct iscsi_tpg_np *tpg_np = conn->tpg_np;
> -	struct conn_timeout timeout;
>  	int rc, zero_tsih = login->zero_tsih;
>  	bool state;
>  
>  	pr_debug("entering iscsi_target_do_login_rx, conn: %p, %s:%d\n",
>  			conn, current->comm, current->pid);
> +
> +	spin_lock(&conn->login_worker_lock);
> +	set_bit(LOGIN_FLAGS_WORKER_RUNNING, &conn->login_flags);
> +	spin_unlock(&conn->login_worker_lock);

I think the locking bh use is reversed. You are in thread context here so you need to
do spin_lock/unlock_bh to prevent a softirq on the same cpu from interrupting us and 
running your timeout function which would try to grab the login_worker_lock.

iscsit_login_timeout is run from softirq context so you wouldn't need to do the _bh
calls from it.

>  	/*
>  	 * If iscsi_target_do_login_rx() has been invoked by ->sk_data_ready()
>  	 * before initial PDU processing in iscsi_target_start_negotiation()
> @@ -597,19 +581,16 @@ static void iscsi_target_do_login_rx(struct work_struct *work)
>  		goto err;
>  	}
>  
> -	conn->login_kworker = current;
>  	allow_signal(SIGINT);
> -
> -	timeout.conn = conn;
> -	timer_setup_on_stack(&timeout.timer, iscsi_target_login_timeout, 0);
> -	mod_timer(&timeout.timer, jiffies + TA_LOGIN_TIMEOUT * HZ);
> -	pr_debug("Starting login timer for %s/%d\n", current->comm, current->pid);
> +	rc = iscsit_set_login_timer_kworker(conn, current);
> +	if (rc < 0) {
> +		/* The login timer has already expired */
> +		pr_debug("iscsi_target_do_login_rx, login failed\n");
> +		goto err;
> +	}
>  
>  	rc = conn->conn_transport->iscsit_get_login_rx(conn, login);
> -	del_timer_sync(&timeout.timer);
> -	destroy_timer_on_stack(&timeout.timer);
>  	flush_signals(current);
> -	conn->login_kworker = NULL;
>  
>  	if (rc < 0)
>  		goto err;
> @@ -646,7 +627,16 @@ static void iscsi_target_do_login_rx(struct work_struct *work)
>  		if (iscsi_target_sk_check_and_clear(conn,
>  						    LOGIN_FLAGS_WRITE_ACTIVE))
>  			goto err;
> +
> +		/* Set the login timer thread pointer to NULL to prevent the
> +		 * login process from getting stuck if the initiator
> +		 * stops sending data.
> +		 */

Use the more common comment style we have in the target code (there was some other
cases below):

/*
 * Set the...

> +		rc = iscsit_set_login_timer_kworker(conn, NULL);
> +		if (rc < 0)
> +			goto err;
>  	} else if (rc == 1) {
> +		iscsit_stop_login_timer(conn);
>  		cancel_delayed_work(&conn->login_work);
>  		iscsi_target_nego_release(conn);
>  		iscsi_post_login_handler(np, conn, zero_tsih);
> @@ -656,6 +646,7 @@ static void iscsi_target_do_login_rx(struct work_struct *work)
>  
>  err:
>  	iscsi_target_restore_sock_callbacks(conn);
> +	iscsit_stop_login_timer(conn);
>  	cancel_delayed_work(&conn->login_work);
>  	iscsi_target_login_drop(conn, login);
>  	iscsit_deaccess_np(np, tpg, tpg_np);
> @@ -1368,14 +1359,29 @@ int iscsi_target_start_negotiation(
>  	 * and perform connection cleanup now.
>  	 */
>  	ret = iscsi_target_do_login(conn, login);
> -	if (!ret && iscsi_target_sk_check_and_clear(conn, LOGIN_FLAGS_INITIAL_PDU))
> -		ret = -1;
> +	if (!ret) {
> +		spin_lock(&conn->login_worker_lock);

Same locking comment.

> +
> +		if (iscsi_target_sk_check_and_clear(conn, LOGIN_FLAGS_INITIAL_PDU))
> +			ret = -1;
> +		else if (!test_bit(LOGIN_FLAGS_WORKER_RUNNING, &conn->login_flags)) {
> +			if (iscsit_set_login_timer_kworker(conn, NULL) < 0) {

I think this will deadlock because iscsit_set_login_timer_kworker tries to take the
login_worker_lock we took above.