Re: [PATCH] tcmu: fix crash when remove the tcmu device

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 08/30/2017 04:41 AM, lixiubo@xxxxxxxxxxxxxxxxxxxx wrote:
> From: Xiubo Li <lixiubo@xxxxxxxxxxxxxxxxxxxx>
> 
> Before the nl REMOVE msg has been sent to the userspace, the ring's
> and other resources has been released, but the userspace maybe still
> using them. And then we can see the crash messages like:
> 
> ring broken, not handling completions
> BUG: unable to handle kernel paging request at ffffffffffffffd0
> IP: tcmu_handle_completions+0x134/0x2f0 [target_core_user]
> PGD 11bdc0c067
> P4D 11bdc0c067
> PUD 11bdc0e067
> PMD 0
> 
> Oops: 0000 [#1] SMP
> cmd_id not found, ring is broken
> RIP: 0010:tcmu_handle_completions+0x134/0x2f0 [target_core_user]
> RSP: 0018:ffffb8a2d8983d88 EFLAGS: 00010296
> RAX: 0000000000000000 RBX: ffffb8a2aaa4e000 RCX: 00000000ffffffff
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
> RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000220
> R10: 0000000076c71401 R11: ffff8d2e76c713f0 R12: ffffb8a2aad56bc0
> R13: 000000000000001c R14: ffff8d2e32c90000 R15: ffff8d2e76c713f0
> FS:  00007f411ffff700(0000) GS:ffff8d1e7fdc0000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: ffffffffffffffd0 CR3: 0000001027070000 CR4:
> 00000000001406e0
> Call Trace:
> ? tcmu_irqcontrol+0x2a/0x40 [target_core_user]
> ? uio_write+0x7b/0xc0 [uio]
> ? __vfs_write+0x37/0x150
> ? __getnstimeofday64+0x3b/0xd0
> ? vfs_write+0xb2/0x1b0
> ? syscall_trace_enter+0x1d0/0x2b0
> ? SyS_write+0x55/0xc0
> ? do_syscall_64+0x67/0x150
> ? entry_SYSCALL64_slow_path+0x25/0x25
> Code: 41 5d 41 5e 41 5f 5d c3 83 f8 01 0f 85 cf 01 00
> 00 48 8b 7d d0 e8 dd 5c 1d f3 41 0f b7 74 24 04 48 8b
> 7d c8 31 d2 e8 5c c7 1b f3 <48> 8b 7d d0 49 89 c7 c6 07
> 00 0f 1f 40 00 4d 85 ff 0f 84 82 01  RIP:
> tcmu_handle_completions+0x134/0x2f0 [target_core_user]
> RSP: ffffb8a2d8983d88
> CR2: ffffffffffffffd0
> 
> And the crash also could happen in tcmu_page_fault and other places.
> 
> Signed-off-by: Zhang Zhuoyu <zhangzhuoyu@xxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Xiubo Li <lixiubo@xxxxxxxxxxxxxxxxxxxx>
> ---
>  drivers/target/target_core_user.c | 90 +++++++++++++++++++--------------------
>  1 file changed, 45 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index 942d0942..0ff8541 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -1280,10 +1280,55 @@ static void tcmu_dev_call_rcu(struct rcu_head *p)
>  	kfree(udev);
>  }
>  
> +static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
> +{
> +	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
> +		kmem_cache_free(tcmu_cmd_cache, cmd);
> +		return 0;
> +	}
> +	return -EINVAL;
> +}
> +
> +static void tcmu_blocks_release(struct tcmu_dev *udev)
> +{
> +	int i;
> +	struct page *page;
> +
> +	/* Try to release all block pages */
> +	mutex_lock(&udev->cmdr_lock);
> +	for (i = 0; i <= udev->dbi_max; i++) {
> +		page = radix_tree_delete(&udev->data_blocks, i);
> +		if (page) {
> +			__free_page(page);
> +			atomic_dec(&global_db_count);
> +		}
> +	}
> +	mutex_unlock(&udev->cmdr_lock);
> +}
> +
>  static void tcmu_dev_kref_release(struct kref *kref)
>  {
>  	struct tcmu_dev *udev = container_of(kref, struct tcmu_dev, kref);
>  	struct se_device *dev = &udev->se_dev;
> +	struct tcmu_cmd *cmd;
> +	bool all_expired = true;
> +	int i;
> +
> +	vfree(udev->mb_addr);
> +
> +	/* Upper layer should drain all requests before calling this */
> +	spin_lock_irq(&udev->commands_lock);
> +	idr_for_each_entry(&udev->commands, cmd, i) {
> +		if (tcmu_check_and_free_pending_cmd(cmd) != 0)
> +			all_expired = false;
> +	}
> +	idr_destroy(&udev->commands);
> +	spin_unlock_irq(&udev->commands_lock);
> +	WARN_ON(!all_expired);
> +
> +	tcmu_blocks_release(udev);
> +
> +	uio_unregister_device(&udev->uio_info);
>  
>  	call_rcu(&dev->rcu_head, tcmu_dev_call_rcu);
>  }
> @@ -1534,37 +1579,11 @@ static int tcmu_configure_device(struct se_device *dev)
>  	return ret;
>  }
>  
> -static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
> -{
> -	if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) {
> -		kmem_cache_free(tcmu_cmd_cache, cmd);
> -		return 0;
> -	}
> -	return -EINVAL;
> -}
> -
>  static bool tcmu_dev_configured(struct tcmu_dev *udev)
>  {
>  	return udev->uio_info.uio_dev ? true : false;
>  }
>  
> -static void tcmu_blocks_release(struct tcmu_dev *udev)
> -{
> -	int i;
> -	struct page *page;
> -
> -	/* Try to release all block pages */
> -	mutex_lock(&udev->cmdr_lock);
> -	for (i = 0; i <= udev->dbi_max; i++) {
> -		page = radix_tree_delete(&udev->data_blocks, i);
> -		if (page) {
> -			__free_page(page);
> -			atomic_dec(&global_db_count);
> -		}
> -	}
> -	mutex_unlock(&udev->cmdr_lock);
> -}
> -
>  static void tcmu_free_device(struct se_device *dev)
>  {
>  	struct tcmu_dev *udev = TCMU_DEV(dev);
> @@ -1576,9 +1595,6 @@ static void tcmu_free_device(struct se_device *dev)
>  static void tcmu_destroy_device(struct se_device *dev)
>  {
>  	struct tcmu_dev *udev = TCMU_DEV(dev);
> -	struct tcmu_cmd *cmd;
> -	bool all_expired = true;
> -	int i;
>  
>  	del_timer_sync(&udev->timeout);
>  
> @@ -1586,24 +1602,8 @@ static void tcmu_destroy_device(struct se_device *dev)
>  	list_del(&udev->node);
>  	mutex_unlock(&root_udev_mutex);
>  
> -	vfree(udev->mb_addr);
> -
> -	/* Upper layer should drain all requests before calling this */
> -	spin_lock_irq(&udev->commands_lock);
> -	idr_for_each_entry(&udev->commands, cmd, i) {
> -		if (tcmu_check_and_free_pending_cmd(cmd) != 0)
> -			all_expired = false;
> -	}
> -	idr_destroy(&udev->commands);
> -	spin_unlock_irq(&udev->commands_lock);
> -	WARN_ON(!all_expired);
> -
> -	tcmu_blocks_release(udev);
> -

Agree the above part needs to be delayed.

>  	tcmu_netlink_event(udev, TCMU_CMD_REMOVED_DEVICE, 0, NULL);
>  
> -	uio_unregister_device(&udev->uio_info);
> -

Why does this call? After this returns userspace should have done its
close and stopped doing uio poll/read/write calls right?

If userspace is not doing that, why isn't it? I think that is a bug.

If we need to work around userspace then you need to drop the
uio_unregister_device call in tcmu_configure_device.


>  	/* release ref from configure */
>  	kref_put(&udev->kref, tcmu_dev_kref_release);
>  }
> 

--
To unsubscribe from this list: send the line "unsubscribe target-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux SCSI]     [Kernel Newbies]     [Linux SCSI Target Infrastructure]     [Share Photos]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Linux IIO]     [Device Mapper]

  Powered by Linux