Re: [PATCH] tcmu: allow userspace to reset netlink

Mike Christie <mchristi@xxxxxxxxxx> · Wed, 4 Apr 2018 19:47:09 -0500

On 04/02/2018 06:42 AM, xiubli@xxxxxxxxxx wrote:
> From: Xiubo Li <xiubli@xxxxxxxxxx>
> 
> This patch adds 1 tcmu attr to reset and complete all the blocked
> netlink waiting threads. It's used when the userspace daemon like
> tcmu-runner has crashed or forced to shutdown just before the
> netlink requests be replied to the kernel, then the netlink requeting
> threads will get stuck forever. We must reboot the machine to recover
> from it and by this the rebootng is not a must then.
> 
> The netlink reset operation should be done before the userspace daemon
> could receive and handle the netlink requests to be safe.
> 
> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
> ---
>  drivers/target/target_core_user.c | 99 ++++++++++++++++++++++++++++++++++++---
>  1 file changed, 93 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index 4ad89ea..dc8879d 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -103,9 +103,13 @@ struct tcmu_hba {
>  
>  #define TCMU_CONFIG_LEN 256
>  
> +static spinlock_t nl_complete_lock;
> +static struct idr complete_wait_udevs = IDR_INIT;
> +
>  struct tcmu_nl_cmd {
>  	/* wake up thread waiting for reply */
> -	struct completion complete;
> +	bool complete;
> +
>  	int cmd;
>  	int status;
>  };
> @@ -159,12 +163,17 @@ struct tcmu_dev {
>  
>  	spinlock_t nl_cmd_lock;
>  	struct tcmu_nl_cmd curr_nl_cmd;
> -	/* wake up threads waiting on curr_nl_cmd */
> +	/* wake up threads waiting on nl_cmd_wq */
>  	wait_queue_head_t nl_cmd_wq;
>  
> +	/* complete thread waiting complete_wq */
> +	wait_queue_head_t complete_wq;
> +
>  	char dev_config[TCMU_CONFIG_LEN];
>  
>  	int nl_reply_supported;
> +
> +	uint32_t dev_id;
>  };
>  
>  #define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev)
> @@ -251,6 +260,56 @@ static int tcmu_get_global_max_data_area(char *buffer,
>  		 "Max MBs allowed to be allocated to all the tcmu device's "
>  		 "data areas.");
>  
> +static void tcmu_complete_wake_up(struct tcmu_dev *udev)
> +{
> +	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
> +
> +	spin_lock(&nl_complete_lock);
> +	nl_cmd->complete = true;
> +	wake_up(&udev->complete_wq);
> +	spin_unlock(&nl_complete_lock);
> +}
> +
> +static void tcmu_complete_wake_up_all(void)
> +{
> +	struct tcmu_nl_cmd *nl_cmd;
> +	struct tcmu_dev *udev;
> +	int i;
> +
> +	spin_lock(&nl_complete_lock);
> +	idr_for_each_entry(&complete_wait_udevs, udev, i) {
> +		nl_cmd = &udev->curr_nl_cmd;
> +		nl_cmd->complete = true;
> +		wake_up(&udev->complete_wq);
> +	}
> +	spin_unlock(&nl_complete_lock);
> +}
> +
> +static int tcmu_complete_wait(struct tcmu_dev *udev)
> +{
> +	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
> +	uint32_t dev_id;
> +
> +	spin_lock(&nl_complete_lock);
> +	dev_id = idr_alloc(&complete_wait_udevs, udev, 1, USHRT_MAX, GFP_NOWAIT);
> +	if (dev_id < 0) {
> +		pr_err("tcmu: Could not allocate dev id.\n");
> +		return dev_id;
> +	}
> +	udev->dev_id = dev_id;

dev_id is never used.

I think if you just wanted to loop over all the devices you could just
use a list.

Or,

Just add a helper around target_core_device.c:devices_idr that just
gives you the tcmu devices.

> +	spin_unlock(&nl_complete_lock);
> +
> +	pr_debug("sleeping for nl reply\n");
> +	wait_event(udev->complete_wq, nl_cmd->complete);

I don't think you will need the complete field then or this function.

> +
> +	spin_lock(&nl_complete_lock);
> +	nl_cmd->complete = false;
> +	idr_remove(&complete_wait_udevs, dev_id);
> +	spin_unlock(&nl_complete_lock);
> +
> +	return 0;
> +}
> +
>  /* multicast group */
>  enum tcmu_multicast_groups {
>  	TCMU_MCGRP_CONFIG,
> @@ -311,7 +370,7 @@ static int tcmu_genl_cmd_done(struct genl_info *info, int completed_cmd)
>  	if (!is_removed)
>  		 target_undepend_item(&dev->dev_group.cg_item);
>  	if (!ret)
> -		complete(&nl_cmd->complete);
> +		tcmu_complete_wake_up(udev);
>  	return ret;
>  }
>  
> @@ -1258,6 +1317,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
>  	timer_setup(&udev->cmd_timer, tcmu_cmd_timedout, 0);
>  
>  	init_waitqueue_head(&udev->nl_cmd_wq);
> +	init_waitqueue_head(&udev->complete_wq);
>  	spin_lock_init(&udev->nl_cmd_lock);
>  
>  	INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
> @@ -1462,7 +1522,11 @@ static void tcmu_dev_call_rcu(struct rcu_head *p)
>  
>  	kfree(udev->uio_info.name);
>  	kfree(udev->name);
> +
> +	spin_lock(&nl_complete_lock);
> +	idr_remove(&complete_wait_udevs, udev->dev_id);
>  	kfree(udev);
> +	spin_unlock(&nl_complete_lock);
>  }
>  
>  static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
> @@ -1555,7 +1619,6 @@ static void tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd)
>  
>  	memset(nl_cmd, 0, sizeof(*nl_cmd));
>  	nl_cmd->cmd = cmd;
> -	init_completion(&nl_cmd->complete);
>  
>  	spin_unlock(&udev->nl_cmd_lock);
>  }
> @@ -1572,8 +1635,9 @@ static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev)
>  	if (udev->nl_reply_supported <= 0)
>  		return 0;
>  
> -	pr_debug("sleeping for nl reply\n");
> -	wait_for_completion(&nl_cmd->complete);
> +	ret = tcmu_complete_wait(udev);
> +	if (ret)
> +		return ret;
>  
>  	spin_lock(&udev->nl_cmd_lock);
>  	nl_cmd->cmd = TCMU_CMD_UNSPEC;
> @@ -2323,6 +2387,26 @@ static ssize_t tcmu_block_dev_store(struct config_item *item, const char *page,
>  }
>  CONFIGFS_ATTR(tcmu_, block_dev);
>  
> +static ssize_t tcmu_reset_netlink_store(struct config_item *item, const char *page,
> +				    size_t count)
> +{
> +	u8 val;
> +	int ret;
> +
> +	ret = kstrtou8(page, 0, &val);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (val != 1) {
> +		pr_err("Invalid block value %d\n", val);
> +		return -EINVAL;
> +	}
> +
> +	tcmu_complete_wake_up_all();
> +	return count;
> +}
> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);

If it's on the device it should only reset the device its on, so if 2
daemons/apps are managing different devices it doesn't mess up the other.

Or we could just assume that there is only 1 daemon type and just do a
global attr at the module level. Probably just the per device is best in
case we end up with people running gluster + qemu+tcmu and ceph +
tcmu-runner.

If you do the per device then you can just take the insides of
tcmu_genl_cmd_done and make it into a helper so that you can do the
refcount/target_undepend_item properly and it would do the wake up. In
the reset configfs function then grab the nl_cmd_lock, and set the
curr_nl_cmd status to some or pass it into the helper, and then call
your helper which does the common stuff.

> +
>  static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>  				     size_t count)
>  {
> @@ -2363,6 +2447,7 @@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>  static struct configfs_attribute *tcmu_action_attrs[] = {
>  	&tcmu_attr_block_dev,
>  	&tcmu_attr_reset_ring,
> +	&tcmu_attr_reset_netlink,
>  	NULL,
>  };
>  
> @@ -2519,6 +2604,8 @@ static int __init tcmu_module_init(void)
>  	}
>  	tcmu_ops.tb_dev_attrib_attrs = tcmu_attrs;
>  
> +	spin_lock_init(&nl_complete_lock);
> +
>  	ret = transport_backend_register(&tcmu_ops);
>  	if (ret)
>  		goto out_attrs;
>