On 04/04/2018 09:38 PM, Xiubo Li wrote:
> On 2018/4/5 8:47, Mike Christie wrote:
>> On 04/02/2018 06:42 AM, xiubli@xxxxxxxxxx wrote:
>>> From: Xiubo Li <xiubli@xxxxxxxxxx>
>>>
>>> This patch adds one tcmu attr to reset and complete all the blocked
>>> netlink waiting threads. It is used when a userspace daemon like
>>> tcmu-runner has crashed or been forced to shut down just before
>>> replying to the kernel's netlink requests; the netlink requesting
>>> threads would otherwise get stuck forever, and rebooting the machine
>>> would be the only way to recover. With this attr a reboot is no
>>> longer required.
>>>
>>> To be safe, the netlink reset operation should be done before the
>>> userspace daemon can receive and handle netlink requests.
>>>
>>> Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
>>> ---
>>>  drivers/target/target_core_user.c | 99 ++++++++++++++++++++++++++++++++++++---
>>>  1 file changed, 93 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
>>> index 4ad89ea..dc8879d 100644
>>> --- a/drivers/target/target_core_user.c
>>> +++ b/drivers/target/target_core_user.c
>>> @@ -103,9 +103,13 @@ struct tcmu_hba {
>>>  #define TCMU_CONFIG_LEN 256
>>> +static spinlock_t nl_complete_lock;
>>> +static struct idr complete_wait_udevs = IDR_INIT;
>>> +
>>>  struct tcmu_nl_cmd {
>>>  	/* wake up thread waiting for reply */
>>> -	struct completion complete;
>>> +	bool complete;
>>> +
>>>  	int cmd;
>>>  	int status;
>>>  };
>>> @@ -159,12 +163,17 @@ struct tcmu_dev {
>>>  	spinlock_t nl_cmd_lock;
>>>  	struct tcmu_nl_cmd curr_nl_cmd;
>>> -	/* wake up threads waiting on curr_nl_cmd */
>>> +	/* wake up threads waiting on nl_cmd_wq */
>>>  	wait_queue_head_t nl_cmd_wq;
>>> +	/* complete thread waiting complete_wq */
>>> +	wait_queue_head_t complete_wq;
>>> +
>>>  	char dev_config[TCMU_CONFIG_LEN];
>>>  	int nl_reply_supported;
>>> +
>>> +	uint32_t dev_id;
>>>  };
>>>  #define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev)
>>> @@ -251,6 +260,56 @@ static int tcmu_get_global_max_data_area(char *buffer,
>>>  	 "Max MBs allowed to be allocated to all the tcmu device's "
>>>  	 "data areas.");
>>> +static void tcmu_complete_wake_up(struct tcmu_dev *udev)
>>> +{
>>> +	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
>>> +
>>> +	spin_lock(&nl_complete_lock);
>>> +	nl_cmd->complete = true;
>>> +	wake_up(&udev->complete_wq);
>>> +	spin_unlock(&nl_complete_lock);
>>> +}
>>> +
>>> +static void tcmu_complete_wake_up_all(void)
>>> +{
>>> +	struct tcmu_nl_cmd *nl_cmd;
>>> +	struct tcmu_dev *udev;
>>> +	int i;
>>> +
>>> +	spin_lock(&nl_complete_lock);
>>> +	idr_for_each_entry(&complete_wait_udevs, udev, i) {
>>> +		nl_cmd = &udev->curr_nl_cmd;
>>> +		nl_cmd->complete = true;
>>> +		wake_up(&udev->complete_wq);
>>> +	}
>>> +	spin_unlock(&nl_complete_lock);
>>> +}
>>> +
>>> +static int tcmu_complete_wait(struct tcmu_dev *udev)
>>> +{
>>> +	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
>>> +	uint32_t dev_id;
>>> +
>>> +	spin_lock(&nl_complete_lock);
>>> +	dev_id = idr_alloc(&complete_wait_udevs, udev, 1, USHRT_MAX, GFP_NOWAIT);
>>> +	if (dev_id < 0) {
>>> +		pr_err("tcmu: Could not allocate dev id.\n");
>>> +		return dev_id;
>>> +	}
>>> +	udev->dev_id = dev_id;
>>
>> dev_id is never used.
>
> It will be used when the device is being removed.

Ah yeah, you are right. Bad comment on my part. I was thinking/writing
wrt if we used a list or helper around devices_idr.

>
>> I think if you just wanted to loop over all the devices you could just
>> use a list.
>>
>> Or,
>>
>> Just add a helper around target_core_device.c:devices_idr that just
>> gives you the tcmu devices.
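For illustration, a minimal sketch of what such a devices_idr helper
might look like. devices_idr and device_mutex are the existing statics
in target_core_device.c, but the function, its name, and its signature
below are hypothetical, not an existing API:

	/*
	 * Hypothetical helper in target_core_device.c: walk all
	 * registered se_devices and call fn() on those owned by the
	 * given backend.
	 */
	void target_for_each_backend_device(const struct target_backend_ops *ops,
					    void (*fn)(struct se_device *dev))
	{
		struct se_device *dev;
		int id;

		mutex_lock(&device_mutex);
		idr_for_each_entry(&devices_idr, dev, id) {
			/* Hand back only this backend's devices. */
			if (dev->transport == ops)
				fn(dev);
		}
		mutex_unlock(&device_mutex);
	}

tcmu could then pass &tcmu_ops and a callback that does the wake up,
instead of keeping its own complete_wait_udevs idr in sync.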
>>> +	spin_unlock(&nl_complete_lock);
>>> +
>>> +	pr_debug("sleeping for nl reply\n");
>>> +	wait_event(udev->complete_wq, nl_cmd->complete);
>>
>> I don't think you will need the complete field then or this function.
>>
>>> +
>>> +	spin_lock(&nl_complete_lock);
>>> +	nl_cmd->complete = false;
>>> +	idr_remove(&complete_wait_udevs, dev_id);
>>> +	spin_unlock(&nl_complete_lock);
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  /* multicast group */
>>>  enum tcmu_multicast_groups {
>>>  	TCMU_MCGRP_CONFIG,
>>> @@ -311,7 +370,7 @@ static int tcmu_genl_cmd_done(struct genl_info *info, int completed_cmd)
>>>  	if (!is_removed)
>>>  		target_undepend_item(&dev->dev_group.cg_item);
>>>  	if (!ret)
>>> -		complete(&nl_cmd->complete);
>>> +		tcmu_complete_wake_up(udev);
>>>  	return ret;
>>>  }
>>> @@ -1258,6 +1317,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
>>>  	timer_setup(&udev->cmd_timer, tcmu_cmd_timedout, 0);
>>>  	init_waitqueue_head(&udev->nl_cmd_wq);
>>> +	init_waitqueue_head(&udev->complete_wq);
>>>  	spin_lock_init(&udev->nl_cmd_lock);
>>>  	INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
>>> @@ -1462,7 +1522,11 @@ static void tcmu_dev_call_rcu(struct rcu_head *p)
>>>  	kfree(udev->uio_info.name);
>>>  	kfree(udev->name);
>>> +
>>> +	spin_lock(&nl_complete_lock);
>>> +	idr_remove(&complete_wait_udevs, udev->dev_id);
>>>  	kfree(udev);
>>> +	spin_unlock(&nl_complete_lock);
>>>  }
>>>  static int tcmu_check_and_free_pending_cmd(struct tcmu_cmd *cmd)
>>> @@ -1555,7 +1619,6 @@ static void tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd)
>>>  	memset(nl_cmd, 0, sizeof(*nl_cmd));
>>>  	nl_cmd->cmd = cmd;
>>> -	init_completion(&nl_cmd->complete);
>>>  	spin_unlock(&udev->nl_cmd_lock);
>>>  }
>>> @@ -1572,8 +1635,9 @@ static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev)
>>>  	if (udev->nl_reply_supported <= 0)
>>>  		return 0;
>>> -	pr_debug("sleeping for nl reply\n");
>>> -	wait_for_completion(&nl_cmd->complete);
>>> +	ret = tcmu_complete_wait(udev);
>>> +	if (ret)
>>> +		return ret;
>>>  	spin_lock(&udev->nl_cmd_lock);
>>>  	nl_cmd->cmd = TCMU_CMD_UNSPEC;
>>> @@ -2323,6 +2387,26 @@ static ssize_t tcmu_block_dev_store(struct config_item *item, const char *page,
>>>  	}
>>>  CONFIGFS_ATTR(tcmu_, block_dev);
>>> +static ssize_t tcmu_reset_netlink_store(struct config_item *item, const char *page,
>>> +					size_t count)
>>> +{
>>> +	u8 val;
>>> +	int ret;
>>> +
>>> +	ret = kstrtou8(page, 0, &val);
>>> +	if (ret < 0)
>>> +		return ret;
>>> +
>>> +	if (val != 1) {
>>> +		pr_err("Invalid block value %d\n", val);
>>> +		return -EINVAL;
>>> +	}
>>> +
>>> +	tcmu_complete_wake_up_all();
>>> +	return count;
>>> +}
>>> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
>>
>> If it's on the device it should only reset the device it's on, so if 2
>> daemons/apps are managing different devices it doesn't mess up the
>> other.
>>
>> Or we could just assume that there is only 1 daemon type and just do a
>> global attr at the module level. Probably just the per device is best
>> in case we end up with people running gluster + qemu-tcmu and ceph +
>> tcmu-runner.
>>
>> If you do the per device then you can just take the insides of
>> tcmu_genl_cmd_done and make it into a helper, so that you can do the
>> refcount/target_undepend_item properly and it would do the wake up. In
>> the reset configfs function, then grab the nl_cmd_lock, set the
>> curr_nl_cmd status to some error or pass it into the helper, and then
>> call your helper which does the common stuff.
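As a rough sketch of that per-device shape, assuming the original
struct completion in tcmu_nl_cmd is kept (per the comment above that
the bool complete field would then be unnecessary). The helper name
tcmu_nl_cmd_complete and the -EINTR status are assumptions, and the
refcount/target_undepend_item handling from tcmu_genl_cmd_done is
elided:

	/*
	 * Hypothetical shared helper: the genl reply path and the
	 * configfs reset path would both funnel through this. The
	 * undepend/refcount handling is omitted for brevity.
	 */
	static void tcmu_nl_cmd_complete(struct tcmu_dev *udev, int status)
	{
		struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;

		spin_lock(&udev->nl_cmd_lock);
		nl_cmd->status = status;
		spin_unlock(&udev->nl_cmd_lock);

		/* Wake only this device's waiter. */
		complete(&nl_cmd->complete);
	}

	static ssize_t tcmu_reset_netlink_store(struct config_item *item,
						const char *page, size_t count)
	{
		struct se_device *se_dev = container_of(to_config_group(item),
							struct se_device,
							dev_action_group);
		struct tcmu_dev *udev = TCMU_DEV(se_dev);

		/*
		 * Fail the stuck request on this device only; -EINTR is
		 * just an illustrative status, and the kstrtou8 input
		 * validation from the patch above is omitted here.
		 */
		tcmu_nl_cmd_complete(udev, -EINTR);
		return count;
	}

This keeps a crashed gluster/qemu-tcmu daemon's reset from completing
requests owned by a ceph/tcmu-runner daemon on another device.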
> I thought there would only be 1 daemon in user space at any given
> time, so to simplify things resetting one device would also reset all
> the others; otherwise we would need to reset all the devices one by
> one.

I think the most common will be the 1 daemon type setup (it would be
rare to run qemu-tcmu and tcmu-runner at the same time), but we could
have multiple daemon instances of the same type due to containers in
the future.

I think we want it per device to match the other code, like ring reset
and nl_reply_supported for example. We then get support for everything
very cheaply.

> If so, I will just do the per device resetting; the patch will then
> be very simple.

>>> +
>>>  static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>>>  				     size_t count)
>>>  {
>>> @@ -2363,6 +2447,7 @@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>>>  static struct configfs_attribute *tcmu_action_attrs[] = {
>>>  	&tcmu_attr_block_dev,
>>>  	&tcmu_attr_reset_ring,
>>> +	&tcmu_attr_reset_netlink,
>>>  	NULL,
>>>  };
>>> @@ -2519,6 +2604,8 @@ static int __init tcmu_module_init(void)
>>>  	}
>>>  	tcmu_ops.tb_dev_attrib_attrs = tcmu_attrs;
>>> +	spin_lock_init(&nl_complete_lock);
>>> +
>>>  	ret = transport_backend_register(&tcmu_ops);
>>>  	if (ret)
>>>  		goto out_attrs;
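For completeness, a note on how a per-device attribute would be driven
from userspace: the tcmu action attrs live in the device's configfs
action directory, so after a daemon crash the recovery step would be
along the lines of
`echo 1 > /sys/kernel/config/target/core/user_0/<dev>/action/reset_netlink`
(the path is illustrative; it mirrors the existing block_dev and
reset_ring action files registered in tcmu_action_attrs above).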