> That has been used before and the rcu-wait mechanism is what we use
> upstream. That revert should cause no harm.
> I would need to reproduce this to figure out if this is a generic
> problem or if the backport is bad.
> Basically speaking (and you could add printks to figure that out)
> manage_workers() should wake one task and that should be the one waiting
> in put_unbound_pool(). But this is somehow not happening based on your
> description.
>
> Sebastian

Thank you very much for the answer. I added some printks in manage_workers()
and put_unbound_pool() with the following patch:

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a9f3cc02bdc1..5aacb02b8ec5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2025,7 +2025,9 @@ static bool manage_workers(struct worker *worker)
 
 	pool->manager = NULL;
 	pool->flags &= ~POOL_MANAGER_ACTIVE;
+	printk(KERN_INFO "manage_worker call rcuwait_wake_up , created pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	rcuwait_wake_up(&manager_wait);
+	printk(KERN_INFO "manage_worker rcuwait_wake_up end, pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	return true;
 }
 
@@ -3371,20 +3373,21 @@ static void put_unbound_pool(struct worker_pool *pool)
 {
 	DECLARE_COMPLETION_ONSTACK(detach_completion);
 	struct worker *worker;
-
+	printk(KERN_INFO "put_unbound_pool starts pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	lockdep_assert_held(&wq_pool_mutex);
 
 	if (--pool->refcnt)
 		return;
-
+	printk(KERN_INFO "put_unbound_pool starts 2, pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	/* sanity checks */
 	if (WARN_ON(!(pool->cpu < 0)) ||
 	    WARN_ON(!list_empty(&pool->worklist)))
 		return;
-
+	printk(KERN_INFO "put_unbound_pool starts 3, pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	/* release id and unhash */
 	if (pool->id >= 0)
 		idr_remove(&worker_pool_idr, pool->id);
+	printk(KERN_INFO "put_unbound_pool starts 4, pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	hash_del(&pool->hash_node);
 
 	/*
@@ -3394,6 +3397,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 	 * Because of how wq_manager_inactive() works, we will hold the
 	 * spinlock after a successful wait.
 	 */
+	printk(KERN_INFO "put_unbound_pool lock pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 	raw_spin_lock_irq(&pool->lock);
 	rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
 			   TASK_UNINTERRUPTIBLE);
@@ -3403,6 +3407,7 @@ static void put_unbound_pool(struct worker_pool *pool)
 		destroy_worker(worker);
 	WARN_ON(pool->nr_workers || pool->nr_idle);
 	raw_spin_unlock_irq(&pool->lock);
+	printk(KERN_INFO "put_unbound_pool unlock pool id %d, node id %d, refcnt %d", pool->id, pool->node, pool->refcnt);
 
 	mutex_lock(&wq_pool_attach_mutex);
 	if (!list_empty(&pool->workers))

And I have the following associated traces before the lockup:

[ 45.017508] manage_worker call rcuwait_wake_up , created pool id 5, node id -1, refcnt 1
[ 45.017514] manage_worker rcuwait_wake_up end, pool id 5, node id -1, refcnt 1
[ 45.076736] put_unbound_pool starts pool id 5, node id -1, refcnt 1
[ 45.092553] put_unbound_pool starts 2, pool id 5, node id -1, refcnt 0
[ 45.099022] put_unbound_pool starts 3, pool id 5, node id -1, refcnt 0
[ 45.105762] put_unbound_pool starts 4, pool id 5, node id -1, refcnt 0

The "put_unbound_pool lock" message never shows up, so it seems that it is the line hash_del(&pool->hash_node) that never returns.
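If I read the code correctly, the handshake Sebastian describes boils down to the excerpts below (only a sketch I condensed for my own understanding, not something taken from my tree; in particular the body of wq_manager_inactive() is my guess based on the "we will hold the spinlock after a successful wait" comment, I did not copy it from the kernel I am running):

/* file-scope wait object in kernel/workqueue.c */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

/* waker side, tail of manage_workers() (first hunk above) */
	pool->manager = NULL;
	pool->flags &= ~POOL_MANAGER_ACTIVE;
	rcuwait_wake_up(&manager_wait);	/* should wake the task sleeping below */

/* condition the waiter re-checks; my guess: test POOL_MANAGER_ACTIVE under
 * pool->lock and keep the lock held when returning true */
static bool wq_manager_inactive(struct worker_pool *pool)
{
	raw_spin_lock_irq(&pool->lock);

	if (pool->flags & POOL_MANAGER_ACTIVE) {
		raw_spin_unlock_irq(&pool->lock);
		return false;
	}
	return true;
}

/* waiter side, put_unbound_pool() (third hunk above) */
	raw_spin_lock_irq(&pool->lock);
	rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool),
			   TASK_UNINTERRUPTIBLE);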
Sorry, I have zero experience in kernel hacking, but if you tell me what info should be printed to help you pinpoint the issue, I'll do it.