On 2024-08-29 17:15, Felix Kuehling
wrote:
On 2024-08-23 15:49, Philip Yang wrote:
If GPU reset kicks in while the KFD restore_process_worker is running, this may
cause different issues, for example the rcu stall warning below, because the
restore work may move BOs and evict queues under VRAM pressure.
Fix this race by taking the adev reset_domain read semaphore to prevent GPU
reset in restore_process_worker; the reset read semaphore can be taken
recursively if the adev has multiple partitions.
Then there is a livelock issue if the CP hangs while
restore_process_worker runs: GPU reset waits for the semaphore to start,
and restore_process_worker cannot finish to release the semaphore. We need
to signal the eviction fence to resolve the livelock if evicting queues returns
-ETIMEDOUT (for the MES path) or -ETIME (for the HWS path) because the CP hangs,
for example:
amdgpu 0000:af:00.0: amdgpu: GPU reset(21) succeeded!
rcu: INFO: rcu_sched self-detected stall on CPU
Workqueue: kfd_restore_wq restore_process_worker [amdgpu]
Call Trace:
update_process_times+0x94/0xd0
RIP: 0010:amdgpu_vm_handle_moved+0x9a/0x210 [amdgpu]
amdgpu_amdkfd_gpuvm_restore_process_bos+0x3d6/0x7d0 [amdgpu]
restore_process_helper+0x27/0x80 [amdgpu]
Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx>
See comments inline. I'd also like Christian to take a look at this patch since he's the expert on the reset locking stuff.
---
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 56 +++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..53a814347522 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -35,6 +35,7 @@
#include <linux/pm_runtime.h>
#include "amdgpu_amdkfd.h"
#include "amdgpu.h"
+#include "amdgpu_reset.h"
struct mm_struct;
@@ -1972,8 +1973,14 @@ static void evict_process_worker(struct work_struct *work)
kfd_process_restore_queues(p);
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
- } else
+ } else if (ret == -ETIMEDOUT || ret == -ETIME) {
+ /* If CP hangs, signal the eviction fence, then restore_bo_worker
+ * can finish to up_read GPU reset semaphore to start GPU reset.
+ */
+ signal_eviction_fence(p);
+ } else {
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
+ }
}
static int restore_process_helper(struct kfd_process *p)
@@ -1997,6 +2004,45 @@ static int restore_process_helper(struct kfd_process *p)
return ret;
}
+/*
+ * kfd_hold_devices_reset_semaphore
+ *
+ * return:
+ * true : hold reset domain semaphore to prevent device reset
+ * false: one of the device is resetting or already reset
+ *
+ */
+static bool kfd_hold_devices_reset_semaphore(struct kfd_process *p)
I find the function naming of these functions (hold/unhold) a bit weird. I'd suggest kfd_process_trylock_reset_sems/kfd_process_unlock_reset_sems.
+{
+ struct amdgpu_device *adev;
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++) {
+ adev = p->pdds[i]->dev->adev;
+ if (!down_read_trylock(&adev->reset_domain->sem))
+ goto out_upread;
+ }
+ return true;
+
+out_upread:
+ while (i--) {
+ adev = p->pdds[i]->dev->adev;
+ up_read(&adev->reset_domain->sem);
+ }
+ return false;
+}
+
+static void kfd_unhold_devices_reset_semaphore(struct kfd_process *p)
+{
+ struct amdgpu_device *adev;
+ int i;
+
+ for (i = 0; i < p->n_pdds; i++) {
+ adev = p->pdds[i]->dev->adev;
+ up_read(&adev->reset_domain->sem);
+ }
+}
+
static void restore_process_worker(struct work_struct *work)
{
struct delayed_work *dwork;
@@ -2009,6 +2055,12 @@ static void restore_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, restore_work);
+
+ if (!kfd_hold_devices_reset_semaphore(p)) {
+ pr_debug("GPU resetting, restore bo and queue skipped\n");
Should we reschedule the restore worker to make sure it runs again after the reset is done?
After GPU mode1 reset, user processes already aborted, it is meaningless to reschedule restore worker to update GPU mapping.
Regards,
Philip
Thanks,
Felix
+ return;
+ }
+
pr_debug("Started restoring pasid 0x%x\n", p->pasid);
/* Setting last_restore_timestamp before successful restoration.
@@ -2031,6 +2083,8 @@ static void restore_process_worker(struct work_struct *work)
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
kfd_process_restore_queues(p);
}
+
+ kfd_unhold_devices_reset_semaphore(p);
}
void kfd_suspend_all_processes(void)