Re: [PATCH] drm/amdgpu: Skip execution of pending reset jobs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 11/10/2023 8:18 PM, Christian König wrote:
Am 09.11.23 um 08:38 schrieb Lijo Lazar:
cancel_work is not backported to all custom kernels.

Well, this is a pretty clear NAK to pushing this upstream. We absolutely can't add workarounds for older kernels.

You could keep this in the backported kernel, but why should cancel_work not be available?


As you know, there are vendor-maintained kernels, and not all users necessarily upgrade to a kernel which has a backport of this, as that could be a total disruption of their current environment.

Thanks,
Lijo

Regards,
Christian.

  Add a workaround to
skip execution of already-queued recovery jobs if the device has already
been reset.

Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  |  9 +++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  | 16 ++++++++++++++++
  3 files changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bebc73c6822c..c66524e2a56a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5411,6 +5411,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
  {
      struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+    amdgpu_reset_domain_clear_pending(adev->reset_domain);
+
  #if defined(CONFIG_DEBUG_FS)
      if (!amdgpu_sriov_vf(adev))
          cancel_work(&adev->reset_work);
@@ -5452,6 +5454,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
      bool audio_suspended = false;
      bool gpu_reset_for_dev_remove = false;
+    if (amdgpu_reset_domain_in_drain_mode(adev->reset_domain))
+        return 0;
+
      gpu_reset_for_dev_remove =
              test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
                  test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 4baa300121d8..3ece7267d6ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -120,6 +120,14 @@ void amdgpu_reset_destroy_reset_domain(struct kref *ref)
      kvfree(reset_domain);
  }
+static void amdgpu_reset_domain_cancel_all_work(struct work_struct *work)
+{
+    struct amdgpu_reset_domain *reset_domain =
+        container_of(work, struct amdgpu_reset_domain, clear);
+
+    reset_domain->drain = false;
+}
+
  struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
                                   char *wq_name)
  {
@@ -142,6 +150,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
      }
+    INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
      atomic_set(&reset_domain->in_gpu_reset, 0);
      atomic_set(&reset_domain->reset_res, 0);
      init_rwsem(&reset_domain->sem);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index b0335a1c5e90..70059eea7e2f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -87,6 +87,8 @@ struct amdgpu_reset_domain {
      struct rw_semaphore sem;
      atomic_t in_gpu_reset;
      atomic_t reset_res;
+    struct work_struct clear;
+    bool drain;
  };
  #ifdef CONFIG_DEV_COREDUMP
@@ -137,6 +139,20 @@ static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *doma
      return queue_work(domain->wq, work);
  }
+static inline void amdgpu_reset_domain_clear_pending(struct amdgpu_reset_domain *domain)
+{
+    domain->drain = true;
+    /* queue one more work to the domain queue. Till this work is finished,
+     * domain is in drain mode.
+     */
+    queue_work(domain->wq, &domain->clear);
+}
+
+static inline bool amdgpu_reset_domain_in_drain_mode(struct amdgpu_reset_domain *domain)
+{
+    return domain->drain;
+}
+
  void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
  void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux