RE: [PATCH v4 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Sent: Monday, July 8, 2024 1:06 PM
To: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Subject: RE: [PATCH v4 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support

[AMD Official Use Only - AMD Internal Distribution Only]

amdgpu_ras_get_fatal_error_event(struct amdgpu_device
        if (amdgpu_ras_intr_triggered())
                return RAS_EVENT_TYPE_FATAL;
        else
-               return RAS_EVENT_TYPE_INVALID;
+               return RAS_EVENT_TYPE_POISON_CONSUMPTION;
 }

Keep in mind that this is a temporary solution that treats poison as fatal. Moving forward, we will need to check the poison handling option to determine whether to return a poison consumption event or an invalid event here.
[Kevin]:
Yes, agreed. This is a temporary solution and will be refined later.

Best Regards,
Kevin

The series is

Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>

Regards,
Hawking
-----Original Message-----
From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
Sent: Monday, July 8, 2024 11:25
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Subject: [PATCH v4 3/4] drm/amdgpu: add ras POSION_CONSUMPTION event id support

Add amdgpu RAS POISON_CONSUMPTION event id support.

Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx>
Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c         | 16 +++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h         |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 15 ++++++++++++---
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index c7e68c5e90cd..ca09316fbb6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2076,10 +2076,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        struct amdgpu_ras_block_object *block_obj =
                amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+       u64 event_id;
+       int ret;

        if (!block_obj || !con)
                return;

+       ret = amdgpu_ras_mark_ras_event(adev, type);
+       if (ret)
+               return;
+
        /* both query_poison_status and handle_poison_consumption are optional,
         * but at least one of them should be implemented if we need poison
         * consumption handler
@@ -2104,8 +2111,10 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
         * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
         */
        if (poison_stat && !con->is_rma) {
-               dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
-                               block_obj->ras_comm.name);
+               event_id = amdgpu_ras_acquire_event_id(adev, type);
+               RAS_EVENT_LOG(adev, event_id,
+                             "GPU reset for %s RAS poison consumption is issued!\n",
+                             block_obj->ras_comm.name);
                amdgpu_ras_reset_gpu(adev);
        }

@@ -2498,7 +2507,7 @@ static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device
        if (amdgpu_ras_intr_triggered())
                return RAS_EVENT_TYPE_FATAL;
        else
-               return RAS_EVENT_TYPE_INVALID;
+               return RAS_EVENT_TYPE_POISON_CONSUMPTION;
 }

 static void amdgpu_ras_do_recovery(struct work_struct *work)
@@ -3985,6 +3994,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type
        switch (type) {
        case RAS_EVENT_TYPE_FATAL:
        case RAS_EVENT_TYPE_POISON_CREATION:
+       case RAS_EVENT_TYPE_POISON_CONSUMPTION:
                event_mgr = __get_ras_event_mgr(adev);
                if (!event_mgr)
                        return RAS_EVENT_INVALID_ID;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 69eb5fd4640f..49ec8edcbe39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -436,6 +436,7 @@ enum ras_event_type {
        RAS_EVENT_TYPE_INVALID = 0,
        RAS_EVENT_TYPE_FATAL,
        RAS_EVENT_TYPE_POISON_CREATION,
+       RAS_EVENT_TYPE_POISON_CONSUMPTION,
        RAS_EVENT_TYPE_COUNT,
 };

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 816800555f7f..8a10a0e42846 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -27,6 +27,7 @@
 #include "soc15_int.h"
 #include "kfd_device_queue_manager.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_ras.h"

 /*
  * GFX9 SQ Interrupts
@@ -144,9 +145,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                uint16_t pasid, uint16_t client_id)
 {
        enum amdgpu_ras_block block = 0;
-       int old_poison;
        uint32_t reset = 0;
        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+       enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+       u64 event_id;
+       int old_poison, ret;

        if (!p)
                return;
@@ -191,10 +194,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                return;
        }

+       ret = amdgpu_ras_mark_ras_event(dev->adev, type);
+       if (ret)
+               return;
+
        kfd_signal_poison_consumed_event(dev, pasid);

-       dev_warn(dev->adev->dev,
-                "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
+       event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
+
+       RAS_EVENT_LOG(dev->adev, event_id,
+                     "poison is consumed by client %d, kick off gpu reset flow\n", client_id);

        amdgpu_amdkfd_ras_pasid_poison_consumption_handler(dev->adev,
                block, pasid, NULL, NULL, reset);
--
2.34.1






[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux