RE: [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode

"Xie, Patrick" <Gangliang.Xie@xxxxxxx> · Wed, 13 Nov 2024 09:03:51 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx
Sent: Friday, November 8, 2024 7:15 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: amd-gfx Digest, Vol 102, Issue 101

Send amd-gfx mailing list submissions to
        amd-gfx@xxxxxxxxxxxxxxxxxxxxx

To subscribe or unsubscribe via the World Wide Web, visit
        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
        amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx

You can reach the person managing the list at
        amd-gfx-owner@xxxxxxxxxxxxxxxxxxxxx

When replying, please edit your Subject line so it is more specific than "Re: Contents of amd-gfx digest..."


Today's Topics:

   1. [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode
      switch (Tao Zhou)
   2. [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one
      memory row (Tao Zhou)
   3. [PATCH 05/23] drm/amdgpu: store PA with column bits cleared
      for RAS bad page (Tao Zhou)


----------------------------------------------------------------------

Message: 1
Date: Fri, 8 Nov 2024 19:14:01 +0800
From: Tao Zhou <tao.zhou1@xxxxxxx>
To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Tao Zhou <tao.zhou1@xxxxxxx>
Subject: [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode
        switch
Message-ID: <20241108111423.60169-1-tao.zhou1@xxxxxxx>
Content-Type: text/plain

There are two types of gpu reset: nps mode switch and normal gpu reset. Add a flag to distinguish them.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  9 +++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c  | 13 ++++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |  2 +-
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9365b43c0055..ba9b0d322b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1681,6 +1681,7 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
 }

 int amdgpu_in_reset(struct amdgpu_device *adev);
+int amdgpu_in_nps_switch(struct amdgpu_device *adev);

 extern const struct attribute_group amdgpu_vram_mgr_attr_group;
 extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 286f0fdfcb50..d69fcbb28b0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single device */
        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
                                    reset_list);
-       amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+       amdgpu_device_lock_reset_domain(tmp_adev);

        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
@@ -6343,7 +6343,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
                 * Locking adev->reset_domain->sem will prevent any external access
                 * to GPU during PCI error recovery
                 */
-               amdgpu_device_lock_reset_domain(adev->reset_domain);
+               amdgpu_device_lock_reset_domain(adev);
                amdgpu_device_set_mp1_state(adev);

                /*
@@ -6579,6 +6579,11 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
        return atomic_read(&adev->reset_domain->in_gpu_reset);
 }

+int amdgpu_in_nps_switch(struct amdgpu_device *adev)
+{
+       return atomic_read(&adev->reset_domain->in_nps_switch);
+}
+
 /**
  * amdgpu_device_halt() - bring hardware to some kind of halt state
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index f4c08fa83756..1becf8fbbc71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -301,15 +301,25 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d

        INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
        atomic_set(&reset_domain->in_gpu_reset, 0);
+       atomic_set(&reset_domain->in_nps_switch, 0);
        atomic_set(&reset_domain->reset_res, 0);
        init_rwsem(&reset_domain->sem);

        return reset_domain;
 }

-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev)
 {
+       struct amdgpu_reset_domain *reset_domain = adev->reset_domain;
+
        atomic_set(&reset_domain->in_gpu_reset, 1);
+       /* The life time of in_nps_switch is longer than
+        * amdgpu_device_nps_switch_needed
+        */
+       if (adev->nbio.funcs && adev->nbio.funcs->is_nps_switch_requested &&
+           adev->nbio.funcs->is_nps_switch_requested(adev))
+               atomic_set(&reset_domain->in_nps_switch, 1);
+
        down_write(&reset_domain->sem);
 }

@@ -317,6 +327,7 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)
 {
        atomic_set(&reset_domain->in_gpu_reset, 0);
+       atomic_set(&reset_domain->in_nps_switch, 0);
        up_write(&reset_domain->sem);
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 977b2dd2205a..c74a1f88f0ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,7 @@ struct amdgpu_reset_domain {
        enum amdgpu_reset_domain_type type;
        struct rw_semaphore sem;
        atomic_t in_gpu_reset;
+       atomic_t in_nps_switch;
        atomic_t reset_res;
        struct work_struct clear;
        bool drain;
@@ -158,7 +159,7 @@ static inline bool amdgpu_reset_pending(struct amdgpu_reset_domain *domain)
        return rwsem_is_contended(&domain->sem);
 }

-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev);

 void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
[Patrick] Why not also change "amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)" to "amdgpu_device_unlock_reset_domain(struct amdgpu_device *adev)", to maintain the same style?

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index daa69dfb4dca..8387e075c385 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1540,7 +1540,7 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)

        tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
                                    reset_list);
-       amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+       amdgpu_device_lock_reset_domain(tmp_adev);

        reset_context.method = AMD_RESET_METHOD_ON_INIT;
        reset_context.reset_req_dev = tmp_adev;
--
2.34.1



------------------------------

Message: 2
Date: Fri, 8 Nov 2024 19:14:03 +0800
From: Tao Zhou <tao.zhou1@xxxxxxx>
To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Tao Zhou <tao.zhou1@xxxxxxx>
Subject: [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one
        memory row
Message-ID: <20241108111423.60169-3-tao.zhou1@xxxxxxx>
Content-Type: text/plain

Take R13 and column bits as a whole for UMC v12.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++---------------
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  1 +
 2 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 1a8ea834efa6..8939b4f1fb49 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -177,7 +177,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
                                        struct ras_err_data *err_data,
                                        struct ta_ras_query_address_input *addr_in)
 {
-       uint32_t col, row, row_xor, bank, channel_index;
+       uint32_t col, row, bank, channel_index;
        uint64_t soc_pa, retired_page, column, err_addr;
        struct ta_ras_query_address_output addr_out;

@@ -195,31 +195,27 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        channel_index = addr_out.pa.channel_idx;

        col = (err_addr >> 1) & 0x1fULL;
-       row = (err_addr >> 10) & 0x3fffULL;
-       row_xor = row ^ (0x1ULL << 13);
        /* clear [C3 C2] in soc physical address */
        soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
        /* clear [C4] in soc physical address */
        soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+       /* clear [R13] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);

-       /* loop for all possibilities of [C4 C3 C2] */
-       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+       /* loop for all possibilities of [R13 C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
                retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
                retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+               retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);
+
                /* include column bit 0 and 1 */
                col &= 0x3;
                col |= (column << 2);
-               dev_info(adev->dev,
-                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
-                       retired_page, row, col, bank, channel_index);
-               amdgpu_umc_fill_error_record(err_data, err_addr,
-                       retired_page, channel_index, addr_in->ma.umc_inst);
+               row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL;

-               /* shift R13 bit */
-               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
                dev_info(adev->dev,
                        "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
-                       retired_page, row_xor, col, bank, channel_index);
+                       retired_page, row, col, bank, channel_index);
                amdgpu_umc_fill_error_record(err_data, err_addr,
                        retired_page, channel_index, addr_in->ma.umc_inst);
        }
@@ -229,7 +225,7 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
                                struct ta_ras_query_address_output *addr_out,
                                uint64_t err_addr)
 {
-       uint32_t col, row, row_xor, bank, channel_index;
+       uint32_t col, row, bank, channel_index;
        uint64_t soc_pa, retired_page, column;

        soc_pa = addr_out->pa.pa;
@@ -237,29 +233,27 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
        channel_index = addr_out->pa.channel_idx;

        col = (err_addr >> 1) & 0x1fULL;
-       row = (err_addr >> 10) & 0x3fffULL;
-       row_xor = row ^ (0x1ULL << 13);
        /* clear [C3 C2] in soc physical address */
        soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
        /* clear [C4] in soc physical address */
        soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+       /* clear [R13] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);

-       /* loop for all possibilities of [C4 C3 C2] */
-       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+       /* loop for all possibilities of [R13 C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
                retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
                retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+               retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);
+
                /* include column bit 0 and 1 */
                col &= 0x3;
-               col |= (column << 2);
-               dev_info(adev->dev,
-                       "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
-                       retired_page, row, col, bank, channel_index);
+               col |= ((column & 0x7) << 2);
+               row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL;

-               /* shift R13 bit */
-               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
                dev_info(adev->dev,
                        "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
-                       retired_page, row_xor, col, bank, channel_index);
+                       retired_page, row, col, bank, channel_index);
        }
 }

@@ -274,23 +268,18 @@ static int umc_v12_0_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
        soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
        /* clear [C4] in soc physical address */
        soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+       /* clear [R13] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);

        /* loop for all possibilities of [C4 C3 C2] */
-       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+       for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
                retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
                retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+               retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);

                if (pos >= len)
                        return 0;
                pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-
-               /* shift R13 bit */
-               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
-
-               if (pos >= len)
-                       return 0;
-               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-
        }

        return pos;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index be5598d76c1d..dea42810fc53 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -60,6 +60,7 @@
 #define UMC_V12_0_PA_C2_BIT 15
 #define UMC_V12_0_PA_C4_BIT 21
 /* row bits in SOC physical address */
+#define UMC_V12_0_PA_R0_BIT 22
 #define UMC_V12_0_PA_R13_BIT 35

 #define MCA_UMC_HWID_V12_0     0x96
--
2.34.1



------------------------------

Message: 3
Date: Fri, 8 Nov 2024 19:14:05 +0800
From: Tao Zhou <tao.zhou1@xxxxxxx>
To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Tao Zhou <tao.zhou1@xxxxxxx>
Subject: [PATCH 05/23] drm/amdgpu: store PA with column bits cleared
        for RAS bad page
Message-ID: <20241108111423.60169-5-tao.zhou1@xxxxxxx>
Content-Type: text/plain

Store the PA with the column bits already cleared so that the code can be
simplified and the details of the PA format need not be exposed outside
address conversion.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 4 +++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index a030fed16c5a..65336ae12585 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -219,6 +219,8 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
        /* clear [R13] in soc physical address */
        soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);

+       paddr_out->pa.pa = soc_pa;
+
        /* loop for all possibilities of [R13 C4 C3 C2] */
        for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
                retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
@@ -537,7 +539,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        ecc_err->status = status;
        ecc_err->ipid = ipid;
        ecc_err->addr = addr;
-       ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> AMDGPU_GPU_PAGE_SHIFT;
+       ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;

        /* If converted pa_pfn is 0, use pa C4 pfn. */
        if (!ecc_err->pa_pfn)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index dea42810fc53..f0074abb5381 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -82,11 +82,6 @@
        (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
         (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))

-#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \
-       ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \
-                       (0x1ULL << UMC_V12_0_PA_C4_BIT) | \
-                       (0x1ULL << UMC_V12_0_PA_R13_BIT)))
-
 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
 bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
--
2.34.1



------------------------------

Subject: Digest Footer

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


------------------------------

End of amd-gfx Digest, Vol 102, Issue 101
*****************************************