[AMD Official Use Only - AMD Internal Distribution Only] -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx Sent: Friday, November 8, 2024 7:15 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: amd-gfx Digest, Vol 102, Issue 101 Send amd-gfx mailing list submissions to amd-gfx@xxxxxxxxxxxxxxxxxxxxx To subscribe or unsubscribe via the World Wide Web, visit https://lists.freedesktop.org/mailman/listinfo/amd-gfx or, via email, send a message with subject or body 'help' to amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx You can reach the person managing the list at amd-gfx-owner@xxxxxxxxxxxxxxxxxxxxx When replying, please edit your Subject line so it is more specific than "Re: Contents of amd-gfx digest..." Today's Topics: 1. [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode switch (Tao Zhou) 2. [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one memory row (Tao Zhou) 3. [PATCH 05/23] drm/amdgpu: store PA with column bits cleared for RAS bad page (Tao Zhou) ---------------------------------------------------------------------- Message: 1 Date: Fri, 8 Nov 2024 19:14:01 +0800 From: Tao Zhou <tao.zhou1@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Tao Zhou <tao.zhou1@xxxxxxx> Subject: [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode switch Message-ID: <20241108111423.60169-1-tao.zhou1@xxxxxxx> Content-Type: text/plain There are two types of gpu reset, nps mode switch and normal gpu reset, add a flag to distigush them. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 13 ++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9365b43c0055..ba9b0d322b33 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1681,6 +1681,7 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) } int amdgpu_in_reset(struct amdgpu_device *adev); +int amdgpu_in_nps_switch(struct amdgpu_device *adev); extern const struct attribute_group amdgpu_vram_mgr_attr_group; extern const struct attribute_group amdgpu_gtt_mgr_attr_group; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 286f0fdfcb50..d69fcbb28b0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* We need to lock reset domain only once both for XGMI and single device */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); - amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); + amdgpu_device_lock_reset_domain(tmp_adev); /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { @@ -6343,7 +6343,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta * Locking adev->reset_domain->sem will prevent any external access * to GPU during PCI error recovery */ - amdgpu_device_lock_reset_domain(adev->reset_domain); + amdgpu_device_lock_reset_domain(adev); amdgpu_device_set_mp1_state(adev); /* @@ -6579,6 +6579,11 @@ int amdgpu_in_reset(struct amdgpu_device *adev) return atomic_read(&adev->reset_domain->in_gpu_reset); } +int amdgpu_in_nps_switch(struct amdgpu_device *adev) { + return atomic_read(&adev->reset_domain->in_nps_switch); +} + /** * amdgpu_device_halt() - bring hardware to some kind of halt state * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index f4c08fa83756..1becf8fbbc71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -301,15 +301,25 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work); atomic_set(&reset_domain->in_gpu_reset, 0); + atomic_set(&reset_domain->in_nps_switch, 0); atomic_set(&reset_domain->reset_res, 0); init_rwsem(&reset_domain->sem); return reset_domain; } -void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) +void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev) { + struct amdgpu_reset_domain *reset_domain = adev->reset_domain; + atomic_set(&reset_domain->in_gpu_reset, 1); + /* The life time of in_nps_switch is longer than + * amdgpu_device_nps_switch_needed + */ + if (adev->nbio.funcs && adev->nbio.funcs->is_nps_switch_requested && + adev->nbio.funcs->is_nps_switch_requested(adev)) + atomic_set(&reset_domain->in_nps_switch, 1); + down_write(&reset_domain->sem); } @@ -317,6 +327,7 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) { atomic_set(&reset_domain->in_gpu_reset, 0); + atomic_set(&reset_domain->in_nps_switch, 0); up_write(&reset_domain->sem); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 977b2dd2205a..c74a1f88f0ef 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -97,6 +97,7 @@ struct amdgpu_reset_domain { enum amdgpu_reset_domain_type type; struct rw_semaphore sem; atomic_t in_gpu_reset; + atomic_t in_nps_switch; atomic_t reset_res; struct work_struct clear; bool drain; @@ -158,7 +159,7 @@ static inline bool amdgpu_reset_pending(struct amdgpu_reset_domain *domain) return rwsem_is_contended(&domain->sem); } -void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain); +void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev); void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); [Patrick] Why not change " amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)" to " amdgpu_device_unlock_reset_domain(struct amdgpu_device *adev)", to maintain the same style. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index daa69dfb4dca..8387e075c385 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1540,7 +1540,7 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work) tmp_adev = list_first_entry(&device_list, struct amdgpu_device, reset_list); - amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); + amdgpu_device_lock_reset_domain(tmp_adev); reset_context.method = AMD_RESET_METHOD_ON_INIT; reset_context.reset_req_dev = tmp_adev; -- 2.34.1 ------------------------------ Message: 2 Date: Fri, 8 Nov 2024 19:14:03 +0800 From: Tao Zhou <tao.zhou1@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Tao Zhou <tao.zhou1@xxxxxxx> Subject: [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one memory row Message-ID: <20241108111423.60169-3-tao.zhou1@xxxxxxx> Content-Type: text/plain Take R13 and column bits as a whole for UMC v12. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++--------------- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 1 + 2 files changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 1a8ea834efa6..8939b4f1fb49 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -177,7 +177,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, struct ta_ras_query_address_input *addr_in) { - uint32_t col, row, row_xor, bank, channel_index; + uint32_t col, row, bank, channel_index; uint64_t soc_pa, retired_page, column, err_addr; struct ta_ras_query_address_output addr_out; @@ -195,31 +195,27 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, channel_index = addr_out.pa.channel_idx; col = (err_addr >> 1) & 0x1fULL; - row = (err_addr >> 10) & 0x3fffULL; - row_xor = row ^ (0x1ULL << 13); /* clear [C3 C2] in soc physical address */ soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); /* clear [C4] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + /* clear [R13] in soc physical address */ + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT); - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { + /* loop for all possibilities of [R13 C4 C3 C2] */ + for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); + retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT); + /* include column bit 0 and 1 */ col &= 0x3; col |= (column << 2); - dev_info(adev->dev, - "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", - retired_page, row, col, bank, channel_index); - amdgpu_umc_fill_error_record(err_data, err_addr, - retired_page, channel_index, addr_in->ma.umc_inst); + row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL; - /* shift R13 bit */ - retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); dev_info(adev->dev, "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", - retired_page, row_xor, col, bank, channel_index); + retired_page, row, col, bank, channel_index); amdgpu_umc_fill_error_record(err_data, err_addr, retired_page, channel_index, addr_in->ma.umc_inst); } @@ -229,7 +225,7 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev, struct ta_ras_query_address_output *addr_out, uint64_t err_addr) { - uint32_t col, row, row_xor, bank, channel_index; + uint32_t col, row, bank, channel_index; uint64_t soc_pa, retired_page, column; soc_pa = addr_out->pa.pa; @@ -237,29 +233,27 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev, channel_index = addr_out->pa.channel_idx; col = (err_addr >> 1) & 0x1fULL; - row = (err_addr >> 10) & 0x3fffULL; - row_xor = row ^ (0x1ULL << 13); /* clear [C3 C2] in soc physical address */ soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); /* clear [C4] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + /* clear [R13] in soc physical address */ + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT); - /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { + /* loop for all possibilities of [R13 C4 C3 C2] */ + for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); + retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT); + /* include column bit 0 and 1 */ col &= 0x3; - col |= (column << 2); - dev_info(adev->dev, - "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", - retired_page, row, col, bank, channel_index); + col |= ((column & 0x7) << 2); + row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL; - /* shift R13 bit */ - retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); dev_info(adev->dev, "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", - retired_page, row_xor, col, bank, channel_index); + retired_page, row, col, bank, channel_index); } } @@ -274,23 +268,18 @@ static int umc_v12_0_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); /* clear [C4] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + /* clear [R13] in soc physical address */ + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT); /* loop for all possibilities of [C4 C3 C2] */ - for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { + for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); + retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT); if (pos >= len) return 0; pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; - - /* shift R13 bit */ - retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); - - if (pos >= len) - return 0; - pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; - } return pos; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index be5598d76c1d..dea42810fc53 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -60,6 +60,7 @@ #define UMC_V12_0_PA_C2_BIT 15 #define UMC_V12_0_PA_C4_BIT 21 /* row bits in SOC physical address */ +#define UMC_V12_0_PA_R0_BIT 22 #define UMC_V12_0_PA_R13_BIT 35 #define MCA_UMC_HWID_V12_0 0x96 -- 2.34.1 ------------------------------ Message: 3 Date: Fri, 8 Nov 2024 19:14:05 +0800 From: Tao Zhou <tao.zhou1@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Tao Zhou <tao.zhou1@xxxxxxx> Subject: [PATCH 05/23] drm/amdgpu: store PA with column bits cleared for RAS bad page Message-ID: <20241108111423.60169-5-tao.zhou1@xxxxxxx> Content-Type: text/plain So the code can be simplified, and no need to expose the detail of PA format outside address conversion. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index a030fed16c5a..65336ae12585 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -219,6 +219,8 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, /* clear [R13] in soc physical address */ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT); + paddr_out->pa.pa = soc_pa; + /* loop for all possibilities of [R13 C4 C3 C2] */ for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); @@ -537,7 +539,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ecc_err->status = status; ecc_err->ipid = ipid; ecc_err->addr = addr; - ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> AMDGPU_GPU_PAGE_SHIFT; + ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT; /* If converted pa_pfn is 0, use pa C4 pfn. */ if (!ecc_err->pa_pfn) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index dea42810fc53..f0074abb5381 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -82,11 +82,6 @@ (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \ (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03)) -#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \ - ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \ - (0x1ULL << UMC_V12_0_PA_C4_BIT) | \ - (0x1ULL << UMC_V12_0_PA_R13_BIT))) - bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); -- 2.34.1 ------------------------------ Subject: Digest Footer _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx ------------------------------ End of amd-gfx Digest, Vol 102, Issue 101 *****************************************