On Aldebaran, GPU driver will handle bad page retirement
even though UMC is host managed. As a result, register a
bad page retirement handler on the mce notifier chain to
retire bad pages on Aldebaran.

Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx>
Reviewed-by: John Clements <John.Clements@xxxxxxx>
Acked-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 154 ++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b1c57a5b6e89..02263f509b36 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,7 +34,9 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "atom.h"
+#include <asm/mce.h>
 
+static bool notifier_registered;
 static const char *RAS_FS_NAME = "ras";
 
 const char *ras_error_string[] = {
@@ -73,6 +75,11 @@ const char *ras_block_string[] = {
 /* typical ECC bad page rate(1 bad page per 100MB VRAM) */
 #define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
 
+#define GET_MCA_IPID_GPUID(m)		(((m) >> 44) & 0xF)
+#define GET_UMC_INST_NIBBLE(m)		(((m) >> 20) & 0xF)
+#define GET_CHAN_INDEX_NIBBLE(m)	(((m) >> 12) & 0xF)
+#define GPU_ID_OFFSET			8
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -85,6 +92,7 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+static void amdgpu_register_bad_pages_mca_notifier(void);
 
 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
 {
@@ -1978,6 +1986,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		goto free;
 	}
 
+	if ((adev->asic_type == CHIP_ALDEBARAN) &&
+	    (adev->gmc.xgmi.connected_to_cpu))
+		amdgpu_register_bad_pages_mca_notifier();
+
 	return 0;
 
 free:
@@ -2427,3 +2439,145 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
 		kfree(con);
 	}
 }
+
+static struct amdgpu_device *find_adev(uint32_t node_id)
+{
+	struct amdgpu_gpu_instance *gpu_instance;
+	int i;
+	struct amdgpu_device *adev = NULL;
+
+	mutex_lock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_gpu; i++) {
+		gpu_instance = &(mgpu_info.gpu_ins[i]);
+		adev = gpu_instance->adev;
+
+		if (adev->gmc.xgmi.connected_to_cpu &&
+		    adev->gmc.xgmi.physical_node_id == node_id)
+			break;
+		adev = NULL;
+	}
+
+	mutex_unlock(&mgpu_info.mutex);
+
+	return adev;
+}
+
+static void find_umc_inst_chan_index(struct mce *m, uint32_t *umc_inst,
+				     uint32_t *chan_index)
+{
+	uint32_t val1 = 0;
+	uint32_t val2 = 0;
+	uint32_t rem = 0;
+
+	/*
+	 * Bit 20-23 provides the UMC instance nibble.
+	 * Bit 12-15 provides the channel index nibble.
+	 */
+	val1 = GET_UMC_INST_NIBBLE(m->ipid);
+	val2 = GET_CHAN_INDEX_NIBBLE(m->ipid);
+
+	*umc_inst = val1/2;
+	rem = val1%2;
+
+	*chan_index = (4*rem) + val2;
+}
+
+static int amdgpu_bad_page_notifier(struct notifier_block *nb,
+				    unsigned long val, void *data)
+{
+	struct mce *m = (struct mce *)data;
+	struct amdgpu_device *adev = NULL;
+	uint32_t gpu_id = 0;
+	uint32_t umc_inst = 0;
+	uint32_t chan_index = 0;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct eeprom_table_record err_rec;
+	uint64_t retired_page;
+
+	/*
+	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
+	 * and error occurred in DramECC (Extended error code = 0) then only
+	 * process the error, else bail out.
+	 */
+	if (!m || !(is_smca_umc_v2(m->bank) && (XEC(m->status, 0x1f) == 0x0)))
+		return NOTIFY_DONE;
+
+	gpu_id = GET_MCA_IPID_GPUID(m->ipid);
+
+	/*
+	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
+	 */
+	gpu_id -= GPU_ID_OFFSET;
+
+	adev = find_adev(gpu_id);
+	if (!adev) {
+		/* adev is NULL here; do not dereference it for logging. */
+		pr_warn("%s: Unable to find adev for gpu_id: %d\n",
+			__func__, gpu_id);
+		return NOTIFY_DONE;
+	}
+
+	/*
+	 * If it is correctable error, then print a message and return.
+	 */
+	if (mce_is_correctable(m)) {
+		dev_info(adev->dev, "%s: UMC Correctable error detected.",
+			 __func__);
+		return NOTIFY_OK;
+	}
+
+	/*
+	 * If it is uncorrectable error, then find out UMC instance and
+	 * channel index.
+	 */
+	find_umc_inst_chan_index(m, &umc_inst, &chan_index);
+
+	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d,"
+		 " chan_idx: %d", umc_inst, chan_index);
+
+	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
+		       ADDR_OF_256B_BLOCK(chan_index) |
+		       OFFSET_IN_256B_BLOCK(m->addr);
+
+	err_rec.address = m->addr;
+	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+	err_rec.ts = (uint64_t)ktime_get_real_seconds();
+	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+	err_rec.cu = 0;
+	err_rec.mem_channel = chan_index;
+	err_rec.mcumc_id = umc_inst;
+
+	err_data.err_addr = &err_rec;
+	err_data.err_addr_cnt = 1;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+					 err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amdgpu_bad_page_nb = {
+	.notifier_call  = amdgpu_bad_page_notifier,
+	.priority       = MCE_PRIO_ACCEL,
+};
+
+static void amdgpu_register_bad_pages_mca_notifier(void)
+{
+	/*
+	 * Register the x86 notifier with MCE subsystem.
+	 * Please note a notifier can be registered only once
+	 * with the MCE subsystem.
+	 */
+	if (notifier_registered == false) {
+		mce_register_decode_chain(&amdgpu_bad_page_nb);
+		notifier_registered = true;
+	}
+}
-- 
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx