NPS mode is introduced, the value of memory physical address (PA) related to a MCA address varies per nps mode. We need to rely on MCA address and convert it into PA accroding to nps mode. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 97 ++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 77cb2d7bb0a0..f31085c3a12d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2665,40 +2665,113 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, return 0; } +static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, + struct eeprom_table_record *bps, + struct ras_err_data *err_data) +{ + struct ta_ras_query_address_input addr_in; + uint32_t socket = 0; + int ret = 0; + + if (adev->smuio.funcs && adev->smuio.funcs->get_socket_id) + socket = adev->smuio.funcs->get_socket_id(adev); + + /* reinit err_data */ + err_data->err_addr_cnt = 0; + err_data->err_addr_len = adev->umc.retire_unit; + + memset(&addr_in, 0, sizeof(addr_in)); + addr_in.ma.err_addr = bps->address; + addr_in.ma.socket_id = socket; + addr_in.ma.ch_inst = bps->mem_channel; + /* tell RAS TA the node instance is not used */ + addr_in.ma.node_inst = TA_RAS_INV_NODE; + + if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) + ret = adev->umc.ras->convert_ras_err_addr(adev, err_data, + &addr_in, NULL, false); + + return ret; +} + /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; + struct ras_err_data err_data; + struct eeprom_table_record *err_rec; + struct amdgpu_ras_eeprom_control *control = + &adev->psp.ras_context.ras->eeprom_control; int ret = 0; - uint32_t i; + uint32_t i, j, loop_cnt = 1; + bool is_mca_add = true; if (!con || !con->eh_data || !bps || pages <= 0) return 0; + if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) { + is_mca_add = false; + } else { + if ((pages > 1) && + (bps[0].address == bps[1].address) && + (bps[0].mem_channel == bps[1].mem_channel)) + is_mca_add = false; + } + mutex_lock(&con->recovery_lock); data = con->eh_data; if (!data) goto out; - for (i = 0; i < pages; i++) { - if (amdgpu_ras_check_bad_page_unlock(con, - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) - continue; - - if (!data->space_left && - amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { + if (is_mca_add) { + err_data.err_addr = + kcalloc(adev->umc.retire_unit, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if(!err_data.err_addr) { + dev_warn(adev->dev, "Failed to alloc UMC error address record in " + "mca2pa conversion!\n"); ret = -ENOMEM; goto out; } - amdgpu_ras_reserve_page(adev, bps[i].retired_page); + loop_cnt = adev->umc.retire_unit; + } + + for (i = 0; i < pages; i++) { + if (is_mca_add) { + if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) + goto free; + + err_rec = err_data.err_addr; + } else { + err_rec = &bps[i]; + } + + for (j = 0; j < loop_cnt; j++) { + if (amdgpu_ras_check_bad_page_unlock(con, + err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + continue; + + if (!data->space_left && + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { + ret = -ENOMEM; + goto free; + } + + amdgpu_ras_reserve_page(adev, err_rec[j].retired_page); - memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps)); - data->count++; - data->space_left--; + memcpy(&data->bps[data->count], &(err_rec[j]), + sizeof(struct eeprom_table_record)); + data->count++; + data->space_left--; + } } + +free: + if (is_mca_add) + kfree(err_data.err_addr); out: mutex_unlock(&con->recovery_lock); -- 2.34.1