On 4/28/2024 12:38 PM, YiPeng Chai wrote: > Add mutex to protect ras shared memory. > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 121 ++++++++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 2 + > 3 files changed, 84 insertions(+), 40 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > index 5583e2d1b12f..fa4fea00f6b4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > @@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context *psp) > } > } > > +static int psp_ras_send_cmd(struct psp_context *psp, > + enum ras_command cmd_id, void *in, void *out) > +{ > + struct ta_ras_shared_memory *ras_cmd; > + uint32_t cmd = cmd_id; > + int ret = 0; > + > + mutex_lock(&psp->ras_context.mutex); > + ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; > + memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); > + > + switch (cmd) { > + case TA_RAS_COMMAND__ENABLE_FEATURES: > + case TA_RAS_COMMAND__DISABLE_FEATURES: > + memcpy(&ras_cmd->ras_in_message, > + in, sizeof(ras_cmd->ras_in_message)); > + break; > + case TA_RAS_COMMAND__TRIGGER_ERROR: > + memcpy(&ras_cmd->ras_in_message.trigger_error, > + in, sizeof(ras_cmd->ras_in_message.trigger_error)); > + break; > + case TA_RAS_COMMAND__QUERY_ADDRESS: > + memcpy(&ras_cmd->ras_in_message.address, > + in, sizeof(ras_cmd->ras_in_message.address)); > + break; > + default: > + dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd); > + ret = -EINVAL; > + goto err_out; > + } > + > + ras_cmd->cmd_id = cmd; > + ret = psp_ras_invoke(psp, ras_cmd->cmd_id); > + > + switch (cmd) { > + case TA_RAS_COMMAND__TRIGGER_ERROR: > + if (out) { As NULL check for 'out' is done before copying, better to do the same for 'in' also. > + uint32_t *ras_status = (uint32_t *)out; > + > + *ras_status = ras_cmd->ras_status; > + } > + break; > + case TA_RAS_COMMAND__QUERY_ADDRESS: > + if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status) > + ret = -EINVAL; > + else if (out) > + memcpy(out, > + &ras_cmd->ras_out_message.address, > + sizeof(ras_cmd->ras_out_message.address)); > + break; > + default: > + break; > + } > + > +err_out: > + mutex_unlock(&psp->ras_context.mutex); > + > + return ret; > +} > + > int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) > { > struct ta_ras_shared_memory *ras_cmd; > @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) > int psp_ras_enable_features(struct psp_context *psp, > union ta_ras_cmd_input *info, bool enable) > { > - struct ta_ras_shared_memory *ras_cmd; > + enum ras_command cmd_id; > int ret; > > - if (!psp->ras_context.context.initialized) > + if (!psp->ras_context.context.initialized || !info) > return -EINVAL; > > - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; > - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); > - > - if (enable) > - ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES; > - else > - ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES; > - > - ras_cmd->ras_in_message = *info; > - > - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); > + cmd_id = enable ? > + TA_RAS_COMMAND__ENABLE_FEATURES : TA_RAS_COMMAND__DISABLE_FEATURES; > + ret = psp_ras_send_cmd(psp, cmd_id, info, NULL); > if (ret) > return -EINVAL; > > @@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp) > > psp->ras_context.context.initialized = false; > > + mutex_destroy(&psp->ras_context.mutex); > + > return ret; > } > > @@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp) > > ret = psp_ta_load(psp, &psp->ras_context.context); > > - if (!ret && !ras_cmd->ras_status) > + if (!ret && !ras_cmd->ras_status) { > psp->ras_context.context.initialized = true; > - else { > + mutex_init(&psp->ras_context.mutex); > + } else { > if (ras_cmd->ras_status) > dev_warn(adev->dev, "RAS Init Status: 0x%X\n", ras_cmd->ras_status); > > @@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp) > int psp_ras_trigger_error(struct psp_context *psp, > struct ta_ras_trigger_error_input *info, uint32_t instance_mask) > { > - struct ta_ras_shared_memory *ras_cmd; > struct amdgpu_device *adev = psp->adev; > int ret; > uint32_t dev_mask; > + uint32_t ras_status; > > - if (!psp->ras_context.context.initialized) > + if (!psp->ras_context.context.initialized || !info) > return -EINVAL; > > switch (info->block_id) { > @@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp, > dev_mask &= AMDGPU_RAS_INST_MASK; > info->sub_block_index |= dev_mask; > > - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; > - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); > - > - ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR; > - ras_cmd->ras_in_message.trigger_error = *info; > - > - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); > + ret = psp_ras_send_cmd(psp, > + TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status); > if (ret) > return -EINVAL; > > @@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp, > if (amdgpu_ras_intr_triggered()) > return 0; > > - if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED) > + if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED) > return -EACCES; > - else if (ras_cmd->ras_status) > + else if (ras_status) > return -EINVAL; > > return 0; > @@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp, > struct ta_ras_query_address_input *addr_in, > struct ta_ras_query_address_output *addr_out) > { > - struct ta_ras_shared_memory *ras_cmd; > int ret; > > - if (!psp->ras_context.context.initialized) > - return -EINVAL; > - > - ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; > - memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); > - > - ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS; > - ras_cmd->ras_in_message.address = *addr_in; > - > - ret = psp_ras_invoke(psp, ras_cmd->cmd_id); > - if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status) > + if (!psp->ras_context.context.initialized || > + !addr_in || !addr_out) > return -EINVAL; > > - *addr_out = ras_cmd->ras_out_message.address; > + ret = psp_ras_send_cmd(psp, > + TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out); return psp_ras_send_cmd() will do. > > - return 0; > + return ret; > } > // ras end > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h > index ee16f134ae92..686023918ce3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h > @@ -197,6 +197,7 @@ struct psp_xgmi_context { > struct psp_ras_context { > struct ta_context context; > struct amdgpu_ras *ras; > + struct mutex mutex; > }; > > #define MEM_TRAIN_SYSTEM_SIGNATURE 0x54534942 > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c > index ca5c86e5f7cd..87f213f92d83 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c > @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size > > context->session_id = ta_id; > > + mutex_lock(&psp->ras_context.mutex); > ret = prep_ta_mem_context(&context->mem_context, shared_buf, shared_buf_len); > if (ret) > goto err_free_shared_buf; > @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf, size > ret = -EFAULT; > > err_free_shared_buf: This error label is used at other places as well and that happens even before acquiring the mutex. Thanks, Lijo > + mutex_unlock(&psp->ras_context.mutex); > kfree(shared_buf); > > return ret;