[Public] Might be worth changing the commit headline to what the patch does, something like Don't ignore return codes Or Various return code cleanup Otherwise it sounds like your patch is intentionally ignoring return codes, which is the opposite of what you're doing. Also 2 comments below. > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Andrew > Martin > Sent: Monday, December 30, 2024 12:02 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Russell, Kent > <Kent.Russell@xxxxxxx>; Martin, Andrew <Andrew.Martin@xxxxxxx> > Subject: [PATCH v2] drm/amdkfd: Ignored various return code > > This patch checks and handles the return code of the called function. > > Signed-off-by: Andrew Martin <Andrew.Martin@xxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- > drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 ++ > drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 19 ++++++++++++++----- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 11 ++++++----- > 4 files changed, 23 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index 065d87841459..52a11dc01422 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -2864,7 +2864,7 @@ static int runtime_disable(struct kfd_process *p) > pdd->dev->vm_info.last_vmid_kfd); > > if (!pdd->dev->kfd->shared_resources.enable_mes) > - debug_refresh_runlist(pdd->dev->dqm); > + (void)debug_refresh_runlist(pdd->dev->dqm); > else > kfd_dbg_set_mes_debug_mode(pdd, > > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > index 693469c18c60..f335ed9e0b3a 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > @@ -2351,6 +2351,8 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, > if (kdev->kfd->hive_id) { > for (nid = 0; nid < proximity_domain; ++nid) { > peer_dev = > kfd_topology_device_by_proximity_domain_no_lock(nid); > + if (!peer_dev) > + return -ENODEV; > if (!peer_dev->gpu) > continue; > if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > index a8abc3091801..b4db409d90a1 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > @@ -518,6 +518,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, > uint32_t *flags) > for (i = 0; i < target->n_pdds; i++) { > struct kfd_topology_device *topo_dev = > kfd_topology_device_by_id(target->pdds[i]->dev->id); > + if (!topo_dev) > + return -EINVAL; > + > uint32_t caps = topo_dev->node_props.capability; > > if (!(caps & > HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) > && > @@ -565,9 +568,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, > uint32_t *flags) > continue; > > if (!pdd->dev->kfd->shared_resources.enable_mes) > - debug_refresh_runlist(pdd->dev->dqm); > + (void)debug_refresh_runlist(pdd->dev->dqm); > else > - kfd_dbg_set_mes_debug_mode(pdd, true); > + (void)kfd_dbg_set_mes_debug_mode(pdd, true); > } > } > > @@ -584,8 +587,8 @@ int kfd_dbg_trap_set_flags(struct kfd_process *target, > uint32_t *flags) > */ > void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int > unwind_count) > { > - int i; > > + int i, r = 0; > if (!unwind) { > uint32_t flags = 0; > int resume_count = resume_queues(target, 0, NULL); > @@ -627,9 +630,11 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, > bool unwind, int unwind > pr_err("Failed to release debug vmid on [%i]\n", pdd->dev- > >id); > > if (!pdd->dev->kfd->shared_resources.enable_mes) > - debug_refresh_runlist(pdd->dev->dqm); > + r = debug_refresh_runlist(pdd->dev->dqm); > else > - kfd_dbg_set_mes_debug_mode(pdd, > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > + r = kfd_dbg_set_mes_debug_mode(pdd, > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > + if (r) > + break; > } > > kfd_dbg_set_workaround(target, false); > @@ -1071,6 +1076,10 @@ int kfd_dbg_trap_device_snapshot(struct kfd_process > *target, > for (i = 0; i < tmp_num_devices; i++) { > struct kfd_process_device *pdd = target->pdds[i]; > struct kfd_topology_device *topo_dev = > kfd_topology_device_by_id(pdd->dev->id); > + if (!topo_dev) { > + r = EINVAL; > + break; > + } > > device_info.gpu_id = pdd->dev->id; > device_info.exception_status = pdd->exception_status; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 1405e8affd48..250aa43a39c8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1907,16 +1907,18 @@ static int start_cpsch(struct device_queue_manager > *dqm) > > static int stop_cpsch(struct device_queue_manager *dqm) > { > + int ret = 0; > dqm_lock(dqm); > if (!dqm->sched_running) { > dqm_unlock(dqm); > - return 0; > + return ret; I think it's cleaner to just leave the "return 0" . e.g. If the code gets refactored and new stuff gets added above, then this might flow differently. > } > > if (!dqm->dev->kfd->shared_resources.enable_mes) > - unmap_queues_cpsch(dqm, > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, > USE_DEFAULT_GRACE_PERIOD, false); > + ret = unmap_queues_cpsch(dqm, > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, > + 0, USE_DEFAULT_GRACE_PERIOD, false); > else > - remove_all_kfd_queues_mes(dqm); > + ret = remove_all_kfd_queues_mes(dqm); > > dqm->sched_running = false; > > @@ -1930,7 +1932,7 @@ static int stop_cpsch(struct device_queue_manager > *dqm) > dqm->detect_hang_info = NULL; > dqm_unlock(dqm); > > - return 0; > + return ret; Same here Kent > } > > static int create_kernel_queue_cpsch(struct device_queue_manager *dqm, > @@ -3439,7 +3441,6 @@ int suspend_queues(struct kfd_process *p, > else > per_device_suspended++; > } else if (err != -EBUSY) { > - r = err; > queue_ids[q_idx] |= > KFD_DBG_QUEUE_ERROR_MASK; > break; > } > -- > 2.43.0