[Public] > -----Original Message----- > From: Russell, Kent <Kent.Russell@xxxxxxx> > Sent: Monday, December 30, 2024 12:44 PM > To: Martin, Andrew <Andrew.Martin@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Martin, Andrew > <Andrew.Martin@xxxxxxx> > Subject: RE: [PATCH v2] drm/amdkfd: Ignored various return code > > [Public] > > Might be worth changing the commit headline to what the patch does, > something like Don't ignore return codes Or Various return code cleanup > > Otherwise it sounds like your patch is intentionally ignoring return codes, > which is the opposite of what you're doing. Also 2 comments below. Thanks, will do!, Please see response to your 2 comments also! > > > -----Original Message----- > > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > > Andrew Martin > > Sent: Monday, December 30, 2024 12:02 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > Cc: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; Russell, Kent > > <Kent.Russell@xxxxxxx>; Martin, Andrew <Andrew.Martin@xxxxxxx> > > Subject: [PATCH v2] drm/amdkfd: Ignored various return code > > > > This patch checks and handles the return code of the called function. > > > > Signed-off-by: Andrew Martin <Andrew.Martin@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- > > drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 2 ++ > > drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 19 ++++++++++++++----- > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 11 ++++++----- > > 4 files changed, 23 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > index 065d87841459..52a11dc01422 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > @@ -2864,7 +2864,7 @@ static int runtime_disable(struct kfd_process *p) > > > > pdd->dev->vm_info.last_vmid_kfd); > > > > if (!pdd->dev->kfd->shared_resources.enable_mes) > > - debug_refresh_runlist(pdd->dev->dqm); > > + > > + (void)debug_refresh_runlist(pdd->dev->dqm); > > else > > kfd_dbg_set_mes_debug_mode(pdd, > > > > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > > index 693469c18c60..f335ed9e0b3a 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c > > @@ -2351,6 +2351,8 @@ static int kfd_create_vcrat_image_gpu(void > *pcrat_image, > > if (kdev->kfd->hive_id) { > > for (nid = 0; nid < proximity_domain; ++nid) { > > peer_dev = > > kfd_topology_device_by_proximity_domain_no_lock(nid); > > + if (!peer_dev) > > + return -ENODEV; > > if (!peer_dev->gpu) > > continue; > > if (peer_dev->gpu->kfd->hive_id != > > kdev->kfd->hive_id) diff --git > > a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > index a8abc3091801..b4db409d90a1 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > @@ -518,6 +518,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process > > *target, uint32_t *flags) > > for (i = 0; i < target->n_pdds; i++) { > > struct kfd_topology_device *topo_dev = > > > > kfd_topology_device_by_id(target->pdds[i]->dev->id); > > + if (!topo_dev) > > + return -EINVAL; > > + > > uint32_t caps = topo_dev->node_props.capability; > > > > if (!(caps & > > HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED) > > && > > @@ -565,9 +568,9 @@ int kfd_dbg_trap_set_flags(struct kfd_process > > *target, uint32_t *flags) > > continue; > > > > if (!pdd->dev->kfd->shared_resources.enable_mes) > > - debug_refresh_runlist(pdd->dev->dqm); > > + > > + (void)debug_refresh_runlist(pdd->dev->dqm); > > else > > - kfd_dbg_set_mes_debug_mode(pdd, true); > > + (void)kfd_dbg_set_mes_debug_mode(pdd, > > + true); > > } > > } > > > > @@ -584,8 +587,8 @@ int kfd_dbg_trap_set_flags(struct kfd_process > > *target, uint32_t *flags) > > */ > > void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, > > int > > unwind_count) > > { > > - int i; > > > > + int i, r = 0; > > if (!unwind) { > > uint32_t flags = 0; > > int resume_count = resume_queues(target, 0, NULL); @@ > > -627,9 +630,11 @@ void kfd_dbg_trap_deactivate(struct kfd_process > > *target, bool unwind, int unwind > > pr_err("Failed to release debug vmid on [%i]\n", > > pdd->dev- > > >id); > > > > if (!pdd->dev->kfd->shared_resources.enable_mes) > > - debug_refresh_runlist(pdd->dev->dqm); > > + r = debug_refresh_runlist(pdd->dev->dqm); > > else > > - kfd_dbg_set_mes_debug_mode(pdd, > > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > > + r = kfd_dbg_set_mes_debug_mode(pdd, > > !kfd_dbg_has_cwsr_workaround(pdd->dev)); > > + if (r) > > + break; > > } > > > > kfd_dbg_set_workaround(target, false); @@ -1071,6 +1076,10 @@ > > int kfd_dbg_trap_device_snapshot(struct kfd_process *target, > > for (i = 0; i < tmp_num_devices; i++) { > > struct kfd_process_device *pdd = target->pdds[i]; > > struct kfd_topology_device *topo_dev = > > kfd_topology_device_by_id(pdd->dev->id); > > + if (!topo_dev) { > > + r = EINVAL; > > + break; > > + } > > > > device_info.gpu_id = pdd->dev->id; > > device_info.exception_status = pdd->exception_status; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > index 1405e8affd48..250aa43a39c8 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > @@ -1907,16 +1907,18 @@ static int start_cpsch(struct > > device_queue_manager > > *dqm) > > > > static int stop_cpsch(struct device_queue_manager *dqm) { > > + int ret = 0; > > dqm_lock(dqm); > > if (!dqm->sched_running) { > > dqm_unlock(dqm); > > - return 0; > > + return ret; I can agree with this one, since it minimizes the changes, but it does flow the same. > > I think it's cleaner to just leave the "return 0" . e.g. If the code gets refactored > and new stuff gets added above, then this might flow differently. > > > > } > > > > if (!dqm->dev->kfd->shared_resources.enable_mes) > > - unmap_queues_cpsch(dqm, > > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, > USE_DEFAULT_GRACE_PERIOD, > > false); > > + ret = unmap_queues_cpsch(dqm, > > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, > > + 0, USE_DEFAULT_GRACE_PERIOD, > > + false); > > else > > - remove_all_kfd_queues_mes(dqm); > > + ret = remove_all_kfd_queues_mes(dqm); > > > > dqm->sched_running = false; > > > > @@ -1930,7 +1932,7 @@ static int stop_cpsch(struct > > device_queue_manager > > *dqm) > > dqm->detect_hang_info = NULL; > > dqm_unlock(dqm); > > > > - return 0; > > + return ret; > > Same here > If I change this back to "return 0 ;" then I would have lost the return value from "unmap_queues_cpsch()" or "remove_all_kfd_queues_mes()" which was the whole point of touching this function. But since we in the "stop_cpch()" perhaps it is a more optimal solution is to ignore the return code with ... (void)unmap_queues_cpsch() else (void)remove_all_kfd_queues_mes() ... Which would be the only change here. Otherwise, I would need to introduce a 3rd return statement. So do let me know which path complements the design more. > Kent > > > } > > > > static int create_kernel_queue_cpsch(struct device_queue_manager > > *dqm, @@ -3439,7 +3441,6 @@ int suspend_queues(struct kfd_process *p, > > else > > per_device_suspended++; > > } else if (err != -EBUSY) { > > - r = err; > > queue_ids[q_idx] |= > > KFD_DBG_QUEUE_ERROR_MASK; > > break; > > } > > -- > > 2.43.0 >