[Public] > -----Original Message----- > From: Kim, Jonathan > Sent: Wednesday, June 7, 2023 3:28 PM > To: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Huang, JinHuiEric <JinHuiEric.Huang@xxxxxxx> > Subject: RE: [PATCH] drm/amdkfd: fix and enable debugging for gfx11 > > > > > -----Original Message----- > > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > > Sent: Wednesday, June 7, 2023 2:20 PM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Kim, Jonathan > <Jonathan.Kim@xxxxxxx> > > Cc: Huang, JinHuiEric <JinHuiEric.Huang@xxxxxxx> > > Subject: Re: [PATCH] drm/amdkfd: fix and enable debugging for gfx11 > > > > > > On 2023-06-07 13:26, Jonathan Kim wrote: > > > There are a few fixes required to enable gfx11 debugging. > > > > > > First, ADD_QUEUE.trap_en is an inappropriate place to toggle > > > a per-process register so move it to SET_SHADER_DEBUGGER.trap_en. > > > When ADD_QUEUE.skip_process_ctx_clear is set, MES will prioritize > > > the SET_SHADER_DEBUGGER.trap_en setting. > > > > I see you have a firmware version check for enabling debugging. But is > > the struct SET_SHADER_DEBUGGER change safe with older firmware when > > debugging is disabled? > > Right. It changes the shape of MISC_OPs. > I'll have to figure out something that's backwards compatible. Actually, I think we should be okay. MISC_OPs allows a max data packet of 20 D-WORDs. So adding another D-WORD to SET_SHADER_DEBUGGER should be well under that limit. The writing to an unused D-WORD is likely not harmful but I can version check the trap_en setting in the MES KGD call itself just to be safe. Thanks, Jon > > > > > > > > > > > Second, to preserve correct save/restore priviledged wave states > > > in coordination with the trap enablement setting, resume suspended > > > waves early in the disable call. > > > > > > Finally, displaced single stepping can cause non-fatal illegal > > > instructions during process termination on debug disable. To work > > > around this, stall the waves prior to disable and allow clean > > > up to happen naturally on process termination. > > > > > > NOTE: The AMDGPU_MES_VERSION_MASK check is a place holder as > > > MES FW updates have been reviewed but is awaiting binary > > > creation. Once the binaries have been created, this check may > > > be subject to change. > > > > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 5 ++- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 4 ++- > > > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 1 + > > > drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 31 ++++++++++++++---- > - > > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +- > > > drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 12 ++++--- > > > drivers/gpu/drm/amd/include/mes_v11_api_def.h | 1 + > > > 7 files changed, 40 insertions(+), 17 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > > > index 20cc3fffe921..95d69f9c7361 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > > > @@ -928,7 +928,8 @@ int amdgpu_mes_set_shader_debugger(struct > > amdgpu_device *adev, > > > uint64_t process_context_addr, > > > uint32_t spi_gdbg_per_vmid_cntl, > > > const uint32_t *tcp_watch_cntl, > > > - uint32_t flags) > > > + uint32_t flags, > > > + bool trap_en) > > > { > > > struct mes_misc_op_input op_input = {0}; > > > int r; > > > @@ -945,6 +946,8 @@ int amdgpu_mes_set_shader_debugger(struct > > amdgpu_device *adev, > > > memcpy(op_input.set_shader_debugger.tcp_watch_cntl, > > tcp_watch_cntl, > > > > > sizeof(op_input.set_shader_debugger.tcp_watch_cntl)); > > > > > > + op_input.set_shader_debugger.trap_en = trap_en; > > > + > > > amdgpu_mes_lock(&adev->mes); > > > > > > r = adev->mes.funcs->misc_op(&adev->mes, &op_input); > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > > index b5f5eed2b5ef..2d6ac30b7135 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > > @@ -294,6 +294,7 @@ struct mes_misc_op_input { > > > } flags; > > > uint32_t spi_gdbg_per_vmid_cntl; > > > uint32_t tcp_watch_cntl[4]; > > > + uint32_t trap_en; > > > } set_shader_debugger; > > > }; > > > }; > > > @@ -361,7 +362,8 @@ int amdgpu_mes_set_shader_debugger(struct > > amdgpu_device *adev, > > > uint64_t process_context_addr, > > > uint32_t spi_gdbg_per_vmid_cntl, > > > const uint32_t *tcp_watch_cntl, > > > - uint32_t flags); > > > + uint32_t flags, > > > + bool trap_en); > > > > > > int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id, > > > int queue_type, int idx, > > > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > > index c4e3cb8d44de..1bdaa00c0b46 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > > @@ -347,6 +347,7 @@ static int mes_v11_0_misc_op(struct amdgpu_mes > > *mes, > > > memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl, > > > input->set_shader_debugger.tcp_watch_cntl, > > > > > sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl)); > > > + misc_pkt.set_shader_debugger.trap_en = input- > > >set_shader_debugger.trap_en; > > > break; > > > default: > > > DRM_ERROR("unsupported misc op (%d) \n", input->op); > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > > index 125274445f43..e7bc07068eed 100644 > > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > > @@ -349,12 +349,30 @@ int kfd_dbg_set_mes_debug_mode(struct > > kfd_process_device *pdd) > > > { > > > uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd- > > >spi_dbg_launch_mode; > > > uint32_t flags = pdd->process->dbg_flags; > > > + bool sq_trap_en = !!spi_dbg_cntl; > > > > > > if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) > > > return 0; > > > > > > + /* > > > + * For displaced single stepping, the debugger inserts s_trap > > instructions > > > + * from user space. > > > + * This can race with the CWSR workaround during a non-IOCTL > > disable and > > > + * cause non-fatal trailing SQ illegal instructions. > > > + * As a work around, stall waves indefinitely in this case as the > > process > > > + * queues will terminate anyways. MES will automatically clean up > > the > > > + * SPI debug HW registers when all queues are unmapped. > > > + * IOCTL disable will not hit these cases as the program needs to be > > in a > > > + * non-continued state to make the disable call in the first place so > > > + * debugger insertion of s_trap in debug memory will never occur. > > > > This comment has a lot of jargon. I don't know what "non-ioctl disable", > > "non-fatal trailing SQ illegal instructions" and "non-continued state" > > mean. Looking at the condition below, I think this is missing some > > context. I think you're trying to fix something about process > > termination (!pdd->process->mm), which is not mentioned anywhere in the > > comment, or obfuscated by jargon ("non-IOCTL disable"). I think you're > > also making some assumptions about how the debugger interacts with the > > CPU threads of the program ("program needs to be in a non-continued > > state to make the disable call"). Making assumptions about user mode is > > not a good idea. A different debugger or tool may use the debug API in > > unexpected ways. > > > > > > > + */ > > > + if (KFD_GC_VERSION(pdd->dev) >= IP_VERSION(11, 0, 0) && > > > + KFD_GC_VERSION(pdd->dev) < IP_VERSION(12, 0, 0) && > > > + !pdd->process->mm && !sq_trap_en) > > > + spi_dbg_cntl |= 0x1; /* Set > > SPI_GDBG_PER_VMID_CNTL.stall_vmid */ > > > > This looks like a HW-specific workaround in non-HW-specific code. Is > > this something that could be done in kgd_gfx_v11_disable_debug_trap? > > > > That said, if the pdd->process->mm is NULL, how can the debugger still > > access the process' memory to insert s_trap instructions? > > It was probably a race on save/restore induced by the s_trap instructions > before the mm reference was nulled. > I agree, this isn't where this is supposed to be fixed. > I retested with this block removed and everything is fine. > Looks like it was probably a HWS bug that somehow got fixed since the last > time I ran this. > I go ahead and remove this wordy workaround. > > Thanks, > > Jon > > > > > Regards, > > Felix > > > > > > > + > > > return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd- > > >proc_ctx_gpu_addr, spi_dbg_cntl, > > > - pdd->watch_points, flags); > > > + pdd->watch_points, flags, > > sq_trap_en); > > > } > > > > > > #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 > > > @@ -557,6 +575,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process > > *target, bool unwind, int unwind > > > > > > if (!unwind) { > > > uint32_t flags = 0; > > > + int resume_count = resume_queues(target, 0, NULL); > > > + > > > + if (resume_count) > > > + pr_debug("Resumed %d queues\n", resume_count); > > > > > > cancel_work_sync(&target->debug_event_workarea); > > > kfd_dbg_clear_process_address_watch(target); > > > @@ -598,13 +620,6 @@ void kfd_dbg_trap_deactivate(struct kfd_process > > *target, bool unwind, int unwind > > > } > > > > > > kfd_dbg_set_workaround(target, false); > > > - > > > - if (!unwind) { > > > - int resume_count = resume_queues(target, 0, NULL); > > > - > > > - if (resume_count) > > > - pr_debug("Resumed %d queues\n", resume_count); > > > - } > > > } > > > > > > static void kfd_dbg_clean_exception_status(struct kfd_process *target) > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > > index 0c1be91a87c6..ff0a28760494 100644 > > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > > @@ -227,8 +227,7 @@ static int add_queue_mes(struct > > device_queue_manager *dqm, struct queue *q, > > > queue_input.tba_addr = qpd->tba_addr; > > > queue_input.tma_addr = qpd->tma_addr; > > > queue_input.trap_en = KFD_GC_VERSION(q->device) < > > IP_VERSION(11, 0, 0) || > > > - KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, > > 0) || > > > - q->properties.is_dbg_wa; > > > + KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, > > 0); > > > queue_input.skip_process_ctx_clear = qpd->pqm->process- > > >debug_trap_enabled; > > > > > > queue_type = convert_to_mes_queue_type(q->properties.type); > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > > > index faa7939f35bd..90b86a6ac7bd 100644 > > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > > > @@ -1863,13 +1863,15 @@ static void > > kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *de > > > { > > > bool firmware_supported = true; > > > > > > - /* > > > - * FIXME: GFX11 FW currently not sufficient to deal with CWSR WA. > > > - * Updated FW with API changes coming soon. > > > - */ > > > if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) && > > > KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) { > > > - firmware_supported = false; > > > + uint32_t mes_api_rev = (dev->gpu->adev->mes.sched_version > > & > > > + > > AMDGPU_MES_API_VERSION_MASK) >> > > > + > > AMDGPU_MES_API_VERSION_SHIFT; > > > + uint32_t mes_rev = dev->gpu->adev->mes.sched_version & > > > + > > AMDGPU_MES_VERSION_MASK; > > > + > > > + firmware_supported = (mes_api_rev >= 14) && (mes_rev >= > > 64); > > > goto out; > > > } > > > > > > diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h > > b/drivers/gpu/drm/amd/include/mes_v11_api_def.h > > > index f3c15f18ddb5..0997e999416a 100644 > > > --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h > > > +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h > > > @@ -575,6 +575,7 @@ struct SET_SHADER_DEBUGGER { > > > } flags; > > > uint32_t spi_gdbg_per_vmid_cntl; > > > uint32_t tcp_watch_cntl[4]; /* TCP_WATCHx_CNTL */ > > > + uint32_t trap_en; > > > }; > > > > > > union MESAPI__MISC {