RE: [PATCH] drm/amdkfd: fix and enable debugging for gfx11

"Kim, Jonathan" <Jonathan.Kim@xxxxxxx> · Wed, 7 Jun 2023 19:57:34 +0000

[Public]

> -----Original Message-----
> From: Kim, Jonathan
> Sent: Wednesday, June 7, 2023 3:28 PM
> To: Kuehling, Felix <Felix.Kuehling@xxxxxxx>; amd-
> gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Huang, JinHuiEric <JinHuiEric.Huang@xxxxxxx>
> Subject: RE: [PATCH] drm/amdkfd: fix and enable debugging for gfx11
>
>
>
> > -----Original Message-----
> > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
> > Sent: Wednesday, June 7, 2023 2:20 PM
> > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Kim, Jonathan
> <Jonathan.Kim@xxxxxxx>
> > Cc: Huang, JinHuiEric <JinHuiEric.Huang@xxxxxxx>
> > Subject: Re: [PATCH] drm/amdkfd: fix and enable debugging for gfx11
> >
> >
> > On 2023-06-07 13:26, Jonathan Kim wrote:
> > > There are a few fixes required to enable gfx11 debugging.
> > >
> > > First, ADD_QUEUE.trap_en is an inappropriate place to toggle
> > > a per-process register so move it to SET_SHADER_DEBUGGER.trap_en.
> > > When ADD_QUEUE.skip_process_ctx_clear is set, MES will prioritize
> > > the SET_SHADER_DEBUGGER.trap_en setting.
> >
> > I see you have a firmware version check for enabling debugging. But is
> > the struct SET_SHADER_DEBUGGER change safe with older firmware when
> > debugging is disabled?
>
> Right.  It changes the shape of MISC_OPs.
> I'll have to figure out something that's backwards compatible.

Actually, I think we should be okay.  MISC_OPs allows a max data packet of 20 D-WORDs.
So adding another D-WORD to SET_SHADER_DEBUGGER should be well under that limit.
The writing to an unused D-WORD is likely not harmful but I can version check the trap_en setting in the MES KGD call itself just to be safe.

Thanks,

Jon

>
> >
> >
> > >
> > > Second, to preserve correct save/restore priviledged wave states
> > > in coordination with the trap enablement setting, resume suspended
> > > waves early in the disable call.
> > >
> > > Finally, displaced single stepping can cause non-fatal illegal
> > > instructions during process termination on debug disable.  To work
> > > around this, stall the waves prior to disable and allow clean
> > > up to happen naturally on process termination.
> > >
> > > NOTE: The AMDGPU_MES_VERSION_MASK check is a place holder as
> > > MES FW updates have been reviewed but is awaiting binary
> > > creation.  Once the binaries have been created, this check may
> > > be subject to change.
> > >
> > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |  5 ++-
> > >   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  4 ++-
> > >   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c        |  1 +
> > >   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        | 31 ++++++++++++++----
> -
> > >   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  3 +-
> > >   drivers/gpu/drm/amd/amdkfd/kfd_topology.c     | 12 ++++---
> > >   drivers/gpu/drm/amd/include/mes_v11_api_def.h |  1 +
> > >   7 files changed, 40 insertions(+), 17 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > > index 20cc3fffe921..95d69f9c7361 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> > > @@ -928,7 +928,8 @@ int amdgpu_mes_set_shader_debugger(struct
> > amdgpu_device *adev,
> > >                                   uint64_t process_context_addr,
> > >                                   uint32_t spi_gdbg_per_vmid_cntl,
> > >                                   const uint32_t *tcp_watch_cntl,
> > > -                         uint32_t flags)
> > > +                         uint32_t flags,
> > > +                         bool trap_en)
> > >   {
> > >           struct mes_misc_op_input op_input = {0};
> > >           int r;
> > > @@ -945,6 +946,8 @@ int amdgpu_mes_set_shader_debugger(struct
> > amdgpu_device *adev,
> > >           memcpy(op_input.set_shader_debugger.tcp_watch_cntl,
> > tcp_watch_cntl,
> > >
> >     sizeof(op_input.set_shader_debugger.tcp_watch_cntl));
> > >
> > > + op_input.set_shader_debugger.trap_en = trap_en;
> > > +
> > >           amdgpu_mes_lock(&adev->mes);
> > >
> > >           r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > > index b5f5eed2b5ef..2d6ac30b7135 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> > > @@ -294,6 +294,7 @@ struct mes_misc_op_input {
> > >                           } flags;
> > >                           uint32_t spi_gdbg_per_vmid_cntl;
> > >                           uint32_t tcp_watch_cntl[4];
> > > +                 uint32_t trap_en;
> > >                   } set_shader_debugger;
> > >           };
> > >   };
> > > @@ -361,7 +362,8 @@ int amdgpu_mes_set_shader_debugger(struct
> > amdgpu_device *adev,
> > >                                   uint64_t process_context_addr,
> > >                                   uint32_t spi_gdbg_per_vmid_cntl,
> > >                                   const uint32_t *tcp_watch_cntl,
> > > -                         uint32_t flags);
> > > +                         uint32_t flags,
> > > +                         bool trap_en);
> > >
> > >   int amdgpu_mes_add_ring(struct amdgpu_device *adev, int gang_id,
> > >                           int queue_type, int idx,
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > > index c4e3cb8d44de..1bdaa00c0b46 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
> > > @@ -347,6 +347,7 @@ static int mes_v11_0_misc_op(struct amdgpu_mes
> > *mes,
> > >                   memcpy(misc_pkt.set_shader_debugger.tcp_watch_cntl,
> > >                                   input->set_shader_debugger.tcp_watch_cntl,
> > >
> >     sizeof(misc_pkt.set_shader_debugger.tcp_watch_cntl));
> > > +         misc_pkt.set_shader_debugger.trap_en = input-
> > >set_shader_debugger.trap_en;
> > >                   break;
> > >           default:
> > >                   DRM_ERROR("unsupported misc op (%d) \n", input->op);
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > > index 125274445f43..e7bc07068eed 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
> > > @@ -349,12 +349,30 @@ int kfd_dbg_set_mes_debug_mode(struct
> > kfd_process_device *pdd)
> > >   {
> > >           uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd-
> > >spi_dbg_launch_mode;
> > >           uint32_t flags = pdd->process->dbg_flags;
> > > + bool sq_trap_en = !!spi_dbg_cntl;
> > >
> > >           if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
> > >                   return 0;
> > >
> > > + /*
> > > +  * For displaced single stepping, the debugger inserts s_trap
> > instructions
> > > +  * from user space.
> > > +  * This can race with the CWSR workaround during a non-IOCTL
> > disable and
> > > +  * cause non-fatal trailing SQ illegal instructions.
> > > +  * As a work around, stall waves indefinitely in this case as the
> > process
> > > +  * queues will terminate anyways.  MES will automatically clean up
> > the
> > > +  * SPI debug HW registers when all queues are unmapped.
> > > +  * IOCTL disable will not hit these cases as the program needs to be
> > in a
> > > +  * non-continued state to make the disable call in the first place so
> > > +  * debugger insertion of s_trap in debug memory will never occur.
> >
> > This comment has a lot of jargon. I don't know what "non-ioctl disable",
> > "non-fatal trailing SQ illegal instructions" and "non-continued state"
> > mean. Looking at the condition below, I think this is missing some
> > context. I think you're trying to fix something about process
> > termination (!pdd->process->mm), which is not mentioned anywhere in the
> > comment, or obfuscated by jargon ("non-IOCTL disable"). I think you're
> > also making some assumptions about how the debugger interacts with the
> > CPU threads of the program ("program needs to be in a non-continued
> > state to make the disable call"). Making assumptions about user mode is
> > not a good idea. A different debugger or tool may use the debug API in
> > unexpected ways.
> >
> >
> > > +  */
> > > + if (KFD_GC_VERSION(pdd->dev) >= IP_VERSION(11, 0, 0) &&
> > > +     KFD_GC_VERSION(pdd->dev) < IP_VERSION(12, 0, 0) &&
> > > +     !pdd->process->mm && !sq_trap_en)
> > > +         spi_dbg_cntl |= 0x1; /* Set
> > SPI_GDBG_PER_VMID_CNTL.stall_vmid */
> >
> > This looks like a HW-specific workaround in non-HW-specific code. Is
> > this something that could be done in kgd_gfx_v11_disable_debug_trap?
> >
> > That said, if the pdd->process->mm is NULL, how can the debugger still
> > access the process' memory to insert s_trap instructions?
>
> It was probably a race on save/restore induced by the s_trap instructions
> before the mm reference was nulled.
> I agree, this isn't where this is supposed to be fixed.
> I retested with this block removed and everything is fine.
> Looks like it was probably a HWS bug that somehow got fixed since the last
> time I ran this.
> I go ahead and remove this wordy workaround.
>
> Thanks,
>
> Jon
>
> >
> > Regards,
> >    Felix
> >
> >
> > > +
> > >           return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd-
> > >proc_ctx_gpu_addr, spi_dbg_cntl,
> > > -                                         pdd->watch_points, flags);
> > > +                                         pdd->watch_points, flags,
> > sq_trap_en);
> > >   }
> > >
> > >   #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
> > > @@ -557,6 +575,10 @@ void kfd_dbg_trap_deactivate(struct kfd_process
> > *target, bool unwind, int unwind
> > >
> > >           if (!unwind) {
> > >                   uint32_t flags = 0;
> > > +         int resume_count = resume_queues(target, 0, NULL);
> > > +
> > > +         if (resume_count)
> > > +                 pr_debug("Resumed %d queues\n", resume_count);
> > >
> > >                   cancel_work_sync(&target->debug_event_workarea);
> > >                   kfd_dbg_clear_process_address_watch(target);
> > > @@ -598,13 +620,6 @@ void kfd_dbg_trap_deactivate(struct kfd_process
> > *target, bool unwind, int unwind
> > >           }
> > >
> > >           kfd_dbg_set_workaround(target, false);
> > > -
> > > - if (!unwind) {
> > > -         int resume_count = resume_queues(target, 0, NULL);
> > > -
> > > -         if (resume_count)
> > > -                 pr_debug("Resumed %d queues\n", resume_count);
> > > - }
> > >   }
> > >
> > >   static void kfd_dbg_clean_exception_status(struct kfd_process *target)
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > > index 0c1be91a87c6..ff0a28760494 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > > @@ -227,8 +227,7 @@ static int add_queue_mes(struct
> > device_queue_manager *dqm, struct queue *q,
> > >           queue_input.tba_addr = qpd->tba_addr;
> > >           queue_input.tma_addr = qpd->tma_addr;
> > >           queue_input.trap_en = KFD_GC_VERSION(q->device) <
> > IP_VERSION(11, 0, 0) ||
> > > -                       KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0,
> > 0) ||
> > > -                       q->properties.is_dbg_wa;
> > > +                       KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0,
> > 0);
> > >           queue_input.skip_process_ctx_clear = qpd->pqm->process-
> > >debug_trap_enabled;
> > >
> > >           queue_type = convert_to_mes_queue_type(q->properties.type);
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > > index faa7939f35bd..90b86a6ac7bd 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > > @@ -1863,13 +1863,15 @@ static void
> > kfd_topology_set_dbg_firmware_support(struct kfd_topology_device *de
> > >   {
> > >           bool firmware_supported = true;
> > >
> > > - /*
> > > -  * FIXME: GFX11 FW currently not sufficient to deal with CWSR WA.
> > > -  * Updated FW with API changes coming soon.
> > > -  */
> > >           if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(11, 0, 0) &&
> > >                           KFD_GC_VERSION(dev->gpu) < IP_VERSION(12, 0, 0)) {
> > > -         firmware_supported = false;
> > > +         uint32_t mes_api_rev = (dev->gpu->adev->mes.sched_version
> > &
> > > +
> >     AMDGPU_MES_API_VERSION_MASK) >>
> > > +
> >     AMDGPU_MES_API_VERSION_SHIFT;
> > > +         uint32_t mes_rev = dev->gpu->adev->mes.sched_version &
> > > +
> >     AMDGPU_MES_VERSION_MASK;
> > > +
> > > +         firmware_supported = (mes_api_rev >= 14) && (mes_rev >=
> > 64);
> > >                   goto out;
> > >           }
> > >
> > > diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> > b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> > > index f3c15f18ddb5..0997e999416a 100644
> > > --- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> > > +++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
> > > @@ -575,6 +575,7 @@ struct SET_SHADER_DEBUGGER {
> > >           } flags;
> > >           uint32_t spi_gdbg_per_vmid_cntl;
> > >           uint32_t tcp_watch_cntl[4]; /* TCP_WATCHx_CNTL */
> > > + uint32_t trap_en;
> > >   };
> > >
> > >   union MESAPI__MISC {