[Public] > -----Original Message----- > From: Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Sent: Monday, March 20, 2023 5:50 PM > To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd- > gfx@xxxxxxxxxxxxxxxxxxxxx; dri-devel@xxxxxxxxxxxxxxxxxxxxx > Subject: Re: [PATCH 15/32] drm/amdkfd: prepare trap workaround for gfx11 > > > On 2023-01-25 14:53, Jonathan Kim wrote: > > Due to a HW bug, waves in only half the shader arrays can enter trap. > > > > When starting a debug session, relocate all waves to the first shader > > array of each shader engine and mask off the 2nd shader array as > > unavailable. > > > > When ending a debug session, re-enable the 2nd shader array per > > shader engine. > > > > User CU masking per queue cannot be guaranteed to remain functional > > if requested during debugging (e.g. user cu mask requests only 2nd shader > > array as an available resource leading to zero HW resources available) > > nor can runtime be alerted of any of these changes during execution. > > > > Make user CU masking and debugging mutually exclusive with respect to > > availability. > > > > If the debugger tries to attach to a process with a user cu masked > > queue, return the runtime status as enabled but busy. > > > > If the debugger tries to attach and fails to reallocate queue waves to > > the first shader array of each shader engine, return the runtime status > > as enabled but with an error. > > > > In addition, like any other multi-process debug supported devices, > > disable trap temporary setup per-process to avoid performance impact > from > > setup overhead. 
> > > > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 2 + > > drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 7 +- > > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 - > > drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 64 > +++++++++++++++++++ > > drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 3 +- > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 7 ++ > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 3 +- > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c | 3 +- > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 42 ++++++++---- > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 3 +- > > .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 3 +- > > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- > > .../amd/amdkfd/kfd_process_queue_manager.c | 9 ++- > > 13 files changed, 124 insertions(+), 29 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > index d20df0cf0d88..b5f5eed2b5ef 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > > @@ -219,6 +219,8 @@ struct mes_add_queue_input { > > uint32_t gws_size; > > uint64_t tba_addr; > > uint64_t tma_addr; > > + uint32_t trap_en; > > + uint32_t skip_process_ctx_clear; > > uint32_t is_kfd_process; > > uint32_t is_aql_queue; > > uint32_t queue_size; > > diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > index fbacdc42efac..38c7a0cbf264 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c > > @@ -197,17 +197,14 @@ static int mes_v11_0_add_hw_queue(struct > amdgpu_mes *mes, > > mes_add_queue_pkt.gws_size = input->gws_size; > > mes_add_queue_pkt.trap_handler_addr = input->tba_addr; > > mes_add_queue_pkt.tma_addr = input->tma_addr; > > + mes_add_queue_pkt.trap_en = input->trap_en; > > + mes_add_queue_pkt.skip_process_ctx_clear = input- > >skip_process_ctx_clear; > > 
mes_add_queue_pkt.is_kfd_process = input->is_kfd_process; > > > > /* For KFD, gds_size is re-used for queue size (needed in MES for AQL > queues) */ > > mes_add_queue_pkt.is_aql_queue = input->is_aql_queue; > > mes_add_queue_pkt.gds_size = input->queue_size; > > > > - if (!(((adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= > 4) && > > - (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) > && > > - (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(11, 0, 3)))) > > - mes_add_queue_pkt.trap_en = 1; > > - > > /* For KFD, gds_size is re-used for queue size (needed in MES for AQL > queues) */ > > mes_add_queue_pkt.is_aql_queue = input->is_aql_queue; > > mes_add_queue_pkt.gds_size = input->queue_size; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > index ee05c2e54ef6..f5f639de28f0 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > @@ -530,8 +530,6 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, > struct kfd_process *p, > > goto out; > > } > > > > - minfo.update_flag = UPDATE_FLAG_CU_MASK; > > - > > mutex_lock(&p->mutex); > > > > retval = pqm_update_mqd(&p->pqm, args->queue_id, &minfo); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > index f6ea6db266b4..6e99a0160275 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c > > @@ -37,6 +37,70 @@ void debug_event_write_work_handler(struct > work_struct *work) > > kernel_write(process->dbg_ev_file, &write_data, 1, &pos); > > } > > > > +static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) > > +{ > > + struct mqd_update_info minfo = {0}; > > + int err; > > + > > + if (!q || (!q->properties.is_dbg_wa && !enable)) > > Should this condition be: > > if (!q || q->properties.is_dbg_wa != enable) The latter part should probably be q->properties.is_dbg_wa == enable. 
q->properties.is_dbg_wa != enable would always skip a request to change the queue's current workaround state. I think we can just drop the latter half of this test condition as a redundant queue workaround update is harmless. It's a static call from a process wide call and the process wide call is static itself and only gets called twice, once on attach and once on detach. Thanks, Jon > > > > + return 0; > > + > > + if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) || > > + KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0)) > > Indentation. It would be more readable if the KFD_GC_VERSIONs were > aligned. > > > > + return 0; > > + > > + if (enable && q->properties.is_user_cu_masked) > > + return -EBUSY; > > + > > + minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : > UPDATE_FLAG_DBG_WA_DISABLE; > > + > > + q->properties.is_dbg_wa = enable; > > + err = q->device->dqm->ops.update_queue(q->device->dqm, q, > &minfo); > > + if (err) > > + q->properties.is_dbg_wa = false; > > + > > + return err; > > +} > > + > > +static int kfd_dbg_set_workaround(struct kfd_process *target, bool > enable) > > +{ > > + struct process_queue_manager *pqm = &target->pqm; > > + struct process_queue_node *pqn; > > + int r = 0; > > + > > + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { > > + r = kfd_dbg_set_queue_workaround(pqn->q, enable); > > + if (enable && r) > > + goto unwind; > > + } > > + > > + return 0; > > + > > +unwind: > > + list_for_each_entry(pqn, &pqm->queues, process_queue_list) > > + kfd_dbg_set_queue_workaround(pqn->q, false); > > + > > + if (enable) { > > + target->runtime_info.runtime_state = r == -EBUSY ? > > + DEBUG_RUNTIME_STATE_ENABLED_BUSY : > > + DEBUG_RUNTIME_STATE_ENABLED_ERROR; > > + } > > Braces are not needed here. 
> > > > + > > + return r; > > +} > > + > > +static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd) > > +{ > > + uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd- > >spi_dbg_launch_mode; > > + uint32_t flags = pdd->process->dbg_flags; > > + > > + if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) > > + return 0; > > + > > + return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd- > >proc_ctx_gpu_addr, spi_dbg_cntl, > > + pdd->watch_points, flags); > > +} > > + > > You're adding some unused static functions here. This will cause compile > warnings until the patch that starts using them. You could avoid this by > reordering this and the next patch and moving the function calls into > this patch. That would also make it more obvious where the workaround > plugs into the debug code. > > Regards, > Felix > > > > int kfd_dbg_trap_disable(struct kfd_process *target) > > { > > if (!target->debug_trap_enabled) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > > index 53c5a3e55bd2..0c09f1729325 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h > > @@ -35,7 +35,8 @@ int kfd_dbg_trap_enable(struct kfd_process *target, > uint32_t fd, > > > > static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev) > > { > > - return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2); > > + return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) || > > + KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0); > > } > > > > void debug_event_write_work_handler(struct work_struct *work); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > index 2517716d7cbc..be1985b87ea7 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > > @@ -214,6 +214,10 @@ static int add_queue_mes(struct > device_queue_manager *dqm, struct queue *q, > > 
queue_input.paging = false; > > queue_input.tba_addr = qpd->tba_addr; > > queue_input.tma_addr = qpd->tma_addr; > > + queue_input.trap_en = KFD_GC_VERSION(q->device) < > IP_VERSION(11, 0, 0) || > > + KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, > 0) || > > + q->properties.is_dbg_wa; > > + queue_input.skip_process_ctx_clear = qpd->pqm->process- > >debug_trap_enabled; > > > > queue_type = convert_to_mes_queue_type(q->properties.type); > > if (queue_type < 0) { > > @@ -1679,6 +1683,9 @@ static int create_queue_cpsch(struct > device_queue_manager *dqm, struct queue *q, > > * updates the is_evicted flag but is a no-op otherwise. > > */ > > q->properties.is_evicted = !!qpd->evicted; > > + q->properties.is_dbg_wa = qpd->pqm->process- > >debug_trap_enabled && > > + KFD_GC_VERSION(q->device) >= IP_VERSION(11, 0, 0) > && > > + KFD_GC_VERSION(q->device) < IP_VERSION(12, 0, 0); > > > > if (qd) > > mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q- > >mqd_mem_obj, &q->gart_mqd_addr, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c > > index 4889865c725c..c2a7226fc588 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c > > @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager > *mm, void *mqd, > > struct cik_mqd *m; > > uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ > > > > - if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) || > > - !minfo->cu_mask.ptr) > > + if (!minfo || !minfo->cu_mask.ptr) > > return; > > > > mqd_symmetrically_map_cu_mask(mm, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c > > index cb484ace17de..8248e77751e7 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c > > @@ -48,8 +48,7 @@ static void update_cu_mask(struct mqd_manager > *mm, void *mqd, > > struct 
v10_compute_mqd *m; > > uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ > > > > - if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) || > > - !minfo->cu_mask.ptr) > > + if (!minfo || !minfo->cu_mask.ptr) > > return; > > > > mqd_symmetrically_map_cu_mask(mm, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c > > index ac7c8fc83c94..18ab613e787c 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c > > @@ -46,15 +46,33 @@ static void update_cu_mask(struct mqd_manager > *mm, void *mqd, > > { > > struct v11_compute_mqd *m; > > uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; > > + bool has_wa_flag = minfo && (minfo->update_flag & > (UPDATE_FLAG_DBG_WA_ENABLE | > > + UPDATE_FLAG_DBG_WA_DISABLE)); > > > > - if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) || > > - !minfo->cu_mask.ptr) > > + if (!minfo || !(has_wa_flag || minfo->cu_mask.ptr)) > > return; > > > > + m = get_mqd(mqd); > > + > > + if (has_wa_flag) { > > + uint32_t wa_mask = minfo->update_flag == > UPDATE_FLAG_DBG_WA_ENABLE ? 
> > + 0xffff : 0xffffffff; > > + > > + m->compute_static_thread_mgmt_se0 = wa_mask; > > + m->compute_static_thread_mgmt_se1 = wa_mask; > > + m->compute_static_thread_mgmt_se2 = wa_mask; > > + m->compute_static_thread_mgmt_se3 = wa_mask; > > + m->compute_static_thread_mgmt_se4 = wa_mask; > > + m->compute_static_thread_mgmt_se5 = wa_mask; > > + m->compute_static_thread_mgmt_se6 = wa_mask; > > + m->compute_static_thread_mgmt_se7 = wa_mask; > > + > > + return; > > + } > > + > > mqd_symmetrically_map_cu_mask(mm, > > minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask); > > > > - m = get_mqd(mqd); > > m->compute_static_thread_mgmt_se0 = se_mask[0]; > > m->compute_static_thread_mgmt_se1 = se_mask[1]; > > m->compute_static_thread_mgmt_se2 = se_mask[2]; > > @@ -109,6 +127,7 @@ static void init_mqd(struct mqd_manager *mm, > void **mqd, > > uint64_t addr; > > struct v11_compute_mqd *m; > > int size; > > + uint32_t wa_mask = q->is_dbg_wa ? 0xffff : 0xffffffff; > > > > m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr; > > addr = mqd_mem_obj->gpu_addr; > > @@ -122,14 +141,15 @@ static void init_mqd(struct mqd_manager *mm, > void **mqd, > > > > m->header = 0xC0310800; > > m->compute_pipelinestat_enable = 1; > > - m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; > > - m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; > > + > > + m->compute_static_thread_mgmt_se0 = wa_mask; > > + m->compute_static_thread_mgmt_se1 = wa_mask; > > + m->compute_static_thread_mgmt_se2 = wa_mask; > > + m->compute_static_thread_mgmt_se3 = wa_mask; > > + m->compute_static_thread_mgmt_se4 = wa_mask; > > + m->compute_static_thread_mgmt_se5 = wa_mask; > > + m->compute_static_thread_mgmt_se6 = 
wa_mask; > > + m->compute_static_thread_mgmt_se7 = wa_mask; > > > > m->cp_hqd_persistent_state = > CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | > > 0x55 << > CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > index 86f1cf090246..50da16dd4c96 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c > > @@ -49,8 +49,7 @@ static void update_cu_mask(struct mqd_manager > *mm, void *mqd, > > struct v9_mqd *m; > > uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; > > > > - if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) || > > - !minfo->cu_mask.ptr) > > + if (!minfo || !minfo->cu_mask.ptr) > > return; > > > > mqd_symmetrically_map_cu_mask(mm, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c > > index 530ba6f5b57e..58b40bff3e0c 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c > > @@ -51,8 +51,7 @@ static void update_cu_mask(struct mqd_manager > *mm, void *mqd, > > struct vi_mqd *m; > > uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ > > > > - if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) || > > - !minfo->cu_mask.ptr) > > + if (!minfo || !minfo->cu_mask.ptr) > > return; > > > > mqd_symmetrically_map_cu_mask(mm, > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index 8f1e2f9023db..75521d96e937 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -479,6 +479,8 @@ struct queue_properties { > > bool is_evicted; > > bool is_active; > > bool is_gws; > > + bool is_dbg_wa; > > + bool is_user_cu_masked; > > /* Not relevant for user mode queues in cp scheduling */ > > unsigned int vmid; > > /* Relevant only for sdma queues*/ > > @@ -501,7 
+503,8 @@ struct queue_properties { > > !(q).is_evicted) > > > > enum mqd_update_flag { > > - UPDATE_FLAG_CU_MASK = 0, > > + UPDATE_FLAG_DBG_WA_ENABLE = 1, > > + UPDATE_FLAG_DBG_WA_DISABLE = 2, > > }; > > > > struct mqd_update_info { > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > > index 5137476ec18e..d8f032214481 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > > @@ -498,8 +498,12 @@ int pqm_update_mqd(struct > process_queue_manager *pqm, > > return -EFAULT; > > } > > > > + /* CUs are masked for debugger requirements so deny user mask */ > > + if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr) > > + return -EBUSY; > > + > > /* ASICs that have WGPs must enforce pairwise enabled mask > checks. */ > > - if (minfo && minfo->update_flag == UPDATE_FLAG_CU_MASK && > minfo->cu_mask.ptr && > > + if (minfo && minfo->cu_mask.ptr && > > KFD_GC_VERSION(pqn->q->device) >= IP_VERSION(10, > 0, 0)) { > > int i; > > > > @@ -518,6 +522,9 @@ int pqm_update_mqd(struct > process_queue_manager *pqm, > > if (retval != 0) > > return retval; > > > > + if (minfo && minfo->cu_mask.ptr) > > + pqn->q->properties.is_user_cu_masked = true; > > + > > return 0; > > } > >