To enable HW debug mode per process, all devices must be debug enabled successfully. If a failure occures, rewind the enablement of debug mode on the enabled devices. A power management scenario that needs to be considered is HW debug mode setting during GFXOFF. During GFXOFF, these registers will be unreachable so we have to transiently disable GFXOFF when setting. Also, some devices don't support the RLC save restore function for these debug registers so we have to disable GFXOFF completely during a debug session. Cooperative launch also has debugging restriction based on FW bugs. If such bugs exists, the debugger cannot attach to a process that uses GWS resources nor can GWS resources be requested if a process is being debugged. Also multi-process debug devices can only enable trap temporaries based on certain runtime scenerios, which will be explained when the runtime enable functions are implemented in a follow up patch. Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 + drivers/gpu/drm/amd/amdkfd/kfd_debug.c | 134 +++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_debug.h | 23 +++ .../drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 9 ++ 5 files changed, 170 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 9ecc93d5fbba..beb26c93d920 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1442,6 +1442,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep, goto out_unlock; } + if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) { + retval = -EBUSY; + goto out_unlock; + } + retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL); mutex_unlock(&p->mutex); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c index f967f89903f7..f5a5d17cde14 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c @@ -21,6 +21,7 @@ */ #include "kfd_debug.h" +#include "kfd_device_queue_manager.h" #include <linux/file.h> void debug_event_write_work_handler(struct work_struct *work) @@ -37,8 +38,59 @@ void debug_event_write_work_handler(struct work_struct *work) kernel_write(process->dbg_ev_file, &write_data, 1, &pos); } +/* kfd_dbg_trap_deactivate: + * target: target process + * unwind: If this is unwinding a failed kfd_dbg_trap_enable() + * unwind_count: + * If unwind == true, how far down the pdd list we need + * to unwind + * else: ignored + */ +static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) +{ + int i, count = 0; + + for (i = 0; i < target->n_pdds; i++) { + struct kfd_process_device *pdd = target->pdds[i]; + + /* If this is an unwind, and we have unwound the required + * enable calls on the pdd list, we need to stop now + * otherwise we may mess up another debugger session. + */ + if (unwind && count == unwind_count) + break; + + /* GFX off is already disabled by debug activate if not RLC restore supported. */ + if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) + amdgpu_gfx_off_ctrl(pdd->dev->adev, false); + pdd->spi_dbg_override = + pdd->dev->kfd2kgd->disable_debug_trap( + pdd->dev->adev, + target->runtime_info.ttmp_setup, + pdd->dev->vm_info.last_vmid_kfd); + if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) + amdgpu_gfx_off_ctrl(pdd->dev->adev, true); + + if (release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd)) + pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id); + + debug_refresh_runlist(pdd->dev->dqm); + + count++; + } +} + int kfd_dbg_trap_disable(struct kfd_process *target) { + /* + * Defer deactivation to runtime if runtime not enabled otherwise reset + * attached running target runtime state to enable for re-attach. + */ + if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) + kfd_dbg_trap_deactivate(target, false, 0); + else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) + target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; + fput(target->dbg_ev_file); target->dbg_ev_file = NULL; @@ -53,16 +105,88 @@ int kfd_dbg_trap_disable(struct kfd_process *target) return 0; } +static int kfd_dbg_trap_activate(struct kfd_process *target) +{ + int i, r = 0, unwind_count = 0; + + for (i = 0; i < target->n_pdds; i++) { + struct kfd_process_device *pdd = target->pdds[i]; + + if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) { + r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd); + + if (r) { + target->runtime_info.runtime_state = (r == -EBUSY) ? + DEBUG_RUNTIME_STATE_ENABLED_BUSY : + DEBUG_RUNTIME_STATE_ENABLED_ERROR; + + goto unwind_err; + } + } + + /* Disable GFX OFF to prevent garbage read/writes to debug registers. + * If RLC restore of debug registers is not supported and runtime enable + * hasn't done so already on ttmp setup request, restore the trap config registers. + * + * If RLC restore of debug registers is not supported, keep gfx off disabled for + * the debug session. + */ + amdgpu_gfx_off_ctrl(pdd->dev->adev, false); + if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) || + target->runtime_info.ttmp_setup)) + pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true, + pdd->dev->vm_info.last_vmid_kfd); + + pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap( + pdd->dev->adev, + false, + pdd->dev->vm_info.last_vmid_kfd); + + if (kfd_dbg_is_rlc_restore_supported(pdd->dev)) + amdgpu_gfx_off_ctrl(pdd->dev->adev, true); + + r = debug_refresh_runlist(pdd->dev->dqm); + if (r) { + target->runtime_info.runtime_state = + DEBUG_RUNTIME_STATE_ENABLED_ERROR; + goto unwind_err; + } + + /* Increment unwind_count as the last step */ + unwind_count++; + } + + return 0; + +unwind_err: + /* Enabling debug failed, we need to disable on + * all GPUs so the enable is all or nothing. + */ + kfd_dbg_trap_deactivate(target, true, unwind_count); + return r; +} + int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, void __user *runtime_info, uint32_t *runtime_size) { struct file *f; uint32_t copy_size; - int r = 0; + int i, r = 0; if (target->debug_trap_enabled) return -EINVAL; + /* Enable pre-checks */ + for (i = 0; i < target->n_pdds; i++) { + struct kfd_process_device *pdd = target->pdds[i]; + + if (!KFD_IS_SOC15(pdd->dev)) + return -ENODEV; + + if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws) + return -EBUSY; + } + copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); f = fget(fd); @@ -73,6 +197,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, target->dbg_ev_file = f; + /* defer activation to runtime if not runtime enabled */ + if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) + kfd_dbg_trap_activate(target); + /* We already hold the process reference but hold another one for the * debug session. */ @@ -82,8 +210,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, if (target->debugger_process) atomic_inc(&target->debugger_process->debugged_process_count); - if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) + if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) { + kfd_dbg_trap_deactivate(target, false, 0); r = -EFAULT; + } *runtime_size = sizeof(target->runtime_info); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h index b7ecd603f277..1053b7ca24c5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.h @@ -40,4 +40,27 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_dev *dev) void debug_event_write_work_handler(struct work_struct *work); +/* + * If GFX off is enabled, chips that do not support RLC restore for the debug + * registers will disable GFX off temporarily for the entire debug session. + * See disable_on_trap_action_entry and enable_on_trap_action_exit for details. + */ +static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_dev *dev) +{ + return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) || /* Navi10 */ + KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1)); /* Navi14 */ +} + +static inline bool kfd_dbg_has_gws_support(struct kfd_dev *dev) +{ + return ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1) + && dev->mec2_fw_version >= 0x81b6) || + (KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0) + && KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2) + && dev->mec2_fw_version >= 0x1b6) || + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0) + && dev->mec2_fw_version >= 0x1b6) || + (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1) + && dev->mec2_fw_version >= 0x30)); +} #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 7c983f5f2d6e..cba9968adcd0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -36,6 +36,7 @@ #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" #include "mes_api_def.h" +#include "kfd_debug.h" /* Size of the per-pipe EOP queue */ #define CIK_HPD_EOP_BYTES_LOG2 11 diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 2a38b3d8f8dc..c92da9b998d8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1161,6 +1161,7 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct kfd_process *p; + int i; /* * The kfd_process structure can not be free because the @@ -1178,6 +1179,14 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, cancel_delayed_work_sync(&p->eviction_work); cancel_delayed_work_sync(&p->restore_work); + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + /* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */ + if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup) + amdgpu_gfx_off_ctrl(pdd->dev->adev, true); + } + if (p->debug_trap_enabled) kfd_dbg_trap_disable(p); -- 2.25.1