Every ring type can have its own timeout setting. Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e Signed-off-by: Evan Quan <evan.quan@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 82 ++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 +++++++-- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- 5 files changed, 124 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f6965b9403eb..9d7bf9fbf2ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; extern int amdgpu_hw_i2c; extern int amdgpu_pcie_gen2; extern int amdgpu_msi; -extern int amdgpu_lockup_timeout; extern int amdgpu_dpm; extern int amdgpu_fw_load_type; extern int amdgpu_aspm; @@ -428,6 +427,7 @@ struct amdgpu_fpriv { }; int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); +int amdgpu_device_get_job_timeout(struct amdgpu_device *adev); int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned size, struct amdgpu_ib *ib); @@ -1001,6 +1001,11 @@ struct amdgpu_device { struct work_struct xgmi_reset_work; bool in_baco_reset; + + long gfx_timeout; + long sdma_timeout; + long video_timeout; + long compute_timeout; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 80bf604019b1..bb111c05949d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) * Validates certain module parameters and updates * the associated values used by the driver (all asics). */ -static void amdgpu_device_check_arguments(struct amdgpu_device *adev) +static int amdgpu_device_check_arguments(struct amdgpu_device *adev) { + int ret = 0; + if (amdgpu_sched_jobs < 4) { dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", amdgpu_sched_jobs); @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_vram_page_split = 1024; } - if (amdgpu_lockup_timeout == 0) { - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n"); - amdgpu_lockup_timeout = 10000; + ret = amdgpu_device_get_job_timeout(adev); + if (ret) { + dev_err(adev->dev, "invalid job timeout setting\n"); + return ret; } adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); + + return ret; } /** @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->lock_reset); mutex_init(&adev->virt.dpm_mutex); - amdgpu_device_check_arguments(adev); + ret = amdgpu_device_check_arguments(adev); + if (ret) + return ret; spin_lock_init(&adev->mmio_idx_lock); spin_lock_init(&adev->smc_idx_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 71df27cd03de..4694e36d1bf1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -83,6 +83,8 @@ #define AMDGPU_VERSION "19.10.9.418" +#define MAX_PARAM_LENGTH 256 + int amdgpu_vram_limit = 0; int amdgpu_vis_vram_limit = 0; int amdgpu_gart_size = -1; /* auto */ @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0; int amdgpu_hw_i2c = 0; int amdgpu_pcie_gen2 = -1; int amdgpu_msi = -1; -int amdgpu_lockup_timeout = 10000; +char amdgpu_lockup_timeout[MAX_PARAM_LENGTH]; int amdgpu_dpm = -1; int amdgpu_fw_load_type = -1; int amdgpu_aspm = -1; @@ -232,12 +234,20 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = disable, -1 = auto)"); module_param_named(msi, amdgpu_msi, int, 0444); /** - * DOC: lockup_timeout (int) - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be adjusted to 10000. - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 10000. + * DOC: lockup_timeout (string) + * Set GPU scheduler timeout value in ms. + * + * The format is non-compute[.gfx.sdma.video][:compute]. + * With no "non-compute[.gfx.sdma.video]" timeout specified, the default timeout for non-compute_job is 10000. + * The "non-compute" timeout setting applys to all non compute IPs(gfx, sdma and video). Unless + * the timeout for this IP is specified separately(by "[.gfx.sdma.video]"). + * + * With no "[:compute]" timeout specified, there will be no timeout enforced on compute jobs. + * Otherwise, the timeout for compute jobs is set as specified. */ -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 (default 10000)"); -module_param_named(lockup_timeout, amdgpu_lockup_timeout, int, 0444); +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: 10000 for non-compute jobs and no timeout for compute jobs), " + "format is non-compute[.gfx.sdma.video][:compute]"); +module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); /** * DOC: dpm (int) @@ -1307,6 +1317,66 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) return 0; } +int amdgpu_device_get_job_timeout(struct amdgpu_device *adev) +{ + char *input = amdgpu_lockup_timeout; + char *non_compute_setting = NULL; + char *timeout_setting = NULL; + int index = 0; + int ret = 0; + + /* Default timeout for non compute job is 10000 */ + adev->gfx_timeout = + adev->sdma_timeout = + adev->video_timeout = 10000; + /* Enforce no timeout on compute job at default */ + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; + + if (strnlen(input, MAX_PARAM_LENGTH)) { + non_compute_setting = strsep(&input, ":"); + if (input) { + ret = kstrtol(input, 0, &(adev->compute_timeout)); + if (ret) + return ret; + } + + while ((timeout_setting = strsep(&non_compute_setting, ".")) && + strnlen(timeout_setting, MAX_PARAM_LENGTH)) { + switch (index) { + case 0: + ret = kstrtol(timeout_setting, 0, &adev->gfx_timeout); + if (ret) + return ret; + + adev->sdma_timeout = + adev->video_timeout = + adev->gfx_timeout; + break; + case 1: + ret = kstrtol(timeout_setting, 0, &adev->gfx_timeout); + if (ret) + return ret; + break; + case 2: + ret = kstrtol(timeout_setting, 0, &adev->sdma_timeout); + if (ret) + return ret; + break; + case 3: + ret = kstrtol(timeout_setting, 0, &adev->video_timeout); + if (ret) + return ret; + break; + default: + break; + } + index++; + } + } + + return ret; +} + static bool amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe, bool in_vblank_irq, int *vpos, int *hpos, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index d19ad34bef75..bb40e8946bc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring, int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, unsigned num_hw_submission) { + struct amdgpu_device *adev = ring->adev; long timeout; int r; + if (!adev) + return -EINVAL; + /* Check that num_hw_submission is a power of two */ if ((num_hw_submission & (num_hw_submission - 1)) != 0) return -EINVAL; @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, /* No need to setup the GPU scheduler for KIQ ring */ if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { - /* for non-sriov case, no timeout enforce on compute ring */ - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) - && !amdgpu_sriov_vf(ring->adev)) - timeout = MAX_SCHEDULE_TIMEOUT; - else - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); + switch (ring->funcs->type) { + case AMDGPU_RING_TYPE_GFX: + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_COMPUTE: + /* + * For non-sriov case, no timeout enforce + * on compute ring at default. Unless user + * specifies a timeout for compute ring. + * + * For sriov case, always use the timeout + * as gfx ring + */ + if (!amdgpu_sriov_vf(ring->adev)) + timeout = adev->compute_timeout; + else + timeout = adev->gfx_timeout; + break; + case AMDGPU_RING_TYPE_SDMA: + timeout = adev->sdma_timeout; + break; + default: + timeout = adev->video_timeout; + break; + } r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, num_hw_submission, amdgpu_job_hang_limit, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 8dbad496b29f..089952a1e6b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) /* Trigger recovery for world switch failure if no TDR */ if (amdgpu_device_should_recover_gpu(adev) - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) amdgpu_device_gpu_recover(adev, NULL); } -- 2.21.0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx