Thanks! Just sent out a new patch to address this, as the original patch was already committed. Regards, Evan > -----Original Message----- > From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> > Sent: 2019年4月30日 17:15 > To: Alex Deucher <alexdeucher@xxxxxxxxx>; Quan, Evan > <Evan.Quan@xxxxxxx> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Lou, Wentao > <Wentao.Lou@xxxxxxx>; Daenzer, Michel <Michel.Daenzer@xxxxxxx>; > Koenig, Christian <Christian.Koenig@xxxxxxx>; amd-gfx list <amd- > gfx@xxxxxxxxxxxxxxxxxxxxx> > Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for > every ring type V4 > > [CAUTION: External Email] > > Am 30.04.19 um 05:20 schrieb Alex Deucher: > > On Mon, Apr 29, 2019 at 11:16 PM Evan Quan <evan.quan@xxxxxxx> > wrote: > >> Every ring type can have its own timeout setting. > >> > >> - V2: update lockup_timeout parameter format and cosmetic fixes > >> - V3: invalidate 0 and negative values > >> - V4: update lockup_timeout parameter format > >> > >> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e > >> Signed-off-by: Evan Quan <evan.quan@xxxxxxx> > > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > > One more issue below, with that fixed the patch is Reviewed-by: > Christian König <christian.koenig@xxxxxxx> > > > > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++-- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 79 > ++++++++++++++++++++-- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 35 ++++++++-- > >> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- > >> 5 files changed, 121 insertions(+), 19 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> index f6965b9403eb..c9b44b8c1969 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > >> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority; > >> extern int amdgpu_hw_i2c; > >> extern int 
amdgpu_pcie_gen2; > >> extern int amdgpu_msi; > >> -extern int amdgpu_lockup_timeout; > >> extern int amdgpu_dpm; > >> extern int amdgpu_fw_load_type; > >> extern int amdgpu_aspm; > >> @@ -428,6 +427,7 @@ struct amdgpu_fpriv { > >> }; > >> > >> int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv > >> **fpriv); > >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device > >> +*adev); > >> > >> int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm > *vm, > >> unsigned size, struct amdgpu_ib *ib); @@ -1001,6 > >> +1001,11 @@ struct amdgpu_device { > >> struct work_struct xgmi_reset_work; > >> > >> bool in_baco_reset; > >> + > >> + long gfx_timeout; > >> + long sdma_timeout; > >> + long video_timeout; > >> + long compute_timeout; > >> }; > >> > >> static inline struct amdgpu_device *amdgpu_ttm_adev(struct > >> ttm_bo_device *bdev) diff --git > >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> index 80bf604019b1..b11af38a0238 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > >> @@ -912,8 +912,10 @@ static void > amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev) > >> * Validates certain module parameters and updates > >> * the associated values used by the driver (all asics). 
> >> */ > >> -static void amdgpu_device_check_arguments(struct amdgpu_device > >> *adev) > >> +static int amdgpu_device_check_arguments(struct amdgpu_device > *adev) > >> { > >> + int ret = 0; > >> + > >> if (amdgpu_sched_jobs < 4) { > >> dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n", > >> amdgpu_sched_jobs); @@ -958,13 +960,16 @@ > >> static void amdgpu_device_check_arguments(struct amdgpu_device > *adev) > >> amdgpu_vram_page_split = 1024; > >> } > >> > >> - if (amdgpu_lockup_timeout == 0) { > >> - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to > 10000\n"); > >> - amdgpu_lockup_timeout = 10000; > >> + ret = amdgpu_device_get_job_timeout_settings(adev); > >> + if (ret) { > >> + dev_err(adev->dev, "invalid lockup_timeout parameter > syntax\n"); > >> + return ret; > >> } > >> > >> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, > amdgpu_fw_load_type); > >> amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96); > >> + > >> + return ret; > >> } > >> > >> /** > >> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device > *adev, > >> mutex_init(&adev->lock_reset); > >> mutex_init(&adev->virt.dpm_mutex); > >> > >> - amdgpu_device_check_arguments(adev); > >> + r = amdgpu_device_check_arguments(adev); > >> + if (r) > >> + return r; > >> > >> spin_lock_init(&adev->mmio_idx_lock); > >> spin_lock_init(&adev->smc_idx_lock); > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> index 71df27cd03de..609c7af8a3f0 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > >> @@ -83,6 +83,8 @@ > >> > >> #define AMDGPU_VERSION "19.10.9.418" > >> > >> +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 > >> + > >> int amdgpu_vram_limit = 0; > >> int amdgpu_vis_vram_limit = 0; > >> int amdgpu_gart_size = -1; /* auto */ @@ -95,7 +97,7 @@ int > >> amdgpu_disp_priority = 0; > >> int amdgpu_hw_i2c = 0; > >> int amdgpu_pcie_gen2 = -1; > >> 
int amdgpu_msi = -1; > >> -int amdgpu_lockup_timeout = 10000; > >> +char > amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; > >> int amdgpu_dpm = -1; > >> int amdgpu_fw_load_type = -1; > >> int amdgpu_aspm = -1; > >> @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = > enable, 0 = disable, -1 = auto)"); > >> module_param_named(msi, amdgpu_msi, int, 0444); > >> > >> /** > >> - * DOC: lockup_timeout (int) > >> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be > adjusted to 10000. > >> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The > default is 10000. > >> + * DOC: lockup_timeout (string) > >> + * Set GPU scheduler timeout value in ms. > >> + * > >> + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. > That > >> + is there can be one or > >> + * multiple values specified. 0 and negative values are invalidated. > >> + They will be adjusted > >> + * to default timeout. > >> + * - With one value specified, the setting will apply to all non-compute > jobs. > >> + * - With multiple values specified, the first one will be for GFX. The > second one is for Compute. > >> + * And the third and fourth ones are for SDMA and Video. > >> + * By default(with no lockup_timeout settings), the timeout for all > >> + non-compute(GFX, SDMA and Video) > >> + * jobs is 10000. And there is no timeout enforced on compute jobs. 
> >> */ > >> -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > 0 > >> (default 10000)"); -module_param_named(lockup_timeout, > >> amdgpu_lockup_timeout, int, 0444); > >> +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms > (default: 10000 for non-compute jobs and no timeout for compute jobs), " > >> + "format is [Non-Compute] or > >> +[GFX,Compute,SDMA,Video]"); module_param_string(lockup_timeout, > >> +amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); > >> > >> /** > >> * DOC: dpm (int) > >> @@ -1307,6 +1318,62 @@ int amdgpu_file_to_fpriv(struct file *filp, struct > amdgpu_fpriv **fpriv) > >> return 0; > >> } > >> > >> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device > >> +*adev) { > >> + char *input = amdgpu_lockup_timeout; > >> + char *timeout_setting = NULL; > >> + int index = 0; > >> + long timeout; > >> + int ret = 0; > >> + > >> + /* > >> + * By default timeout for non compute jobs is 10000. > >> + * And there is no timeout enforced on compute jobs. > >> + */ > >> + adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout > = 10000; > >> + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; > >> + > >> + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > >> + while ((timeout_setting = strsep(&input, ",")) && > >> + strnlen(timeout_setting, > AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { > >> + ret = kstrtol(timeout_setting, 0, &timeout); > >> + if (ret) > >> + return ret; > >> + > >> + /* Invalidate 0 and negative values */ > >> + if (timeout <= 0) { > >> + index++; > >> + continue; > >> + } > > Negative values are perfectly valid and just mean infinite timeout. Take a > look at the msecs_to_jiffies() implementation. > > Christian. 
> > >> + > >> + switch (index++) { > >> + case 0: > >> + adev->gfx_timeout = timeout; > >> + break; > >> + case 1: > >> + adev->compute_timeout = timeout; > >> + break; > >> + case 2: > >> + adev->sdma_timeout = timeout; > >> + break; > >> + case 3: > >> + adev->video_timeout = timeout; > >> + break; > >> + default: > >> + break; > >> + } > >> + } > >> + /* > >> + * There is only one value specified and > >> + * it should apply to all non-compute jobs. > >> + */ > >> + if (index == 1) > >> + adev->sdma_timeout = adev->video_timeout = adev- > >gfx_timeout; > >> + } > >> + > >> + return ret; > >> +} > >> + > >> static bool > >> amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned > int pipe, > >> bool in_vblank_irq, int *vpos, int > >> *hpos, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> index d19ad34bef75..16b7e3a22e89 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> @@ -436,9 +436,13 @@ int amdgpu_fence_driver_start_ring(struct > amdgpu_ring *ring, > >> int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, > >> unsigned num_hw_submission) > >> { > >> + struct amdgpu_device *adev = ring->adev; > >> long timeout; > >> int r; > >> > >> + if (!adev) > >> + return -EINVAL; > >> + > >> /* Check that num_hw_submission is a power of two */ > >> if ((num_hw_submission & (num_hw_submission - 1)) != 0) > >> return -EINVAL; > >> @@ -465,12 +469,31 @@ int amdgpu_fence_driver_init_ring(struct > >> amdgpu_ring *ring, > >> > >> /* No need to setup the GPU scheduler for KIQ ring */ > >> if (ring->funcs->type != AMDGPU_RING_TYPE_KIQ) { > >> - /* for non-sriov case, no timeout enforce on compute ring */ > >> - if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE) > >> - && !amdgpu_sriov_vf(ring->adev)) > >> - timeout = MAX_SCHEDULE_TIMEOUT; > >> - else > >> - timeout = msecs_to_jiffies(amdgpu_lockup_timeout); > >> + switch 
(ring->funcs->type) { > >> + case AMDGPU_RING_TYPE_GFX: > >> + timeout = adev->gfx_timeout; > >> + break; > >> + case AMDGPU_RING_TYPE_COMPUTE: > >> + /* > >> + * For non-sriov case, no timeout enforce > >> + * on compute ring by default. Unless user > >> + * specifies a timeout for compute ring. > >> + * > >> + * For sriov case, always use the timeout > >> + * as gfx ring > >> + */ > >> + if (!amdgpu_sriov_vf(ring->adev)) > >> + timeout = adev->compute_timeout; > >> + else > >> + timeout = adev->gfx_timeout; > >> + break; > >> + case AMDGPU_RING_TYPE_SDMA: > >> + timeout = adev->sdma_timeout; > >> + break; > >> + default: > >> + timeout = adev->video_timeout; > >> + break; > >> + } > >> > >> r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, > >> num_hw_submission, > >> amdgpu_job_hang_limit, diff --git > >> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> index 8dbad496b29f..089952a1e6b0 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > >> @@ -343,7 +343,7 @@ static void xgpu_ai_mailbox_flr_work(struct > >> work_struct *work) > >> > >> /* Trigger recovery for world switch failure if no TDR */ > >> if (amdgpu_device_should_recover_gpu(adev) > >> - && amdgpu_lockup_timeout == MAX_SCHEDULE_TIMEOUT) > >> + && adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT) > >> amdgpu_device_gpu_recover(adev, NULL); > >> } > >> > >> -- > >> 2.21.0 > >> > >> _______________________________________________ > >> amd-gfx mailing list > >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx > >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > _______________________________________________ > > amd-gfx mailing list > > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx