On 2019-07-04 12:02 p.m., Kuehling, Felix wrote: > On 2019-07-03 6:19 p.m., Yang, Philip wrote: >> amdgpu_noretry default value is 0, this will generate VM fault storm >> because the vm fault is not recovered. It may slow down the machine and >> need reboot after application VM fault. Maybe change default value to 1? > > This is the same as the current behaviour. My change doesn't modify the > default behaviour, it only makes it configurable in a sensible way. > > If we want to change the default, I'd do that in a separate commit. > I misunderstood VM_CONTEXT1_CNT/RETRY_PERMISSION_OR_INVALID_PAGE_FAULT change, yes, this is the same behavior. retry was already enabled by another patch "drm/amdgpu: re-enable retry faults" before. But I don't find the patch to silent retry storm on the mailing list. Regards, Philip > Regards, > Felix > >> >> Other than that, this is reviewed by Philip Yang <Philip.Yang@xxxxxxx> >> >> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote: >>> Ping. >>> >>> Christian, Philip, any opinion about this patch? >>> >>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote: >>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This >>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not >>>> changing SH_MEM_CONFIG in this commit because GFX10 has different >>>> retry behaviour in the SQ and I don't have a way to test it at the >>>> moment. >>>> >>>> Suggested-by: Christian König <Christian.Koenig@xxxxxxx> >>>> CC: Philip Yang <Philip.Yang@xxxxxxx> >>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 16 +++++----------- >>>> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++++ >>>> drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 3 ++- >>>> drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c | 3 ++- >>>> drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c | 3 ++- >>>> drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c | 3 ++- >>>> .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 2 +- >>>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 +- >>>> 9 files changed, 20 insertions(+), 17 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> index 9b1efdf94bdf..05875279c09e 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring; >>>> extern int amdgpu_mcbp; >>>> extern int amdgpu_discovery; >>>> extern int amdgpu_mes; >>>> +extern int amdgpu_noretry; >>>> >>>> #ifdef CONFIG_DRM_AMDGPU_SI >>>> extern int amdgpu_si_support; >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>>> index 7cf6ab07b113..0d578d95be93 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1; >>>> int amdgpu_mcbp = 0; >>>> int amdgpu_discovery = 0; >>>> int amdgpu_mes = 0; >>>> +int amdgpu_noretry; >>>> >>>> struct amdgpu_mgpu_info mgpu_info = { >>>> .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), >>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes, >>>> "Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)"); >>>> module_param_named(mes, amdgpu_mes, int, 0444); >>>> >>>> +MODULE_PARM_DESC(noretry, >>>> + "Disable retry faults (0 = retry enabled (default), 1 = retry disabled)"); >>>> +module_param_named(noretry, amdgpu_noretry, int, 0644); >>>> + >>>> #ifdef CONFIG_HSA_AMD >>>> /** >>>> * DOC: sched_policy (int) >>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444); >>>> MODULE_PARM_DESC(ignore_crat, >>>> "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); >>>> >>>> -/** >>>> - * DOC: noretry (int) >>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry. >>>> - * Setting 1 disables retry. >>>> - * Retry is needed for recoverable page faults. >>>> - */ >>>> -int noretry; >>>> -module_param(noretry, int, 0644); >>>> -MODULE_PARM_DESC(noretry, >>>> - "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)"); >>>> - >>>> /** >>>> * DOC: halt_if_hws_hang (int) >>>> * Halt if HWS hang is detected. Default value, 0, disables the halt on hang. >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>>> index e0f3014e76ea..c4e715170bfe 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c >>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev) >>>> if (i == 0) { >>>> tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE, >>>> SH_MEM_ALIGNMENT_MODE_UNALIGNED); >>>> + tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE, >>>> + !!amdgpu_noretry); >>>> WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp); >>>> WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0); >>>> } else { >>>> tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE, >>>> SH_MEM_ALIGNMENT_MODE_UNALIGNED); >>>> + tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE, >>>> + !!amdgpu_noretry); >>>> WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp); >>>> tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE, >>>> (adev->gmc.private_aperture_start >> 48)); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c >>>> index 9f0f189fc111..15986748f59f 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c >>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev) >>>> block_size); >>>> /* Send no-retry XNACK on fault to suppress VM fault storm. */ >>>> tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, >>>> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1); >>>> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, >>>> + !amdgpu_noretry); >>>> WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp); >>>> WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); >>>> WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c >>>> index b7de60a15623..d605b4963f8a 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c >>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev) >>>> adev->vm_manager.block_size - 9); >>>> /* Send no-retry XNACK on fault to suppress VM fault storm. */ >>>> tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL, >>>> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0); >>>> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, >>>> + !amdgpu_noretry); >>>> WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp); >>>> WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); >>>> WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c >>>> index 05d1d448c8f5..dc5ce03034d3 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c >>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev) >>>> block_size); >>>> /* Send no-retry XNACK on fault to suppress VM fault storm. */ >>>> tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL, >>>> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1); >>>> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, >>>> + !amdgpu_noretry); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c >>>> index 37a1a318ae63..0f9549f19ade 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c >>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev) >>>> adev->vm_manager.block_size - 9); >>>> /* Send no-retry XNACK on fault to suppress VM fault storm. */ >>>> tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL, >>>> - RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0); >>>> + RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, >>>> + !amdgpu_noretry); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0); >>>> WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0); >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c >>>> index e9fe39382371..95a82ac455f2 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c >>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm, >>>> qpd->sh_mem_config = >>>> SH_MEM_ALIGNMENT_MODE_UNALIGNED << >>>> SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; >>>> - if (noretry && >>>> + if (amdgpu_noretry && >>>> !dqm->dev->device_info->needs_iommu_device) >>>> qpd->sh_mem_config |= >>>> 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> index d4bba0124d29..aa7bf20d20f8 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> @@ -157,7 +157,7 @@ extern int ignore_crat; >>>> /* >>>> * Set sh_mem_config.retry_disable on Vega10 >>>> */ >>>> -extern int noretry; >>>> +extern int amdgpu_noretry; >>>> >>>> /* >>>> * Halt if HWS hang is detected _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx