Thanks. Updated this in V2. -----Original Message----- From: Alex Deucher <alexdeucher@xxxxxxxxx> Sent: Friday, April 10, 2020 8:45 PM To: Quan, Evan <Evan.Quan@xxxxxxx> Cc: amd-gfx list <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx> Subject: Re: [PATCH] drm/amdgpu: fix wrong vram lost counter increment On Fri, Apr 10, 2020 at 4:02 AM Evan Quan <evan.quan@xxxxxxx> wrote: > > Vram lost counter is wrongly increased by two during baco reset. > > Change-Id: I8b9959a5d1632abc774ba07d56cf295bdd8288eb > Signed-off-by: Evan Quan <evan.quan@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 36 ++++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/cik.c | 2 -- > drivers/gpu/drm/amd/amdgpu/nv.c | 4 --- > drivers/gpu/drm/amd/amdgpu/soc15.c | 4 --- > drivers/gpu/drm/amd/amdgpu/vi.c | 2 -- > 5 files changed, 34 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index a2a4e4b28d00..c9317975c46e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2087,8 +2087,40 @@ static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev) > */ > static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) > { > - return !!memcmp(adev->gart.ptr, adev->reset_magic, > - AMDGPU_RESET_MAGIC_NUM); > + if (memcmp(adev->gart.ptr, adev->reset_magic, > + AMDGPU_RESET_MAGIC_NUM)) > + return true; > + > + if (!adev->in_gpu_reset) > + return false; > + > + /* > + * For all ASICs with baco reset, the VRAM is assumed to be > + * lost. > + * For SOC15 and NV ASICs with mode1 reset, the VRAM is also > + * assumed to be lost. 
> + */ > + switch (amdgpu_asic_reset_method(adev)) { > + case AMD_RESET_METHOD_BACO: > + return true; > + case AMD_RESET_METHOD_MODE1: > + switch (adev->asic_type) { > + case CHIP_VEGA10: > + case CHIP_VEGA12: > + case CHIP_VEGA20: > + case CHIP_RAVEN: > + case CHIP_ARCTURUS: > + case CHIP_RENOIR: > + case CHIP_NAVI10: > + case CHIP_NAVI14: > + case CHIP_NAVI12: I think we can probably just drop the asic check and always return true for MODE1 reset. The UMC block gets reset, so memory is not reliable. Alex > + return true; > + default: > + return false; > + } > + default: > + return false; > + } > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c > b/drivers/gpu/drm/amd/amdgpu/cik.c > index db68ffa27984..fe306d0f73f7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/cik.c > +++ b/drivers/gpu/drm/amd/amdgpu/cik.c > @@ -1358,8 +1358,6 @@ static int cik_asic_reset(struct amdgpu_device *adev) > int r; > > if (cik_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > r = amdgpu_dpm_baco_reset(adev); > } else { > r = cik_asic_pci_config_reset(adev); diff --git > a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c > index 7768880fcccf..995bdec9fa7d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/nv.c > @@ -351,8 +351,6 @@ static int nv_asic_reset(struct amdgpu_device *adev) > struct smu_context *smu = &adev->smu; > > if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > ret = smu_baco_enter(smu); > if (ret) > return ret; > @@ -360,8 +358,6 @@ static int nv_asic_reset(struct amdgpu_device *adev) > if (ret) > return ret; > } else { > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > ret = nv_asic_mode1_reset(adev); > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c > b/drivers/gpu/drm/amd/amdgpu/soc15.c > index a597ad22b675..58a440a15525 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc15.c > 
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c > @@ -569,14 +569,10 @@ static int soc15_asic_reset(struct amdgpu_device > *adev) > > switch (soc15_asic_reset_method(adev)) { > case AMD_RESET_METHOD_BACO: > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > return soc15_asic_baco_reset(adev); > case AMD_RESET_METHOD_MODE2: > return amdgpu_dpm_mode2_reset(adev); > default: > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > return soc15_asic_mode1_reset(adev); > } > } > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c > b/drivers/gpu/drm/amd/amdgpu/vi.c index 0a90c296409b..af8986a55354 > 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vi.c > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c > @@ -744,8 +744,6 @@ static int vi_asic_reset(struct amdgpu_device *adev) > int r; > > if (vi_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { > - if (!adev->in_suspend) > - amdgpu_inc_vram_lost(adev); > r = amdgpu_dpm_baco_reset(adev); > } else { > r = vi_asic_pci_config_reset(adev); > -- > 2.26.0 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist > s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cev > an.quan%40amd.com%7C22eb212ade824eb3fc5c08d7dd4cfe75%7C3dd8961fe4884e6 > 08e11a82d994e183d%7C0%7C0%7C637221195076651531&sdata=2DazCSnEqgcdV > pRpmyEBZ9k%2BawbTdciixdhCdNIij4g%3D&reserved=0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx