>> Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM. The reason is with BACO reset I found VRAM lost in high address e.g. 15~16 G (for 16 G vega10), amdgpu_device_check_vram_lost only checks the very ahead visible part >> Yeah, but would increment it twice be a problem? I don't think so. So your suggestion is we increase the counter in BACO reset , and no need to introduce the new "bool *" parameter, right ? _____________________________________ Monk Liu|GPU Virtualization Team |AMD -----Original Message----- From: Koenig, Christian <Christian.Koenig@xxxxxxx> Sent: Friday, August 23, 2019 8:47 PM To: Liu, Monk <Monk.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function Am 23.08.19 um 10:57 schrieb Liu, Monk: >>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev); >>> if (vram_lost) { >>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>> atomic_inc(&tmp_adev->vram_lost_counter); > Above is the original logic, if we increment the counter in BACO reset > routine, we would potentially Have another counter increasement if by > coincidence the "amdgpu_device_check_vram_lost" successfully detected > the vram lost (although right now it didn't ..) Yeah, but would increment it twice be a problem? I don't think so. > Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in device_recovery() routine ? Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM. Christian. > _____________________________________ > Monk Liu|GPU Virtualization Team |AMD > > > -----Original Message----- > From: Koenig, Christian <Christian.Koenig@xxxxxxx> > Sent: Friday, August 23, 2019 4:34 PM > To: Liu, Monk <Monk.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for > reset function > > I thought in the BACO reset function. > > The top level reset function doesn't do much more than increment the vram_lost_counter either. > > Christian. > > Am 23.08.19 um 10:32 schrieb Liu, Monk: >>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter? >> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like: >> 1) PF FLR >> 2) mode1/2 reset >> 3) magic reset through config space >> 4) BACO reset >> >> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO >> reset is definitely a vram lost reset >> >> If you increase the counter in general function that will be not >> accurate _____________________________________ >> Monk Liu|GPU Virtualization Team |AMD >> >> >> -----Original Message----- >> From: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> >> Sent: Friday, August 23, 2019 4:27 PM >> To: Liu, Monk <Monk.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for >> reset function >> >> Am 23.08.19 um 05:34 schrieb Monk Liu: >>> for SOC15/vega10 the BACO reset would introduce vram lost in the >>> high end address range and current kmd's vram lost checking cannot >>> catch it since it only check visible frame buffer >>> >>> TODO: >>> to confirm if mode1/2 reset would introduce vram lost >> Looks good in general, but I would make the value mandatory or maybe use a special return code instead. >> >> On the other hand wouldn't it be simpler to just increment vram_lost_counter? >> >> Christian. >> >>> Signed-off-by: Monk Liu <Monk.Liu@xxxxxxx> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++----- >>> drivers/gpu/drm/amd/amdgpu/cik.c | 2 +- >>> drivers/gpu/drm/amd/amdgpu/nv.c | 10 +++++++--- >>> drivers/gpu/drm/amd/amdgpu/si.c | 2 +- >>> drivers/gpu/drm/amd/amdgpu/soc15.c | 4 +++- >>> drivers/gpu/drm/amd/amdgpu/vi.c | 2 +- >>> 7 files changed, 22 insertions(+), 14 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> index f6ae565..1fe3756 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs { >>> int (*read_register)(struct amdgpu_device *adev, u32 se_num, >>> u32 sh_num, u32 reg_offset, u32 *value); >>> void (*set_vga_state)(struct amdgpu_device *adev, bool state); >>> - int (*reset)(struct amdgpu_device *adev); >>> + int (*reset)(struct amdgpu_device *adev, bool *lost); >>> enum amd_reset_method (*reset_method)(struct amdgpu_device *adev); >>> /* get the reference clock */ >>> u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 >>> @@ int emu_soc_asic_init(struct amdgpu_device *adev); >>> * ASICs macro. >>> */ >>> #define amdgpu_asic_set_vga_state(adev, state) >>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define >>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev)) >>> +#define amdgpu_asic_reset(adev, lost) >>> +(adev)->asic_funcs->reset((adev), (lost)) >>> #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev)) >>> #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev)) >>> #define amdgpu_asic_set_uvd_clocks(adev, v, d) >>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> index 02b3e7d..8668cb8 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) >>> struct amdgpu_device *adev = >>> container_of(__work, struct amdgpu_device, xgmi_reset_work); >>> >>> - adev->asic_reset_res = amdgpu_asic_reset(adev); >>> + adev->asic_reset_res = amdgpu_asic_reset(adev, NULL); >>> if (adev->asic_reset_res) >>> DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", >>> adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 >>> +2751,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, >>> * E.g., driver was not cleanly unloaded previously, etc. >>> */ >>> if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { >>> - r = amdgpu_asic_reset(adev); >>> + r = amdgpu_asic_reset(adev, NULL); >>> if (r) { >>> dev_err(adev->dev, "asic reset on init failed\n"); >>> goto failed; >>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) >>> pci_disable_device(dev->pdev); >>> pci_set_power_state(dev->pdev, PCI_D3hot); >>> } else { >>> - r = amdgpu_asic_reset(adev); >>> + r = amdgpu_asic_reset(adev, NULL); >>> if (r) >>> DRM_ERROR("amdgpu asic reset failed\n"); >>> } >>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, >>> if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) >>> r = -EALREADY; >>> } else >>> - r = amdgpu_asic_reset(tmp_adev); >>> + r = amdgpu_asic_reset(tmp_adev, &vram_lost); >>> >>> if (r) { >>> DRM_ERROR("ASIC reset failed with error, %d for drm dev, >>> %s", @@ >>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, >>> if (r) >>> goto out; >>> >>> - vram_lost = amdgpu_device_check_vram_lost(tmp_adev); >>> + if (!vram_lost) >>> + vram_lost = amdgpu_device_check_vram_lost(tmp_adev); >>> + >>> if (vram_lost) { >>> DRM_INFO("VRAM is lost due to GPU reset!\n"); >>> atomic_inc(&tmp_adev->vram_lost_counter); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c >>> b/drivers/gpu/drm/amd/amdgpu/cik.c >>> index 7b63d7a..0f25b82 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c >>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev) >>> * to reset them. >>> * Returns 0 for success. >>> */ >>> -static int cik_asic_reset(struct amdgpu_device *adev) >>> +static int cik_asic_reset(struct amdgpu_device *adev, bool >>> +*vramlost) >>> { >>> int r; >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c >>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c >>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev) >>> return AMD_RESET_METHOD_MODE1; >>> } >>> >>> -static int nv_asic_reset(struct amdgpu_device *adev) >>> +static int nv_asic_reset(struct amdgpu_device *adev, bool >>> +*vramlost) >>> { >>> >>> /* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@ >>> static int nv_asic_reset(struct amdgpu_device *adev) >>> int ret = 0; >>> struct smu_context *smu = &adev->smu; >>> >>> - if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) >>> + if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { >>> + if (vramlost) >>> + *vramlost = true; >>> ret = smu_baco_reset(smu); >>> - else >>> + } >>> + else { >>> ret = nv_asic_mode1_reset(adev); >>> + } >>> >>> return ret; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c >>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/si.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c >>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev, >>> } >>> >>> //xxx: not implemented >>> -static int si_asic_reset(struct amdgpu_device *adev) >>> +static int si_asic_reset(struct amdgpu_device *adev, bool >>> +*vramlost) >>> { >>> return 0; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c >>> b/drivers/gpu/drm/amd/amdgpu/soc15.c >>> index fe2212df..12b2966 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c >>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev) >>> return AMD_RESET_METHOD_MODE1; >>> } >>> >>> -static int soc15_asic_reset(struct amdgpu_device *adev) >>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool >>> +*vramlost) >>> { >>> switch (soc15_asic_reset_method(adev)) { >>> case AMD_RESET_METHOD_BACO: >>> + if (vramlost) >>> + *vramlost = true; >>> return soc15_asic_baco_reset(adev); >>> case AMD_RESET_METHOD_MODE2: >>> return soc15_mode2_reset(adev); diff --git >>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c >>> index 56c882b..8eceb00 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c >>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev) >>> * to reset them. >>> * Returns 0 for success. >>> */ >>> -static int vi_asic_reset(struct amdgpu_device *adev) >>> +static int vi_asic_reset(struct amdgpu_device *adev, bool >>> +*vramlost) >>> { >>> int r; >>> _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx