[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Alex Deucher <alexdeucher@xxxxxxxxx> > Sent: Friday, August 16, 2024 9:52 PM > To: Huang, Trigger <Trigger.Huang@xxxxxxx> > Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Khatri, Sunil <Sunil.Khatri@xxxxxxx>; > Deucher, Alexander <Alexander.Deucher@xxxxxxx> > Subject: Re: [PATCH 3/4] drm/amdgpu: skip printing vram_lost if needed > > On Fri, Aug 16, 2024 at 3:55 AM <Trigger.Huang@xxxxxxx> wrote: > > > > From: Trigger Huang <Trigger.Huang@xxxxxxx> > > > > The vm lost status can only be obtained after a GPU reset occurs, but > > sometimes a dev core dump can be happened before GPU reset. So a new > > argument is added to tell the dev core dump implementation whether to > > skip printing the vram_lost status in the dump. > > And this patch is also trying to decouple the core dump function from > > the GPU reset function, by replacing the argument amdgpu_reset_context > > with amdgpu_job to specify the context for core dump. > > > > Signed-off-by: Trigger Huang <Trigger.Huang@xxxxxxx> > > Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx> > > --- > > .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 19 > > ++++++++++--------- .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > | 6 +++--- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > > 3 files changed, 14 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > > index cf2b4dd4d865..a860f52d8bb0 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > > @@ -28,8 +28,9 @@ > > #include "atom.h" > > > > #ifndef CONFIG_DEV_COREDUMP > > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > > - struct amdgpu_reset_context *reset_context) > > +void amdgpu_coredump(struct amdgpu_device *adev, bool > skip_vram_check, > > + bool vram_lost, struct amdgpu_job *job) > > + > > { > > } > > #else > > @@ -315,7 +316,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t > offset, size_t count, > > } > > } > > > > - if (coredump->reset_vram_lost) > > + if (!(coredump->skip_vram_check) && coredump->reset_vram_lost) > > drm_printf(&p, "VRAM is lost due to GPU reset!\n"); > > You might want to say something like: > drm_printf(&p, "VRAM lost status skipped\n"); in the skip case so we know > that we skipped it so users don't assume it wasn't lost. Sounds good, I will add it. Regardss. Trigger > > Alex > > > > > return count - iter.remain; > > @@ -326,12 +327,11 @@ static void amdgpu_devcoredump_free(void > *data) > > kfree(data); > > } > > > > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > > - struct amdgpu_reset_context *reset_context) > > +void amdgpu_coredump(struct amdgpu_device *adev, bool > skip_vram_check, > > + bool vram_lost, struct amdgpu_job *job) > > { > > - struct amdgpu_coredump_info *coredump; > > struct drm_device *dev = adev_to_drm(adev); > > - struct amdgpu_job *job = reset_context->job; > > + struct amdgpu_coredump_info *coredump; > > struct drm_sched_job *s_job; > > > > coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); @@ -341,11 > > +341,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool > vram_lost, > > return; > > } > > > > + coredump->skip_vram_check = skip_vram_check; > > coredump->reset_vram_lost = vram_lost; > > > > - if (reset_context->job && reset_context->job->vm) { > > + if (job && job->vm) { > > struct amdgpu_task_info *ti; > > - struct amdgpu_vm *vm = reset_context->job->vm; > > + struct amdgpu_vm *vm = job->vm; > > > > ti = amdgpu_vm_get_task_info_vm(vm); > > if (ti) { > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > > index 52459512cb2b..c4e522e49251 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > > @@ -26,7 +26,6 @@ > > #define __AMDGPU_DEV_COREDUMP_H__ > > > > #include "amdgpu.h" > > -#include "amdgpu_reset.h" > > > > #ifdef CONFIG_DEV_COREDUMP > > > > @@ -36,12 +35,13 @@ struct amdgpu_coredump_info { > > struct amdgpu_device *adev; > > struct amdgpu_task_info reset_task_info; > > struct timespec64 reset_time; > > + bool skip_vram_check; > > bool reset_vram_lost; > > struct amdgpu_ring *ring; > > }; > > #endif > > > > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > > - struct amdgpu_reset_context *reset_context); > > +void amdgpu_coredump(struct amdgpu_device *adev, bool > skip_vram_check, > > + bool vram_lost, struct amdgpu_job *job); > > > > #endif > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index 9885d0606b0a..825cc62cd75d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -5445,7 +5445,7 @@ int amdgpu_do_asic_reset(struct list_head > *device_list_handle, > > vram_lost = > > amdgpu_device_check_vram_lost(tmp_adev); > > > > if (amdgpu_gpu_coredump && > (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))) > > - amdgpu_coredump(tmp_adev, vram_lost, > reset_context); > > + amdgpu_coredump(tmp_adev, > > + false, vram_lost, reset_context->job); > > > > if (vram_lost) { > > DRM_INFO("VRAM is lost due to > > GPU reset!\n"); > > -- > > 2.34.1 > >