On Fri, Aug 16, 2024 at 3:55 AM <Trigger.Huang@xxxxxxx> wrote: > > From: Trigger Huang <Trigger.Huang@xxxxxxx> > > The VRAM lost status can only be obtained after a GPU reset occurs, but > sometimes a dev core dump can happen before the GPU reset. So a new > argument is added to tell the dev core dump implementation whether to > skip printing the vram_lost status in the dump. > And this patch is also trying to decouple the core dump function from > the GPU reset function, by replacing the argument amdgpu_reset_context > with amdgpu_job to specify the context for core dump. > > Signed-off-by: Trigger Huang <Trigger.Huang@xxxxxxx> > Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c | 19 ++++++++++--------- > .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h | 6 +++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- > 3 files changed, 14 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > index cf2b4dd4d865..a860f52d8bb0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c > @@ -28,8 +28,9 @@ > #include "atom.h" > > #ifndef CONFIG_DEV_COREDUMP > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > - struct amdgpu_reset_context *reset_context) > +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, > + bool vram_lost, struct amdgpu_job *job) > + > { > } > #else > @@ -315,7 +316,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, > } > } > > - if (coredump->reset_vram_lost) > + if (!(coredump->skip_vram_check) && coredump->reset_vram_lost) > drm_printf(&p, "VRAM is lost due to GPU reset!\n"); You might want to say something like: drm_printf(&p, "VRAM lost status skipped\n"); in the skip case, so that it is clear the check was skipped and users don't assume VRAM wasn't lost. 
Alex > > return count - iter.remain; > @@ -326,12 +327,11 @@ static void amdgpu_devcoredump_free(void *data) > kfree(data); > } > > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > - struct amdgpu_reset_context *reset_context) > +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, > + bool vram_lost, struct amdgpu_job *job) > { > - struct amdgpu_coredump_info *coredump; > struct drm_device *dev = adev_to_drm(adev); > - struct amdgpu_job *job = reset_context->job; > + struct amdgpu_coredump_info *coredump; > struct drm_sched_job *s_job; > > coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); > @@ -341,11 +341,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > return; > } > > + coredump->skip_vram_check = skip_vram_check; > coredump->reset_vram_lost = vram_lost; > > - if (reset_context->job && reset_context->job->vm) { > + if (job && job->vm) { > struct amdgpu_task_info *ti; > - struct amdgpu_vm *vm = reset_context->job->vm; > + struct amdgpu_vm *vm = job->vm; > > ti = amdgpu_vm_get_task_info_vm(vm); > if (ti) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > index 52459512cb2b..c4e522e49251 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h > @@ -26,7 +26,6 @@ > #define __AMDGPU_DEV_COREDUMP_H__ > > #include "amdgpu.h" > -#include "amdgpu_reset.h" > > #ifdef CONFIG_DEV_COREDUMP > > @@ -36,12 +35,13 @@ struct amdgpu_coredump_info { > struct amdgpu_device *adev; > struct amdgpu_task_info reset_task_info; > struct timespec64 reset_time; > + bool skip_vram_check; > bool reset_vram_lost; > struct amdgpu_ring *ring; > }; > #endif > > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, > - struct amdgpu_reset_context *reset_context); > +void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check, > + bool vram_lost, struct amdgpu_job *job); > > #endif > 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 9885d0606b0a..825cc62cd75d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5445,7 +5445,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, > vram_lost = amdgpu_device_check_vram_lost(tmp_adev); > > if (amdgpu_gpu_coredump && (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))) > - amdgpu_coredump(tmp_adev, vram_lost, reset_context); > + amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job); > > if (vram_lost) { > DRM_INFO("VRAM is lost due to GPU reset!\n"); > -- > 2.34.1 >