RE: [PATCH 3/4] drm/amdgpu: skip printing vram_lost if needed

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Alex Deucher <alexdeucher@xxxxxxxxx>
> Sent: Friday, August 16, 2024 9:52 PM
> To: Huang, Trigger <Trigger.Huang@xxxxxxx>
> Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Khatri, Sunil <Sunil.Khatri@xxxxxxx>;
> Deucher, Alexander <Alexander.Deucher@xxxxxxx>
> Subject: Re: [PATCH 3/4] drm/amdgpu: skip printing vram_lost if needed
>
> On Fri, Aug 16, 2024 at 3:55 AM <Trigger.Huang@xxxxxxx> wrote:
> >
> > From: Trigger Huang <Trigger.Huang@xxxxxxx>
> >
> > The vm lost status can only be obtained after a GPU reset occurs, but
> > sometimes a dev core dump can be happened before GPU reset. So a new
> > argument is added to tell the dev core dump implementation whether to
> > skip printing the vram_lost status in the dump.
> > And this patch is also trying to decouple the core dump function from
> > the GPU reset function, by replacing the argument amdgpu_reset_context
> > with amdgpu_job to specify the context for core dump.
> >
> > Signed-off-by: Trigger Huang <Trigger.Huang@xxxxxxx>
> > Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > ---
> >  .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 19
> > ++++++++++---------  .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> |  6 +++---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  2 +-
> >  3 files changed, 14 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> > index cf2b4dd4d865..a860f52d8bb0 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> > @@ -28,8 +28,9 @@
> >  #include "atom.h"
> >
> >  #ifndef CONFIG_DEV_COREDUMP
> > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> > -                    struct amdgpu_reset_context *reset_context)
> > +void amdgpu_coredump(struct amdgpu_device *adev, bool
> skip_vram_check,
> > +                    bool vram_lost, struct amdgpu_job *job)
> > +
> >  {
> >  }
> >  #else
> > @@ -315,7 +316,7 @@ amdgpu_devcoredump_read(char *buffer, loff_t
> offset, size_t count,
> >                 }
> >         }
> >
> > -       if (coredump->reset_vram_lost)
> > +       if (!(coredump->skip_vram_check) && coredump->reset_vram_lost)
> >                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>
> You might want to say something like:
> drm_printf(&p, "VRAM lost status skipped\n"); in the skip case so we know
> that we skipped it so users don't assume it wasn't lost.

Sounds good, I will add it.

Regardss.
Trigger

>
> Alex
>
> >
> >         return count - iter.remain;
> > @@ -326,12 +327,11 @@ static void amdgpu_devcoredump_free(void
> *data)
> >         kfree(data);
> >  }
> >
> > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> > -                    struct amdgpu_reset_context *reset_context)
> > +void amdgpu_coredump(struct amdgpu_device *adev, bool
> skip_vram_check,
> > +                    bool vram_lost, struct amdgpu_job *job)
> >  {
> > -       struct amdgpu_coredump_info *coredump;
> >         struct drm_device *dev = adev_to_drm(adev);
> > -       struct amdgpu_job *job = reset_context->job;
> > +       struct amdgpu_coredump_info *coredump;
> >         struct drm_sched_job *s_job;
> >
> >         coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); @@ -341,11
> > +341,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool
> vram_lost,
> >                 return;
> >         }
> >
> > +       coredump->skip_vram_check = skip_vram_check;
> >         coredump->reset_vram_lost = vram_lost;
> >
> > -       if (reset_context->job && reset_context->job->vm) {
> > +       if (job && job->vm) {
> >                 struct amdgpu_task_info *ti;
> > -               struct amdgpu_vm *vm = reset_context->job->vm;
> > +               struct amdgpu_vm *vm = job->vm;
> >
> >                 ti = amdgpu_vm_get_task_info_vm(vm);
> >                 if (ti) {
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> > index 52459512cb2b..c4e522e49251 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h
> > @@ -26,7 +26,6 @@
> >  #define __AMDGPU_DEV_COREDUMP_H__
> >
> >  #include "amdgpu.h"
> > -#include "amdgpu_reset.h"
> >
> >  #ifdef CONFIG_DEV_COREDUMP
> >
> > @@ -36,12 +35,13 @@ struct amdgpu_coredump_info {
> >         struct amdgpu_device            *adev;
> >         struct amdgpu_task_info         reset_task_info;
> >         struct timespec64               reset_time;
> > +       bool                            skip_vram_check;
> >         bool                            reset_vram_lost;
> >         struct amdgpu_ring              *ring;
> >  };
> >  #endif
> >
> > -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> > -                    struct amdgpu_reset_context *reset_context);
> > +void amdgpu_coredump(struct amdgpu_device *adev, bool
> skip_vram_check,
> > +                    bool vram_lost, struct amdgpu_job *job);
> >
> >  #endif
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 9885d0606b0a..825cc62cd75d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -5445,7 +5445,7 @@ int amdgpu_do_asic_reset(struct list_head
> *device_list_handle,
> >                                 vram_lost =
> > amdgpu_device_check_vram_lost(tmp_adev);
> >
> >                                 if (amdgpu_gpu_coredump &&
> (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)))
> > -                                       amdgpu_coredump(tmp_adev, vram_lost,
> reset_context);
> > +                                       amdgpu_coredump(tmp_adev,
> > + false, vram_lost, reset_context->job);
> >
> >                                 if (vram_lost) {
> >                                         DRM_INFO("VRAM is lost due to
> > GPU reset!\n");
> > --
> > 2.34.1
> >




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux