On 1/13/2025 7:12 AM, Jiang Liu wrote: > Add helper functions to track status for ras manager and ip blocks. > > Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 38 +++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 +++++++ > 3 files changed, 85 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 5e55a44f9eef..f0f773659faf 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -377,12 +377,28 @@ int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block); > > #define AMDGPU_MAX_IP_NUM 16 > > +enum amdgpu_marker { > + // Markers for IRQs, used for both ip blocks and ras blocks. > + AMDGPU_MARKER_IRQ0 = 32, > + AMDGPU_MARKER_IRQ1, > + AMDGPU_MARKER_IRQ2, > + AMDGPU_MARKER_IRQ3, > + AMDGPU_MARKER_IRQ4, > + AMDGPU_MARKER_IRQ5, > + AMDGPU_MARKER_IRQ6, > + AMDGPU_MARKER_IRQ7, > + AMDGPU_MARKER_IRQ_MAX = 63, > +}; > + > +#define AMDGPU_MARKER_IRQ(idx) (AMDGPU_MARKER_IRQ0 + (idx)) > + > struct amdgpu_ip_block_status { > bool valid; > bool sw; > bool hw; > bool late_initialized; > bool hang; > + uint64_t markers; > }; > These fine-grained levels maintained at the IP layer don't look like a proper solution. An IP or RAS block either has the required IRQs enabled or has them disabled. Unwinding them needs to be tracked at the IRQ object layer, not here. 
Thanks, Lijo > struct amdgpu_ip_block_version { > @@ -410,6 +426,28 @@ amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev, > int amdgpu_device_ip_block_add(struct amdgpu_device *adev, > const struct amdgpu_ip_block_version *ip_block_version); > > +static inline void amdgpu_ip_block_set_marker(struct amdgpu_ip_block *ip_block, > + enum amdgpu_marker marker) > +{ > + WARN_ON(marker > 63); > + WARN_ON(ip_block->status.markers & (0x1ull << marker)); > + ip_block->status.markers |= 0x1ull << (int)marker; > +} > + > +static inline bool amdgpu_ip_block_test_and_clear_marker(struct amdgpu_ip_block *ip_block, > + enum amdgpu_marker marker) > +{ > + bool set = false; > + uint64_t value = 0x1ull << (int)marker; > + > + if ((ip_block->status.markers & value) != 0) { > + ip_block->status.markers &= ~value; > + set = true; > + } > + > + return set; > +} > + > /* > * BIOS. > */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index f0924aa3f4e4..5e19d820ab34 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -5207,3 +5207,40 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev) > > return con->is_rma; > } > + > +bool amdgpu_ras_test_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, int marker) > +{ > + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); > + > + if (obj && obj->markers & (0x1ull << marker)) > + return true; > + > + return false; > +} > + > +void amdgpu_ras_set_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, int marker) > +{ > + struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); > + > + WARN_ON(marker > 63); > + WARN_ON(obj->markers & (0x1ull << marker)); > + if (obj) > + obj->markers |= 0x1ull << marker; > +} > + > +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, int marker) > +{ > + bool set = false; > + uint64_t value = 0x1ull << marker; > + struct 
ras_manager *obj = amdgpu_ras_find_obj(adev, head); > + > + if (obj && (obj->markers & value) != 0) { > + obj->markers &= ~value; > + set = true; > + } > + > + return set; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 82db986c36a0..35881087b17b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -634,6 +634,8 @@ struct ras_manager { > struct ras_common_if head; > /* reference count */ > int use; > + /* Flags for status tracking */ > + uint64_t markers; > /* ras block link */ > struct list_head node; > /* the device */ > @@ -977,4 +979,12 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, > const char *fmt, ...); > > bool amdgpu_ras_is_rma(struct amdgpu_device *adev); > + > +bool amdgpu_ras_test_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, int marker); > +void amdgpu_ras_set_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, int marker); > +bool amdgpu_ras_test_and_clear_marker(struct amdgpu_device *adev, > + struct ras_common_if *head, > + int marker); > #endif