Re: amdgpu 100% CPU usage causing freeze 1002:15d8

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Another, potentially better, patch to try.

Alex

On Tue, Jan 28, 2025 at 10:07 AM Alex Deucher <alexdeucher@xxxxxxxxx> wrote:
>
> Can you try the attached patch (with no other patches applied)?  I
> think it should fix the issue.
>
> Alex
>
> On Sat, Jan 25, 2025 at 1:38 PM Marco Moock <mm@xxxxxxxxxx> wrote:
> >
> > Am 24.01.2025 um 16:40:37 Uhr schrieb Alex Deucher:
> >
> > > On Fri, Jan 24, 2025 at 9:17 AM Marco Moock <mm@xxxxxxxxxx> wrote:
> > > >
> > > > Am 20.01.2025 um 11:35:07 Uhr schrieb Alex Deucher:
> > > >
> > > > > On Thu, Jan 16, 2025 at 11:57 AM Marco Moock <mm@xxxxxxxxxx>
> > > > > wrote:
> > > > > >
> > > > > > Am 16.01.2025 um 11:32:42 Uhr schrieb Alex Deucher:
> > > > > >
> > > > > > > I'd like to see the driver messages leading up to that.
> > > > > >
> > > > > > I've now attached the entire dmesg without the firewall stuff.
> > > > >
> > > > > Does the attached test patch help?
> > > >
> > > > I've now compiled a kernel with the patch.
> > > > It doesn't change the freeze problem.
> > >
> > > Thanks,
> > >
> > > Does setting amdgpu.ppfeaturemask=0xfff73fff on the kernel command
> > > line in grub help?
> >
> > No crash anymore.
> >
> >
> > --
> > Gruß
> > Marco
> >
> > Send unsolicited bulk mail to 1737733237muell@xxxxxxxxxxxxxx
From 81a31522999f6a62ae89ea5a8325d8573dd775f7 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@xxxxxxx>
Date: Tue, 28 Jan 2025 11:55:22 -0500
Subject: [PATCH] drm/amdgpu/gfx9: manually control gfxoff for CS

When mesa started using compute queues more often
we started seeing additional hangs with compute queues.
Disabling gfxoff seems to mitigate that.  Manually
control gfxoff with command submissions to avoid
any issues related to gfxoff.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3861
Link: https://lists.freedesktop.org/archives/amd-gfx/2025-January/119116.html
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 38 ++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 4fe97f3382a64..1d2f641d1dd94 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -7445,6 +7445,32 @@ static void gfx_v9_0_ring_emit_cleaner_shader(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);  /* RESERVED field, programmed to zero */
 }
 
+static void gfx_v9_0_ring_begin_use(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	/* Raven and PCO APUs seem to have stability issues
+	 * with compute and gfxoff.  Disallow gfxoff during
+	 * submission and allow again afterwards.
+	 */
+	amdgpu_gfx_off_ctrl(adev, false);
+
+	amdgpu_gfx_enforce_isolation_ring_begin_use(ring);
+}
+
+static void gfx_v9_0_ring_end_use(struct amdgpu_ring *ring)
+{
+	struct amdgpu_device *adev = ring->adev;
+
+	amdgpu_gfx_enforce_isolation_ring_end_use(ring);
+
+	/* Raven and PCO APUs seem to have stability issues
+	 * with compute and gfxoff.  gfxoff was disallowed in
+	 * begin_use for the submission; allow it again here.
+	 */
+	amdgpu_gfx_off_ctrl(adev, true);
+}
+
 static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
 	.name = "gfx_v9_0",
 	.early_init = gfx_v9_0_early_init,
@@ -7518,8 +7544,8 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_mem_sync = gfx_v9_0_emit_mem_sync,
 	.reset = gfx_v9_0_reset_kgq,
 	.emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
-	.begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use,
-	.end_use = amdgpu_gfx_enforce_isolation_ring_end_use,
+	.begin_use = gfx_v9_0_ring_begin_use,
+	.end_use = gfx_v9_0_ring_end_use,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
@@ -7578,8 +7604,8 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
 	.patch_de = gfx_v9_0_ring_patch_de_meta,
 	.patch_ce = gfx_v9_0_ring_patch_ce_meta,
 	.emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
-	.begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use,
-	.end_use = amdgpu_gfx_enforce_isolation_ring_end_use,
+	.begin_use = gfx_v9_0_ring_begin_use,
+	.end_use = gfx_v9_0_ring_end_use,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
@@ -7621,8 +7647,8 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.emit_wave_limit = gfx_v9_0_emit_wave_limit,
 	.reset = gfx_v9_0_reset_kcq,
 	.emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
-	.begin_use = amdgpu_gfx_enforce_isolation_ring_begin_use,
-	.end_use = amdgpu_gfx_enforce_isolation_ring_end_use,
+	.begin_use = gfx_v9_0_ring_begin_use,
+	.end_use = gfx_v9_0_ring_end_use,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
-- 
2.48.1


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux