On Thu, Aug 18, 2022 at 03:38:12PM -0500, Bjorn Helgaas wrote:
> [Adding amdgpu folks]
>
> On Wed, Aug 17, 2022 at 11:45:15PM +0000, bugzilla-daemon@xxxxxxxxxx wrote:
> > https://bugzilla.kernel.org/show_bug.cgi?id=216373
> >
> > Bug ID: 216373
> > Summary: Uncorrected errors reported for AMD GPU
> > Kernel Version: v6.0-rc1
> > Regression: No

Tom, thanks for trying out "pci=noaer". Hopefully we won't need the
workaround for long.

Could I trouble you to try the debug patch below and see if we get any
stack trace clues in dmesg when the error happens? I'm sure the
experts would have a better approach, but I'm amdgpu-illiterate, so
this is all I can do :)

Bjorn

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c4a6fe3070b6..fc34c66776bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -130,6 +130,14 @@ const char *amdgpu_asic_name[] = {
 	"LAST",
 };
 
+void check_write(uint32_t v, void __iomem *base, uint32_t offset)
+{
+	if (offset == 0x7f000) {
+		pr_err("** writing %#010x to %px\n", v, base + offset);
+		dump_stack();
+	}
+}
+
 /**
  * DOC: pcie_replay_count
  *
@@ -512,9 +520,10 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 	if (amdgpu_device_skip_hw_access(adev))
 		return;
 
-	if (offset < adev->rmmio_size)
+	if (offset < adev->rmmio_size) {
+		check_write(value, adev->rmmio, offset);
 		writeb(value, adev->rmmio + offset);
-	else
+	} else
 		BUG();
 }
 
@@ -542,6 +551,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 			amdgpu_kiq_wreg(adev, reg, v);
 			up_read(&adev->reset_domain->sem);
 		} else {
+			check_write(v, adev->rmmio, reg * 4);
 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 		}
 	} else {
@@ -574,6 +584,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 	} else if ((reg * 4) >= adev->rmmio_size) {
 		adev->pcie_wreg(adev, reg * 4, v);
 	} else {
+		check_write(v, adev->rmmio, reg * 4);
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 	}
 }
@@ -689,6 +700,7 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 
+	check_write(reg_addr, adev->rmmio, pcie_index * 4);
 	writel(reg_addr, pcie_index_offset);
 	readl(pcie_index_offset);
 	r = readl(pcie_data_offset);
@@ -721,10 +733,12 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 
 	/* read low 32 bits */
+	check_write(reg_addr, adev->rmmio, pcie_index * 4);
 	writel(reg_addr, pcie_index_offset);
 	readl(pcie_index_offset);
 	r = readl(pcie_data_offset);
 	/* read high 32 bits */
+	check_write(reg_addr + 4, adev->rmmio, pcie_index * 4);
 	writel(reg_addr + 4, pcie_index_offset);
 	readl(pcie_index_offset);
 	r |= ((u64)readl(pcie_data_offset) << 32);
@@ -755,8 +769,10 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 
+	check_write(reg_addr, adev->rmmio, pcie_index * 4);
 	writel(reg_addr, pcie_index_offset);
 	readl(pcie_index_offset);
+	check_write(reg_data, adev->rmmio, pcie_data * 4);
 	writel(reg_data, pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
@@ -785,13 +801,17 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 
 	/* write low 32 bits */
+	check_write(reg_addr, adev->rmmio, pcie_index * 4);
 	writel(reg_addr, pcie_index_offset);
 	readl(pcie_index_offset);
+	check_write((u32)(reg_data & 0xffffffffULL), adev->rmmio, pcie_data * 4);
 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 	readl(pcie_data_offset);
 	/* write high 32 bits */
+	check_write(reg_addr + 4, adev->rmmio, pcie_index * 4);
 	writel(reg_addr + 4, pcie_index_offset);
 	readl(pcie_index_offset);
+	check_write((u32)(reg_data >> 32), adev->rmmio, pcie_data * 4);
 	writel((u32)(reg_data >> 32), pcie_data_offset);
 	readl(pcie_data_offset);
 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 9be57389301b..b552d7c27ec0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -36,6 +36,8 @@
 #include "soc15.h"
 #include "nv.h"
 
+extern void check_write(uint32_t v, void __iomem *base, uint32_t offset);
+
 #define POPULATE_UCODE_INFO(vf2pf_info, ucode, ver) \
 	do { \
 		vf2pf_info->ucode_info[ucode].id = ucode; \
@@ -900,11 +902,15 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v
 
 	if (offset == reg_access_ctrl->grbm_cntl) {
 		/* if the target reg offset is grbm_cntl, write to scratch_reg2 */
+		check_write(v, adev->rmmio, 4 * reg_access_ctrl->scratch_reg2);
 		writel(v, scratch_reg2);
+		check_write(v, adev->rmmio, offset * 4);
 		writel(v, ((void __iomem *)adev->rmmio) + (offset * 4));
 	} else if (offset == reg_access_ctrl->grbm_idx) {
 		/* if the target reg offset is grbm_idx, write to scratch_reg3 */
+		check_write(v, adev->rmmio, 4 * reg_access_ctrl->scratch_reg3);
 		writel(v, scratch_reg3);
+		check_write(v, adev->rmmio, offset * 4);
 		writel(v, ((void __iomem *)adev->rmmio) + (offset * 4));
 	} else {
 		/*
@@ -913,10 +919,14 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v
 		 * SCRATCH_REG1[19:0] = address in dword
 		 * SCRATCH_REG1[26:24] = Error reporting
 		 */
+		check_write(v, adev->rmmio, 4 * reg_access_ctrl->scratch_reg0);
 		writel(v, scratch_reg0);
+		check_write(offset | flag, adev->rmmio, 4 * reg_access_ctrl->scratch_reg1);
 		writel((offset | flag), scratch_reg1);
-		if (reg_access_ctrl->spare_int)
+		if (reg_access_ctrl->spare_int) {
+			check_write(1, adev->rmmio, 4 * reg_access_ctrl->spare_int);
 			writel(1, spare_int);
+		}
 
 		for (i = 0; i < timeout; i++) {
 			tmp = readl(scratch_reg1);