If a 2nd fault comes in before the 1st is handled, the 1st fault will clear out the FAULT STATUS registers before the 2nd fault is handled. Thus we get a lot of zeroes. If status=0, just skip the L2 fault status information, to avoid confusion of why some VM fault status prints in dmesg are all zeroes. Signed-off-by: Kent Russell <kent.russell@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 ++++++++++ 4 files changed, 40 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 5cf2002fcba8..14a52c33bffa 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -175,6 +175,16 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, addr, entry->client_id, soc15_ih_clientid_name[entry->client_id]); + /* If status=0, we don't have anything to process. Most often, this is + * caused by a 2nd fault coming in before the 1st one is handled. + * Once the 1st fault is handled, the fault registers are cleared, so + * we have nothing to print for fault #2. In that case, don't try to + * print the fault status information. The information above is + * sufficient to note that another fault occurred. + */ + if (!status) + return 0; + if (!amdgpu_sriov_vf(adev)) hub->vmhub_funcs->print_l2_protection_fault_status(adev, status); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index 4df4d73038f8..c5fe31070cf3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -144,6 +144,16 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n", addr, entry->client_id); + /* If status=0, we don't have anything to process. Most often, this is + * caused by a 2nd fault coming in before the 1st one is handled. + * Once the 1st fault is handled, the fault registers are cleared, so + * we have nothing to print for fault #2. In that case, don't try to + * print the fault status information. The information above is + * sufficient to note that another fault occurred. + */ + if (!status) + return 0; + if (!amdgpu_sriov_vf(adev)) hub->vmhub_funcs->print_l2_protection_fault_status(adev, status); } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index e33f9e9058cc..fcfe512f1271 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -137,6 +137,16 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev, dev_err(adev->dev, " in page starting at address 0x%016llx from client %d\n", addr, entry->client_id); + /* If status=0, we don't have anything to process. Most often, this is + * caused by a 2nd fault coming in before the 1st one is handled. + * Once the 1st fault is handled, the fault registers are cleared, so + * we have nothing to print for fault #2. In that case, don't try to + * print the fault status information. The information above is + * sufficient to note that another fault occurred. + */ + if (!status) + return 0; + if (!amdgpu_sriov_vf(adev)) hub->vmhub_funcs->print_l2_protection_fault_status(adev, status); } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 010db0e58650..6da2ca28375e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -675,6 +675,16 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, if (!amdgpu_sriov_vf(adev)) WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + /* If status=0, we don't have anything to process. Most often, this is + * caused by a 2nd fault coming in before the 1st one is handled. + * Once the 1st fault is handled, the fault registers are cleared, so + * we have nothing to print for fault #2. In that case, don't try to + * print the fault status information. The information above is + * sufficient to note that another fault occurred. + */ + if (!status) + return 0; + amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub); dev_err(adev->dev, -- 2.43.0