Reviewed-By: Kent Russell <kent.russell@xxxxxxx> Kent -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Kim, Jonathan Sent: Tuesday, July 16, 2019 1:12 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: RE: [PATCH 3/3] drm/amdgpu: adding xgmi error monitoring Note patch 1 sent to internal amd mailing list for review. -----Original Message----- From: Kim, Jonathan <Jonathan.Kim@xxxxxxx> Sent: Tuesday, July 16, 2019 1:09 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; Kim, Jonathan <Jonathan.Kim@xxxxxxx> Subject: [PATCH 3/3] drm/amdgpu: adding xgmi error monitoring monitor xgmi errors via mc pie status through fica registers. Change-Id: Id80b6c2f635a294afe343cf55a03902e9a1787a5 Signed-off-by: Jonathan Kim <Jonathan.Kim@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 38 ++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index d11eba09eadd..4b87fda15ac5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -25,7 +25,7 @@ #include "amdgpu.h" #include "amdgpu_xgmi.h" #include "amdgpu_smu.h" - +#include "df/df_3_6_offset.h" static DEFINE_MUTEX(xgmi_mutex); @@ -131,9 +131,37 @@ static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, } +#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) +static ssize_t amdgpu_xgmi_show_error(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; + uint64_t fica_out; + unsigned int error_count = 0; + + ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); + ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); -static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); + fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_ctl_in); + if (fica_out != 0x1f) + pr_err("xGMI error counters not enabled!\n"); + + fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_status_in); + + if ((fica_out & 0xffff) == 2) + error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); + adev->df_funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); + + return snprintf(buf, PAGE_SIZE, "%d\n", error_count); } + + +static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, +NULL); static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, +NULL); static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, struct amdgpu_hive_info *hive) @@ -148,6 +176,12 @@ static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, return ret; } + /* Create xgmi error file */ + ret = device_create_file(adev->dev, &dev_attr_xgmi_error); + if (ret) + pr_err("failed to create xgmi_error\n"); + + /* Create sysfs link to hive info folder on the first device */ if (adev != hive->adev) { ret = sysfs_create_link(&adev->dev->kobj, hive->kobj, -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx