add amdgpu ras 'event_state' sysfs device attribute support Signed-off-by: Yang Wang <kevinyang.wang@xxxxxxx> Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 56 +++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 +++- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ca09316fbb6a..be053e168b64 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1731,6 +1731,39 @@ static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, return sysfs_emit(buf, "schema: 0x%x\n", con->schema); } +static struct { + enum ras_event_type type; + const char *name; +} dump_event[] = { + {RAS_EVENT_TYPE_FATAL, "Fatal Error"}, + {RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, + {RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, +}; + +static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct amdgpu_ras *con = + container_of(attr, struct amdgpu_ras, event_state_attr); + struct ras_event_manager *event_mgr = con->event_mgr; + struct ras_event_state *event_state; + int i, size = 0; + + if (!event_mgr) + return -EINVAL; + + size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); + for (i = 0; i < ARRAY_SIZE(dump_event); i++) { + event_state = &event_mgr->event_state[dump_event[i].type]; + size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", + dump_event[i].name, + atomic64_read(&event_state->count), + event_state->last_seqno); + } + + return (ssize_t)size; +} + static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1748,6 +1781,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev) &con->features_attr.attr, &con->version_attr.attr, &con->schema_attr.attr, + &con->event_state_attr.attr, NULL }; struct attribute_group group = { @@ -1980,6 +2014,8 @@ static DEVICE_ATTR(version, 0444, amdgpu_ras_sysfs_version_show, NULL); static DEVICE_ATTR(schema, 0444, amdgpu_ras_sysfs_schema_show, NULL); +static DEVICE_ATTR(event_state, 0444, + amdgpu_ras_sysfs_event_state_show, NULL); static int amdgpu_ras_fs_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1990,6 +2026,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) &con->features_attr.attr, &con->version_attr.attr, &con->schema_attr.attr, + &con->event_state_attr.attr, NULL }; struct bin_attribute *bin_attrs[] = { @@ -2012,6 +2049,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) con->schema_attr = dev_attr_schema; sysfs_attr_init(attrs[2]); + /* add event_state entry */ + con->event_state_attr = dev_attr_event_state; + sysfs_attr_init(attrs[3]); + if (amdgpu_bad_page_threshold != 0) { /* add bad_page_features entry */ bin_attr_gpu_vram_bad_pages.private = NULL; @@ -3440,13 +3481,17 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev) static void ras_event_mgr_init(struct ras_event_manager *mgr) { + struct ras_event_state *event_state; int i; memset(mgr, 0, sizeof(*mgr)); atomic64_set(&mgr->seqno, 0); - for (i = 0; i < ARRAY_SIZE(mgr->last_seqno); i++) - mgr->last_seqno[i] = RAS_EVENT_INVALID_ID; + for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { + event_state = &mgr->event_state[i]; + event_state->last_seqno = RAS_EVENT_INVALID_ID; + atomic64_set(&event_state->count, 0); + } } static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -3960,6 +4005,7 @@ static struct ras_event_manager* __get_ras_event_mgr(struct amdgpu_device *adev) int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, const void *caller) { struct ras_event_manager *event_mgr; + struct ras_event_state *event_state; int ret = 0; if (type >= RAS_EVENT_TYPE_COUNT) { @@ -3973,7 +4019,9 @@ int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_ goto out; } - event_mgr->last_seqno[type] = atomic64_inc_return(&event_mgr->seqno); + event_state = &event_mgr->event_state[type]; + event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); + atomic64_inc(&event_state->count); out: if (ret && caller) @@ -3999,7 +4047,7 @@ u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type if (!event_mgr) return RAS_EVENT_INVALID_ID; - id = event_mgr->last_seqno[type]; + id = event_mgr->event_state[type].last_seqno; break; case RAS_EVENT_TYPE_INVALID: default: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 49ec8edcbe39..88a427a1c8cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -439,10 +439,14 @@ enum ras_event_type { RAS_EVENT_TYPE_POISON_CONSUMPTION, RAS_EVENT_TYPE_COUNT, }; +struct ras_event_state { + u64 last_seqno; + atomic64_t count; +}; struct ras_event_manager { atomic64_t seqno; - u64 last_seqno[RAS_EVENT_TYPE_COUNT]; + struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT]; }; struct ras_event_id { @@ -496,6 +500,7 @@ struct amdgpu_ras { struct device_attribute features_attr; struct device_attribute version_attr; struct device_attribute schema_attr; + struct device_attribute event_state_attr; struct bin_attribute badpages_attr; struct dentry *de_ras_eeprom_table; /* block array */ -- 2.34.1