Signed-off-by: xinhui pan <xinhui.pan@xxxxxxx> Reviewed-by: Feifei Xu <Feifei.Xu@xxxxxxx> Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> --- tests/amdgpu/Makefile.am | 3 +- tests/amdgpu/amdgpu_test.c | 11 + tests/amdgpu/amdgpu_test.h | 22 ++ tests/amdgpu/meson.build | 2 +- tests/amdgpu/ras_tests.c | 594 +++++++++++++++++++++++++++++++++++++ 5 files changed, 630 insertions(+), 2 deletions(-) create mode 100644 tests/amdgpu/ras_tests.c diff --git a/tests/amdgpu/Makefile.am b/tests/amdgpu/Makefile.am index 447ff217..48278848 100644 --- a/tests/amdgpu/Makefile.am +++ b/tests/amdgpu/Makefile.am @@ -33,4 +33,5 @@ amdgpu_test_SOURCES = \ vcn_tests.c \ uve_ib.h \ deadlock_tests.c \ - vm_tests.c + vm_tests.c \ + ras_tests.c diff --git a/tests/amdgpu/amdgpu_test.c b/tests/amdgpu/amdgpu_test.c index a793ca7d..8fc7a0b9 100644 --- a/tests/amdgpu/amdgpu_test.c +++ b/tests/amdgpu/amdgpu_test.c @@ -56,6 +56,7 @@ #define UVD_ENC_TESTS_STR "UVD ENC Tests" #define DEADLOCK_TESTS_STR "Deadlock Tests" #define VM_TESTS_STR "VM Tests" +#define RAS_TESTS_STR "RAS Tests" /** * Open handles for amdgpu devices @@ -116,6 +117,12 @@ static CU_SuiteInfo suites[] = { .pCleanupFunc = suite_vm_tests_clean, .pTests = vm_tests, }, + { + .pName = RAS_TESTS_STR, + .pInitFunc = suite_ras_tests_init, + .pCleanupFunc = suite_ras_tests_clean, + .pTests = ras_tests, + }, CU_SUITE_INFO_NULL, }; @@ -165,6 +172,10 @@ static Suites_Active_Status suites_active_stat[] = { .pName = VM_TESTS_STR, .pActive = suite_vm_tests_enable, }, + { + .pName = RAS_TESTS_STR, + .pActive = suite_ras_tests_enable, + }, }; diff --git a/tests/amdgpu/amdgpu_test.h b/tests/amdgpu/amdgpu_test.h index af81eea8..bcd0bc7e 100644 --- a/tests/amdgpu/amdgpu_test.h +++ b/tests/amdgpu/amdgpu_test.h @@ -194,6 +194,28 @@ CU_BOOL suite_vm_tests_enable(void); */ extern CU_TestInfo vm_tests[]; + +/** + * Initialize ras test suite + */ +int suite_ras_tests_init(); + +/** + * Deinitialize deadlock test suite + */ +int suite_ras_tests_clean(); + +/** + * Decide if the suite is enabled by default or not. + */ +CU_BOOL suite_ras_tests_enable(void); + +/** + * Tests in ras test suite + */ +extern CU_TestInfo ras_tests[]; + + /** * Helper functions */ diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build index 4c1237c6..95ed9305 100644 --- a/tests/amdgpu/meson.build +++ b/tests/amdgpu/meson.build @@ -24,7 +24,7 @@ if dep_cunit.found() files( 'amdgpu_test.c', 'basic_tests.c', 'bo_tests.c', 'cs_tests.c', 'vce_tests.c', 'uvd_enc_tests.c', 'vcn_tests.c', 'deadlock_tests.c', - 'vm_tests.c', + 'vm_tests.c', 'ras_tests.c', ), dependencies : [dep_cunit, dep_threads], include_directories : [inc_root, inc_drm, include_directories('../../amdgpu')], diff --git a/tests/amdgpu/ras_tests.c b/tests/amdgpu/ras_tests.c new file mode 100644 index 00000000..989eb153 --- /dev/null +++ b/tests/amdgpu/ras_tests.c @@ -0,0 +1,594 @@ +/* + * Copyright 2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * +*/ + +#include "CUnit/Basic.h" + +#include "amdgpu_test.h" +#include "amdgpu_drm.h" +#include "amdgpu_internal.h" +#include <unistd.h> +#include <fcntl.h> +#include <stdio.h> +#include "xf86drm.h" + +const char *ras_block_string[] = { + "umc", + "sdma", + "gfx", + "mmhub", + "athub", + "pcie_bif", + "hdp", + "xgmi_wafl", + "df", + "smn", + "sem", + "mp0", + "mp1", + "fuse", +}; + +#define ras_block_str(i) (ras_block_string[i]) + +enum amdgpu_ras_block { + AMDGPU_RAS_BLOCK__UMC = 0, + AMDGPU_RAS_BLOCK__SDMA, + AMDGPU_RAS_BLOCK__GFX, + AMDGPU_RAS_BLOCK__MMHUB, + AMDGPU_RAS_BLOCK__ATHUB, + AMDGPU_RAS_BLOCK__PCIE_BIF, + AMDGPU_RAS_BLOCK__HDP, + AMDGPU_RAS_BLOCK__XGMI_WAFL, + AMDGPU_RAS_BLOCK__DF, + AMDGPU_RAS_BLOCK__SMN, + AMDGPU_RAS_BLOCK__SEM, + AMDGPU_RAS_BLOCK__MP0, + AMDGPU_RAS_BLOCK__MP1, + AMDGPU_RAS_BLOCK__FUSE, + + AMDGPU_RAS_BLOCK__LAST +}; + +#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST +#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1) + +enum amdgpu_ras_error_type { + AMDGPU_RAS_ERROR__NONE = 0, + AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE = 2, + AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE = 4, + AMDGPU_RAS_ERROR__POISON = 8, +}; + +struct ras_common_if { + enum amdgpu_ras_block block; + enum amdgpu_ras_error_type type; + uint32_t sub_block_index; + char name[32]; +}; + +struct ras_inject_if { + struct ras_common_if head; + uint64_t address; + uint64_t value; +}; + +struct ras_debug_if { + union { + struct ras_common_if head; + struct ras_inject_if inject; + }; + int op; +}; +/* for now, only umc, gfx, sdma has implemented. */ +static uint32_t ras_block_mask_inject_query = (1 << AMDGPU_RAS_BLOCK__UMC); + +static uint32_t ras_block_mask_basic = (1 << AMDGPU_RAS_BLOCK__UMC) + | (1 << AMDGPU_RAS_BLOCK__SDMA) + | (1 << AMDGPU_RAS_BLOCK__GFX); + +struct amdgpu_ras_data { + amdgpu_device_handle device_handle; + uint32_t id; + uint32_t capability; +}; + +/* all devices who has ras supported */ +static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED]; +static int devices_count; + +static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle) +{ + union { + uint64_t feature_mask; + struct { + uint32_t enabled_features; + uint32_t supported_features; + }; + } features = { 0 }; + int ret; + + ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, + sizeof(features), &features); + if (ret) + return 0; + + return features.supported_features; +} + +static int get_file_contents(char *file, char *buf, int size); + +static int amdgpu_ras_lookup_id(drmDevicePtr device) +{ + char path[1024]; + char str[128]; + drmPciBusInfo info; + int i; + int ret; + + for (i = 0; i < MAX_CARDS_SUPPORTED; i++) { + memset(str, 0, sizeof(str)); + sprintf(path, "/sys/kernel/debug/dri/%d/name", i); + if (get_file_contents(path, str, sizeof(str)) <= 0) + continue; + + ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx", + &info.domain, &info.bus, &info.dev, &info.func); + if (ret != 4) + continue; + + if (memcmp(&info, device->businfo.pci, sizeof(info)) == 0) + return i; + } + return -1; +} + +CU_BOOL suite_ras_tests_enable(void) +{ + amdgpu_device_handle device_handle; + uint32_t major_version; + uint32_t minor_version; + int i; + drmDevicePtr device; + + for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { + if (amdgpu_device_initialize(drm_amdgpu[i], &major_version, + &minor_version, &device_handle)) + continue; + + if (drmGetDevice2(drm_amdgpu[i], + DRM_DEVICE_GET_PCI_REVISION, + &device)) + continue; + + if (device->bustype == DRM_BUS_PCI && + amdgpu_ras_lookup_capability(device_handle)) { + amdgpu_device_deinitialize(device_handle); + return CU_TRUE; + } + + if (amdgpu_device_deinitialize(device_handle)) + continue; + } + + return CU_FALSE; +} + +int suite_ras_tests_init(void) +{ + drmDevicePtr device; + amdgpu_device_handle device_handle; + uint32_t major_version; + uint32_t minor_version; + uint32_t capability; + int id; + int i; + int r; + + for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) { + r = amdgpu_device_initialize(drm_amdgpu[i], &major_version, + &minor_version, &device_handle); + if (r) + continue; + + if (drmGetDevice2(drm_amdgpu[i], + DRM_DEVICE_GET_PCI_REVISION, + &device)) { + amdgpu_device_deinitialize(device_handle); + continue; + } + + if (device->bustype != DRM_BUS_PCI) { + amdgpu_device_deinitialize(device_handle); + continue; + } + + capability = amdgpu_ras_lookup_capability(device_handle); + if (capability == 0) { + amdgpu_device_deinitialize(device_handle); + continue; + + } + + id = amdgpu_ras_lookup_id(device); + if (id == -1) { + amdgpu_device_deinitialize(device_handle); + continue; + } + + devices[devices_count++] = (struct amdgpu_ras_data) { + device_handle, id, capability + }; + } + + if (devices_count == 0) + return CUE_SINIT_FAILED; + + return CUE_SUCCESS; +} + +int suite_ras_tests_clean(void) +{ + int r; + int i; + int ret = CUE_SUCCESS; + + for (i = 0; i < devices_count; i++) { + r = amdgpu_device_deinitialize(devices[i].device_handle); + if (r) + ret = CUE_SCLEAN_FAILED; + } + return ret; +} + +static void amdgpu_ras_disable_test(void); +static void amdgpu_ras_enable_test(void); +static void amdgpu_ras_inject_test(void); +static void amdgpu_ras_query_test(void); +static void amdgpu_ras_basic_test(void); + +CU_TestInfo ras_tests[] = { + { "ras basic test", amdgpu_ras_basic_test }, + { "ras query test", amdgpu_ras_query_test }, + { "ras inject test", amdgpu_ras_inject_test }, + { "ras disable test", amdgpu_ras_disable_test }, +#if 0 + { "ras enable test", amdgpu_ras_enable_test }, +#endif + CU_TEST_INFO_NULL, +}; + +//helpers + +static int test_card; +static char sysfs_path[1024]; +static char debugfs_path[1024]; +static uint32_t ras_mask; +static amdgpu_device_handle device_handle; + +static int set_test_card(int card) +{ + int i; + + test_card = card; + sprintf(sysfs_path, "/sys/class/drm/card%d/device/ras/", devices[card].id); + sprintf(debugfs_path, "/sys/kernel/debug/dri/%d/ras/", devices[card].id); + ras_mask = devices[card].capability; + device_handle = devices[card].device_handle; + + return 0; +} + +static const char *get_ras_sysfs_root(void) +{ + return sysfs_path; +} + +static const char *get_ras_debugfs_root(void) +{ + return debugfs_path; +} + +static int set_file_contents(char *file, char *buf, int size) +{ + int n, fd; + fd = open(file, O_WRONLY); + if (fd == -1) + return -1; + n = write(fd, buf, size); + close(fd); + return n; +} + +static int get_file_contents(char *file, char *buf, int size) +{ + int n, fd; + fd = open(file, O_RDONLY); + if (fd == -1) + return -1; + n = read(fd, buf, size); + close(fd); + return n; +} + +static int is_file_ok(char *file, int flags) +{ + int fd; + + fd = open(file, flags); + if (fd == -1) + return -1; + close(fd); + return 0; +} + +static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block) +{ + uint32_t feature_mask; + int ret; + + ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, + sizeof(feature_mask), &feature_mask); + if (ret) + return -1; + + return (1 << block) & feature_mask; +} + +static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block) +{ + return (1 << block) & ras_mask; +} + +static int amdgpu_ras_invoke(struct ras_debug_if *data) +{ + char path[1024]; + int ret; + + sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); + + ret = set_file_contents(path, (char *)data, sizeof(*data)) + - sizeof(*data); + return ret; +} + +static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block, + unsigned long *ue, unsigned long *ce) +{ + char buf[64]; + char name[1024]; + int ret; + + *ue = *ce = 0; + + if (amdgpu_ras_is_feature_supported(block) <= 0) + return -1; + + sprintf(name, "%s%s%s", get_ras_sysfs_root(), ras_block_str(block), "_err_count"); + + if (is_file_ok(name, O_RDONLY)) + return 0; + + if (get_file_contents(name, buf, sizeof(buf)) <= 0) + return -1; + + if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2) + return -1; + + return 0; +} + +//tests +static void amdgpu_ras_features_test(int enable) +{ + struct ras_debug_if data; + int ret; + int i; + + data.op = enable; + for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { + struct ras_common_if head = { + .block = i, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + .name = "", + }; + + if (amdgpu_ras_is_feature_supported(i) <= 0) + continue; + + data.head = head; + + ret = amdgpu_ras_invoke(&data); + CU_ASSERT_EQUAL(ret, 0); + + if (ret) + continue; + + ret = enable ^ amdgpu_ras_is_feature_enabled(i); + CU_ASSERT_EQUAL(ret, 0); + } +} + +static void amdgpu_ras_disable_test(void) +{ + int i; + for (i = 0; i < devices_count; i++) { + set_test_card(i); + amdgpu_ras_features_test(0); + } +} + +static void amdgpu_ras_enable_test(void) +{ + int i; + for (i = 0; i < devices_count; i++) { + set_test_card(i); + amdgpu_ras_features_test(1); + } +} + +static void __amdgpu_ras_inject_test(void) +{ + struct ras_debug_if data; + int ret; + int i; + unsigned long ue, ce, ue_old, ce_old; + + data.op = 2; + for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { + int timeout = 3; + struct ras_inject_if inject = { + .head = { + .block = i, + .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, + .sub_block_index = 0, + .name = "", + }, + .address = 0, + .value = 0, + }; + + if (amdgpu_ras_is_feature_enabled(i) <= 0) + continue; + + if (!((1 << i) & ras_block_mask_inject_query)) + continue; + + data.inject = inject; + + ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old); + CU_ASSERT_EQUAL(ret, 0); + + if (ret) + continue; + + ret = amdgpu_ras_invoke(&data); + CU_ASSERT_EQUAL(ret, 0); + + if (ret) + continue; + +loop: + while (timeout > 0) { + ret = amdgpu_ras_query_err_count(i, &ue, &ce); + CU_ASSERT_EQUAL(ret, 0); + + if (ret) + continue; + if (ue_old != ue) { + /*recovery takes ~10s*/ + sleep(10); + break; + } + + sleep(1); + timeout -= 1; + } + + CU_ASSERT_EQUAL(ue_old + 1, ue); + CU_ASSERT_EQUAL(ce_old, ce); + } +} + +static void amdgpu_ras_inject_test(void) +{ + int i; + for (i = 0; i < devices_count; i++) { + set_test_card(i); + __amdgpu_ras_inject_test(); + } +} + +static void __amdgpu_ras_query_test(void) +{ + unsigned long ue, ce; + int ret; + int i; + + for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) { + if (amdgpu_ras_is_feature_supported(i) <= 0) + continue; + + if (!((1 << i) & ras_block_mask_inject_query)) + continue; + + ret = amdgpu_ras_query_err_count(i, &ue, &ce); + CU_ASSERT_EQUAL(ret, 0); + } +} + +static void amdgpu_ras_query_test(void) +{ + int i; + for (i = 0; i < devices_count; i++) { + set_test_card(i); + __amdgpu_ras_query_test(); + } +} + +static void amdgpu_ras_basic_test(void) +{ + unsigned long ue, ce; + char name[1024]; + int ret; + int i; + int j; + uint32_t features; + char path[1024]; + + ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY); + CU_ASSERT_EQUAL(ret, 0); + + for (i = 0; i < devices_count; i++) { + set_test_card(i); + + ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES, + sizeof(features), &features); + CU_ASSERT_EQUAL(ret, 0); + + sprintf(path, "%s%s", get_ras_debugfs_root(), "ras_ctrl"); + ret = is_file_ok(path, O_WRONLY); + CU_ASSERT_EQUAL(ret, 0); + + sprintf(path, "%s%s", get_ras_sysfs_root(), "features"); + ret = is_file_ok(path, O_RDONLY); + CU_ASSERT_EQUAL(ret, 0); + + for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) { + ret = amdgpu_ras_is_feature_supported(j); + if (ret <= 0) + continue; + + if (!((1 << j) & ras_block_mask_basic)) + continue; + + sprintf(path, "%s%s%s", get_ras_sysfs_root(), ras_block_str(j), "_err_count"); + ret = is_file_ok(path, O_RDONLY); + CU_ASSERT_EQUAL(ret, 0); + + sprintf(path, "%s%s%s", get_ras_debugfs_root(), ras_block_str(j), "_err_inject"); + ret = is_file_ok(path, O_WRONLY); + CU_ASSERT_EQUAL(ret, 0); + } + } +} -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx