On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote: > From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > > CPU hotplug, especially CPU0, can be flaky on commodity hardware. > > To improve test reliability and reponse times when testing larger runs we > need to handle those cases better. > > Handle failures to off-line a CPU by immediately skipping the test, and > failures to on-line a CPU by immediately rebooting the machine. > > This patch includes igt_sysrq_reboot implementation from Chris Wilson. > > v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is > set. (Petri Latvala) > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Petri Latvala <petri.latvala@xxxxxxxxx> > Cc: Tomi Sarvela <tomi.p.sarvela@xxxxxxxxx> Reviewed-by: Petri Latvala <petri.latvala@xxxxxxxxx> with two nitpicks below. > --- > lib/Makefile.sources | 2 ++ > lib/igt_core.c | 23 +++++++++++++++++++++++ > lib/igt_core.h | 1 + > lib/igt_sysrq.c | 22 ++++++++++++++++++++++ > lib/igt_sysrq.h | 30 ++++++++++++++++++++++++++++++ > lib/meson.build | 1 + > tests/perf_pmu.c | 38 +++++++++++++++++++++++++++++++------- > 7 files changed, 110 insertions(+), 7 deletions(-) > create mode 100644 lib/igt_sysrq.c > create mode 100644 lib/igt_sysrq.h > > diff --git a/lib/Makefile.sources b/lib/Makefile.sources > index 5b13ef8896c0..3d37ef1d1984 100644 > --- a/lib/Makefile.sources > +++ b/lib/Makefile.sources > @@ -35,6 +35,8 @@ lib_source_list = \ > igt_stats.h \ > igt_sysfs.c \ > igt_sysfs.h \ > + igt_sysrq.c \ > + igt_sysrq.h \ > igt_x86.h \ > igt_x86.c \ > igt_vgem.c \ > diff --git a/lib/igt_core.c b/lib/igt_core.c > index c292343de09e..3fd9f529f09f 100644 > --- a/lib/igt_core.c > +++ b/lib/igt_core.c > @@ -70,6 +70,7 @@ > #include "igt_core.h" > #include "igt_aux.h" > #include "igt_sysfs.h" > +#include "igt_sysrq.h" > #include "igt_rc.h" > > #define UNW_LOCAL_ONLY > @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode) > } > } > > +/** > + * igt_fatal_error: > + * > + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR > + * environment variable is set, reboot the machine. > + * > + * Since out test runner (piglit) does support fatal test exit codes, we > + * implement the default behaviour by waiting endlessly. > + */ > +void __attribute__((noreturn)) igt_fatal_error(void) > +{ > + if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) { > + igt_warn("FATAL ERROR - REBOOTING"); > + igt_sysrq_reboot(); > + } else { > + igt_warn("FATAL ERROR"); > + for (;;) > + sleep(60); > + } > +} > + > + > /** > * igt_can_fail: > * > diff --git a/lib/igt_core.h b/lib/igt_core.h > index 7af2b4c109fe..66523a208c31 100644 > --- a/lib/igt_core.h > +++ b/lib/igt_core.h > @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char *file, > const char *format, ...) > __attribute__((noreturn)); > void igt_exit(void) __attribute__((noreturn)); > +void igt_fatal_error(void) __attribute__((noreturn)); > > /** > * igt_ignore_warn: > diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c > new file mode 100644 > index 000000000000..fe3d2e344ff1 > --- /dev/null > +++ b/lib/igt_sysrq.c > @@ -0,0 +1,22 @@ > +#include <unistd.h> > +#include <fcntl.h> > +#include <stdlib.h> > +#include <sys/reboot.h> > + > +#include "igt_core.h" > + > +#include "igt_sysrq.h" > + Docs for igt_sysrq_reboot? > +void igt_sysrq_reboot(void) > +{ > + sync(); > + > + /* Try to be nice at first, and if that fails pull the trigger */ > + if (reboot(RB_AUTOBOOT)) { > + int fd = open("/proc/sysrq-trigger", O_WRONLY); > + igt_ignore_warn(write(fd, "b", 2)); > + close(fd); > + } > + > + abort(); > +} > diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h > new file mode 100644 > index 000000000000..422473d2a480 > --- /dev/null > +++ b/lib/igt_sysrq.h > @@ -0,0 +1,30 @@ > +/* > + * Copyright © 2018 Intel Corporation > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice (including the next > + * paragraph) shall be included in all copies or substantial portions of the > + * Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > + * IN THE SOFTWARE. > + * > + */ > + > +#ifndef __IGT_SYSRQ_H__ > +#define __IGT_SYSRQ_H__ > + > +void igt_sysrq_reboot(void) __attribute__((noreturn)); > + > +#endif /* __IGT_SYSRQ_H__ */ > diff --git a/lib/meson.build b/lib/meson.build > index a9e53689b35d..b3b8b14a3f01 100644 > --- a/lib/meson.build > +++ b/lib/meson.build > @@ -14,6 +14,7 @@ lib_sources = [ > 'igt_stats.c', > 'igt_syncobj.c', > 'igt_sysfs.c', > + 'igt_sysrq.c', > 'igt_vgem.c', > 'igt_x86.c', > 'instdone.c', > diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > index 3bbb18d2f216..8c75b0641785 100644 > --- a/tests/perf_pmu.c > +++ b/tests/perf_pmu.c > @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd) > int link[2]; > int fd, ret; > int cur = 0; > + char buf; > > igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd))); > igt_require(cpu0_hotplug_support()); > @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd) > } > > /* Offline followed by online a CPU. */ > - igt_assert_eq(write(cpufd, "0", 2), 2); > + > + ret = write(cpufd, "0", 2); > + if (ret < 0) { > + /* > + * If we failed to offline a CPU we don't want > + * to proceed. > + */ > + igt_warn("Failed to offline cpu%u! (%d)\n", > + cpu, errno); > + igt_assert_eq(write(link[1], "s", 1), 1); > + break; > + } > + > usleep(1e6); > - igt_assert_eq(write(cpufd, "1", 2), 2); > + > + ret = write(cpufd, "1", 2); > + if (ret < 0) { > + /* > + * Failed to bring a CPU back online is fatal > + * for the sanity of a test run so reboot > + * immediately. > + */ This is assuming what the user has configured igt_fatal_error() to do. Just a s/so reboot immediately// maybe? -- Petri Latvala _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx