Re: [igt-dev] [PATCH i-g-t] tests/perf_pmu: Handle CPU hotplug failures better

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Feb 23, 2018 at 11:34:53AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and reponse times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  2 ++
>  tests/perf_pmu.c     | 42 ++++++++++++++++++++++++++++++++++--------
>  5 files changed, 90 insertions(+), 8 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =	 	\
>  	igt_stats.h		\
>  	igt_sysfs.c		\
>  	igt_sysfs.h		\
> +	igt_sysrq.c		\
> +	igt_sysrq.h		\
>  	igt_x86.h		\
>  	igt_x86.c		\
>  	igt_vgem.c		\
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +
> +void igt_sysrq_reboot(void)
> +{
> +	sync();
> +
> +	/* Try to be nice at first, and if that fails pull the trigger */
> +	if (reboot(RB_AUTOBOOT)) {
> +		int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +		igt_ignore_warn(write(fd, "b", 2));
> +		close(fd);
> +	}
> +
> +	abort();
> +}


While the cause for taking this action might be dire, rebooting
people's machines can be kind of a dick move, even considering they're
running tests that can be fatal to the machine in other ways.

We have IGT_HANG and IGT_HANG_WITHOUT_RESET so the users can opt
in/out of some fatal behaviour already. I'm fine with auto-rebooting,
even as the default, if users can opt out of it with
IGT_NO_REBOOT_PRETTY_PLEASE or so.


-- 
Petri Latvala



> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index 8f14f6320ecf..63afd3ddb535 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -22,6 +22,7 @@ lib_headers = [
>  	'igt_stats.h',
>  	'igt_syncobj.h',
>  	'igt_sysfs.h',
> +	'igt_sysrq.h',
>  	'igt_x86.h',
>  	'igt_vgem.h',
>  	'instdone.h',
> @@ -69,6 +70,7 @@ lib_sources = [
>  	'igt_stats.c',
>  	'igt_syncobj.c',
>  	'igt_sysfs.c',
> +	'igt_sysrq.c',
>  	'igt_vgem.c',
>  	'igt_x86.c',
>  	'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..658d0976137f 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -41,6 +41,7 @@
>  #include "igt_core.h"
>  #include "igt_perf.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_pm.h"
>  #include "sw_sync.h"
>  
> @@ -965,6 +966,7 @@ static void cpu_hotplug(int gem_fd)
>  	int link[2];
>  	int fd, ret;
>  	int cur = 0;
> +	char buf;
>  
>  	igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>  	igt_require(cpu0_hotplug_support());
> @@ -994,7 +996,7 @@ static void cpu_hotplug(int gem_fd)
>  
>  		for (;;) {
>  			char name[128];
> -			int cpufd;
> +			int cpufd, ret;
>  
>  			igt_assert_lt(snprintf(name, sizeof(name),
>  					       "/sys/devices/system/cpu/cpu%d/online",
> @@ -1011,9 +1013,33 @@ static void cpu_hotplug(int gem_fd)
>  			}
>  
>  			/* Offline followed by online a CPU. */
> -			igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +			ret = write(cpufd, "0", 2);
> +			if (ret < 0) {
> +				/*
> +				 * If we failed to offline a CPU we don't want
> +				 * to proceed.
> +				 */
> +				igt_warn("Failed to offline cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_assert_eq(write(link[1], "s", 1), 1);
> +				break;
> +			}
> +
>  			usleep(1e6);
> -			igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +			ret = write(cpufd, "1", 2);
> +			if (ret < 0) {
> +				/*
> +				 * Failed to bring a CPU back online is fatal
> +				 * for the sanity of a test run so reboot
> +				 * immediately.
> +				 */
> +				igt_warn("Failed to online cpu%u! (%d)\n",
> +					 cpu, errno);
> +				igt_sysrq_reboot();
> +				igt_assert(0);
> +			}
>  
>  			close(cpufd);
>  			cpu++;
> @@ -1027,15 +1053,12 @@ static void cpu_hotplug(int gem_fd)
>  	 * until the CPU core shuffler finishes one loop.
>  	 */
>  	for (;;) {
> -		char buf;
> -		int ret2;
> -
>  		usleep(500e3);
>  		end_spin(gem_fd, spin[cur], 0);
>  
>  		/* Check if the child is signaling completion. */
> -		ret2 = read(link[0], &buf, 1);
> -		if ( ret2 == 1 || (ret2 < 0 && errno != EAGAIN))
> +		ret = read(link[0], &buf, 1);
> +		if ( ret == 1 || (ret < 0 && errno != EAGAIN))
>  			break;
>  
>  		igt_spin_batch_free(gem_fd, spin[cur]);
> @@ -1054,6 +1077,9 @@ static void cpu_hotplug(int gem_fd)
>  	close(fd);
>  	close(link[0]);
>  
> +	/* Skip if child signals a problem with offlining a CPU. */
> +	igt_skip_on(buf == 's');
> +
>  	assert_within_epsilon(val, ts[1] - ts[0], tolerance);
>  }
>  
> -- 
> 2.14.1
> 
> _______________________________________________
> igt-dev mailing list
> igt-dev@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/igt-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux