From: pradeep <you@xxxxxxxxxxx>

This patch introduces a soft lockup/drift test with stress.

1) Boot up a VM.
2) Build stress on host and guest.
3) Run the heartbeat monitor with the given options: server on the host,
   client on the guest.
4) Run for a relatively long time length, ex: 12, 18 or 24 hours.
5) Output the test result and observe drift.

Changes from v2:

* Fixed up commands being used on guest, lack of proper output
  redirection was confusing aexpect
* Proper clean up previous instances of the monitor programs
  lying around, as well as log files
* Resort to another method of determining host IP if the same has
  no fully qualified hostname (stand alone laptops, for example)
* Only use a single session on guest to execute all the commands.
  previous version was opening unneeded connections.
* Fix stress execution in guest and host, now the stress instances
  effectively start
* Actively open guest and host firewall rules so heartbeat monitor
  communication can happen

Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx>
Signed-off-by: Pradeep Kumar Surisetty <psuriset@xxxxxxxxxxxxxxxxxx>
---
 client/tests/kvm/deps/heartbeat_slu.py |  297 ++++++++++++++++++++++++++++++++
 client/tests/kvm/tests_base.cfg.sample |   18 ++
 client/virt/tests/softlockup.py        |  153 ++++++++++++++++
 3 files changed, 468 insertions(+), 0 deletions(-)
 create mode 100755 client/tests/kvm/deps/heartbeat_slu.py
 create mode 100644 client/virt/tests/softlockup.py

diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py
new file mode 100755
index 0000000..697bbbf
--- /dev/null
+++ b/client/tests/kvm/deps/heartbeat_slu.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python
+
+"""
+Heartbeat server/client to detect soft lockups
+
+The client periodically sends "<hostname> <seq> <timestamp>" to the server;
+the server alerts when heartbeats arrive late or stop and, optionally,
+reports how much the client clock drifts from its own.
+"""
+
+import socket, os, sys, time, getopt
+
+def daemonize(output_file):
+    """
+    Detach from the controlling terminal and redirect the standard streams.
+
+    The parent process exits right after the fork; the child starts a new
+    session and gets its stdin from /dev/null.
+
+    @param output_file: File stdout/stderr are appended to. If empty or None,
+            the output goes to /dev/null.
+    """
+    try:
+        pid = os.fork()
+    except OSError, e:
+        raise Exception("error %d: %s" % (e.errno, e.strerror))
+
+    if pid:
+        os._exit(0)
+
+    os.umask(0)
+    os.setsid()
+    sys.stdout.flush()
+    sys.stderr.flush()
+
+    # Test the argument, not the 'file' builtin (which is always true)
+    if output_file:
+        output_handle = open(output_file, 'a+', 0)
+        # autoflush stdout/stderr
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+        sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)
+    else:
+        output_handle = open('/dev/null', 'a+')
+
+    stdin_handle = open('/dev/null', 'r')
+    os.dup2(output_handle.fileno(), sys.stdout.fileno())
+    os.dup2(output_handle.fileno(), sys.stderr.fileno())
+    os.dup2(stdin_handle.fileno(), sys.stdin.fileno())
+
+def recv_all(sock):
+    """
+    Read from a socket until the peer closes the connection.
+
+    @param sock: Connected socket to read from.
+    @return: String with all the data received.
+    """
+    total_data = []
+    while True:
+        data = sock.recv(1024)
+        if not data:
+            break
+        total_data.append(data)
+    return ''.join(total_data)
+
+def run_server(host, port, daemon, file, queue_size, threshold, drift):
+    """
+    Accept heartbeats forever, printing alerts for late or missing ones.
+
+    @param host: Address to bind to.
+    @param port: TCP port to listen on.
+    @param daemon: Whether to daemonize before serving.
+    @param file: Output file used when daemonizing.
+    @param queue_size: Backlog passed to listen().
+    @param threshold: Seconds allowed between heartbeats before an alert.
+    @param drift: Whether to track and report the client clock drift.
+    """
+    if daemon:
+        daemonize(output_file=file)
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    # Allow rebinding right after a previous instance was killed
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    sock.bind((host, port))
+    sock.listen(queue_size)
+    timeout_interval = threshold * 2
+    prev_check_timestamp = float(time.time())
+    while 1:
+        c_sock, c_addr = sock.accept()
+        try:
+            heartbeat = recv_all(c_sock)
+        finally:
+            c_sock.close()
+        local_timestamp = float(time.time())
+        try:
+            drift_info = check_heartbeat(heartbeat, local_timestamp,
+                                         threshold, drift)
+        except ValueError:
+            # Not a "<hostname> <seq> <timestamp>" message (for example, a
+            # connection closed without sending data): report and go on
+            print "%.2f: ERROR, malformed heartbeat from %s: %r" \
+                % (local_timestamp, c_addr[0], heartbeat)
+            continue
+        # NOTE: this doesn't work if the only client is the one that timed
+        # out, but anything more complete would require another thread and
+        # a lock for client_prev_timestamp.
+        if local_timestamp - prev_check_timestamp > timeout_interval:
+            check_for_timeouts(threshold, drift)
+            prev_check_timestamp = local_timestamp
+        if verbose:
+            if drift:
+                print "%.2f: %s (%s)" % (local_timestamp, heartbeat,
+                                         drift_info)
+            else:
+                print "%.2f: %s" % (local_timestamp, heartbeat)
+
+def run_client(host, port, daemon, file, interval):
+    """
+    Send a heartbeat to the server every interval seconds, forever.
+
+    Socket errors are reported and the next heartbeat is still attempted.
+
+    @param host: Address of the heartbeat server.
+    @param port: TCP port of the heartbeat server.
+    @param daemon: Whether to daemonize before sending.
+    @param file: Output file used when daemonizing.
+    @param interval: Seconds to sleep between heartbeats.
+    """
+    if daemon:
+        daemonize(output_file=file)
+    seq = 1
+    while 1:
+        sock = None
+        try:
+            try:
+                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+                sock.connect((host, port))
+                heartbeat = get_heartbeat(seq)
+                sock.sendall(heartbeat)
+                if verbose:
+                    print heartbeat
+            except socket.error, e:
+                # Not every socket.error carries an (errno, message) pair
+                print "%.2f: ERROR, %s" % (float(time.time()), e)
+        finally:
+            # Release the socket even if connect() or sendall() failed
+            if sock is not None:
+                sock.close()
+
+        seq += 1
+        time.sleep(interval)
+
+def get_heartbeat(seq=1):
+    """
+    Build a heartbeat message: "<hostname> <sequence> <timestamp>".
+
+    @param seq: Sequence number of the heartbeat.
+    @return: String with the heartbeat message.
+    """
+    return "%s %06d %.2f" % (hostname, seq, float(time.time()))
+
+def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift):
+    """
+    Record a heartbeat, alerting if it arrived more than threshold seconds
+    after the previous one from the same host.
+
+    @param heartbeat: Message in the format produced by get_heartbeat().
+    @param local_timestamp: Time when the heartbeat was received.
+    @param threshold: Seconds allowed between heartbeats before an alert.
+    @param check_drift: Whether to compute the client clock drift.
+    @return: String with the drift and its variation since the previous
+            heartbeat if check_drift is set, None otherwise.
+    @raise ValueError: If the heartbeat is malformed.
+    """
+    hostname, seq, timestamp = heartbeat.split()
+    timestamp = float(timestamp)
+    if hostname in client_prev_timestamp:
+        delta = local_timestamp - client_prev_timestamp[hostname]
+        if delta > threshold:
+            print "%.2f: ALERT, SLU detected on host %s, delta %ds" \
+                % (float(time.time()), hostname, delta)
+
+    client_prev_timestamp[hostname] = local_timestamp
+
+    if check_drift:
+        if hostname not in client_clock_offset:
+            client_clock_offset[hostname] = timestamp - local_timestamp
+            client_prev_drift[hostname] = 0
+        drift = timestamp - local_timestamp - client_clock_offset[hostname]
+        drift_delta = drift - client_prev_drift[hostname]
+        client_prev_drift[hostname] = drift
+        return "drift %+4.2f (%+4.2f)" % (drift, drift_delta)
+    return None
+
+def check_for_timeouts(threshold, check_drift):
+    """
+    Alert about, and forget, the hosts silent for over twice the threshold.
+
+    @param threshold: Seconds allowed between heartbeats before an alert.
+    @param check_drift: Whether drift data is tracked (it is then dropped
+            along with the host).
+    """
+    local_timestamp = float(time.time())
+    # Iterate over a copy of the keys, as entries are deleted in the loop
+    for hostname in list(client_prev_timestamp):
+        delta = local_timestamp - client_prev_timestamp[hostname]
+        if delta > threshold * 2:
+            print "%.2f: ALERT, SLU detected on host %s, no heartbeat for %ds" \
+                % (local_timestamp, hostname, delta)
+            del client_prev_timestamp[hostname]
+            if check_drift:
+                client_clock_offset.pop(hostname, None)
+                client_prev_drift.pop(hostname, None)
+
+def usage():
+    """
+    Print the command line help.
+    """
+    print """
+Usage:
+
+    heartbeat_slu.py --server --address <bind_address> --port <bind_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--threshold <heartbeat threshold>] [--check-drift]
+
+    heartbeat_slu.py --client --address <server_address> --port <server_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--interval <heartbeat interval in seconds>]
+"""
+
+# host information and global data
+hostname = socket.gethostname()
+client_prev_timestamp = {}
+client_clock_offset = {}
+client_prev_drift = {}
+
+# default param values
+host_port = 9001
+host_address = ''
+interval = 1 # seconds between heartbeats
+threshold = 10 # seconds late till alert
+is_server = False
+is_daemon = True
+file_server = "/tmp/heartbeat_server.out"
+file_client = "/tmp/heartbeat_client.out"
+file_selected = None
+queue_size = 5
+verbose = False
+check_drift = False
+
+# process cmdline opts
+# (short options: -f takes a value, -d is a flag and -c selects client mode)
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "vhscdf:p:a:i:t:", [
+                    "server", "client", "no-daemon", "address=", "port=",
+                    "file=", "interval=", "threshold=", "verbose",
+                    "check-drift", "help"])
+except getopt.GetoptError, e:
+    print "error: %s" % str(e)
+    usage()
+    sys.exit(1)
+
+for param, value in opts:
+    if param in ["-p", "--port"]:
+        host_port = int(value)
+    elif param in ["-a", "--address"]:
+        host_address = value
+    elif param in ["-s", "--server"]:
+        is_server = True
+    elif param in ["-c", "--client"]:
+        is_server = False
+    elif param in ["--no-daemon"]:
+        is_daemon = False
+    elif param in ["-f", "--file"]:
+        file_selected = value
+    elif param in ["-i", "--interval"]:
+        interval = int(value)
+    elif param in ["-t", "--threshold"]:
+        threshold = int(value)
+    elif param in ["-d", "--check-drift"]:
+        check_drift = True
+    elif param in ["-v", "--verbose"]:
+        verbose = True
+    elif param in ["-h", "--help"]:
+        usage()
+        sys.exit(0)
+    else:
+        print "error: unrecognized option: %s" % param
+        usage()
+        sys.exit(1)
+
+# run until we're terminated
+if is_server:
+    file_server = file_selected or file_server
+    run_server(host_address, host_port, is_daemon, file_server, queue_size,
+               threshold, check_drift)
+else:
+    file_client = file_selected or file_client
+    run_client(host_address, host_port, is_daemon, file_client, interval)
diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample
index 65880d8..e9e41f9 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -420,6 +420,24 @@ variants:
         type = smbios_table
         start_vm = no
 
+    - softlockup: install setup unattended_install.cdrom
+        only Linux
+        type = softlockup
+        softlockup_files = stress-1.0.4.tar.gz
+        stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && cd stress-1.0.4 && ./configure && make && cd src"
+        server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --port %s --verbose --check-drift"
+        client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1"
+        stress_cmd = "cd %s && cd stress-1.0.4 && cd src && nohup ./stress -c %s > /dev/null 2>&1&"
+        kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9 > /dev/null 2>&1"
+        kill_stress_cmd = "pkill -f stress > /dev/null 2>&1"
+        drift_cmd = "tail -1 %s | awk '{print $7}'"
+        monitor_log_file_server = /tmp/heartbeat_server.log
+        monitor_log_file_client = /tmp/heartbeat_client.log
+        monitor_port = 13330
+        stress_threshold = 10
+        # time_to_run (hours) = 12, 18, 24, 48 hours
+        test_length = 0.10
+
     - stress_boot: install setup image_copy unattended_install.cdrom
         type = stress_boot
         max_vms = 5
diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py
new file mode 100644
index 0000000..d946965
--- /dev/null
+++ b/client/virt/tests/softlockup.py
@@ -0,0 +1,153 @@
+import logging, os, socket, time
+from autotest_lib.client.common_lib import error
+from autotest_lib.client.bin import utils
+
+
+def run_softlockup(test, params, env):
+    """
+    soft lockup/drift test with stress.
+
+    1) Boot up a VM.
+    2) Build stress on host and guest.
+    3) Run the heartbeat monitor: server on the host, client on the guest.
+    4) Run for a relatively long time length. ex: 12, 18 or 24 hours.
+    5) Output the test result and observe drift.
+
+    @param test: KVM test object.
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+    stress_setup_cmd = params.get("stress_setup_cmd")
+    stress_cmd = params.get("stress_cmd")
+    server_setup_cmd = params.get("server_setup_cmd")
+    drift_cmd = params.get("drift_cmd")
+    kill_stress_cmd = params.get("kill_stress_cmd")
+    kill_monitor_cmd = params.get("kill_monitor_cmd")
+
+    threshold = int(params.get("stress_threshold"))
+    monitor_log_file_server = params.get("monitor_log_file_server")
+    monitor_log_file_client = params.get("monitor_log_file_client")
+    test_length = int(3600 * float(params.get("test_length")))
+    monitor_port = int(params.get("monitor_port"))
+
+    vm = env.get_vm(params["main_vm"])
+    login_timeout = int(params.get("login_timeout", 360))
+    stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress")
+    monitor_dir = os.path.join(test.bindir, 'deps')
+
+
+    def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on guest")
+        # Best effort: a failing kill command must not abort the test
+        for cmd in (kill_stress_cmd, kill_monitor_cmd):
+            try:
+                session.cmd(cmd)
+            except Exception, e:
+                logging.debug("Command '%s' failed on guest: %s", cmd, e)
+
+
+    def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on host")
+        utils.run(kill_stress_cmd, ignore_status=True)
+        utils.run(kill_monitor_cmd, ignore_status=True)
+
+
+    def host():
+        logging.info("Setup monitor server on host")
+        # Kill previous instances of the host load programs, if any
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+        # Cleanup previous log instances
+        if os.path.isfile(monitor_log_file_server):
+            os.remove(monitor_log_file_server)
+        # Opening firewall ports on host
+        utils.run("iptables -F", ignore_status=True)
+
+        # Run heartbeat on host
+        utils.run(server_setup_cmd % (monitor_dir, threshold,
+                                      monitor_log_file_server, monitor_port))
+
+        logging.info("Build stress on host")
+        # Uncompress and build stress on host
+        utils.run(stress_setup_cmd % stress_dir)
+
+        logging.info("Run stress on host")
+        # stress_threads = 2 * n_cpus
+        threads_host = 2 * utils.count_cpus()
+        # Run stress test on host
+        utils.run(stress_cmd % (stress_dir, threads_host))
+
+
+    def guest():
+        try:
+            host_ip = socket.gethostbyname(socket.gethostname())
+        except socket.error:
+            # Hackish, but works well on stand alone (laptop) setups
+            # with access to the internet. If this fails, well, then
+            # not much else can be done...
+            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+            try:
+                try:
+                    s.connect(("redhat.com", 80))
+                    host_ip = s.getsockname()[0]
+                except socket.error, e:
+                    raise error.TestError("Could not determine host IP: %s" %
+                                          e)
+            finally:
+                s.close()
+
+        # Now, starting the guest
+        vm.verify_alive()
+        session = vm.wait_for_login(timeout=login_timeout)
+
+        try:
+            # Kill previous instances of the load programs, if any
+            _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+            # Clean up previous log instances
+            session.cmd("rm -f %s" % monitor_log_file_client)
+
+            # Opening firewall ports on guest
+            try:
+                session.cmd("iptables -F")
+            except Exception, e:
+                logging.debug("Could not flush iptables on guest: %s", e)
+
+            # Get required files and copy them from host to guest
+            monitor_path = os.path.join(monitor_dir, 'heartbeat_slu.py')
+            stress_path = os.path.join(stress_dir, "stress-1.0.4.tar.gz")
+            vm.copy_files_to(monitor_path, "/tmp")
+            vm.copy_files_to(stress_path, "/tmp")
+
+            logging.info("Setup monitor client on guest")
+            # Start heartbeat on guest. The arguments follow the order of
+            # client_setup_cmd: directory, server address, log file, port
+            session.cmd(params.get("client_setup_cmd") %
+                        ("/tmp", host_ip, monitor_log_file_client,
+                         monitor_port))
+
+            logging.info("Build stress on guest")
+            # Uncompress and build stress on guest
+            session.cmd(stress_setup_cmd % "/tmp", timeout=200)
+
+            logging.info("Run stress on guest")
+            # stress_threads = 2 * n_vcpus
+            threads_guest = 2 * int(params.get("smp", 1))
+            # Run stress test on guest
+            session.cmd(stress_cmd % ("/tmp", threads_guest))
+
+            # Wait and report
+            logging.debug("Wait for %d s", test_length)
+            time.sleep(test_length)
+        finally:
+            # Kill instances of the load programs on both guest and host,
+            # even after a failure, so that no stress is left running
+            _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+            _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+            session.close()
+
+        # Collect drift
+        drift = utils.system_output(drift_cmd % monitor_log_file_server)
+        logging.info("Drift noticed: %s", drift)
+
+
+    host()
+    guest()
-- 
1.7.6

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html