On Wed, 20 Jul 2011 22:30:09 -0300 Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> wrote: > From: pradeep <you@xxxxxxxxxxx> > > This patch introduces a soft lockup/drift test with stress. > > 1) Boot up a VM. > 2) Build stress on host and guest. > 3) run heartbeat monitor with the given options on server and > host. 3) Run for a relatively long time length, ex: 12, 18 or 24 > hours. 4) Output the test result and observe drift. Thanks for making the changes. How about reporting the average of the last 10 drift values, rather than only the last value in the server log (which is what drift_cmd's "tail -1" currently picks up)? > > Changes from v2: > * Fixed up commands being used on guest, lack of proper output > redirection was confusing aexpect > * Proper clean up previous instances of the monitor programs > lying around, as well as log files > * Resort to another method of determining host IP if the same > has no fully qualified hostname (stand alone laptops, for > example) > * Only use a single session on guest to execute all the commands. > previous version was opening unneeded connections. > * Fix stress execution in guest and host, now the stress instances > effectively start > * Actively open guest and host firewall rules so heartbeat monitor > communication can happen > > Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> > Signed-off-by: Pradeep Kumar Surisetty <psuriset@xxxxxxxxxxxxxxxxxx> > --- > client/tests/kvm/deps/heartbeat_slu.py | 205 > ++++++++++++++++++++++++++++++++ > client/tests/kvm/tests_base.cfg.sample | 18 +++ > client/virt/tests/softlockup.py | 147 +++++++++++++++++++++++ > 3 files changed, 370 insertions(+), 0 deletions(-) create mode 100755 > client/tests/kvm/deps/heartbeat_slu.py create mode 100644 > client/virt/tests/softlockup.py > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py > b/client/tests/kvm/deps/heartbeat_slu.py new file mode 100755 > index 0000000..697bbbf > --- /dev/null > +++ b/client/tests/kvm/deps/heartbeat_slu.py > @@ -0,0 +1,205 @@ > +#!/usr/bin/env python > + > +""" > +Heartbeat server/client to detect soft lockups > +""" > + > +import 
socket, os, sys, time, getopt > + > +def daemonize(output_file): > + try: > + pid = os.fork() > + except OSError, e: > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > + > + if pid: > + os._exit(0) > + > + os.umask(0) > + os.setsid() > + sys.stdout.flush() > + sys.stderr.flush() > + > + if file: > + output_handle = file(output_file, 'a+', 0) > + # autoflush stdout/stderr > + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > + else: > + output_handle = file('/dev/null', 'a+') > + > + stdin_handle = open('/dev/null', 'r') > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > + > +def recv_all(sock): > + total_data = [] > + while True: > + data = sock.recv(1024) > + if not data: > + break > + total_data.append(data) > + return ''.join(total_data) > + > +def run_server(host, port, daemon, file, queue_size, threshold, > drift): > + if daemon: > + daemonize(output_file=file) > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.bind((host, port)) > + sock.listen(queue_size) > + timeout_interval = threshold * 2 > + prev_check_timestamp = float(time.time()) > + while 1: > + c_sock, c_addr = sock.accept() > + heartbeat = recv_all(c_sock) > + local_timestamp = float(time.time()) > + drift = check_heartbeat(heartbeat, local_timestamp, > threshold, check_drift) > + # NOTE: this doesn't work if the only client is the one that > timed > + # out, but anything more complete would require another > thread and > + # a lock for client_prev_timestamp. 
> + if local_timestamp - prev_check_timestamp > threshold * 2.0: > + check_for_timeouts(threshold, check_drift) > + prev_check_timestamp = local_timestamp > + if verbose: > + if check_drift: > + print "%.2f: %s (%s)" % (local_timestamp, heartbeat, > drift) > + else: > + print "%.2f: %s" % (local_timestamp, heartbeat) > + > +def run_client(host, port, daemon, file, interval): > + if daemon: > + daemonize(output_file=file) > + seq = 1 > + while 1: > + try: > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > + sock.connect((host, port)) > + heartbeat = get_heartbeat(seq) > + sock.sendall(heartbeat) > + sock.close() > + if verbose: > + print heartbeat > + except socket.error, (value, message): > + print "%.2f: ERROR, %d - %s" % (float(time.time()), > value, message) + > + seq += 1 > + time.sleep(interval) > + > +def get_heartbeat(seq=1): > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > + > +def check_heartbeat(heartbeat, local_timestamp, threshold, > check_drift): > + hostname, seq, timestamp = heartbeat.rsplit() > + timestamp = float(timestamp) > + if client_prev_timestamp.has_key(hostname): > + delta = local_timestamp - client_prev_timestamp[hostname] > + if delta > threshold: > + print "%.2f: ALERT, SLU detected on host %s, delta %ds" \ > + % (float(time.time()), hostname, delta) > + > + client_prev_timestamp[hostname] = local_timestamp > + > + if check_drift: > + if not client_clock_offset.has_key(hostname): > + client_clock_offset[hostname] = timestamp - > local_timestamp > + client_prev_drift[hostname] = 0 > + drift = timestamp - local_timestamp - > client_clock_offset[hostname] > + drift_delta = drift - client_prev_drift[hostname] > + client_prev_drift[hostname] = drift > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > + > +def check_for_timeouts(threshold, check_drift): > + local_timestamp = float(time.time()) > + hostname_list = list(client_prev_timestamp) > + for hostname in hostname_list: > + timestamp = 
client_prev_timestamp[hostname] > + delta = local_timestamp - timestamp > + if delta > threshold * 2: > + print "%.2f: ALERT, SLU detected on host %s, no > heartbeat for %ds" \ > + % (local_timestamp, hostname, delta) > + del client_prev_timestamp[hostname] > + if check_drift: > + del client_clock_offset[hostname] > + del client_prev_drift[hostname] > + > +def usage(): > + print """ > +Usage: > + > + heartbeat_slu.py --server --address <bind_address> --port > <bind_port> > + [--file <output_file>] [--no-daemon] [--verbose] > + [--threshold <heartbeat threshold>] > + > + heartbeat_slu.py --client --address <server_address> -p > <server_port> > + [--file output_file] [--no-daemon] [--verbose] > + [--interval <heartbeat interval in seconds>] > +""" > + > +# host information and global data > +hostname = socket.gethostname() > +client_prev_timestamp = {} > +client_clock_offset = {} > +client_prev_drift = {} > + > +# default param values > +host_port = 9001 > +host_address = '' > +interval = 1 # seconds between heartbeats > +threshold = 10 # seconds late till alert > +is_server = False > +is_daemon = True > +file_server = "/tmp/heartbeat_server.out" > +file_client = "/tmp/heartbeat_client.out" > +file_selected = None > +queue_size = 5 > +verbose = False > +check_drift = False > + > +# process cmdline opts > +try: > + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > + "server", "client", "no-daemon", "address=", > "port=", > + "file=", "server", "interval=", "threshold=", > "verbose", > + "check-drift", "help"]) > +except getopt.GetoptError, e: > + print "error: %s" % str(e) > + usage() > + exit(1) > + > +for param, value in opts: > + if param in ["-p", "--port"]: > + host_port = int(value) > + elif param in ["-a", "--address"]: > + host_address = value > + elif param in ["-s", "--server"]: > + is_server = True > + elif param in ["-c", "--client"]: > + is_server = False > + elif param in ["--no-daemon"]: > + is_daemon = False > + elif param in ["-f", 
"--file"]: > + file_selected = value > + elif param in ["-i", "--interval"]: > + interval = int(value) > + elif param in ["-t", "--threshold"]: > + threshold = int(value) > + elif param in ["-d", "--check-drift"]: > + check_drift = True > + elif param in ["-v", "--verbose"]: > + verbose = True > + elif param in ["-h", "--help"]: > + usage() > + exit(0) > + else: > + print "error: unrecognized option: %s" % value > + usage() > + exit(1) > + > +# run until we're terminated > +if is_server: > + file_server = file_selected or file_server > + run_server(host_address, host_port, is_daemon, file_server, > queue_size, threshold, check_drift) +else: > + file_client = file_selected or file_client > + run_client(host_address, host_port, is_daemon, file_client, > interval) diff --git a/client/tests/kvm/tests_base.cfg.sample > b/client/tests/kvm/tests_base.cfg.sample index 65880d8..e9e41f9 100644 > --- a/client/tests/kvm/tests_base.cfg.sample > +++ b/client/tests/kvm/tests_base.cfg.sample > @@ -420,6 +420,24 @@ variants: > type = smbios_table > start_vm = no > > + - softlockup: install setup unattended_install.cdrom > + only Linux > + type = softlockup > + softlockup_files = stress-1.0.4.tar.gz > + stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && > cd stress-1.0.4 && ./configure && make && cd src" > + server_setup_cmd = "%s/heartbeat_slu.py --server --threshold > %s --file %s --port %s --verbose --check-drift" > + client_setup_cmd = "%s/heartbeat_slu.py --client --address > %s --file %s --port %s --interval 1" > + stress_cmd = "cd %s && cd stress-1.0.4 && cd src && > nohup ./stress -c %s > /dev/null 2>&1&" > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk > '{print$2}' | xargs kill -9 > /dev/null 2>&1" > + kill_stress_cmd = "pkill -f stress > /dev/null 2>&1" > + drift_cmd = "tail -1 %s | awk '{print $7}'" > + monitor_log_file_server = /tmp/heartbeat_server.log > + monitor_log_file_client = /tmp/heartbeat_client.log > + monitor_port = 13330 > + 
stress_threshold = 10 > + # time_to_run (hours) = 12, 18, 24, 48 hours > + test_length = 0.10 > + > - stress_boot: install setup image_copy unattended_install.cdrom > type = stress_boot > max_vms = 5 > diff --git a/client/virt/tests/softlockup.py > b/client/virt/tests/softlockup.py new file mode 100644 > index 0000000..d946965 > --- /dev/null > +++ b/client/virt/tests/softlockup.py > @@ -0,0 +1,147 @@ > +import logging, os, socket, time > +from autotest_lib.client.bin import utils > + > + > +def run_softlockup(test, params, env): > + """ > + soft lockup/drift test with stress. > + > + 1) Boot up a VM. > + 2) Build stress on host and guest. > + 3) run heartbeat with the given options on server and host. > + 3) Run for a relatively long time length. ex: 12, 18 or 24 hours. > + 4) Output the test result and observe drift. > + > + @param test: KVM test object. > + @param params: Dictionary with the test parameters. > + @param env: Dictionary with test environment. > + """ > + stress_setup_cmd = params.get("stress_setup_cmd") > + stress_cmd = params.get("stress_cmd") > + server_setup_cmd = params.get("server_setup_cmd") > + drift_cmd = params.get("drift_cmd") > + kill_stress_cmd = params.get("kill_stress_cmd") > + kill_monitor_cmd = params.get("kill_monitor_cmd") > + > + threshold = int(params.get("stress_threshold")) > + monitor_log_file_server = params.get("monitor_log_file_server") > + monitor_log_file_client = params.get("monitor_log_file_client") > + test_length = int(3600 * float(params.get("test_length"))) > + monitor_port = int(params.get("monitor_port")) > + > + vm = env.get_vm(params["main_vm"]) > + login_timeout = int(params.get("login_timeout", 360)) > + stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress") > + monitor_dir = os.path.join(test.bindir, 'deps') > + > + > + def _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd): > + logging.info("Kill stress and monitor on guest") > + try: > + session.cmd(kill_stress_cmd) > + except: > 
+ pass > + try: > + session.cmd(kill_monitor_cmd) > + except: > + pass > + > + > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > + logging.info("Kill stress and monitor on host") > + utils.run(kill_stress_cmd, ignore_status=True) > + utils.run(kill_monitor_cmd, ignore_status=True) > + > + > + def host(): > + logging.info("Setup monitor server on host") > + # Kill previous instances of the host load programs, if any > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + # Cleanup previous log instances > + if os.path.isfile(monitor_log_file_server): > + os.remove(monitor_log_file_server) > + # Opening firewall ports on host > + utils.run("iptables -F", ignore_status=True) > + > + # Run heartbeat on host > + utils.run(server_setup_cmd % (monitor_dir, threshold, > + monitor_log_file_server, > monitor_port)) + > + logging.info("Build stress on host") > + # Uncompress and build stress on host > + utils.run(stress_setup_cmd % stress_dir) > + > + logging.info("Run stress on host") > + # stress_threads = 2 * n_cpus > + threads_host = 2 * utils.count_cpus() > + # Run stress test on host > + utils.run(stress_cmd % (stress_dir, threads_host)) > + > + > + def guest(): > + try: > + host_ip = socket.gethostbyname(socket.gethostname()) > + except socket.error: > + try: > + # Hackish, but works well on stand alone (laptop) > setups > + # with access to the internet. If this fails, well, > then > + # not much else can be done... 
> + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) > + s.connect(("redhat.com", 80)) > + host_ip = s.getsockname()[0] > + except socket.error, (value, e): > + raise error.TestError("Could not determine host IP: > %d %s" % > + (value, e)) > + > + # Now, starting the guest > + vm.verify_alive() > + session = vm.wait_for_login(timeout=login_timeout) > + > + # Kill previous instances of the load programs, if any > + _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd) > + # Clean up previous log instances > + session.cmd("rm -f %s" % monitor_log_file_client) > + > + # Opening firewall ports on guest > + try: > + session.cmd("iptables -F") > + except: > + pass > + > + # Get required files and copy them from host to guest > + monitor_path = os.path.join(test.bindir, 'deps', > 'heartbeat_slu.py') > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", > "stress", > + "stress-1.0.4.tar.gz") > + vm.copy_files_to(monitor_path, "/tmp") > + vm.copy_files_to(stress_path, "/tmp") > + > + logging.info("Setup monitor client on guest") > + # Start heartbeat on guest > + session.cmd(params.get("client_setup_cmd") % > + ("/tmp", monitor_log_file_client, host_ip, > monitor_port)) + The argument order here does not match the command template. In tests_base.cfg: client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1" but in softlockup.py: session.cmd(params.get("client_setup_cmd") % ("/tmp", monitor_log_file_client, host_ip, monitor_port)) So the --address and --file values are swapped: --address receives monitor_log_file_client and --file receives host_ip. The tuple should be ("/tmp", host_ip, monitor_log_file_client, monitor_port). 
> + logging.info("Build stress on guest") > + # Uncompress and build stress on guest > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > + > + logging.info("Run stress on guest") > + # stress_threads = 2 * n_vcpus > + threads_guest = 2 * int(params.get("smp", 1)) > + # Run stress test on guest > + session.cmd(stress_cmd % ("/tmp", threads_guest)) > + > + # Wait and report > + logging.debug("Wait for %d s", test_length) > + time.sleep(test_length) > + > + # Kill instances of the load programs on both guest and host > + _kill_guest_programs(session, kill_stress_cmd, > kill_monitor_cmd) > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > + > + # Collect drift > + drift = utils.system_output(drift_cmd % > monitor_log_file_server) > + logging.info("Drift noticed: %s", drift) > + > + > + host() > + guest() -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html