On Fri, 12 Aug 2011 12:37:15 +0530 pradeep <psuriset@xxxxxxxxxxxxxxxxxx> wrote: > On Wed, 20 Jul 2011 22:30:09 -0300 > Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> wrote: > > > From: pradeep <you@xxxxxxxxxxx> > > > > This patch introduces a soft lockup/drift test with stress. > > > > 1) Boot up a VM. > > 2) Build stress on host and guest. > > 3) run heartbeat monitor with the given options on server and > > host. 3) Run for a relatively long time length, ex: 12, 18 or 24 > > hours. 4) Output the test result and observe drift. > > Thanks for making changes. > How about taking average of last 10 drift values? I observed the values below in my softlockup test. The drift values are all more or less the same (+0.01, +0.02), so there would not be much difference between using the last value and using the average of the last 10. For stress and performance tests like this one, why do we need a PASS/FAIL result? All we care about here is the drift value. 1313148260.65: localhost.localdomain 000417 1313148259.45 (drift +0.01 (-0.00)) 1313148261.65: localhost.localdomain 000418 1313148260.46 (drift +0.02 (+0.01)) 1313148262.65: localhost.localdomain 000419 1313148261.46 (drift +0.02 (-0.00)) 1313148263.66: localhost.localdomain 000420 1313148262.46 (drift +0.02 (-0.00)) 1313148264.66: localhost.localdomain 000421 1313148263.46 (drift +0.01 (-0.00)) 1313148265.76: localhost.localdomain 000422 1313148264.56 (drift +0.01 (-0.00)) 1313148266.76: localhost.localdomain 000423 1313148265.56 (drift +0.01 (-0.00)) 1313148267.76: localhost.localdomain 000424 1313148266.57 (drift +0.02 (+0.01)) 1313148268.76: localhost.localdomain 000425 1313148267.57 (drift +0.02 (-0.00)) 1313148269.77: localhost.localdomain 000426 1313148268.57 (drift +0.02 (-0.00)) 1313148270.87: localhost.localdomain 000427 1313148269.67 (drift +0.01 (-0.00)) 1313148271.87: localhost.localdomain 000428 1313148270.68 (drift +0.02 (+0.01)) 1313148272.87: localhost.localdomain 000429 1313148271.68 (drift +0.02 (-0.00)) 1313148273.88: localhost.localdomain 000430 1313148272.68 
(drift +0.02 (-0.00)) 1313148274.88: localhost.localdomain 000431 1313148273.68 (drift +0.01 (-0.00)) 1313148275.97: localhost.localdomain 000432 1313148274.78 (drift +0.02 (+0.01)) 1313148276.97: localhost.localdomain 000433 1313148275.78 (drift +0.02 (-0.00)) 1313148277.98: localhost.localdomain 000434 1313148276.78 (drift +0.02 (-0.00)) 1313148278.98: localhost.localdomain 000435 1313148277.78 (drift +0.01 (-0.00)) 1313148279.98: localhost.localdomain 000436 1313148278.78 (drift +0.01 (-0.00)) 1313148281.08: localhost.localdomain 000437 1313148279.89 (drift +0.02 (+0.01)) 1313148282.09: localhost.localdomain 000438 1313148280.89 (drift +0.02 (-0.00)) 1313148283.09: localhost.localdomain 000439 1313148281.89 (drift +0.01 (-0.00)) > > > > > Changes from v2: > > * Fixed up commands being used on guest, lack of proper output > > redirection was confusing aexpect > > * Proper clean up previous instances of the monitor programs > > lying around, as well as log files > > * Resort to another method of determining host IP if the same > > has no fully qualified hostname (stand alone laptops, for > > example) > > * Only use a single session on guest to execute all the commands. > > previous version was opening unneeded connections. 
> > * Fix stress execution in guest and host, now the stress instances > > effectively start > > * Actively open guest and host firewall rules so heartbeat monitor > > communication can happen > > > > Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> > > Signed-off-by: Pradeep Kumar Surisetty <psuriset@xxxxxxxxxxxxxxxxxx> > > --- > > client/tests/kvm/deps/heartbeat_slu.py | 205 > > ++++++++++++++++++++++++++++++++ > > client/tests/kvm/tests_base.cfg.sample | 18 +++ > > client/virt/tests/softlockup.py | 147 > > +++++++++++++++++++++++ 3 files changed, 370 insertions(+), 0 > > deletions(-) create mode 100755 > > client/tests/kvm/deps/heartbeat_slu.py create mode 100644 > > client/virt/tests/softlockup.py > > > > diff --git a/client/tests/kvm/deps/heartbeat_slu.py > > b/client/tests/kvm/deps/heartbeat_slu.py new file mode 100755 > > index 0000000..697bbbf > > --- /dev/null > > +++ b/client/tests/kvm/deps/heartbeat_slu.py > > @@ -0,0 +1,205 @@ > > +#!/usr/bin/env python > > + > > +""" > > +Heartbeat server/client to detect soft lockups > > +""" > > + > > +import socket, os, sys, time, getopt > > + > > +def daemonize(output_file): > > + try: > > + pid = os.fork() > > + except OSError, e: > > + raise Exception, "error %d: %s" % (e.strerror, e.errno) > > + > > + if pid: > > + os._exit(0) > > + > > + os.umask(0) > > + os.setsid() > > + sys.stdout.flush() > > + sys.stderr.flush() > > + > > + if file: > > + output_handle = file(output_file, 'a+', 0) > > + # autoflush stdout/stderr > > + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) > > + sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0) > > + else: > > + output_handle = file('/dev/null', 'a+') > > + > > + stdin_handle = open('/dev/null', 'r') > > + os.dup2(output_handle.fileno(), sys.stdout.fileno()) > > + os.dup2(output_handle.fileno(), sys.stderr.fileno()) > > + os.dup2(stdin_handle.fileno(), sys.stdin.fileno()) > > + > > +def recv_all(sock): > > + total_data = [] > > + while True: > > + data = 
sock.recv(1024) > > + if not data: > > + break > > + total_data.append(data) > > + return ''.join(total_data) > > + > > +def run_server(host, port, daemon, file, queue_size, threshold, > > drift): > > + if daemon: > > + daemonize(output_file=file) > > + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) > > + sock.bind((host, port)) > > + sock.listen(queue_size) > > + timeout_interval = threshold * 2 > > + prev_check_timestamp = float(time.time()) > > + while 1: > > + c_sock, c_addr = sock.accept() > > + heartbeat = recv_all(c_sock) > > + local_timestamp = float(time.time()) > > + drift = check_heartbeat(heartbeat, local_timestamp, > > threshold, check_drift) > > + # NOTE: this doesn't work if the only client is the one > > that timed > > + # out, but anything more complete would require another > > thread and > > + # a lock for client_prev_timestamp. > > + if local_timestamp - prev_check_timestamp > threshold * > > 2.0: > > + check_for_timeouts(threshold, check_drift) > > + prev_check_timestamp = local_timestamp > > + if verbose: > > + if check_drift: > > + print "%.2f: %s (%s)" % (local_timestamp, > > heartbeat, drift) > > + else: > > + print "%.2f: %s" % (local_timestamp, heartbeat) > > + > > +def run_client(host, port, daemon, file, interval): > > + if daemon: > > + daemonize(output_file=file) > > + seq = 1 > > + while 1: > > + try: > > + sock = socket.socket(socket.AF_INET, > > socket.SOCK_STREAM) > > + sock.connect((host, port)) > > + heartbeat = get_heartbeat(seq) > > + sock.sendall(heartbeat) > > + sock.close() > > + if verbose: > > + print heartbeat > > + except socket.error, (value, message): > > + print "%.2f: ERROR, %d - %s" % (float(time.time()), > > value, message) + > > + seq += 1 > > + time.sleep(interval) > > + > > +def get_heartbeat(seq=1): > > + return "%s %06d %.2f" % (hostname, seq, float(time.time())) > > + > > +def check_heartbeat(heartbeat, local_timestamp, threshold, > > check_drift): > > + hostname, seq, timestamp = 
heartbeat.rsplit() > > + timestamp = float(timestamp) > > + if client_prev_timestamp.has_key(hostname): > > + delta = local_timestamp - client_prev_timestamp[hostname] > > + if delta > threshold: > > + print "%.2f: ALERT, SLU detected on host %s, delta > > %ds" \ > > + % (float(time.time()), hostname, delta) > > + > > + client_prev_timestamp[hostname] = local_timestamp > > + > > + if check_drift: > > + if not client_clock_offset.has_key(hostname): > > + client_clock_offset[hostname] = timestamp - > > local_timestamp > > + client_prev_drift[hostname] = 0 > > + drift = timestamp - local_timestamp - > > client_clock_offset[hostname] > > + drift_delta = drift - client_prev_drift[hostname] > > + client_prev_drift[hostname] = drift > > + return "drift %+4.2f (%+4.2f)" % (drift, drift_delta) > > + > > +def check_for_timeouts(threshold, check_drift): > > + local_timestamp = float(time.time()) > > + hostname_list = list(client_prev_timestamp) > > + for hostname in hostname_list: > > + timestamp = client_prev_timestamp[hostname] > > + delta = local_timestamp - timestamp > > + if delta > threshold * 2: > > + print "%.2f: ALERT, SLU detected on host %s, no > > heartbeat for %ds" \ > > + % (local_timestamp, hostname, delta) > > + del client_prev_timestamp[hostname] > > + if check_drift: > > + del client_clock_offset[hostname] > > + del client_prev_drift[hostname] > > + > > +def usage(): > > + print """ > > +Usage: > > + > > + heartbeat_slu.py --server --address <bind_address> --port > > <bind_port> > > + [--file <output_file>] [--no-daemon] > > [--verbose] > > + [--threshold <heartbeat threshold>] > > + > > + heartbeat_slu.py --client --address <server_address> -p > > <server_port> > > + [--file output_file] [--no-daemon] [--verbose] > > + [--interval <heartbeat interval in seconds>] > > +""" > > + > > +# host information and global data > > +hostname = socket.gethostname() > > +client_prev_timestamp = {} > > +client_clock_offset = {} > > +client_prev_drift = {} > > + > > +# 
default param values > > +host_port = 9001 > > +host_address = '' > > +interval = 1 # seconds between heartbeats > > +threshold = 10 # seconds late till alert > > +is_server = False > > +is_daemon = True > > +file_server = "/tmp/heartbeat_server.out" > > +file_client = "/tmp/heartbeat_client.out" > > +file_selected = None > > +queue_size = 5 > > +verbose = False > > +check_drift = False > > + > > +# process cmdline opts > > +try: > > + opts, args = getopt.getopt(sys.argv[1:], "vhsfd:p:a:i:t:", [ > > + "server", "client", "no-daemon", "address=", > > "port=", > > + "file=", "server", "interval=", "threshold=", > > "verbose", > > + "check-drift", "help"]) > > +except getopt.GetoptError, e: > > + print "error: %s" % str(e) > > + usage() > > + exit(1) > > + > > +for param, value in opts: > > + if param in ["-p", "--port"]: > > + host_port = int(value) > > + elif param in ["-a", "--address"]: > > + host_address = value > > + elif param in ["-s", "--server"]: > > + is_server = True > > + elif param in ["-c", "--client"]: > > + is_server = False > > + elif param in ["--no-daemon"]: > > + is_daemon = False > > + elif param in ["-f", "--file"]: > > + file_selected = value > > + elif param in ["-i", "--interval"]: > > + interval = int(value) > > + elif param in ["-t", "--threshold"]: > > + threshold = int(value) > > + elif param in ["-d", "--check-drift"]: > > + check_drift = True > > + elif param in ["-v", "--verbose"]: > > + verbose = True > > + elif param in ["-h", "--help"]: > > + usage() > > + exit(0) > > + else: > > + print "error: unrecognized option: %s" % value > > + usage() > > + exit(1) > > + > > +# run until we're terminated > > +if is_server: > > + file_server = file_selected or file_server > > + run_server(host_address, host_port, is_daemon, file_server, > > queue_size, threshold, check_drift) +else: > > + file_client = file_selected or file_client > > + run_client(host_address, host_port, is_daemon, file_client, > > interval) diff --git 
a/client/tests/kvm/tests_base.cfg.sample > > b/client/tests/kvm/tests_base.cfg.sample index 65880d8..e9e41f9 > > 100644 --- a/client/tests/kvm/tests_base.cfg.sample > > +++ b/client/tests/kvm/tests_base.cfg.sample > > @@ -420,6 +420,24 @@ variants: > > type = smbios_table > > start_vm = no > > > > + - softlockup: install setup unattended_install.cdrom > > + only Linux > > + type = softlockup > > + softlockup_files = stress-1.0.4.tar.gz > > + stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && > > cd stress-1.0.4 && ./configure && make && cd src" > > + server_setup_cmd = "%s/heartbeat_slu.py --server > > --threshold %s --file %s --port %s --verbose --check-drift" > > + client_setup_cmd = "%s/heartbeat_slu.py --client --address > > %s --file %s --port %s --interval 1" > > + stress_cmd = "cd %s && cd stress-1.0.4 && cd src && > > nohup ./stress -c %s > /dev/null 2>&1&" > > + kill_monitor_cmd = "ps aux | grep heart | grep -v grep | > > awk '{print$2}' | xargs kill -9 > /dev/null 2>&1" > > + kill_stress_cmd = "pkill -f stress > /dev/null 2>&1" > > + drift_cmd = "tail -1 %s | awk '{print $7}'" > > + monitor_log_file_server = /tmp/heartbeat_server.log > > + monitor_log_file_client = /tmp/heartbeat_client.log > > + monitor_port = 13330 > > + stress_threshold = 10 > > + # time_to_run (hours) = 12, 18, 24, 48 hours > > + test_length = 0.10 > > + > > - stress_boot: install setup image_copy > > unattended_install.cdrom type = stress_boot > > max_vms = 5 > > diff --git a/client/virt/tests/softlockup.py > > b/client/virt/tests/softlockup.py new file mode 100644 > > index 0000000..d946965 > > --- /dev/null > > +++ b/client/virt/tests/softlockup.py > > @@ -0,0 +1,147 @@ > > +import logging, os, socket, time > > +from autotest_lib.client.bin import utils > > + > > + > > +def run_softlockup(test, params, env): > > + """ > > + soft lockup/drift test with stress. > > + > > + 1) Boot up a VM. > > + 2) Build stress on host and guest. 
> > + 3) run heartbeat with the given options on server and host. > > + 3) Run for a relatively long time length. ex: 12, 18 or 24 > > hours. > > + 4) Output the test result and observe drift. > > + > > + @param test: KVM test object. > > + @param params: Dictionary with the test parameters. > > + @param env: Dictionary with test environment. > > + """ > > + stress_setup_cmd = params.get("stress_setup_cmd") > > + stress_cmd = params.get("stress_cmd") > > + server_setup_cmd = params.get("server_setup_cmd") > > + drift_cmd = params.get("drift_cmd") > > + kill_stress_cmd = params.get("kill_stress_cmd") > > + kill_monitor_cmd = params.get("kill_monitor_cmd") > > + > > + threshold = int(params.get("stress_threshold")) > > + monitor_log_file_server = params.get("monitor_log_file_server") > > + monitor_log_file_client = params.get("monitor_log_file_client") > > + test_length = int(3600 * float(params.get("test_length"))) > > + monitor_port = int(params.get("monitor_port")) > > + > > + vm = env.get_vm(params["main_vm"]) > > + login_timeout = int(params.get("login_timeout", 360)) > > + stress_dir = os.path.join(os.environ['AUTODIR'], > > "tests/stress") > > + monitor_dir = os.path.join(test.bindir, 'deps') > > + > > + > > + def _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd): > > + logging.info("Kill stress and monitor on guest") > > + try: > > + session.cmd(kill_stress_cmd) > > + except: > > + pass > > + try: > > + session.cmd(kill_monitor_cmd) > > + except: > > + pass > > + > > + > > + def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd): > > + logging.info("Kill stress and monitor on host") > > + utils.run(kill_stress_cmd, ignore_status=True) > > + utils.run(kill_monitor_cmd, ignore_status=True) > > + > > + > > + def host(): > > + logging.info("Setup monitor server on host") > > + # Kill previous instances of the host load programs, if any > > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > > + # Cleanup previous log instances > > + 
if os.path.isfile(monitor_log_file_server): > > + os.remove(monitor_log_file_server) > > + # Opening firewall ports on host > > + utils.run("iptables -F", ignore_status=True) > > + > > + # Run heartbeat on host > > + utils.run(server_setup_cmd % (monitor_dir, threshold, > > + monitor_log_file_server, > > monitor_port)) + > > + logging.info("Build stress on host") > > + # Uncompress and build stress on host > > + utils.run(stress_setup_cmd % stress_dir) > > + > > + logging.info("Run stress on host") > > + # stress_threads = 2 * n_cpus > > + threads_host = 2 * utils.count_cpus() > > + # Run stress test on host > > + utils.run(stress_cmd % (stress_dir, threads_host)) > > + > > + > > + def guest(): > > + try: > > + host_ip = socket.gethostbyname(socket.gethostname()) > > + except socket.error: > > + try: > > + # Hackish, but works well on stand alone (laptop) > > setups > > + # with access to the internet. If this fails, well, > > then > > + # not much else can be done... > > + s = socket.socket(socket.AF_INET, > > socket.SOCK_DGRAM) > > + s.connect(("redhat.com", 80)) > > + host_ip = s.getsockname()[0] > > + except socket.error, (value, e): > > + raise error.TestError("Could not determine host IP: > > %d %s" % > > + (value, e)) > > + > > + # Now, starting the guest > > + vm.verify_alive() > > + session = vm.wait_for_login(timeout=login_timeout) > > + > > + # Kill previous instances of the load programs, if any > > + _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd) > > + # Clean up previous log instances > > + session.cmd("rm -f %s" % monitor_log_file_client) > > + > > + # Opening firewall ports on guest > > + try: > > + session.cmd("iptables -F") > > + except: > > + pass > > + > > + # Get required files and copy them from host to guest > > + monitor_path = os.path.join(test.bindir, 'deps', > > 'heartbeat_slu.py') > > + stress_path = os.path.join(os.environ['AUTODIR'], "tests", > > "stress", > > + "stress-1.0.4.tar.gz") > > + 
vm.copy_files_to(monitor_path, "/tmp") > > + vm.copy_files_to(stress_path, "/tmp") > > + > > + logging.info("Setup monitor client on guest") > > + # Start heartbeat on guest > > + session.cmd(params.get("client_setup_cmd") % > > + ("/tmp", monitor_log_file_client, host_ip, > > monitor_port)) + > > in tests_base.cfg > > client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file > %s --port %s --interval 1" > > in softlockup.py > > session.cmd(params.get("client_setup_cmd") % ("/tmp", > monitor_log_file_client, host_ip, > monitor_port)) > > The --address and --file values are swapped: client_setup_cmd expects the address before the file, but the tuple passes monitor_log_file_client before host_ip, so each option picks up the other's value. > > > > + logging.info("Build stress on guest") > > + # Uncompress and build stress on guest > > + session.cmd(stress_setup_cmd % "/tmp", timeout=200) > > + > > + logging.info("Run stress on guest") > > + # stress_threads = 2 * n_vcpus > > + threads_guest = 2 * int(params.get("smp", 1)) > > + # Run stress test on guest > > + session.cmd(stress_cmd % ("/tmp", threads_guest)) > > + > > + # Wait and report > > + logging.debug("Wait for %d s", test_length) > > + time.sleep(test_length) > > + > > + # Kill instances of the load programs on both guest and > > host > > + _kill_guest_programs(session, kill_stress_cmd, > > kill_monitor_cmd) > > + _kill_host_programs(kill_stress_cmd, kill_monitor_cmd) > > + > > + # Collect drift > > + drift = utils.system_output(drift_cmd % > > monitor_log_file_server) > > + logging.info("Drift noticed: %s", drift) > > + > > + > > + host() > > + guest() > > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html