[PATCH] Virt: Adding softlockup subtest

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: pradeep <you@xxxxxxxxxxx>

This patch introduces a soft lockup/drift test with stress.

    1) Boot up a VM.
    2) Build stress on host and guest.
    3) Run the heartbeat monitor with the given options: server on the host,
       client on the guest.
    4) Run for a relatively long time length, ex: 12, 18 or 24 hours.
    5) Output the test result and observe drift.

Changes from v2:
 * Fixed up commands being used on guest, lack of proper output
   redirection was confusing aexpect
 * Properly clean up previous instances of the monitor programs
   lying around, as well as log files
 * Resort to another method of determining host IP if the same
   has no fully qualified hostname (stand alone laptops, for
   example)
 * Only use a single session on guest to execute all the commands.
   The previous version was opening unneeded connections.
 * Fix stress execution in guest and host, now the stress instances
   effectively start
 * Actively open guest and host firewall rules so heartbeat monitor
   communication can happen

Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx>
Signed-off-by: Pradeep Kumar Surisetty <psuriset@xxxxxxxxxxxxxxxxxx>
---
 client/tests/kvm/deps/heartbeat_slu.py |  205 ++++++++++++++++++++++++++++++++
 client/tests/kvm/tests_base.cfg.sample |   18 +++
 client/virt/tests/softlockup.py        |  147 +++++++++++++++++++++++
 3 files changed, 370 insertions(+), 0 deletions(-)
 create mode 100755 client/tests/kvm/deps/heartbeat_slu.py
 create mode 100644 client/virt/tests/softlockup.py

diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py
new file mode 100755
index 0000000..697bbbf
--- /dev/null
+++ b/client/tests/kvm/deps/heartbeat_slu.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+"""
+Heartbeat server/client to detect soft lockups
+"""
+
+import socket, os, sys, time, getopt
+
+def daemonize(output_file):
+    """Fork into a new session and send stdout/stderr to output_file."""
+    try:
+        pid = os.fork()
+    except OSError, e:
+        raise Exception, "error %d: %s" % (e.errno, e.strerror)
+    if pid:
+        os._exit(0)
+
+    os.umask(0)
+    os.setsid()
+    sys.stdout.flush()
+    sys.stderr.flush()
+    # the builtin 'file' type is always true, so test the argument itself
+    if output_file:
+        output_handle = file(output_file, 'a+', 0)
+        # autoflush stdout/stderr
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+        sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)
+    else:
+        output_handle = file('/dev/null', 'a+')
+
+    stdin_handle = open('/dev/null', 'r')
+    os.dup2(output_handle.fileno(), sys.stdout.fileno())
+    os.dup2(output_handle.fileno(), sys.stderr.fileno())
+    os.dup2(stdin_handle.fileno(), sys.stdin.fileno())
+
+def recv_all(sock):
+    """Return everything read from sock until recv() yields no more data."""
+    chunks = []
+    chunk = sock.recv(1024)
+    while chunk:
+        chunks.append(chunk)
+        chunk = sock.recv(1024)
+    return ''.join(chunks)
+
+def run_server(host, port, daemon, file, queue_size, threshold, drift):
+    """Accept heartbeats forever; a true 'drift' enables drift reporting."""
+    if daemon:
+        daemonize(output_file=file)
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind((host, port))
+    sock.listen(queue_size)
+    prev_check_timestamp = float(time.time())
+    while 1:
+        c_sock, c_addr = sock.accept()
+        heartbeat = recv_all(c_sock)
+        c_sock.close()
+        local_timestamp = float(time.time())
+        drift_info = check_heartbeat(heartbeat, local_timestamp, threshold, drift)
+        # NOTE: a sole client that timed out is never noticed; anything more
+        # complete needs another thread and a lock for client_prev_timestamp.
+        if local_timestamp - prev_check_timestamp > threshold * 2.0:
+            check_for_timeouts(threshold, drift)
+            prev_check_timestamp = local_timestamp
+        if verbose:
+            if drift:
+                print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift_info)
+            else:
+                print "%.2f: %s" % (local_timestamp, heartbeat)
+
+def run_client(host, port, daemon, file, interval):
+    """Send one numbered heartbeat to host:port every interval seconds."""
+    if daemon:
+        daemonize(output_file=file)
+    seq = 0
+    while True:
+        seq += 1
+        try:
+            conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            conn.connect((host, port))
+            beat = get_heartbeat(seq)
+            conn.sendall(beat)
+            conn.close()
+            if verbose:
+                print beat
+        except socket.error, (value, message):
+            print "%.2f: ERROR, %d - %s" % (float(time.time()), value, message)
+        time.sleep(interval)
+
+def get_heartbeat(seq=1):  # "<hostname> <zero-padded seq> <unix time>"
+    return " ".join((hostname, "%06d" % seq, "%.2f" % time.time()))
+
+def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift):
+    """Record a heartbeat, alert when late; return drift text or None."""
+    try:
+        hostname, seq, timestamp = heartbeat.split()
+        timestamp = float(timestamp)
+    except ValueError:  # empty/garbled payload must not kill the server
+        print "%.2f: ERROR, malformed heartbeat %r" % (local_timestamp, heartbeat)
+        return None
+    prev = client_prev_timestamp.get(hostname)
+    if prev is not None and local_timestamp - prev > threshold:
+        print "%.2f: ALERT, SLU detected on host %s, delta %ds" \
+            % (float(time.time()), hostname, local_timestamp - prev)
+    client_prev_timestamp[hostname] = local_timestamp
+    if check_drift:
+        skew = timestamp - local_timestamp  # client clock minus server clock
+        drift = skew - client_clock_offset.setdefault(hostname, skew)
+        drift_delta = drift - client_prev_drift.get(hostname, 0)
+        client_prev_drift[hostname] = drift
+        return "drift %+4.2f (%+4.2f)" % (drift, drift_delta)
+
+def check_for_timeouts(threshold, check_drift):
+    """Alert on, and forget, hosts silent for more than 2 * threshold secs."""
+    now = float(time.time())
+    # items() returns a list copy in Python 2, so deleting entries is safe
+    for host, last_seen in client_prev_timestamp.items():
+        silence = now - last_seen
+        if silence > threshold * 2:
+            print "%.2f: ALERT, SLU detected on host %s, no heartbeat for %ds" \
+                % (now, host, silence)
+            del client_prev_timestamp[host]
+            if check_drift:
+                del client_clock_offset[host]
+                del client_prev_drift[host]
+
+def usage():  # print the command-line synopsis to stdout
+    print """
+Usage:
+    
+    heartbeat_slu.py --server --address <bind_address> --port <bind_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--threshold <heartbeat threshold>] [--check-drift]
+
+    heartbeat_slu.py --client --address <server_address> --port <server_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--interval <heartbeat interval in seconds>]
+"""
+
+# host information and global data
+hostname = socket.gethostname()
+client_prev_timestamp = {}  # hostname -> server time of its last heartbeat
+client_clock_offset = {}  # hostname -> first observed client-server skew
+client_prev_drift = {}  # hostname -> drift computed for the previous beat
+
+# default param values
+host_port = 9001
+host_address = ''
+interval = 1 # seconds between heartbeats
+threshold = 10 # seconds late till alert
+is_server = False
+is_daemon = True
+file_server = "/tmp/heartbeat_server.out"
+file_client = "/tmp/heartbeat_client.out"
+file_selected = None  # set by --file; overrides the two defaults above
+queue_size = 5  # backlog passed to listen() by the server
+verbose = False
+check_drift = False
+
+# process cmdline opts (in the short list, a trailing ':' means "takes a value")
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "vhscdf:p:a:i:t:", [
+                    "server", "client", "no-daemon", "address=", "port=",
+                    "file=", "interval=", "threshold=", "verbose",
+                    "check-drift", "help"])
+except getopt.GetoptError, e:
+    print "error: %s" % str(e)
+    usage()
+    sys.exit(1)
+
+for param, value in opts:  # value is '' for options that take no argument
+    if param in ["-p", "--port"]:
+        host_port = int(value)
+    elif param in ["-a", "--address"]:
+        host_address = value
+    elif param in ["-s", "--server"]:
+        is_server = True
+    elif param in ["-c", "--client"]:
+        is_server = False
+    elif param in ["--no-daemon"]:
+        is_daemon = False
+    elif param in ["-f", "--file"]:
+        file_selected = value
+    elif param in ["-i", "--interval"]:
+        interval = int(value)
+    elif param in ["-t", "--threshold"]:
+        threshold = int(value)
+    elif param in ["-d", "--check-drift"]:
+        check_drift = True
+    elif param in ["-v", "--verbose"]:
+        verbose = True
+    elif param in ["-h", "--help"]:
+        usage()
+        sys.exit(0)
+    else:
+        print "error: unrecognized option: %s" % param  # name the option itself
+        usage()
+        sys.exit(1)
+
+# run until we're terminated; a --file value replaces the default log path
+if not is_server:
+    file_client = file_selected or file_client
+    run_client(host_address, host_port, is_daemon, file_client, interval)
+else:
+    file_server = file_selected or file_server
+    run_server(host_address, host_port, is_daemon, file_server, queue_size, threshold, check_drift)
diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample
index 65880d8..e9e41f9 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -420,6 +420,24 @@ variants:
         type = smbios_table
         start_vm = no
 
+    - softlockup: install setup unattended_install.cdrom
+        only Linux
+        type = softlockup
+        softlockup_files = stress-1.0.4.tar.gz
+        stress_setup_cmd = "cd %s && tar xvf stress-1.0.4.tar.gz && cd stress-1.0.4 && ./configure && make && cd src"
+        server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --port %s --verbose --check-drift"
+        client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1"
+        stress_cmd  = "cd %s && cd stress-1.0.4 && cd src && nohup ./stress -c %s > /dev/null 2>&1&"
+        kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9 > /dev/null 2>&1"
+        kill_stress_cmd = "pkill -f stress > /dev/null 2>&1"
+        drift_cmd = "tail -1 %s | awk '{print $7}'"
+        monitor_log_file_server = /tmp/heartbeat_server.log
+        monitor_log_file_client = /tmp/heartbeat_client.log
+        monitor_port = 13330
+        stress_threshold = 10
+        # test_length is in hours; long runs typically use 12, 18, 24 or 48
+        test_length = 0.10
+
     - stress_boot: install setup image_copy unattended_install.cdrom
         type = stress_boot
         max_vms = 5    
diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py
new file mode 100644
index 0000000..d946965
--- /dev/null
+++ b/client/virt/tests/softlockup.py
@@ -0,0 +1,147 @@
+import logging, os, socket, time
+from autotest_lib.client.bin import utils
+from autotest_lib.client.common_lib import error
+
+def run_softlockup(test, params, env):
+    """
+    soft lockup/drift test with stress.
+
+    1) Boot up a VM.
+    2) Build stress on host and guest.
+    3) Run the heartbeat monitor: server on the host, client on the guest.
+    4) Run for a relatively long time length. ex: 12, 18 or 24 hours.
+    5) Output the test result and observe drift.
+
+    @param test: KVM test object.
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+    stress_setup_cmd = params.get("stress_setup_cmd")
+    stress_cmd = params.get("stress_cmd")
+    server_setup_cmd = params.get("server_setup_cmd")
+    drift_cmd = params.get("drift_cmd")
+    kill_stress_cmd = params.get("kill_stress_cmd")
+    kill_monitor_cmd = params.get("kill_monitor_cmd")
+
+    threshold = int(params.get("stress_threshold"))
+    monitor_log_file_server = params.get("monitor_log_file_server")
+    monitor_log_file_client = params.get("monitor_log_file_client")
+    test_length = int(3600 * float(params.get("test_length")))
+    monitor_port = int(params.get("monitor_port"))
+
+    vm = env.get_vm(params["main_vm"])
+    login_timeout = int(params.get("login_timeout", 360))
+    stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress")
+    monitor_dir = os.path.join(test.bindir, 'deps')
+
+
+    def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on guest")
+        try:
+            session.cmd(kill_stress_cmd)
+        except Exception:  # best effort: nothing to kill is not an error
+            pass
+        try:
+            session.cmd(kill_monitor_cmd)
+        except Exception:  # best effort, as above
+            pass
+
+    def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on host")
+        utils.run(kill_stress_cmd, ignore_status=True)
+        utils.run(kill_monitor_cmd, ignore_status=True)
+
+    def host():
+        """Start the heartbeat server, then build and run stress on the host."""
+        logging.info("Setup monitor server on host")
+        # Kill previous instances of the host load programs, if any
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+        # Cleanup previous log instances
+        if os.path.isfile(monitor_log_file_server):
+            os.remove(monitor_log_file_server)
+        # Opening firewall ports on host
+        utils.run("iptables -F", ignore_status=True)
+
+        # Run heartbeat on host
+        utils.run(server_setup_cmd % (monitor_dir, threshold,
+                                      monitor_log_file_server, monitor_port))
+
+        logging.info("Build stress on host")
+        # Uncompress and build stress on host
+        utils.run(stress_setup_cmd % stress_dir)
+
+        logging.info("Run stress on host")
+        # stress_threads = 2 * n_cpus
+        threads_host = 2 * utils.count_cpus()
+        # Run stress test on host
+        utils.run(stress_cmd % (stress_dir, threads_host))
+
+    def guest():
+        """Run the heartbeat client and stress in the guest, then log drift."""
+        try:
+            host_ip = socket.gethostbyname(socket.gethostname())
+        except socket.error:
+            try:
+                # Hackish, but works well on stand alone (laptop) setups
+                # with access to the internet. If this fails, well, then
+                # not much else can be done...
+                s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+                s.connect(("redhat.com", 80))
+                host_ip = s.getsockname()[0]
+                s.close()
+            except socket.error, (value, e):
+                raise error.TestError("Could not determine host IP: %d %s" %
+                                      (value, e))
+
+        # Now, starting the guest
+        vm.verify_alive()
+        session = vm.wait_for_login(timeout=login_timeout)
+
+        # Kill previous instances of the load programs, if any
+        _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+        # Clean up previous log instances
+        session.cmd("rm -f %s" % monitor_log_file_client)
+
+        # Opening firewall ports on guest
+        try:
+            session.cmd("iptables -F")
+        except Exception:  # best effort: the guest may have no iptables
+            pass
+
+        # Get required files and copy them from host to guest
+        monitor_path = os.path.join(test.bindir, 'deps', 'heartbeat_slu.py')
+        stress_path = os.path.join(os.environ['AUTODIR'], "tests", "stress",
+                                   "stress-1.0.4.tar.gz")
+        vm.copy_files_to(monitor_path, "/tmp")
+        vm.copy_files_to(stress_path, "/tmp")
+
+        logging.info("Setup monitor client on guest")
+        # Start heartbeat on guest; args follow cfg order: dir, address, file, port
+        session.cmd(params.get("client_setup_cmd") %
+                    ("/tmp", host_ip, monitor_log_file_client, monitor_port))
+
+        logging.info("Build stress on guest")
+        # Uncompress and build stress on guest
+        session.cmd(stress_setup_cmd % "/tmp", timeout=200)
+
+        logging.info("Run stress on guest")
+        # stress_threads = 2 * n_vcpus
+        threads_guest = 2 * int(params.get("smp", 1))
+        # Run stress test on guest
+        session.cmd(stress_cmd % ("/tmp", threads_guest))
+
+        # Wait and report
+        logging.debug("Wait for %d s", test_length)
+        time.sleep(test_length)
+
+        # Kill instances of the load programs on both guest and host
+        _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+
+        # Collect drift
+        drift = utils.system_output(drift_cmd %  monitor_log_file_server)
+        logging.info("Drift noticed: %s", drift)
+        session.close()
+
+    host()
+    guest()
-- 
1.7.6

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux