Re: [PATCH] Virt: Adding softlockup subtest

Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> · Wed, 20 Jul 2011 22:38:03 -0300

On 07/20/2011 10:30 PM, Lucas Meneghel Rodrigues wrote:
From: pradeep<you@xxxxxxxxxxx>

Ok Pradeep, I checked out the new version of the test and made 
corrections to it (see changelog).

Now, what I don't quite like on this test is:

 * There are no PASS/FAIL criteria; that is, the test never fails. This
is not good.
 * The method of determining the drift looks strange to me. The drift 
monitor produces a line of drifts every second, then your code checks 
only the last line of it. Not sure if this is correct.
 * Also, when trying out the test here I found problems. Did you 
actually run the test until the end? I kindly ask you to test with a 
reduced time length (say, 15 or 30 minutes). I've adapted the test so
it can use fractions of an hour instead of full hours.

So please, go through the new patch:

http://patchwork.test.kernel.org/patch/3570/mbox/

And give me a failure criterion and justify the drift calculation being
done the way you are doing it (or fix it).

Thanks,

Lucas

This patch introduces a soft lockup/drift test with stress.

     1) Boot up a VM.
     2) Build stress on host and guest.
     3) Run heartbeat monitor with the given options: server on host,
        client on guest.
     4) Run for a relatively long time length, ex: 12, 18 or 24 hours.
     5) Output the test result and observe drift.

Changes from v2:
  * Fixed up commands being used on guest, lack of proper output
    redirection was confusing aexpect
  * Properly clean up previous instances of the monitor programs
    lying around, as well as log files
  * Resort to another method of determining host IP if the same
    has no fully qualified hostname (stand alone laptops, for
    example)
  * Only use a single session on guest to execute all the commands.
    The previous version was opening unneeded connections.
  * Fix stress execution in guest and host, now the stress instances
    effectively start
  * Actively open guest and host firewall rules so heartbeat monitor
    communication can happen

Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx>
Signed-off-by: Pradeep Kumar Surisetty <psuriset@xxxxxxxxxxxxxxxxxx>
---
  client/tests/kvm/deps/heartbeat_slu.py |  205 ++++++++++++++++++++++++++++++++
  client/tests/kvm/tests_base.cfg.sample |   18 +++
  client/virt/tests/softlockup.py        |  147 +++++++++++++++++++++++
  3 files changed, 370 insertions(+), 0 deletions(-)
  create mode 100755 client/tests/kvm/deps/heartbeat_slu.py
  create mode 100644 client/virt/tests/softlockup.py

diff --git a/client/tests/kvm/deps/heartbeat_slu.py b/client/tests/kvm/deps/heartbeat_slu.py
new file mode 100755
index 0000000..697bbbf
--- /dev/null
+++ b/client/tests/kvm/deps/heartbeat_slu.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+
+"""
+Heartbeat server/client to detect soft lockups
+"""
+
+import socket, os, sys, time, getopt
+
+def daemonize(output_file):
+    """
+    Fork to the background; send stdout/stderr to output_file (or /dev/null).
+    """
+    try:
+        pid = os.fork()
+    except OSError, e:
+        raise Exception, "error %d: %s" % (e.errno, e.strerror)
+    if pid:
+        # Parent process: exit right away, only the child carries on.
+        os._exit(0)
+    os.umask(0)
+    os.setsid()
+    sys.stdout.flush()
+    sys.stderr.flush()
+    if output_file:
+        output_handle = file(output_file, 'a+', 0)
+        # autoflush stdout/stderr
+        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+        sys.stderr = os.fdopen(sys.stderr.fileno(), 'w', 0)
+    else:
+        output_handle = file('/dev/null', 'a+')
+    stdin_handle = open('/dev/null', 'r')
+    os.dup2(output_handle.fileno(), sys.stdout.fileno())
+    os.dup2(output_handle.fileno(), sys.stderr.fileno())
+    os.dup2(stdin_handle.fileno(), sys.stdin.fileno())
+
+def recv_all(sock):
+    # Read from sock until the peer closes it; return everything received.
+    chunks = []
+    data = sock.recv(1024)
+    while data:
+        chunks.append(data)
+        data = sock.recv(1024)
+    return ''.join(chunks)
+
+def run_server(host, port, daemon, file, queue_size, threshold, drift):
+    """Serve forever: check each heartbeat received, print it if verbose."""
+    if daemon:
+        daemonize(output_file=file)
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind((host, port))
+    sock.listen(queue_size)
+    prev_check_timestamp = float(time.time())
+    while 1:
+        c_sock, c_addr = sock.accept()
+        heartbeat = recv_all(c_sock)
+        c_sock.close()
+        local_timestamp = float(time.time())
+        drift_info = check_heartbeat(heartbeat, local_timestamp, threshold, drift)
+        # NOTE: this doesn't work if the only client is the one that timed
+        # out, but anything more complete would require another thread and
+        # a lock for client_prev_timestamp.
+        if local_timestamp - prev_check_timestamp > threshold * 2.0:
+            check_for_timeouts(threshold, drift)
+            prev_check_timestamp = local_timestamp
+        if verbose and drift:
+            print "%.2f: %s (%s)" % (local_timestamp, heartbeat, drift_info)
+        elif verbose:
+            print "%.2f: %s" % (local_timestamp, heartbeat)
+
+def run_client(host, port, daemon, file, interval):
+    """Send a numbered heartbeat to host:port every interval seconds."""
+    if daemon:
+        daemonize(output_file=file)
+    seq = 0
+    while True:
+        seq += 1
+        try:
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect((host, port))
+            heartbeat = get_heartbeat(seq)
+            sock.sendall(heartbeat)
+            sock.close()
+            if verbose:
+                print heartbeat
+        except socket.error, (value, message):
+            print "%.2f: ERROR, %d - %s" % (float(time.time()), value, message)
+        time.sleep(interval)
+
+def get_heartbeat(seq=1):  # message: "<hostname> <6-digit seq> <send time>"
+    return "%s %06d %.2f" % (hostname, seq, time.time())
+
+def check_heartbeat(heartbeat, local_timestamp, threshold, check_drift):
+    """Record a heartbeat, alert if it came late; return drift text or None."""
+    try:
+        hostname, seq, timestamp = heartbeat.split()
+        timestamp = float(timestamp)
+    except ValueError:
+        print "%.2f: ERROR, bad heartbeat %r" % (local_timestamp, heartbeat)
+        return None
+    last_seen = client_prev_timestamp.get(hostname, local_timestamp)
+    if local_timestamp - last_seen > threshold:
+        print "%.2f: ALERT, SLU detected on host %s, delta %ds" \
+            % (float(time.time()), hostname, local_timestamp - last_seen)
+    client_prev_timestamp[hostname] = local_timestamp
+    if check_drift:
+        skew = timestamp - local_timestamp
+        drift = skew - client_clock_offset.setdefault(hostname, skew)
+        drift_delta = drift - client_prev_drift.get(hostname, 0)
+        client_prev_drift[hostname] = drift
+        return "drift %+4.2f (%+4.2f)" % (drift, drift_delta)
+
+def check_for_timeouts(threshold, check_drift):
+    """Alert about, then forget, hosts silent for more than 2 * threshold."""
+    now = float(time.time())
+    # Iterate over a copy of the keys, entries are deleted inside the loop.
+    for hostname in list(client_prev_timestamp):
+        silence = now - client_prev_timestamp[hostname]
+        if silence > threshold * 2:
+            print "%.2f: ALERT, SLU detected on host %s, no heartbeat for %ds" \
+                % (now, hostname, silence)
+            del client_prev_timestamp[hostname]
+            if check_drift:
+                del client_clock_offset[hostname]
+                del client_prev_drift[hostname]
+
+def usage():  # print the command line help for both server and client modes
+    print """
+Usage:
+
+    heartbeat_slu.py --server --address <bind_address> --port <bind_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--threshold <heartbeat threshold>] [--check-drift]
+
+    heartbeat_slu.py --client --address <server_address> --port <server_port>
+                     [--file <output_file>] [--no-daemon] [--verbose]
+                     [--interval <heartbeat interval in seconds>]
+"""
+
+# host information and global data
+hostname = socket.gethostname()
+client_prev_timestamp = {}
+client_clock_offset = {}
+client_prev_drift = {}
+
+# default param values
+host_port = 9001
+host_address = ''
+interval = 1 # seconds between heartbeats
+threshold = 10 # seconds late till alert
+is_server = False
+is_daemon = True
+file_server = "/tmp/heartbeat_server.out"
+file_client = "/tmp/heartbeat_client.out"
+file_selected = None
+queue_size = 5
+verbose = False
+check_drift = False
+
+# process cmdline opts: -v -h -s -c -d are flags, -f -p -a -i -t take a value
+try:
+    opts, args = getopt.getopt(sys.argv[1:], "vhscdf:p:a:i:t:", [
+                    "server", "client", "no-daemon", "address=", "port=",
+                    "file=", "interval=", "threshold=", "verbose",
+                    "check-drift", "help"])
+except getopt.GetoptError, e:
+    print "error: %s" % str(e)
+    usage()
+    sys.exit(1)
+
+for param, value in opts:
+    if param in ["-p", "--port"]:
+        host_port = int(value)
+    elif param in ["-a", "--address"]:
+        host_address = value
+    elif param in ["-s", "--server"]:
+        is_server = True
+    elif param in ["-c", "--client"]:
+        is_server = False
+    elif param in ["--no-daemon"]:
+        is_daemon = False
+    elif param in ["-f", "--file"]:
+        file_selected = value
+    elif param in ["-i", "--interval"]:
+        interval = int(value)
+    elif param in ["-t", "--threshold"]:
+        threshold = int(value)
+    elif param in ["-d", "--check-drift"]:
+        check_drift = True
+    elif param in ["-v", "--verbose"]:
+        verbose = True
+    elif param in ["-h", "--help"]:
+        usage()
+        sys.exit(0)
+    else:
+        print "error: unrecognized option: %s" % param
+        usage()
+        sys.exit(1)
+
+# run until we're terminated
+if is_server:
+    file_server = file_selected or file_server
+    run_server(host_address, host_port, is_daemon, file_server, queue_size, threshold, check_drift)
+else:
+    file_client = file_selected or file_client
+    run_client(host_address, host_port, is_daemon, file_client, interval)
diff --git a/client/tests/kvm/tests_base.cfg.sample b/client/tests/kvm/tests_base.cfg.sample
index 65880d8..e9e41f9 100644
--- a/client/tests/kvm/tests_base.cfg.sample
+++ b/client/tests/kvm/tests_base.cfg.sample
@@ -420,6 +420,24 @@ variants:
          type = smbios_table
          start_vm = no

+    - softlockup: install setup unattended_install.cdrom
+        only Linux
+        type = softlockup
+        softlockup_files = stress-1.0.4.tar.gz
+        stress_setup_cmd = "cd %s&&  tar xvf stress-1.0.4.tar.gz&&  cd stress-1.0.4&&  ./configure&&  make&&  cd src"
+        server_setup_cmd = "%s/heartbeat_slu.py --server --threshold %s --file %s --port %s --verbose --check-drift"
+        client_setup_cmd = "%s/heartbeat_slu.py --client --address %s --file %s --port %s --interval 1"
+        stress_cmd  = "cd %s&&  cd stress-1.0.4&&  cd src&&  nohup ./stress -c %s>  /dev/null 2>&1&"
+        kill_monitor_cmd = "ps aux | grep heart | grep -v grep | awk '{print$2}' | xargs kill -9>  /dev/null 2>&1"
+        kill_stress_cmd = "pkill -f stress>  /dev/null 2>&1"
+        drift_cmd = "tail -1 %s | awk '{print $7}'"
+        monitor_log_file_server = /tmp/heartbeat_server.log
+        monitor_log_file_client = /tmp/heartbeat_client.log
+        monitor_port = 13330
+        stress_threshold = 10
+        # test_length is in hours (fractions allowed), e.g. 12, 18, 24 or 48
+        test_length = 0.10
+
      - stress_boot: install setup image_copy unattended_install.cdrom
          type = stress_boot
          max_vms = 5
diff --git a/client/virt/tests/softlockup.py b/client/virt/tests/softlockup.py
new file mode 100644
index 0000000..d946965
--- /dev/null
+++ b/client/virt/tests/softlockup.py
@@ -0,0 +1,147 @@
+import logging, os, socket, time
+from autotest_lib.client.bin import utils
+
+
+def run_softlockup(test, params, env):
+    """
+    soft lockup/drift test with stress.
+
+    1) Boot up a VM.
+    2) Build stress on host and guest.
+    3) Run heartbeat with the given options: server on host, client on guest.
+    4) Run for a relatively long time length. ex: 12, 18 or 24 hours.
+    5) Output the test result and observe drift.
+
+    @param test: KVM test object.
+    @param params: Dictionary with the test parameters.
+    @param env: Dictionary with test environment.
+    """
+    from autotest_lib.client.common_lib import error  # for error.TestError
+    stress_setup_cmd = params.get("stress_setup_cmd")
+    stress_cmd = params.get("stress_cmd")
+    server_setup_cmd = params.get("server_setup_cmd")
+    drift_cmd = params.get("drift_cmd")
+    kill_stress_cmd = params.get("kill_stress_cmd")
+    kill_monitor_cmd = params.get("kill_monitor_cmd")
+
+    threshold = int(params.get("stress_threshold"))
+    monitor_log_file_server = params.get("monitor_log_file_server")
+    monitor_log_file_client = params.get("monitor_log_file_client")
+    test_length = int(3600 * float(params.get("test_length")))
+    monitor_port = int(params.get("monitor_port"))
+
+    vm = env.get_vm(params["main_vm"])
+    login_timeout = int(params.get("login_timeout", 360))
+    stress_dir = os.path.join(os.environ['AUTODIR'], "tests/stress")
+    monitor_dir = os.path.join(test.bindir, 'deps')
+
+    def _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on guest")
+        try:
+            session.cmd(kill_stress_cmd)
+        except Exception:
+            pass  # best effort: there may be nothing to kill
+        try:
+            session.cmd(kill_monitor_cmd)
+        except Exception:
+            pass  # best effort: there may be nothing to kill
+
+    def _kill_host_programs(kill_stress_cmd, kill_monitor_cmd):
+        logging.info("Kill stress and monitor on host")
+        utils.run(kill_stress_cmd, ignore_status=True)
+        utils.run(kill_monitor_cmd, ignore_status=True)
+
+    def host():
+        logging.info("Setup monitor server on host")
+        # Kill previous instances of the host load programs, if any
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+        # Cleanup previous log instances
+        if os.path.isfile(monitor_log_file_server):
+            os.remove(monitor_log_file_server)
+        # Opening firewall ports on host
+        utils.run("iptables -F", ignore_status=True)
+
+        # Run heartbeat on host
+        utils.run(server_setup_cmd % (monitor_dir, threshold,
+                                      monitor_log_file_server, monitor_port))
+
+        logging.info("Build stress on host")
+        # Uncompress and build stress on host
+        utils.run(stress_setup_cmd % stress_dir)
+
+        logging.info("Run stress on host")
+        # stress_threads = 2 * n_cpus
+        threads_host = 2 * utils.count_cpus()
+        # Run stress test on host
+        utils.run(stress_cmd % (stress_dir, threads_host))
+
+    def guest():
+        try:
+            host_ip = socket.gethostbyname(socket.gethostname())
+        except socket.error:
+            try:
+                # Hackish, but works well on stand alone (laptop) setups
+                # with access to the internet. If this fails, well, then
+                # not much else can be done...
+                s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+                s.connect(("redhat.com", 80))
+                host_ip = s.getsockname()[0]
+                s.close()
+            except socket.error, (value, e):
+                raise error.TestError("Could not determine host IP: %d %s" %
+                                      (value, e))
+
+        # Now, starting the guest
+        vm.verify_alive()
+        session = vm.wait_for_login(timeout=login_timeout)
+
+        # Kill previous instances of the load programs, if any
+        _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+        # Clean up previous log instances
+        session.cmd("rm -f %s" % monitor_log_file_client)
+
+        # Opening firewall ports on guest
+        try:
+            session.cmd("iptables -F")
+        except Exception:
+            pass  # best effort: the guest may have no iptables
+
+        # Get required files and copy them from host to guest
+        monitor_path = os.path.join(test.bindir, 'deps', 'heartbeat_slu.py')
+        stress_path = os.path.join(os.environ['AUTODIR'], "tests", "stress",
+                                   "stress-1.0.4.tar.gz")
+        vm.copy_files_to(monitor_path, "/tmp")
+        vm.copy_files_to(stress_path, "/tmp")
+
+        logging.info("Setup monitor client on guest")
+        # Start heartbeat on guest; order is: script dir, address, file, port
+        session.cmd(params.get("client_setup_cmd") %
+                    ("/tmp", host_ip, monitor_log_file_client, monitor_port))
+
+        logging.info("Build stress on guest")
+        # Uncompress and build stress on guest
+        session.cmd(stress_setup_cmd % "/tmp", timeout=200)
+
+        logging.info("Run stress on guest")
+        # stress_threads = 2 * n_vcpus
+        threads_guest = 2 * int(params.get("smp", 1))
+        # Run stress test on guest
+        session.cmd(stress_cmd % ("/tmp", threads_guest))
+
+        # Wait and report
+        logging.debug("Wait for %d s", test_length)
+        time.sleep(test_length)
+
+        # Kill instances of the load programs on both guest and host
+        _kill_guest_programs(session, kill_stress_cmd, kill_monitor_cmd)
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)
+
+        # Collect drift
+        drift = utils.system_output(drift_cmd % monitor_log_file_server)
+        logging.info("Drift noticed: %s", drift)
+
+    host()
+    try:
+        guest()
+    finally:  # never leave host stress/monitor running after a failure
+        _kill_host_programs(kill_stress_cmd, kill_monitor_cmd)

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html