Re: [KVM-AUTOTEST][PATCH] timedrift support

Lucas Meneghel Rodrigues <mrodrigu@xxxxxxxxxx> · Mon, 11 May 2009 09:59:49 -0300

On Mon, 2009-05-11 at 18:40 +0800, Bear Yang wrote:
> Hello.
> I have modified my script according Marcelo's suggestion. and resubmit 
> my script to you all. :)
> 
> Marcelo, Seems except you, no one care my script. I  still want to say 
> any suggestion  on my script would be greatly appreciated.
> 
> Thanks.
> 
> Bear
> 

Hi Bear, sorry, I had some hectic days here so I still haven't reviewed
your patch. 

As a general comment, I realize that on several occasions we are using
os.system() to execute commands on the host, when we would usually
prefer to use the utils.system() or utils.run() API, since it already
throws an exception when exit code != 0 (you can always set
ignore_status=True to avoid this behavior if needed) and we are working
on better handling of stdout and stderr upstream.

My comments follow:

diff -urN kvm_runtest_2.bak/cpu_stress.c kvm_runtest_2/cpu_stress.c

--- kvm_runtest_2.bak/cpu_stress.c	1969-12-31 19:00:00.000000000 -0500
+++ kvm_runtest_2/cpu_stress.c	2009-05-05 22:35:34.000000000 -0400
@@ -0,0 +1,61 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <math.h>
+#include <unistd.h>
+
+#define MAX_CPUS 256
+#define BUFFSIZE 1024
+
+
+void worker_child(int cpu)
+{
+	int cur_freq;
+	int min_freq;
+	int max_freq;
+	int last_freq;
+	cpu_set_t mask;
+	int i;
+	double x;
+        int d = 0;
+	/*
+	 * bind this thread to the specified cpu 
+	 */
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	sched_setaffinity(0, CPU_SETSIZE, &mask);
+
+	while (d++ != 500000) {
+			for (i=0; i<100000; i++)
+				x = sqrt(x);
+	}
+
+	_exit(0);
+
+}
+
+
+main() {
+	cpu_set_t mask;
+	int i;
+	int code;
+
+	if (sched_getaffinity(0, CPU_SETSIZE, &mask) < 0){
+		perror ("sched_getaffinity");
+		exit(1);
+	}
+
+	for (i=0; i<CPU_SETSIZE; i++)
+		if (CPU_ISSET(i, &mask)){
+			printf ("CPU%d\n",i);
+			if (fork() == 0)
+				worker_child(i);
+		}
+
+
+	wait(&code);
+	exit (WEXITSTATUS(code));
+}

I believe we might want to use a more complete stress system that can do IO stress and put 'memory pressure' on the host system.
When I need to cause stress on a host, what I end up doing is hacking the stress.c program from LTP, because it can do memory and IO stress as well.
I will send you the stress.c program in a separate e-mail.

diff -urN kvm_runtest_2.bak/kvm_runtest_2.py kvm_runtest_2/kvm_runtest_2.py
--- kvm_runtest_2.bak/kvm_runtest_2.py	2009-04-29 06:17:29.000000000 -0400
+++ kvm_runtest_2/kvm_runtest_2.py	2009-04-29 08:06:32.000000000 -0400
@@ -36,6 +36,8 @@
                 "autotest":     test_routine("kvm_tests",           "run_autotest"),
                 "kvm_install":  test_routine("kvm_install",         "run_kvm_install"),
                 "linux_s3":     test_routine("kvm_tests",           "run_linux_s3"),
+                "ntp_server_setup": test_routine("kvm_tests",       "run_ntp_server_setup"),
+                "timedrift":    test_routine("kvm_tests",           "run_timedrift"),
                 }
 
         # Make it possible to import modules from the test's bindir
diff -urN kvm_runtest_2.bak/kvm_tests.cfg.sample kvm_runtest_2/kvm_tests.cfg.sample
--- kvm_runtest_2.bak/kvm_tests.cfg.sample	2009-04-29 06:17:29.000000000 -0400
+++ kvm_runtest_2/kvm_tests.cfg.sample	2009-04-29 08:09:36.000000000 -0400
@@ -81,6 +81,10 @@
     - linux_s3:      install setup
         type = linux_s3
 
+    - ntp_server_setup:
+        type = ntp_server_setup
+    - timedrift:      ntp_server_setup
+        type = timedrift
 # NICs
 variants:
     - @rtl8139:
diff -urN kvm_runtest_2.bak/kvm_tests.py kvm_runtest_2/kvm_tests.py
--- kvm_runtest_2.bak/kvm_tests.py	2009-04-29 06:17:29.000000000 -0400
+++ kvm_runtest_2/kvm_tests.py	2009-05-11 06:00:32.000000000 -0400
@@ -394,3 +394,247 @@
     kvm_log.info("VM resumed after S3")
 
     session.close()
+
+def run_ntp_server_setup(test, params, env):
+    
+    """NTP server configuration and related network file modification
+    """
+
+    kvm_log.info("stop the iptables service if it is running for timedrift testing")
+
+    if not os.system("/etc/init.d/iptables status"):
+        os.system("/etc/init.d/iptables stop")
+
+    # prevent dhcp client modify the ntp.conf
+    kvm_log.info("prevent dhcp client modify the ntp.conf")
+
+    config_file = "/etc/sysconfig/network"
+    network_file = open("/etc/sysconfig/network", "a")
+    string = "PEERNTP=no"
+
+    if os.system("grep %s %s" % (string, config_file)):
+        network_file.writelines(str(string)+'\n')
+    
+    network_file.close()
+  
+    # stop the ntp service if it is running
+    kvm_log.info("stop ntp service if it is running")
+
+    if not os.system("/etc/init.d/ntpd status"):
+        os.system("/etc/init.d/ntpd stop")
+        ntp_running = True
+
+    kvm_log.info("start ntp server on host with the custom config file.")
+
+    ntp_cmd = '''
+        echo "restrict default kod nomodify notrap nopeer noquery" >> /etc/timedrift.ntp.conf;\
+        echo "restrict 127.0.0.1" >> /etc/timedrift.ntp.conf;\
+        echo "driftfile /var/lib/ntp/drift" >> /etc/timedrift.ntp.conf;\
+        echo "keys /etc/ntp/keys" >> /etc/timedrift.ntp.conf;\
+        echo "server 127.127.1.0" >> /etc/timedrift.ntp.conf;\
+        echo "fudge 127.127.1.0 stratum 1" >> /etc/timedrift.ntp.conf;\
+        ntpd -c /etc/timedrift.ntp.conf;
+        '''
+    if os.system(ntp_cmd):
+        raise error.TestFail, "NTP server has not starting correct..."

Here you could have used the regular utils.system API instead of os.system, since it integrates better with the autotest infrastructure.
Instead of the if clause we'd put a try/except block. Minor nitpicking: "NTP server has not started correctly..."

+    #kvm_log.info("sync system clock to BIOS")
+    #os.system("/sbin/hwclock --systohc")
+   
+def run_timedrift(test, params, env):
+    """judge wether the guest clock will encounter timedrift prblem or not. including three stages:

Typo, "whether"

+       1: try to sync the clock with host, if the offset value of guest clock is large than 1 sec.
+       2: running the cpu stress testing program<cpu_stress.c> on guest
+       3: then run analyze loop totally 20 times to determine if the clock on guest has time drift.
+    """
+    # variables using in timedrift testcase
+    cpu_stress_program = "cpu_stress.c"
+    remote_dir = '/root'
+
+    clock_resource_cmd = "cat /sys/devices/system/clocksource/clocksource0/current_clocksource"
+
+    pwd = os.path.join(os.environ['AUTODIR'],'tests/kvm_runtest_2')
+    cpu_stress_test = os.path.join(pwd, cpu_stress_program)
+    cpu_stress_cmdline = 'cd %s;gcc %s -lm;./a.out &' % (remote_dir, os.path.basename(cpu_stress_test))
+
+    cpu_stress_search_cmdline = "ps -ef|grep 'a.out'|grep -v grep"
+
+    hostname = os.environ.get("HOSTNAME")

Can't we use socket.gethostname() here instead of relying on environment variable values? 

+    if "localhost.localdomain" == hostname:
+        hostname = os.popen('hostname').read().split('\n')[0]
+        kvm_log.info("since get wrong hostname from python evnironment, then use the hostname get from system call(hostname).")
+
+    kvm_log.info("get host name :%s" % hostname)
+
+    # ntpdate info command and ntpdate sync command
+    ntpdate_info_cmd = "ntpdate -q %s" % hostname
+    ntpdate_sync_cmd = "ntpdate %s" % hostname
+
+    # get vm handle
+    vm = kvm_utils.env_get_vm(env,params.get("main_vm"))
+    if not vm:
+        raise error.TestError, "VM object not found in environment"
+    if not vm.is_alive():
+        raise error.TestError, "VM seems to be dead; Test requires a living VM"

I am seeing this piece of code to get the VM handle in several tests; I am starting to think we should factor this out into a utility function...

+    kvm_log.info("Waiting for guest to be up...")
+
+    pxssh = kvm_utils.wait_for(vm.ssh_login, 240, 0, 2)
+    if not pxssh:
+        raise error.TestFail, "Could not log into guest"
+
+    kvm_log.info("Logged into guest IN run_timedrift function.")
+
+    # clock resource get from host and guest
+    host_clock_resource = os.popen(clock_resource_cmd).read().split('\n')[0]
+    kvm_log.info("the clock resource on host is :%s" % host_clock_resource)
+
+    pxssh.sendline(clock_resource_cmd)
+    s, o = pxssh.read_up_to_prompt()
+    guest_clock_resource = o.splitlines()[-2]
+    kvm_log.info("the clock resource on guest is :%s" % guest_clock_resource)
+
+    if host_clock_resource != guest_clock_resource:
+        #raise error.TestFail, "Host and Guest using different clock resource"
+        kvm_log.info("Host and Guest using different clock resource,Let's moving on.")
+    else:
+        kvm_log.info("Host and Guest using same clock resource,Let's moving on.")

Little mistake here, "Let's move on."

+    # helper function: 
+    # ntpdate_op: a entire process to get ntpdate command line result from guest.
+    # time_drift_or_not: get the numeric handing by regular expression and make timedrift calulation.
+    def ntpdate_op(command):
+        output = []
+        try:
+            pxssh = kvm_utils.wait_for(vm.ssh_login, 240, 0, 2)
+            if not pxssh:
+                raise error.TestFail, "Could not log into guest"
+
+            kvm_log.info("Logged in:(ntpdate_op)")
+
+            while True:
+                pxssh.sendline(command)
+                s, output = pxssh.read_up_to_prompt()
+                if "time server" in output:
+                    # output is a string contain the (ntpdate -q) infor on guest
+                    return True, output
+                else:
+                    continue
+        except:
+            pxssh.close()
+            return False, output
+        return False, output
+
+    def time_drift_or_not(output):
+        date_string = re.findall(r'offset [+-]?(.*) sec', output, re.M)
+        num = float(date_string[0])
+        if num >= 1:
+            kvm_log.info("guest clock has drifted in this scenario :%s %s" % (date_string, num))
+            return False
+        else:
+            kvm_log.info("guest clock running veracious in now stage :%s %s" % (date_string, num))
+            return True
+
+    # send the command and get the ouput from guest
+    # this loop will pick out several conditions need to be process
+    # Actually, we want to get the info match "time server", then script can analyzing it to
+    # determine if guest's clock need sync with host or not.
+    while True:
+        pxssh.sendline(ntpdate_info_cmd)
+        s, output = pxssh.read_up_to_prompt()
+        kvm_log.info("the ntpdate query info get from guest is below: \n%s" %output)
+        if ("no server suitable" not in output) and ("time server" not in output):
+            kvm_log.info("very creazying output got. let's try again")
+            continue
+        elif "no server suitable" in output:
+            kvm_log.info("seems NTP server is not ready for servicing")
+            time.sleep(30)
+            continue
+        elif "time server" in output:
+            # get the ntpdate info from guest
+            # kvm_log.info("Got the correct output for analyze. The output is below: \n%s" %output) 
+            break
+
+    kvm_log.info("get the ntpdate infomation from guest successfully :%s" % os.popen('date').read())
+
+    # judge the clock need to sync with host or not
+    while True:
+        date_string = re.findall(r'offset [+-]?(.*) sec', output, re.M)
+        num = float(date_string[0])
+        if num >= 1:
+            kvm_log.info("guest need sync with the server: %s" % hostname)
+            s, output = ntpdate_op(ntpdate_sync_cmd)
+            if s:
+                continue
+        else:
+            #pxssh.sendline("hwclock --systohc")
+            #kvm_log.info("guest clock sync prcdure is finished. then sync the guest clock to guest bios.")
+
+            #pxssh.sendline("hwclock --show")
+            #s, o = pxssh.read_up_to_prompt()
+            #kvm_log.info("the date infomation get from guest bios is :\n%s" % o)
+
+            pxssh.sendline(ntpdate_info_cmd)
+            s, o = pxssh.read_up_to_prompt()
+            kvm_log.info("guest clock after sync with host is :\n%s" % o)
+
+            break
+
+    kvm_log.info("Timedrift Preparation *Finished* at last :%s" % os.popen('date').read())
+
+    if not vm.scp_to_remote(cpu_stress_test, remote_dir):
+        raise error.TestError, "Could not copy program to guest."
+
+    pxssh.sendline(ntpdate_info_cmd)
+    s, o = pxssh.read_up_to_prompt()
+    kvm_log.info("the ntpdate query from host *BEFORE* running the cpu stress program.\n%s" % o)
+    pxssh.sendline(cpu_stress_cmdline)
+    s, o = pxssh.read_up_to_prompt()
+    kvm_log.info("running command line on guest and sleeping for 1200 secs.\n%s" % o)
+
+    time.sleep(1200)
+
+    while True:
+        if pxssh.get_command_status(cpu_stress_search_cmdline):
+            #(s, o) = pxssh.get_command_status_output(cpu_stress_search_cmdline)
+            #print "s is :%s" % s
+            #print "o is :%s" % o
+            #print "--------------------------------------------"
+            #aaa = pxssh.get_command_status(cpu_stress_search_cmdline)
+            #print "aaa is :%s" % aaa
+            #print "--------------------------------------------"
+
+            print "stress testing process has been completed and quit."
+            break
+        else:
+            print "stress testing on CPU has not finished yet.waiting for next detect after sleep 60 secs."
+            time.sleep(60)
+            continue
+
+    pxssh.sendline(ntpdate_info_cmd)
+    s, o = pxssh.read_up_to_prompt()
+    kvm_log.info("the ntpdate query from host *AFTER* running the cpu stress program.\n%s" % o)
+
+    pxssh.close()
+
+    # Sleep for analyze...
+    kvm_log.info("sleeping(180 secs) Starting... :%s" % os.popen('date').read())
+    time.sleep(180)
+    kvm_log.info("wakeup to get the analyzing... :%s" % os.popen('date').read())
+    count = 0
+    for i in range(1, 21):
+        kvm_log.info("this is %s time to get clock info from guest." % i)
+        s, o = ntpdate_op(ntpdate_info_cmd)
+        
+        if not s:
+            raise error.TestFail, "Guest seems hang or ssh service based on guest has been crash down"
+        
+        if not time_drift_or_not(o):
+            count += 1
+
+        if count == 5:
+            raise error.TestFail, "TimeDrift testing Abort because guest's clock has drift too much"
+
+        kvm_log.info("*********************** Sleep 30 seconds for next loop *************************")
+        time.sleep(60)
+

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html