I do have one major concern; other than that, it looks pretty good to me. I think Michael's suggestions are generally reasonable (although I'm fine with the code as it is as well). But if you do use his suggested get_results_dir_list please use "pid in pid_dir_dict" and not "pid_dir_dict.has_key(pid)". -- John On Tue, Sep 8, 2009 at 6:53 AM, Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> wrote: > This patch adds a system to watch user space segmentation > faults, writing core dumps and some degree of core dump > analysis report. We believe that such a system will be > beneficial for autotest as a whole, since the ability to > get core dumps and dump analysis for each app crashing > during an autotest execution can help test engineers with > richer debugging information. > > The system is comprised by 2 parts: > > * Modifications on test code that enable core dumps > generation, register a core handler script in the kernel > and check by generated core files at the end of each > test. > > * A core handler script that is going to write the > core on each test debug dir in a convenient way, with > a report that currently is comprised by the process that > died and a gdb stacktrace of the process. As the system > gets in shape, we could add more scripts that can do > fancier stuff (such as handlers that use frysk to get > more info such as memory maps, provided that we have > frysk installed in the machine). > > This is the proof of concept of the system. I am sending it > to the mailing list on this early stage so I can get > feedback on the feature. The system passes my basic > tests: > > * Run a simple long test, such as the kvm test, and > then crash an application while the test is running. I > get reports generated on test.debugdir > > * Run a slightly more complex control file, with 3 parallel > bonnie instances at once and crash an application while the > test is running. I get reports generated on all > test.debugdirs. > > 3rd try: > * Explicitely enable core dumps using the resource module > * Fixed a bug on the crash detection code, and factored > it into a utility function. > > I believe we are good to go now. > > Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> > --- > client/common_lib/test.py | 66 +++++++++++++- > client/tools/crash_handler.py | 202 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 266 insertions(+), 2 deletions(-) > create mode 100755 client/tools/crash_handler.py > > diff --git a/client/common_lib/test.py b/client/common_lib/test.py > index 362c960..65b78a3 100644 > --- a/client/common_lib/test.py > +++ b/client/common_lib/test.py > @@ -17,7 +17,7 @@ > # tmpdir eg. 
tmp/<tempname>_<testname.tag> > > import fcntl, os, re, sys, shutil, tarfile, tempfile, time, traceback > -import warnings, logging > +import warnings, logging, glob, resource > > from autotest_lib.client.common_lib import error > from autotest_lib.client.bin import utils > @@ -31,7 +31,6 @@ class base_test: > self.job = job > self.pkgmgr = job.pkgmgr > self.autodir = job.autodir > - > self.outputdir = outputdir > self.tagged_testname = os.path.basename(self.outputdir) > self.resultsdir = os.path.join(self.outputdir, 'results') > @@ -40,6 +39,7 @@ class base_test: > os.mkdir(self.profdir) > self.debugdir = os.path.join(self.outputdir, 'debug') > os.mkdir(self.debugdir) > + self.configure_crash_handler() > self.bindir = bindir > if hasattr(job, 'libdir'): > self.libdir = job.libdir > @@ -54,6 +54,66 @@ class base_test: > self.after_iteration_hooks = [] > > > + def configure_crash_handler(self): > + """ > + Configure the crash handler by: > + * Setting up core size to unlimited > + * Putting an appropriate crash handler on /proc/sys/kernel/core_pattern > + * Create files that the crash handler will use to figure which tests > + are active at a given moment > + > + The crash handler will pick up the core file and write it to > + self.debugdir, and perform analysis on it to generate a report. The > + program also outputs some results to syslog. > + > + If multiple tests are running, an attempt to verify if we still have > + the old PID on the system process table to determine whether it is a > + parent of the current test execution. If we can't determine it, the > + core file and the report file will be copied to all test debug dirs. > + """ > + self.pattern_file = '/proc/sys/kernel/core_pattern' > + try: > + # Enable core dumps > + resource.setrlimit(resource.RLIMIT_CORE, (-1, -1)) > + # Trying to backup core pattern and register our script > + self.core_pattern_backup = open(self.pattern_file, 'r').read() > + pattern_file = open(self.pattern_file, 'w') > + tools_dir = os.path.join(self.autodir, 'tools') > + crash_handler_path = os.path.join(tools_dir, 'crash_handler.py') > + pattern_file.write('|' + crash_handler_path + ' %p %t %u %s %h %e') > + # Writing the files that the crash handler is going to use > + self.debugdir_tmp_file = ('/tmp/autotest_results_dir.%s' % > + os.getpid()) > + utils.open_write_close(self.debugdir_tmp_file, self.debugdir + "\n") > + except Exception, e: > + self.crash_handling_enabled = False > + logging.error('Crash handling system disabled: %s' % e) > + else: > + self.crash_handling_enabled = True > + logging.debug('Crash handling system enabled.') > + > + > + def crash_handler_report(self): > + """ > + If core dumps are found on the debugdir after the execution of the > + test, let the user know. 
> + """ > + if self.crash_handling_enabled: > + core_dirs = glob.glob('%s/crash.*' % self.debugdir) > + if core_dirs: > + logging.warning('Programs crashed during test execution:') > + for dir in core_dirs: > + logging.warning('Please verify %s for more info', dir) > + # Remove the debugdir info file > + os.unlink(self.debugdir_tmp_file) > + # Restore the core pattern backup > + try: > + utils.open_write_close(self.pattern_file, > + self.core_pattern_backup) > + except EnvironmentError: > + pass > + > + > def assert_(self, expr, msg='Assertion failed.'): > if not expr: > raise error.TestError(msg) > @@ -377,6 +437,7 @@ class base_test: > traceback.print_exc() > print 'Now raising the earlier %s error' % exc_info[0] > finally: > + self.crash_handler_report() If this is going to be put in front of the logging.restore call, it probably needs to be wrapped in its own try-finally (with the logging.restore in the finally). Otherwise if it fails for any reason it's going to mess up all subsequent logging. > self.job.logging.restore() > try: > raise exc_info[0], exc_info[1], exc_info[2] > @@ -389,6 +450,7 @@ class base_test: > if run_cleanup: > _cherry_pick_call(self.cleanup, *args, **dargs) > finally: > + self.crash_handler_report() Same comment as above here. > self.job.logging.restore() > except error.AutotestError: > if self.network_destabilizing: > diff --git a/client/tools/crash_handler.py b/client/tools/crash_handler.py > new file mode 100755 > index 0000000..e281eb5 > --- /dev/null > +++ b/client/tools/crash_handler.py > @@ -0,0 +1,202 @@ > +#!/usr/bin/python > +""" > +Simple crash handling application for autotest > + > +@copyright Red Hat Inc 2009 > +@author Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> > +""" > +import sys, os, commands, glob, tempfile, shutil, syslog > + > + > +def get_parent_pid(pid): > + """ > + Returns the parent PID for a given PID, converted to an integer. > + > + @param pid: Process ID. > + """ > + try: > + stat_file_contents = open('/proc/%s/stat' % pid, 'r').readline() > + ppid = int(stat_file_contents.split(" ")[3]) > + except: > + # It is not possible to determine the parent because the process > + # already left the process table. > + ppid = 1 > + > + return ppid > + > + > +def pid_descends_from(pid_a, pid_b): > + """ > + Check whether pid_a descends from pid_b. > + > + @param pid_a: Process ID. > + @param pid_b: Process ID. > + """ > + pid_a = int(pid_a) > + pid_b = int(pid_b) > + current_pid = pid_a > + while current_pid > 1: > + if current_pid == pid_b: > + syslog.syslog(syslog.LOG_INFO, > + "PID %s descends from PID %s!" % (pid_a, pid_b)) > + return True > + else: > + current_pid = get_parent_pid(current_pid) > + syslog.syslog(syslog.LOG_INFO, > + "PID %s does not descend from PID %s" % (pid_a, pid_b)) > + return False > + > + > +def write_to_file(file_path, contents): > + """ > + Write contents to a given file path specified. If not specified, the file > + will be created. > + > + @param file_path: Path to a given file. > + @param contents: File contents. > + """ > + file_object = open(file_path, 'w') > + file_object.write(contents) > + file_object.close() > + > + > +def get_results_dir_list(pid, core_dir_basename): > + """ > + Get all valid output directories for the core file and the report. It works > + by inspecting files created by each test on /tmp and verifying if the > + PID of the process that crashed is a child or grandchild of the autotest > + test process. 
If it can't find any relationship (maybe a daemon that died > + during a test execution), it will write the core file to the debug dirs > + of all tests currently being executed. If there are no active autotest > + tests at a particular moment, it will return a list with ['/tmp']. > + > + @param pid: PID for the process that generated the core > + @param core_dir_basename: Basename for the directory that will hold both > + the core dump and the crash report. > + """ > + # Get all active test debugdir path files present > + debugdir_files = glob.glob("/tmp/autotest_results_dir.*") > + if debugdir_files: > + pid_dir_dict = {} > + for debugdir_file in debugdir_files: > + a_pid = debugdir_file.split('.')[-1] > + results_dir = open(debugdir_file, 'r').read().strip() > + pid_dir_dict[a_pid] = os.path.join(results_dir, core_dir_basename) > + > + results_dir_list = [] > + found_relation = False > + for a_pid, a_path in pid_dir_dict.iteritems(): > + if pid_descends_from(pid, a_pid): > + results_dir_list.append(a_path) > + found_relation = True > + > + # If we could not find any relations between the pids in the list with > + # the process that crashed, we can't tell for sure which test spawned > + # the process (maybe it is a daemon and started even before autotest > + # started), so we will have to output the core file to all active test > + # directories. > + if not found_relation: > + return pid_dir_dict.values() > + else: > + return results_dir_list > + > + else: > + path_inactive_autotest = os.path.join('/tmp', core_dir_basename) > + return [path_inactive_autotest] > + > + > +def get_info_from_core(path): > + """ > + Reads a core file and extracts a dictionary with useful core information. > + Right now, the only information extracted is the full executable name. > + > + @param path: Path to core file. > + """ > + # Here we are getting the executable full path in a very inelegant way :( > + # Since the 'right' solution for it is to make a library to get information > + # from core dump files, properly written, I'll leave this as it is for now. 
> + full_exe_path = commands.getoutput('strings %s | grep "_="' % > + path).strip("_=") > + if full_exe_path.startswith("./"): > + pwd = commands.getoutput('strings %s | grep "^PWD="' % > + path).strip("PWD=") > + full_exe_path = os.path.join(pwd, full_exe_path.strip("./")) > + > + return {'core_file': path, 'full_exe_path': full_exe_path} > + > + > +if __name__ == "__main__": > + syslog.openlog('AutotestCrashHandler', 0, syslog.LOG_DAEMON) > + (crashed_pid, time, uid, signal, hostname, exe) = sys.argv[1:] > + core_name = 'core' > + report_name = 'report' > + core_dir_name = 'crash.%s.%s' % (exe, crashed_pid) > + core_tmp_dir = tempfile.mkdtemp(prefix='core_', dir='/tmp') > + core_tmp_path = os.path.join(core_tmp_dir, core_name) > + gdb_command_path = os.path.join(core_tmp_dir, 'gdb_command') > + > + try: > + # Get the filtered results dir list > + current_results_dir_list = get_results_dir_list(crashed_pid, > + core_dir_name) > + > + # Write the core file to the appropriate directory > + # (we are piping it to this script) > + core_file = sys.stdin.read() > + write_to_file(core_tmp_path, core_file) > + > + # Write a command file for GDB > + gdb_command = 'bt full\n' > + write_to_file(gdb_command_path, gdb_command) > + > + # Get full command path > + exe_path = get_info_from_core(core_tmp_path)['full_exe_path'] > + > + # Take a backtrace from the running program > + gdb_cmd = 'gdb -e %s -c %s -x %s -n -batch -quiet' % (exe_path, > + core_tmp_path, > + gdb_command_path) > + backtrace = commands.getoutput(gdb_cmd) > + # Sanitize output before passing it to the report > + backtrace = backtrace.decode('utf-8', 'ignore') > + > + # Composing the format_dict > + format_dict = {} > + format_dict['program'] = exe_path > + format_dict['pid'] = crashed_pid > + format_dict['signal'] = signal > + format_dict['hostname'] = hostname > + format_dict['time'] = time > + format_dict['backtrace'] = backtrace > + > + report = """Autotest crash report > + > +Program: %(program)s > +PID: %(pid)s > +Signal: %(signal)s > +Hostname: %(hostname)s > +Time of the crash: %(time)s > +Program backtrace: > +%(backtrace)s > +""" % format_dict > + > + syslog.syslog(syslog.LOG_INFO, > + "Application %s, PID %s crashed" % > + (exe_path, crashed_pid)) > + > + # Now, for all results dir, let's create the directory if it doesn't > + # exist, and write the core file and the report to it. > + syslog.syslog(syslog.LOG_INFO, > + "Writing core files and reports to %s" % > + current_results_dir_list) > + for result_dir in current_results_dir_list: > + if not os.path.isdir(result_dir): > + os.makedirs(result_dir) > + core_path = os.path.join(result_dir, 'core') > + write_to_file(core_path, core_file) > + report_path = os.path.join(result_dir, 'report') > + write_to_file(report_path, report) > + > + finally: > + # Cleanup temporary directories > + shutil.rmtree(core_tmp_dir) > -- > 1.6.2.5 > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html
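P.S. To make the try/finally point above concrete, here is a minimal, self-contained sketch of the nesting I have in mind. The crash_handler_report() and restore_logging() functions below are just stand-ins for self.crash_handler_report() and self.job.logging.restore() from the patch; this only shows the intended control flow, not the actual base_test code:

    # Stand-in for self.crash_handler_report(); pretend it can fail
    # for any reason (missing file, core_pattern restore error, etc.).
    def crash_handler_report():
        raise RuntimeError("report failed")

    # Stand-in for self.job.logging.restore().
    def restore_logging():
        print("logging restored")

    def run_test():
        try:
            pass  # test body / cleanup happens here
        finally:
            # Nest the cleanups: even if the crash handler report blows
            # up, the logging restore in the inner finally still runs.
            try:
                crash_handler_report()
            finally:
                restore_logging()

    try:
        run_test()
    except RuntimeError:
        pass  # the report error still propagates, but logging was restored

With that shape, a failure in crash_handler_report() can't leave the job logging broken for everything that runs afterwards, which was my concern with putting it directly in front of the restore call.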