Sorry, I haven't had time to take a look yet. Been busy. :(

I'll try to get a review in some time today.

-- John

On Sun, Sep 13, 2009 at 9:40 PM, Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> wrote:
> Hi John, do you think the code looks good enough for inclusion?
>
> On Tue, Sep 8, 2009 at 10:53 AM, Lucas Meneghel Rodrigues
> <lmr@xxxxxxxxxx> wrote:
>> This patch adds a system that watches for user space segmentation
>> faults, writes core dumps and produces a basic core dump
>> analysis report. We believe that such a system will be
>> beneficial for autotest as a whole, since the ability to
>> get core dumps and dump analysis for each app crashing
>> during an autotest execution gives test engineers
>> richer debugging information.
>>
>> The system is composed of 2 parts:
>>
>> * Modifications to test code that enable core dump
>> generation, register a core handler script in the kernel
>> and check for generated core files at the end of each
>> test.
>>
>> * A core handler script that writes the core to each test
>> debug dir in a convenient way, together with a report that
>> currently consists of the name of the process that died and
>> a gdb stacktrace of the process. As the system takes shape,
>> we could add more scripts that do fancier stuff (such as
>> handlers that use frysk to get more info, such as memory
>> maps, provided that frysk is installed on the machine).
>>
>> This is a proof of concept of the system. I am sending it
>> to the mailing list at this early stage so I can get
>> feedback on the feature. The system passes my basic
>> tests:
>>
>> * Run a simple long test, such as the kvm test, and
>> then crash an application while the test is running. I
>> get reports generated in test.debugdir.
>>
>> * Run a slightly more complex control file, with 3 parallel
>> bonnie instances at once, and crash an application while the
>> test is running. I get reports generated in all
>> test.debugdirs.
>>
>> 3rd try:
>> * Explicitly enable core dumps using the resource module
>> * Fixed a bug in the crash detection code, and factored
>> it into a utility function.
>>
>> I believe we are good to go now.
>>
>> Signed-off-by: Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx>
>> ---
>>  client/common_lib/test.py     |   66 +++++++++++++-
>>  client/tools/crash_handler.py |  202 +++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 266 insertions(+), 2 deletions(-)
>>  create mode 100755 client/tools/crash_handler.py
>>
>> diff --git a/client/common_lib/test.py b/client/common_lib/test.py
>> index 362c960..65b78a3 100644
>> --- a/client/common_lib/test.py
>> +++ b/client/common_lib/test.py
>> @@ -17,7 +17,7 @@
>>  #       tmpdir          eg. tmp/<tempname>_<testname.tag>
>>
>>  import fcntl, os, re, sys, shutil, tarfile, tempfile, time, traceback
>> -import warnings, logging
>> +import warnings, logging, glob, resource
>>
>>  from autotest_lib.client.common_lib import error
>>  from autotest_lib.client.bin import utils
>> @@ -31,7 +31,6 @@ class base_test:
>>          self.job = job
>>          self.pkgmgr = job.pkgmgr
>>          self.autodir = job.autodir
>> -
>>          self.outputdir = outputdir
>>          self.tagged_testname = os.path.basename(self.outputdir)
>>          self.resultsdir = os.path.join(self.outputdir, 'results')
>> @@ -40,6 +39,7 @@ class base_test:
>>          os.mkdir(self.profdir)
>>          self.debugdir = os.path.join(self.outputdir, 'debug')
>>          os.mkdir(self.debugdir)
>> +        self.configure_crash_handler()
>>          self.bindir = bindir
>>          if hasattr(job, 'libdir'):
>>              self.libdir = job.libdir
>> @@ -54,6 +54,66 @@ class base_test:
>>          self.after_iteration_hooks = []
>>
>>
>> +    def configure_crash_handler(self):
>> +        """
>> +        Configure the crash handler by:
>> +         * Setting the core size limit to unlimited
>> +         * Registering an appropriate crash handler in /proc/sys/kernel/core_pattern
>> +         * Creating files that the crash handler will use to figure out which
>> +           tests are active at a given moment
>> +
>> +        The crash handler will pick up the core file and write it to
>> +        self.debugdir, and perform analysis on it to generate a report. The
>> +        program also outputs some results to syslog.
>> +
>> +        If multiple tests are running, the handler checks the system process
>> +        table to determine whether the crashed process descends from one of
>> +        the running test processes. If that can't be determined, the core
>> +        file and the report file will be copied to all test debug dirs.
>> +        """
>> +        self.pattern_file = '/proc/sys/kernel/core_pattern'
>> +        try:
>> +            # Enable core dumps
>> +            resource.setrlimit(resource.RLIMIT_CORE, (-1, -1))
>> +            # Back up the core pattern and register our handler script
>> +            self.core_pattern_backup = open(self.pattern_file, 'r').read()
>> +            pattern_file = open(self.pattern_file, 'w')
>> +            tools_dir = os.path.join(self.autodir, 'tools')
>> +            crash_handler_path = os.path.join(tools_dir, 'crash_handler.py')
>> +            pattern_file.write('|' + crash_handler_path + ' %p %t %u %s %h %e')
>> +            # Write the file that the crash handler is going to use
>> +            self.debugdir_tmp_file = ('/tmp/autotest_results_dir.%s' %
>> +                                      os.getpid())
>> +            utils.open_write_close(self.debugdir_tmp_file, self.debugdir + "\n")
>> +        except Exception, e:
>> +            self.crash_handling_enabled = False
>> +            logging.error('Crash handling system disabled: %s' % e)
>> +        else:
>> +            self.crash_handling_enabled = True
>> +            logging.debug('Crash handling system enabled.')
>> +
>> +
>> +    def crash_handler_report(self):
>> +        """
>> +        If core dumps are found in the debugdir after the test has run,
>> +        let the user know.
>> + """ >> + if self.crash_handling_enabled: >> + core_dirs = glob.glob('%s/crash.*' % self.debugdir) >> + if core_dirs: >> + logging.warning('Programs crashed during test execution:') >> + for dir in core_dirs: >> + logging.warning('Please verify %s for more info', dir) >> + # Remove the debugdir info file >> + os.unlink(self.debugdir_tmp_file) >> + # Restore the core pattern backup >> + try: >> + utils.open_write_close(self.pattern_file, >> + self.core_pattern_backup) >> + except EnvironmentError: >> + pass >> + >> + >> def assert_(self, expr, msg='Assertion failed.'): >> if not expr: >> raise error.TestError(msg) >> @@ -377,6 +437,7 @@ class base_test: >> traceback.print_exc() >> print 'Now raising the earlier %s error' % exc_info[0] >> finally: >> + self.crash_handler_report() >> self.job.logging.restore() >> try: >> raise exc_info[0], exc_info[1], exc_info[2] >> @@ -389,6 +450,7 @@ class base_test: >> if run_cleanup: >> _cherry_pick_call(self.cleanup, *args, **dargs) >> finally: >> + self.crash_handler_report() >> self.job.logging.restore() >> except error.AutotestError: >> if self.network_destabilizing: >> diff --git a/client/tools/crash_handler.py b/client/tools/crash_handler.py >> new file mode 100755 >> index 0000000..e281eb5 >> --- /dev/null >> +++ b/client/tools/crash_handler.py >> @@ -0,0 +1,202 @@ >> +#!/usr/bin/python >> +""" >> +Simple crash handling application for autotest >> + >> +@copyright Red Hat Inc 2009 >> +@author Lucas Meneghel Rodrigues <lmr@xxxxxxxxxx> >> +""" >> +import sys, os, commands, glob, tempfile, shutil, syslog >> + >> + >> +def get_parent_pid(pid): >> + """ >> + Returns the parent PID for a given PID, converted to an integer. >> + >> + @param pid: Process ID. >> + """ >> + try: >> + stat_file_contents = open('/proc/%s/stat' % pid, 'r').readline() >> + ppid = int(stat_file_contents.split(" ")[3]) >> + except: >> + # It is not possible to determine the parent because the process >> + # already left the process table. >> + ppid = 1 >> + >> + return ppid >> + >> + >> +def pid_descends_from(pid_a, pid_b): >> + """ >> + Check whether pid_a descends from pid_b. >> + >> + @param pid_a: Process ID. >> + @param pid_b: Process ID. >> + """ >> + pid_a = int(pid_a) >> + pid_b = int(pid_b) >> + current_pid = pid_a >> + while current_pid > 1: >> + if current_pid == pid_b: >> + syslog.syslog(syslog.LOG_INFO, >> + "PID %s descends from PID %s!" % (pid_a, pid_b)) >> + return True >> + else: >> + current_pid = get_parent_pid(current_pid) >> + syslog.syslog(syslog.LOG_INFO, >> + "PID %s does not descend from PID %s" % (pid_a, pid_b)) >> + return False >> + >> + >> +def write_to_file(file_path, contents): >> + """ >> + Write contents to a given file path specified. If not specified, the file >> + will be created. >> + >> + @param file_path: Path to a given file. >> + @param contents: File contents. >> + """ >> + file_object = open(file_path, 'w') >> + file_object.write(contents) >> + file_object.close() >> + >> + >> +def get_results_dir_list(pid, core_dir_basename): >> + """ >> + Get all valid output directories for the core file and the report. It works >> + by inspecting files created by each test on /tmp and verifying if the >> + PID of the process that crashed is a child or grandchild of the autotest >> + test process. If it can't find any relationship (maybe a daemon that died >> + during a test execution), it will write the core file to the debug dirs >> + of all tests currently being executed. 
>> +    tests at a particular moment, it will return a list with ['/tmp'].
>> +
>> +    @param pid: PID for the process that generated the core
>> +    @param core_dir_basename: Basename for the directory that will hold both
>> +            the core dump and the crash report.
>> +    """
>> +    # Get all active test debugdir path files present
>> +    debugdir_files = glob.glob("/tmp/autotest_results_dir.*")
>> +    if debugdir_files:
>> +        pid_dir_dict = {}
>> +        for debugdir_file in debugdir_files:
>> +            a_pid = debugdir_file.split('.')[-1]
>> +            results_dir = open(debugdir_file, 'r').read().strip()
>> +            pid_dir_dict[a_pid] = os.path.join(results_dir, core_dir_basename)
>> +
>> +        results_dir_list = []
>> +        found_relation = False
>> +        for a_pid, a_path in pid_dir_dict.iteritems():
>> +            if pid_descends_from(pid, a_pid):
>> +                results_dir_list.append(a_path)
>> +                found_relation = True
>> +
>> +        # If we could not find any relation between the pids in the list and
>> +        # the process that crashed, we can't tell for sure which test spawned
>> +        # the process (maybe it is a daemon and started even before autotest
>> +        # started), so we will have to output the core file to all active test
>> +        # directories.
>> +        if not found_relation:
>> +            return pid_dir_dict.values()
>> +        else:
>> +            return results_dir_list
>> +
>> +    else:
>> +        path_inactive_autotest = os.path.join('/tmp', core_dir_basename)
>> +        return [path_inactive_autotest]
>> +
>> +
>> +def get_info_from_core(path):
>> +    """
>> +    Reads a core file and extracts a dictionary with useful core information.
>> +    Right now, the only information extracted is the full executable name.
>> +
>> +    @param path: Path to core file.
>> +    """
>> +    # Here we are getting the executable full path in a very inelegant way :(
>> +    # Since the 'right' solution is a properly written library that extracts
>> +    # information from core dump files, I'll leave this as it is for now.
>> +    full_exe_path = commands.getoutput('strings %s | grep "_="' %
>> +                                       path).strip("_=")
>> +    if full_exe_path.startswith("./"):
>> +        pwd = commands.getoutput('strings %s | grep "^PWD="' %
>> +                                 path).strip("PWD=")
>> +        full_exe_path = os.path.join(pwd, full_exe_path.strip("./"))
>> +
>> +    return {'core_file': path, 'full_exe_path': full_exe_path}
>> +
>> +
>> +if __name__ == "__main__":
>> +    syslog.openlog('AutotestCrashHandler', 0, syslog.LOG_DAEMON)
>> +    (crashed_pid, time, uid, signal, hostname, exe) = sys.argv[1:]
>> +    core_name = 'core'
>> +    report_name = 'report'
>> +    core_dir_name = 'crash.%s.%s' % (exe, crashed_pid)
>> +    core_tmp_dir = tempfile.mkdtemp(prefix='core_', dir='/tmp')
>> +    core_tmp_path = os.path.join(core_tmp_dir, core_name)
>> +    gdb_command_path = os.path.join(core_tmp_dir, 'gdb_command')
>> +
>> +    try:
>> +        # Get the filtered results dir list
>> +        current_results_dir_list = get_results_dir_list(crashed_pid,
>> +                                                        core_dir_name)
>> +
>> +        # Read the core file the kernel is piping to this script and
>> +        # write it to a temporary location
>> +        core_file = sys.stdin.read()
>> +        write_to_file(core_tmp_path, core_file)
>> +
>> +        # Write a command file for GDB
>> +        gdb_command = 'bt full\n'
>> +        write_to_file(gdb_command_path, gdb_command)
>> +
>> +        # Get full command path
>> +        exe_path = get_info_from_core(core_tmp_path)['full_exe_path']
>> +
>> +        # Take a backtrace from the core file
>> +        gdb_cmd = 'gdb -e %s -c %s -x %s -n -batch -quiet' % (exe_path,
>> +                                                              core_tmp_path,
>> +                                                              gdb_command_path)
>> +        backtrace = commands.getoutput(gdb_cmd)
>> +        # Sanitize output before passing it to the report
>> +        backtrace = backtrace.decode('utf-8', 'ignore')
>> +
>> +        # Compose the format_dict
>> +        format_dict = {}
>> +        format_dict['program'] = exe_path
>> +        format_dict['pid'] = crashed_pid
>> +        format_dict['signal'] = signal
>> +        format_dict['hostname'] = hostname
>> +        format_dict['time'] = time
>> +        format_dict['backtrace'] = backtrace
>> +
>> +        report = """Autotest crash report
>> +
>> +Program: %(program)s
>> +PID: %(pid)s
>> +Signal: %(signal)s
>> +Hostname: %(hostname)s
>> +Time of the crash: %(time)s
>> +Program backtrace:
>> +%(backtrace)s
>> +""" % format_dict
>> +
>> +        syslog.syslog(syslog.LOG_INFO,
>> +                      "Application %s, PID %s crashed" %
>> +                      (exe_path, crashed_pid))
>> +
>> +        # Now, for each results dir, create the directory if it doesn't
>> +        # exist, and write the core file and the report to it
>> +        syslog.syslog(syslog.LOG_INFO,
>> +                      "Writing core files and reports to %s" %
>> +                      current_results_dir_list)
>> +        for result_dir in current_results_dir_list:
>> +            if not os.path.isdir(result_dir):
>> +                os.makedirs(result_dir)
>> +            core_path = os.path.join(result_dir, 'core')
>> +            write_to_file(core_path, core_file)
>> +            report_path = os.path.join(result_dir, 'report')
>> +            write_to_file(report_path, report)
>> +
>> +    finally:
>> +        # Clean up temporary directories
>> +        shutil.rmtree(core_tmp_dir)
>> --
>> 1.6.2.5
>>
>> _______________________________________________
>> Autotest mailing list
>> Autotest@xxxxxxxxxxxxxxx
>> http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
>>
>
>
>
> --
> Lucas
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
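
For readers not familiar with the kernel mechanism the patch builds on: when the first character of /proc/sys/kernel/core_pattern is a pipe ('|'), the kernel executes the named program instead of writing a core file on disk, streams the core dump to that program's stdin, and substitutes specifiers such as %p (PID) and %e (executable name) into its argument list. Below is a minimal sketch of such a pipe handler; the script name and output location are illustrative only and are not part of the patch.

#!/usr/bin/python
# Minimal core_pattern pipe handler sketch (illustrative; not the patch's
# crash_handler.py). It could be registered with something like:
#   echo '|/usr/local/bin/minimal_core_handler.py %p %e' > /proc/sys/kernel/core_pattern
import sys

if __name__ == "__main__":
    # Arguments arrive in the order of the specifiers listed in core_pattern.
    pid, exe = sys.argv[1:3]
    # The kernel pipes the core dump to stdin; read all of it.
    core = sys.stdin.read()
    # Write it somewhere useful; the real handler picks each test's debugdir.
    out = open('/tmp/core.%s.%s' % (exe, pid), 'wb')
    out.write(core)
    out.close()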
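
The report generation itself boils down to running gdb in batch mode against the executable and the core file, which is what the gdb_cmd line in crash_handler.py does. A standalone sketch of that step, assuming gdb is installed; the helper name is hypothetical:

import commands, os, tempfile

def backtrace_from_core(exe_path, core_path):
    # Write the gdb commands to a temporary file and run gdb non-interactively
    # against the executable and core, mirroring crash_handler.py's approach.
    fd, command_path = tempfile.mkstemp(prefix='gdb_command_')
    os.write(fd, 'bt full\n')
    os.close(fd)
    try:
        gdb_cmd = 'gdb -e %s -c %s -x %s -n -batch -quiet' % (exe_path,
                                                              core_path,
                                                              command_path)
        return commands.getoutput(gdb_cmd)
    finally:
        os.unlink(command_path)

Calling backtrace_from_core() on an executable and its core would return the same kind of text that ends up in the 'Program backtrace:' section of the report.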