[PATCH] rteval: Print useful exception info and exit on missing measurement tool

Crystal Wood <crwood@xxxxxxxxxx> · Fri, 1 Nov 2024 15:54:46 -0500

Currently, if cyclictest or rtla (whichever rteval is configured to use)
is missing, the error output only says "measurement threads did not use
the full time slot" and then rteval hangs.

Avoid catching these exceptions so that they will be printed out,
pointing to the actual problem.  Also set up a thread exception hook to
do a prompt and (relatively) graceful exit if a thread has any uncaught
exception (this requires python >= 3.8).

Signed-off-by: Crystal Wood <crwood@xxxxxxxxxx>
---
 README                                   |  2 +-
 rteval/__init__.py                       | 12 ++++++++++++
 rteval/modules/__init__.py               | 14 +++++++++++---
 rteval/modules/measurement/cyclictest.py | 14 +++++---------
 rteval/modules/measurement/timerlat.py   | 13 +++++--------
 5 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/README b/README
index b352d7f..19704b4 100644
--- a/README
+++ b/README
@@ -16,7 +16,7 @@ The rteval source may be pulled from it's git tree on kernel.org:
 
 Rteval requires the following packages to run:
 
-Python >= 3.0
+Python >= 3.8
     http://www.python.org/download/
 
 python-lxml
diff --git a/rteval/__init__.py b/rteval/__init__.py
index 7c13e84..6097ddf 100644
--- a/rteval/__init__.py
+++ b/rteval/__init__.py
@@ -19,6 +19,7 @@ import threading
 import time
 from datetime import datetime
 import sysconfig
+from traceback import format_exception
 from rteval.modules.loads import LoadModules
 from rteval.modules.measurement import MeasurementModules
 from rteval.rtevalReport import rtevalReport
@@ -29,6 +30,7 @@ from rteval import version
 RTEVAL_VERSION = version.RTEVAL_VERSION
 
 EARLYSTOP = False
+threaderr = False
 
 stopsig = threading.Event()
 def sig_handler(signum, frame):
@@ -39,9 +41,17 @@ def sig_handler(signum, frame):
     else:
         raise RuntimeError(f"SIGNAL received! ({signum})")
 
+def except_hook(args):
+    global threaderr
+
+    threading.__excepthook__(args)
+    threaderr = True
+    stopsig.set()
+
 class RtEval(rtevalReport):
     def __init__(self, config, loadmods, measuremods, logger):
         self.__version = RTEVAL_VERSION
+        threading.excepthook = except_hook
 
         if not isinstance(config, rtevalConfig.rtevalConfig):
             raise TypeError("config variable is not an rtevalConfig object")
@@ -237,6 +247,8 @@ class RtEval(rtevalReport):
         global EARLYSTOP
         rtevalres = 0
         measure_start = self.__RunMeasurement()
+        if threaderr:
+            return 1
 
         self._report(measure_start, self.__rtevcfg.xslt_report)
         if self.__rtevcfg.sysreport:
diff --git a/rteval/modules/__init__.py b/rteval/modules/__init__.py
index eb29db8..9827651 100644
--- a/rteval/modules/__init__.py
+++ b/rteval/modules/__init__.py
@@ -124,8 +124,8 @@ class rtevalModulePrototype(threading.Thread):
 
     def WaitForCompletion(self, wtime=None):
         """ Blocks until the module has completed its workload """
-        if not self.shouldStart():
-            # If it hasn't been started yet, nothing to wait for
+        if self.hadRuntimeError() or not self.shouldStart():
+            # If it failed or hasn't been started yet, nothing to wait for
             return None
         return self.__events["finished"].wait(wtime)
 
@@ -175,7 +175,7 @@ class rtevalModulePrototype(threading.Thread):
         return self._donotrun is False
 
 
-    def run(self):
+    def __run(self):
         "Workload thread runner - takes care of keeping the workload running as long as needed"
         if self.shouldStop():
             return
@@ -215,6 +215,12 @@ class rtevalModulePrototype(threading.Thread):
 
         self._WorkloadCleanup()
 
+    def run(self):
+        try:
+            self.__run()
+        except Exception as e:
+            self._setRuntimeError()
+            raise e
 
     def MakeReport(self):
         """ required module method, needs to return an libxml2.xmlNode object
@@ -532,6 +538,8 @@ class RtEvalModules:
         rep_n = libxml2.newNode(self._report_tag)
 
         for (modname, mod) in self.__modules:
+            if mod.hadRuntimeError():
+                continue
             self._logger.log(Log.DEBUG, f"Getting report from {modname}")
             modrep_n = mod.MakeReport()
             if modrep_n is not None:
diff --git a/rteval/modules/measurement/cyclictest.py b/rteval/modules/measurement/cyclictest.py
index 2e8f6f1..3d25c20 100644
--- a/rteval/modules/measurement/cyclictest.py
+++ b/rteval/modules/measurement/cyclictest.py
@@ -285,15 +285,11 @@ class Cyclictest(rtevalModulePrototype):
                 fp.flush()
 
         self.__cyclicoutput.seek(0)
-        try:
-            self.__cyclicprocess = subprocess.Popen(self.__cmd,
-                                                    stdout=self.__cyclicoutput,
-                                                    stderr=self.__nullfp,
-                                                    stdin=self.__nullfp)
-            self.__started = True
-        except OSError:
-            self.__started = False
-
+        self.__cyclicprocess = subprocess.Popen(self.__cmd,
+                                                stdout=self.__cyclicoutput,
+                                                stderr=self.__nullfp,
+                                                stdin=self.__nullfp)
+        self.__started = True
 
     def WorkloadAlive(self):
         if self.__started:
diff --git a/rteval/modules/measurement/timerlat.py b/rteval/modules/measurement/timerlat.py
index 92bc070..df42777 100644
--- a/rteval/modules/measurement/timerlat.py
+++ b/rteval/modules/measurement/timerlat.py
@@ -252,14 +252,11 @@ class Timerlat(rtevalModulePrototype):
 
         self.__timerlat_out.seek(0)
         self.__timerlat_err.seek(0)
-        try:
-            self.__timerlat_process = subprocess.Popen(self.__cmd,
-                                                       stdout=self.__timerlat_out,
-                                                       stderr=self.__timerlat_err,
-                                                       stdin=None)
-            self.__started = True
-        except OSError:
-            self.__started = False
+        self.__timerlat_process = subprocess.Popen(self.__cmd,
+                                                   stdout=self.__timerlat_out,
+                                                   stderr=self.__timerlat_err,
+                                                   stdin=None)
+        self.__started = True
 
     def WorkloadAlive(self):
         if self.__started:
-- 
2.47.0