From: Darrick J. Wong <djwong@xxxxxxxxxx> Calls to systemd across dbus are remote procedure calls, which means that they're subject to transitory connection failures (e.g. systemd re-executing itself). We don't want to fail at the *first* sign of what could be temporary trouble, so implement a limited retry with Fibonacci backoff before we resort to invoking xfs_scrub as a subprocess. Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> --- scrub/xfs_scrub_all.in | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/scrub/xfs_scrub_all.in b/scrub/xfs_scrub_all.in index a09566efdcd..71726cdf36d 100644 --- a/scrub/xfs_scrub_all.in +++ b/scrub/xfs_scrub_all.in @@ -165,6 +165,22 @@ def path_to_serviceunit(path, scrub_media): for line in proc.stdout: return line.decode(sys.stdout.encoding).strip() +def fibonacci(max_ret): + '''Yield fibonacci sequence up to and including max_ret.''' + if max_ret < 1: + return + + x = 0 + y = 1 + yield 1 + + z = x + y + while z <= max_ret: + yield z + x = y + y = z + z = x + y + class scrub_service(scrub_control): '''Control object for xfs_scrub systemd service.''' def __init__(self, mnt, scrub_media): @@ -188,6 +204,25 @@ class scrub_service(scrub_control): self.unit = dbus.Interface(svc_obj, 'org.freedesktop.systemd1.Unit') + def __dbusrun(self, lambda_fn): + '''Call the lambda function to execute something on dbus. dbus + exceptions result in retries with Fibonacci backoff, and the + bindings will be rebuilt every time.''' + global debug + + fatal_ex = None + + for i in fibonacci(30): + try: + return lambda_fn() + except dbus.exceptions.DBusException as e: + if debug: + print(e) + fatal_ex = e + time.sleep(i) + self.bind() + raise fatal_ex + def state(self): '''Retrieve the active state for a systemd service. As of systemd 249, this is supposed to be one of the following: @@ -195,8 +230,10 @@ class scrub_service(scrub_control): or "deactivating". 
These strings are not localized.''' global debug + l = lambda: self.prop.Get('org.freedesktop.systemd1.Unit', + 'ActiveState') try: - return self.prop.Get('org.freedesktop.systemd1.Unit', 'ActiveState') + return self.__dbusrun(l) except Exception as e: if debug: print(e, file = sys.stderr) @@ -231,7 +268,7 @@ class scrub_service(scrub_control): print('starting %s' % self.unitname) try: - self.unit.Start('replace') + self.__dbusrun(lambda: self.unit.Start('replace')) return self.wait() except Exception as e: print(e, file = sys.stderr) @@ -245,7 +282,7 @@ class scrub_service(scrub_control): print('stopping %s' % self.unitname) try: - self.unit.Stop('replace') + self.__dbusrun(lambda: self.unit.Stop('replace')) return self.wait() except Exception as e: print(e, file = sys.stderr)