On Wed, Jan 08, 2025 at 19:42:58 +0000, Daniel P. Berrangé wrote: > The service unit "TimeoutStopSec" setting controls how long systemd > waits for a service to stop before aggressively killing it, defaulting > to 30 seconds if not set. > > When we're processing shutdown of VMs in response to OS shutdown, we > very likely need more than 30 seconds to complete this job, and can > not stop the daemon during this time. > > To avoid being prematurely killed, setup a timer that repeatedly > extends the "TimeoutStopSec" value while stop of running VMs is > arranged. > > This does mean if libvirt hangs while stoppping VMs, systemd won't > get to kill the libvirt daemon, but this is considered less harmful > that forcefully killing running VMs. > > Signed-off-by: Daniel P. Berrangé <berrange@xxxxxxxxxx> > --- > src/rpc/virnetdaemon.c | 62 ++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 62 insertions(+) > > diff --git a/src/rpc/virnetdaemon.c b/src/rpc/virnetdaemon.c > index 8cc7af1182..3ddb9b5404 100644 > --- a/src/rpc/virnetdaemon.c > +++ b/src/rpc/virnetdaemon.c > @@ -78,6 +78,9 @@ struct _virNetDaemon { > virNetDaemonShutdownCallback shutdownPrepareCb; > virNetDaemonShutdownCallback shutdownWaitCb; > virThread *shutdownPreserveThread; > + unsigned long long preserveStart; > + unsigned int preserveExtended; > + int preserveTimer; > int quitTimer; > virNetDaemonQuitPhase quit; > bool graceful; > @@ -93,6 +96,14 @@ struct _virNetDaemon { > > static virClass *virNetDaemonClass; > > +/* > + * The minimum additional shutdown time (secs) we should ask > + * systemd to allow, while state preservation operations > + * are running. A timer will run every 5 seconds, and > + * ensure at least this much extra time is requested > + */ > +#define VIR_NET_DAEMON_PRESERVE_MIN_TIME 30 > + > static int > daemonServerClose(void *payload, > const char *key G_GNUC_UNUSED, > @@ -162,6 +173,7 @@ virNetDaemonNew(void) > if (virEventRegisterDefaultImpl() < 0) > goto error; > > + dmn->preserveTimer = -1; > dmn->autoShutdownTimerID = -1; > > #ifndef WIN32 > @@ -727,6 +739,42 @@ daemonShutdownWait(void *opaque) > } > } > > +static void > +virNetDaemonPreserveTimer(int timerid G_GNUC_UNUSED, > + void *opaque) > +{ > + virNetDaemon *dmn = opaque; > + VIR_LOCK_GUARD lock = virObjectLockGuard(dmn); > + unsigned long long now = g_get_monotonic_time(); > + unsigned long long delta; > + > + if (dmn->quit != VIR_NET_DAEMON_QUIT_PRESERVING) > + return; > + > + VIR_DEBUG("Started at %llu now %llu extended %u", > + dmn->preserveStart, now, dmn->preserveExtended); > + > + /* Time since start of preserving state in usec */ > + delta = now - dmn->preserveStart; > + /* Converts to secs */ > + delta /= (1000ull * 1000ull); > + > + /* Want extra seconds grace to ensure this timer fires > + * again before system timeout expires, under high > + * load conditions */ > + delta += VIR_NET_DAEMON_PRESERVE_MIN_TIME; > + > + /* Deduct any extension we've previously asked for */ > + delta -= dmn->preserveExtended; > + > + /* Tell systemd how much more we need to extend by */ > + virSystemdNotifyExtendTimeout(delta); > + dmn->preserveExtended += delta; > + > + VIR_DEBUG("Extended by %llu", delta); > +} Assume: preserveStart = 0; preserveExtend = 30; /* from the initialization */ Calls: by assumption of starting time above above (all in seconds) the notify timeout simplifies to: notify(delta) = now + 30 - preserveExtend Iteration 0 is the direct call, all others are via timer iter now() notify(delta) preserveExtend 0 0 30 (hardcoded) 30 1 5 5 35 2 10 5 40 3 15 5 45 4 20 5 50 [...] 'man sd_notify' says for EXTEND_TIMEOUT_USEC=... The value specified is a time in microseconds during which the service must send a new message. So this means: - initial iteration indeed extends by 30 seconds - any subsequent iteration extends only by 5 - the timer is also 5 seconds, so it's likely to instantly timeout on second iteration - the complicated algorithm doesn't seem to be necessary, all you want is a constant which is 'timer_interval+tolerance' > static void > virNetDaemonQuitTimer(int timerid G_GNUC_UNUSED, > void *opaque) > @@ -781,11 +829,21 @@ virNetDaemonRun(virNetDaemon *dmn) > > if (dmn->quit == VIR_NET_DAEMON_QUIT_REQUESTED) { > VIR_DEBUG("Process quit request"); > + virSystemdNotifyStopping(); > virHashForEach(dmn->servers, daemonServerClose, NULL); > > if (dmn->shutdownPreserveThread) { > VIR_DEBUG("Shutdown preserve thread running"); > dmn->quit = VIR_NET_DAEMON_QUIT_PRESERVING; > + dmn->preserveStart = g_get_monotonic_time(); > + dmn->preserveExtended = VIR_NET_DAEMON_PRESERVE_MIN_TIME; ^^^ assumptions > + virSystemdNotifyExtendTimeout(dmn->preserveExtended); iter 0 > + if ((dmn->preserveTimer = virEventAddTimeout(5 * 1000, > + virNetDaemonPreserveTimer, > + dmn, NULL)) < 0) { subsequent iters via timer > + VIR_WARN("Failed to register preservation timer"); > + /* hope for the best */ > + } > } else { > VIR_DEBUG("Ready to shutdown"); > dmn->quit = VIR_NET_DAEMON_QUIT_READY; > @@ -866,6 +924,10 @@ static void virNetDaemonPreserveWorker(void *opaque) > dmn->quit = VIR_NET_DAEMON_QUIT_READY; > } > g_clear_pointer(&dmn->shutdownPreserveThread, g_free); > + if (dmn->preserveTimer != -1) { > + virEventRemoveTimeout(dmn->preserveTimer); > + dmn->preserveTimer = -1; > + } > } > > VIR_DEBUG("End preserve dmn=%p", dmn); > -- > 2.47.1 >