In cases when a QEMU process takes longer than the time sigterm and sigkill are issued to kill the process do not simply fail and leave the VM in state VIR_DOMAIN_SHUTDOWN until the daemon stops. Instead set up an fd on /proc/$pid and get notified when the QEMU process finally has terminated to cleanup the VM state. Resolves: https://issues.redhat.com/browse/RHEL-28819 Signed-off-by: Boris Fiuczynski <fiuczy@xxxxxxxxxxxxx> --- src/qemu/qemu_domain.c | 8 +++ src/qemu/qemu_domain.h | 2 + src/qemu/qemu_driver.c | 18 ++++++ src/qemu/qemu_process.c | 124 ++++++++++++++++++++++++++++++++++++++-- src/qemu/qemu_process.h | 1 + 5 files changed, 148 insertions(+), 5 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 2134b11038..8147ff02fd 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -1889,6 +1889,11 @@ qemuDomainObjPrivateFree(void *data) virChrdevFree(priv->devs); + if (priv->pidMonitored >= 0) { + virEventRemoveHandle(priv->pidMonitored); + priv->pidMonitored = -1; + } + /* This should never be non-NULL if we get here, but just in case... */ if (priv->mon) { VIR_ERROR(_("Unexpected QEMU monitor still active during domain deletion")); @@ -1934,6 +1939,8 @@ qemuDomainObjPrivateAlloc(void *opaque) priv->blockjobs = virHashNew(virObjectUnref); priv->fds = virHashNew(g_object_unref); + priv->pidMonitored = -1; + /* agent commands block by default, user can choose different behavior */ priv->agentTimeout = VIR_DOMAIN_AGENT_RESPONSE_TIMEOUT_BLOCK; priv->migMaxBandwidth = QEMU_DOMAIN_MIG_BANDWIDTH_MAX; @@ -11680,6 +11687,7 @@ qemuProcessEventFree(struct qemuProcessEvent *event) case QEMU_PROCESS_EVENT_RESET: case QEMU_PROCESS_EVENT_NBDKIT_EXITED: case QEMU_PROCESS_EVENT_MONITOR_EOF: + case QEMU_PROCESS_EVENT_SHUTDOWN_COMPLETED: case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index d777559119..a5092dd7f0 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -119,6 +119,7 @@ struct _qemuDomainObjPrivate { bool beingDestroyed; char *pidfile; + int pidMonitored; virDomainPCIAddressSet *pciaddrs; virDomainUSBAddressSet *usbaddrs; @@ -469,6 +470,7 @@ typedef enum { QEMU_PROCESS_EVENT_UNATTENDED_MIGRATION, QEMU_PROCESS_EVENT_RESET, QEMU_PROCESS_EVENT_NBDKIT_EXITED, + QEMU_PROCESS_EVENT_SHUTDOWN_COMPLETED, QEMU_PROCESS_EVENT_LAST } qemuProcessEventType; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9f3013e231..6b1e4084f6 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -4041,6 +4041,21 @@ processNbdkitExitedEvent(virDomainObj *vm, } +static void +processShutdownCompletedEvent(virQEMUDriver *driver, + virDomainObj *vm) +{ + if (virDomainObjBeginJob(vm, VIR_JOB_MODIFY) < 0) + return; + + if (virDomainObjIsActive(vm)) + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_UNKNOWN, + VIR_ASYNC_JOB_NONE, 0); + + virDomainObjEndJob(vm); +} + + static void qemuProcessEventHandler(void *data, void *opaque) { struct qemuProcessEvent *processEvent = data; @@ -4101,6 +4116,9 @@ static void qemuProcessEventHandler(void *data, void *opaque) case QEMU_PROCESS_EVENT_NBDKIT_EXITED: processNbdkitExitedEvent(vm, processEvent->data); break; + case QEMU_PROCESS_EVENT_SHUTDOWN_COMPLETED: + processShutdownCompletedEvent(driver, vm); + break; case QEMU_PROCESS_EVENT_LAST: break; } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 25dfd04272..c6f7d34168 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -25,6 +25,9 @@ #include <unistd.h> #include <signal.h> #include <sys/stat.h> +#if WITH_SYS_SYSCALL_H +# include <sys/syscall.h> +#endif #if defined(__linux__) # include <linux/capability.h> #elif defined(__FreeBSD__) @@ -8387,9 +8390,114 @@ qemuProcessCreatePretendCmdBuild(virDomainObj *vm, } +#if WITH_SYS_SYSCALL_H && defined(SYS_pidfd_open) +typedef struct { + virDomainObj *vm; + int pidfd; +} qemuProcessInShutdownEventData; + + +static qemuProcessInShutdownEventData* +qemuProcessInShutdownEventDataNew(virDomainObj *vm, int pidfd) +{ + qemuProcessInShutdownEventData *d = g_new(qemuProcessInShutdownEventData, 1); + d->vm = virObjectRef(vm); + d->pidfd = pidfd; + return d; +} + + +static void +qemuProcessInShutdownEventDataFree(qemuProcessInShutdownEventData *d) +{ + virObjectUnref(d->vm); + VIR_FORCE_CLOSE(d->pidfd); + g_free(d); +} + + +static void +qemuProcessInShutdownPidfdCb(int watch, + int fd, + int events G_GNUC_UNUSED, + void *opaque) +{ + qemuProcessInShutdownEventData *data = opaque; + virDomainObj *vm = data->vm; + + VIR_DEBUG("vm=%p name=%s pid=%lld fd=%d", + vm, vm->def->name, (long long)vm->pid, fd); + + virEventRemoveHandle(watch); + + virObjectLock(vm); + + VIR_INFO("QEMU process %lld finally completed termination", + (long long)vm->pid); + + QEMU_DOMAIN_PRIVATE(vm)->pidMonitored = -1; + qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_SHUTDOWN_COMPLETED, + 0, 0, NULL); + + virObjectUnlock(vm); +} +#endif /* WITH_SYS_SYSCALL_H && defined(SYS_pidfd_open) */ + + +static int +qemuProcessInShutdownStartMonitor(virDomainObj *vm) +{ +#if WITH_SYS_SYSCALL_H && defined(SYS_pidfd_open) + qemuDomainObjPrivate *priv = vm->privateData; + qemuProcessInShutdownEventData *data; + int pidfd; + int ret = -1; + + VIR_DEBUG("vm=%p name=%s pid=%lld pidMonitored=%d", + vm, vm->def->name, (long long)vm->pid, + priv->pidMonitored); + + if (priv->pidMonitored >= 0) { + VIR_DEBUG("Monitoring qemu in-shutdown process %i already set up", vm->pid); + goto cleanup; + } + + pidfd = syscall(SYS_pidfd_open, vm->pid, 0); + if (pidfd < 0) { + if (errno == ESRCH) /* process has already terminated */ + ret = 1; + goto cleanup; + } + + data = qemuProcessInShutdownEventDataNew(vm, pidfd); + if ((priv->pidMonitored = virEventAddHandle(pidfd, + VIR_EVENT_HANDLE_READABLE, + qemuProcessInShutdownPidfdCb, + data, + (virFreeCallback)qemuProcessInShutdownEventDataFree)) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("failed to monitor qemu in-shutdown process %1$i"), + vm->pid); + qemuProcessInShutdownEventDataFree(data); + goto cleanup; + } + VIR_DEBUG("Monitoring qemu in-shutdown process %i for termination", vm->pid); + ret = 0; + + cleanup: + return ret; +#else /* !WITH_SYS_SYSCALL_H || !defined(SYS_pidfd_open) */ + VIR_DEBUG("Monitoring qemu process %i not implemented", vm->pid); + return -1; +#endif /* !WITH_SYS_SYSCALL_H || !defined(SYS_pidfd_open) */ +} + + int qemuProcessKill(virDomainObj *vm, unsigned int flags) { + int ret = -1; + VIR_DEBUG("vm=%p name=%s pid=%lld flags=0x%x", vm, vm->def->name, (long long)vm->pid, flags); @@ -8410,10 +8518,16 @@ qemuProcessKill(virDomainObj *vm, unsigned int flags) /* Request an extra delay of two seconds per current nhostdevs * to be safe against stalls by the kernel freeing up the resources */ - return virProcessKillPainfullyDelay(vm->pid, - !!(flags & VIR_QEMU_PROCESS_KILL_FORCE), - vm->def->nhostdevs * 2, - false); + ret = virProcessKillPainfullyDelay(vm->pid, + !!(flags & VIR_QEMU_PROCESS_KILL_FORCE), + vm->def->nhostdevs * 2, + false); + + if (ret < 0 && (flags & VIR_QEMU_PROCESS_KILL_MONITOR_ON_ERROR)) + if (qemuProcessInShutdownStartMonitor(vm) == 1) + ret = 0; /* process termination detected */ + + return ret; } @@ -8438,7 +8552,7 @@ qemuProcessBeginStopJob(virDomainObj *vm, * cleared inside qemuProcessStop */ priv->beingDestroyed = true; - if (qemuProcessKill(vm, killFlags) < 0) + if (qemuProcessKill(vm, killFlags|VIR_QEMU_PROCESS_KILL_MONITOR_ON_ERROR) < 0) goto error; /* Wake up anything waiting on domain condition */ diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index cb67bfcd2d..2324aeb7bd 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -180,6 +180,7 @@ typedef enum { VIR_QEMU_PROCESS_KILL_FORCE = 1 << 0, VIR_QEMU_PROCESS_KILL_NOWAIT = 1 << 1, VIR_QEMU_PROCESS_KILL_NOCHECK = 1 << 2, /* bypass the running vm check */ + VIR_QEMU_PROCESS_KILL_MONITOR_ON_ERROR = 1 << 3, /* on error enable process monitor */ } virQemuProcessKillMode; int qemuProcessKill(virDomainObj *vm, unsigned int flags); -- 2.45.0