If some program is attached to cgroup and that cgroup is removed before detaching the program it might not be freed and will remain in the system until it's rebooted. This would not be that big deal to workaround if we wouldn't use machined to track our guests. Systemd tries to be the nice guy and if there is no process attached to TransientUnit it will destroy the unit and remove the cgroup from system which will hit the kernel bug. In order to prevent this behavior of systemd we can move another process into the guest cgroup and the cgroup will remain active even if we kill qemu process while destroying guest. We can then detach our BPF program from that cgroup and call machined terminate API to cleanup the cgroup. Signed-off-by: Pavel Hrdina <phrdina@xxxxxxxxxx> --- src/libvirt_private.syms | 1 + src/lxc/lxc_cgroup.c | 1 + src/qemu/qemu_cgroup.c | 6 +++- src/util/vircgroup.c | 15 +++++++++ src/util/vircgroup.h | 1 + src/util/vircgrouppriv.h | 2 ++ src/util/vircgroupv2.c | 68 ++++++++++++++++++++++++++++++++++++++-- src/util/virsystemd.c | 2 +- src/util/virsystemd.h | 2 ++ 9 files changed, 94 insertions(+), 4 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 0cff580de2..f4a81370bb 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -3021,6 +3021,7 @@ virSystemdCanHybridSleep; virSystemdCanSuspend; virSystemdCreateMachine; virSystemdGetMachineNameByPID; +virSystemdHasMachined; virSystemdHasMachinedResetCachedValue; virSystemdMakeScopeName; virSystemdMakeSliceName; diff --git a/src/lxc/lxc_cgroup.c b/src/lxc/lxc_cgroup.c index d93a19d684..a8cef74bb9 100644 --- a/src/lxc/lxc_cgroup.c +++ b/src/lxc/lxc_cgroup.c @@ -455,6 +455,7 @@ virCgroupPtr virLXCCgroupCreate(virDomainDefPtr def, nnicindexes, nicindexes, def->resource->partition, -1, + LXC_STATE_DIR, &cgroup) < 0) goto cleanup; diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c index 4931fb6575..9374a799a1 100644 --- a/src/qemu/qemu_cgroup.c +++ b/src/qemu/qemu_cgroup.c @@ -946,6 +946,7 @@ qemuInitCgroup(virDomainObjPtr vm, nnicindexes, nicindexes, vm->def->resource->partition, cfg->cgroupControllers, + cfg->stateDir, &priv->cgroup) < 0) { if (virCgroupNewIgnoreError()) goto done; @@ -1256,16 +1257,19 @@ int qemuRemoveCgroup(virDomainObjPtr vm) { qemuDomainObjPrivatePtr priv = vm->privateData; + int rc = 0; if (priv->cgroup == NULL) return 0; /* Not supported, so claim success */ + rc = virCgroupRemove(priv->cgroup); + if (virCgroupTerminateMachine(priv->machineName) < 0) { if (!virCgroupNewIgnoreError()) VIR_DEBUG("Failed to terminate cgroup for %s", vm->def->name); } - return virCgroupRemove(priv->cgroup); + return rc; } diff --git a/src/util/vircgroup.c b/src/util/vircgroup.c index 9d284e9bf4..19d81cf050 100644 --- a/src/util/vircgroup.c +++ b/src/util/vircgroup.c @@ -1105,6 +1105,7 @@ virCgroupNewMachineSystemd(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1151,6 +1152,16 @@ virCgroupNewMachineSystemd(const char *name, return -1; } + if (VIR_STRDUP((*group)->stateDir, stateDir) < 0) { + virCgroupFree(group); + return -1; + } + + if (VIR_STRDUP((*group)->vmName, name) < 0) { + virCgroupFree(group); + return -1; + } + if (virCgroupAddProcess(*group, pidleader) < 0) { virErrorPtr saved = virSaveLastError(); virCgroupRemove(*group); @@ -1233,6 +1244,7 @@ virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) { int rv; @@ -1249,6 +1261,7 @@ virCgroupNewMachine(const char *name, nicindexes, partition, controllers, + stateDir, group)) == 0) return 0; @@ -1301,6 +1314,8 @@ virCgroupFree(virCgroupPtr *group) VIR_FREE((*group)->unified.placement); VIR_FREE((*group)->path); + VIR_FREE((*group)->stateDir); + VIR_FREE((*group)->vmName); VIR_FREE(*group); } diff --git a/src/util/vircgroup.h b/src/util/vircgroup.h index 372009de4a..0cd90d3651 100644 --- a/src/util/vircgroup.h +++ b/src/util/vircgroup.h @@ -98,6 +98,7 @@ int virCgroupNewMachine(const char *name, int *nicindexes, const char *partition, int controllers, + const char *stateDir, virCgroupPtr *group) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) ATTRIBUTE_NONNULL(3); diff --git a/src/util/vircgrouppriv.h b/src/util/vircgrouppriv.h index 085fea375c..e0ccce88a0 100644 --- a/src/util/vircgrouppriv.h +++ b/src/util/vircgrouppriv.h @@ -62,6 +62,8 @@ typedef virCgroupV2Controller *virCgroupV2ControllerPtr; struct _virCgroup { char *path; + char *stateDir; + char *vmName; virCgroupBackendPtr backends[VIR_CGROUP_BACKEND_TYPE_LAST]; diff --git a/src/util/vircgroupv2.c b/src/util/vircgroupv2.c index 0a4aa15d0b..f5c43ae4bc 100644 --- a/src/util/vircgroupv2.c +++ b/src/util/vircgroupv2.c @@ -38,9 +38,12 @@ #include "vircgroup.h" #include "vircgroupbackend.h" #include "vircgroupv2.h" +#include "vircommand.h" #include "virerror.h" #include "virfile.h" #include "virlog.h" +#include "virpidfile.h" +#include "virprocess.h" #include "virstring.h" #include "virsystemd.h" @@ -479,6 +482,9 @@ virCgroupV2Remove(virCgroupPtr group) if (virCgroupV2DeviceRemoveProg(group) < 0) return -1; + if (virSystemdHasMachined() == 0) + virCgroupKillPainfully(group); + return virCgroupRemoveRecursively(grppath); } @@ -1899,17 +1905,74 @@ virCgroupV2DeviceReallocMap(int mapfd, } +static pid_t +virCgroupV2DeviceStartDummyProc(virCgroupPtr group) +{ + pid_t ret = -1; + pid_t pid = -1; + virCommandPtr cmd = NULL; + VIR_AUTOFREE(char *) pidfile = NULL; + VIR_AUTOFREE(char *) fileName = NULL; + + if (virAsprintf(&fileName, "cgroup-%s", group->vmName) < 0) + return -1; + + pidfile = virPidFileBuildPath(group->stateDir, fileName); + if (!pidfile) + return -1; + + cmd = virCommandNewArgList("/usr/bin/tail", "-f", "/dev/null", + "-s", "3600", NULL); + if (!cmd) + return -1; + + virCommandDaemonize(cmd); + virCommandSetPidFile(cmd, pidfile); + + if (virCommandRun(cmd, NULL) < 0) + goto cleanup; + + if (virPidFileReadPath(pidfile, &pid) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("failed to start dummy cgroup helper")); + goto cleanup; + } + + if (virCgroupAddMachineProcess(group, pid) < 0) + goto cleanup; + + ret = pid; + cleanup: + if (ret < 0) { + virCommandAbort(cmd); + if (pid > 0) + virProcessKillPainfully(pid, true); + } + if (pidfile) + unlink(pidfile); + virCommandFree(cmd); + return ret; +} + + static int virCgroupV2DeviceCreateProg(virCgroupPtr group) { - int mapfd; + int mapfd = -1; + pid_t pid = -1; if (group->unified.devices.progfd > 0 && group->unified.devices.mapfd > 0) return 0; + if (virSystemdHasMachined() == 0) { + pid = virCgroupV2DeviceStartDummyProc(group); + if (pid < 0) + return -1; + } + mapfd = virCgroupV2DeviceCreateMap(VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE); if (mapfd < 0) - return -1; + goto error; if (virCgroupV2DeviceAttachProg(group, mapfd, VIR_CGROUP_V2_INITIAL_BPF_MAP_SIZE) < 0) { @@ -1919,6 +1982,7 @@ virCgroupV2DeviceCreateProg(virCgroupPtr group) return 0; error: + virProcessKillPainfully(pid, true); VIR_FORCE_CLOSE(mapfd); return -1; } diff --git a/src/util/virsystemd.c b/src/util/virsystemd.c index f492ac1859..ca35b12a5c 100644 --- a/src/util/virsystemd.c +++ b/src/util/virsystemd.c @@ -138,7 +138,7 @@ void virSystemdHasMachinedResetCachedValue(void) * -1 = error * 0 = machine1 is available */ -static int +int virSystemdHasMachined(void) { int ret; diff --git a/src/util/virsystemd.h b/src/util/virsystemd.h index 7d9c0ebd62..553d4196f1 100644 --- a/src/util/virsystemd.h +++ b/src/util/virsystemd.h @@ -51,4 +51,6 @@ int virSystemdCanHybridSleep(bool *result); char *virSystemdGetMachineNameByPID(pid_t pid); +int virSystemdHasMachined(void); + #endif /* LIBVIRT_VIRSYSTEMD_H */ -- 2.20.1 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list