Calling qemuProcessStop without a job opens a way to race conditions with qemuDomainObjExitMonitor called in another thread. A real world example of such a race condition: - migration thread (A) calls qemuMigrationWaitForSpice - another thread (B) starts processing qemuDomainAbortJob API - thread B signals thread A via qemuDomainObjAbortAsyncJob - thread B enters monitor (qemuDomainObjEnterMonitor) - thread B calls qemuMonitorSend - thread A awakens and calls qemuProcessStop - thread A calls qemuMonitorClose and sets priv->mon to NULL - thread B calls qemuDomainObjExitMonitor with priv->mon == NULL => monitor stays ref'ed and locked Depending on how lucky we are, the race may result in a memory leak or it can even deadlock libvirtd's event loop if it tries to lock the monitor to process an event received before qemuMonitorClose was called. Signed-off-by: Jiri Denemark <jdenemar@xxxxxxxxxx> --- src/qemu/qemu_driver.c | 28 +++++++++++------- src/qemu/qemu_migration.c | 6 +++- src/qemu/qemu_process.c | 72 +++++++++++++++++++++++++++++++---------------- src/qemu/qemu_process.h | 1 + 4 files changed, 71 insertions(+), 36 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index dc448fb..3bccb55 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2287,7 +2287,8 @@ qemuDomainDestroyFlags(virDomainPtr dom, if (priv->job.asyncJob == QEMU_ASYNC_JOB_MIGRATION_IN) stopFlags |= VIR_QEMU_PROCESS_STOP_MIGRATED; - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED, stopFlags); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_DESTROYED, + QEMU_ASYNC_JOB_NONE, stopFlags); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_DESTROYED); @@ -3297,7 +3298,8 @@ qemuDomainSaveInternal(virQEMUDriverPtr driver, virDomainPtr dom, goto endjob; /* Shut it down */ - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_SAVED, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_SAVED, + QEMU_ASYNC_JOB_SAVE, 0); virDomainAuditStop(vm, "saved"); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_SAVED); @@ -3790,7 +3792,8 @@ qemuDomainCoreDumpWithFormat(virDomainPtr dom, endjob: if ((ret == 0) && (flags & VIR_DUMP_CRASH)) { - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_CRASHED, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_CRASHED, + QEMU_ASYNC_JOB_DUMP, 0); virDomainAuditStop(vm, "crashed"); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, @@ -4070,7 +4073,8 @@ processGuestPanicEvent(virQEMUDriverPtr driver, /* fall through */ case VIR_DOMAIN_LIFECYCLE_CRASH_DESTROY: - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_CRASHED, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_CRASHED, + QEMU_ASYNC_JOB_DUMP, 0); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_CRASHED); @@ -4599,7 +4603,7 @@ processMonitorEOFEvent(virQEMUDriverPtr driver, event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, eventReason); - qemuProcessStop(driver, vm, stopReason, stopFlags); + qemuProcessStop(driver, vm, stopReason, QEMU_ASYNC_JOB_NONE, stopFlags); virDomainAuditStop(vm, auditReason); qemuDomainEventQueue(driver, event); @@ -6609,7 +6613,7 @@ qemuDomainSaveImageStartVM(virConnectPtr conn, } if (virCommandWait(cmd, NULL) < 0) { - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, asyncJob, 0); restored = false; } VIR_DEBUG("Decompression binary stderr: %s", NULLSTR(errbuf)); @@ -13626,7 +13630,8 @@ qemuDomainSnapshotCreateActiveInternal(virConnectPtr conn, if (flags & VIR_DOMAIN_SNAPSHOT_CREATE_HALT) { event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT); - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, + QEMU_ASYNC_JOB_SNAPSHOT, 0); virDomainAuditStop(vm, "from-snapshot"); resume = false; } @@ -14468,7 +14473,8 @@ qemuDomainSnapshotCreateActiveExternal(virConnectPtr conn, if (flags & VIR_DOMAIN_SNAPSHOT_CREATE_HALT) { event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT); - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, + QEMU_ASYNC_JOB_SNAPSHOT, 0); virDomainAuditStop(vm, "from-snapshot"); resume = false; thaw = 0; @@ -15328,7 +15334,8 @@ qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, } virResetError(err); qemuProcessStop(driver, vm, - VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, 0); + VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, + QEMU_ASYNC_JOB_START, 0); virDomainAuditStop(vm, "from-snapshot"); detail = VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT; event = virDomainEventLifecycleNewFromObj(vm, @@ -15448,7 +15455,8 @@ qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot, if (virDomainObjIsActive(vm)) { /* Transitions 4, 7 */ - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, 0); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, + QEMU_ASYNC_JOB_START, 0); virDomainAuditStop(vm, "from-snapshot"); detail = VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT; event = virDomainEventLifecycleNewFromObj(vm, diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 72d7144..21804f6 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -3594,7 +3594,8 @@ qemuMigrationPrepareAny(virQEMUDriverPtr driver, if (!relabel) stopFlags |= VIR_QEMU_PROCESS_STOP_NO_RELABEL; virDomainAuditStart(vm, "migrated", false); - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, stopFlags); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN, stopFlags); } qemuMigrationJobFinish(driver, vm); @@ -3903,6 +3904,7 @@ qemuMigrationConfirmPhase(virQEMUDriverPtr driver, qemuMigrationWaitForSpice(vm); qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_MIGRATED, + QEMU_ASYNC_JOB_MIGRATION_OUT, VIR_QEMU_PROCESS_STOP_MIGRATED); virDomainAuditStop(vm, "migrated"); @@ -5414,6 +5416,7 @@ qemuMigrationPerformJob(virQEMUDriverPtr driver, */ if (!v3proto) { qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_MIGRATED, + QEMU_ASYNC_JOB_MIGRATION_OUT, VIR_QEMU_PROCESS_STOP_MIGRATED); virDomainAuditStop(vm, "migrated"); event = virDomainEventLifecycleNewFromObj(vm, @@ -5896,6 +5899,7 @@ qemuMigrationFinish(virQEMUDriverPtr driver, !(flags & VIR_MIGRATE_OFFLINE) && virDomainObjIsActive(vm)) { qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, + QEMU_ASYNC_JOB_MIGRATION_IN, VIR_QEMU_PROCESS_STOP_MIGRATED); virDomainAuditStop(vm, "failed"); event = virDomainEventLifecycleNewFromObj(vm, diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 19ecf59..9d6afc4 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -3526,7 +3526,11 @@ qemuProcessReconnect(void *opaque) * really is and FAILED means "failed to start" */ state = VIR_DOMAIN_SHUTOFF_UNKNOWN; } - qemuProcessStop(driver, obj, state, stopFlags); + /* If BeginJob failed, we jumped here without a job, let's hope another + * thread didn't have a chance to start playing with the domain yet + * (it's all we can do anyway). + */ + qemuProcessStop(driver, obj, state, QEMU_ASYNC_JOB_NONE, stopFlags); } goto cleanup; } @@ -3564,8 +3568,13 @@ qemuProcessReconnectHelper(virDomainObjPtr obj, virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("Could not create thread. QEMU initialization " "might be incomplete")); - /* We can't spawn a thread and thus connect to monitor. Kill qemu. */ - qemuProcessStop(src->driver, obj, VIR_DOMAIN_SHUTOFF_FAILED, 0); + /* We can't spawn a thread and thus connect to monitor. Kill qemu. + * It's safe to call qemuProcessStop without a job here since there + * is no thread that could be doing anything else with the same domain + * object. + */ + qemuProcessStop(src->driver, obj, VIR_DOMAIN_SHUTOFF_FAILED, + QEMU_ASYNC_JOB_NONE, 0); qemuDomainRemoveInactive(src->driver, obj); virDomainObjEndAPI(&obj); @@ -4308,7 +4317,7 @@ qemuProcessStartValidate(virDomainDefPtr def, int qemuProcessInit(virQEMUDriverPtr driver, virDomainObjPtr vm, - qemuDomainAsyncJob asyncJob ATTRIBUTE_UNUSED, + qemuDomainAsyncJob asyncJob, bool migration, bool snap) { @@ -4381,7 +4390,7 @@ qemuProcessInit(virQEMUDriverPtr driver, stopFlags = VIR_QEMU_PROCESS_STOP_NO_RELABEL; if (migration) stopFlags |= VIR_QEMU_PROCESS_STOP_MIGRATED; - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, stopFlags); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, asyncJob, stopFlags); goto cleanup; } @@ -5298,7 +5307,7 @@ qemuProcessStart(virConnectPtr conn, stopFlags |= VIR_QEMU_PROCESS_STOP_MIGRATED; if (priv->mon) qemuMonitorSetDomainLog(priv->mon, NULL, NULL, NULL); - qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, stopFlags); + qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, asyncJob, stopFlags); goto cleanup; } @@ -5375,6 +5384,7 @@ qemuProcessBeginStopJob(virQEMUDriverPtr driver, void qemuProcessStop(virQEMUDriverPtr driver, virDomainObjPtr vm, virDomainShutoffReason reason, + qemuDomainAsyncJob asyncJob, unsigned int flags) { int ret; @@ -5389,27 +5399,34 @@ void qemuProcessStop(virQEMUDriverPtr driver, virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); qemuDomainLogContextPtr logCtxt = NULL; - VIR_DEBUG("Shutting down vm=%p name=%s id=%d pid=%llu flags=%x", + VIR_DEBUG("Shutting down vm=%p name=%s id=%d pid=%llu, " + "reason=%s, asyncJob=%s, flags=%x", vm, vm->def->name, vm->def->id, - (unsigned long long)vm->pid, flags); - - if (!virDomainObjIsActive(vm)) { - VIR_DEBUG("VM '%s' not active", vm->def->name); - virObjectUnref(cfg); - return; - } + (unsigned long long)vm->pid, + virDomainShutoffReasonTypeToString(reason), + qemuDomainAsyncJobTypeToString(asyncJob), + flags); /* This method is routinely used in clean up paths. Disable error * reporting so we don't squash a legit error. */ orig_err = virSaveLastError(); - /* - * We may unlock the vm in qemuProcessKill(), and another thread - * can lock the vm, and then call qemuProcessStop(). So we should - * set vm->def->id to -1 here to avoid qemuProcessStop() to be called twice. - */ + if (asyncJob != QEMU_ASYNC_JOB_NONE) { + if (qemuDomainObjBeginNestedJob(driver, vm, asyncJob) < 0) + goto cleanup; + } else if (priv->job.asyncJob != QEMU_ASYNC_JOB_NONE && + priv->job.asyncOwner == virThreadSelfID() && + priv->job.active != QEMU_JOB_ASYNC_NESTED) { + VIR_WARN("qemuProcessStop called without a nested job (async=%s)", + qemuDomainAsyncJobTypeToString(asyncJob)); + } + + if (!virDomainObjIsActive(vm)) { + VIR_DEBUG("VM '%s' not active", vm->def->name); + goto endjob; + } + vm->def->id = -1; - if (virAtomicIntDecAndTest(&driver->nactive) && driver->inhibitCallback) driver->inhibitCallback(false, driver->inhibitOpaque); @@ -5664,6 +5681,11 @@ void qemuProcessStop(virQEMUDriverPtr driver, vm->newDef = NULL; } + endjob: + if (asyncJob != QEMU_ASYNC_JOB_NONE) + qemuDomainObjEndJob(driver, vm); + + cleanup: if (orig_err) { virSetError(orig_err); virFreeError(orig_err); @@ -5946,13 +5968,13 @@ qemuProcessAutoDestroy(virDomainObjPtr dom, qemuDomainObjDiscardAsyncJob(driver, dom); } - if (qemuDomainObjBeginJob(driver, dom, - QEMU_JOB_DESTROY) < 0) - goto cleanup; - VIR_DEBUG("Killing domain"); - qemuProcessStop(driver, dom, VIR_DOMAIN_SHUTOFF_DESTROYED, stopFlags); + if (qemuProcessBeginStopJob(driver, dom, QEMU_JOB_DESTROY, true) < 0) + goto cleanup; + + qemuProcessStop(driver, dom, VIR_DOMAIN_SHUTOFF_DESTROYED, + QEMU_ASYNC_JOB_NONE, stopFlags); virDomainAuditStop(dom, "destroyed"); event = virDomainEventLifecycleNewFromObj(dom, diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index 69be99e..eb37863 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -121,6 +121,7 @@ int qemuProcessBeginStopJob(virQEMUDriverPtr driver, void qemuProcessStop(virQEMUDriverPtr driver, virDomainObjPtr vm, virDomainShutoffReason reason, + qemuDomainAsyncJob asyncJob, unsigned int flags); int qemuProcessAttach(virConnectPtr conn, -- 2.7.1 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list