When post-copy migration fails, the domain stays running on the destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the state and the reason can later be rewritten in case the domain gets paused for other reasons (such as an I/O error). Thus we need a separate place to remember that the post-copy migration failed, so that we are still able to resume the migration. https://bugzilla.redhat.com/show_bug.cgi?id=2111948 Signed-off-by: Jiri Denemark <jdenemar@xxxxxxxxxx> --- src/conf/domain_conf.c | 7 ++++++- src/conf/virdomainjob.c | 1 + src/conf/virdomainjob.h | 1 + src/qemu/qemu_domainjob.c | 9 +++++++++ src/qemu/qemu_migration.c | 34 +++++++++++++++++++++++----------- src/qemu/qemu_process.c | 15 +++++++++++++++ 6 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c index 9e2eea79e7..f83586c549 100644 --- a/src/conf/domain_conf.c +++ b/src/conf/domain_conf.c @@ -27874,8 +27874,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason) bool virDomainObjIsFailedPostcopy(virDomainObj *dom, - virDomainJobObj *job G_GNUC_UNUSED) + virDomainJobObj *job) { + if (job && job->asyncPaused && + (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN || + job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT)) + return true; + return ((dom->state.state == VIR_DOMAIN_PAUSED && dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) || (dom->state.state == VIR_DOMAIN_RUNNING && diff --git a/src/conf/virdomainjob.c b/src/conf/virdomainjob.c index 256b665a42..c4cbbe8f6d 100644 --- a/src/conf/virdomainjob.c +++ b/src/conf/virdomainjob.c @@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job) job->asyncOwner = 0; g_clear_pointer(&job->asyncOwnerAPI, g_free); job->asyncStarted = 0; + job->asyncPaused = false; job->phase = 0; job->mask = VIR_JOB_DEFAULT_MASK; job->abortJob = false; diff --git a/src/conf/virdomainjob.h b/src/conf/virdomainjob.h index b1ac36a2fa..0d62bab287 100644 --- a/src/conf/virdomainjob.h +++ b/src/conf/virdomainjob.h @@ 
-176,6 +176,7 @@ struct _virDomainJobObj { unsigned long long asyncOwner; /* Thread which set current async job */ char *asyncOwnerAPI; /* The API which owns the async job */ unsigned long long asyncStarted; /* When the current async job started */ + bool asyncPaused; /* The async job is paused */ int phase; /* Job phase (mainly for migrations) */ unsigned long long mask; /* Jobs allowed during async job */ virDomainJobData *current; /* async job progress data */ diff --git a/src/qemu/qemu_domainjob.c b/src/qemu/qemu_domainjob.c index 8d958b9d21..27beb5229f 100644 --- a/src/qemu/qemu_domainjob.c +++ b/src/qemu/qemu_domainjob.c @@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf, if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) { virBufferAsprintf(&attrBuf, " flags='0x%x'", vm->job->apiFlags); virBufferAsprintf(&attrBuf, " asyncStarted='%llu'", vm->job->asyncStarted); + if (vm->job->asyncPaused) + virBufferAddLit(&attrBuf, " asyncPaused='yes'"); } if (vm->job->cb && @@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm, if ((tmp = virXPathString("string(@async)", ctxt))) { int async; + virTristateBool paused; if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) { virReportError(VIR_ERR_INTERNAL_ERROR, @@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm, _("Invalid async job start")); return -1; } + + if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE, + &paused) < 0) + return -1; + + vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES; } if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE, diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 27a74795d6..f258e7d700 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1666,17 +1666,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm) state = virDomainObjGetState(vm, &reason); - VIR_DEBUG("%s/%s", + VIR_DEBUG("%s/%s, asyncPaused=%u", virDomainStateTypeToString(state), - 
virDomainStateReasonToString(state, reason)); + virDomainStateReasonToString(state, reason), + vm->job->asyncPaused); if (state != VIR_DOMAIN_PAUSED || - reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) + virDomainObjIsFailedPostcopy(vm, vm->job)) return; VIR_WARN("Migration of domain %s failed during post-copy; " "leaving the domain paused", vm->def->name); + vm->job->asyncPaused = true; virDomainObjSetState(vm, VIR_DOMAIN_PAUSED, VIR_DOMAIN_PAUSED_POSTCOPY_FAILED); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, @@ -1696,21 +1698,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm) state = virDomainObjGetState(vm, &reason); - VIR_DEBUG("%s/%s", + VIR_DEBUG("%s/%s, asyncPaused=%u", virDomainStateTypeToString(state), - virDomainStateReasonToString(state, reason)); + virDomainStateReasonToString(state, reason), + vm->job->asyncPaused); - if (state != VIR_DOMAIN_RUNNING || - reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED) + if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) || + virDomainObjIsFailedPostcopy(vm, vm->job)) return; VIR_WARN("Incoming migration of domain '%s' failed during post-copy; " "leaving the domain running", vm->def->name); - virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, - VIR_DOMAIN_RUNNING_POSTCOPY_FAILED); - event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, - VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED); + vm->job->asyncPaused = true; + if (state == VIR_DOMAIN_RUNNING) { + virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, + VIR_DOMAIN_RUNNING_POSTCOPY_FAILED); + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, + VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED); + } else { + /* The domain was paused for other reasons (I/O error, ...) so we don't + * want to rewrite the original reason and just emit a postcopy-failed + * event. 
*/ + event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED, + VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED); + } virObjectEventStateQueue(driver->domainEventState, event); } diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index 6091c9f1a9..017a05d57e 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED, vm->def->name, virDomainRunningReasonTypeToString(reason), eventDetail); + /* When a domain is running in (failed) post-copy migration on the + * destination host, we need to make sure to set the appropriate reason + * here. */ + if (virDomainObjIsPostcopy(vm, vm->job)) { + if (virDomainObjIsFailedPostcopy(vm, vm->job)) + reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED; + else + reason = VIR_DOMAIN_RUNNING_POSTCOPY; + } virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason); event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED, @@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, vm->def->name, virDomainStateTypeToString(state), NULLSTR(virDomainStateReasonToString(state, reason))); + vm->job->asyncPaused = false; virDomainObjSetState(vm, state, reason); event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail); qemuDomainSaveStatus(vm); @@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm, job->privateData = g_steal_pointer(&vm->job->privateData); vm->job->privateData = jobPriv; vm->job->apiFlags = job->apiFlags; + vm->job->asyncPaused = job->asyncPaused; qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); } @@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver, if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) { VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended", vm->def->name); + vm->job->asyncPaused = false; return 0; } @@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver, 
qemuMigrationSrcPostcopyFailed(vm); else qemuMigrationDstPostcopyFailed(vm); + /* Set the asyncPaused flag in case we're reconnecting to a domain + * started by an older libvirt. */ + vm->job->asyncPaused = true; return 0; } -- 2.39.0