When post-copy migration fails, we can't just abort the migration and resume the domain on the source host, because the domain is already running on the destination host and neither host has the complete state of the domain memory. The current approach just marks the domain on both ends as paused/running with a "post-copy failed" sub-state. Instead, we will keep the migration job active (even though the migration API will return failure) so that the state is more visible, we can better control which APIs can be called on the domains, and we can even allow resuming the migration. Signed-off-by: Jiri Denemark <jdenemar@xxxxxxxxxx> Reviewed-by: Peter Krempa <pkrempa@xxxxxxxxxx> Reviewed-by: Pavel Hrdina <phrdina@xxxxxxxxxx> --- Notes: Version 2: - squashed in a one-line change from "qemu: Implement VIR_MIGRATE_POSTCOPY_RESUME for Confirm phase" src/qemu/qemu_migration.c | 95 ++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 26 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index fee3e8826b..0daf50d836 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -2221,10 +2221,17 @@ qemuMigrationSrcCleanup(virDomainObj *vm, VIR_WARN("Migration of domain %s finished but we don't know if the" " domain was successfully started on destination or not", vm->def->name); - qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, - jobPriv->migParams, priv->job.apiFlags); - /* clear the job and let higher levels decide what to do */ - qemuMigrationJobFinish(vm); + + if (virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT)) { + qemuMigrationSrcPostcopyFailed(vm); + qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); + qemuMigrationJobContinue(vm); + } else { + qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, + jobPriv->migParams, priv->job.apiFlags); + /* clear the job and let higher levels decide what to do */ + qemuMigrationJobFinish(vm); + } break; case 
QEMU_MIGRATION_PHASE_PERFORM3: @@ -3400,6 +3407,7 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, qemuDomainObjPrivate *priv = vm->privateData; qemuDomainJobPrivate *jobPriv = priv->job.privateData; virDomainJobData *jobData = NULL; + qemuMigrationJobPhase phase; VIR_DEBUG("driver=%p, vm=%p, cookiein=%s, cookieinlen=%d, " "flags=0x%x, retcode=%d", @@ -3408,10 +3416,17 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, virCheckFlags(QEMU_MIGRATION_FLAGS, -1); - qemuMigrationJobSetPhase(vm, - retcode == 0 - ? QEMU_MIGRATION_PHASE_CONFIRM3 - : QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED); + /* Keep the original migration phase in case post-copy failed as the job + * will stay active even though migration API finishes with an error. + */ + if (virDomainObjIsFailedPostcopy(vm)) + phase = priv->job.phase; + else if (retcode == 0) + phase = QEMU_MIGRATION_PHASE_CONFIRM3; + else + phase = QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED; + + qemuMigrationJobSetPhase(vm, phase); if (!(mig = qemuMigrationCookieParse(driver, vm->def, priv->origname, priv, cookiein, cookieinlen, @@ -3480,13 +3495,14 @@ qemuMigrationSrcConfirmPhase(virQEMUDriver *driver, virErrorRestore(&orig_err); if (virDomainObjGetState(vm, &reason) == VIR_DOMAIN_PAUSED && - reason == VIR_DOMAIN_PAUSED_POSTCOPY) + reason == VIR_DOMAIN_PAUSED_POSTCOPY) { qemuMigrationSrcPostcopyFailed(vm); - else + } else if (!virDomainObjIsFailedPostcopy(vm)) { qemuMigrationSrcRestoreDomainState(driver, vm); - qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, - jobPriv->migParams, priv->job.apiFlags); + qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, + jobPriv->migParams, priv->job.apiFlags); + } qemuDomainSaveStatus(vm); } @@ -3504,12 +3520,18 @@ qemuMigrationSrcConfirm(virQEMUDriver *driver, { qemuMigrationJobPhase phase; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); + qemuDomainObjPrivate *priv = vm->privateData; int ret = -1; if (!qemuMigrationJobIsActive(vm, 
VIR_ASYNC_JOB_MIGRATION_OUT)) goto cleanup; - if (cancelled) + /* Keep the original migration phase in case post-copy failed as the job + * will stay active even though migration API finishes with an error. + */ + if (virDomainObjIsFailedPostcopy(vm)) + phase = priv->job.phase; + else if (cancelled) phase = QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED; else phase = QEMU_MIGRATION_PHASE_CONFIRM3; @@ -3517,12 +3539,19 @@ qemuMigrationSrcConfirm(virQEMUDriver *driver, qemuMigrationJobStartPhase(vm, phase); virCloseCallbacksUnset(driver->closeCallbacks, vm, qemuMigrationSrcCleanup); + qemuDomainCleanupRemove(vm, qemuProcessCleanupMigrationJob); ret = qemuMigrationSrcConfirmPhase(driver, vm, cookiein, cookieinlen, flags, cancelled); - qemuMigrationJobFinish(vm); + if (virDomainObjIsFailedPostcopy(vm)) { + qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); + qemuMigrationJobContinue(vm); + } else { + qemuMigrationJobFinish(vm); + } + if (!virDomainObjIsActive(vm)) { if (!cancelled && ret == 0 && flags & VIR_MIGRATE_UNDEFINE_SOURCE) { virDomainDeleteConfig(cfg->configDir, cfg->autostartDir, vm); @@ -5334,16 +5363,22 @@ qemuMigrationSrcPerformJob(virQEMUDriver *driver, if (ret < 0) virErrorPreserveLast(&orig_err); - /* v2 proto has no confirm phase so we need to reset migration parameters - * here - */ - if (!v3proto && ret < 0) - qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, - jobPriv->migParams, priv->job.apiFlags); + if (virDomainObjIsFailedPostcopy(vm)) { + qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); + qemuMigrationJobContinue(vm); + } else { + /* v2 proto has no confirm phase so we need to reset migration parameters + * here + */ + if (!v3proto && ret < 0) + qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, + jobPriv->migParams, priv->job.apiFlags); - qemuMigrationSrcRestoreDomainState(driver, vm); + qemuMigrationSrcRestoreDomainState(driver, vm); + + qemuMigrationJobFinish(vm); + } - qemuMigrationJobFinish(vm); if 
(!virDomainObjIsActive(vm) && ret == 0) { if (flags & VIR_MIGRATE_UNDEFINE_SOURCE) { virDomainDeleteConfig(cfg->configDir, cfg->autostartDir, vm); @@ -5414,11 +5449,12 @@ qemuMigrationSrcPerformPhase(virQEMUDriver *driver, goto endjob; endjob: - if (ret < 0) { + if (ret < 0 && !virDomainObjIsFailedPostcopy(vm)) { qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_OUT, jobPriv->migParams, priv->job.apiFlags); qemuMigrationJobFinish(vm); } else { + qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); } @@ -5879,10 +5915,17 @@ qemuMigrationDstFinish(virQEMUDriver *driver, g_clear_pointer(&priv->job.completed, virDomainJobDataFree); } - qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_IN, - jobPriv->migParams, priv->job.apiFlags); + if (virDomainObjIsFailedPostcopy(vm)) { + qemuProcessAutoDestroyRemove(driver, vm); + qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); + qemuMigrationJobContinue(vm); + } else { + qemuMigrationParamsReset(driver, vm, VIR_ASYNC_JOB_MIGRATION_IN, + jobPriv->migParams, priv->job.apiFlags); + + qemuMigrationJobFinish(vm); + } - qemuMigrationJobFinish(vm); if (!virDomainObjIsActive(vm)) qemuDomainRemoveInactive(driver, vm); -- 2.35.1