The QEMU_MIGRATION_PHASE_POSTCOPY_FAILED phase marks a migration protocol as broken in a post-copy phase. Libvirt is no longer actively watching the migration in this phase because the migration API that started the migration failed. This may happen either when post-copy migration really fails (QEMU enters the postcopy-paused migration state) or when the migration still progresses between both QEMU processes, but libvirt lost control of it because the connection between libvirt daemons (in p2p migration) or between a daemon and a client (non-p2p migration) was closed, for example, when one of the daemons was restarted. Signed-off-by: Jiri Denemark <jdenemar@xxxxxxxxxx> --- src/qemu/qemu_migration.c | 15 +++++++++++---- src/qemu/qemu_process.c | 16 +++++++++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 3f6921b4b2..c111dd8686 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -2369,6 +2369,7 @@ qemuMigrationSrcCleanup(virDomainObj *vm, vm->def->name); if (virDomainObjIsPostcopy(vm, VIR_DOMAIN_JOB_OPERATION_MIGRATION_OUT)) { + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuMigrationSrcPostcopyFailed(vm); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); @@ -2380,8 +2381,10 @@ qemuMigrationSrcCleanup(virDomainObj *vm, } break; + case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_BEGIN_RESUME: case QEMU_MIGRATION_PHASE_PERFORM_RESUME: + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuMigrationSrcPostcopyFailed(vm); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); @@ -2402,7 +2405,6 @@ qemuMigrationSrcCleanup(virDomainObj *vm, case QEMU_MIGRATION_PHASE_PERFORM2: /* single phase outgoing migration; unreachable */ case QEMU_MIGRATION_PHASE_NONE: - case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_LAST: /* unreachable */ ; @@ -3774,6 
+3776,7 @@ qemuMigrationSrcConfirm(virQEMUDriver *driver, flags, cancelled); if (virDomainObjIsFailedPostcopy(vm)) { + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); } else { @@ -5607,6 +5610,7 @@ qemuMigrationSrcPerformJob(virQEMUDriver *driver, virErrorPreserveLast(&orig_err); if (virDomainObjIsFailedPostcopy(vm)) { + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); } else { @@ -5699,6 +5703,8 @@ qemuMigrationSrcPerformPhase(virQEMUDriver *driver, jobPriv->migParams, priv->job.apiFlags); qemuMigrationJobFinish(vm); } else { + if (ret < 0) + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); qemuMigrationJobContinue(vm); } @@ -5938,7 +5944,7 @@ qemuMigrationDstComplete(virQEMUDriver *driver, /* Guest is successfully running, so cancel previous auto destroy. There's * nothing to remove when we are resuming post-copy migration. 
*/ - if (!virDomainObjIsFailedPostcopy(vm)) + if (job->phase < QEMU_MIGRATION_PHASE_POSTCOPY_FAILED) qemuProcessAutoDestroyRemove(driver, vm); /* Remove completed stats for post-copy, everything but timing fields @@ -6205,6 +6211,7 @@ qemuMigrationDstFinishActive(virQEMUDriver *driver, } if (virDomainObjIsFailedPostcopy(vm)) { + ignore_value(qemuMigrationJobSetPhase(vm, QEMU_MIGRATION_PHASE_POSTCOPY_FAILED)); qemuProcessAutoDestroyRemove(driver, vm); qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob); *finishJob = false; @@ -6327,9 +6334,9 @@ qemuMigrationProcessUnattended(virQEMUDriver *driver, vm->def->name); if (job == VIR_ASYNC_JOB_MIGRATION_IN) - phase = QEMU_MIGRATION_PHASE_FINISH3; + phase = QEMU_MIGRATION_PHASE_FINISH_RESUME; else - phase = QEMU_MIGRATION_PHASE_CONFIRM3; + phase = QEMU_MIGRATION_PHASE_CONFIRM_RESUME; if (qemuMigrationJobStartPhase(vm, phase) < 0) return; diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index c7ed0a5c56..f42c9a3018 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -1624,7 +1624,8 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED, * watching it in any thread. Let's make sure the migration is properly * finished in case we get a "completed" event. 
*/ - if (virDomainObjIsFailedPostcopy(vm) && + if (virDomainObjIsPostcopy(vm, priv->job.current->operation) && + priv->job.phase == QEMU_MIGRATION_PHASE_POSTCOPY_FAILED && priv->job.asyncOwner == 0 && status == QEMU_MONITOR_MIGRATION_STATUS_COMPLETED) { struct qemuProcessEvent *proc = g_new0(struct qemuProcessEvent, 1); @@ -3566,7 +3567,6 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver, case QEMU_MIGRATION_PHASE_PERFORM3_DONE: case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED: case QEMU_MIGRATION_PHASE_CONFIRM3: - case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_BEGIN_RESUME: case QEMU_MIGRATION_PHASE_PERFORM_RESUME: case QEMU_MIGRATION_PHASE_CONFIRM_RESUME: @@ -3604,6 +3604,7 @@ qemuProcessRecoverMigrationIn(virQEMUDriver *driver, } break; + case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_PREPARE_RESUME: case QEMU_MIGRATION_PHASE_FINISH_RESUME: return 1; @@ -3639,7 +3640,6 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver, case QEMU_MIGRATION_PHASE_PREPARE: case QEMU_MIGRATION_PHASE_FINISH2: case QEMU_MIGRATION_PHASE_FINISH3: - case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_PREPARE_RESUME: case QEMU_MIGRATION_PHASE_FINISH_RESUME: case QEMU_MIGRATION_PHASE_LAST: @@ -3700,6 +3700,7 @@ qemuProcessRecoverMigrationOut(virQEMUDriver *driver, } return 1; + case QEMU_MIGRATION_PHASE_POSTCOPY_FAILED: case QEMU_MIGRATION_PHASE_BEGIN_RESUME: case QEMU_MIGRATION_PHASE_PERFORM_RESUME: return 1; @@ -3751,9 +3752,18 @@ qemuProcessRecoverMigration(virQEMUDriver *driver, return -1; if (rc > 0) { + job->phase = QEMU_MIGRATION_PHASE_POSTCOPY_FAILED; + if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) { VIR_DEBUG("Post-copy migration of domain %s still running, it " "will be handled as unattended", vm->def->name); + + if (state == VIR_DOMAIN_RUNNING) + reason = VIR_DOMAIN_RUNNING_POSTCOPY; + else + reason = VIR_DOMAIN_PAUSED_POSTCOPY; + + virDomainObjSetState(vm, state, reason); qemuProcessRestoreMigrationJob(vm, 
job); return 0; } -- 2.35.1