Make MIGRATION_OUT use the new helper methods. This also introduces new protection to migration v3 process: the migration job is held from Begin to Confirm to avoid changes to a domain during migration (esp. between Begin and Perform phases). This change, however brings a small issue when Prepare step fails during normal (not p2p or tunneled) migration when old client talks to new libvirtd. The old client doesn't call Confirm (with cancelled == 1) to end the migration job. I'm open to suggestions how to solve this in the best way. My idea is to use flags for Begin3 and Perform3 that would enable this extra protection by moving migration job start from Perform to Begin phase while maintaining the old behavior if the flags are not set. Note that virDomainAbortJob will be able to cancel migration in any phase (i.e., not only in Perform phase). --- src/libvirt.c | 11 ++- src/qemu/qemu_driver.c | 47 ++++++++- src/qemu/qemu_migration.c | 232 +++++++++++++++++++++++++++++++-------------- 3 files changed, 211 insertions(+), 79 deletions(-) diff --git a/src/libvirt.c b/src/libvirt.c index e00c64f..fa32d83 100644 --- a/src/libvirt.c +++ b/src/libvirt.c @@ -3761,7 +3761,7 @@ virDomainMigrateVersion3(virDomainPtr domain, int ret; virDomainInfo info; virErrorPtr orig_err = NULL; - int cancelled; + int cancelled = 1; VIR_DOMAIN_DEBUG(domain, "dconn=%p xmlin=%s, flags=%lu, " "dname=%s, uri=%s, bandwidth=%lu", dconn, NULLSTR(xmlin), flags, @@ -3798,14 +3798,16 @@ virDomainMigrateVersion3(virDomainPtr domain, (dconn, cookiein, cookieinlen, &cookieout, &cookieoutlen, uri, &uri_out, flags, dname, bandwidth, dom_xml); VIR_FREE (dom_xml); - if (ret == -1) - goto done; + if (ret == -1) { + /* Make sure Confirm doesn't overwrite the error */ + orig_err = virSaveLastError(); + goto confirm; + } if (uri == NULL && uri_out == NULL) { virLibConnError(VIR_ERR_INTERNAL_ERROR, _("domainMigratePrepare3 did not set uri")); virDispatchError(domain->conn); - cancelled = 1; goto finish; } if (uri_out) @@ -3871,6 +3873,7 @@ finish: if (!orig_err) orig_err = virSaveLastError(); +confirm: /* * If cancelled, then src VM will be restarted, else * it will be killed diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 9dcb248..e317c5b 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -6850,12 +6850,42 @@ qemuDomainMigrateBegin3(virDomainPtr domain, goto cleanup; } - xml = qemuMigrationBegin(driver, vm, xmlin, - cookieout, cookieoutlen); + if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + goto cleanup; + + if (!virDomainObjIsActive(vm)) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("domain is not running")); + goto endjob; + } + + if (!(xml = qemuMigrationBegin(driver, vm, xmlin, + cookieout, cookieoutlen))) + goto endjob; + + /* We keep the job active across API calls until the confirm() call. + * This prevents any other APIs being invoked while migration is taking + * place. + */ + if (qemuMigrationJobContinue(vm) == 0) { + vm = NULL; + qemuReportError(VIR_ERR_OPERATION_FAILED, + "%s", _("domain disappeared")); + VIR_FREE(xml); + if (cookieout) + VIR_FREE(*cookieout); + } cleanup: + if (vm) + virDomainObjUnlock(vm); qemuDriverUnlock(driver); return xml; + +endjob: + if (qemuMigrationJobFinish(driver, vm) == 0) + vm = NULL; + goto cleanup; } static int @@ -7005,7 +7035,6 @@ qemuDomainMigratePerform3(virDomainPtr dom, dconnuri, uri, cookiein, cookieinlen, cookieout, cookieoutlen, flags, dname, resource, true); - cleanup: qemuDriverUnlock(driver); return ret; @@ -7065,6 +7094,7 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, struct qemud_driver *driver = domain->conn->privateData; virDomainObjPtr vm; int ret = -1; + enum qemuMigrationJobPhase phase; virCheckFlags(VIR_MIGRATE_LIVE | VIR_MIGRATE_PEER2PEER | @@ -7085,14 +7115,21 @@ qemuDomainMigrateConfirm3(virDomainPtr domain, goto cleanup; } - if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) + if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_OUT)) goto cleanup; + if (cancelled) + phase = QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED; + else + phase = QEMU_MIGRATION_PHASE_CONFIRM3; + + qemuMigrationJobStartPhase(driver, vm, phase); + ret = qemuMigrationConfirm(driver, domain->conn, vm, cookiein, cookieinlen, flags, cancelled); - if (qemuDomainObjEndJob(driver, vm) == 0) { + if (qemuMigrationJobFinish(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 5e31b7d..bcca454 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -1009,6 +1009,7 @@ qemuDomainMigrateGraphicsRelocate(struct qemud_driver *driver, } +/* The caller is supposed to lock the vm and start a migration job. */ char *qemuMigrationBegin(struct qemud_driver *driver, virDomainObjPtr vm, const char *xmlin, @@ -1021,11 +1022,7 @@ char *qemuMigrationBegin(struct qemud_driver *driver, VIR_DEBUG("driver=%p, vm=%p, xmlin=%s, cookieout=%p, cookieoutlen=%p", driver, vm, NULLSTR(xmlin), cookieout, cookieoutlen); - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("domain is not running")); - goto cleanup; - } + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_BEGIN3); if (qemuProcessAutoDestroyActive(driver, vm)) { qemuReportError(VIR_ERR_OPERATION_INVALID, @@ -1062,7 +1059,6 @@ char *qemuMigrationBegin(struct qemud_driver *driver, } cleanup: - virDomainObjUnlock(vm); qemuMigrationCookieFree(mig); virDomainDefFree(def); return rv; @@ -1902,6 +1898,7 @@ static int doPeer2PeerMigrate2(struct qemud_driver *driver, * until the migration is complete. */ VIR_DEBUG("Perform %p", sconn); + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM2); if (flags & VIR_MIGRATE_TUNNELLED) ret = doTunnelMigrate(driver, vm, st, NULL, 0, NULL, NULL, @@ -2036,6 +2033,7 @@ static int doPeer2PeerMigrate3(struct qemud_driver *driver, * confirm migration completion. */ VIR_DEBUG("Perform3 %p uri=%s uri_out=%s", sconn, uri, uri_out); + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3); VIR_FREE(cookiein); cookiein = cookieout; cookieinlen = cookieoutlen; @@ -2053,8 +2051,12 @@ static int doPeer2PeerMigrate3(struct qemud_driver *driver, flags, dname, resource); /* Perform failed. Make sure Finish doesn't overwrite the error */ - if (ret < 0) + if (ret < 0) { orig_err = virSaveLastError(); + } else { + qemuMigrationJobSetPhase(driver, vm, + QEMU_MIGRATION_PHASE_PERFORM3_DONE); + } /* If Perform returns < 0, then we need to cancel the VM * startup on the destination @@ -2211,35 +2213,32 @@ cleanup: } -int qemuMigrationPerform(struct qemud_driver *driver, - virConnectPtr conn, - virDomainObjPtr vm, - const char *xmlin, - const char *dconnuri, - const char *uri, - const char *cookiein, - int cookieinlen, - char **cookieout, - int *cookieoutlen, - unsigned long flags, - const char *dname, - unsigned long resource, - bool v3proto) +/* + * This implements perform part of the migration protocol when migration job + * does not need to be active across several APIs, i.e., peer2peer migration or + * perform phase of v2 non-peer2peer migration. + */ +static int +qemuMigrationPerformJob(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *xmlin, + const char *dconnuri, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource, + bool v3proto) { virDomainEventPtr event = NULL; int ret = -1; int resume = 0; - qemuDomainObjPrivatePtr priv = vm->privateData; - VIR_DEBUG("driver=%p, conn=%p, vm=%p, xmlin=%s, dconnuri=%s, " - "uri=%s, cookiein=%s, cookieinlen=%d, cookieout=%p, " - "cookieoutlen=%p, flags=%lu, dname=%s, resource=%lu, v3proto=%d", - driver, conn, vm, NULLSTR(xmlin), NULLSTR(dconnuri), - NULLSTR(uri), NULLSTR(cookiein), cookieinlen, - cookieout, cookieoutlen, flags, NULLSTR(dname), - resource, v3proto); - - if (qemuDomainObjBeginAsyncJobWithDriver(driver, vm, - QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) + + if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0) goto cleanup; if (!virDomainObjIsActive(vm)) { @@ -2254,52 +2253,33 @@ int qemuMigrationPerform(struct qemud_driver *driver, goto endjob; } - memset(&priv->job.info, 0, sizeof(priv->job.info)); - priv->job.info.type = VIR_DOMAIN_JOB_UNBOUNDED; - resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; if ((flags & (VIR_MIGRATE_TUNNELLED | VIR_MIGRATE_PEER2PEER))) { - if (cookieinlen) { - qemuReportError(VIR_ERR_OPERATION_INVALID, - "%s", _("received unexpected cookie with P2P migration")); - goto endjob; - } - - if (doPeer2PeerMigrate(driver, conn, vm, xmlin, - dconnuri, uri, flags, dname, - resource, &v3proto) < 0) - /* doPeer2PeerMigrate already set the error, so just get out */ - goto endjob; + ret = doPeer2PeerMigrate(driver, conn, vm, xmlin, + dconnuri, uri, flags, dname, + resource, &v3proto); } else { - if (dconnuri) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, - "%s", _("Unexpected dconnuri parameter with non-peer2peer migration")); - goto endjob; - } - if (doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, - cookieout, cookieoutlen, - flags, dname, resource) < 0) - goto endjob; + qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM2); + ret = doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); } + if (ret < 0) + goto endjob; /* * In v3 protocol, the source VM is not killed off until the * confirm step. */ - if (v3proto) { - resume = 0; - } else { + if (!v3proto) { qemuProcessStop(driver, vm, 1, VIR_DOMAIN_SHUTOFF_MIGRATED); qemuAuditDomainStop(vm, "migrated"); - resume = 0; - event = virDomainEventNewFromObj(vm, VIR_DOMAIN_EVENT_STOPPED, VIR_DOMAIN_EVENT_STOPPED_MIGRATED); } - - ret = 0; + resume = 0; endjob: if (resume && virDomainObjGetState(vm, NULL) == VIR_DOMAIN_PAUSED) { @@ -2319,10 +2299,11 @@ endjob: VIR_DOMAIN_EVENT_RESUMED_MIGRATED); } if (vm) { - if (qemuDomainObjEndAsyncJob(driver, vm) == 0) { + if (qemuMigrationJobFinish(driver, vm) == 0) { vm = NULL; } else if (!virDomainObjIsActive(vm) && - (!vm->persistent || (flags & VIR_MIGRATE_UNDEFINE_SOURCE))) { + (!vm->persistent || + (ret == 0 && (flags & VIR_MIGRATE_UNDEFINE_SOURCE)))) { if (flags & VIR_MIGRATE_UNDEFINE_SOURCE) virDomainDeleteConfig(driver->configDir, driver->autostartDir, vm); virDomainRemoveInactive(&driver->domains, vm); @@ -2338,6 +2319,118 @@ cleanup: return ret; } +/* + * This implements perform phase of v3 migration protocol. + */ +static int +qemuMigrationPerformPhase(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource) +{ + virDomainEventPtr event = NULL; + int ret = -1; + bool resume; + + qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3); + + resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING; + ret = doNativeMigrate(driver, vm, uri, cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); + + if (ret < 0 && resume && + virDomainObjGetState(vm, NULL) == VIR_DOMAIN_PAUSED) { + /* we got here through some sort of failure; start the domain again */ + if (qemuProcessStartCPUs(driver, vm, conn, + VIR_DOMAIN_RUNNING_MIGRATION_CANCELED) < 0) { + /* Hm, we already know we are in error here. We don't want to + * overwrite the previous error, though, so we just throw something + * to the logs and hope for the best + */ + VIR_ERROR(_("Failed to resume guest %s after failure"), + vm->def->name); + } + + event = virDomainEventNewFromObj(vm, + VIR_DOMAIN_EVENT_RESUMED, + VIR_DOMAIN_EVENT_RESUMED_MIGRATED); + } + if (ret == 0) { + qemuMigrationJobSetPhase(driver, vm, + QEMU_MIGRATION_PHASE_PERFORM3_DONE); + } + + if (vm) { + if (qemuMigrationJobContinue(vm) == 0) { + vm = NULL; + } else if (!virDomainObjIsActive(vm) && !vm->persistent) { + virDomainRemoveInactive(&driver->domains, vm); + vm = NULL; + } + } + + if (vm) + virDomainObjUnlock(vm); + if (event) + qemuDomainEventQueue(driver, event); + return ret; +} + +int +qemuMigrationPerform(struct qemud_driver *driver, + virConnectPtr conn, + virDomainObjPtr vm, + const char *xmlin, + const char *dconnuri, + const char *uri, + const char *cookiein, + int cookieinlen, + char **cookieout, + int *cookieoutlen, + unsigned long flags, + const char *dname, + unsigned long resource, + bool v3proto) +{ + if ((flags & (VIR_MIGRATE_TUNNELLED | VIR_MIGRATE_PEER2PEER))) { + if (cookieinlen) { + qemuReportError(VIR_ERR_OPERATION_INVALID, + "%s", _("received unexpected cookie with P2P migration")); + return -1; + } + + return qemuMigrationPerformJob(driver, conn, vm, xmlin, dconnuri, uri, + cookiein, cookieinlen, cookieout, + cookieoutlen, flags, dname, resource, + v3proto); + } else { + if (dconnuri) { + qemuReportError(VIR_ERR_INTERNAL_ERROR, + "%s", _("Unexpected dconnuri parameter with non-peer2peer migration")); + return -1; + } + + if (v3proto) { + return qemuMigrationPerformPhase(driver, conn, vm, uri, + cookiein, cookieinlen, + cookieout, cookieoutlen, + flags, dname, resource); + } else { + return qemuMigrationPerformJob(driver, conn, vm, xmlin, dconnuri, + uri, cookiein, cookieinlen, + cookieout, cookieoutlen, flags, + dname, resource, v3proto); + } + } +} #if WITH_MACVTAP static void @@ -2559,15 +2652,14 @@ int qemuMigrationConfirm(struct qemud_driver *driver, driver, conn, vm, NULLSTR(cookiein), cookieinlen, flags, retcode); + qemuMigrationJobSetPhase(driver, vm, + retcode == 0 + ? QEMU_MIGRATION_PHASE_CONFIRM3 + : QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED); + if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0))) return -1; - if (!virDomainObjIsActive(vm)) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("guest unexpectedly quit")); - goto cleanup; - } - /* Did the migration go as planned? If yes, kill off the * domain object, but if no, resume CPUs */ -- 1.7.6 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list