On 07.11.2012 23:23, Jiri Denemark wrote:
> On Wed, Nov 07, 2012 at 12:03:39 +0100, Michal Privoznik wrote:
>> Currently, if user calls virDomainAbortJob we just issue
>> 'migrate_cancel' and hope for the best. However, if user calls
>> the API in wrong phase when migration hasn't been started yet
>> (perform phase) the cancel request is just ignored. With this
>> patch, the request is remembered and as soon as perform phase
>> starts, migration is cancelled.
>> ---
>>  src/qemu/qemu_domain.c    | 26 ++++++++++++++++++++++++++
>>  src/qemu/qemu_domain.h    |  4 ++++
>>  src/qemu/qemu_driver.c    | 31 +++++++++++++++++++++++++++----
>>  src/qemu/qemu_migration.c | 43 +++++++++++++++++++++++++++++++++++++++++--
>>  4 files changed, 98 insertions(+), 6 deletions(-)
>>
>> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
>> index a5592b9..031be5f 100644
>> --- a/src/qemu/qemu_domain.c
>> +++ b/src/qemu/qemu_domain.c
>> @@ -160,6 +160,7 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv)
>>      job->mask = DEFAULT_JOB_MASK;
>>      job->start = 0;
>>      job->dump_memory_only = false;
>> +    job->asyncAbort = false;
>>      memset(&job->info, 0, sizeof(job->info));
>>  }
>>
>> @@ -959,6 +960,31 @@ qemuDomainObjEndAsyncJob(struct qemud_driver *driver, virDomainObjPtr obj)
>>      return virObjectUnref(obj);
>>  }
>>
>> +void
>> +qemuDomainObjAbortAsyncJob(virDomainObjPtr obj)
>> +{
>> +    qemuDomainObjPrivatePtr priv = obj->privateData;
>> +
>> +    VIR_DEBUG("Requesting abort of async job: %s",
>> +              qemuDomainAsyncJobTypeToString(priv->job.asyncJob));
>> +
>> +    priv->job.asyncAbort = true;
>> +}
>> +
>> +/**
>> + * qemuDomainObjAbortAsyncJobRequested:
>> + * @obj: domain object
>> + *
>> + * Was abort requested? @obj MUST be locked when calling this.
>> + */
>> +bool
>> +qemuDomainObjAbortAsyncJobRequested(virDomainObjPtr obj)
>> +{
>> +    qemuDomainObjPrivatePtr priv = obj->privateData;
>> +
>> +    return priv->job.asyncAbort;
>> +}
>> +
>
> I don't think we need this qemuDomainObjAbortAsyncJobRequested function.
> It's much easier and shorter if the caller just checks priv->job.asyncAbort
> directly. The current code uses functions only for changing the values or
> more complicated reads.
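
For illustration, checking the flag directly at the call site in
qemuMigrationUpdateJobStatus would then look roughly like this (just a
sketch of that suggestion, not code from either patch):

    /* read the flag directly instead of going through an accessor */
    if (priv->job.asyncAbort) {
        VIR_DEBUG("Migration abort requested. Translating "
                  "status to MIGRATION_STATUS_CANCELLED");
        status = QEMU_MONITOR_MIGRATION_STATUS_CANCELLED;
    }
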
>
>>  static int
>>  qemuDomainObjEnterMonitorInternal(struct qemud_driver *driver,
>>                                    bool driver_locked,
>> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
>> index 9c2f67c..9a31bbe 100644
>> --- a/src/qemu/qemu_domain.h
>> +++ b/src/qemu/qemu_domain.h
>> @@ -111,6 +111,7 @@ struct qemuDomainJobObj {
>>      unsigned long long start;          /* When the async job started */
>>      bool dump_memory_only;             /* use dump-guest-memory to do dump */
>>      virDomainJobInfo info;             /* Async job progress data */
>> +    bool asyncAbort;                   /* abort of async job requested */
>>  };
>>
>>  typedef struct _qemuDomainPCIAddressSet qemuDomainPCIAddressSet;
>> @@ -204,6 +205,9 @@ bool qemuDomainObjEndJob(struct qemud_driver *driver,
>>  bool qemuDomainObjEndAsyncJob(struct qemud_driver *driver,
>>                                virDomainObjPtr obj)
>>      ATTRIBUTE_RETURN_CHECK;
>> +void qemuDomainObjAbortAsyncJob(virDomainObjPtr obj);
>> +bool qemuDomainObjAbortAsyncJobRequested(virDomainObjPtr obj);
>> +
>>  void qemuDomainObjSetJobPhase(struct qemud_driver *driver,
>>                                virDomainObjPtr obj,
>>                                int phase);
>> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
>> index 7b8eec6..009c2c8 100644
>> --- a/src/qemu/qemu_driver.c
>> +++ b/src/qemu/qemu_driver.c
>> @@ -10331,6 +10331,8 @@ static int qemuDomainAbortJob(virDomainPtr dom) {
>>      virDomainObjPtr vm;
>>      int ret = -1;
>>      qemuDomainObjPrivatePtr priv;
>> +    /* Poll every 50ms for job termination */
>> +    struct timespec ts = { .tv_sec = 0, .tv_nsec = 50 * 1000 * 1000ull };
>>
>>      qemuDriverLock(driver);
>>      vm = virDomainFindByUUID(&driver->domains, dom->uuid);
>> @@ -10365,10 +10367,31 @@ static int qemuDomainAbortJob(virDomainPtr dom) {
>>          goto endjob;
>>      }
>>
>> -    VIR_DEBUG("Cancelling job at client request");
>> -    qemuDomainObjEnterMonitor(driver, vm);
>> -    ret = qemuMonitorMigrateCancel(priv->mon);
>> -    qemuDomainObjExitMonitor(driver, vm);
>> +    qemuDomainObjAbortAsyncJob(vm);
>> +    VIR_DEBUG("Waiting for async job '%s' to finish",
>> +              qemuDomainAsyncJobTypeToString(priv->job.asyncJob));
>> +    while (priv->job.asyncJob) {
>> +        if (qemuDomainObjEndJob(driver, vm) == 0) {
>> +            vm = NULL;
>> +            goto cleanup;
>> +        }
>> +        virDomainObjUnlock(vm);
>> +
>> +        nanosleep(&ts, NULL);
>> +
>> +        virDomainObjLock(vm);
>> +        if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_ABORT) < 0)
>> +            goto cleanup;
>> +
>> +        if (!virDomainObjIsActive(vm)) {
>> +            virReportError(VIR_ERR_OPERATION_INVALID,
>> +                           "%s", _("domain is not running"));
>> +            goto endjob;
>> +        }
>> +
>> +    }
>> +
>> +    ret = 0;
>>
>>  endjob:
>>      if (qemuDomainObjEndJob(driver, vm) == 0)
>
> AbortJob API was never a synchronous one and I don't think we need to
> change it. Not to mention that this code unlocks and locks vm without
> holding an extra reference to it. And even if it did so, it cannot
> guarantee that the job it's checking in the loop is the same one it tried
> to cancel. It's quite unlikely but the original job could have finished
> and another one could have been started while this code was sleeping.

However, AbortJob is documented as synchronous. If the current
implementation doesn't follow the docs, that is a bug. On the other hand,
I don't recall anybody screaming about it so far. But that means nothing,
right?
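
As for the missing extra reference: if the polling loop were kept at all,
the unlock/relock would need roughly the following around it (just a sketch
relying on virDomainObj being reference counted via virObject; not code
from the patch), plus a check that the async job seen after waking up is
still the one we were asked to abort:

    /* sketch: keep vm alive while its lock is dropped */
    virObjectRef(vm);
    virDomainObjUnlock(vm);

    nanosleep(&ts, NULL);

    virDomainObjLock(vm);
    /* the return value would need checking in case ours was the last
     * reference; re-check priv->job.asyncJob before looping again */
    virObjectUnref(vm);
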
>
> In other words, I'd just change it to do:
>
>      VIR_DEBUG("Cancelling job at client request");
> +    qemuDomainObjAbortAsyncJob(vm);
>      qemuDomainObjEnterMonitor(driver, vm);
>      ret = qemuMonitorMigrateCancel(priv->mon);
>      qemuDomainObjExitMonitor(driver, vm);
>
>> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
>> index 5f8a9c5..c840686 100644
>> --- a/src/qemu/qemu_migration.c
>> +++ b/src/qemu/qemu_migration.c
>> @@ -1172,6 +1172,12 @@ qemuMigrationUpdateJobStatus(struct qemud_driver *driver,
>>      }
>>      priv->job.info.timeElapsed -= priv->job.start;
>>
>> +    if (qemuDomainObjAbortAsyncJobRequested(vm)) {
>> +        VIR_DEBUG("Migration abort requested. Translating "
>> +                  "status to MIGRATION_STATUS_CANCELLED");
>> +        status = QEMU_MONITOR_MIGRATION_STATUS_CANCELLED;
>> +    }
>> +
>
> This check seems to be too late and that complicates the code later. If we
> keep qemuDomainAbortJob calling qemuMonitorMigrateCancel in
> qemuDomainAbortJob, we may count on that and check priv->job.asyncAbort
> before entering the monitor to tell QEMU to start migrating in
> qemuMigrationRun. If the abort is not requested by then, it may only
> happen after we call qemuMonitorMigrateTo* and in that case the migration
> will be properly aborted by qemuMonitorMigrateCancel.
>

Not really. The issue I've seen was like this:

    Thread A                              Thread B
    1. start async migration out job
                                          2. Since the job is set, abort it
                                             by calling 'migrate_cancel'
                                          3. return to user
    4. issue 'migrate' on the monitor

And I think your suggestion just makes this race harder to hit rather than
actually avoiding it. However, with my code we are guaranteed that
'migrate_cancel' will have an effect. But I agree that the check can be
moved before the 'query_migrate' command, so we don't have to enter the
monitor when we already know we are cancelling the migration.

>
>>      ret = -1;
>>      switch (status) {
>>      case QEMU_MONITOR_MIGRATION_STATUS_INACTIVE:
>> @@ -1214,6 +1220,35 @@ qemuMigrationUpdateJobStatus(struct qemud_driver *driver,
>>      return ret;
>>  }
>>
>> +static int
>> +qemuMigrationCancel(struct qemud_driver *driver, virDomainObjPtr vm)
>> +{
>> +    qemuDomainObjPrivatePtr priv = vm->privateData;
>> +    int ret = -1;
>> +
>> +    if (qemuDomainObjBeginJob(driver, vm, QEMU_JOB_ABORT) < 0)
>> +        goto cleanup;
>> +
>> +    if (!virDomainObjIsActive(vm)) {
>> +        virReportError(VIR_ERR_OPERATION_INVALID,
>> +                       "%s", _("domain is not running"));
>> +        goto endjob;
>> +    }
>> +
>> +    qemuDomainObjEnterMonitor(driver, vm);
>> +    ret = qemuMonitorMigrateCancel(priv->mon);
>> +    qemuDomainObjExitMonitor(driver, vm);
>> +
>> +endjob:
>> +    if (qemuDomainObjEndJob(driver, vm) == 0) {
>> +        virReportError(VIR_ERR_OPEN_FAILED, "%s",
>> +                       _("domain unexpectedly died"));
>> +        ret = -1;
>> +    }
>> +
>> +cleanup:
>> +    return ret;
>> +}
>>
>>  static int
>>  qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm,
>> @@ -1262,10 +1297,14 @@ qemuMigrationWaitForCompletion(struct qemud_driver *driver, virDomainObjPtr vm,
>>      }
>>
>>  cleanup:
>> -    if (priv->job.info.type == VIR_DOMAIN_JOB_COMPLETED)
>> +    if (priv->job.info.type == VIR_DOMAIN_JOB_COMPLETED) {
>>          return 0;
>> -    else
>> +    } else {
>> +        if (priv->job.info.type == VIR_DOMAIN_JOB_CANCELLED &&
>> +            qemuMigrationCancel(driver, vm) < 0)
>> +            VIR_DEBUG("Cancelling job at client request");
>>          return -1;
>> +    }
>>  }
>
> This hack is unnecessary when we do what I suggested above.
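
To spell out where that check would sit with your approach, it would be
something along these lines right before qemuMigrationRun asks QEMU to
start the migration (a sketch only; the exact error code and message are my
guesses, not from either patch):

    /* sketch: in qemuMigrationRun(), before issuing the 'migrate' command */
    if (priv->job.asyncAbort) {
        virReportError(VIR_ERR_OPERATION_ABORTED, "%s",
                       _("migration was cancelled before it started"));
        goto cleanup;
    }

With the variant I mentioned, the same test would simply move in front of
the 'query_migrate' call in qemuMigrationUpdateJobStatus.
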
> But in any case, this patch does not actually allow the migration to be
> cancelled at any phase. It just allows the migration to be cancelled
> during the Perform phase or before. And the cancellation would actually
> happen only during the Perform phase, thus possibly doing an unnecessary
> Prepare phase in case someone tried to cancel the migration at the very
> beginning. However, since even such an improvement in this area is a nice
> step forward and this particular case of aborting migration before it
> gets to the Perform phase is the most visible issue, we can address the
> rest of the issues later.
>
> I'm looking forward to a simplified v2 :-)
>
> Jirka

--
libvir-list mailing list
libvir-list@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/libvir-list