This is very helpful when we want to log and report why we could not acquire a state change lock. Reporting what job keeps it locked helps with understanding the issue. Moreover, after calling virDomainGetControlInfo, it's possible to tell whether libvirt is just stuck somewhere within the API (or it just forgot to clean up the job) or whether libvirt is waiting for QEMU to reply. The error message will look like the following: # virsh resume cd error: Failed to resume domain cd error: Timed out during operation: cannot acquire state change lock (held by remoteDispatchDomainSuspend) https://bugzilla.redhat.com/show_bug.cgi?id=853839 Signed-off-by: Jiri Denemark <jdenemar@xxxxxxxxxx> --- src/qemu/qemu_domain.c | 43 +++++++++++++++++++++++++++++++++---------- src/qemu/qemu_domain.h | 2 ++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c index 559400c..dc55016 100644 --- a/src/qemu/qemu_domain.c +++ b/src/qemu/qemu_domain.c @@ -153,6 +153,7 @@ qemuDomainObjResetJob(qemuDomainObjPrivatePtr priv) job->active = QEMU_JOB_NONE; job->owner = 0; + job->ownerAPI = NULL; } static void @@ -162,6 +163,7 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv) job->asyncJob = QEMU_ASYNC_JOB_NONE; job->asyncOwner = 0; + job->asyncOwnerAPI = NULL; job->phase = 0; job->mask = QEMU_JOB_DEFAULT_MASK; job->dump_memory_only = false; @@ -1322,6 +1324,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver, unsigned long long then; bool nested = job == QEMU_JOB_ASYNC_NESTED; virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); + const char *blocker = NULL; int ret = -1; VIR_DEBUG("Starting %s: %s (async=%s vm=%p name=%s)", @@ -1370,6 +1373,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver, obj, obj->def->name); priv->job.active = job; priv->job.owner = virThreadSelfID(); + priv->job.ownerAPI = virThreadJobGet(); } else { VIR_DEBUG("Started async job: %s (vm=%p name=%s)", 
qemuDomainAsyncJobTypeToString(asyncJob), @@ -1379,6 +1383,7 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver, goto cleanup; priv->job.asyncJob = asyncJob; priv->job.asyncOwner = virThreadSelfID(); + priv->job.asyncOwnerAPI = virThreadJobGet(); priv->job.current->started = now; } @@ -1389,29 +1394,47 @@ qemuDomainObjBeginJobInternal(virQEMUDriverPtr driver, return 0; error: - VIR_WARN("Cannot start job (%s, %s) for domain %s;" - " current job is (%s, %s) owned by (%llu, %llu)", + VIR_WARN("Cannot start job (%s, %s) for domain %s; " + "current job is (%s, %s) owned by (%llu %s, %llu %s)", qemuDomainJobTypeToString(job), qemuDomainAsyncJobTypeToString(asyncJob), obj->def->name, qemuDomainJobTypeToString(priv->job.active), qemuDomainAsyncJobTypeToString(priv->job.asyncJob), - priv->job.owner, priv->job.asyncOwner); + priv->job.owner, NULLSTR(priv->job.ownerAPI), + priv->job.asyncOwner, NULLSTR(priv->job.asyncOwnerAPI)); + + if (nested || qemuDomainNestedJobAllowed(priv, job)) + blocker = priv->job.ownerAPI; + else + blocker = priv->job.asyncOwnerAPI; ret = -1; if (errno == ETIMEDOUT) { - virReportError(VIR_ERR_OPERATION_TIMEOUT, - "%s", _("cannot acquire state change lock")); + if (blocker) { + virReportError(VIR_ERR_OPERATION_TIMEOUT, + _("cannot acquire state change lock (held by %s)"), + blocker); + } else { + virReportError(VIR_ERR_OPERATION_TIMEOUT, "%s", + _("cannot acquire state change lock")); + } ret = -2; } else if (cfg->maxQueuedJobs && priv->jobs_queued > cfg->maxQueuedJobs) { - virReportError(VIR_ERR_OPERATION_FAILED, - "%s", _("cannot acquire state change lock " - "due to max_queued limit")); + if (blocker) { + virReportError(VIR_ERR_OPERATION_FAILED, + _("cannot acquire state change lock (held by %s) " + "due to max_queued limit"), + blocker); + } else { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("cannot acquire state change lock " + "due to max_queued limit")); + } ret = -2; } else { - virReportSystemError(errno, - "%s", _("cannot 
acquire job mutex")); + virReportSystemError(errno, "%s", _("cannot acquire job mutex")); } cleanup: diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h index ba8d398..a468982 100644 --- a/src/qemu/qemu_domain.h +++ b/src/qemu/qemu_domain.h @@ -117,10 +117,12 @@ struct qemuDomainJobObj { virCond cond; /* Use to coordinate jobs */ qemuDomainJob active; /* Currently running job */ unsigned long long owner; /* Thread id which set current job */ + const char *ownerAPI; /* The API which owns the job */ virCond asyncCond; /* Use to coordinate with async jobs */ qemuDomainAsyncJob asyncJob; /* Currently active async job */ unsigned long long asyncOwner; /* Thread which set current async job */ + const char *asyncOwnerAPI; /* The API which owns the async job */ int phase; /* Job phase (mainly for migrations) */ unsigned long long mask; /* Jobs allowed during async job */ bool dump_memory_only; /* use dump-guest-memory to do dump */ -- 2.3.3 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list