If a guest is paused, we were silently ignoring the quiesce flag, which results in unclean snapshots, contrary to the intent of the flag. Since we can't quiesce without guest agent support, we should instead fail if the guest is not running. Meanwhile, if we attempt a quiesce command, but the guest agent doesn't respond, and we time out, we may have left the command pending on the guest's queue, and when the guest resumes parsing commands, it will freeze even though our command is no longer around to issue a thaw. To be safe, we must _always_ pair every quiesce call with a counterpart thaw, even if the quiesce call failed due to a timeout, so that if a guest wakes up and starts processing a command backlog, it will not get stuck in a frozen state. * src/qemu/qemu_driver.c (qemuDomainSnapshotCreateDiskActive): Always issue thaw after a quiesce, even if quiesce failed. (qemuDomainSnapshotFSThaw): Add a parameter. --- See also: https://bugzilla.redhat.com/show_bug.cgi?id=804210 https://www.redhat.com/archives/libvir-list/2012-March/msg00708.html src/qemu/qemu_driver.c | 51 +++++++++++++++++++++++++++++++++-------------- 1 files changed, 36 insertions(+), 15 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 2c467ab..13ca92f 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -9584,24 +9584,36 @@ qemuDomainSnapshotFSFreeze(struct qemud_driver *driver, static int qemuDomainSnapshotFSThaw(struct qemud_driver *driver, - virDomainObjPtr vm) { + virDomainObjPtr vm, bool report) +{ qemuDomainObjPrivatePtr priv = vm->privateData; int thawed; + virErrorPtr err = NULL; if (priv->agentError) { - qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("QEMU guest agent is not " - "available due to an error")); + if (report) + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("QEMU guest agent is not " + "available due to an error")); return -1; } if (!priv->agent) { - qemuReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s", - _("QEMU guest agent is not configured")); + if (report) + qemuReportError(VIR_ERR_ARGUMENT_UNSUPPORTED, "%s", + _("QEMU guest agent is not configured")); return -1; } qemuDomainObjEnterAgent(driver, vm); + if (!report) + err = virGetLastError(); thawed = qemuAgentFSThaw(priv->agent); + if (!report) { + if (err) + virResetError(err); + else + virResetLastError(); + } qemuDomainObjExitAgent(driver, vm); return thawed; @@ -9907,6 +9919,7 @@ qemuDomainSnapshotCreateDiskActive(virConnectPtr conn, int ret = -1; int i; bool persist = false; + int thaw = 0; /* 1 if freeze succeeded, -1 if freeze failed */ if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0) return -1; @@ -9917,14 +9930,21 @@ qemuDomainSnapshotCreateDiskActive(virConnectPtr conn, goto endjob; } - - if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) { - if ((flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) && - (qemuDomainSnapshotFSFreeze(driver, vm) < 0)) { + /* If quiesce was requested, then issue a freeze command, and a + * counterpart thaw command, no matter what. The command will + * fail if the guest is paused or the guest agent is not + * running. */ + if (flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) { + if (qemuDomainSnapshotFSFreeze(driver, vm) < 0) { /* helper reported the error */ + thaw = -1; goto endjob; + } else { + thaw = 1; } + } + if (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) { /* In qemu, snapshot_blkdev on a single disk will pause cpus, * but this confuses libvirt since notifications are not given * when qemu resumes. And for multiple disks, libvirt must @@ -10001,11 +10021,6 @@ cleanup: _("resuming after snapshot failed")); goto endjob; } - - if ((flags & VIR_DOMAIN_SNAPSHOT_CREATE_QUIESCE) && - qemuDomainSnapshotFSThaw(driver, vm) < 0) { - /* helper reported the error */ - } } if (vm) { @@ -10016,6 +10031,12 @@ cleanup: } endjob: + if (vm && thaw != 0 && + qemuDomainSnapshotFSThaw(driver, vm, thaw > 0) < 0) { + /* helper reported the error, if it was needed */ + if (thaw > 0) + ret = -1; + } if (vm && (qemuDomainObjEndJob(driver, vm) == 0)) { /* Only possible if a transient vm quit while our locks were down, * in which case we don't want to save snapshot metadata. */ -- 1.7.7.6 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list