On Tue, Aug 30, 2011 at 03:06:36PM -0600, Eric Blake wrote: > Several users have reported problems with 'virsh start' failing because > it was encountering a managed save situation where the managed save file > was incomplete. Be more robust to this by using two different magic > numbers, so that newer libvirt can gracefully handle an incomplete file > differently than a complete one, while older libvirt will at least fail > up front rather than trying to load only to have qemu fail at the end. > > Managed save is a convenience - it exists to preserve as much state > as possible; if the state was not preserved, it is reasonable to just > log that fact, then proceed with a fresh boot. On the other hand, > user saves are under user control, so we must fail, but by making > the failure message distinct, the user can better decide how to handle > the situation of an incomplete save file. > > * src/qemu/qemu_driver.c (QEMUD_SAVE_PARTIAL): New define. > (qemuDomainSaveInternal): Use it to mark incomplete images. > (qemuDomainSaveImageOpen, qemuDomainObjRestore): Add parameter > that controls what to do with partial images. > (qemuDomainRestoreFlags, qemuDomainSaveImageGetXMLDesc) > (qemuDomainSaveImageDefineXML, qemuDomainObjStart): Update callers. > Based on an initial idea by Osier Yang. > --- > > This should implement everything mentioned here: > https://www.redhat.com/archives/libvir-list/2011-August/msg00854.html > > src/qemu/qemu_driver.c | 84 ++++++++++++++++++++++++++++++++++++++---------- > 1 files changed, 67 insertions(+), 17 deletions(-) > > diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c > index 9cb034b..61b33fe 100644 > --- a/src/qemu/qemu_driver.c > +++ b/src/qemu/qemu_driver.c > @@ -2111,9 +2111,12 @@ cleanup: > } > > > -#define QEMUD_SAVE_MAGIC "LibvirtQemudSave" > +#define QEMUD_SAVE_MAGIC "LibvirtQemudSave" > +#define QEMUD_SAVE_PARTIAL "LibvirtQemudPart" > #define QEMUD_SAVE_VERSION 2 > > +verify(sizeof(QEMUD_SAVE_MAGIC) == sizeof(QEMUD_SAVE_PARTIAL)); > + > enum qemud_save_formats { > QEMUD_SAVE_FORMAT_RAW = 0, > QEMUD_SAVE_FORMAT_GZIP = 1, > @@ -2331,7 +2334,7 @@ qemuDomainSaveInternal(struct qemud_driver *driver, virDomainPtr dom, > } > > memset(&header, 0, sizeof(header)); > - memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); > + memcpy(header.magic, QEMUD_SAVE_PARTIAL, sizeof(header.magic)); > header.version = QEMUD_SAVE_VERSION; > > header.compressed = compressed; > @@ -2435,12 +2438,37 @@ qemuDomainSaveInternal(struct qemud_driver *driver, virDomainPtr dom, > bypassSecurityDriver, > QEMU_ASYNC_JOB_SAVE) < 0) > goto endjob; > + > + /* Touch up file header to mark image complete. */ > + if (bypass_cache) { > + /* Reopen the file to touch up the header, since we aren't set > + * up to seek backwards on directFd. The reopened fd will > + * trigger a single page of file system cache pollution, but > + * that's acceptable. */ > + if (VIR_CLOSE(fd) < 0) { > + virReportSystemError(errno, _("unable to close %s"), path); > + goto endjob; > + } > + if (virFileDirectFdClose(directFd) < 0) > + goto endjob; > + fd = qemuOpenFile(driver, path, O_WRONLY, NULL, NULL); > + if (fd < 0) > + goto endjob; > + } else { > + if (lseek(fd, 0, SEEK_SET) != 0) { > + virReportSystemError(errno, _("unable to seek %s"), path); > + goto endjob; > + } > + } > + memcpy(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)); > + if (safewrite(fd, &header, sizeof(header)) != sizeof(header)) { > + virReportSystemError(errno, _("unable to write %s"), path); > + goto endjob; > + } > if (VIR_CLOSE(fd) < 0) { > virReportSystemError(errno, _("unable to close %s"), path); > goto endjob; > } > - if (virFileDirectFdClose(directFd) < 0) > - goto endjob; > > ret = 0; > > @@ -3769,15 +3797,16 @@ cleanup: > return ret; > } > > -/* Return -1 on failure, -2 if edit was specified but xmlin does not > - * represent any changes, and opened fd on all other success. */ > +/* Return -1 on most failures after raising error, -2 if edit was specified > + * but xmlin does not represent any changes (no error raised), -3 if corrupt > + * image was unlinked (no error raised), and opened fd on success. */ > static int ATTRIBUTE_NONNULL(3) ATTRIBUTE_NONNULL(4) > qemuDomainSaveImageOpen(struct qemud_driver *driver, > const char *path, > virDomainDefPtr *ret_def, > struct qemud_save_header *ret_header, > bool bypass_cache, virFileDirectFdPtr *directFd, > - const char *xmlin, bool edit) > + const char *xmlin, bool edit, bool unlink_corrupt) > { > int fd; > struct qemud_save_header header; > @@ -3807,8 +3836,22 @@ qemuDomainSaveImageOpen(struct qemud_driver *driver, > } > > if (memcmp(header.magic, QEMUD_SAVE_MAGIC, sizeof(header.magic)) != 0) { > - qemuReportError(VIR_ERR_OPERATION_FAILED, > - "%s", _("image magic is incorrect")); > + const char *msg = _("image magic is incorrect"); > + > + if (memcmp(header.magic, QEMUD_SAVE_PARTIAL, > + sizeof(header.magic)) == 0) { > + msg = _("save image is incomplete"); > + if (unlink_corrupt) { > + if (VIR_CLOSE(fd) < 0 || unlink(path) < 0) { > + virReportSystemError(errno, > + _("cannot remove corrupt file: %s"), > + path); > + goto error; > + } > + return -3; > + } > + } > + qemuReportError(VIR_ERR_OPERATION_FAILED, "%s", msg); > goto error; > } > > @@ -4009,7 +4052,7 @@ qemuDomainRestoreFlags(virConnectPtr conn, > > fd = qemuDomainSaveImageOpen(driver, path, &def, &header, > (flags & VIR_DOMAIN_SAVE_BYPASS_CACHE) != 0, > - &directFd, dxml, false); > + &directFd, dxml, false, false); > if (fd < 0) > goto cleanup; > > @@ -4071,7 +4114,7 @@ qemuDomainSaveImageGetXMLDesc(virConnectPtr conn, const char *path, > qemuDriverLock(driver); > > fd = qemuDomainSaveImageOpen(driver, path, &def, &header, false, NULL, > - NULL, false); > + NULL, false, false); > > if (fd < 0) > goto cleanup; > @@ -4102,7 +4145,7 @@ qemuDomainSaveImageDefineXML(virConnectPtr conn, const char *path, > qemuDriverLock(driver); > > fd = qemuDomainSaveImageOpen(driver, path, &def, &header, false, NULL, > - dxml, true); > + dxml, true, false); > > if (fd < 0) { > /* Check for special case of no change needed. */ > @@ -4147,6 +4190,8 @@ cleanup: > return ret; > } > > +/* Return 0 on success, 1 if incomplete saved image was silently unlinked, > + * and -1 on failure with error raised. */ > static int > qemuDomainObjRestore(virConnectPtr conn, > struct qemud_driver *driver, > @@ -4161,9 +4206,12 @@ qemuDomainObjRestore(virConnectPtr conn, > virFileDirectFdPtr directFd = NULL; > > fd = qemuDomainSaveImageOpen(driver, path, &def, &header, > - bypass_cache, &directFd, NULL, false); > - if (fd < 0) > + bypass_cache, &directFd, NULL, false, true); > + if (fd < 0) { > + if (fd == -3) > + ret = 1; > goto cleanup; > + } > > if (STRNEQ(vm->def->name, def->name) || > memcmp(vm->def->uuid, def->uuid, VIR_UUID_BUFLEN)) { > @@ -4472,10 +4520,12 @@ qemuDomainObjStart(virConnectPtr conn, > ret = qemuDomainObjRestore(conn, driver, vm, managed_save, > bypass_cache); > > - if ((ret == 0) && (unlink(managed_save) < 0)) > + if (ret == 0 && unlink(managed_save) < 0) > VIR_WARN("Failed to remove the managed state %s", managed_save); > - > - goto cleanup; > + if (ret > 0) > + VIR_WARN("Ignoring incomplete managed state %s", managed_save); > + else > + goto cleanup; > } > } Looks and sounds okay, ACK, Daniel -- Daniel Veillard | libxml Gnome XML XSLT toolkit http://xmlsoft.org/ daniel@xxxxxxxxxxxx | Rpmfind RPM search engine http://rpmfind.net/ http://veillard.com/ | virtualization library http://libvirt.org/ -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list