Introduce support for QEMU's new mapped-ram stream format [1].
mapped-ram can be enabled by setting the 'save_image_format' setting
in qemu.conf to 'sparse'.

To use mapped-ram with QEMU:
- The 'mapped-ram' migration capability must be set to true
- The 'multifd' migration capability must be set to true and
  the 'multifd-channels' migration parameter must be set to 1
- QEMU must be provided an fdset containing the migration fd
- The 'migrate' QMP command is invoked with a URI referencing the
  fdset and the offset at which to start reading or writing the
  data stream, e.g.

  {"execute":"migrate",
   "arguments":{"detach":true,"resume":false,
                "uri":"file:/dev/fdset/0,offset=0x11921"}}

The mapped-ram stream, in conjunction with direct IO and multifd
support provided by subsequent patches, can significantly improve
the time required to save VM memory state. The following tables
compare mapped-ram with the existing, sequential save stream. In all
cases, the save and restore operations are to/from a block device
comprised of two NVMe disks in RAID0 configuration with xfs
(~8600MiB/s). The values in the 'save time' and 'restore time'
columns were scraped from the 'real' time reported by time(1). The
'Size' and 'Blocks' columns were provided by the corresponding
outputs of stat(1).

VM: 32G RAM, 1 vcpu, idle (shortly after boot)

                       | save    | restore |              |
                       | time    | time    | Size         | Blocks
-----------------------+---------+---------+--------------+--------
legacy                 | 6.193s  | 4.399s  | 985744812    | 1925288
-----------------------+---------+---------+--------------+--------
mapped-ram             | 5.109s  | 1.176s  | 34368554354  | 1774472
-----------------------+---------+---------+--------------+--------
legacy + direct IO     | 5.725s  | 4.512s  | 985765251    | 1925328
-----------------------+---------+---------+--------------+--------
mapped-ram + direct IO | 4.627s  | 1.490s  | 34368554354  | 1774304
-----------------------+---------+---------+--------------+--------
mapped-ram + direct IO |         |         |              |
 + multifd-channels=8  | 4.421s  | 0.845s  | 34368554318  | 1774312
-------------------------------------------------------------------

VM: 32G RAM, 30G dirty, 1 vcpu in tight loop dirtying memory

                       | save    | restore |              |
                       | time    | time    | Size         | Blocks
-----------------------+---------+---------+--------------+---------
legacy                 | 25.800s | 14.332s | 33154309983  | 64754512
-----------------------+---------+---------+--------------+---------
mapped-ram             | 18.742s | 15.027s | 34368559228  | 64617160
-----------------------+---------+---------+--------------+---------
legacy + direct IO     | 13.115s | 18.050s | 33154310496  | 64754520
-----------------------+---------+---------+--------------+---------
mapped-ram + direct IO | 13.623s | 15.959s | 34368557392  | 64662040
-----------------------+---------+---------+--------------+---------
mapped-ram + direct IO |         |         |              |
 + multifd-channels=8  | 6.994s  | 6.470s  | 34368554980  | 64665776
--------------------------------------------------------------------

As can be seen from the tables, one caveat of mapped-ram is that the
logical file size of a saved image is basically equivalent to the VM
memory size. Note however that mapped-ram typically uses fewer blocks
on disk, hence the name 'sparse' for 'save_image_format'.
Also note the mapped-ram stream is incompatible with the existing stream format, hence mapped-ram cannot be used to restore an image saved with the existing format and vice versa. [1] https://gitlab.com/qemu-project/qemu/-/blob/master/docs/devel/migration/mapped-ram.rst?ref_type=heads Signed-off-by: Jim Fehlig <jfehlig@xxxxxxxx> --- src/qemu/qemu_driver.c | 23 +++++- src/qemu/qemu_migration.c | 149 ++++++++++++++++++++++++++------------ src/qemu/qemu_migration.h | 4 +- src/qemu/qemu_monitor.c | 34 +++++++++ src/qemu/qemu_monitor.h | 4 + src/qemu/qemu_saveimage.c | 33 +++++---- src/qemu/qemu_saveimage.h | 1 + src/qemu/qemu_snapshot.c | 14 +++- 8 files changed, 199 insertions(+), 63 deletions(-) diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 78bfaa5b3a..f77516a4f4 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2620,6 +2620,7 @@ qemuDomainSaveInternal(virQEMUDriver *driver, qemuDomainObjPrivate *priv = vm->privateData; virQEMUSaveData *data = NULL; g_autoptr(qemuDomainSaveCookie) cookie = NULL; + g_autoptr(qemuMigrationParams) saveParams = NULL; if (virDomainObjBeginAsyncJob(vm, VIR_ASYNC_JOB_SAVE, VIR_DOMAIN_JOB_OPERATION_SAVE, flags) < 0) @@ -2628,6 +2629,14 @@ qemuDomainSaveInternal(virQEMUDriver *driver, if (!qemuMigrationSrcIsAllowed(vm, false, VIR_ASYNC_JOB_SAVE, 0)) goto endjob; + if (format == QEMU_SAVE_FORMAT_SPARSE && + !qemuMigrationCapsGet(vm, QEMU_MIGRATION_CAP_MAPPED_RAM)) { + virReportError(VIR_ERR_OPERATION_UNSUPPORTED, + _("save image format %1$s is not supported by this QEMU binary"), + qemuSaveFormatTypeToString(format)); + goto endjob; + } + if (!virDomainObjIsActive(vm)) { virReportError(VIR_ERR_INTERNAL_ERROR, "%s", _("guest unexpectedly quit")); @@ -2691,8 +2700,11 @@ qemuDomainSaveInternal(virQEMUDriver *driver, goto endjob; xml = NULL; + if (!(saveParams = qemuMigrationParamsForSave(format == QEMU_SAVE_FORMAT_SPARSE))) + goto endjob; + ret = qemuSaveImageCreate(driver, vm, path, data, compressor, - 
flags, VIR_ASYNC_JOB_SAVE); + saveParams, flags, VIR_ASYNC_JOB_SAVE); if (ret < 0) goto endjob; @@ -3126,6 +3138,8 @@ doCoreDump(virQEMUDriver *driver, memory_dump_format) < 0) goto cleanup; } else { + g_autoptr(qemuMigrationParams) dump_params = NULL; + if (dumpformat != VIR_DOMAIN_CORE_DUMP_FORMAT_RAW) { virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s", _("kdump-compressed format is only supported with memory-only dump")); @@ -3135,8 +3149,11 @@ doCoreDump(virQEMUDriver *driver, if (!qemuMigrationSrcIsAllowed(vm, false, VIR_ASYNC_JOB_DUMP, 0)) goto cleanup; - if (qemuMigrationSrcToFile(driver, vm, fd, compressor, - VIR_ASYNC_JOB_DUMP) < 0) + if (!(dump_params = qemuMigrationParamsNew())) + goto cleanup; + + if (qemuMigrationSrcToFile(driver, vm, &fd, compressor, + dump_params, dump_flags, VIR_ASYNC_JOB_DUMP) < 0) goto cleanup; } diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c index 4efc03fc00..1e3bee3781 100644 --- a/src/qemu/qemu_migration.c +++ b/src/qemu/qemu_migration.c @@ -7069,46 +7069,17 @@ qemuMigrationProcessUnattended(virQEMUDriver *driver, } -/* Helper function called while vm is active. */ -int -qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, - int fd, - virCommand *compressor, - virDomainAsyncJob asyncJob) +static int +qemuMigrationSrcToLegacyFile(virQEMUDriver *driver, + virDomainObj *vm, + int fd, + virCommand *compressor, + virDomainAsyncJob asyncJob) { qemuDomainObjPrivate *priv = vm->privateData; - int rc; int ret = -1; int pipeFD[2] = { -1, -1 }; - unsigned long saveMigBandwidth = priv->migMaxBandwidth; char *errbuf = NULL; - virErrorPtr orig_err = NULL; - g_autoptr(qemuMigrationParams) migParams = NULL; - - if (qemuMigrationSetDBusVMState(driver, vm) < 0) - return -1; - - /* Increase migration bandwidth to unlimited since target is a file. - * Failure to change migration speed is not fatal. 
*/ - if (!(migParams = qemuMigrationParamsForSave(false))) - return -1; - - if (qemuMigrationParamsSetULL(migParams, - QEMU_MIGRATION_PARAM_MAX_BANDWIDTH, - QEMU_DOMAIN_MIG_BANDWIDTH_MAX * 1024 * 1024) < 0) - return -1; - - if (qemuMigrationParamsApply(vm, asyncJob, migParams, 0) < 0) - return -1; - - priv->migMaxBandwidth = QEMU_DOMAIN_MIG_BANDWIDTH_MAX; - - if (!virDomainObjIsActive(vm)) { - virReportError(VIR_ERR_INTERNAL_ERROR, "%s", - _("guest unexpectedly quit")); - /* nothing to tear down */ - return -1; - } if (compressor && virPipe(pipeFD) < 0) return -1; @@ -7125,7 +7096,7 @@ qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, goto cleanup; if (!compressor) { - rc = qemuMonitorMigrateToFd(priv->mon, 0, fd); + ret = qemuMonitorMigrateToFd(priv->mon, 0, fd); } else { virCommandSetInputFD(compressor, pipeFD[0]); virCommandSetOutputFD(compressor, &fd); @@ -7141,12 +7112,98 @@ qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, qemuDomainObjExitMonitor(vm); goto cleanup; } - rc = qemuMonitorMigrateToFd(priv->mon, 0, pipeFD[1]); + ret = qemuMonitorMigrateToFd(priv->mon, 0, pipeFD[1]); if (VIR_CLOSE(pipeFD[0]) < 0 || VIR_CLOSE(pipeFD[1]) < 0) VIR_WARN("failed to close intermediate pipe"); } qemuDomainObjExitMonitor(vm); + + cleanup: + VIR_FORCE_CLOSE(pipeFD[0]); + VIR_FORCE_CLOSE(pipeFD[1]); + + if (errbuf) { + VIR_DEBUG("Compression binary stderr: %s", NULLSTR(errbuf)); + VIR_FREE(errbuf); + } + + return ret; +} + + +static int +qemuMigrationSrcToSparseFile(virQEMUDriver *driver, + virDomainObj *vm, + int *fd, + unsigned int flags, + virDomainAsyncJob asyncJob) +{ + int ret; + + /* mapped-ram does not support directIO */ + if ((flags & VIR_DOMAIN_SAVE_BYPASS_CACHE)) { + virReportError(VIR_ERR_OPERATION_FAILED, "%s", + _("bypass cache unsupported by this system")); + return -1; + } + + if (qemuSecuritySetImageFDLabel(driver->securityManager, vm->def, *fd) < 0) + return -1; + + if (qemuDomainObjEnterMonitorAsync(vm, asyncJob) < 0) + return 
-1; + + ret = qemuMonitorMigrateToFdSet(vm, 0, fd); + qemuDomainObjExitMonitor(vm); + return ret; +} + + +/* Helper function called while vm is active. */ +int +qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, + int *fd, + virCommand *compressor, + qemuMigrationParams *migParams, + unsigned int flags, + virDomainAsyncJob asyncJob) +{ + qemuDomainObjPrivate *priv = vm->privateData; + int rc; + int ret = -1; + unsigned long saveMigBandwidth = priv->migMaxBandwidth; + virErrorPtr orig_err = NULL; + + if (qemuMigrationSetDBusVMState(driver, vm) < 0) + return -1; + + /* Increase migration bandwidth to unlimited since target is a file. + * Failure to change migration speed is not fatal. */ + if (migParams && + qemuMigrationParamsSetULL(migParams, + QEMU_MIGRATION_PARAM_MAX_BANDWIDTH, + QEMU_DOMAIN_MIG_BANDWIDTH_MAX * 1024 * 1024) < 0) + return -1; + + if (qemuMigrationParamsApply(vm, asyncJob, migParams, 0) < 0) + return -1; + + priv->migMaxBandwidth = QEMU_DOMAIN_MIG_BANDWIDTH_MAX; + + if (!virDomainObjIsActive(vm)) { + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", + _("guest unexpectedly quit")); + /* nothing to tear down */ + return -1; + } + + if (migParams && + qemuMigrationParamsCapEnabled(migParams, QEMU_MIGRATION_CAP_MAPPED_RAM)) + rc = qemuMigrationSrcToSparseFile(driver, vm, fd, flags, asyncJob); + else + rc = qemuMigrationSrcToLegacyFile(driver, vm, *fd, compressor, asyncJob); + if (rc < 0) goto cleanup; @@ -7172,8 +7229,17 @@ qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, if (ret < 0 && !orig_err) virErrorPreserveLast(&orig_err); - /* Restore max migration bandwidth */ + /* Remove fdset passed to qemu and restore max migration bandwidth */ if (qemuDomainObjIsActive(vm)) { + if (qemuDomainObjEnterMonitorAsync(vm, asyncJob) == 0) { + qemuFDPass *fdPass = + qemuFDPassNewFromMonitor("libvirt-outgoing-migrate", priv->mon); + + if (fdPass) + qemuFDPassTransferMonitorRollback(fdPass, priv->mon); + qemuDomainObjExitMonitor(vm); + } + if 
(qemuMigrationParamsSetULL(migParams, QEMU_MIGRATION_PARAM_MAX_BANDWIDTH, saveMigBandwidth * 1024 * 1024) == 0) @@ -7182,13 +7248,6 @@ qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, priv->migMaxBandwidth = saveMigBandwidth; } - VIR_FORCE_CLOSE(pipeFD[0]); - VIR_FORCE_CLOSE(pipeFD[1]); - if (errbuf) { - VIR_DEBUG("Compression binary stderr: %s", NULLSTR(errbuf)); - VIR_FREE(errbuf); - } - virErrorRestore(&orig_err); return ret; diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h index efe1b9e88a..9fa007b949 100644 --- a/src/qemu/qemu_migration.h +++ b/src/qemu/qemu_migration.h @@ -238,8 +238,10 @@ qemuMigrationSrcIsAllowed(virDomainObj *vm, int qemuMigrationSrcToFile(virQEMUDriver *driver, virDomainObj *vm, - int fd, + int *fd, virCommand *compressor, + qemuMigrationParams *migParams, + unsigned int flags, virDomainAsyncJob asyncJob) ATTRIBUTE_NONNULL(1) ATTRIBUTE_NONNULL(2) G_GNUC_WARN_UNUSED_RESULT; diff --git a/src/qemu/qemu_monitor.c b/src/qemu/qemu_monitor.c index 830ecbad1c..e2043c0120 100644 --- a/src/qemu/qemu_monitor.c +++ b/src/qemu/qemu_monitor.c @@ -2232,6 +2232,40 @@ qemuMonitorMigrateToFd(qemuMonitor *mon, } +int +qemuMonitorMigrateToFdSet(virDomainObj *vm, + unsigned int flags, + int *fd) +{ + qemuDomainObjPrivate *priv = vm->privateData; + qemuMonitor *mon = priv->mon; + off_t offset; + g_autoptr(qemuFDPass) fdPassMigrate = NULL; + g_autofree char *uri = NULL; + int ret; + + VIR_DEBUG("fd=%d flags=0x%x", *fd, flags); + + QEMU_CHECK_MONITOR(mon); + + if ((offset = lseek(*fd, 0, SEEK_CUR)) == -1) { + virReportSystemError(errno, + "%s", _("failed to seek on file descriptor")); + return -1; + } + + fdPassMigrate = qemuFDPassNew("libvirt-outgoing-migrate", priv); + qemuFDPassAddFD(fdPassMigrate, fd, "-fd"); + qemuFDPassTransferMonitor(fdPassMigrate, mon); + + uri = g_strdup_printf("file:%s,offset=%#lx", + qemuFDPassGetPath(fdPassMigrate), offset); + ret = qemuMonitorJSONMigrate(mon, flags, uri); + + return ret; +} + + int 
qemuMonitorMigrateToHost(qemuMonitor *mon, unsigned int flags, diff --git a/src/qemu/qemu_monitor.h b/src/qemu/qemu_monitor.h index 072f452e79..6da380aa65 100644 --- a/src/qemu/qemu_monitor.h +++ b/src/qemu/qemu_monitor.h @@ -859,6 +859,10 @@ int qemuMonitorMigrateToFd(qemuMonitor *mon, unsigned int flags, int fd); +int qemuMonitorMigrateToFdSet(virDomainObj *vm, + unsigned int flags, + int *fd); + int qemuMonitorMigrateToHost(qemuMonitor *mon, unsigned int flags, const char *protocol, diff --git a/src/qemu/qemu_saveimage.c b/src/qemu/qemu_saveimage.c index 59237cc46a..0ffbe03f24 100644 --- a/src/qemu/qemu_saveimage.c +++ b/src/qemu/qemu_saveimage.c @@ -430,6 +430,7 @@ qemuSaveImageCreateFd(virQEMUDriver *driver, virDomainObj *vm, const char *path, virFileWrapperFd *wrapperFd, + bool sparse, bool *needUnlink, unsigned int flags) { @@ -439,7 +440,7 @@ qemuSaveImageCreateFd(virQEMUDriver *driver, int directFlag = 0; unsigned int wrapperFlags = VIR_FILE_WRAPPER_NON_BLOCKING; - if (flags & VIR_DOMAIN_SAVE_BYPASS_CACHE) { + if (!sparse && flags & VIR_DOMAIN_SAVE_BYPASS_CACHE) { wrapperFlags |= VIR_FILE_WRAPPER_BYPASS_CACHE; directFlag = virFileDirectFdFlag(); if (directFlag < 0) { @@ -459,7 +460,7 @@ qemuSaveImageCreateFd(virQEMUDriver *driver, if (qemuSecuritySetImageFDLabel(driver->securityManager, vm->def, fd) < 0) return -1; - if (!(wrapperFd = virFileWrapperFdNew(&fd, path, wrapperFlags))) + if (!sparse && !(wrapperFd = virFileWrapperFdNew(&fd, path, wrapperFlags))) return -1; ret = fd; @@ -478,6 +479,7 @@ qemuSaveImageCreate(virQEMUDriver *driver, const char *path, virQEMUSaveData *data, virCommand *compressor, + qemuMigrationParams *saveParams, unsigned int flags, virDomainAsyncJob asyncJob) { @@ -486,9 +488,10 @@ qemuSaveImageCreate(virQEMUDriver *driver, int ret = -1; int fd = -1; virFileWrapperFd *wrapperFd = NULL; + bool sparse = data->header.format == QEMU_SAVE_FORMAT_SPARSE; /* Obtain the file handle. 
*/ - fd = qemuSaveImageCreateFd(driver, vm, path, wrapperFd, &needUnlink, flags); + fd = qemuSaveImageCreateFd(driver, vm, path, wrapperFd, sparse, &needUnlink, flags); if (fd < 0) goto cleanup; @@ -497,7 +500,7 @@ qemuSaveImageCreate(virQEMUDriver *driver, goto cleanup; /* Perform the migration */ - if (qemuMigrationSrcToFile(driver, vm, fd, compressor, asyncJob) < 0) + if (qemuMigrationSrcToFile(driver, vm, &fd, compressor, saveParams, flags, asyncJob) < 0) goto cleanup; /* Touch up file header to mark image complete. */ @@ -505,14 +508,18 @@ qemuSaveImageCreate(virQEMUDriver *driver, /* Reopen the file to touch up the header, since we aren't set * up to seek backwards on wrapperFd. The reopened fd will * trigger a single page of file system cache pollution, but - * that's acceptable. */ - if (VIR_CLOSE(fd) < 0) { - virReportSystemError(errno, _("unable to close %1$s"), path); - goto cleanup; - } + * that's acceptable. + * If using mapped-ram, the fd was passed to qemu, so no need + * to close it. 
*/ + if (!sparse) { + if (VIR_CLOSE(fd) < 0) { + virReportSystemError(errno, _("unable to close %1$s"), path); + goto cleanup; + } - if (qemuDomainFileWrapperFDClose(vm, wrapperFd) < 0) - goto cleanup; + if (qemuDomainFileWrapperFDClose(vm, wrapperFd) < 0) + goto cleanup; + } if ((fd = qemuDomainOpenFile(cfg, vm->def, path, O_WRONLY, NULL)) < 0 || virQEMUSaveDataFinish(data, &fd, path) < 0) @@ -569,8 +576,8 @@ qemuSaveImageGetCompressionProgram(const char *imageFormat, if ((ret = qemuSaveFormatTypeFromString(imageFormat)) < 0) goto error; - if (ret == QEMU_SAVE_FORMAT_RAW) - return QEMU_SAVE_FORMAT_RAW; + if (ret == QEMU_SAVE_FORMAT_RAW || ret == QEMU_SAVE_FORMAT_SPARSE) + return ret; if (!(prog = virFindFileInPath(imageFormat))) goto error; diff --git a/src/qemu/qemu_saveimage.h b/src/qemu/qemu_saveimage.h index a5b55c6b10..2b3d839e5b 100644 --- a/src/qemu/qemu_saveimage.h +++ b/src/qemu/qemu_saveimage.h @@ -138,6 +138,7 @@ qemuSaveImageCreate(virQEMUDriver *driver, const char *path, virQEMUSaveData *data, virCommand *compressor, + qemuMigrationParams *saveParams, unsigned int flags, virDomainAsyncJob asyncJob); diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c index ed140dd41c..3088b28716 100644 --- a/src/qemu/qemu_snapshot.c +++ b/src/qemu/qemu_snapshot.c @@ -1644,6 +1644,8 @@ qemuSnapshotCreateActiveExternal(virQEMUDriver *driver, /* do the memory snapshot if necessary */ if (memory) { + g_autoptr(qemuMigrationParams) snap_params = NULL; + /* check if migration is possible */ if (!qemuMigrationSrcIsAllowed(vm, false, VIR_ASYNC_JOB_SNAPSHOT, 0)) goto cleanup; @@ -1661,6 +1663,13 @@ qemuSnapshotCreateActiveExternal(virQEMUDriver *driver, "snapshot", false)) < 0) goto cleanup; + if (format == QEMU_SAVE_FORMAT_SPARSE) { + virReportError(VIR_ERR_OPERATION_FAILED, + _("Snapshots do not support image format %1$s"), + qemuSaveFormatTypeToString(format)); + goto cleanup; + } + if (!(xml = qemuDomainDefFormatLive(driver, priv->qemuCaps, vm->def, 
priv->origCPU, true, true)) || @@ -1675,8 +1684,11 @@ qemuSnapshotCreateActiveExternal(virQEMUDriver *driver, memory_existing = virFileExists(snapdef->memorysnapshotfile); + if (!(snap_params = qemuMigrationParamsNew())) + goto cleanup; + if ((ret = qemuSaveImageCreate(driver, vm, snapdef->memorysnapshotfile, - data, compressor, 0, + data, compressor, snap_params, 0, VIR_ASYNC_JOB_SNAPSHOT)) < 0) goto cleanup; -- 2.43.0