On Thu, Dec 08, 2022 at 14:31:00 +0100, Pavel Hrdina wrote: > When deleting snapshot we are starting block-commit job over all disks > that are part of the snapshot. > > This operation may fail as it writes data changes to the backing qcow2 > image so we need to wait for all the disks to finish the operation and > wait for correct signal from QEMU. If deleting active snapshot we will > get `ready` signal and for inactive snapshots we need to disable > autofinalize in order to get `pending` signal. > > At this point if commit for any disk fails for some reason and we abort > the VM is still in consistent state and user can fix the reason why the > deletion failed. > > After that we do `pivot` or `finalize` if it's active snapshot or not to > finish the block job. It still may fail but there is nothing else we can > do about it. > > Signed-off-by: Pavel Hrdina <phrdina@xxxxxxxxxx> > --- > src/qemu/qemu_snapshot.c | 266 +++++++++++++++++++++++++++++++++++---- > 1 file changed, 245 insertions(+), 21 deletions(-) > > diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c > index 882224b0a7..c493a3e94f 100644 > --- a/src/qemu/qemu_snapshot.c > +++ b/src/qemu/qemu_snapshot.c > @@ -2394,6 +2394,207 @@ qemuSnapshotChildrenReparent(void *payload, > } > > > +/* Deleting external snapshot is started by running qemu block-commit job. > + * We need to wait for all block-commit jobs to be 'ready' or 'pending' to > + * continue with external snapshot deletion. */ > +static int > +qemuSnapshotJobIsRunning(qemuBlockjobState state) This is more about snapshot-deletion blockjobs specifically... so perhaps: qemuSnapshotDeleteBlockjobIsRunning ? 
> +{ > + switch (state) { > + case QEMU_BLOCKJOB_STATE_COMPLETED: > + case QEMU_BLOCKJOB_STATE_FAILED: > + case QEMU_BLOCKJOB_STATE_CANCELLED: > + case QEMU_BLOCKJOB_STATE_READY: > + case QEMU_BLOCKJOB_STATE_CONCLUDED: > + case QEMU_BLOCKJOB_STATE_PENDING: > + return 0; > + > + case QEMU_BLOCKJOB_STATE_NEW: > + case QEMU_BLOCKJOB_STATE_RUNNING: > + case QEMU_BLOCKJOB_STATE_ABORTING: > + case QEMU_BLOCKJOB_STATE_PIVOTING: > + return 1; > + > + case QEMU_BLOCKJOB_STATE_LAST: > + break; > + } > + > + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("invalid block job state")); > + return -1; Can this happen? > +} > + > + > +/* When finishing or aborting qemu blockjob we only need to know if the > + * job is still active or not. */ > +static int > +qemuSnapshotJobIsActive(qemuBlockjobState state) Same naming pattern as above. > +{ > + switch (state) { > + case QEMU_BLOCKJOB_STATE_COMPLETED: > + case QEMU_BLOCKJOB_STATE_FAILED: > + case QEMU_BLOCKJOB_STATE_CANCELLED: > + case QEMU_BLOCKJOB_STATE_CONCLUDED: > + return 0; > + > + case QEMU_BLOCKJOB_STATE_READY: > + case QEMU_BLOCKJOB_STATE_NEW: > + case QEMU_BLOCKJOB_STATE_RUNNING: > + case QEMU_BLOCKJOB_STATE_ABORTING: > + case QEMU_BLOCKJOB_STATE_PENDING: > + case QEMU_BLOCKJOB_STATE_PIVOTING: > + return 1; > + > + case QEMU_BLOCKJOB_STATE_LAST: > + break; > + } > + > + virReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("invalid block job state")); > + return -1; -||- > +} > + > + > +/* Wait for qemu blockjob to finish 'block-commit' operation until it is > + * ready to be finished by calling 'block-pivot' or 'block-finalize'. 
*/ > +static int > +qemuSnapshotJobRunning(virDomainObj *vm, > + qemuBlockJobData *job) > +{ > + int rc; > + qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_SNAPSHOT); > + > + while ((rc = qemuSnapshotJobIsRunning(job->state)) > 0) { > + if (qemuDomainObjWait(vm) < 0) > + return -1; > + qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_SNAPSHOT); > + } > + > + if (rc < 0) > + return -1; > + > + return 0; > +} > + > + > +/* Wait for qemu blockjob to be done after 'block-pivot' or 'block-finalize' > + * was started. */ > +static int > +qemuSnapshotJobFinishing(virDomainObj *vm, > + qemuBlockJobData *job) > +{ > + int rc; > + qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_SNAPSHOT); > + > + while ((rc = qemuSnapshotJobIsActive(job->state)) > 0) { > + if (qemuDomainObjWait(vm) < 0) > + return -1; > + qemuBlockJobUpdate(vm, job, VIR_ASYNC_JOB_SNAPSHOT); > + } > + > + if (rc < 0) > + return -1; > + > + return 0; > +} And both functions above have confusing naming too. > + > + > +static int > +qemuSnapshotDiscardExternal(virDomainObj *vm, > + GSList *externalData) > +{ > + GSList *cur = NULL; > + > + for (cur = externalData; cur; cur = g_slist_next(cur)) { > + qemuSnapshotDeleteExternalData *data = cur->data; > + virTristateBool autofinalize = VIR_TRISTATE_BOOL_NO; > + unsigned int commitFlags = VIR_DOMAIN_BLOCK_COMMIT_DELETE; > + > + if (data->domDisk->src == data->diskSrc) { > + commitFlags |= VIR_DOMAIN_BLOCK_COMMIT_ACTIVE; > + autofinalize = VIR_TRISTATE_BOOL_YES; > + } > + > + data->job = qemuBlockCommit(vm, > + data->domDisk, > + data->parentDiskSrc, > + data->diskSrc, > + data->prevDiskSrc, > + 0, > + VIR_ASYNC_JOB_SNAPSHOT, > + autofinalize, > + commitFlags); [1] > + > + if (!data->job) > + goto error; > + } > + > + for (cur = externalData; cur; cur = g_slist_next(cur)) { > + qemuSnapshotDeleteExternalData *data = cur->data; > + > + if (qemuSnapshotJobRunning(vm, data->job) < 0) > + goto error; > + > + if (data->job->state == QEMU_BLOCKJOB_STATE_FAILED) { > + 
virReportError(VIR_ERR_INTERNAL_ERROR, > + _("block commit failed while deleting disk '%s' snapshot: '%s'"), > + data->snapDisk->name, data->job->errmsg); > + goto error; > + } > + } > + > + for (cur = externalData; cur; cur = g_slist_next(cur)) { > + qemuSnapshotDeleteExternalData *data = cur->data; > + > + if (data->job->state == QEMU_BLOCKJOB_STATE_READY) { > + if (qemuBlockPivot(vm, data->job, VIR_ASYNC_JOB_SNAPSHOT, NULL) < 0) > + goto error; > + } else if (data->job->state == QEMU_BLOCKJOB_STATE_PENDING) { > + if (qemuBlockFinalize(vm, data->job, VIR_ASYNC_JOB_SNAPSHOT) < 0) > + goto error; > + } > + > + if (qemuSnapshotJobFinishing(vm, data->job) < 0) > + goto error; > + > + if (data->job->state == QEMU_BLOCKJOB_STATE_FAILED) { > + virReportError(VIR_ERR_INTERNAL_ERROR, > + _("finishing block job failed while deleting disk '%s' snapshot: '%s'"), > + data->snapDisk->name, data->job->errmsg); > + goto error; > + } > + > + qemuBlockJobSyncEnd(vm, data->job, VIR_ASYNC_JOB_SNAPSHOT); > + } So 'externalData' is passed here from the caller, and the caller simply calls the equivalent of 'g_free' on the individual struct. Now this leaks the 'job' field allocated in [1] since you are still holding the reference. > + > + return 0; > + > + error: > + for (cur = externalData; cur; cur = g_slist_next(cur)) { > + qemuDomainObjPrivate *priv = vm->privateData; > + qemuSnapshotDeleteExternalData *data = cur->data; > + > + if (!data->job) > + continue; > + > + qemuBlockJobUpdate(vm, data->job, VIR_ASYNC_JOB_SNAPSHOT); > + > + if (qemuSnapshotJobIsActive(data->job->state)) { > + if (qemuDomainObjEnterMonitorAsync(vm, VIR_ASYNC_JOB_SNAPSHOT) == 0) { > + ignore_value(qemuMonitorBlockJobCancel(priv->mon, data->job->name, false)); > + qemuDomainObjExitMonitor(vm); > + > + data->job->state = QEMU_BLOCKJOB_STATE_ABORTING; > + } > + } > + > + qemuBlockJobSyncEnd(vm, data->job, VIR_ASYNC_JOB_SNAPSHOT); > + } And in this code path too. 
> + > + return -1; > +} > + > + > static int > qemuSnapshotDiscardMetadata(virDomainObj *vm, > virDomainMomentObj *snap,