ping On Fri, Nov 11, 2022 at 02:54:38PM +0530, Shaleen Bathla wrote: > Problem: > libvirt has a 5 second timeout (generally) for hotplug/unplug > operations which can time out due to heavy load in guest. > > vcpu hotunplug occurs one vcpu at a time. > But, if we perform hotplug-unplug repeatedly, > Case 1: qemu sends multiple timedout vcpu unplug notification before > libvirt processed first one. > Case 2: when attempting a hotplug, qemu finishes unplug of another cpu > > libvirt waits for an async event notification from qemu regarding > successful vcpu delete. > qemu deletes its vcpu, vcpuinfo and sends notification to libvirt. > libvirt handles vcpu delete notification, and updates vcpuinfo > received from qemu in qemuDomainRefreshVcpuInfo(). > > qemu's vcpuinfo during refresh will not contain other deleted, sent vcpu > qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending > timedout vcpu(s) which qemu has deleted and notified libvirt. > The overwrite resets other timedout vcpu's alias to NULL and tid to 0. > > The error is then seen when validating tid of vcpus. > Example error log: > "internal error: qemu didn't report thread id for vcpu 'XX'" > > Solution: > Clear vcpu alias of only the vcpu that hotplug API calls. > Other vcpu-alias won't get reset when vcpuinfo refresh occurs. > Validation is also done for corresponding vcpus only, not all. > > Co-authored-by: Partha Satapathy <partha.satapathy@xxxxxxxxxx> > Signed-off-by: Shaleen Bathla <shaleen.bathla@xxxxxxxxxx> > --- > src/qemu/qemu_domain.c | 6 ++++-- > src/qemu/qemu_hotplug.c | 39 +++++++++++++++++++++++++++++++-------- > 2 files changed, 35 insertions(+), 10 deletions(-) > > diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c > index 3435da5bdc4c..6ae842d0e32f 100644 > --- a/src/qemu/qemu_domain.c > +++ b/src/qemu/qemu_domain.c > @@ -9773,8 +9773,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm, > vcpupriv->vcpus = info[i].vcpus; > VIR_FREE(vcpupriv->type); > vcpupriv->type = g_steal_pointer(&info[i].type); > - VIR_FREE(vcpupriv->alias); > - vcpupriv->alias = g_steal_pointer(&info[i].alias); > + if (info[i].alias) { > + VIR_FREE(vcpupriv->alias); > + vcpupriv->alias = g_steal_pointer(&info[i].alias); > + } > virJSONValueFree(vcpupriv->props); > vcpupriv->props = g_steal_pointer(&info[i].props); > vcpupriv->enable_id = info[i].id; > diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c > index da92ced2f444..f11b90a220ac 100644 > --- a/src/qemu/qemu_hotplug.c > +++ b/src/qemu/qemu_hotplug.c > @@ -6090,6 +6090,8 @@ qemuDomainRemoveVcpu(virDomainObj *vm, > unsigned int nvcpus = vcpupriv->vcpus; > virErrorPtr save_error = NULL; > size_t i; > + bool hasVcpuPids = qemuDomainHasVcpuPids(vm); > + bool rollback = false; > > if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0) > return -1; > @@ -6097,14 +6099,26 @@ qemuDomainRemoveVcpu(virDomainObj *vm, > /* validation requires us to set the expected state prior to calling it */ > for (i = vcpu; i < vcpu + nvcpus; i++) { > vcpuinfo = virDomainDefGetVcpu(vm->def, i); > + vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo); > + > vcpuinfo->online = false; > + VIR_FREE(vcpupriv->alias); /* clear vcpu alias of only this vcpu */ > + > + if (vcpupriv->tid != 0) { > + if (hasVcpuPids) > + virReportError(VIR_ERR_INTERNAL_ERROR, > + _("qemu reported thread id for inactive vcpu '%zu'"), i); > + > + rollback = true; > + break; > + } > } > > - if (qemuDomainValidateVcpuInfo(vm) < 0) { > + if (rollback) { > /* rollback vcpu count if the setting has failed */ > virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false); > > - for (i = vcpu; i < vcpu + nvcpus; i++) { > + for (; i >= vcpu; i--) { > vcpuinfo = virDomainDefGetVcpu(vm->def, i); > vcpuinfo->online = true; > } > @@ -6141,6 +6155,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm, > return; > } > } > + > + VIR_DEBUG("%s not found in vcpulist of domain %s ", > + alias, vm->def->name); > } > > > @@ -6209,6 +6226,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver, > int rc; > int oldvcpus = virDomainDefGetVcpus(vm->def); > size_t i; > + bool hasVcpuPids = qemuDomainHasVcpuPids(vm); > > if (!qemuDomainSupportsNewVcpuHotplug(vm)) { > virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s", > @@ -6245,14 +6263,19 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver, > > vcpuinfo->online = true; > > - if (vcpupriv->tid > 0 && > - qemuProcessSetupVcpu(vm, i, true) < 0) > - return -1; > + if (vcpupriv->tid > 0) { > + if (qemuProcessSetupVcpu(vm, i, true) < 0) { > + return -1; > + } > + } else { > + if (hasVcpuPids) { > + virReportError(VIR_ERR_INTERNAL_ERROR, > + _("qemu didn't report thread id for vcpu '%zu'"), i); > + return -1; > + } > + } > } > > - if (qemuDomainValidateVcpuInfo(vm) < 0) > - return -1; > - > qemuDomainVcpuPersistOrder(vm->def); > > if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0) > -- > 2.31.1 > > V1 Patch : > https://listman.redhat.com/archives/libvir-list/2022-November/235470.html > > V1 Patch Review comments : > https://listman.redhat.com/archives/libvir-list/2022-November/235534.html > > Changes in V2 since V1: > - Patch addresses review comments from Peter Krempa. > - Validation is done for per cpu hotplug entity instead of all