On Thu, Nov 17, 2022 at 21:39:58 +0530, Shaleen Bathla wrote: > ping Sorry I was sick so didn't get to this until now. > On Fri, Nov 11, 2022 at 02:54:38PM +0530, Shaleen Bathla wrote: > > Problem: > > libvirt has a 5 second timeout (generally) for hotplug/unplug > > operations which can time out due to heavy load in guest. > > > > vcpu hotunplug occurs one vcpu at a time. > > But, if we perform hotplug-unplug repeatedly, > > Case 1: qemu sends multiple timedout vcpu unplug notification before > > libvirt processed first one. > > Case 2: when attempting a hotplug, qemu finishes unplug of another cpu > > > > libvirt waits for an async event notification from qemu regarding > > successful vcpu delete. > > qemu deletes its vcpu, vcpuinfo and sends notification to libvirt. > > libvirt handles vcpu delete notification, and updates vcpuinfo > > received from qemu in qemuDomainRefreshVcpuInfo(). > > > > qemu's vcpuinfo during refresh will not contain other deleted, sent vcpu > > qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending > > timedout vcpu(s) which qemu has deleted and notified libvirt. > > The overwrite resets other timedout vcpu's alias to NULL and tid to 0. > > > > The error is then seen when validating tid of vcpus. > > Example error log: > > "internal error: qemu didn't report thread id for vcpu 'XX'" > > > > Solution: > > Clear vcpu alias of only the vcpu that hotplug API calls. > > Other vcpu-alias won't get reset when vcpuinfo refresh occurs. > > Validation is also done for corresponding vcpus only, not all. > > > > Co-authored-by: Partha Satapathy <partha.satapathy@xxxxxxxxxx> > > Signed-off-by: Shaleen Bathla <shaleen.bathla@xxxxxxxxxx> > > --- > > src/qemu/qemu_domain.c | 6 ++++-- > > src/qemu/qemu_hotplug.c | 39 +++++++++++++++++++++++++++++++-------- > > 2 files changed, 35 insertions(+), 10 deletions(-) > > > > diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c > > index 3435da5bdc4c..6ae842d0e32f 100644 > > --- a/src/qemu/qemu_domain.c > > +++ b/src/qemu/qemu_domain.c > > @@ -9773,8 +9773,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm, > > vcpupriv->vcpus = info[i].vcpus; > > VIR_FREE(vcpupriv->type); > > vcpupriv->type = g_steal_pointer(&info[i].type); > > - VIR_FREE(vcpupriv->alias); > > - vcpupriv->alias = g_steal_pointer(&info[i].alias); > > + if (info[i].alias) { > > + VIR_FREE(vcpupriv->alias); > > + vcpupriv->alias = g_steal_pointer(&info[i].alias); > > + } > > virJSONValueFree(vcpupriv->props); > > vcpupriv->props = g_steal_pointer(&info[i].props); > > vcpupriv->enable_id = info[i].id; > > diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c > > index da92ced2f444..f11b90a220ac 100644 > > --- a/src/qemu/qemu_hotplug.c > > +++ b/src/qemu/qemu_hotplug.c > > @@ -6090,6 +6090,8 @@ qemuDomainRemoveVcpu(virDomainObj *vm, > > unsigned int nvcpus = vcpupriv->vcpus; > > virErrorPtr save_error = NULL; > > size_t i; > > + bool hasVcpuPids = qemuDomainHasVcpuPids(vm); > > + bool rollback = false; > > > > if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0) > > return -1; > > @@ -6097,14 +6099,26 @@ qemuDomainRemoveVcpu(virDomainObj *vm, > > /* validation requires us to set the expected state prior to calling it */ This comment no longer makes sense after this commit. > > for (i = vcpu; i < vcpu + nvcpus; i++) { > > vcpuinfo = virDomainDefGetVcpu(vm->def, i); > > + vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo); > > + > > vcpuinfo->online = false; > > + VIR_FREE(vcpupriv->alias); /* clear vcpu alias of only this vcpu */ > > + > > + if (vcpupriv->tid != 0) { > > + if (hasVcpuPids) > > + virReportError(VIR_ERR_INTERNAL_ERROR, > > + _("qemu reported thread id for inactive vcpu '%zu'"), i); Actually we can report this without checking hasVcpuPids as if qemu doesn't report thread ids the vcpu will not have any. > > + > > + rollback = true; > > + break; > > + } > > } > > > > - if (qemuDomainValidateVcpuInfo(vm) < 0) { > > + if (rollback) { > > /* rollback vcpu count if the setting has failed */ > > virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false); > > > > - for (i = vcpu; i < vcpu + nvcpus; i++) { > > + for (; i >= vcpu; i--) { > > vcpuinfo = virDomainDefGetVcpu(vm->def, i); > > vcpuinfo->online = true; We don't need to roll back when we don't set it in the first place. > > } > > @@ -6141,6 +6155,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm, > > return; > > } > > } > > + > > + VIR_DEBUG("%s not found in vcpulist of domain %s ", > > + alias, vm->def->name); We prefer quotes around substitutions to make it clear what's the added part. > > } > > > > > > @@ -6209,6 +6226,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver, > > int rc; > > int oldvcpus = virDomainDefGetVcpus(vm->def); > > size_t i; > > + bool hasVcpuPids = qemuDomainHasVcpuPids(vm); > > > > if (!qemuDomainSupportsNewVcpuHotplug(vm)) { > > virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s", > > @@ -6245,14 +6263,19 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver, > > > > vcpuinfo->online = true; > > > > - if (vcpupriv->tid > 0 && > > - qemuProcessSetupVcpu(vm, i, true) < 0) > > - return -1; > > + if (vcpupriv->tid > 0) { > > + if (qemuProcessSetupVcpu(vm, i, true) < 0) { > > + return -1; > > + } > > + } else { > > + if (hasVcpuPids) { > > + virReportError(VIR_ERR_INTERNAL_ERROR, > > + _("qemu didn't report thread id for vcpu '%zu'"), i); > > + return -1; Well this way the code doesn't setup cgroups for any other cpus. This has to be checked at the end of the loop. > > + } > > + } > > } > > > > - if (qemuDomainValidateVcpuInfo(vm) < 0) > > - return -1; > > - > > qemuDomainVcpuPersistOrder(vm->def); > > > > if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0) I'll be posting a fixed version with all my suggestions addressed. Please make sure to give it a test.