On 23.08.2011 14:42, Daniel P. Berrange wrote: > On Tue, Aug 16, 2011 at 06:39:12PM +0200, Michal Privoznik wrote: >> If libvirt daemon gets restarted and there is (at least) one >> unresponsive qemu, the startup procedure hangs up. This patch creates >> one thread per vm in which we try to reconnect to monitor. Therefore, >> blocking in one thread will not affect other APIs. >> --- >> src/qemu/qemu_driver.c | 23 +++--------- >> src/qemu/qemu_process.c | 87 ++++++++++++++++++++++++++++++++++++++++++---- >> 2 files changed, 85 insertions(+), 25 deletions(-) >> >> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c >> index 421a98e..4574b6c 100644 >> --- a/src/qemu/qemu_driver.c >> +++ b/src/qemu/qemu_driver.c >> @@ -143,26 +143,15 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq >> >> virDomainObjLock(vm); >> virResetLastError(); >> - if (qemuDomainObjBeginJobWithDriver(data->driver, vm, >> - QEMU_JOB_MODIFY) < 0) { >> + if (vm->autostart && >> + !virDomainObjIsActive(vm) && >> + qemuDomainObjStart(data->conn, data->driver, vm, >> + false, false, >> + data->driver->autoStartBypassCache) < 0) { >> err = virGetLastError(); >> - VIR_ERROR(_("Failed to start job on VM '%s': %s"), >> + VIR_ERROR(_("Failed to autostart VM '%s': %s"), >> vm->def->name, >> err ? err->message : _("unknown error")); >> - } else { >> - if (vm->autostart && >> - !virDomainObjIsActive(vm) && >> - qemuDomainObjStart(data->conn, data->driver, vm, >> - false, false, >> - data->driver->autoStartBypassCache) < 0) { >> - err = virGetLastError(); >> - VIR_ERROR(_("Failed to autostart VM '%s': %s"), >> - vm->def->name, >> - err ? err->message : _("unknown error")); >> - } >> - >> - if (qemuDomainObjEndJob(data->driver, vm) == 0) >> - vm = NULL; >> } > > I'm not really seeing why this change is safe. You're removing the > job protection from the autostart code, but AFAICT, no where else > in the caller of this method is changed to acquire the job instead. > >> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c >> index 21e73a5..1daf6ae 100644 >> --- a/src/qemu/qemu_process.c >> +++ b/src/qemu/qemu_process.c >> @@ -820,6 +820,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm) >> { >> qemuDomainObjPrivatePtr priv = vm->privateData; >> int ret = -1; >> + qemuMonitorPtr mon = NULL; >> >> if (virSecurityManagerSetSocketLabel(driver->securityManager, vm) < 0) { >> VIR_ERROR(_("Failed to set security context for monitor for %s"), >> @@ -831,15 +832,29 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm) >> * deleted while the monitor is active */ >> virDomainObjRef(vm); >> >> - priv->mon = qemuMonitorOpen(vm, >> - priv->monConfig, >> - priv->monJSON, >> - &monitorCallbacks); >> + ignore_value(virTimeMs(&priv->monStart)); >> + virDomainObjUnlock(vm); >> + qemuDriverUnlock(driver); >> + >> + mon = qemuMonitorOpen(vm, >> + priv->monConfig, >> + priv->monJSON, >> + &monitorCallbacks); >> + >> + qemuDriverLock(driver); >> + virDomainObjLock(vm); >> + priv->monStart = 0; > > I'm also not convinced it is safe to release the driver/domain > locks in this way. At the very *least* you need to be acquiring > an extra reference on the virDomainObjPtr, ideally using the > EnterMonitor/ExitMonitor methods, otherwise some other thread > may destroy the virDomainObjPtr while it is unlocked. I cannot use EnterMonitor because it access priv->mon which does not exist at this time. > > Also what is the 'monStart' field used for ? For gathering statistics: virsh domcontrol <domain> > >> >> /* Safe to ignore value since ref count was incremented above */ >> - if (priv->mon == NULL) >> + if (mon == NULL) >> ignore_value(virDomainObjUnref(vm)); >> >> + if (!virDomainObjIsActive(vm)) { >> + qemuMonitorClose(mon); >> + goto error; >> + } >> + priv->mon = mon; >> + >> if (virSecurityManagerClearSocketLabel(driver->securityManager, vm) < 0) { >> VIR_ERROR(_("Failed to clear security context for monitor for %s"), >> vm->def->name); >> @@ -2484,21 +2499,25 @@ qemuProcessRecoverJob(struct qemud_driver *driver, >> struct qemuProcessReconnectData { >> virConnectPtr conn; >> struct qemud_driver *driver; >> + void *payload; >> }; >> /* >> * Open an existing VM's monitor, re-detect VCPU threads >> * and re-reserve the security labels in use >> */ >> static void >> -qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaque) >> +qemuProcessReconnect(void *opaque) >> { >> - virDomainObjPtr obj = payload; >> struct qemuProcessReconnectData *data = opaque; >> struct qemud_driver *driver = data->driver; >> + virDomainObjPtr obj = data->payload; >> qemuDomainObjPrivatePtr priv; >> virConnectPtr conn = data->conn; >> struct qemuDomainJobObj oldjob; >> >> + VIR_FREE(data); >> + >> + qemuDriverLock(driver); >> virDomainObjLock(obj); >> >> qemuDomainObjRestoreJob(obj, &oldjob); >> @@ -2572,12 +2591,15 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa >> if (virDomainObjUnref(obj) > 0) >> virDomainObjUnlock(obj); >> >> + qemuDriverUnlock(driver); >> + >> return; >> >> error: >> if (!virDomainObjIsActive(obj)) { >> if (virDomainObjUnref(obj) > 0) >> virDomainObjUnlock(obj); >> + qemuDriverUnlock(driver); >> return; >> } >> >> @@ -2591,6 +2613,55 @@ error: >> else >> virDomainObjUnlock(obj); >> } >> + qemuDriverUnlock(driver); >> +} >> + >> +static void >> +qemuProcessReconnectHelper(void *payload, >> + const void *name ATTRIBUTE_UNUSED, >> + void *opaque) >> +{ >> + virThread thread; >> + struct qemuProcessReconnectData *src = opaque; >> + struct qemuProcessReconnectData *data; >> + >> + if (VIR_ALLOC(data) < 0) { >> + virReportOOMError(); >> + return; >> + } >> + >> + memcpy(data, src, sizeof(*data)); >> + data->payload = payload; >> + >> + /* This iterator is called with driver being locked. >> + * However, we need to unlock it so qemuProcessReconnect, >> + * which will run in separate thread can lock it itself >> + * (if needed). The whole idea behind: qemuProcessReconnect: >> + * 1. lock driver >> + * 2. just before monitor reconnect, do lightweight MonitorEnter >> + * (unlock VM & driver) >> + * 3. Reconnect to monitor >> + * 4. do lightweight MonitorExit (lock driver & VM) >> + * 5. continue reconnect process >> + * 6. unlock driver >> + * >> + * It is necessary to NOT hold driver lock for the entire run >> + * of reconnect, otherwise we will get blocked if there is >> + * unresponsive qemu. >> + * However, iterating over hash table MUST be done on locked >> + * driver. >> + * >> + * NB, we can't do normal MonitorEnter & MonitorExit because >> + * these two lock the monitor lock, which does not exists in >> + * this early phase. >> + */ >> + qemuDriverUnlock(src->driver); >> + if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) { >> + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", >> + _("Could not create thread. QEMU initialization " >> + "might be incomplete")); >> + } >> + qemuDriverLock(src->driver); >> } > > This comment & code is not quite right. Since the 'qemuProcessReconnect' > function runs in a separate thread, there is no need to unlock the > src->driver here. Unlocking the driver is in fact dangerous, since this > means something can (accidentally or not) change the driver->domains > hash table, while we're in the middle of iterating over it. > > If you remove these qemuDriverUnlock/Lock calls here, it simply means > that the thread you spawn will start, but immediately block waiting > for the lock. When the thread that triggers reconnect finally > unlocks the driver later, all the threads will be able to start > executing. > > The other unsafe thing here though, is that the main thread owns the > reference of virDomainObjPtr, and something could happily destroy > this reference, before the thread you spawn gets a chance to run. > > So *before* spawning the thread, you need to start a job on the > virDomainObjPtr. The thread will then actually do the job work > and release the job when it is done. > > So this code should look something more like > > virDomainObjPtr obj = payload; > > ...begin job on obj... > if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) { > qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", > _("Could not create thread to reconnect to domain"); > ...end job on obj... > ...destroy the running guest PID if any.. > } else { > ...nothing. the thread ends the job... > } > > > Regards, > Daniel -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list