On Fri, Jul 16, 2010 at 09:38:10AM -0400, Chris Lalancette wrote: > If you try to execute two concurrent migrations p2p > from A->B and B->A, the two libvirtd's will deadlock > trying to perform the migrations. The reason for this is > that in p2p migration, the libvirtd's are responsible for > making the RPC Prepare, Migrate, and Finish calls. However, > they are currently holding the driver lock while doing so, > which basically guarantees deadlock in this scenario. > > This patch fixes the situation by adding > qemuDomainObjEnterRemoteWithDriver and > qemuDomainObjExitRemoteWithDriver helper methods. The Enter > take an additional object reference, then drops both the > domain object lock and the driver lock. The Exit takes > both the driver and domain object lock, then drops the > reference. Adding calls to these Enter and Exit helpers > around remote calls in the various migration methods > seems to fix the problem for me in testing. > > This should make the situation safe. The additional domain > object reference ensures that the domain object won't disappear > while this operation is happening. The BeginJob that is called > inside of qemudDomainMigratePerform ensures that we can't execute a > second migrate (or shutdown, or save, etc) job while the > migration is active. Finally, the additional check on the state > of the vm after we reacquire the locks ensures that we can't > be surprised by an external event (domain crash, etc). > > Signed-off-by: Chris Lalancette <clalance@xxxxxxxxxx> > --- > src/qemu/qemu_driver.c | 57 ++++++++++++++++++++++++++++++++++++++++++------ > 1 files changed, 50 insertions(+), 7 deletions(-) > > diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c > index 5f2607c..1eff4bc 100644 > --- a/src/qemu/qemu_driver.c > +++ b/src/qemu/qemu_driver.c > @@ -531,6 +531,21 @@ static void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver, virD > } > } > > +static void qemuDomainObjEnterRemoteWithDriver(struct qemud_driver *driver, > + virDomainObjPtr obj) > +{ > + virDomainObjRef(obj); > + virDomainObjUnlock(obj); > + qemuDriverUnlock(driver); > +} > + > +static void qemuDomainObjExitRemoteWithDriver(struct qemud_driver *driver, > + virDomainObjPtr obj) > +{ > + qemuDriverLock(driver); > + virDomainObjLock(obj); > + virDomainObjUnref(obj); > +} > > static int qemuCgroupControllerActive(struct qemud_driver *driver, > int controller) > @@ -10753,14 +10768,25 @@ static int doTunnelMigrate(virDomainPtr dom, > /* virStreamNew only fails on OOM, and it reports the error itself */ > goto cleanup; > > + qemuDomainObjEnterRemoteWithDriver(driver, vm); > internalret = dconn->driver->domainMigratePrepareTunnel(dconn, st, > flags, dname, > resource, dom_xml); > + qemuDomainObjExitRemoteWithDriver(driver, vm); > > if (internalret < 0) > /* domainMigratePrepareTunnel sets the error for us */ > goto cleanup; > > + /* the domain may have shutdown or crashed while we had the locks dropped > + * in qemuDomainObjEnterRemoteWithDriver, so check again > + */ > + if (!virDomainObjIsActive(vm)) { > + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("guest unexpectedly quit")); > + goto cleanup; > + } > + > /* 3. start migration on source */ > qemuDomainObjEnterMonitorWithDriver(driver, vm); > if (flags & VIR_MIGRATE_NON_SHARED_DISK) > @@ -10833,8 +10859,10 @@ cancel: > > finish: > dname = dname ? dname : dom->name; > + qemuDomainObjEnterRemoteWithDriver(driver, vm); > ddomain = dconn->driver->domainMigrateFinish2 > (dconn, dname, NULL, 0, uri, flags, retval); > + qemuDomainObjExitRemoteWithDriver(driver, vm); > > cleanup: > if (client_sock != -1) > @@ -10873,19 +10901,32 @@ static int doNonTunnelMigrate(virDomainPtr dom, > virDomainPtr ddomain = NULL; > int retval = -1; > char *uri_out = NULL; > + int rc; > > + qemuDomainObjEnterRemoteWithDriver(driver, vm); > /* NB we don't pass 'uri' into this, since that's the libvirtd > * URI in this context - so we let dest pick it */ > - if (dconn->driver->domainMigratePrepare2(dconn, > - NULL, /* cookie */ > - 0, /* cookielen */ > - NULL, /* uri */ > - &uri_out, > - flags, dname, > - resource, dom_xml) < 0) > + rc = dconn->driver->domainMigratePrepare2(dconn, > + NULL, /* cookie */ > + 0, /* cookielen */ > + NULL, /* uri */ > + &uri_out, > + flags, dname, > + resource, dom_xml); > + qemuDomainObjExitRemoteWithDriver(driver, vm); > + if (rc < 0) > /* domainMigratePrepare2 sets the error for us */ > goto cleanup; > > + /* the domain may have shutdown or crashed while we had the locks dropped > + * in qemuDomainObjEnterRemoteWithDriver, so check again > + */ > + if (!virDomainObjIsActive(vm)) { > + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("guest unexpectedly quit")); > + goto cleanup; > + } > + > if (uri_out == NULL) { > qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", > _("domainMigratePrepare2 did not set uri")); > @@ -10899,8 +10940,10 @@ static int doNonTunnelMigrate(virDomainPtr dom, > > finish: > dname = dname ? dname : dom->name; > + qemuDomainObjEnterRemoteWithDriver(driver, vm); > ddomain = dconn->driver->domainMigrateFinish2 > (dconn, dname, NULL, 0, uri_out, flags, retval); > + qemuDomainObjExitRemoteWithDriver(driver, vm); > > if (ddomain) > virUnrefDomain(ddomain); ACK Daniel -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list