QEMU emits DEVICE_DELETED events during a device's "unparent" callback, but some additional cleanup occurs afterward via "finalize". In most cases libvirt can ignore the latter, but in the case of VFIO the closing of a device's group FD happens here, which is something libvirt needs to wait for before attempting to bind a hostdev back to a host driver. In the case of powernv, and possibly other host archs as well, failing to do this can lead to the host device driver crashing due to necessary setup (like restoring default DMA windows for the IOMMU group) not being completed yet. We attempt to avoid this here by polling the QEMU process for open FDs referencing /dev/vfio/<group num> and waiting for a certain period of time. In practice the delay between the DEVICE_DELETED event and closing of the group FD seems to be around 6 seconds, so we set the max wait time at 15 seconds. If we time out we leave the device in the inactiveList and bound to VFIO. We only attempt the wait if the last hostdev from an IOMMU group is being detached and there's reasonable expectation that the group FD will be closed soon. There are alternatives to this approach, like adding a specific group delete event to QEMU and handling this cleanup via and asynchronous event handler, nut since we do a similar poll-wait for things like KVM device passthrough this simple approach is hopefully a reasonable starting point at least. Signed-off-by: Michael Roth <mdroth@xxxxxxxxxxxxxxxxxx> --- src/libvirt_private.syms | 1 + src/qemu/qemu_hotplug.c | 34 +++++++++++++++++++++++++++++-- src/util/virfile.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virfile.h | 1 + 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index ba7fa39..787267c 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1657,6 +1657,7 @@ virFileIsDir; virFileIsExecutable; virFileIsLink; virFileIsMountPoint; +virFileIsOpenByPid; virFileIsSharedFS; virFileIsSharedFSType; virFileLength; diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index af5ee6f..d200bab 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -68,6 +68,8 @@ VIR_LOG_INIT("qemu.qemu_hotplug"); /* Wait up to 5 seconds for device removal to finish. */ unsigned long long qemuDomainRemoveDeviceWaitTime = 1000ull * 5; +/* Wait up to 15 seconds for iommu group close */ +unsigned long long qemuDomainRemoveDeviceGroupWaitTime = 1000ull * 15; /** * qemuDomainPrepareDisk: @@ -3830,6 +3832,32 @@ qemuDomainRemoveSCSIVHostDevice(virQEMUDriverPtr driver, } static int +qemuDomainWaitForDeviceGroupClose(virDomainObjPtr vm, int iommu_group) +{ + char *group_path; + unsigned long long remaining_ms = qemuDomainRemoveDeviceGroupWaitTime; + int rc = -1; + + if (virAsprintf(&group_path, "/dev/vfio/%d", iommu_group) < 0) + return -1; + + while ((rc = virFileIsOpenByPid(group_path, vm->pid)) == 1) { + if (remaining_ms <= 0) + break; + usleep(100*1000); + remaining_ms -= 100; + } + + VIR_DEBUG("IOMMU group %d FD status: %d, wait time: %llu ms", + iommu_group, rc, + qemuDomainRemoveDeviceGroupWaitTime - remaining_ms); + + VIR_FREE(group_path); + return rc; +} + + +static int qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, virDomainObjPtr vm, virDomainHostdevDefPtr hostdev) @@ -3933,8 +3961,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr); if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr, iommu_group)) { - virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr, - iommu_group); + if (qemuDomainWaitForDeviceGroupClose(vm, iommu_group) == 0) { + virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr, + iommu_group); + } } } diff --git a/src/util/virfile.c b/src/util/virfile.c index d444b32..29b762f 100644 --- a/src/util/virfile.c +++ b/src/util/virfile.c @@ -4162,3 +4162,55 @@ virFileReadValueString(char **value, const char *format, ...) VIR_FREE(str); return ret; } + +int +virFileIsOpenByPid(const char *path, pid_t pid) +{ + struct dirent *ent; + DIR *filelist_dir; + char *filelist_path; + bool found = false; + int rc = -1; + + if (!path || !IS_ABSOLUTE_FILE_NAME(path)) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("invalid path: %s"), path ? path : "null"); + goto error; + } + + if (virAsprintf(&filelist_path, "/proc/%d/fd", pid) < 0) + goto error; + + if (virDirOpen(&filelist_dir, filelist_path) < 0) { + virReportError(VIR_ERR_INTERNAL_ERROR, + _("unable to open directory: %s"), filelist_path); + goto error; + } + + while (!found && + (rc = virDirRead(filelist_dir, &ent, filelist_path)) == 1) { + char *resolved_path = NULL; + char *link_path; + if ((rc = virAsprintf(&link_path, "%s/%s", filelist_path, ent->d_name)) < 0) + break; + if (virFileResolveLink(link_path, &resolved_path) == 0) { + if (resolved_path) { + VIR_DEBUG("checking absolute path for match (need: %s, got: %s)", + path, resolved_path); + if (STREQ(resolved_path, path)) + found = true; + VIR_FREE(resolved_path); + } + } + } + + VIR_DIR_CLOSE(filelist_dir); + error: + VIR_FREE(filelist_path); + + VIR_DEBUG("returning, rc: %d, found: %d", rc, found); + if (rc < 0) + return rc; + + return found ? 1 : 0; +} diff --git a/src/util/virfile.h b/src/util/virfile.h index 57ceb80..fb86786 100644 --- a/src/util/virfile.h +++ b/src/util/virfile.h @@ -347,6 +347,7 @@ int virFileReadValueScaledInt(unsigned long long *value, const char *format, ... int virFileReadValueString(char **value, const char *format, ...) ATTRIBUTE_FMT_PRINTF(2, 3); +int virFileIsOpenByPid(const char *path, pid_t pid); int virFileInData(int fd, int *inData, -- 2.7.4 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list