Certain hypervisors (like qemu/kvm) map the PCI bar(s) on the host when doing device passthrough. This can lead to a race condition where the hypervisor is still cleaning up the device while libvirt is trying to re-attach it to the host device driver. To avoid this situation, we look through /proc/iomem, and if the hypervisor is still holding onto the bar (denoted by the string in the matcher variable), then we can wait around a bit for that to clear up. v2: Thanks to review by DV, make sure we wait the full timeout per-device Signed-off-by: Chris Lalancette <clalance@xxxxxxxxxx> --- src/libvirt_private.syms | 1 + src/qemu/qemu_driver.c | 19 ++++++--- src/util/pci.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ src/util/pci.h | 1 + 4 files changed, 112 insertions(+), 6 deletions(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index c912d81..e42c090 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -439,6 +439,7 @@ pciGetDevice; pciFreeDevice; pciDettachDevice; pciReAttachDevice; +pciWaitForDeviceCleanup; pciResetDevice; pciDeviceSetManaged; pciDeviceGetManaged; diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index e14ed8f..55550ef 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -2277,12 +2277,19 @@ qemuDomainReAttachHostDevices(virConnectPtr conn, for (i = 0; i < pciDeviceListCount(pcidevs); i++) { pciDevice *dev = pciDeviceListGet(pcidevs, i); - if (pciDeviceGetManaged(dev) && - pciReAttachDevice(NULL, dev) < 0) { - virErrorPtr err = virGetLastError(); - VIR_ERROR(_("Failed to re-attach PCI device: %s"), - err ? err->message : ""); - virResetError(err); + int retries = 100; + if (pciDeviceGetManaged(dev)) { + while (pciWaitForDeviceCleanup(dev, "kvm_assigned_device") + && retries) { + usleep(100*1000); + retries--; + } + if (pciReAttachDevice(NULL, dev) < 0) { + virErrorPtr err = virGetLastError(); + VIR_ERROR(_("Failed to re-attach PCI device: %s"), + err ? err->message : ""); + virResetError(err); + } } } diff --git a/src/util/pci.c b/src/util/pci.c index 0defbfe..09535bd 100644 --- a/src/util/pci.c +++ b/src/util/pci.c @@ -883,6 +883,103 @@ pciReAttachDevice(virConnectPtr conn, pciDevice *dev) return pciUnBindDeviceFromStub(conn, dev, driver); } +/* Certain hypervisors (like qemu/kvm) map the PCI bar(s) on + * the host when doing device passthrough. This can lead to a race + * condition where the hypervisor is still cleaning up the device while + * libvirt is trying to re-attach it to the host device driver. To avoid + * this situation, we look through /proc/iomem, and if the hypervisor is + * still holding onto the bar (denoted by the string in the matcher variable), + * then we can wait around a bit for that to clear up. + * + * A typical /proc/iomem looks like this (snipped for brevity): + * 00010000-0008efff : System RAM + * 0008f000-0008ffff : reserved + * ... + * 00100000-cc9fcfff : System RAM + * 00200000-00483d3b : Kernel code + * 00483d3c-005c88df : Kernel data + * cc9fd000-ccc71fff : ACPI Non-volatile Storage + * ... + * d0200000-d02fffff : PCI Bus #05 + * d0200000-d021ffff : 0000:05:00.0 + * d0200000-d021ffff : e1000e + * d0220000-d023ffff : 0000:05:00.0 + * d0220000-d023ffff : e1000e + * ... + * f0000000-f0003fff : 0000:00:1b.0 + * f0000000-f0003fff : kvm_assigned_device + * + * Returns 0 if we are clear to continue, and 1 if the hypervisor is still + * holding onto the resource. + */ +int +pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher) +{ + FILE *fp; + char line[160]; + unsigned long long start, end; + int consumed; + char *rest; + unsigned long long domain; + int bus, slot, function; + int in_matching_device; + int ret; + size_t match_depth; + + fp = fopen("/proc/iomem", "r"); + if (!fp) { + /* If we failed to open iomem, we just basically ignore the error. The + * unbind might succeed anyway, and besides, it's very likely we have + * no way to report the error + */ + VIR_DEBUG0("Failed to open /proc/iomem, trying to continue anyway"); + return 0; + } + + ret = 0; + in_matching_device = 0; + match_depth = 0; + while (fgets(line, sizeof(line), fp) != 0) { + /* the logic here is a bit confusing. For each line, we look to + * see if it matches the domain:bus:slot.function we were given. + * If this line matches the DBSF, then any subsequent lines indented + * by 2 spaces are the PCI regions for this device. It's also + * possible that none of the PCI regions are currently mapped, in + * which case we have no indented regions. This code handles all + * of these situations + */ + if (in_matching_device && (strspn(line, " ") == (match_depth + 2))) { + if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2) + continue; + + rest = line + consumed; + if (STRPREFIX(rest, matcher)) { + ret = 1; + break; + } + } + else { + in_matching_device = 0; + if (sscanf(line, "%Lx-%Lx : %n", &start, &end, &consumed) != 2) + continue; + + rest = line + consumed; + if (sscanf(rest, "%Lx:%x:%x.%x", &domain, &bus, &slot, &function) != 4) + continue; + + if (domain != dev->domain || bus != dev->bus || slot != dev->slot || + function != dev->function) + continue; + in_matching_device = 1; + match_depth = strspn(line, " "); + } + } + + fclose(fp); + + return ret; +} + static char * pciReadDeviceID(pciDevice *dev, const char *id_name) { diff --git a/src/util/pci.h b/src/util/pci.h index a1a19e7..e6ab137 100644 --- a/src/util/pci.h +++ b/src/util/pci.h @@ -81,5 +81,6 @@ int pciDeviceFileIterate(virConnectPtr conn, int pciDeviceIsAssignable(virConnectPtr conn, pciDevice *dev, int strict_acs_check); +int pciWaitForDeviceCleanup(pciDevice *dev, const char *matcher); #endif /* __VIR_PCI_H__ */ -- 1.6.6 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list