[libvirt] [PATCH 1/6] Add pci utility functions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add implementations of detach, reattach and reset for
PCI devices.

Background to this code can be found here:

  http://marc.info/?l=kvm&m=123454366317045

Some notes:

  * pci-stub was first introduced in 2.6.29; if it's
    not available, then detach can only unbind the
    device from the host driver

  * remove_id is queued up for 2.6.30; in its absence
    reattach cannot re-bind the device to the original
    driver

  * If the device supports Function Level Reset, we
    just don't bother doing the reset - KVM will do
    this before and after a guest runs

  * Currently, if a reset would affect another device
    on the same bus, or another function on the same
    multi-function device, then we just refuse to do
    the reset. In future, we could allow the reset
    as long as it doesn't affect a device in use by
    the host or assigned to another guest

  * The device reset code is similar in concept to
    some of Xen's code

  * This code should use libpciaccess - libpci isn't
    an option because we can't handle malloc failures
    with it

Signed-off-by: Mark McLoughlin <markmc@xxxxxxxxxx>
---
 configure.in             |    6 +
 po/POTFILES.in           |    1 +
 src/Makefile.am          |    1 +
 src/libvirt_private.syms |    8 +
 src/pci.c                |  829 ++++++++++++++++++++++++++++++++++++++++++++++
 src/pci.h                |   44 +++
 6 files changed, 889 insertions(+), 0 deletions(-)
 create mode 100644 src/pci.c
 create mode 100644 src/pci.h

diff --git a/configure.in b/configure.in
index 72a64dd..a5b8208 100644
--- a/configure.in
+++ b/configure.in
@@ -128,6 +128,8 @@ AC_PATH_PROG([UDEVADM], [udevadm], [],
 	[/sbin:/usr/sbin:/usr/local/sbin:$PATH])
 AC_PATH_PROG([UDEVSETTLE], [udevsettle], [],
 	[/sbin:/usr/sbin:/usr/local/sbin:$PATH])
+AC_PATH_PROG([MODPROBE], [modprobe], [],
+	[/sbin:/usr/sbin:/usr/local/sbin:$PATH])
 
 AC_DEFINE_UNQUOTED([DNSMASQ],["$DNSMASQ"],
         [Location or name of the dnsmasq program])
@@ -141,6 +143,10 @@ if test -n "$UDEVSETTLE"; then
   AC_DEFINE_UNQUOTED([UDEVSETTLE],["$UDEVSETTLE"],
         [Location or name of the udevsettle program])
 fi
+if test -n "$MODPROBE"; then
+  AC_DEFINE_UNQUOTED([MODPROBE],["$MODPROBE"],
+        [Location or name of the modprobe program])
+fi
 
 dnl Specific dir for HTML output ?
 AC_ARG_WITH([html-dir], [AC_HELP_STRING([--with-html-dir=path],
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 7461f93..237d462 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -18,6 +18,7 @@ src/node_device_conf.c
 src/nodeinfo.c
 src/openvz_conf.c
 src/openvz_driver.c
+src/pci.c
 src/proxy_internal.c
 src/qemu_conf.c
 src/qemu_driver.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 3a798d2..a636e8a 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -48,6 +48,7 @@ UTIL_SOURCES =							\
 		iptables.c iptables.h				\
 		logging.c logging.h				\
 		memory.c memory.h				\
+		pci.c pci.h					\
 		qparams.c qparams.h				\
 		threads.c threads.h				\
 		threads-pthread.h				\
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index fdda478..c8c08ae 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -229,6 +229,14 @@ virNodeDeviceObjUnlock;
 virNodeDeviceAssignDef;
 
 
+# pci.h
+pciGetDevice;
+pciFreeDevice;
+pciDettachDevice;
+pciReAttachDevice;
+pciResetDevice;
+
+
 # qparams.h
 qparam_get_query;
 qparam_query_parse;
diff --git a/src/pci.c b/src/pci.c
new file mode 100644
index 0000000..fb8b792
--- /dev/null
+++ b/src/pci.c
@@ -0,0 +1,829 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ *
+ * Authors:
+ *     Mark McLoughlin <markmc@xxxxxxxxxx>
+ */
+
+#include <config.h>
+
+#include "pci.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "logging.h"
+#include "memory.h"
+#include "util.h"
+#include "virterror_internal.h"
+
+#define PCI_SYSFS "/sys/bus/pci/"
+#define PCI_ID_LEN 10   /* "XXXX XXXX" */
+#define PCI_ADDR_LEN 13 /* "XXXX:XX:XX.X" */
+
+struct _pciDevice {
+    unsigned      domain;
+    unsigned      bus;
+    unsigned      slot;
+    unsigned      function;
+
+    char          name[PCI_ADDR_LEN]; /* domain:bus:slot.function */
+    char          id[PCI_ID_LEN];     /* vendor product (hex, no "0x") */
+    char          path[PATH_MAX];     /* sysfs config space file for this device */
+    int           fd;                 /* config space fd, opened lazily by pciOpenConfig() */
+
+    unsigned      initted;            /* set once pciInitDevice() has filled the fields below */
+    unsigned      pcie_cap_pos;       /* offset of the PCI Express capability, 0 if not found */
+    unsigned      pci_pm_cap_pos;     /* offset of the Power Management capability, 0 if not found */
+    unsigned      has_flr : 1;        /* result of pciDetectFunctionLevelReset() */
+    unsigned      has_pm_reset : 1;   /* result of pciDetectPowerManagementReset() */
+};
+
+/* For virReportOOMError()  and virReportSystemError() */
+#define VIR_FROM_THIS VIR_FROM_NONE
+
+#define pciReportError(conn, code, fmt...)                     \
+    virReportErrorHelper(conn, VIR_FROM_NONE, code, __FILE__,  \
+                         __FUNCTION__, __LINE__, fmt)
+
+/* Specifications referenced in comments:
+ *  PCI30  - PCI Local Bus Specification 3.0
+ *  PCIe20 - PCI Express Base Specification 2.0
+ *  BR12   - PCI-to-PCI Bridge Architecture Specification 1.2
+ *  PM12   - PCI Bus Power Management Interface Specification 1.2
+ *  ECN_AF - Advanced Capabilities for Conventional PCI ECN
+ */
+
+/* Type 0 config space header length; PCI30 Section 6.1 Configuration Space Organization */
+#define PCI_CONF_LEN            0x100
+#define PCI_CONF_HEADER_LEN     0x40
+
+/* PCI30 6.2.1 */
+#define PCI_HEADER_TYPE         0x0e    /* Header type */
+#define  PCI_HEADER_TYPE_BRIDGE 0x1
+#define  PCI_HEADER_TYPE_MASK   0x7f
+#define  PCI_HEADER_TYPE_MULTI  0x80
+
+/* PCI30 6.2.1  Device Identification */
+#define PCI_CLASS_DEVICE        0x0a    /* Device class */
+
+/* Class Code for bridge; PCI30 D.7  Base Class 06h */
+#define PCI_CLASS_BRIDGE_PCI    0x0604
+
+/* PCI30 6.2.3  Device Status */
+#define PCI_STATUS              0x06    /* 16 bits */
+#define  PCI_STATUS_CAP_LIST    0x10    /* Support Capability List */
+
+/* PCI30 6.7  Capabilities List */
+#define PCI_CAPABILITY_LIST     0x34    /* Offset of first capability list entry */
+
+/* PM12 3.2.1  Capability Identifier */
+#define PCI_CAP_ID_PM           0x01    /* Power Management */
+/* PCI30 H Capability IDs */
+#define PCI_CAP_ID_EXP          0x10    /* PCI Express */
+/* ECN_AF 6.x.1.1  Capability ID for AF */
+#define PCI_CAP_ID_AF           0x13    /* Advanced Features */
+
+/* PCIe20 7.8.3  Device Capabilities Register (Offset 04h) */
+#define PCI_EXP_DEVCAP          0x4     /* Device capabilities */
+#define  PCI_EXP_DEVCAP_FLR     (1<<28) /* Function Level Reset */
+
+/* Header type 1 BR12 3.2 PCI-to-PCI Bridge Configuration Space Header Format */
+#define PCI_PRIMARY_BUS         0x18    /* BR12 3.2.5.2 Primary bus number */
+#define PCI_SECONDARY_BUS       0x19    /* BR12 3.2.5.3 Secondary bus number */
+#define PCI_SUBORDINATE_BUS     0x1a    /* BR12 3.2.5.4 Highest bus number behind the bridge */
+#define PCI_BRIDGE_CONTROL      0x3e
+/* BR12 3.2.5.18  Bridge Control Register */
+#define  PCI_BRIDGE_CTL_RESET   0x40    /* Secondary bus reset */
+
+/* PM12 3.2.4  Power Management Control/Status (Offset = 4) */
+#define PCI_PM_CTRL                4    /* PM control and status register */
+#define  PCI_PM_CTRL_STATE_MASK    0x3  /* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_STATE_D0      0x0  /* D0 state */
+#define  PCI_PM_CTRL_STATE_D3hot   0x3  /* D3 state */
+#define  PCI_PM_CTRL_NO_SOFT_RESET 0x4  /* No reset for D3hot->D0 */
+
+/* ECN_AF 6.x.1  Advanced Features Capability Structure */
+#define PCI_AF_CAP              0x3     /* Advanced features capabilities */
+#define  PCI_AF_CAP_FLR         0x2     /* Function Level Reset */
+
+static int /* 0 if dev->fd is usable, -1 if the config file could not be opened */
+pciOpenConfig(pciDevice *dev)
+{
+    int fd;
+
+    if (dev->fd > 0) /* already opened by an earlier call: reuse it */
+        return 0;
+
+    fd = open(dev->path, O_RDWR);
+    if (fd < 0) {
+        char ebuf[1024];
+        VIR_WARN(_("Failed to open config space file '%s': %s"),
+                 dev->path, virStrerror(errno, ebuf, sizeof(ebuf)));
+        return -1;
+    }
+    VIR_DEBUG("%s %s: opened %s", dev->id, dev->name, dev->path);
+    dev->fd = fd; /* kept open until pciFreeDevice() closes it */
+    return 0;
+}
+
+static int /* 0 on success, -1 if the config file can't be opened or read */
+pciRead(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen)
+{
+    memset(buf, 0, buflen); /* callers that ignore the result then see zeros on failure */
+
+    if (pciOpenConfig(dev) < 0)
+        return -1;
+
+    if (pread(dev->fd, buf, buflen, pos) < 0) { /* NOTE(review): a short read is not reported */
+        char ebuf[1024];
+        VIR_WARN(_("Failed to read from '%s' : %s"), dev->path,
+                 virStrerror(errno, ebuf, sizeof(ebuf)));
+        return -1;
+    }
+    return 0;
+}
+
+static uint8_t /* byte at config space offset @pos; 0 if the read fails */
+pciRead8(pciDevice *dev, unsigned pos)
+{
+    uint8_t buf;
+    pciRead(dev, pos, &buf, sizeof(buf)); /* result ignored: pciRead() zeroes buf first */
+    return buf;
+}
+
+static uint16_t /* 16-bit value at config space offset @pos; 0 if the read fails */
+pciRead16(pciDevice *dev, unsigned pos)
+{
+    uint8_t buf[2];
+    pciRead(dev, pos, &buf[0], sizeof(buf)); /* result ignored: pciRead() zeroes buf first */
+    return (buf[0] << 0) | (buf[1] << 8); /* least significant byte first */
+}
+
+static uint32_t /* 32-bit value at config space offset @pos; 0 if the read fails */
+pciRead32(pciDevice *dev, unsigned pos)
+{
+    uint8_t buf[4];
+    pciRead(dev, pos, &buf[0], sizeof(buf)); /* result ignored: pciRead() zeroes buf first */
+    return (buf[0] << 0) | (buf[1] << 8) | (buf[2] << 16) | ((uint32_t)buf[3] << 24); /* cast: int << 24 can overflow */
+}
+
+static int /* 0 on success, -1 if the config file can't be opened or written */
+pciWrite(pciDevice *dev, unsigned pos, uint8_t *buf, unsigned buflen)
+{
+    if (pciOpenConfig(dev) < 0)
+        return -1;
+
+    if (pwrite(dev->fd, buf, buflen, pos) < 0) { /* NOTE(review): a short write is not reported */
+        char ebuf[1024];
+        VIR_WARN(_("Failed to write to '%s' : %s"), dev->path,
+                 virStrerror(errno, ebuf, sizeof(ebuf)));
+        return -1;
+    }
+    return 0;
+}
+
+static void
+pciWrite16(pciDevice *dev, unsigned pos, uint16_t val)
+{
+    uint8_t buf[2] = { (val >> 0), (val >> 8) }; /* least significant byte first */
+    pciWrite(dev, pos, &buf[0], sizeof(buf)); /* a failure is only logged by pciWrite() */
+}
+
+static void
+pciWrite32(pciDevice *dev, unsigned pos, uint32_t val)
+{
+    uint8_t buf[4] = { (val >> 0), (val >> 8), (val >> 16), (val >> 24) }; /* least significant byte first */
+    pciWrite(dev, pos, &buf[0], sizeof(buf)); /* a failure is only logged by pciWrite() */
+}
+
+typedef int (*pciIterPredicate)(pciDevice *, pciDevice *);
+
+/* Iterate over available PCI devices calling @predicate to compare each
+ * one to @dev. The first match (or NULL) is stored in @matched; the caller
+ * frees it with pciFreeDevice(). Returns 0, or -1 on error since we don't
+ * want to assume it is safe to reset if there is an error.
+ */
+static int
+pciIterDevices(virConnectPtr conn,
+               pciIterPredicate predicate,
+               pciDevice *dev,
+               pciDevice **matched)
+{
+    DIR *dir;
+    struct dirent *entry;
+
+    *matched = NULL;
+    VIR_DEBUG("%s %s: iterating over " PCI_SYSFS "devices", dev->id, dev->name);
+
+    dir = opendir(PCI_SYSFS "devices");
+    if (!dir) {
+        VIR_WARN0("Failed to open " PCI_SYSFS "devices");
+        return -1;
+    }
+
+    while ((entry = readdir(dir))) {
+        unsigned domain, bus, slot, function;
+        pciDevice *try;
+
+        /* Ignore '.' and '..' */
+        if (entry->d_name[0] == '.')
+            continue;
+
+        if (sscanf(entry->d_name, "%x:%x:%x.%x", &domain, &bus, &slot, &function) < 4) {
+            VIR_WARN("Unusual entry in " PCI_SYSFS "devices: %s", entry->d_name);
+            continue;
+        }
+
+        try = pciGetDevice(conn, domain, bus, slot, function);
+        if (!try) {
+            closedir(dir); /* don't leak the directory stream on the error path */
+            return -1;
+        }
+        if (predicate(try, dev)) {
+            VIR_DEBUG("%s %s: iter matched on %s", dev->id, dev->name, try->name);
+            *matched = try;
+            break;
+        }
+        pciFreeDevice(conn, try);
+    }
+    closedir(dir);
+    return 0;
+}
+
+static uint8_t /* config space offset of @capability, or 0 if it is not found */
+pciFindCapabilityOffset(pciDevice *dev, unsigned capability)
+{
+    uint16_t status;
+    uint8_t pos;
+
+    status = pciRead16(dev, PCI_STATUS);
+    if (!(status & PCI_STATUS_CAP_LIST)) /* device reports no capability list */
+        return 0;
+
+    pos = pciRead8(dev, PCI_CAPABILITY_LIST);
+
+    /* Zero indicates last capability, capabilities can't
+     * be in the config space header and 0xff is returned
+     * by the kernel if we don't have access to this region
+     *
+     * Note: we're not handling loops or extended
+     * capabilities here.
+     */
+    while (pos >= PCI_CONF_HEADER_LEN && pos != 0xff) {
+        uint8_t capid = pciRead8(dev, pos); /* first byte of an entry is its ID */
+        if (capid == capability) {
+            VIR_DEBUG("%s %s: found cap 0x%.2x at 0x%.2x",
+                      dev->id, dev->name, capability, pos);
+            return pos;
+        }
+
+        pos = pciRead8(dev, pos + 1); /* second byte is the offset of the next entry */
+    }
+
+    VIR_DEBUG("%s %s: failed to find cap 0x%.2x", dev->id, dev->name, capability);
+
+    return 0;
+}
+
+static unsigned /* 1 if the device advertises PCIe or PCI AF Function Level Reset, else 0 */
+pciDetectFunctionLevelReset(pciDevice *dev)
+{
+    uint32_t caps;
+    uint8_t pos;
+
+    /* The PCIe Function Level Reset capability allows
+     * individual device functions to be reset without
+     * affecting any other functions on the device or
+     * any other devices on the bus. This is only common
+     * on SR-IOV NICs at the moment.
+     */
+    if (dev->pcie_cap_pos) {
+        caps = pciRead32(dev, dev->pcie_cap_pos + PCI_EXP_DEVCAP); /* 32-bit read: FLR is bit 28 */
+        if (caps & PCI_EXP_DEVCAP_FLR) {
+            VIR_DEBUG("%s %s: detected PCIe FLR capability", dev->id, dev->name);
+            return 1;
+        }
+    }
+
+    /* The PCI AF Function Level Reset capability is
+     * the same thing, except for conventional PCI
+     * devices. This is not common yet.
+     */
+    pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_AF);
+    if (pos) {
+        caps = pciRead16(dev, pos + PCI_AF_CAP);
+        if (caps & PCI_AF_CAP_FLR) {
+            VIR_DEBUG("%s %s: detected PCI FLR capability", dev->id, dev->name);
+            return 1;
+        }
+    }
+
+    VIR_DEBUG("%s %s: no FLR capability found", dev->id, dev->name);
+
+    return 0;
+}
+
+/* Require the device has the PCI Power Management capability
+ * and that a D3hot->D0 transition will result in a full
+ * internal reset, not just a soft reset.
+ */
+static unsigned /* 1 if a PM reset is usable, else 0 */
+pciDetectPowerManagementReset(pciDevice *dev)
+{
+    if (dev->pci_pm_cap_pos) { /* 0 means pciInitDevice() found no PM capability */
+        uint32_t ctl;
+
+        /* require the NO_SOFT_RESET bit is clear */
+        ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL);
+        if (!(ctl & PCI_PM_CTRL_NO_SOFT_RESET)) {
+            VIR_DEBUG("%s %s: detected PM reset capability", dev->id, dev->name);
+            return 1;
+        }
+    }
+
+    VIR_DEBUG("%s %s: no PM reset capability found", dev->id, dev->name);
+
+    return 0;
+}
+
+/* Predicate: is @a a different slot/function on the same domain/bus as @b ? */
+static int
+pciSharesBus(pciDevice *a, pciDevice *b)
+{
+    return
+        a->domain == b->domain &&
+        a->bus == b->bus &&
+        (a->slot != b->slot ||
+         a->function != b->function);
+}
+
+static int /* 1 if anything else shares @dev's bus, or if the scan failed; else 0 */
+pciBusContainsOtherDevices(virConnectPtr conn, pciDevice *dev)
+{
+    pciDevice *matched = NULL;
+    if (pciIterDevices(conn, pciSharesBus, dev, &matched) < 0)
+        return 1; /* on error, err on the side of "shared" */
+    if (!matched)
+        return 0;
+    pciFreeDevice(conn, matched); /* only its existence matters */
+    return 1;
+}
+
+/* Predicate: is @a another function of the same domain/bus/slot as @b ? */
+static int
+pciSharesDevice(pciDevice *a, pciDevice *b)
+{
+    return
+        a->domain == b->domain &&
+        a->bus == b->bus &&
+        a->slot == b->slot &&
+        a->function != b->function;
+}
+
+static int /* 1 if @dev's slot has other functions, or if the scan failed; else 0 */
+pciDeviceContainsOtherFunctions(virConnectPtr conn, pciDevice *dev)
+{
+    pciDevice *matched = NULL;
+    if (pciIterDevices(conn, pciSharesDevice, dev, &matched) < 0)
+        return 1; /* on error, err on the side of "shared" */
+    if (!matched)
+        return 0;
+    pciFreeDevice(conn, matched); /* only its existence matters */
+    return 1;
+}
+
+/* Is @a a bridge upstream of @b ? (bus range match: not only the direct parent) */
+static int
+pciIsParent(pciDevice *a, pciDevice *b)
+{
+    uint16_t device_class;
+    uint8_t header_type, secondary, subordinate;
+
+    if (a->domain != b->domain)
+        return 0;
+
+    /* Is its class code that of a PCI-to-PCI bridge? */
+    device_class = pciRead16(a, PCI_CLASS_DEVICE);
+    if (device_class != PCI_CLASS_BRIDGE_PCI)
+        return 0;
+
+    /* Does it use the bridge header layout that holds the bus numbers? */
+    header_type = pciRead8(a, PCI_HEADER_TYPE);
+    if ((header_type & PCI_HEADER_TYPE_MASK) != PCI_HEADER_TYPE_BRIDGE)
+        return 0;
+
+    secondary   = pciRead8(a, PCI_SECONDARY_BUS);
+    subordinate = pciRead8(a, PCI_SUBORDINATE_BUS);
+
+    if (b->bus < secondary || b->bus > subordinate)
+        return 0; /* @b's bus is not behind this bridge */
+    VIR_DEBUG("%s %s: found parent device %s", b->id, b->name, a->name);
+    return 1;
+}
+
+static pciDevice * /* first device matching pciIsParent(), or NULL; caller frees it */
+pciGetParentDevice(virConnectPtr conn, pciDevice *dev)
+{
+    pciDevice *parent = NULL;
+    pciIterDevices(conn, pciIsParent, dev, &parent); /* result ignored: parent is NULL on error */
+    return parent;
+}
+
+/* Secondary Bus Reset is our sledgehammer - it resets all
+ * devices behind a bus. Returns 0 if the reset was issued, -1 otherwise.
+ */
+static int
+pciTrySecondaryBusReset(virConnectPtr conn, pciDevice *dev)
+{
+    pciDevice *parent;
+    uint8_t config_space[PCI_CONF_LEN];
+    uint16_t ctl;
+    int ret = -1;
+
+    /* For now, we just refuse to do a secondary bus reset
+     * if there are other devices/functions behind the bus.
+     * In future, we could allow it so long as those devices
+     * are not in use by the host or other guests.
+     */
+    if (pciBusContainsOtherDevices(conn, dev)) {
+        VIR_WARN("Other devices on bus with %s, not doing bus reset",
+                 dev->name);
+        return -1;
+    }
+
+    /* Find the parent bridge */
+    parent = pciGetParentDevice(conn, dev);
+    if (!parent) {
+        VIR_WARN("Failed to find parent device for %s", dev->name);
+        return -1;
+    }
+
+    VIR_DEBUG("%s %s: doing a secondary bus reset", dev->id, dev->name);
+
+    /* Save and restore the device's config space; we only do this
+     * for the supplied device since we refuse to do a reset if there
+     * are multiple devices/functions
+     */
+    if (pciRead(dev, 0, config_space, PCI_CONF_LEN) < 0) {
+        VIR_WARN("Failed to save PCI config space for %s", dev->name);
+        goto out;
+    }
+
+    /* Read the bridge's control register, set the reset flag, wait
+     * 200ms, unset the reset flag and wait 200ms.
+     */
+    ctl = pciRead16(parent, PCI_BRIDGE_CONTROL); /* the register lives in the bridge, not in @dev */
+
+    pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl | PCI_BRIDGE_CTL_RESET);
+
+    usleep(200 * 1000); /* sleep 200ms */
+
+    pciWrite16(parent, PCI_BRIDGE_CONTROL, ctl);
+
+    usleep(200 * 1000); /* sleep 200ms */
+
+    if (pciWrite(dev, 0, config_space, PCI_CONF_LEN) < 0)
+        VIR_WARN("Failed to restore PCI config space for %s", dev->name);
+
+    ret = 0;
+out:
+    pciFreeDevice(conn, parent);
+    return ret;
+}
+
+/* Power management reset attempts to reset a device using a
+ * D-state transition from D3hot to D0. Note, pciDetectPowerManagementReset()
+ * above requires that the device supports a full internal reset.
+ */
+static int /* 0 if the transition was attempted, -1 if it was refused */
+pciTryPowerManagementReset(virConnectPtr conn, pciDevice *dev)
+{
+    uint8_t config_space[PCI_CONF_LEN];
+    uint32_t ctl;
+
+    if (!dev->pci_pm_cap_pos)
+        return -1;
+
+    /* For now, we just refuse to do a power management reset
+     * if there are other functions on this device.
+     * In future, we could allow it so long as those functions
+     * are not in use by the host or other guests.
+     */
+    if (pciDeviceContainsOtherFunctions(conn, dev)) {
+        VIR_WARN("%s contains other functions, not resetting", dev->name);
+        return -1;
+    }
+
+    /* Save and restore the device's config space. */
+    if (pciRead(dev, 0, &config_space[0], PCI_CONF_LEN) < 0) {
+        VIR_WARN("Failed to save PCI config space for %s", dev->name);
+        return -1;
+    }
+
+    VIR_DEBUG("%s %s: doing a power management reset", dev->id, dev->name);
+
+    ctl = pciRead32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL);
+    ctl &= ~PCI_PM_CTRL_STATE_MASK; /* clear the power state bits, keep the rest */
+
+    pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D3hot);
+
+    usleep(10 * 1000); /* sleep 10ms */
+
+    pciWrite32(dev, dev->pci_pm_cap_pos + PCI_PM_CTRL, ctl|PCI_PM_CTRL_STATE_D0);
+
+    usleep(10 * 1000); /* sleep 10ms */
+
+    if (pciWrite(dev, 0, &config_space[0], PCI_CONF_LEN) < 0)
+        VIR_WARN("Failed to restore PCI config space for %s", dev->name);
+
+    return 0; /* NOTE(review): failed register writes above are not detected */
+}
+
+static int /* 0 on success, -1 if the config space file can't be opened */
+pciInitDevice(virConnectPtr conn, pciDevice *dev)
+{
+    if (pciOpenConfig(dev) < 0) {
+        virReportSystemError(conn, errno,
+                             _("Failed to open config space file '%s'"),
+                             dev->path);
+        return -1;
+    }
+
+    dev->pcie_cap_pos   = pciFindCapabilityOffset(dev, PCI_CAP_ID_EXP);
+    dev->pci_pm_cap_pos = pciFindCapabilityOffset(dev, PCI_CAP_ID_PM);
+    dev->has_flr        = pciDetectFunctionLevelReset(dev);   /* reads pcie_cap_pos set above */
+    dev->has_pm_reset   = pciDetectPowerManagementReset(dev); /* reads pci_pm_cap_pos set above */
+    dev->initted        = 1; /* lets pciResetDevice() skip this on later calls */
+    return 0;
+}
+
+int /* 0 if a reset was done (or left to KVM via FLR), -1 with an error reported otherwise */
+pciResetDevice(virConnectPtr conn, pciDevice *dev)
+{
+    int ret = -1;
+
+    if (!dev->initted && pciInitDevice(conn, dev) < 0) /* probe capabilities on first use */
+        return -1;
+
+    /* KVM will perform FLR when starting and stopping
+     * a guest, so there is no need for us to do it here.
+     */
+    if (dev->has_flr)
+        return 0;
+
+    /* Bus reset is not an option with the root bus */
+    if (dev->bus != 0)
+        ret = pciTrySecondaryBusReset(conn, dev);
+
+    /* Next best option is a PCI power management reset */
+    if (ret < 0 && dev->has_pm_reset)
+        ret = pciTryPowerManagementReset(conn, dev);
+
+    if (ret < 0) /* also reached when a reset method exists but was refused */
+        pciReportError(conn, VIR_ERR_NO_SUPPORT,
+                       _("No PCI reset capability available for %s"),
+                       dev->name);
+    return ret;
+}
+
+static int /* 0 on success (even if the stub is unavailable), -1 if a sysfs write fails */
+pciBindDeviceToStub(virConnectPtr conn, pciDevice *dev, const char *stub_module)
+{
+    char stub_dir[PATH_MAX]; /* sysfs directory of the stub driver */
+    char path[PATH_MAX];     /* scratch buffer, reused for each sysfs file below */
+
+    snprintf(stub_dir, sizeof(stub_dir), PCI_SYSFS "drivers/%s", stub_module);
+
+    /* Try loading the stub module if it isn't already loaded;
+     * Do not return an error if the stub module is not available.
+     */
+    if (!virFileExists(stub_dir)) {
+        const char *const modprobeargv[] = { MODPROBE, stub_module, NULL };
+
+        if (virRun(conn, modprobeargv, NULL) < 0) {
+            char ebuf[1024];
+            VIR_WARN(_("modprobe %s failed: %s"), stub_module,
+                     virStrerror(errno, ebuf, sizeof ebuf)); /* NOTE(review): confirm virRun() leaves a useful errno */
+        }
+    }
+
+    if (!virFileExists(stub_dir)) {
+        VIR_WARN(_("%s module not available, cannot bind device %s to it"),
+                 stub_module, dev->name);
+    } else {
+        /* Add the PCI device ID to the stub's dynamic ID table;
+         * this is needed to allow us to bind the device to the stub.
+         * Note: if the device is not currently bound to any driver,
+         * stub will immediately be bound to the device. Also, note
+         * that if a new device with this ID is hotplugged, or if a probe
+         * is triggered for such a device, it will also be immediately
+         * bound by the stub.
+         */
+        snprintf(path, sizeof(path), "%s/new_id", stub_dir);
+        if (virFileWriteStr(path, dev->id) < 0) {
+            virReportSystemError(conn, errno,
+                                 _("Failed to add PCI device ID '%s' to %s"),
+                                 dev->id, stub_module);
+            return -1;
+        }
+    }
+
+    /* If the device is already bound to a driver, unbind it.
+     * Note, this will have rather unpleasant side effects if this
+     * PCI device happens to be IDE controller for the disk hosting
+     * your root filesystem.
+     */
+    snprintf(path, sizeof(path),
+             PCI_SYSFS "devices/%s/driver/unbind", dev->name);
+    if (virFileExists(path) && virFileWriteStr(path, dev->name) < 0) {
+        virReportSystemError(conn, errno,
+                             _("Failed to unbind PCI device '%s'"), dev->name);
+        return -1;
+    }
+
+    if (virFileExists(stub_dir)) {
+        /* If the device isn't already bound to pci-stub, try binding it now.
+         */
+        snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/driver", dev->name);
+        if (!virFileLinkPointsTo(path, stub_dir)) {
+            snprintf(path, sizeof(path), "%s/bind", stub_dir);
+            if (virFileWriteStr(path, dev->name) < 0) {
+                virReportSystemError(conn, errno,
+                                     _("Failed to bind PCI device '%s' to %s"),
+                                     dev->name, stub_module);
+                return -1;
+            }
+        }
+
+        /* If 'remove_id' exists, remove the device id from pci-stub's dynamic
+         * ID table so that 'drivers_probe' works in pciUnBindDeviceFromStub().
+         */
+        snprintf(path, sizeof(path), "%s/remove_id", stub_dir);
+        if (virFileExists(path) && virFileWriteStr(path, dev->id) < 0) {
+            virReportSystemError(conn, errno,
+                                 _("Failed to remove PCI ID '%s' from %s"),
+                                 dev->id, stub_module);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int /* result of pciBindDeviceToStub(): 0 on success, -1 on error */
+pciDettachDevice(virConnectPtr conn, pciDevice *dev)
+{
+    return pciBindDeviceToStub(conn, dev, "pci-stub"); /* the stub driver is always pci-stub */
+}
+
+static int /* 0 on success, -1 (error reported) if a sysfs write fails */
+pciUnBindDeviceFromStub(virConnectPtr conn, pciDevice *dev, const char *stub_module)
+{
+    char stub_dir[PATH_MAX]; /* sysfs directory of the stub driver */
+    char path[PATH_MAX];     /* scratch buffer, reused for each sysfs file below */
+
+    snprintf(stub_dir, sizeof(stub_dir), PCI_SYSFS "drivers/%s", stub_module);
+
+    /* If the device is bound to stub, unbind it.
+     */
+    snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/driver", dev->name);
+    if (virFileExists(stub_dir) && virFileLinkPointsTo(path, stub_dir)) {
+        snprintf(path, sizeof(path), "%s/unbind", stub_dir);
+        if (virFileWriteStr(path, dev->name) < 0) {
+            virReportSystemError(conn, errno,
+                                 _("Failed to unbind PCI device '%s' from %s"),
+                                 dev->name, stub_module);
+            return -1;
+        }
+    }
+
+    /* Trigger a re-probe if the device is not in the stub's dynamic
+     * ID table. If the stub is available, but 'remove_id' isn't
+     * available, then re-probing would just cause the device to be
+     * re-bound to the stub.
+     */
+    snprintf(path, sizeof(path), "%s/remove_id", stub_dir);
+    if (!virFileExists(stub_dir) || virFileExists(path)) {
+        if (virFileWriteStr(PCI_SYSFS "drivers_probe", dev->name) < 0) {
+            virReportSystemError(conn, errno,
+                                 _("Failed to trigger a re-probe for PCI device '%s'"),
+                                 dev->name);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int /* result of pciUnBindDeviceFromStub(): 0 on success, -1 on error */
+pciReAttachDevice(virConnectPtr conn, pciDevice *dev)
+{
+    return pciUnBindDeviceFromStub(conn, dev, "pci-stub"); /* the stub driver is always pci-stub */
+}
+
+static char * /* "0xNNNN" string the caller must VIR_FREE(), or NULL on failure */
+pciReadDeviceID(pciDevice *dev, const char *id_name)
+{
+    char path[PATH_MAX];
+    char *id_str = NULL; /* keeps VIR_FREE() below safe if the read stores nothing */
+
+    snprintf(path, sizeof(path), PCI_SYSFS "devices/%s/%s",
+             dev->name, id_name);
+
+    /* ID string is '0xNNNN\n' ... i.e. 7 bytes */
+    if (virFileReadAll(path, 7, &id_str) < 7) {
+        VIR_FREE(id_str);
+        return NULL;
+    }
+
+    /* Check for 0x prefix */
+    if (id_str[0] != '0' || id_str[1] != 'x') {
+        VIR_FREE(id_str);
+        return NULL;
+    }
+
+    /* Chop off the newline; we know the string is 7 bytes */
+    id_str[6] = '\0';
+
+    return id_str;
+}
+
+pciDevice * /* new device the caller frees with pciFreeDevice(); NULL (error reported) on failure */
+pciGetDevice(virConnectPtr conn,
+             unsigned domain,
+             unsigned bus,
+             unsigned slot,
+             unsigned function)
+{
+    pciDevice *dev;
+    char *vendor, *product;
+
+    if (VIR_ALLOC(dev) < 0) {
+        virReportOOMError(conn);
+        return NULL;
+    }
+
+    dev->domain   = domain;
+    dev->bus      = bus;
+    dev->slot     = slot;
+    dev->function = function;
+
+    snprintf(dev->name, sizeof(dev->name), "%.4x:%.2x:%.2x.%.1x",
+             dev->domain, dev->bus, dev->slot, dev->function);
+    snprintf(dev->path, sizeof(dev->path),
+             PCI_SYSFS "devices/%s/config", dev->name); /* opened later by pciOpenConfig() */
+
+    vendor  = pciReadDeviceID(dev, "vendor");
+    product = pciReadDeviceID(dev, "device");
+
+    if (!vendor || !product) {
+        pciReportError(conn, VIR_ERR_NO_SUPPORT,
+                       _("Failed to read product/vendor ID for %s"),
+                       dev->name);
+        VIR_FREE(product);
+        VIR_FREE(vendor);
+        pciFreeDevice(conn, dev);
+        return NULL;
+    }
+
+    /* strings contain '0x' prefix */
+    snprintf(dev->id, sizeof(dev->id), "%s %s", &vendor[2], &product[2]); /* "vendor product" */
+
+    VIR_FREE(product);
+    VIR_FREE(vendor);
+
+    VIR_DEBUG("%s %s: initialized", dev->id, dev->name);
+
+    return dev;
+}
+
+void /* closes the config space fd if one was opened, then frees @dev (must not be NULL) */
+pciFreeDevice(virConnectPtr conn ATTRIBUTE_UNUSED, pciDevice *dev)
+{
+    VIR_DEBUG("%s %s: freeing", dev->id, dev->name);
+    if (dev->fd > 0) /* same "is open" test as pciOpenConfig() */
+        close(dev->fd);
+    VIR_FREE(dev);
+}
diff --git a/src/pci.h b/src/pci.h
new file mode 100644
index 0000000..47882ef
--- /dev/null
+++ b/src/pci.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+ *
+ * Authors:
+ *     Mark McLoughlin <markmc@xxxxxxxxxx>
+ */
+
+#ifndef __VIR_PCI_H__
+#define __VIR_PCI_H__
+
+#include <config.h>
+#include "internal.h"
+
+typedef struct _pciDevice pciDevice;
+
+pciDevice *pciGetDevice      (virConnectPtr  conn,
+                              unsigned       domain,
+                              unsigned       bus,
+                              unsigned       slot,
+                              unsigned       function);
+void       pciFreeDevice     (virConnectPtr  conn,
+                              pciDevice     *dev);
+int        pciDettachDevice  (virConnectPtr  conn,
+                              pciDevice     *dev);
+int        pciReAttachDevice (virConnectPtr  conn,
+                              pciDevice     *dev);
+int        pciResetDevice    (virConnectPtr  conn,
+                              pciDevice     *dev);
+
+#endif /* __VIR_PCI_H__ */
-- 
1.6.0.6

--
Libvir-list mailing list
Libvir-list@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/libvir-list

[Index of Archives]     [Virt Tools]     [Libvirt Users]     [Lib OS Info]     [Fedora Users]     [Fedora Desktop]     [Fedora SELinux]     [Big List of Linux Books]     [Yosemite News]     [KDE Users]     [Fedora Tools]