[PATCH] device-assignment: Use PCI I/O port sysfs resource file when available

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



When supported by the host kernel, we can use read/write on the
PCI sysfs resource file for I/O port regions.  This allows us to
avoid raw in/out instructions and works with deprivileged guests
launched via libvirt.  For uid 0 callers, we still use in/out
directly to avoid any compatibility issues.

Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
---

 Required kernel patch pending here:
 http://www.spinics.net/lists/linux-pci/msg09389.html

 hw/device-assignment.c |  131 ++++++++++++++++++++++++++++++++++++------------
 hw/device-assignment.h |    1 
 2 files changed, 99 insertions(+), 33 deletions(-)

diff --git a/hw/device-assignment.c b/hw/device-assignment.c
index 2bba22f..37c1278 100644
--- a/hw/device-assignment.c
+++ b/hw/device-assignment.c
@@ -67,6 +67,28 @@ static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr)
     return region->u.r_baseport + (addr - region->e_physbase);
 }
 
+static int assigned_dev_ioport_rw(AssignedDevRegion *dev_region,
+                                  uint32_t addr, int len, uint32_t *val,
+                                  int write)
+{
+    if (dev_region->region->resource_fd == -1)
+        return -1;
+
+    if (write) {
+        if (pwrite(dev_region->region->resource_fd, val, len,
+                  (addr - dev_region->e_physbase)) != len) {
+            return -1;
+        }
+    } else {
+        if (pread(dev_region->region->resource_fd, val, len,
+                  (addr - dev_region->e_physbase)) != len) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
                                        uint32_t value)
 {
@@ -77,7 +99,9 @@ static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr,
 	  r_pio, (int)r_access->e_physbase,
 	  (unsigned long)r_access->u.r_baseport, value);
 
-    outb(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 1, &value, 1) != 0) {
+        outb(value, r_pio);
+    }
 }
 
 static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
@@ -90,7 +114,9 @@ static void assigned_dev_ioport_writew(void *opaque, uint32_t addr,
           r_pio, (int)r_access->e_physbase,
 	  (unsigned long)r_access->u.r_baseport, value);
 
-    outw(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 2, &value, 1) != 0) {
+        outw(value, r_pio);
+    }
 }
 
 static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
@@ -103,7 +129,9 @@ static void assigned_dev_ioport_writel(void *opaque, uint32_t addr,
 	  r_pio, (int)r_access->e_physbase,
           (unsigned long)r_access->u.r_baseport, value);
 
-    outl(value, r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 4, &value, 1) != 0) {
+        outl(value, r_pio);
+    }
 }
 
 static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
@@ -112,7 +140,9 @@ static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inb(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 1, &value, 0) != 0) {
+        value = inb(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -127,7 +157,9 @@ static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inw(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 2, &value, 0) != 0) {
+        value = inw(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -142,7 +174,9 @@ static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr)
     uint32_t r_pio = guest_to_host_ioport(r_access, addr);
     uint32_t value;
 
-    value = inl(r_pio);
+    if (assigned_dev_ioport_rw(r_access, addr, 4, &value, 0) != 0) {
+        value = inl(r_pio);
+    }
 
     DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n",
           r_pio, (int)r_access->e_physbase,
@@ -305,7 +339,7 @@ static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num,
     DEBUG("e_phys=0x%" FMT_PCIBUS " r_baseport=%x type=0x%x len=%" FMT_PCIBUS " region_num=%d \n",
           addr, region->u.r_baseport, type, size, region_num);
 
-    if (first_map) {
+    if (first_map && region->region->resource_fd < 0) {
 	struct ioperm_data *data;
 
 	data = qemu_mallocz(sizeof(struct ioperm_data));
@@ -586,19 +620,46 @@ static int assigned_dev_register_regions(PCIRegion *io_regions,
                              slow_map ? assigned_dev_iomem_map_slow
                                       : assigned_dev_iomem_map);
             continue;
+        } else {
+            /* handle port io regions */
+            uint32_t val;
+            int ret;
+
+            /* Test kernel support for ioport resource read/write.  Old
+             * kernels return EIO.  New kernels only allow 1/2/4 byte reads
+             * so should return EINVAL for a 3 byte read */
+            ret = pread(pci_dev->v_addrs[i].region->resource_fd, &val, 3, 0);
+            if (ret == 3) {
+                fprintf(stderr, "I/O port resource supports 3 byte read?!\n");
+                abort();
+            } else if (errno == EIO) {
+                fprintf(stderr,
+                        "pcisysfs does not support rw ioport resource\n");
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            } else if (errno != EINVAL) {
+                fprintf(stderr, "Unexpected return from ioport pread (%d) %s\n",
+                        errno, strerror(errno));
+                abort();
+            }
+
+            /* Root user can use direct access for compatibility */
+            if (getuid() == 0) {
+                close(pci_dev->v_addrs[i].region->resource_fd);
+                pci_dev->v_addrs[i].region->resource_fd = -1;
+            }
+            pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+            pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
+            pci_dev->v_addrs[i].r_size = cur_region->size;
+            pci_dev->v_addrs[i].e_size = 0;
+
+            pci_register_bar((PCIDevice *) pci_dev, i,
+                             cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
+                             assigned_dev_ioport_map);
+
+            /* not relevant for port io */
+            pci_dev->v_addrs[i].memory_index = 0;
         }
-        /* handle port io regions */
-        pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
-        pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr;
-        pci_dev->v_addrs[i].r_size = cur_region->size;
-        pci_dev->v_addrs[i].e_size = 0;
-
-        pci_register_bar((PCIDevice *) pci_dev, i,
-                         cur_region->size, PCI_BASE_ADDRESS_SPACE_IO,
-                         assigned_dev_ioport_map);
-
-        /* not relevant for port io */
-        pci_dev->v_addrs[i].memory_index = 0;
     }
 
     /* success */
@@ -705,20 +766,22 @@ again:
             continue;
         if (flags & IORESOURCE_MEM) {
             flags &= ~IORESOURCE_IO;
-            if (r != PCI_ROM_SLOT) {
-                snprintf(name, sizeof(name), "%sresource%d", dir, r);
-                fd = open(name, O_RDWR);
-                if (fd == -1)
-                    continue;
-                rp->resource_fd = fd;
-            }
-        } else
+        } else {
             flags &= ~IORESOURCE_PREFETCH;
+        }
+        if (r != PCI_ROM_SLOT) {
+            snprintf(name, sizeof(name), "%sresource%d", dir, r);
+            fd = open(name, O_RDWR);
+            if (fd == -1)
+                continue;
+            rp->resource_fd = fd;
+        }
 
         rp->type = flags;
         rp->valid = 1;
         rp->base_addr = start;
         rp->size = size;
+        pci_dev->v_addrs[r].region = rp;
         DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n",
               r, rp->size, start, rp->type, rp->resource_fd);
     }
@@ -780,8 +843,10 @@ static void free_assigned_device(AssignedDevice *dev)
                 continue;
 
             if (pci_region->type & IORESOURCE_IO) {
-                kvm_remove_ioperm_data(region->u.r_baseport, region->r_size);
-                continue;
+                if (pci_region->resource_fd < 0) {
+                    kvm_remove_ioperm_data(region->u.r_baseport,
+                                           region->r_size);
+                }
             } else if (pci_region->type & IORESOURCE_MEM) {
                 if (region->u.r_virtbase) {
                     if (region->memory_index) {
@@ -795,11 +860,11 @@ static void free_assigned_device(AssignedDevice *dev)
                         fprintf(stderr,
 				"Failed to unmap assigned device region: %s\n",
 				strerror(errno));
-                    if (pci_region->resource_fd >= 0) {
-                        close(pci_region->resource_fd);
-                    }
                 }
-	    }
+            }
+            if (pci_region->resource_fd >= 0) {
+                close(pci_region->resource_fd);
+            }
         }
 
         if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX)
diff --git a/hw/device-assignment.h b/hw/device-assignment.h
index 4e7fe87..9a3ea12 100644
--- a/hw/device-assignment.h
+++ b/hw/device-assignment.h
@@ -71,6 +71,7 @@ typedef struct {
     int num;            /* our index within v_addrs[] */
     pcibus_t e_size;    /* emulated size of region in bytes */
     pcibus_t r_size;    /* real size of region in bytes */
+    PCIRegion *region;
 } AssignedDevRegion;
 
 typedef struct AssignedDevice {

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux