KVM device-assignment implementation doesn't provide any mechanism to
report PCI errors (related to the assigned device) to the guest VM.
Similarly, events like suspend and resume aren't reported. This is a
limitation to achieving high availability in a system where VMs are
controlling devices directly.

From previous discussion, it's understood that VFIO is a great solution
for KVM device assignment and ideally this work should be part of it.
Unfortunately, a solution is needed until it gets more mature.

The first step at reporting events and errors all the way up to the guest
kernel is to provide a mechanism for the host kernel to notify userspace.

These patches propose a solution based on pci-stub and UIO. Other
solutions exist, but this one was chosen for its simplicity and
compatibility with the current model.

All comments are welcome.
Warning: Minimal testing.

thanks,
-Etienne

Signed-off-by: Etienne Martineau <etmartin@xxxxxxxxx>
---
 drivers/uio/Kconfig          |   11 ++
 drivers/uio/Makefile         |    1 +
 drivers/uio/uio_pci_stub.c   |  359 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/Kbuild         |    1 +
 include/linux/uio_pci_stub.h |   31 ++++
 5 files changed, 403 insertions(+), 0 deletions(-)
 create mode 100644 drivers/uio/uio_pci_stub.c
 create mode 100644 include/linux/uio_pci_stub.h

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index bb44079..e4af9d4 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -94,4 +94,15 @@ config UIO_NETX
 	  To compile this driver as a module, choose M here; the module
 	  will be called uio_netx.
 
+config UIO_PCI_STUB
+	tristate "Simple stub driver with AER capabilities"
+	depends on PCI
+	help
+	  Say Y or M here if you want to be able to reserve a PCI device
+	  when it is going to be assigned to a guest operating system.
+	  Also, this driver gives you the option to notify the guest
+	  operating system in case the device reports a PCI error.
+
+	  When in doubt, say N.
+
 endif
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index 18fd818..c1eeedc 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UIO_AEC) += uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
 obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
 obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
+obj-$(CONFIG_UIO_PCI_STUB)	+= uio_pci_stub.o
diff --git a/drivers/uio/uio_pci_stub.c b/drivers/uio/uio_pci_stub.c
new file mode 100644
index 0000000..18fadcb
--- /dev/null
+++ b/drivers/uio/uio_pci_stub.c
@@ -0,0 +1,420 @@
+/*
+ * uio_pci_stub.c - Simple stub driver with AER capabilities
+ *
+ * Copyright (C) 2010 Cisco Systems
+ * Author: Etienne Martineau <etmartin@xxxxxxxxx>
+ *
+ * Based on drivers/pci/pci-stub.c by Chris Wright,
+ * Copyright (C) 2008 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Usage is simple: the driver does not declare any device ids, so you must
+ * allocate an id and bind the device to the driver yourself.  For example:
+ *
+ * # echo "8086 10f5" > /sys/bus/pci/drivers/uio_pci_stub/new_id
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/e1000e/unbind
+ * # echo -n 0000:00:19.0 > /sys/bus/pci/drivers/uio_pci_stub/bind
+ * # ls -l /sys/bus/pci/devices/0000:00:19.0/driver
+ * .../0000:00:19.0/driver -> ../../../bus/pci/drivers/uio_pci_stub
+ *
+ * uio_pci_stub is equivalent to pci-stub when no extra parameter is
+ * given to the module at load time. 'aer=1' will turn on PCIe AER error
+ * reporting.
+ *
+ * NOTE: There is no support for suspend and resume, and the current
+ * implementation is not based on eventfd.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/uio_driver.h>
+#include <linux/uio_pci_stub.h>
+
+static bool debug;
+static bool aer;
+static char ids[1024] __initdata;
+
+#define DRIVER_VERSION	"0.01"
+#define DRIVER_AUTHOR	"Etienne Martineau <etmartin@xxxxxxxxx>"
+#define DRIVER_DESC	"Simple stub driver with AER capabilities"
+
+/* How long notify_user() waits for a reply from userspace (arbitrary) */
+#define AER_REPLY_TIMEOUT_MS	50
+
+MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the stub driver, format is "
+		 "\"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\""
+		 " and multiple comma separated entries can be specified");
+module_param_string(ids, ids, sizeof(ids), 0);
+MODULE_PARM_DESC(debug, "Debugging mode enabled or not");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(aer, "AER error reporting enabled or not");
+module_param(aer, bool, 0644);
+
+/* Debug trace, prefixed with the calling function, gated on 'debug' */
+#define DPRINTK(fmt, args...)						\
+	do {								\
+		if (debug)						\
+			printk(KERN_DEBUG "%s: " fmt, __func__ , ## args); \
+	} while (0)
+
+/**
+ * struct uio_pci_stub_priv - per-device state of the reply handshake
+ * @sync:	set to -1 when a reply is requested; both the userspace reply
+ *		and the kernel timeout path increment it, and whoever brings it
+ *		to zero owns the exchange
+ * @result:	last result posted by userspace, already translated
+ * @sem:	posted by uio_pci_stub_control(), waited on by notify_user()
+ * @name:	storage backing uio_info->name
+ */
+struct uio_pci_stub_priv {
+	atomic_t sync;
+	pci_ers_result_t result;
+	struct semaphore sem;
+	char name[UIO_MAX_NAME_SIZE];
+};
+
+/*
+ * For every pci error handler invoked, userspace is notified. It has
+ * access to the pci error code through 'logical' BAR0.
+ *
+ * After each notification, the kernel waits for userspace to provide
+ * the pci error result. Upon timeout, the kernel takes the default action.
+ *
+ * Most if not all UIO drivers use 'value' to control the state of
+ * an interrupt in the interrupt controller. Here, 'value' transports
+ * the pci error result (one of enum pci_error_result).
+ *
+ * Returns 0 when the result was accepted, -EINVAL when 'value' is not a
+ * known result and -EPIPE when no notification is awaiting a reply.
+ */
+static int uio_pci_stub_control(struct uio_info *info, s32 value)
+{
+	struct uio_pci_stub_priv *priv = info->priv;
+	pci_ers_result_t pci_result;
+
+	/* Sanity check and translation to the kernel's encoding */
+	switch (value) {
+	case RESULT_NONE:
+		pci_result = PCI_ERS_RESULT_NONE;
+		break;
+	case RESULT_CAN_RECOVER:
+		pci_result = PCI_ERS_RESULT_CAN_RECOVER;
+		break;
+	case RESULT_NEED_RESET:
+		pci_result = PCI_ERS_RESULT_NEED_RESET;
+		break;
+	case RESULT_DISCONNECT:
+		pci_result = PCI_ERS_RESULT_DISCONNECT;
+		break;
+	case RESULT_RECOVERED:
+		pci_result = PCI_ERS_RESULT_RECOVERED;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Only the side that brings 'sync' from -1 to 0 may post a result */
+	if (atomic_inc_and_test(&priv->sync)) {
+		priv->result = pci_result;
+		up(&priv->sem);
+		return 0;
+	}
+
+	/* Userspace is out of sync */
+	return -EPIPE;
+}
+
+/*
+ * Allocate one zeroed page and describe it as logical memory region 'n'
+ * of 'info'.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int logical_bar_setup(struct uio_info *info, int n)
+{
+	void *ptr;
+
+	/* Zeroed, since the page content is visible to userspace */
+	ptr = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	/* UIO_MEM_LOGICAL wants the kernel virtual address */
+	info->mem[n].addr = (unsigned long)ptr;
+	info->mem[n].size = PAGE_SIZE;
+	info->mem[n].memtype = UIO_MEM_LOGICAL;
+	info->mem[n].internal_addr = ptr;
+	return 0;
+}
+
+/* Free the page allocated by logical_bar_setup() for region 'n', if any */
+static void logical_bar_release(struct uio_info *info, int n)
+{
+	if (info->mem[n].internal_addr)
+		free_page((unsigned long)info->mem[n].internal_addr);
+}
+
+/*
+ * Bind to 'dev': allocate the uio_info and private state, set up logical
+ * BAR0 and register with UIO.  Everything is undone on failure.
+ */
+static int __devinit probe(struct pci_dev *dev,
+			   const struct pci_device_id *id)
+{
+	struct uio_pci_stub_priv *priv;
+	struct uio_info *info;
+	int ret;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto err_free_info;
+	}
+
+	ret = logical_bar_setup(info, 0);
+	if (ret)
+		goto err_free_priv;
+
+	/* Use the device's own ids: 'id' may contain PCI_ANY_ID wildcards */
+	snprintf(priv->name, sizeof(priv->name),
+		 FORMAT_UIO_DEV_NAME(dev->bus->number, PCI_SLOT(dev->devfn),
+				     PCI_FUNC(dev->devfn),
+				     dev->vendor, dev->device));
+
+	sema_init(&priv->sem, 0);
+	atomic_set(&priv->sync, 0);
+
+	info->priv = priv;
+	info->name = priv->name;
+	info->version = DRIVER_VERSION;
+	info->irq = UIO_IRQ_CUSTOM;
+	info->irqcontrol = uio_pci_stub_control;
+
+	pci_set_drvdata(dev, info);
+
+	ret = uio_register_device(&dev->dev, info);
+	if (ret)
+		goto err_clear_drvdata;
+
+	dev_printk(KERN_INFO, &dev->dev, "claimed by uio_pci_stub\n");
+	return 0;
+
+err_clear_drvdata:
+	pci_set_drvdata(dev, NULL);
+	logical_bar_release(info, 0);
+err_free_priv:
+	kfree(priv);
+err_free_info:
+	kfree(info);
+	return ret;
+}
+
+/* Undo probe(): unregister from UIO and free everything probe() allocated */
+static void remove(struct pci_dev *dev)
+{
+	struct uio_info *info = pci_get_drvdata(dev);
+
+	uio_unregister_device(info);
+	pci_set_drvdata(dev, NULL);
+	logical_bar_release(info, 0);
+	kfree(info->priv);
+	kfree(info);
+}
+
+/* ------------------ PCI Error Recovery infrastructure -------------- */
+
+/*
+ * Publish 'err_code' in logical BAR0, signal the UIO event and, except for
+ * RESUME, wait up to AER_REPLY_TIMEOUT_MS for userspace to post a result
+ * through uio_pci_stub_control().
+ *
+ * Returns the result posted by userspace, or PCI_ERS_RESULT_NONE when no
+ * reply is expected or none arrived in time.
+ */
+static pci_ers_result_t notify_user(enum pci_error_code err_code,
+				    struct pci_dev *pdev)
+{
+	struct uio_info *info = pci_get_drvdata(pdev);
+	struct uio_pci_stub_priv *priv = info->priv;
+	struct uio_pci_stub_logical_bar *bar = info->mem[0].internal_addr;
+
+	DPRINTK("AER error code %d\n", err_code);
+
+	if (err_code == RESUME) {
+		/* No reply expected */
+		bar->err_code = err_code;
+		uio_event_notify(info);
+		return PCI_ERS_RESULT_NONE;
+	}
+
+	/* Arm the handshake before userspace can possibly answer */
+	atomic_set(&priv->sync, -1);
+	bar->err_code = err_code;
+	uio_event_notify(info);
+
+	/* Wait till userspace posts on the semaphore */
+	if (!down_timeout(&priv->sem,
+			  msecs_to_jiffies(AER_REPLY_TIMEOUT_MS))) {
+		DPRINTK("AER result code %d\n", priv->result);
+		return priv->result;
+	}
+
+	/*
+	 * Timed out.  If our increment does not bring 'sync' to zero,
+	 * userspace claimed the exchange first and posts on the semaphore;
+	 * consume that post so it cannot satisfy a later notification.
+	 */
+	if (!atomic_inc_and_test(&priv->sync))
+		down(&priv->sem);
+
+	printk(KERN_INFO "AER userspace not responding\n");
+	return PCI_ERS_RESULT_NONE;
+}
+
+/**
+ * error_detected - called when PCI error is detected.
+ * @pdev: Pointer to PCI device
+ * @state: The current pci connection state
+ */
+static pci_ers_result_t error_detected(struct pci_dev *pdev,
+				       pci_channel_state_t state)
+{
+	return notify_user(ERROR_DETECTED, pdev);
+}
+
+/**
+ * mmio_enabled - MMIO has been re-enabled, but not DMA
+ * @pdev: Pointer to PCI device
+ */
+static pci_ers_result_t mmio_enabled(struct pci_dev *pdev)
+{
+	return notify_user(MMIO_ENABLED, pdev);
+}
+
+/**
+ * link_reset - PCI Express link has been reset
+ * @pdev: Pointer to PCI device
+ */
+static pci_ers_result_t link_reset(struct pci_dev *pdev)
+{
+	return notify_user(LINK_RESET, pdev);
+}
+
+/**
+ * slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * The event is forwarded to userspace; this driver does not touch the card.
+ */
+static pci_ers_result_t slot_reset(struct pci_dev *pdev)
+{
+	return notify_user(SLOT_RESET, pdev);
+}
+
+/**
+ * resume - resume normal operations
+ * @pdev: Pointer to PCI device
+ *
+ * Called once an error recovery sequence has been completed.  Userspace
+ * is notified but no reply is waited for.
+ */
+static void resume(struct pci_dev *pdev)
+{
+	notify_user(RESUME, pdev);
+}
+
+/* Only installed in 'driver' by init() when the module is loaded with aer=1 */
+static struct pci_error_handlers err_handler = {
+	.error_detected = error_detected,
+	.mmio_enabled = mmio_enabled,
+	.link_reset = link_reset,
+	.slot_reset = slot_reset,
+	.resume = resume,
+};
+
+static struct pci_driver driver = {
+	.name = "uio_pci_stub",
+	.id_table = NULL,	/* only dynamic id's */
+	.probe = probe,
+	.remove = remove,
+};
+
+/*
+ * Register the PCI driver, hooking up the error handlers when 'aer' is set,
+ * then add the dynamic ids given through the 'ids' module parameter.
+ * Malformed or rejected ids are reported and skipped.
+ */
+static int __init init(void)
+{
+	char *p, *id;
+	int rc;
+
+	pr_info(DRIVER_DESC " %s" " version: " DRIVER_VERSION "\n",
+		aer ? "Turned on" : "Turned off");
+
+	if (aer)
+		driver.err_handler = &err_handler;
+
+	rc = pci_register_driver(&driver);
+	if (rc)
+		return rc;
+
+	/* no ids passed actually */
+	if (ids[0] == '\0')
+		return 0;
+
+	/* add ids specified in the module parameter */
+	p = ids;
+	while ((id = strsep(&p, ","))) {
+		unsigned int vendor, device, subvendor = PCI_ANY_ID,
+			subdevice = PCI_ANY_ID, class = 0, class_mask = 0;
+		int fields;
+
+		/* skip empty entries such as a trailing comma */
+		if (!strlen(id))
+			continue;
+
+		fields = sscanf(id, "%x:%x:%x:%x:%x:%x",
+				&vendor, &device, &subvendor, &subdevice,
+				&class, &class_mask);
+
+		if (fields < 2) {
+			printk(KERN_WARNING
+			       "uio_pci_stub: invalid id string \"%s\"\n", id);
+			continue;
+		}
+
+		printk(KERN_INFO
+		       "uio_pci_stub: add %04X:%04X sub=%04X:%04X cls=%08X/%08X\n",
+		       vendor, device, subvendor, subdevice, class, class_mask);
+
+		rc = pci_add_dynid(&driver, vendor, device,
+				   subvendor, subdevice, class, class_mask, 0);
+		if (rc)
+			printk(KERN_WARNING
+			       "uio_pci_stub: failed to add dynamic id (%d)\n", rc);
+	}
+
+	return 0;
+}
+
+/* Module unload: unregister the PCI driver */
+static void __exit cleanup(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+module_init(init);
+module_exit(cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 831c463..045a5de 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -362,6 +362,7 @@ header-y += udf_fs_i.h
 header-y += udp.h
 header-y += uinput.h
 header-y += uio.h
+header-y += uio_pci_stub.h
 header-y += ultrasound.h
 header-y += un.h
 header-y += unistd.h
diff --git a/include/linux/uio_pci_stub.h b/include/linux/uio_pci_stub.h
new file mode 100644
index 0000000..873c407
--- /dev/null
+++ b/include/linux/uio_pci_stub.h
@@ -0,0 +1,38 @@
+#ifndef __LINUX_UIO_PCI_STUB_H
+#define __LINUX_UIO_PCI_STUB_H
+
+#ifndef UIO_MAX_NAME_SIZE
+#define UIO_MAX_NAME_SIZE 64
+#endif
+
+/*
+ * printf-style format and arguments naming a uio_pci_stub UIO device:
+ * "bus:slot.function vendor:device", all in hexadecimal.
+ */
+#define FORMAT_UIO_DEV_NAME(busnr,dev,fcn,vendorid,deviceid)\
+	"%x:%x.%x %x:%x",busnr,dev,fcn,vendorid,deviceid
+
+/* Recovery step being reported; read by userspace from logical BAR0 */
+enum pci_error_code{
+	ERROR_DETECTED,
+	MMIO_ENABLED,
+	LINK_RESET,
+	SLOT_RESET,
+	RESUME,
+};
+
+/* Reply to a recovery step, handed to the driver's UIO irqcontrol hook */
+enum pci_error_result{
+	RESULT_NONE,
+	RESULT_CAN_RECOVER,
+	RESULT_NEED_RESET,
+	RESULT_DISCONNECT,
+	RESULT_RECOVERED,
+};
+
+/* Layout of logical BAR0 (memory region 0 of the UIO device) */
+struct uio_pci_stub_logical_bar {
+	enum pci_error_code err_code;
+};
+
+#endif
--
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html