The patch adds a new IOCTL command, VFIO_EEH_INFO, to the VFIO container to support EEH functionality for PCI devices that have been passed through from host to guest via VFIO. Signed-off-by: Gavin Shan <gwshan@xxxxxxxxxxxxxxxxxx> --- arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/eeh-vfio.c | 593 ++++++++++++++++++++++++++++++ drivers/vfio/vfio_iommu_spapr_tce.c | 12 + include/uapi/linux/vfio.h | 57 +++ 4 files changed, 663 insertions(+) create mode 100644 arch/powerpc/platforms/powernv/eeh-vfio.c diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 63cebb9..2b15a03 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -6,5 +6,6 @@ obj-y += opal-msglog.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o +obj-$(CONFIG_VFIO_EEH) += eeh-vfio.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-vfio.c b/arch/powerpc/platforms/powernv/eeh-vfio.c new file mode 100644 index 0000000..69d5f2d --- /dev/null +++ b/arch/powerpc/platforms/powernv/eeh-vfio.c @@ -0,0 +1,593 @@ +/* + * The file intends to support EEH functionality for those PCI devices, + * which have been passed through from host to guest via VFIO. So this + * file is naturally part of VFIO implementation on PowerNV platform. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/init.h> +#include <linux/io.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/msi.h> +#include <linux/pci.h> +#include <linux/string.h> +#include <linux/vfio.h> + +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/iommu.h> +#include <asm/opal.h> +#include <asm/msi_bitmap.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/tce.h> +#include <asm/uaccess.h> + +#include "powernv.h" +#include "pci.h" + +static int powernv_eeh_vfio_map(struct vfio_eeh_info *info) +{ + struct pci_bus *bus, *pe_bus; + struct pci_dev *pdev; + struct eeh_dev *edev; + struct eeh_pe *pe; + int domain, bus_no, devfn; + + /* Host address */ + domain = info->map.host_domain; + bus_no = (info->map.host_cfg_addr >> 8) & 0xff; + devfn = info->map.host_cfg_addr & 0xff; + + /* Find PCI bus */ + bus = pci_find_bus(domain, bus_no); + if (!bus) { + pr_warn("%s: PCI bus %04x:%02x not found\n", + __func__, domain, bus_no); + return -ENODEV; + } + + /* Find PCI device */ + pdev = pci_get_slot(bus, devfn); + if (!pdev) { + pr_warn("%s: PCI device %04x:%02x:%02x.%01x not found\n", + __func__, domain, bus_no, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + return -ENODEV; + } + + /* No EEH device - almost impossible */ + edev = pci_dev_to_eeh_dev(pdev); + if (unlikely(!edev)) { + pci_dev_put(pdev); + pr_warn("%s: No EEH dev for PCI device %s\n", + __func__, pci_name(pdev)); + return -ENODEV; + } + + /* Doesn't support PE migration between different PHBs */ + pe = edev->pe; + if (!eeh_pe_passed(pe)) { + pe_bus = eeh_pe_bus_get(pe); + BUG_ON(!pe_bus); + + /* PE# has format 00BBSS00 */ + pe->guest_addr.buid = info->map.guest_buid; + pe->guest_addr.pe_addr = pe_bus->number << 16; + eeh_pe_set_passed(pe, true); + } else if (pe->guest_addr.buid != info->map.guest_buid) { + pci_dev_put(pdev); + pr_warn("%s: Mismatched PHB BUID (0x%llx, 0x%llx)\n", + __func__, pe->guest_addr.buid, 
info->map.guest_buid); + return -EINVAL; + } + + edev->guest_addr.buid = info->map.guest_buid; + edev->guest_addr.config_addr = info->map.guest_cfg_addr; + eeh_dev_set_passed(edev, true); + + pr_debug("EEH: Host PCI dev %s to %llx-%02x:%02x.%01x\n", + pci_name(pdev), info->map.guest_buid, + (info->map.guest_cfg_addr >> 8) & 0xFF, + PCI_SLOT(info->map.guest_cfg_addr & 0xFF), + PCI_FUNC(info->map.guest_cfg_addr & 0xFF)); + + pci_dev_put(pdev); + return 0; +} + +static int powernv_eeh_vfio_unmap(struct vfio_eeh_info *info) +{ + struct eeh_vfio_pci_addr addr; + struct pci_dev *pdev; + struct eeh_dev *edev, *tmp; + struct eeh_pe *pe; + bool passed; + + /* Get EEH device */ + addr.buid = info->unmap.buid; + addr.config_addr = info->unmap.cfg_addr; + edev = eeh_vfio_dev_get(&addr); + if (!edev) { + pr_warn("%s: Cannot find %llx:%02x:%02x.%01x\n", + __func__, info->unmap.buid, + (info->unmap.cfg_addr >> 8) & 0xFF, + PCI_SLOT(info->unmap.cfg_addr & 0xFF), + PCI_FUNC(info->unmap.cfg_addr & 0xFF)); + return -ENODEV; + } + + /* Return EEH device */ + memset(&edev->guest_addr, 0, sizeof(edev->guest_addr)); + eeh_dev_set_passed(edev, false); + pdev = eeh_dev_to_pci_dev(edev); + pr_debug("EEH: Host PCI dev %s returned\n", + pdev ? 
pci_name(pdev) : "NULL"); + + /* Return PE if no EEH device is owned by guest */ + pe = edev->pe; + passed = false; + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev && pdev->subordinate) + continue; + + if (eeh_dev_passed(edev)) { + passed = true; + break; + } + } + + if (!passed) { + memset(&pe->guest_addr, 0, sizeof(pe->guest_addr)); + eeh_pe_set_passed(pe, false); + pr_debug("EEH: PHB#%x-PE#%x returned to host\n", + pe->phb->global_number, pe->addr); + } + + return 0; +} + +static int powernv_eeh_vfio_set_option(struct vfio_eeh_info *info) +{ + struct pnv_phb *phb; + struct eeh_dev *edev; + struct eeh_pe *pe; + struct eeh_vfio_pci_addr addr; + int opcode = info->option.option; + int ret = 0; + + /* Check opcode */ + if (opcode < EEH_OPT_DISABLE || opcode > EEH_OPT_THAW_DMA) { + pr_warn("%s: opcode %d out of range (%d, %d)\n", + __func__, opcode, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA); + ret = 3; + goto out; + } + + /* Option "enable" uses PCI config address */ + if (opcode == EEH_OPT_ENABLE) { + addr.buid = info->option.buid; + addr.config_addr = (info->option.addr >> 8) & 0xFFFF; + edev = eeh_vfio_dev_get(&addr); + if (!edev) { + pr_warn("%s: Cannot find %llx:%02x:%02x.%01x\n", + __func__, addr.buid, + (addr.config_addr >> 8) & 0xFF, + PCI_SLOT(addr.config_addr & 0xFF), + PCI_FUNC(addr.config_addr & 0xFF)); + ret = 7; + goto out; + } + phb = edev->phb->private_data; + } else { + addr.buid = info->option.buid; + addr.pe_addr = info->option.addr; + pe = eeh_vfio_pe_get(&addr); + if (!pe) { + pr_warn("%s: Cannot find PE %llx:%x\n", + __func__, addr.buid, addr.pe_addr); + ret = 7; + goto out; + } + phb = pe->phb->private_data; + } + + /* Ensure that the EEH stuff has been initialized */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) { + pr_warn("%s: EEH disabled on PHB#%d\n", + __func__, phb->hose->global_number); + ret = 7; + goto out; + } + + /* + * The EEH functionality has been enabled on all PEs + * by default. So just return success. 
The same situation + * would be applied while we disable EEH functionality. + * However, the guest isn't expected to disable that + * at all. + */ + if (opcode == EEH_OPT_DISABLE || + opcode == EEH_OPT_ENABLE) { + ret = 0; + goto out; + } + + /* + * Call into the IODA dependent backend in order + * to enable DMA or MMIO for the indicated PE. + */ + if (phb->eeh_ops && phb->eeh_ops->set_option) { + if (phb->eeh_ops->set_option(pe, opcode)) { + pr_warn("%s: Failure from backend\n", + __func__); + ret = 1; + } + } else { + pr_warn("%s: Unsupported request\n", + __func__); + ret = 7; + } + +out: + return ret; +} + +static int powernv_eeh_vfio_get_addr(struct vfio_eeh_info *info) +{ + struct pnv_phb *phb; + struct eeh_dev *edev; + struct eeh_vfio_pci_addr addr; + int opcode = info->addr.option; + int ret = 0; + + /* Check opcode */ + if (opcode != 0 && opcode != 1) { + pr_warn("%s: opcode %d out of range (0, 1)\n", + __func__, opcode); + ret = 3; + goto out; + } + + /* Find EEH device */ + addr.buid = info->addr.buid; + addr.config_addr = (info->addr.cfg_addr >> 8 ) & 0xFFFF; + edev = eeh_vfio_dev_get(&addr); + if (!edev) { + pr_warn("%s: Cannot find %llx:%02x:%02x.%01x\n", + __func__, addr.buid, + (addr.config_addr >> 8) & 0xFF, + PCI_SLOT(addr.config_addr & 0xFF), + PCI_FUNC(addr.config_addr & 0xFF)); + ret = 7; + goto out; + } + phb = edev->phb->private_data; + + /* EEH enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) { + pr_warn("%s: EEH disabled on PHB#%d\n", + __func__, phb->hose->global_number); + ret = 3; + goto out; + } + + /* EEH device passed ? */ + if (!eeh_dev_passed(edev)) { + pr_warn("%s: EEH dev %llx:%02x:%02x.%01x owned by host\n", + __func__, addr.buid, + (addr.config_addr >> 8) & 0xFF, + PCI_SLOT(addr.config_addr & 0xFF), + PCI_FUNC(addr.config_addr & 0xFF)); + ret = 3; + goto out; + } + + /* + * Fill result according to opcode. We don't differentiate + * PCI bus and device sensitive PE here. 
+ */ + if (opcode == 0) + info->addr.ret = edev->pe->guest_addr.pe_addr; + else + info->addr.ret = 1; +out: + return ret; +} + +static int powernv_eeh_vfio_get_state(struct vfio_eeh_info *info) +{ + struct pnv_phb *phb; + struct eeh_pe *pe; + struct eeh_vfio_pci_addr addr; + int result, ret = 0; + + /* Locate the PE */ + addr.buid = info->state.buid; + addr.pe_addr = info->state.pe_addr; + pe = eeh_vfio_pe_get(&addr); + if (!pe) { + pr_warn("%s: Cannot locate %llx:%x\n", + __func__, addr.buid, addr.pe_addr); + ret = 3; + goto out; + } + phb = pe->phb->private_data; + + /* EEH enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) { + pr_warn("%s: EEH disabled on PHB#%d\n", + __func__, phb->hose->global_number); + ret = 3; + goto out; + } + + /* Call to the IOC dependent function */ + if (phb->eeh_ops && phb->eeh_ops->get_state) { + result = phb->eeh_ops->get_state(pe); + + if (!(result & EEH_STATE_RESET_ACTIVE) && + (result & EEH_STATE_DMA_ENABLED) && + (result & EEH_STATE_MMIO_ENABLED)) + info->state.state = 0; + else if (result & EEH_STATE_RESET_ACTIVE) + info->state.state = 1; + else if (!(result & EEH_STATE_RESET_ACTIVE) && + !(result & EEH_STATE_DMA_ENABLED) && + !(result & EEH_STATE_MMIO_ENABLED)) + info->state.state = 2; + else if (!(result & EEH_STATE_RESET_ACTIVE) && + (result & EEH_STATE_DMA_ENABLED) && + !(result & EEH_STATE_MMIO_ENABLED)) + info->state.state = 4; + else + info->state.state = 5; + + ret = 0; + } else { + pr_warn("%s: Unsupported request\n", __func__); + ret = 3; + } + +out: + return ret; +} + +static int powernv_eeh_vfio_pe_reset(struct vfio_eeh_info *info) +{ + struct pnv_phb *phb; + struct eeh_pe *pe; + struct eeh_vfio_pci_addr addr; + int opcode = info->reset.option; + int ret = 0; + + /* Check opcode */ + if (opcode != EEH_RESET_DEACTIVATE && + opcode != EEH_RESET_HOT && + opcode != EEH_RESET_FUNDAMENTAL) { + pr_warn("%s: Unsupported opcode %d\n", + __func__, opcode); + ret = 3; + goto out; + } + + /* Locate the PE */ + addr.buid = 
info->reset.buid; + addr.pe_addr = info->reset.pe_addr; + pe = eeh_vfio_pe_get(&addr); + if (!pe) { + pr_warn("%s: Cannot locate %llx:%x\n", + __func__, addr.buid, addr.pe_addr); + ret = 3; + goto out; + } + phb = pe->phb->private_data; + + /* EEH enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) { + pr_warn("%s: EEH disabled on PHB#%d\n", + __func__, phb->hose->global_number); + ret = 3; + goto out; + } + + /* Call into the IODA dependent backend to do the reset */ + if (!phb->eeh_ops || + !phb->eeh_ops->set_option || + !phb->eeh_ops->reset) { + pr_warn("%s: Unsupported request\n", + __func__); + ret = 7; + } else { + /* + * The frozen PE might be caused by the mechanism called + * PAPR error injection, which is supposed to be one-shot + * without "sticky" bit as being stated by the spec. But + * the reality isn't that, at least on P7IOC. So we have + * to clear that to avoid recursive error, which fails the + * recovery eventually. + */ + if (opcode == EEH_RESET_DEACTIVATE) + opal_pci_reset(phb->opal_id, + OPAL_PHB_ERROR, + OPAL_ASSERT_RESET); + + if (phb->eeh_ops->reset(pe, opcode)) { + pr_warn("%s: Failure from backend\n", __func__); + ret = 1; + goto out; + } + + /* + * The PE is still in frozen state and we need to clear that. + * It's good to clear frozen state after deassert to avoid + * messy IO access during reset, which might cause recursive + * frozen PE. 
+ */ + if (opcode == EEH_RESET_DEACTIVATE) { + if (phb->eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO) || + phb->eeh_ops->set_option(pe, EEH_OPT_THAW_DMA)) { + pr_warn("%s: Cannot clear frozen state\n", + __func__); + ret = 1; + } + + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + } + } + +out: + return ret; +} + +static int powernv_eeh_vfio_pe_config(struct vfio_eeh_info *info) +{ + struct pnv_phb *phb; + struct eeh_pe *pe; + struct eeh_vfio_pci_addr addr; + int ret = 0; + + /* Locate the PE */ + addr.buid = info->config.buid; + addr.pe_addr = info->config.pe_addr; + pe = eeh_vfio_pe_get(&addr); + if (!pe) { + pr_warn("%s: Cannot locate %llx:%x\n", + __func__, addr.buid, addr.pe_addr); + ret = 3; + goto out; + } + phb = pe->phb->private_data; + + /* EEH enabled ? */ + if (!(phb->flags & PNV_PHB_FLAG_EEH)) { + pr_warn("%s: EEH disabled on PHB#%d\n", + __func__, phb->hose->global_number); + ret = 3; + goto out; + } + + /* + * The access to PCI config space on VFIO device has some + * limitations. Part of PCI config space, including BAR + * registers are not readable and writable. So the guest + * should have stale values for those registers and we have + * to restore them in host side. 
+ */ + eeh_pe_restore_bars(pe); +out: + return ret; +} + +void eeh_vfio_release(struct iommu_table *tbl) +{ + struct pnv_ioda_pe *pnv_pe = container_of(tbl, struct pnv_ioda_pe, + tce32_table); + struct pnv_phb *phb = pnv_pe->phb; + struct eeh_pe *phb_pe, *pe; + struct eeh_dev dev, *edev, *tmp; + + /* Find PHB PE */ + phb_pe = eeh_phb_pe_get(phb->hose); + if (unlikely(!phb_pe)) { + pr_warn("%s: Cannot find PHB#%d PE\n", + __func__, phb->hose->global_number); + return; + } + + /* Find PE */ + memset(&dev, 0, sizeof(struct eeh_dev)); + dev.phb = phb->hose; + dev.pe_config_addr = pnv_pe->pe_number; + pe = eeh_pe_get(&dev); + if (unlikely(!pe)) { + pr_warn("%s: Cannot find PE instance for PHB#%d-PE#%d\n", + __func__, phb->hose->global_number, + pnv_pe->pe_number); + return; + } + + /* Release it to host */ + if (!eeh_pe_passed(pe)) + return; + + eeh_pe_for_each_dev(pe, edev, tmp) { + if (!eeh_dev_passed(edev)) + continue; + + memset(&edev->guest_addr, 0, sizeof(edev->guest_addr)); + eeh_dev_set_passed(edev, false); + } + + memset(&pe->guest_addr, 0, sizeof(pe->guest_addr)); + eeh_pe_set_passed(pe, false); +} +EXPORT_SYMBOL(eeh_vfio_release); + +int eeh_vfio_ioctl(unsigned long arg) +{ + struct vfio_eeh_info info; + int ret = -EINVAL; + + /* Copy over user argument */ + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) { + pr_warn("%s: Cannot copy user argument 0x%lx\n", + __func__, arg); + return -EFAULT; + } + + /* Sanity check */ + if (info.argsz != sizeof(info)) { + pr_warn("%s: Invalid argument size (%d, %ld)\n", + __func__, info.argsz, sizeof(info)); + return -EINVAL; + } + + /* Route according to operation */ + switch (info.op) { + case VFIO_EEH_OP_MAP: + ret = powernv_eeh_vfio_map(&info); + break; + case VFIO_EEH_OP_UNMAP: + ret = powernv_eeh_vfio_unmap(&info); + break; + case VFIO_EEH_OP_SET_OPTION: + ret = powernv_eeh_vfio_set_option(&info); + break; + case VFIO_EEH_OP_GET_ADDR: + ret = powernv_eeh_vfio_get_addr(&info); + break; + case 
VFIO_EEH_OP_GET_STATE: + ret = powernv_eeh_vfio_get_state(&info); + break; + case VFIO_EEH_OP_PE_RESET: + ret = powernv_eeh_vfio_pe_reset(&info); + break; + case VFIO_EEH_OP_PE_CONFIG: + ret = powernv_eeh_vfio_pe_config(&info); + break; + default: + pr_info("%s: Cannot handle op#%d\n", + __func__, info.op); + } + + /* Copy data back */ + if (!ret && copy_to_user((void __user *)arg, &info, sizeof(info))) { + pr_warn("%s: Cannot copy to user 0x%lx\n", + __func__, arg); + return -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(eeh_vfio_ioctl); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index a84788b..c45dece 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -26,6 +26,11 @@ #define DRIVER_AUTHOR "aik@xxxxxxxxx" #define DRIVER_DESC "VFIO IOMMU SPAPR TCE" +#ifdef CONFIG_VFIO_EEH +extern void eeh_vfio_release(struct iommu_table *tbl); +extern int eeh_vfio_ioctl(unsigned long arg); +#endif + static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); @@ -283,6 +288,10 @@ static long tce_iommu_ioctl(void *iommu_data, tce_iommu_disable(container); mutex_unlock(&container->lock); return 0; +#ifdef CONFIG_VFIO_EEH + case VFIO_EEH_INFO: + return eeh_vfio_ioctl(arg); +#endif } return -ENOTTY; @@ -342,6 +351,9 @@ static void tce_iommu_detach_group(void *iommu_data, /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ container->tbl = NULL; +#ifdef CONFIG_VFIO_EEH + eeh_vfio_release(tbl); +#endif iommu_release_ownership(tbl); } mutex_unlock(&container->lock); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index cb9023d..1fd1bfb 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -455,6 +455,63 @@ struct vfio_iommu_spapr_tce_info { #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) +/* + * The VFIO EEH info struct provides way to support EEH functionality 
+ * for PCI device that is passed from host to guest via VFIO. + */ +#define VFIO_EEH_OP_MAP 0 +#define VFIO_EEH_OP_UNMAP 1 +#define VFIO_EEH_OP_SET_OPTION 2 +#define VFIO_EEH_OP_GET_ADDR 3 +#define VFIO_EEH_OP_GET_STATE 4 +#define VFIO_EEH_OP_PE_RESET 5 +#define VFIO_EEH_OP_PE_CONFIG 6 + +struct vfio_eeh_info { + __u32 argsz; + __u32 op; + + union { + struct vfio_eeh_map { + __u32 host_domain; + __u16 host_cfg_addr; + __u64 guest_buid; + __u16 guest_cfg_addr; + } map; + struct vfio_eeh_unmap { + __u64 buid; + __u16 cfg_addr; + } unmap; + struct vfio_eeh_set_option { + __u64 buid; + __u32 addr; + __u32 option; + } option; + struct vfio_eeh_pe_addr { + __u64 buid; + __u32 cfg_addr; + __u32 option; + __u32 ret; + } addr; + struct vfio_eeh_state { + __u64 buid; + __u32 pe_addr; + __u32 state; + } state; + struct vfio_eeh_reset { + __u64 buid; + __u32 pe_addr; + __u32 option; + } reset; + struct vfio_eeh_config { + __u64 buid; + __u32 pe_addr; + } config; + }; +}; + +#define VFIO_EEH_INFO _IO(VFIO_TYPE, VFIO_BASE + 21) + /* ***************************************************************** */ #endif /* _UAPIVFIO_H */ -- 1.8.3.2 -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html