On 22.05.14 10:23, Gavin Shan wrote:
The patch adds new IOCTL commands for VFIO PCI device to support
EEH functionality for PCI devices, which have been passed through
from host to somebody else via VFIO.
Signed-off-by: Gavin Shan <gwshan@xxxxxxxxxxxxxxxxxx>
This already looks a *lot* more sane than the previous versions. We're
slowly getting there I think ;).
Ben, could you please check through the exported EEH interface itself
and verify whether it does all the lockings correctly, uses the API
properly and doesn't allow for a user space program to blow up the system?
---
Documentation/vfio.txt | 88 ++++++++++-
arch/powerpc/include/asm/eeh.h | 17 +++
arch/powerpc/kernel/eeh.c | 321 +++++++++++++++++++++++++++++++++++++++++
drivers/vfio/pci/vfio_pci.c | 131 ++++++++++++++++-
include/uapi/linux/vfio.h | 53 +++++++
5 files changed, 603 insertions(+), 7 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..dd13db6 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
an excellent performance which has limitations such as inability to do
locked pages accounting in real time.
-So 3 additional ioctls have been added:
+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
+which works on the basis of additional ioctl commands.
+
+So 8 additional ioctls have been added:
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,6 +319,20 @@ So 3 additional ioctls have been added:
VFIO_IOMMU_DISABLE - disables the container.
+ VFIO_EEH_PE_SET_OPTION - enables or disables EEH functionality on the
functionality
+ specified device. Also, it can be used to remove IO or DMA
+ stopped state on the frozen PE.
+
+ VFIO_EEH_PE_GET_ADDR - retrieve the unique address of the specified
+ PE or query PE sharing mode.
What is PE?
+
+ VFIO_EEH_PE_GET_STATE - retrieve PE's state: frozen or normal state.
+
+ VFIO_EEH_PE_RESET - do PE reset, which is one of the major steps for
+ error recovery.
+
+ VFIO_EEH_PE_CONFIGURE - configure the PCI bridges after PE reset. It's
+ one of the major steps for error recovery.
The code flow from the example above should be slightly changed:
@@ -346,6 +363,75 @@ The code flow from the example above should be slightly changed:
ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
.....
+Based on the initial example we have, the following piece of code could be
+a reference for EEH setup and error handling:
+
+ struct vfio_eeh_pe_set_option option = { .argsz = sizeof(option) };
+ struct vfio_eeh_pe_get_addr addr = { .argsz = sizeof(addr) };
+ struct vfio_eeh_pe_get_state state = { .argsz = sizeof(state) };
+ struct vfio_eeh_pe_reset reset = { .argsz = sizeof(reset) };
+ struct vfio_eeh_pe_configure config = { .argsz = sizeof(config) };
+
+ ....
+
+ /* Get a file descriptor for the device */
+ device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
+
+ /* Enable the EEH functionality on the device */
+ option.option = EEH_OPT_ENABLE;
+ ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
+
+ /* Retrieve PE address and create and maintain PE by yourself */
+ addr.option = EEH_OPT_GET_PE_ADDR;
+ ioctl(device, VFIO_EEH_PE_GET_ADDR, &addr);
+
+ /* Assure EEH is supported on the PE and make PE functional */
+ ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
+
+ /* Save device's state. pci_save_state() would be good enough
+ * as an example.
+ */
+
+ /* Test and setup the device */
+ ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
+
+ ....
+
+ /* When 0xFF is returned from reading PCI config space or IO BARs
+ * of the PCI device, check the PE state to see if it has been
+ * frozen.
+ */
+ ioctl(device, VFIO_EEH_PE_GET_STATE, &state);
+
+ /* Waiting for pending PCI transactions to be completed and don't
+ * produce any more PCI traffic from/to the affected PE until
+ * recovery is finished.
+ */
+
+ /* Enable IO for the affected PE and collect logs. Usually, the
+ * standard part of PCI config space, AER registers are dumped
+ * as logs for further analysis.
+ */
+ option.option = EEH_OPT_THAW_MMIO;
+ ioctl(device, VFIO_EEH_PE_SET_OPTION, &option);
+
+ /* Issue PE reset */
+ reset.option = EEH_RESET_HOT;
+ ioctl(device, VFIO_EEH_PE_RESET, &reset);
+
+ /* Configure the PCI bridges for the affected PE */
+ ioctl(device, VFIO_EEH_PE_CONFIGURE, NULL);
+
+ /* Restore the state we saved at initialization time. pci_restore_state()
+ * is good enough as an example.
+ */
+
+ /* Hopefully, error is recovered successfully. Now, you can resume to
+ * start PCI traffic to/from the affected PE.
+ */
+
+ ....
+
-------------------------------------------------------------------------------
[1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..dd5f1cf 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -191,6 +191,11 @@ enum {
#define EEH_OPT_ENABLE 1 /* EEH enable */
#define EEH_OPT_THAW_MMIO 2 /* MMIO enable */
#define EEH_OPT_THAW_DMA 3 /* DMA enable */
+#define EEH_OPT_GET_PE_ADDR 0 /* Get PE addr */
+#define EEH_OPT_GET_PE_MODE 1 /* Get PE mode */
+#define EEH_PE_MODE_NONE 0 /* Invalid mode */
+#define EEH_PE_MODE_NOT_SHARED 1 /* Not shared */
+#define EEH_PE_MODE_SHARED 2 /* Shared mode */
#define EEH_STATE_UNAVAILABLE (1 << 0) /* State unavailable */
#define EEH_STATE_NOT_SUPPORT (1 << 1) /* EEH not supported */
#define EEH_STATE_RESET_ACTIVE (1 << 2) /* Active reset */
@@ -198,6 +203,11 @@ enum {
#define EEH_STATE_DMA_ACTIVE (1 << 4) /* Active DMA */
#define EEH_STATE_MMIO_ENABLED (1 << 5) /* MMIO enabled */
#define EEH_STATE_DMA_ENABLED (1 << 6) /* DMA enabled */
+#define EEH_PE_STATE_NORMAL 0 /* Normal state */
+#define EEH_PE_STATE_RESET 1 /* PE reset */
+#define EEH_PE_STATE_STOPPED_IO_DMA 2 /* Stopped */
+#define EEH_PE_STATE_STOPPED_DMA 4 /* Stopped DMA */
+#define EEH_PE_STATE_UNAVAIL 5 /* Unavailable */
#define EEH_RESET_DEACTIVATE 0 /* Deactivate the PE reset */
#define EEH_RESET_HOT 1 /* Hot reset */
#define EEH_RESET_FUNDAMENTAL 3 /* Fundamental reset */
@@ -305,6 +315,13 @@ void eeh_add_device_late(struct pci_dev *);
void eeh_add_device_tree_late(struct pci_bus *);
void eeh_add_sysfs_files(struct pci_bus *);
void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+int eeh_pe_set_option(struct pci_dev *pdev, int option);
+int eeh_pe_get_addr(struct pci_dev *pdev, int option);
+int eeh_pe_get_state(struct pci_dev *pdev);
+int eeh_pe_reset(struct pci_dev *pdev, int option);
+int eeh_pe_configure(struct pci_dev *pdev);
/**
* EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..b90a474 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -108,6 +108,9 @@ struct eeh_ops *eeh_ops = NULL;
/* Lock to avoid races due to multiple reports of an error */
DEFINE_RAW_SPINLOCK(confirm_error_lock);
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
/* Buffer for reporting pci register dumps. Its here in BSS, and
* not dynamically alloced, so that it ends up in RMO where RTAS
* can access it.
@@ -1098,6 +1101,324 @@ void eeh_remove_device(struct pci_dev *dev)
edev->mode &= ~EEH_DEV_SYSFS;
}
+/**
+ * eeh_dev_open - Mark EEH device and PE as passed through
+ * @pdev: PCI device
+ *
+ * Mark the indicated EEH device and PE as passed through.
+ * In the result, the EEH errors detected on the PE won't be
+ * reported. The owner of the device will be responsible for
+ * detection and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+ struct eeh_dev *edev;
+
+ mutex_lock(&eeh_dev_mutex);
+
+ /* No PCI device ? */
+ if (!pdev) {
+ mutex_unlock(&eeh_dev_mutex);
+ return -ENODEV;
+ }
+
+ /* No EEH device ? */
+ edev = pci_dev_to_eeh_dev(pdev);
+ if (!edev || !edev->pe) {
+ mutex_unlock(&eeh_dev_mutex);
+ return -ENODEV;
+ }
+
+ eeh_dev_set_passed(edev, true);
+ eeh_pe_set_passed(edev->pe, true);
+ mutex_unlock(&eeh_dev_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Reclaim the ownership of EEH device
+ * @pdev: PCI device
+ *
+ * Reclaim ownership of EEH device, potentially the corresponding
+ * PE. In the result, the EEH errors detected on the PE will be
+ * reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+ bool release_pe = true;
+ struct eeh_pe *pe = NULL;
+ struct eeh_dev *tmp, *edev;
+
+ mutex_lock(&eeh_dev_mutex);
+
+ /* No PCI device ? */
+ if (!pdev) {
+ mutex_unlock(&eeh_dev_mutex);
+ return;
+ }
+
+ /* No EEH device ? */
+ edev = pci_dev_to_eeh_dev(pdev);
+ if (!edev || !eeh_dev_passed(edev) ||
+ !edev->pe || !eeh_pe_passed(edev->pe)) {
+ mutex_unlock(&eeh_dev_mutex);
+ return;
+ }
+
+ /* Release device */
+ pe = edev->pe;
+ eeh_dev_set_passed(edev, false);
+
+ /* Release the PE only when no device on it is passed any more */
+ eeh_pe_for_each_dev(pe, edev, tmp) {
+ if (eeh_dev_passed(edev)) {
+ release_pe = false;
+ break;
+ }
+ }
+
+ if (release_pe)
+ eeh_pe_set_passed(pe, false);
+
+ mutex_unlock(&eeh_dev_mutex);
+}
+EXPORT_SYMBOL_GPL(eeh_dev_release);
+
+static int eeh_dev_check(struct pci_dev *pdev,
+ struct eeh_dev **pedev,
+ struct eeh_pe **ppe)
+{
+ struct eeh_dev *edev;
+
+ /* No device ? */
+ if (!pdev)
+ return -ENODEV;
+
+ edev = pci_dev_to_eeh_dev(pdev);
+ if (!edev || !eeh_dev_passed(edev) ||
+ !edev->pe || !eeh_pe_passed(edev->pe))
+ return -ENODEV;
+
+ if (pedev)
+ *pedev = edev;
+ if (ppe)
+ *ppe = edev->pe;
+
+ return 0;
+}
+
+/**
+ * eeh_pe_set_option - Set options for the indicated PE
+ * @pdev: PCI device
+ * @option: requested option
+ *
+ * The routine is called to enable or disable EEH functionality
+ * on the indicated PE, to enable IO or DMA for the frozen PE.
+ */
+int eeh_pe_set_option(struct pci_dev *pdev, int option)
+{
+ struct eeh_dev *edev;
+ struct eeh_pe *pe;
+ int ret = 0;
+
+ /* Device existing ? */
+ ret = eeh_dev_check(pdev, &edev, &pe);
+ if (ret)
+ return ret;
+
+ switch (option) {
+ case EEH_OPT_DISABLE:
+ case EEH_OPT_ENABLE:
This deserves a comment
+ break;
+ case EEH_OPT_THAW_MMIO:
+ case EEH_OPT_THAW_DMA:
+ if (!eeh_ops || !eeh_ops->set_option) {
+ ret = -ENOENT;
+ break;
+ }
+
+ ret = eeh_ops->set_option(pe, option);
+ break;
+ default:
+ pr_debug("%s: Option %d out of range (%d, %d)\n",
+ __func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_set_option);
+
+/**
+ * eeh_pe_get_addr - Retrieve the PE address or sharing mode
+ * @pdev: PCI device
+ * @option: option
+ *
+ * Retrieve the PE address or sharing mode.
+ */
+int eeh_pe_get_addr(struct pci_dev *pdev, int option)
+{
+ struct pci_bus *bus;
+ struct eeh_dev *edev;
+ struct eeh_pe *pe;
+ int ret = 0;
+
+ /* Device existing ? */
+ ret = eeh_dev_check(pdev, &edev, &pe);
+ if (ret)
Probably safer to write this as ret < 0. Positive return values are
success now.
+ return ret;
+
+ switch (option) {
+ case EEH_OPT_GET_PE_ADDR:
+ bus = eeh_pe_bus_get(pe);
+ if (!bus) {
+ ret = -ENODEV;
+ break;
+ }
+
+ /* PE address has format "00BBSS00" */
+ ret = bus->number << 16;
+ break;
+ case EEH_OPT_GET_PE_MODE:
+ /* We always have shared PE */
Wa?
+ ret = EEH_PE_MODE_SHARED;
+ break;
+ default:
+ pr_debug("%s: option %d out of range (0, 1)\n",
+ __func__, option);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_get_addr);
+
+/**
+ * eeh_pe_get_state - Retrieve PE's state
+ * @pdev: PCI device
+ *
+ * Retrieve the PE's state, which includes 3 aspects: enabled
+ * DMA, enabled IO and asserted reset.
+ */
+int eeh_pe_get_state(struct pci_dev *pdev)
+{
+ struct eeh_dev *edev;
+ struct eeh_pe *pe;
+ int result, ret = 0;
+
+ /* Device existing ? */
+ ret = eeh_dev_check(pdev, &edev, &pe);
+ if (ret)
+ return ret;
+
+ if (!eeh_ops || !eeh_ops->get_state)
+ return -ENOENT;
+
+ result = eeh_ops->get_state(pe, NULL);
+ if (!(result & EEH_STATE_RESET_ACTIVE) &&
+ (result & EEH_STATE_DMA_ENABLED) &&
+ (result & EEH_STATE_MMIO_ENABLED))
+ ret = EEH_PE_STATE_NORMAL;
+ else if (result & EEH_STATE_RESET_ACTIVE)
+ ret = EEH_PE_STATE_RESET;
+ else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+ !(result & EEH_STATE_DMA_ENABLED) &&
+ !(result & EEH_STATE_MMIO_ENABLED))
+ ret = EEH_PE_STATE_STOPPED_IO_DMA;
+ else if (!(result & EEH_STATE_RESET_ACTIVE) &&
+ (result & EEH_STATE_DMA_ENABLED) &&
+ !(result & EEH_STATE_MMIO_ENABLED))
+ ret = EEH_PE_STATE_STOPPED_DMA;
+ else
+ ret = EEH_PE_STATE_UNAVAIL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_get_state);
+
+/**
+ * eeh_pe_reset - Issue PE reset according to specified type
+ * @pdev: PCI device
+ * @option: reset type
+ *
+ * The routine is called to reset the specified PE with the
+ * indicated type, either fundamental reset or hot reset.
+ * PE reset is the most important part for error recovery.
+ */
+int eeh_pe_reset(struct pci_dev *pdev, int option)
+{
+ struct eeh_dev *edev;
+ struct eeh_pe *pe;
+ int ret = 0;
+
+ /* Device existing ? */
+ ret = eeh_dev_check(pdev, &edev, &pe);
+ if (ret)
+ return ret;
+
+ if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset)
+ return -ENOENT;
+
+ switch (option) {
+ case EEH_RESET_DEACTIVATE:
+ ret = eeh_ops->reset(pe, option);
+ if (ret)
+ break;
+
+ /*
+ * The PE is still in frozen state and we need to clear
to
+ * that. It's good to clear frozen state after deassert
+ * to avoid messy IO access during reset, which might
+ * cause recursive frozen PE.
+ */
+ ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+ if (!ret)
+ ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+ if (!ret)
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+ break;
+ case EEH_RESET_HOT:
+ case EEH_RESET_FUNDAMENTAL:
+ ret = eeh_ops->reset(pe, option);
+ break;
+ default:
+ pr_debug("%s: Unsupported option %d\n",
+ __func__, option);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_reset);
+
+/**
+ * eeh_pe_configure - Configure PCI bridges after PE reset
+ * @pdev: PCI device
+ *
+ * The routine is called to restore the PCI config space for
+ * those PCI devices, especially PCI bridges affected by PE
+ * reset issued previously.
So would it make sense to combine this with the reset call?
+ */
+int eeh_pe_configure(struct pci_dev *pdev)
+{
+ struct eeh_dev *edev;
+ struct eeh_pe *pe;
+ int ret = 0;
+
+ /* Device existing ? */
+ ret = eeh_dev_check(pdev, &edev, &pe);
+ if (ret)
+ return ret;
+
+ /* Restore config space for the affected devices */
+ eeh_pe_restore_bars(pe);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_pe_configure);
+
static int proc_eeh_show(struct seq_file *m, void *v)
{
if (!eeh_enabled()) {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..301ac18 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -25,6 +25,9 @@
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
+#ifdef CONFIG_EEH
+#include <asm/eeh.h>
+#endif
#include "vfio_pci_private.h"
@@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
pci_restore_state(pdev);
}
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+#ifdef CONFIG_EEH
+ eeh_dev_release(pdev);
+#endif
+}
+
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
- if (atomic_dec_and_test(&vdev->refcnt))
+ if (atomic_dec_and_test(&vdev->refcnt)) {
+ vfio_eeh_pci_release(vdev->pdev);
vfio_pci_disable(vdev);
+ }
module_put(THIS_MODULE);
}
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+ int ret = 0;
+
+#ifdef CONFIG_EEH
+ ret = eeh_dev_open(pdev);
+#endif
+ return ret;
+}
+
static int vfio_pci_open(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
+ int ret;
if (!try_module_get(THIS_MODULE))
return -ENODEV;
if (atomic_inc_return(&vdev->refcnt) == 1) {
- int ret = vfio_pci_enable(vdev);
- if (ret) {
- module_put(THIS_MODULE);
- return ret;
- }
+ ret = vfio_pci_enable(vdev);
+ if (ret)
+ goto error;
+
+ ret = vfio_eeh_pci_open(vdev->pdev);
+ if (ret)
+ goto error;
}
return 0;
+error:
+ module_put(THIS_MODULE);
+ return ret;
}
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -321,6 +349,91 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
return walk.ret;
}
+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ unsigned long minsz;
+ int ret = 0;
+
+#ifdef CONFIG_EEH
+ switch (cmd) {
+ case VFIO_EEH_PE_SET_OPTION: {
+ struct vfio_eeh_pe_set_option option;
+
+ minsz = offsetofend(struct vfio_eeh_pe_set_option, option);
+ if (copy_from_user(&option, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (option.argsz < minsz)
+ return -EINVAL;
+
+ ret = eeh_pe_set_option(pdev, option.option);
+ break;
+ }
+ case VFIO_EEH_PE_GET_ADDR: {
+ struct vfio_eeh_pe_get_addr addr;
+
+ minsz = offsetofend(struct vfio_eeh_pe_get_addr, info);
+ if (copy_from_user(&addr, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (addr.argsz < minsz)
+ return -EINVAL;
+
+ ret = eeh_pe_get_addr(pdev, addr.option);
+ if (ret >= 0) {
+ addr.info = ret;
+ if (copy_to_user((void __user *)arg, &addr, minsz))
+ return -EFAULT;
+ ret = 0;
+ }
+
+ break;
+ }
+ case VFIO_EEH_PE_GET_STATE: {
+ struct vfio_eeh_pe_get_state state;
+
+ minsz = offsetofend(struct vfio_eeh_pe_get_state, state);
+ if (copy_from_user(&state, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (state.argsz < minsz)
+ return -EINVAL;
+
+ ret = eeh_pe_get_state(pdev);
+ if (ret >= 0) {
+ state.state = ret;
+ if (copy_to_user((void __user *)arg, &state, minsz))
+ return -EFAULT;
+ ret = 0;
+ }
+ break;
+ }
+ case VFIO_EEH_PE_RESET: {
+ struct vfio_eeh_pe_reset reset;
+
+ minsz = offsetofend(struct vfio_eeh_pe_reset, option);
+ if (copy_from_user(&reset, (void __user *)arg, minsz))
+ return -EFAULT;
+ if (reset.argsz < minsz)
+ return -EINVAL;
+
+ ret = eeh_pe_reset(pdev, reset.option);
+ break;
+ }
+ case VFIO_EEH_PE_CONFIGURE:
+ ret = eeh_pe_configure(pdev);
+ break;
+ default:
+ ret = -EINVAL;
+ pr_debug("%s: Cannot handle command %d\n",
+ __func__, cmd);
+ }
+#else
+ ret = -ENOENT;
+#endif
+
+ return ret;
+}
+
static long vfio_pci_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
{
@@ -682,6 +795,12 @@ hot_reset_release:
kfree(groups);
return ret;
+ } else if (cmd == VFIO_EEH_PE_SET_OPTION ||
+ cmd == VFIO_EEH_PE_GET_ADDR ||
+ cmd == VFIO_EEH_PE_GET_STATE ||
+ cmd == VFIO_EEH_PE_RESET ||
+ cmd == VFIO_EEH_PE_CONFIGURE) {
+ return vfio_eeh_pci_ioctl(vdev->pdev, cmd, arg);
}
return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..ef55682 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,59 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+/*
+ * EEH functionality can be enabled or disabled on one specific device.
+ * Also, the DMA or IO frozen state can be removed from the frozen PE
+ * if required.
+ */
+struct vfio_eeh_pe_set_option {
+ __u32 argsz;
What is this argsz thing? Is this your way of maintaining backwards
compatibility when we introduce new fields? A new field will change the
ioctl number, so I don't think that makes a lot of sense :).
Just make the ioctl have a u32 as incoming argument. No fancy structs,
no complicated code.
The same applies for a number of structs below.
+ __u32 option;
+};
+
+#define VFIO_EEH_PE_SET_OPTION _IO(VFIO_TYPE, VFIO_BASE + 21)
+
+/*
+ * Each EEH PE should have unique address to be identified. The command
+ * helps to retrieve the address and the sharing mode of the PE.
+ */
+struct vfio_eeh_pe_get_addr {
+ __u32 argsz;
+ __u32 option;
+ __u32 info;
Any particular reason you need the info field? Can't the return value of
the ioctl hold this? Then you only have a single u32 argument left to
the ioctl again.
Alex
+};
+
+#define VFIO_EEH_PE_GET_ADDR _IO(VFIO_TYPE, VFIO_BASE + 22)
+
+/*
+ * EEH PE might have been frozen because of PCI errors. Also, it might
+ * be experiencing reset for error recovery. The following command helps
+ * to get the state.
+ */
+struct vfio_eeh_pe_get_state {
+ __u32 argsz;
+ __u32 state;
+};
+
+#define VFIO_EEH_PE_GET_STATE _IO(VFIO_TYPE, VFIO_BASE + 23)
+
+/*
+ * Reset is the major step to recover problematic PE. The following
+ * command helps on that.
+ */
+struct vfio_eeh_pe_reset {
+ __u32 argsz;
+ __u32 option;
+};
+
+#define VFIO_EEH_PE_RESET _IO(VFIO_TYPE, VFIO_BASE + 24)
+
+/*
+ * One of the steps for recovery after PE reset is to configure the
+ * PCI bridges affected by the PE reset.
+ */
+#define VFIO_EEH_PE_CONFIGURE _IO(VFIO_TYPE, VFIO_BASE + 25)
+
/* ***************************************************************** */
#endif /* _UAPIVFIO_H */
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html