Re: [PATCHv3 05/10] PCI/ERR: Handle fatal error recovery

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Sep 18, 2018 at 05:56:57PM -0600, Keith Busch wrote:
> We don't need to be paranoid about the topology changing while handling an
> error. If the device has changed in a hotplug capable slot, we can rely
> on the presence detection handling to react to a changing topology. This
> patch restores the fatal error handling behavior that existed before
> merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with
> removal and re-enumeration of devices").

7e9084b3674 updated Documentation/PCI/pci-error-recovery.txt; doesn't
this require another update to keep it matching the code?

> Signed-off-by: Keith Busch <keith.busch@xxxxxxxxx>
> ---
>  drivers/pci/pci.h      |  4 +--
>  drivers/pci/pcie/aer.c | 12 +++++---
>  drivers/pci/pcie/dpc.c |  4 +--
>  drivers/pci/pcie/err.c | 75 ++++----------------------------------------------
>  4 files changed, 18 insertions(+), 77 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 21bfa30db18d..5a96978d3403 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -425,8 +425,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev)
>  #endif
>  
>  /* PCI error reporting and recovery */
> -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service);
> -void pcie_do_nonfatal_recovery(struct pci_dev *dev);
> +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
> +		      u32 service);
>  
>  bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
>  #ifdef CONFIG_PCIEASPM
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index b4d14acee66d..6b59a23568f8 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -1010,9 +1010,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info)
>  					info->status);
>  		pci_aer_clear_device_status(dev);
>  	} else if (info->severity == AER_NONFATAL)
> -		pcie_do_nonfatal_recovery(dev);
> +		pcie_do_recovery(dev, pci_channel_io_normal,
> +				 PCIE_PORT_SERVICE_AER);
>  	else if (info->severity == AER_FATAL)
> -		pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER);
> +		pcie_do_recovery(dev, pci_channel_io_frozen,
> +				 PCIE_PORT_SERVICE_AER);
>  	pci_dev_put(dev);
>  }
>  
> @@ -1048,9 +1050,11 @@ static void aer_recover_work_func(struct work_struct *work)
>  		}
>  		cper_print_aer(pdev, entry.severity, entry.regs);
>  		if (entry.severity == AER_NONFATAL)
> -			pcie_do_nonfatal_recovery(pdev);
> +			pcie_do_recovery(pdev, pci_channel_io_normal,
> +					 PCIE_PORT_SERVICE_AER);
>  		else if (entry.severity == AER_FATAL)
> -			pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER);
> +			pcie_do_recovery(pdev, pci_channel_io_frozen,
> +					 PCIE_PORT_SERVICE_AER);
>  		pci_dev_put(pdev);
>  	}
>  }
> diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
> index aacfb50eccfc..1ed07db8ea7d 100644
> --- a/drivers/pci/pcie/dpc.c
> +++ b/drivers/pci/pcie/dpc.c
> @@ -169,7 +169,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
>  
>  	reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
>  	ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
> -	dev_warn(dev, "DPC %s detected, remove downstream devices\n",
> +	dev_warn(dev, "DPC %s detected\n",
>  		 (reason == 0) ? "unmasked uncorrectable error" :
>  		 (reason == 1) ? "ERR_NONFATAL" :
>  		 (reason == 2) ? "ERR_FATAL" :
> @@ -186,7 +186,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
>  	}
>  
>  	/* We configure DPC so it only triggers on ERR_FATAL */
> -	pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC);
> +	pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC);
>  
>  	return IRQ_HANDLED;
>  }
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 62ab665f0f03..644f3f725ef0 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -271,83 +271,20 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
>  	return result_data.result;
>  }
>  
> -/**
> - * pcie_do_fatal_recovery - handle fatal error recovery process
> - * @dev: pointer to a pci_dev data structure of agent detecting an error
> - *
> - * Invoked when an error is fatal. Once being invoked, removes the devices
> - * beneath this AER agent, followed by reset link e.g. secondary bus reset
> - * followed by re-enumeration of devices.
> - */
> -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
> -{
> -	struct pci_dev *udev;
> -	struct pci_bus *parent;
> -	struct pci_dev *pdev, *temp;
> -	pci_ers_result_t result;
> -
> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
> -		udev = dev;
> -	else
> -		udev = dev->bus->self;
> -
> -	parent = udev->subordinate;
> -	pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
> -
> -	pci_lock_rescan_remove();
> -	pci_dev_get(dev);
> -	list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
> -					 bus_list) {
> -		pci_stop_and_remove_bus_device(pdev);
> -	}
> -
> -	result = reset_link(udev, service);
> -
> -	if ((service == PCIE_PORT_SERVICE_AER) &&
> -	    (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
> -		/*
> -		 * If the error is reported by a bridge, we think this error
> -		 * is related to the downstream link of the bridge, so we
> -		 * do error recovery on all subordinates of the bridge instead
> -		 * of the bridge and clear the error status of the bridge.
> -		 */
> -		pci_aer_clear_fatal_status(dev);
> -		pci_aer_clear_device_status(dev);
> -	}
> -
> -	if (result == PCI_ERS_RESULT_RECOVERED) {
> -		if (pcie_wait_for_link(udev, true))
> -			pci_rescan_bus(udev->bus);
> -		pci_info(dev, "Device recovery from fatal error successful\n");
> -	} else {
> -		pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
> -		pci_info(dev, "Device recovery from fatal error failed\n");
> -	}
> -
> -	pci_dev_put(dev);
> -	pci_unlock_rescan_remove();
> -}
> -
> -/**
> - * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
> - * @dev: pointer to a pci_dev data structure of agent detecting an error
> - *
> - * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
> - * error detected message to all downstream drivers within a hierarchy in
> - * question and return the returned code.
> - */
> -void pcie_do_nonfatal_recovery(struct pci_dev *dev)
> +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
> +		      u32 service)
>  {
>  	pci_ers_result_t status;
> -	enum pci_channel_state state;
> -
> -	state = pci_channel_io_normal;
>  
>  	status = broadcast_error_message(dev,
>  			state,
>  			"error_detected",
>  			report_error_detected);
>  
> +	if (state == pci_channel_io_frozen &&
> +	    reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED)
> +		goto failed;
> +
>  	if (status == PCI_ERS_RESULT_CAN_RECOVER)
>  		status = broadcast_error_message(dev,
>  				state,
> -- 
> 2.14.4
> 



[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux