On Mon, 2014-03-03 at 17:37 -0800, gregkh@xxxxxxxxxxxxxxxxxxx wrote:
> The patch below does not apply to the 3.13-stable tree.
> If someone wants it applied there, or to any other stable or longterm
> tree, then please email the backport, including the original git commit
> id to <stable@xxxxxxxxxxxxxxx>.

Gavin, can you send a backport please ?

Cheers,
Ben.

> thanks,
>
> greg k-h
>
> ------------------ original commit in Linus's tree ------------------
>
> >From 947166043732b69878123bf31f51933ad0316080 Mon Sep 17 00:00:00 2001
> From: Gavin Shan <shangw@xxxxxxxxxxxxxxxxxx>
> Date: Tue, 25 Feb 2014 15:28:37 +0800
> Subject: [PATCH] powerpc/powernv: Dump PHB diag-data immediately
>
> The PHB diag-data is important to help locating the root cause for
> EEH errors such as frozen PE or fenced PHB. However, the EEH core
> enables IO path by clearing part of HW registers before collecting
> this data causing it to be corrupted.
>
> This patch fixes this by dumping the PHB diag-data immediately when
> frozen/fenced state on PE or PHB is detected for the first time in
> eeh_ops::get_state() or next_error() backend.
>
> Signed-off-by: Gavin Shan <shangw@xxxxxxxxxxxxxxxxxx>
> CC: <stable@xxxxxxxxxxxxxxx>
> Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
>
> diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
> index f51474336460..253fefe3d1a0 100644
> --- a/arch/powerpc/platforms/powernv/eeh-ioda.c
> +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
> @@ -114,6 +114,7 @@ DEFINE_SIMPLE_ATTRIBUTE(ioda_eeh_inbB_dbgfs_ops, ioda_eeh_inbB_dbgfs_get,
>                          ioda_eeh_inbB_dbgfs_set, "0x%llx\n");
>  #endif /* CONFIG_DEBUG_FS */
>  
> +
>  /**
>   * ioda_eeh_post_init - Chip dependent post initialization
>   * @hose: PCI controller
> @@ -221,6 +222,22 @@ static int ioda_eeh_set_option(struct eeh_pe *pe, int option)
>          return ret;
>  }
>  
> +static void ioda_eeh_phb_diag(struct pci_controller *hose)
> +{
> +        struct pnv_phb *phb = hose->private_data;
> +        long rc;
> +
> +        rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> +                                         PNV_PCI_DIAG_BUF_SIZE);
> +        if (rc != OPAL_SUCCESS) {
> +                pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
> +                           __func__, hose->global_number, rc);
> +                return;
> +        }
> +
> +        pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
> +}
> +
>  /**
>   * ioda_eeh_get_state - Retrieve the state of PE
>   * @pe: EEH PE
> @@ -272,6 +289,9 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
>                  result |= EEH_STATE_DMA_ACTIVE;
>                  result |= EEH_STATE_MMIO_ENABLED;
>                  result |= EEH_STATE_DMA_ENABLED;
> +        } else if (!(pe->state & EEH_PE_ISOLATED)) {
> +                eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
> +                ioda_eeh_phb_diag(hose);
>          }
>  
>          return result;
> @@ -315,6 +335,15 @@ static int ioda_eeh_get_state(struct eeh_pe *pe)
>                          __func__, fstate, hose->global_number, pe_no);
>          }
>  
> +        /* Dump PHB diag-data for frozen PE */
> +        if (result != EEH_STATE_NOT_SUPPORT &&
> +            (result & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) !=
> +            (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE) &&
> +            !(pe->state & EEH_PE_ISOLATED)) {
> +                eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
> +                ioda_eeh_phb_diag(hose);
> +        }
> +
>          return result;
>  }
>  
> @@ -530,42 +559,6 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
>  }
>  
>  /**
> - * ioda_eeh_get_log - Retrieve error log
> - * @pe: EEH PE
> - * @severity: Severity level of the log
> - * @drv_log: buffer to store the log
> - * @len: space of the log buffer
> - *
> - * The function is used to retrieve error log from P7IOC.
> - */
> -static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
> -                            char *drv_log, unsigned long len)
> -{
> -        s64 ret;
> -        unsigned long flags;
> -        struct pci_controller *hose = pe->phb;
> -        struct pnv_phb *phb = hose->private_data;
> -
> -        spin_lock_irqsave(&phb->lock, flags);
> -
> -        ret = opal_pci_get_phb_diag_data2(phb->opal_id,
> -                        phb->diag.blob, PNV_PCI_DIAG_BUF_SIZE);
> -        if (ret) {
> -                spin_unlock_irqrestore(&phb->lock, flags);
> -                pr_warning("%s: Can't get log for PHB#%x-PE#%x (%lld)\n",
> -                           __func__, hose->global_number, pe->addr, ret);
> -                return -EIO;
> -        }
> -
> -        /* The PHB diag-data is always indicative */
> -        pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
> -
> -        spin_unlock_irqrestore(&phb->lock, flags);
> -
> -        return 0;
> -}
> -
> -/**
>   * ioda_eeh_configure_bridge - Configure the PCI bridges for the indicated PE
>   * @pe: EEH PE
>   *
> @@ -646,22 +639,6 @@ static void ioda_eeh_hub_diag(struct pci_controller *hose)
>          }
>  }
>  
> -static void ioda_eeh_phb_diag(struct pci_controller *hose)
> -{
> -        struct pnv_phb *phb = hose->private_data;
> -        long rc;
> -
> -        rc = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag.blob,
> -                                         PNV_PCI_DIAG_BUF_SIZE);
> -        if (rc != OPAL_SUCCESS) {
> -                pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
> -                           __func__, hose->global_number, rc);
> -                return;
> -        }
> -
> -        pnv_pci_dump_phb_diag_data(hose, phb->diag.blob);
> -}
> -
>  static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
>                                 struct eeh_pe **pe)
>  {
> @@ -835,6 +812,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
>          }
>  
>          /*
> +         * EEH core will try recover from fenced PHB or
> +         * frozen PE. In the time for frozen PE, EEH core
> +         * enable IO path for that before collecting logs,
> +         * but it ruins the site. So we have to dump the
> +         * log in advance here.
> +         */
> +        if ((ret == EEH_NEXT_ERR_FROZEN_PE ||
> +             ret == EEH_NEXT_ERR_FENCED_PHB) &&
> +            !((*pe)->state & EEH_PE_ISOLATED)) {
> +                eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
> +                ioda_eeh_phb_diag(hose);
> +        }
> +
> +        /*
>           * If we have no errors on the specific PHB or only
>           * informative error there, we continue poking it.
>           * Otherwise, we need actions to be taken by upper
> @@ -852,7 +843,6 @@ struct pnv_eeh_ops ioda_eeh_ops = {
>          .set_option = ioda_eeh_set_option,
>          .get_state = ioda_eeh_get_state,
>          .reset = ioda_eeh_reset,
> -        .get_log = ioda_eeh_get_log,
>          .configure_bridge = ioda_eeh_configure_bridge,
>          .next_error = ioda_eeh_next_error
>  };

--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html
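[Editor's note] For anyone preparing the backport by hand, the essence of the change is a "dump once, at first detection" guard: the diag-data must be captured the first time a frozen/fenced condition is observed, because the recovery path that follows re-enables the IO path and rewrites the registers the data comes from. The following is a minimal standalone sketch of that guard in plain userspace C; the names (fake_pe, dump_diag_once, recover, PE_ISOLATED) are invented for illustration and are not the kernel API from the patch above.

/*
 * Standalone illustration (userspace C, hypothetical names):
 * capture diagnostic data once, before recovery clobbers it.
 */
#include <stdio.h>
#include <string.h>

#define PE_ISOLATED  0x1                /* stands in for EEH_PE_ISOLATED */

struct fake_pe {
        unsigned int state;             /* state flags */
        char diag[64];                  /* stands in for phb->diag.blob */
};

/* Capture and print the diagnostic data exactly once per freeze. */
static void dump_diag_once(struct fake_pe *pe)
{
        if (pe->state & PE_ISOLATED)
                return;                 /* already dumped for this error */
        pe->state |= PE_ISOLATED;       /* mirrors eeh_pe_state_mark() */
        printf("diag-data: %s\n", pe->diag);
}

/* Recovery re-enables the IO path, which overwrites the diag registers. */
static void recover(struct fake_pe *pe)
{
        strcpy(pe->diag, "<clobbered by recovery>");
        pe->state &= ~PE_ISOLATED;
}

int main(void)
{
        struct fake_pe pe = { .state = 0 };

        strcpy(pe.diag, "frozen PE: root cause details");
        dump_diag_once(&pe);            /* first detection: data still intact */
        dump_diag_once(&pe);            /* duplicate detection: guard skips it */
        recover(&pe);                   /* after this, the data would be useless */
        return 0;
}

The same guard appears three times in the patch (both branches of ioda_eeh_get_state() and in ioda_eeh_next_error()), which is why the old get_log hook, invoked only after the EEH core had already touched the hardware, could be removed.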