Re: [PATCH] AER: add ratelimit for PCIe AER corrected error storm log print

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[+cc Loic]

On Fri, Jan 08, 2021 at 09:19:00PM +0800, lvying6 wrote:
> PCIe AER corrected error storm will flush effective system log. System
> log will hold only the same PCIe AER correted error info. Add ratelimit
> for PCIe AER corrected error make system log hold other more effective
> system log info.
> 
> Signed-off-by: lvying6 <lvying6@xxxxxxxxxx>

Needs a real name, see https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst#n361

For general tips, see https://lore.kernel.org/r/20171026223701.GA25649@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

For my response to similar patch, see https://lore.kernel.org/r/20201204180615.GA1754610@bjorn-Precision-5520

TL;DR: we need to figure out and fix the root cause rather than
applying a band-aid.

> ---
>  drivers/pci/pcie/aer.c | 113 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 95 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 77b0f2c..ba20bb05 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -114,6 +114,76 @@ bool pci_aer_available(void)
>  	return !pcie_aer_disable && pci_msi_enabled();
>  }
>  
> +/* Not more than 2 messages every 5 seconds */
> +static DEFINE_RATELIMIT_STATE(ratelimit_aer, 5*HZ, 2);
> +
> +/*
> + * aer_ratelimit - AER log ratelimit
> + * @rs: ratelimit_state data
> + * @log_start: first aer log print statement
> + *
> + * a complete aer log is composed of log from several functions
> + * use printk_ratelimit for each aer log print statement will lose part
> + * of the aer log cause the log to be incomplete
> + *
> + * RETURNS:
> + * 0 means callbacks will be suppressed.
> + * 1 means go ahead and do it.
> + */
> +static int aer_ratelimit(struct ratelimit_state *rs, bool log_start)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	if (!rs->interval)
> +		return 1;
> +
> +	/*
> +	 * If we contend on this state's lock then almost
> +	 * by definition we are too busy to print a message,
> +	 * in addition to the one that will be printed by
> +	 * the entity that is holding the lock already:
> +	 */
> +	if (!raw_spin_trylock_irqsave(&rs->lock, flags))
> +		return 0;
> +
> +	if (!rs->begin)
> +		rs->begin = jiffies;
> +
> +	if (time_is_before_jiffies(rs->begin + rs->interval)) {
> +		if (rs->missed) {
> +			if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
> +				printk_deferred(KERN_WARNING
> +						"%s: %d callbacks suppressed\n",
> +						__func__, rs->missed);
> +				rs->missed = 0;
> +			}
> +		}
> +		rs->begin   = jiffies;
> +		rs->printed = 0;
> +	}
> +	if (rs->burst && log_start) {
> +		rs->printed++;
> +		if (rs->burst >= rs->printed) {
> +			/* The first log is in burst range */
> +			ret = 1;
> +		} else {
> +			/* The first log is out of  burst range, account miss times */
> +			rs->missed++;
> +			ret = 0;
> +		}
> +	} else if (rs->burst && rs->burst >= rs->printed && !log_start) {
> +		/* The remaining log is in burst range */
> +		ret = 1;
> +	} else {
> +		/* The remaining log is out of burst range */
> +		ret = 0;
> +	}
> +	raw_spin_unlock_irqrestore(&rs->lock, flags);
> +
> +	return ret;
> +}
> +
>  #ifdef CONFIG_PCIE_ECRC
>  
>  #define ECRC_POLICY_DEFAULT 0		/* ECRC set by BIOS */
> @@ -683,14 +753,15 @@ static void __aer_print_error(struct pci_dev *dev,
>  		level = KERN_ERR;
>  	}
>  
> -	for_each_set_bit(i, &status, 32) {
> -		errmsg = strings[i];
> -		if (!errmsg)
> -			errmsg = "Unknown Error Bit";
> +	if (aer_ratelimit(&ratelimit_aer, false))
> +		for_each_set_bit(i, &status, 32) {
> +			errmsg = strings[i];
> +			if (!errmsg)
> +				errmsg = "Unknown Error Bit";
>  
> -		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
> -				info->first_error == i ? " (First)" : "");
> -	}
> +			pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
> +					info->first_error == i ? " (First)" : "");
> +		}
>  	pci_dev_aer_stats_incr(dev, info);
>  }
>  
> @@ -701,8 +772,9 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	const char *level;
>  
>  	if (!info->status) {
> -		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> -			aer_error_severity_string[info->severity]);
> +		if (aer_ratelimit(&ratelimit_aer, false))
> +			pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> +				aer_error_severity_string[info->severity]);
>  		goto out;
>  	}
>  
> @@ -711,20 +783,23 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  
>  	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
>  
> -	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> -		   aer_error_severity_string[info->severity],
> -		   aer_error_layer[layer], aer_agent_string[agent]);
> +	if (aer_ratelimit(&ratelimit_aer, false)) {
> +		pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> +			   aer_error_severity_string[info->severity],
> +			   aer_error_layer[layer], aer_agent_string[agent]);
>  
> -	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> -		   dev->vendor, dev->device, info->status, info->mask);
> +		pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +			   dev->vendor, dev->device, info->status, info->mask);
> +	}
>  
>  	__aer_print_error(dev, info);
>  
> -	if (info->tlp_header_valid)
> +	if (info->tlp_header_valid && aer_ratelimit(&ratelimit_aer, false))
>  		__print_tlp_header(dev, &info->tlp);
>  
>  out:
> -	if (info->id && info->error_dev_num > 1 && info->id == id)
> +	if (info->id && info->error_dev_num > 1 && info->id == id
> +			&& aer_ratelimit(&ratelimit_aer, false))
>  		pci_err(dev, "  Error of this Agent is reported first\n");
>  
>  	trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
> @@ -924,7 +999,8 @@ static bool find_source_device(struct pci_dev *parent,
>  		pci_walk_bus(parent->subordinate, find_device_iter, e_info);
>  
>  	if (!e_info->error_dev_num) {
> -		pci_info(parent, "can't find device of ID%04x\n", e_info->id);
> +		if (aer_ratelimit(&ratelimit_aer, false))
> +			pci_info(parent, "can't find device of ID%04x\n", e_info->id);
>  		return false;
>  	}
>  	return true;
> @@ -1131,7 +1207,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
>  			e_info.multi_error_valid = 1;
>  		else
>  			e_info.multi_error_valid = 0;
> -		aer_print_port_info(pdev, &e_info);
> +		if (aer_ratelimit(&ratelimit_aer, true))
> +			aer_print_port_info(pdev, &e_info);
>  
>  		if (find_source_device(pdev, &e_info))
>  			aer_process_err_devices(&e_info);
> -- 
> 1.8.3.1
> 



[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux