[+cc Loic] On Fri, Jan 08, 2021 at 09:19:00PM +0800, lvying6 wrote: > PCIe AER corrected error storm will flush effective system log. System > log will hold only the same PCIe AER correted error info. Add ratelimit > for PCIe AER corrected error make system log hold other more effective > system log info. > > Signed-off-by: lvying6 <lvying6@xxxxxxxxxx> Needs a real name, see https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst#n361 For general tips, see https://lore.kernel.org/r/20171026223701.GA25649@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx For my response to similar patch, see https://lore.kernel.org/r/20201204180615.GA1754610@bjorn-Precision-5520 TL;DR: we need to figure out and fix the root cause rather than applying a band-aid. > --- > drivers/pci/pcie/aer.c | 113 +++++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 95 insertions(+), 18 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index 77b0f2c..ba20bb05 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -114,6 +114,76 @@ bool pci_aer_available(void) > return !pcie_aer_disable && pci_msi_enabled(); > } > > +/* Not more than 2 messages every 5 seconds */ > +static DEFINE_RATELIMIT_STATE(ratelimit_aer, 5*HZ, 2); > + > +/* > + * aer_ratelimit - AER log ratelimit > + * @rs: ratelimit_state data > + * @log_start: first aer log print statement > + * > + * a complete aer log is composed of log from several functions > + * use printk_ratelimit for each aer log print statement will lose part > + * of the aer log cause the log to be incomplete > + * > + * RETURNS: > + * 0 means callbacks will be suppressed. > + * 1 means go ahead and do it. > + */ > +static int aer_ratelimit(struct ratelimit_state *rs, bool log_start) > +{ > + unsigned long flags; > + int ret; > + > + if (!rs->interval) > + return 1; > + > + /* > + * If we contend on this state's lock then almost > + * by definition we are too busy to print a message, > + * in addition to the one that will be printed by > + * the entity that is holding the lock already: > + */ > + if (!raw_spin_trylock_irqsave(&rs->lock, flags)) > + return 0; > + > + if (!rs->begin) > + rs->begin = jiffies; > + > + if (time_is_before_jiffies(rs->begin + rs->interval)) { > + if (rs->missed) { > + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { > + printk_deferred(KERN_WARNING > + "%s: %d callbacks suppressed\n", > + __func__, rs->missed); > + rs->missed = 0; > + } > + } > + rs->begin = jiffies; > + rs->printed = 0; > + } > + if (rs->burst && log_start) { > + rs->printed++; > + if (rs->burst >= rs->printed) { > + /* The first log is in burst range */ > + ret = 1; > + } else { > + /* The first log is out of burst range, account miss times */ > + rs->missed++; > + ret = 0; > + } > + } else if (rs->burst && rs->burst >= rs->printed && !log_start) { > + /* The remaining log is in burst range */ > + ret = 1; > + } else { > + /* The remaining log is out of burst range */ > + ret = 0; > + } > + raw_spin_unlock_irqrestore(&rs->lock, flags); > + > + return ret; > +} > + > #ifdef CONFIG_PCIE_ECRC > > #define ECRC_POLICY_DEFAULT 0 /* ECRC set by BIOS */ > @@ -683,14 +753,15 @@ static void __aer_print_error(struct pci_dev *dev, > level = KERN_ERR; > } > > - for_each_set_bit(i, &status, 32) { > - errmsg = strings[i]; > - if (!errmsg) > - errmsg = "Unknown Error Bit"; > + if (aer_ratelimit(&ratelimit_aer, false)) > + for_each_set_bit(i, &status, 32) { > + errmsg = strings[i]; > + if (!errmsg) > + errmsg = "Unknown Error Bit"; > > - pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, > - info->first_error == i ? " (First)" : ""); > - } > + pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg, > + info->first_error == i ? " (First)" : ""); > + } > pci_dev_aer_stats_incr(dev, info); > } > > @@ -701,8 +772,9 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > const char *level; > > if (!info->status) { > - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > - aer_error_severity_string[info->severity]); > + if (aer_ratelimit(&ratelimit_aer, false)) > + pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", > + aer_error_severity_string[info->severity]); > goto out; > } > > @@ -711,20 +783,23 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) > > level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR; > > - pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", > - aer_error_severity_string[info->severity], > - aer_error_layer[layer], aer_agent_string[agent]); > + if (aer_ratelimit(&ratelimit_aer, false)) { > + pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", > + aer_error_severity_string[info->severity], > + aer_error_layer[layer], aer_agent_string[agent]); > > - pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", > - dev->vendor, dev->device, info->status, info->mask); > + pci_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", > + dev->vendor, dev->device, info->status, info->mask); > + } > > __aer_print_error(dev, info); > > - if (info->tlp_header_valid) > + if (info->tlp_header_valid && aer_ratelimit(&ratelimit_aer, false)) > __print_tlp_header(dev, &info->tlp); > > out: > - if (info->id && info->error_dev_num > 1 && info->id == id) > + if (info->id && info->error_dev_num > 1 && info->id == id > + && aer_ratelimit(&ratelimit_aer, false)) > pci_err(dev, " Error of this Agent is reported first\n"); > > trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), > @@ -924,7 +999,8 @@ static bool find_source_device(struct pci_dev *parent, > pci_walk_bus(parent->subordinate, find_device_iter, e_info); > > if (!e_info->error_dev_num) { > - pci_info(parent, "can't find device of ID%04x\n", e_info->id); > + if (aer_ratelimit(&ratelimit_aer, false)) > + pci_info(parent, "can't find device of ID%04x\n", e_info->id); > return false; > } > return true; > @@ -1131,7 +1207,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc, > e_info.multi_error_valid = 1; > else > e_info.multi_error_valid = 0; > - aer_print_port_info(pdev, &e_info); > + if (aer_ratelimit(&ratelimit_aer, true)) > + aer_print_port_info(pdev, &e_info); > > if (find_source_device(pdev, &e_info)) > aer_process_err_devices(&e_info); > -- > 1.8.3.1 >