On Fri, 2009-06-12 at 11:08 +0800, Zhang, Yanmin wrote: > When a root port receive the same errors more than once before kernel > process them, the Multiple Error Messages Received flags are set by > hardware. Because root port could only save one kind of correctable > error source id and another uncorrectable error source id at the same > time, so the second message sender id is lost if the 2 messages are > sent from 2 different devices. Below patch searches all devices under > the root port when multiple messages are received. > > Signed-off-by: Zhang Yanmin <yanmin.zhang@xxxxxxxxxxxxxxx> > > --- > > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c 2009-06-12 05:39:24.000000000 +0800 > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c 2009-06-12 05:45:15.000000000 +0800 > @@ -145,13 +145,22 @@ static void set_downstream_devices_error > pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable); > } > > +static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev) > +{ > + if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) { > + e_info->dev[e_info->error_dev_num ++] = dev; checkpatch reports: ERROR: space prohibited before that '++' (ctx:WxB) #46: FILE: drivers/pci/pcie/aer/aerdrv_core.c:151: + e_info->dev[e_info->error_dev_num ++] = dev; Personally I would prefer: e_info->dev[e_info->error_dev_num] = dev; e_info->error_dev_num++; > + return 1; > + } else > + return 0; > +} > + This function is now doing more than just comparing device ID's. Perhaps you could rename it or put call add_error_device after compare_device_id in find_device_iter? > static int compare_device_id(struct pci_dev *dev, struct aer_err_info *e_info) > { > if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) { > /* > * Device ID match > */ > - e_info->dev = dev; > + add_error_device(e_info, dev); > return 1; > } > > @@ -166,20 +175,38 @@ static int find_device_iter(struct pci_d > u32 status; > u32 mask; > u16 reg16; > + int result; > struct aer_err_info *e_info = (struct aer_err_info *)data; > > /* > * When bus id is equal to 0, it might be a bad id > * reported by root port. > */ > - if (!nosourceid && (PCI_BUS(e_info->id) != 0)) > - return compare_device_id(dev, e_info); > + if (!nosourceid && (PCI_BUS(e_info->id) != 0)) { > + result = compare_device_id(dev, e_info); > + /* > + * If there is no multiple error, we stop > + * or continue based on the id comparing. > + */ > + if (!(e_info->flags & AER_MULTI_ERROR_VALID_FLAG)) > + return result; > + > + /* > + * If there are multiple errors and id does match, > + * We need continue to search other devices under > + * the root port. Return 0 means that. > + */ > + if (result) > + return 0; > + } > > /* > - * Next is to check when bus id is equal to 0 or > - * nosourceid==y. Some ports might lose the bus > - * id of error source id. We check AER status > - * registers to find the initial reporter. > + * When either > + * 1) nosourceid==y; > + * 2) bus id is equal to 0. Some ports might lose the bus > + * id of error source id; > + * 3) There are multiple errors and prior id comparing fails; > + * We check AER status registers to find the initial reporter. > */ > if (atomic_read(&dev->enable_cnt) == 0) > return 0; > @@ -208,8 +235,8 @@ static int find_device_iter(struct pci_d > pos + PCI_ERR_COR_MASK, > &mask); > if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) { > - e_info->dev = dev; > - return 1; > + add_error_device(e_info, dev); > + goto added; > } > } else { > pci_read_config_dword(dev, > @@ -219,12 +246,18 @@ static int find_device_iter(struct pci_d > pos + PCI_ERR_UNCOR_MASK, > &mask); > if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) { > - e_info->dev = dev; > - return 1; > + add_error_device(e_info, dev); > + goto added; > } > } > > return 0; > + > +added: > + if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) { > + return 0; > + } else > + return 1; checkpatch reports: WARNING: braces {} are not necessary for any arm of this statement #133: FILE: drivers/pci/pcie/aer/aerdrv_core.c:257: + if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) { [...] + } else [...] > } > > /** > @@ -705,6 +738,30 @@ static int get_device_error_info(struct > return AER_SUCCESS; > } > > +static inline void aer_process_err_devices(struct pcie_device *p_device, > + struct aer_err_info *e_info) > +{ > + int i; > + > + if (e_info->dev[0] == NULL) { Minor not. Can we use if (!e_info->dev[0]) { > + printk(KERN_DEBUG "%s->can't find device of ID%04x\n", > + __func__, e_info->id); I suspect we don't want to embed the function name here, and use dev_printk. > + } > + > + for (i = 0; i < e_info->error_dev_num; i ++) { checkpatch reports: ERROR: space prohibited before that '++' (ctx:WxB) #154: FILE: drivers/pci/pcie/aer/aerdrv_core.c:751: + for (i = 0; i < e_info->error_dev_num; i + > + if (e_info->dev[i] == NULL) again if (!e_info->dev[i]) You could also put this check in the for loop. > + break; > + > + if (get_device_error_info(e_info->dev[i], e_info) == > + AER_SUCCESS) { > + aer_print_error(e_info->dev[i], e_info); > + handle_error_source(p_device, > + e_info->dev[i], > + e_info); > + } > + } > +} > + > /** > * aer_isr_one_error - consume an error detected by root port > * @p_device: pointer to error root port service device > @@ -747,18 +804,7 @@ static void aer_isr_one_error(struct pci > e_info->flags |= AER_MULTI_ERROR_VALID_FLAG; > > find_source_device(p_device->port, e_info); > - if (e_info->dev == NULL) { > - printk(KERN_DEBUG "%s->can't find device of ID%04x\n", > - __func__, e_info->id); > - continue; > - } > - if (get_device_error_info(e_info->dev, e_info) == > - AER_SUCCESS) { > - aer_print_error(e_info->dev, e_info); > - handle_error_source(p_device, > - e_info->dev, > - e_info); > - } > + aer_process_err_devices(p_device, e_info); > } > > kfree(e_info); > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h 2009-06-12 05:39:24.000000000 +0800 > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h 2009-06-12 05:45:15.000000000 +0800 > @@ -57,8 +57,10 @@ struct header_log_regs { > unsigned int dw3; > }; > > +#define AER_MAX_MULTI_ERR_DEVICES 5 Is this number arbitrary or in the spec somewhere? > struct aer_err_info { > - struct pci_dev *dev; > + struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; > + int error_dev_num; > u16 id; > int severity; /* 0:NONFATAL | 1:FATAL | 2:COR */ > int flags; > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-pci" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- Andrew Patterson Hewlett-Packard -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html