Looks good, just realized my patch was broken too! :( On Wed, Jul 20, 2011 at 3:20 PM, Jon Mason <mason@xxxxxxxx> wrote: > On a given PCI-E fabric, each device, bridge, and root port can have a > different PCI-E maximum payload size. There is a sizable performance > boost for having the largest possible maximum payload size on each PCI-E > device. However, if improperly configured, fatal bus errors can occur. > Thus, it is important to ensure that PCI-E payloads sends by a device > are never larger than the MPS setting of all devices on the way to the > destination. > > This can be achieved two ways: > > - A conservative approach is to use the smallest common denominator of > the entire tree below a root complex for every device on that fabric. > > This means for example that having a 128 bytes MPS USB controller on one > leg of a switch will dramatically reduce performances of a video card or > 10GE adapter on another leg of that same switch. > > It also means that any hierarchy supporting hotplug slots (including > expresscard or thunderbolt I suppose, dbl check that) will have to be > entirely clamped to 128 bytes since we cannot predict what will be > plugged into those slots, and we cannot change the MPS on a "live" > system. > > - A more optimal way is possible, if it falls within a couple of > constraints: > * The top-level host bridge will never generate packets larger than the > smallest TLP (or if it can be controlled independently from its MPS at > least) > * The device will never generate packets larger than MPS (which can be > configured via MRRS) > * No support of direct PCI-E <-> PCI-E transfers between devices without > some additional code to specifically deal with that case > > Then we can use an approach that basically ignores downstream requests > and focuses exclusively on upstream requests. In that case, all we need > to care about is that a device MPS is no larger than its parent MPS, > which allows us to keep all switches/bridges to the max MPS supported by > their parent and eventually the PHB. > > In this case, your USB controller would no longer "starve" your 10GE > Ethernet and your hotplug slots won't affect your global MPS. > Additionally, the hotplugged devices themselves can be configured to a > larger MPS up to the value configured in the hotplug bridge. > > To choose between the two available options, two PCI kernel boot args > have been added to the PCI calls. "pcie_bus_safe" will provide the > former behavior, while "pcie_bus_perf" will perform the latter behavior. > By default, the latter behavior is used. > > NOTE: due to the location of the enablement, each arch will need to add > calls to this function. This patch only enables x86. > > This patch includes a number of changes recommended by Benjamin > Herrenschmidt. > > Signed-off-by: Jon Mason <mason@xxxxxxxx> > --- > arch/x86/pci/acpi.c | 9 +++ > drivers/pci/hotplug/pcihp_slot.c | 45 +------------ > drivers/pci/pci.c | 67 +++++++++++++++++ > drivers/pci/probe.c | 145 ++++++++++++++++++++++++++++++++++++++ > include/linux/pci.h | 15 ++++- > 5 files changed, 236 insertions(+), 45 deletions(-) > > diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c > index 0972315..4aeac46 100644 > --- a/arch/x86/pci/acpi.c > +++ b/arch/x86/pci/acpi.c > @@ -361,6 +361,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) > } > } > > + /* After the PCI-E bus has been walked and all devices discovered, > + * configure any settings of the fabric that might be necessary. > + */ > + if (bus) { > + struct pci_bus *child; > + list_for_each_entry(child, &bus->children, node) > + pcie_bus_configure_settings(child, child->self->pcie_mpss); > + } > + > if (!bus) > kfree(sd); > > diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c > index 749fdf0..753b21a 100644 > --- a/drivers/pci/hotplug/pcihp_slot.c > +++ b/drivers/pci/hotplug/pcihp_slot.c > @@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) > */ > } > > -/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */ > -static int pci_set_payload(struct pci_dev *dev) > -{ > - int pos, ppos; > - u16 pctl, psz; > - u16 dctl, dsz, dcap, dmax; > - struct pci_dev *parent; > - > - parent = dev->bus->self; > - pos = pci_find_capability(dev, PCI_CAP_ID_EXP); > - if (!pos) > - return 0; > - > - /* Read Device MaxPayload capability and setting */ > - pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl); > - pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap); > - dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; > - dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD); > - > - /* Read Parent MaxPayload setting */ > - ppos = pci_find_capability(parent, PCI_CAP_ID_EXP); > - if (!ppos) > - return 0; > - pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl); > - psz = (pctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5; > - > - /* If parent payload > device max payload -> error > - * If parent payload > device payload -> set speed > - * If parent payload <= device payload -> do nothing > - */ > - if (psz > dmax) > - return -1; > - else if (psz > dsz) { > - dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz); > - pci_write_config_word(dev, pos + PCI_EXP_DEVCTL, > - (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) + > - (psz << 5)); > - } > - return 0; > -} > - > void pci_configure_slot(struct pci_dev *dev) > { > struct pci_dev *cdev; > @@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev) > (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI))) > return; > > - ret = pci_set_payload(dev); > - if (ret) > - dev_warn(&dev->dev, "could not set device max payload\n"); > + pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss); > > memset(&hpp, 0, sizeof(hpp)); > ret = pci_get_hp_params(dev, &hpp); > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > index 56098b3..4c13d20 100644 > --- a/drivers/pci/pci.c > +++ b/drivers/pci/pci.c > @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE; > unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE; > unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE; > > +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE; > + > /* > * The default CLS is used if arch didn't set CLS explicitly and not > * all pci devices agree on the same value. Arch can override either > @@ -3218,6 +3220,67 @@ out: > EXPORT_SYMBOL(pcie_set_readrq); > > /** > + * pcie_get_mps - get PCI Express maximum payload size > + * @dev: PCI device to query > + * > + * Returns maximum payload size in bytes > + * or appropriate error value. > + */ > +int pcie_get_mps(struct pci_dev *dev) > +{ > + int ret, cap; > + u16 ctl; > + > + cap = pci_pcie_cap(dev); > + if (!cap) > + return -EINVAL; > + > + ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); > + if (!ret) > + ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5); > + > + return ret; > +} > + > +/** > + * pcie_set_mps - set PCI Express maximum payload size > + * @dev: PCI device to query > + * @rq: maximum payload size in bytes > + * valid values are 128, 256, 512, 1024, 2048, 4096 > + * > + * If possible sets maximum payload size > + */ > +int pcie_set_mps(struct pci_dev *dev, int mps) > +{ > + int cap, err = -EINVAL; > + u16 ctl, v; > + > + if (mps < 128 || mps > 4096 || !is_power_of_2(mps)) > + goto out; > + > + v = ffs(mps) - 8; > + if (v > dev->pcie_mpss) > + goto out; > + v <<= 5; > + > + cap = pci_pcie_cap(dev); > + if (!cap) > + goto out; > + > + err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl); > + if (err) > + goto out; > + > + if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) { > + ctl &= ~PCI_EXP_DEVCTL_PAYLOAD; > + ctl |= v; > + err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl); > + } > +out: > + return err; > +} > + > +/** > * pci_select_bars - Make BAR mask from the type of resource > * @dev: the PCI device for which BAR mask is made > * @flags: resource type mask to be selected > @@ -3498,6 +3561,10 @@ static int __init pci_setup(char *str) > pci_hotplug_io_size = memparse(str + 9, &str); > } else if (!strncmp(str, "hpmemsize=", 10)) { > pci_hotplug_mem_size = memparse(str + 10, &str); > + } else if (!strncmp(str, "pcie_bus_safe", 13)) { > + pcie_bus_config = PCIE_BUS_SAFE; > + } else if (!strncmp(str, "pcie_bus_perf", 13)) { > + pcie_bus_config = PCIE_BUS_PERFORMANCE; > } else { > printk(KERN_ERR "PCI: Unknown option `%s'\n", > str); > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c > index 48849ff..fceca7c 100644 > --- a/drivers/pci/probe.c > +++ b/drivers/pci/probe.c > @@ -860,6 +860,8 @@ void set_pcie_port_type(struct pci_dev *pdev) > pdev->pcie_cap = pos; > pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, ®16); > pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4; > + pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, ®16); > + pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD; > } > > void set_pcie_hotplug_bridge(struct pci_dev *pdev) > @@ -1327,6 +1329,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn) > return nr; > } > > +static int pcie_find_smpss(struct pci_dev *dev, void *data) > +{ > + u8 *smpss = data; > + > + if (!pci_is_pcie(dev)) > + return 0; > + > + /* For PCIE hotplug enabled slots not connected directly to a > + * PCI-E root port, there can be problems when hotplugging > + * devices. This is due to the possibility of hotplugging a > + * device into the fabric with a smaller MPS that the devices > + * currently running have configured. Modifying the MPS on the > + * running devices could cause a fatal bus error due to an > + * incoming frame being larger than the newly configured MPS. > + * To work around this, the MPS for the entire fabric must be > + * set to the minimum size. Any devices hotplugged into this > + * fabric will have the minimum MPS set. If the PCI hotplug > + * slot is directly connected to the root port and there are not > + * other devices on the fabric (which seems to be the most > + * common case), then this is not an issue and MPS discovery > + * will occur as normal. > + */ > + if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) || > + dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT)) > + *smpss = 0; > + > + if (*smpss > dev->pcie_mpss) > + *smpss = dev->pcie_mpss; > + > + return 0; > +} > + > +static void pcie_write_mps(struct pci_dev *dev, int mps) > +{ > + int rc, dev_mpss; > + > + dev_mpss = 128 << dev->pcie_mpss; > + > + if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { > + if (dev->bus->self) { > + dev_dbg(&dev->bus->dev, "Bus MPSS %d\n", > + 128 << dev->bus->self->pcie_mpss); > + > + /* For "MPS Force Max", the assumption is made that > + * downstream communication will never be larger than > + * the MRRS. So, the MPS only needs to be configured > + * for the upstream communication. This being the case, > + * walk from the top down and set the MPS of the child > + * to that of the parent bus. > + */ > + mps = 128 << dev->bus->self->pcie_mpss; > + if (mps > dev_mpss) > + dev_warn(&dev->dev, "MPS configured higher than" > + " maximum supported by the device. If" > + " a bus issue occurs, try running with" > + " pci=pcie_bus_safe.\n"); > + } > + > + dev->pcie_mpss = ffs(mps) - 8; > + } > + > + rc = pcie_set_mps(dev, mps); > + if (rc) > + dev_err(&dev->dev, "Failed attempting to set the MPS\n"); > +} > + > +static void pcie_write_mrrs(struct pci_dev *dev, int mps) > +{ > + int rc, mrrs; > + > + if (pcie_bus_config == PCIE_BUS_PERFORMANCE) { > + int dev_mpss = 128 << dev->pcie_mpss; > + > + /* For Max performance, the MRRS must be set to the largest > + * supported value. However, it cannot be configured larger > + * than the MPS the device or the bus can support. This assumes > + * that the largest MRRS available on the device cannot be > + * smaller than the device MPSS. > + */ > + mrrs = mps < dev_mpss ? mps : dev_mpss; > + } else > + /* In the "safe" case, configure the MRRS for fairness on the > + * bus by making all devices have the same size > + */ > + mrrs = mps; > + > + > + /* MRRS is a R/W register. Invalid values can be written, but a > + * subsiquent read will verify if the value is acceptable or not. > + * If the MRRS value provided is not acceptable (e.g., too large), > + * shrink the value until it is acceptable to the HW. > + */ > + while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) { > + rc = pcie_set_readrq(dev, mrrs); > + if (rc) > + dev_err(&dev->dev, "Failed attempting to set the MRRS\n"); > + > + mrrs /= 2; > + } > +} > + > +static int pcie_bus_configure_set(struct pci_dev *dev, void *data) > +{ > + int mps = 128 << *(u8 *)data; > + > + if (!pci_is_pcie(dev)) > + return 0; > + > + dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", > + pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev)); > + > + pcie_write_mps(dev, mps); > + pcie_write_mrrs(dev, mps); > + > + dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n", > + pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev)); > + > + return 0; > +} > + > +/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down, > + * parents then children fashion. If this changes, then this code will not > + * work as designed. > + */ > +void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss) > +{ > + u8 smpss = mpss; > + > + if (!bus->self) > + return; > + > + if (!pci_is_pcie(bus->self)) > + return; > + > + if (pcie_bus_config == PCIE_BUS_SAFE) { > + pcie_find_smpss(bus->self, &smpss); > + pci_walk_bus(bus, pcie_find_smpss, &smpss); > + } > + > + pcie_bus_configure_set(bus->self, &smpss); > + pci_walk_bus(bus, pcie_bus_configure_set, &smpss); > +} > + > unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus) > { > unsigned int devfn, pass, max = bus->secondary; > diff --git a/include/linux/pci.h b/include/linux/pci.h > index c446b5c..161aa45 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -251,7 +251,8 @@ struct pci_dev { > u8 revision; /* PCI revision, low byte of class word */ > u8 hdr_type; /* PCI header type (`multi' flag masked out) */ > u8 pcie_cap; /* PCI-E capability offset */ > - u8 pcie_type; /* PCI-E device/port type */ > + u8 pcie_type:4; /* PCI-E device/port type */ > + u8 pcie_mpss:3; /* PCI-E Max Payload Size Supported */ > u8 rom_base_reg; /* which config register controls the ROM */ > u8 pin; /* which interrupt pin this device uses */ > > @@ -617,6 +618,16 @@ struct pci_driver { > /* these external functions are only available when PCI support is enabled */ > #ifdef CONFIG_PCI > > +extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss); > + > +enum pcie_bus_config_types { > + PCIE_BUS_PERFORMANCE, > + PCIE_BUS_SAFE, > + PCIE_BUS_PEER2PEER, > +}; > + > +extern enum pcie_bus_config_types pcie_bus_config; > + > extern struct bus_type pci_bus_type; > > /* Do NOT directly access these two variables, unless you are arch specific pci > @@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev); > int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc); > int pcie_get_readrq(struct pci_dev *dev); > int pcie_set_readrq(struct pci_dev *dev, int rq); > +int pcie_get_mps(struct pci_dev *dev); > +int pcie_set_mps(struct pci_dev *dev, int mps); > int __pci_reset_function(struct pci_dev *dev); > int pci_reset_function(struct pci_dev *dev); > void pci_update_resource(struct pci_dev *dev, int resno); > -- > 1.7.6 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-pci" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html