Re: [RFC v7] PCI: Set PCI-E Max Payload Size on fabric

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Tested-by: Jordan_Hargrave@xxxxxxxx

On Wed, Jul 20, 2011 at 5:58 PM, jordan hargrave <jharg93@xxxxxxxxx> wrote:
> Looks good, just realized my patch was broken too! :(
>
> On Wed, Jul 20, 2011 at 3:20 PM, Jon Mason <mason@xxxxxxxx> wrote:
>> On a given PCI-E fabric, each device, bridge, and root port can have a
>> different PCI-E maximum payload size.  There is a sizable performance
>> boost for having the largest possible maximum payload size on each PCI-E
>> device.  However, if improperly configured, fatal bus errors can occur.
>> Thus, it is important to ensure that PCI-E payloads sends by a device
>> are never larger than the MPS setting of all devices on the way to the
>> destination.
>>
>> This can be achieved two ways:
>>
>> - A conservative approach is to use the smallest common denominator of
>>  the entire tree below a root complex for every device on that fabric.
>>
>> This means for example that having a 128 bytes MPS USB controller on one
>> leg of a switch will dramatically reduce performances of a video card or
>> 10GE adapter on another leg of that same switch.
>>
>> It also means that any hierarchy supporting hotplug slots (including
>> expresscard or thunderbolt I suppose, dbl check that) will have to be
>> entirely clamped to 128 bytes since we cannot predict what will be
>> plugged into those slots, and we cannot change the MPS on a "live"
>> system.
>>
>> - A more optimal way is possible, if it falls within a couple of
>>  constraints:
>> * The top-level host bridge will never generate packets larger than the
>>  smallest TLP (or if it can be controlled independently from its MPS at
>>  least)
>> * The device will never generate packets larger than MPS (which can be
>>  configured via MRRS)
>> * No support of direct PCI-E <-> PCI-E transfers between devices without
>>  some additional code to specifically deal with that case
>>
>> Then we can use an approach that basically ignores downstream requests
>> and focuses exclusively on upstream requests. In that case, all we need
>> to care about is that a device MPS is no larger than its parent MPS,
>> which allows us to keep all switches/bridges to the max MPS supported by
>> their parent and eventually the PHB.
>>
>> In this case, your USB controller would no longer "starve" your 10GE
>> Ethernet and your hotplug slots won't affect your global MPS.
>> Additionally, the hotplugged devices themselves can be configured to a
>> larger MPS up to the value configured in the hotplug bridge.
>>
>> To choose between the two available options, two PCI kernel boot args
>> have been added to the PCI calls.  "pcie_bus_safe" will provide the
>> former behavior, while "pcie_bus_perf" will perform the latter behavior.
>> By default, the latter behavior is used.
>>
>> NOTE: due to the location of the enablement, each arch will need to add
>> calls to this function.  This patch only enables x86.
>>
>> This patch includes a number of changes recommended by Benjamin
>> Herrenschmidt.
>>
>> Signed-off-by: Jon Mason <mason@xxxxxxxx>
>> ---
>>  arch/x86/pci/acpi.c              |    9 +++
>>  drivers/pci/hotplug/pcihp_slot.c |   45 +------------
>>  drivers/pci/pci.c                |   67 +++++++++++++++++
>>  drivers/pci/probe.c              |  145 ++++++++++++++++++++++++++++++++++++++
>>  include/linux/pci.h              |   15 ++++-
>>  5 files changed, 236 insertions(+), 45 deletions(-)
>>
>> diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
>> index 0972315..4aeac46 100644
>> --- a/arch/x86/pci/acpi.c
>> +++ b/arch/x86/pci/acpi.c
>> @@ -361,6 +361,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
>>                }
>>        }
>>
>> +       /* After the PCI-E bus has been walked and all devices discovered,
>> +        * configure any settings of the fabric that might be necessary.
>> +        */
>> +       if (bus) {
>> +               struct pci_bus *child;
>> +               list_for_each_entry(child, &bus->children, node)
>> +                       pcie_bus_configure_settings(child, child->self->pcie_mpss);
>> +       }
>> +
>>        if (!bus)
>>                kfree(sd);
>>
>> diff --git a/drivers/pci/hotplug/pcihp_slot.c b/drivers/pci/hotplug/pcihp_slot.c
>> index 749fdf0..753b21a 100644
>> --- a/drivers/pci/hotplug/pcihp_slot.c
>> +++ b/drivers/pci/hotplug/pcihp_slot.c
>> @@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
>>         */
>>  }
>>
>> -/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
>> -static int pci_set_payload(struct pci_dev *dev)
>> -{
>> -       int pos, ppos;
>> -       u16 pctl, psz;
>> -       u16 dctl, dsz, dcap, dmax;
>> -       struct pci_dev *parent;
>> -
>> -       parent = dev->bus->self;
>> -       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
>> -       if (!pos)
>> -               return 0;
>> -
>> -       /* Read Device MaxPayload capability and setting */
>> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
>> -       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
>> -       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
>> -       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
>> -
>> -       /* Read Parent MaxPayload setting */
>> -       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
>> -       if (!ppos)
>> -               return 0;
>> -       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
>> -       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
>> -
>> -       /* If parent payload > device max payload -> error
>> -        * If parent payload > device payload -> set speed
>> -        * If parent payload <= device payload -> do nothing
>> -        */
>> -       if (psz > dmax)
>> -               return -1;
>> -       else if (psz > dsz) {
>> -               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
>> -               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
>> -                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
>> -                                     (psz << 5));
>> -       }
>> -       return 0;
>> -}
>> -
>>  void pci_configure_slot(struct pci_dev *dev)
>>  {
>>        struct pci_dev *cdev;
>> @@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
>>                        (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
>>                return;
>>
>> -       ret = pci_set_payload(dev);
>> -       if (ret)
>> -               dev_warn(&dev->dev, "could not set device max payload\n");
>> +       pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
>>
>>        memset(&hpp, 0, sizeof(hpp));
>>        ret = pci_get_hp_params(dev, &hpp);
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 56098b3..4c13d20 100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
>>  unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
>>  unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
>>
>> +enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
>> +
>>  /*
>>  * The default CLS is used if arch didn't set CLS explicitly and not
>>  * all pci devices agree on the same value.  Arch can override either
>> @@ -3218,6 +3220,67 @@ out:
>>  EXPORT_SYMBOL(pcie_set_readrq);
>>
>>  /**
>> + * pcie_get_mps - get PCI Express maximum payload size
>> + * @dev: PCI device to query
>> + *
>> + * Returns maximum payload size in bytes
>> + *    or appropriate error value.
>> + */
>> +int pcie_get_mps(struct pci_dev *dev)
>> +{
>> +       int ret, cap;
>> +       u16 ctl;
>> +
>> +       cap = pci_pcie_cap(dev);
>> +       if (!cap)
>> +               return -EINVAL;
>> +
>> +       ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
>> +       if (!ret)
>> +               ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
>> +
>> +       return ret;
>> +}
>> +
>> +/**
>> + * pcie_set_mps - set PCI Express maximum payload size
>> + * @dev: PCI device to query
>> + * @rq: maximum payload size in bytes
>> + *    valid values are 128, 256, 512, 1024, 2048, 4096
>> + *
>> + * If possible sets maximum payload size
>> + */
>> +int pcie_set_mps(struct pci_dev *dev, int mps)
>> +{
>> +       int cap, err = -EINVAL;
>> +       u16 ctl, v;
>> +
>> +       if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
>> +               goto out;
>> +
>> +       v = ffs(mps) - 8;
>> +       if (v > dev->pcie_mpss)
>> +               goto out;
>> +       v <<= 5;
>> +
>> +       cap = pci_pcie_cap(dev);
>> +       if (!cap)
>> +               goto out;
>> +
>> +       err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
>> +       if (err)
>> +               goto out;
>> +
>> +       if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
>> +               ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
>> +               ctl |= v;
>> +               err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
>> +       }
>> +out:
>> +       return err;
>> +}
>> +
>> +/**
>>  * pci_select_bars - Make BAR mask from the type of resource
>>  * @dev: the PCI device for which BAR mask is made
>>  * @flags: resource type mask to be selected
>> @@ -3498,6 +3561,10 @@ static int __init pci_setup(char *str)
>>                                pci_hotplug_io_size = memparse(str + 9, &str);
>>                        } else if (!strncmp(str, "hpmemsize=", 10)) {
>>                                pci_hotplug_mem_size = memparse(str + 10, &str);
>> +                       } else if (!strncmp(str, "pcie_bus_safe", 13)) {
>> +                               pcie_bus_config = PCIE_BUS_SAFE;
>> +                       } else if (!strncmp(str, "pcie_bus_perf", 13)) {
>> +                               pcie_bus_config = PCIE_BUS_PERFORMANCE;
>>                        } else {
>>                                printk(KERN_ERR "PCI: Unknown option `%s'\n",
>>                                                str);
>> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
>> index 48849ff..fceca7c 100644
>> --- a/drivers/pci/probe.c
>> +++ b/drivers/pci/probe.c
>> @@ -860,6 +860,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
>>        pdev->pcie_cap = pos;
>>        pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
>>        pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
>> +       pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
>> +       pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
>>  }
>>
>>  void set_pcie_hotplug_bridge(struct pci_dev *pdev)
>> @@ -1327,6 +1329,149 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>>        return nr;
>>  }
>>
>> +static int pcie_find_smpss(struct pci_dev *dev, void *data)
>> +{
>> +       u8 *smpss = data;
>> +
>> +       if (!pci_is_pcie(dev))
>> +               return 0;
>> +
>> +       /* For PCIE hotplug enabled slots not connected directly to a
>> +        * PCI-E root port, there can be problems when hotplugging
>> +        * devices.  This is due to the possibility of hotplugging a
>> +        * device into the fabric with a smaller MPS that the devices
>> +        * currently running have configured.  Modifying the MPS on the
>> +        * running devices could cause a fatal bus error due to an
>> +        * incoming frame being larger than the newly configured MPS.
>> +        * To work around this, the MPS for the entire fabric must be
>> +        * set to the minimum size.  Any devices hotplugged into this
>> +        * fabric will have the minimum MPS set.  If the PCI hotplug
>> +        * slot is directly connected to the root port and there are not
>> +        * other devices on the fabric (which seems to be the most
>> +        * common case), then this is not an issue and MPS discovery
>> +        * will occur as normal.
>> +        */
>> +       if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
>> +           dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
>> +               *smpss = 0;
>> +
>> +       if (*smpss > dev->pcie_mpss)
>> +               *smpss = dev->pcie_mpss;
>> +
>> +       return 0;
>> +}
>> +
>> +static void pcie_write_mps(struct pci_dev *dev, int mps)
>> +{
>> +       int rc, dev_mpss;
>> +
>> +       dev_mpss = 128 << dev->pcie_mpss;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
>> +               if (dev->bus->self) {
>> +                       dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
>> +                               128 << dev->bus->self->pcie_mpss);
>> +
>> +                       /* For "MPS Force Max", the assumption is made that
>> +                        * downstream communication will never be larger than
>> +                        * the MRRS.  So, the MPS only needs to be configured
>> +                        * for the upstream communication.  This being the case,
>> +                        * walk from the top down and set the MPS of the child
>> +                        * to that of the parent bus.
>> +                        */
>> +                       mps = 128 << dev->bus->self->pcie_mpss;
>> +                       if (mps > dev_mpss)
>> +                               dev_warn(&dev->dev, "MPS configured higher than"
>> +                                        " maximum supported by the device.  If"
>> +                                        " a bus issue occurs, try running with"
>> +                                        " pci=pcie_bus_safe.\n");
>> +               }
>> +
>> +               dev->pcie_mpss = ffs(mps) - 8;
>> +       }
>> +
>> +       rc = pcie_set_mps(dev, mps);
>> +       if (rc)
>> +               dev_err(&dev->dev, "Failed attempting to set the MPS\n");
>> +}
>> +
>> +static void pcie_write_mrrs(struct pci_dev *dev, int mps)
>> +{
>> +       int rc, mrrs;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
>> +               int dev_mpss = 128 << dev->pcie_mpss;
>> +
>> +               /* For Max performance, the MRRS must be set to the largest
>> +                * supported value.  However, it cannot be configured larger
>> +                * than the MPS the device or the bus can support.  This assumes
>> +                * that the largest MRRS available on the device cannot be
>> +                * smaller than the device MPSS.
>> +                */
>> +               mrrs = mps < dev_mpss ? mps : dev_mpss;
>> +       } else
>> +               /* In the "safe" case, configure the MRRS for fairness on the
>> +                * bus by making all devices have the same size
>> +                */
>> +               mrrs = mps;
>> +
>> +
>> +       /* MRRS is a R/W register.  Invalid values can be written, but a
>> +        * subsiquent read will verify if the value is acceptable or not.
>> +        * If the MRRS value provided is not acceptable (e.g., too large),
>> +        * shrink the value until it is acceptable to the HW.
>> +        */
>> +       while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
>> +               rc = pcie_set_readrq(dev, mrrs);
>> +               if (rc)
>> +                       dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
>> +
>> +               mrrs /= 2;
>> +       }
>> +}
>> +
>> +static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
>> +{
>> +       int mps = 128 << *(u8 *)data;
>> +
>> +       if (!pci_is_pcie(dev))
>> +               return 0;
>> +
>> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
>> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
>> +
>> +       pcie_write_mps(dev, mps);
>> +       pcie_write_mrrs(dev, mps);
>> +
>> +       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
>> +                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
>> +
>> +       return 0;
>> +}
>> +
>> +/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
>> + * parents then children fashion.  If this changes, then this code will not
>> + * work as designed.
>> + */
>> +void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
>> +{
>> +       u8 smpss = mpss;
>> +
>> +       if (!bus->self)
>> +               return;
>> +
>> +       if (!pci_is_pcie(bus->self))
>> +               return;
>> +
>> +       if (pcie_bus_config == PCIE_BUS_SAFE) {
>> +               pcie_find_smpss(bus->self, &smpss);
>> +               pci_walk_bus(bus, pcie_find_smpss, &smpss);
>> +       }
>> +
>> +       pcie_bus_configure_set(bus->self, &smpss);
>> +       pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
>> +}
>> +
>>  unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
>>  {
>>        unsigned int devfn, pass, max = bus->secondary;
>> diff --git a/include/linux/pci.h b/include/linux/pci.h
>> index c446b5c..161aa45 100644
>> --- a/include/linux/pci.h
>> +++ b/include/linux/pci.h
>> @@ -251,7 +251,8 @@ struct pci_dev {
>>        u8              revision;       /* PCI revision, low byte of class word */
>>        u8              hdr_type;       /* PCI header type (`multi' flag masked out) */
>>        u8              pcie_cap;       /* PCI-E capability offset */
>> -       u8              pcie_type;      /* PCI-E device/port type */
>> +       u8              pcie_type:4;    /* PCI-E device/port type */
>> +       u8              pcie_mpss:3;    /* PCI-E Max Payload Size Supported */
>>        u8              rom_base_reg;   /* which config register controls the ROM */
>>        u8              pin;            /* which interrupt pin this device uses */
>>
>> @@ -617,6 +618,16 @@ struct pci_driver {
>>  /* these external functions are only available when PCI support is enabled */
>>  #ifdef CONFIG_PCI
>>
>> +extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
>> +
>> +enum pcie_bus_config_types {
>> +       PCIE_BUS_PERFORMANCE,
>> +       PCIE_BUS_SAFE,
>> +       PCIE_BUS_PEER2PEER,
>> +};
>> +
>> +extern enum pcie_bus_config_types pcie_bus_config;
>> +
>>  extern struct bus_type pci_bus_type;
>>
>>  /* Do NOT directly access these two variables, unless you are arch specific pci
>> @@ -796,6 +807,8 @@ int pcix_get_mmrbc(struct pci_dev *dev);
>>  int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
>>  int pcie_get_readrq(struct pci_dev *dev);
>>  int pcie_set_readrq(struct pci_dev *dev, int rq);
>> +int pcie_get_mps(struct pci_dev *dev);
>> +int pcie_set_mps(struct pci_dev *dev, int mps);
>>  int __pci_reset_function(struct pci_dev *dev);
>>  int pci_reset_function(struct pci_dev *dev);
>>  void pci_update_resource(struct pci_dev *dev, int resno);
>> --
>> 1.7.6
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@xxxxxxxxxxxxxxx
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux