Hi David, Thanks for working on this. I gave it a look and added my comments inline below. I have a few concerns, but overall I like it :) To clarify, you are only trying to add support for Bridges that have EA entries to describe the resource range of downstream devices? I ask because the spec allows for bridges that don't include an EA entry for this. I think we will need to add more code to make sure we don't break hot-plugging for non-EA devices. What if something happens on a non-EA bus that causes the PCI core to reassign bus numbers? I don't think the current code can make sure the bus numbers stay the same. Also, I think we should split up the patch set to make it easier to review. Maybe something like: 1 - Register/Constant definitions 2 - EA entry parser/RCiEP support 3 - Bridge support 4 - SR-IOV support Thanks, Sean On Mon, Sep 28, 2015 at 05:10:39PM -0700, David Daney wrote: > From: "Sean O. Stalley" <sean.stalley@xxxxxxxxx> > > Add support for devices using Enhanced Allocation entries instead of BARs. > This patch allows the kernel to parse the EA Extended Capability structure > in PCI configspace and claim the BAR-equivalent resources. > > Signed-off-by: Sean O. Stalley <sean.stalley@xxxxxxxxx> > [david.daney@xxxxxxxxxx: Added support for bridges and SRIOV] > Signed-off-by: David Daney <david.daney@xxxxxxxxxx> > --- > drivers/pci/iov.c | 15 ++- > drivers/pci/pci.c | 278 ++++++++++++++++++++++++++++++++++++++++++++++++ > drivers/pci/pci.h | 4 + > drivers/pci/probe.c | 42 +++++++- > drivers/pci/setup-bus.c | 3 + > include/linux/pci.h | 1 + > 6 files changed, 339 insertions(+), 4 deletions(-) > > diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c > index ee0ebff..c034575 100644 > --- a/drivers/pci/iov.c > +++ b/drivers/pci/iov.c > @@ -435,9 +435,20 @@ found: > > nres = 0; > for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { > + int ea_bar = pci_ea_find_ent_with_bei(dev, > + i + PCI_EA_BEI_VF_BAR0); > + > res = &dev->resource[i + PCI_IOV_RESOURCES]; > - bar64 = __pci_read_base(dev, pci_bar_unknown, res, > - pos + PCI_SRIOV_BAR + i * 4); > + > + if (ea_bar > 0) { > + if (pci_ea_decode_ent(dev, ea_bar, res)) > + continue; Do we want to decode here? This patch calls pci_ea_decode_ent() a lot more than I think it should... I think we only want to call the decode function once per entry, since it requests resources & reads configspace. Do you think we could find a way to only call decode() once during initialization, then use info in the resource struct after that? > + bar64 = (res->flags & IORESOURCE_MEM_64) ? 1 : 0; > + } else { > + bar64 = __pci_read_base(dev, pci_bar_unknown, res, > + pos + PCI_SRIOV_BAR + i * 4); > + } > + > if (!res->flags) > continue; > if (resource_size(res) & (PAGE_SIZE - 1)) { > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > index 6a9a111..7c60b16 100644 > --- a/drivers/pci/pci.c > +++ b/drivers/pci/pci.c > @@ -2148,6 +2148,284 @@ void pci_pm_init(struct pci_dev *dev) > } > } > > +static unsigned long pci_ea_set_flags(struct pci_dev *dev, u8 prop) > +{ > + unsigned long flags = IORESOURCE_PCI_FIXED | IORESOURCE_SIZEALIGN; Why did you add the IORESOURCE_SIZEALIGN flag? EA allows for unaligned resources. > + > + switch (prop) { > + case PCI_EA_P_MEM: > + case PCI_EA_P_VIRT_MEM: > + case PCI_EA_P_BRIDGE_MEM: > + flags |= IORESOURCE_MEM; > + break; > + case PCI_EA_P_MEM_PREFETCH: > + case PCI_EA_P_VIRT_MEM_PREFETCH: > + case PCI_EA_P_BRIDGE_MEM_PREFETCH: > + flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH; > + break; > + case PCI_EA_P_IO: > + case PCI_EA_P_BRIDGE_IO: > + flags |= IORESOURCE_IO; > + break; > + default: > + return 0; > + } > + > + return flags; > +} > + > +static struct resource *pci_ea_get_resource(struct pci_dev *dev, u8 bei) > +{ > + if (bei <= PCI_STD_RESOURCE_END) > + return &dev->resource[bei]; > + else if (bei == PCI_EA_BEI_ROM) > + return &dev->resource[PCI_ROM_RESOURCE]; For SR-IOV, you would need to handle the case where the bei is between PCI_EA_BEI_IOV and PCI_EA_BEI_IOV_END (9-14) For Bridges, we need to handle the PCI_EA_BEI_BRIDGE (6) case. > + else > + return NULL; > +} > + > +int pci_ea_decode_ent(struct pci_dev *dev, int offset, struct resource *res) > +{ > + struct resource *r; > + int i; > + unsigned long mask; > + int ent_offset = offset; > + int ent_size; > + resource_size_t start; > + resource_size_t end; > + unsigned long flags; > + u32 dw0; > + u32 base; > + u32 max_offset; > + bool support_64 = (sizeof(resource_size_t) >= 8); > + > + pci_read_config_dword(dev, ent_offset, &dw0); > + ent_offset += 4; > + > + /* Entry size field indicates DWORDs after 1st */ > + ent_size = ((dw0 & PCI_EA_ES) + 1) << 2; > + > + /* Try to use primary properties, otherwise fall back to secondary */ > + flags = pci_ea_set_flags(dev, PCI_EA_PP(dw0)); > + if (!flags) > + flags = pci_ea_set_flags(dev, PCI_EA_SP(dw0)); > + > + if (!flags) { > + dev_err(&dev->dev, "%s: Entry EA properties not supported\n", > + __func__); > + goto out; > + } > + > + /* Read Base */ > + pci_read_config_dword(dev, ent_offset, &base); > + start = (base & PCI_EA_FIELD_MASK); > + ent_offset += 4; > + > + /* Read MaxOffset */ > + pci_read_config_dword(dev, ent_offset, &max_offset); > + ent_offset += 4; > + > + /* Read Base MSBs (if 64-bit entry) */ > + if (base & PCI_EA_IS_64) { > + u32 base_upper; > + > + pci_read_config_dword(dev, ent_offset, &base_upper); > + ent_offset += 4; > + > + flags |= IORESOURCE_MEM_64; > + > + /* entry starts above 32-bit boundary, can't use */ > + if (!support_64 && base_upper) > + goto out; > + > + if (support_64) > + start |= ((u64)base_upper << 32); > + } > + > + dev_dbg(&dev->dev, "%s: start = %pa\n", __func__, &start); > + > + end = start + (max_offset | 0x03); > + > + /* Read MaxOffset MSBs (if 64-bit entry) */ > + if (max_offset & PCI_EA_IS_64) { > + u32 max_offset_upper; > + > + pci_read_config_dword(dev, ent_offset, &max_offset_upper); > + ent_offset += 4; > + > + flags |= IORESOURCE_MEM_64; > + > + /* entry too big, can't use */ > + if (!support_64 && max_offset_upper) > + goto out; > + > + if (support_64) > + end += ((u64)max_offset_upper << 32); > + } > + > + dev_dbg(&dev->dev, "%s: end = %pa\n", __func__, &end); > + > + if (end < start) { > + dev_err(&dev->dev, "EA Entry crosses address boundary\n"); > + goto out; > + } > + > + if (ent_size != ent_offset - offset) { > + dev_err(&dev->dev, "EA entry size does not match length read\n" > + "(Entry Size:%u Length Read:%u)\n", > + ent_size, ent_offset - offset); > + goto out; > + } > + > + res->name = pci_name(dev); > + res->start = start; > + res->end = end; > + res->flags = flags; > + mask = IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; > + pci_bus_for_each_resource(dev->bus, r, i) { > + if (!r) > + continue; > + if ((r->flags & mask) == (flags & mask) && > + resource_contains(r, res)) { > + request_resource(r, res); > + break; > + } > + } Yinghai had some objections to claiming/requesting resources this early in initialization. Bjorn thought it would be better to modify the existing pci_claim_resource() function then try to do it ourselves here. (V1 did something similar, but I pulled it out in V2) > + if (!res->parent) { > + if (flags & IORESOURCE_IO) > + res->parent = &ioport_resource; > + else > + res->parent = &iomem_resource; > + } I had this code in V1. Bjorn didn't like defaulting to these resources. Also, is there a reason you call request_resource() for resources above, but not here? > + return 0; > +out: > + return -EINVAL; > +} > + I'm not sure if this finder code is necessary. You could avoid having to call pci_ea_find_ent_with_prop() by making the decoder set the right resource struct for the bridge. You could avoid having to call pci_ea_find_ent_with_bei() by adding an EA flag to the resources. When the SR-IOV code usually reads the BAR, just check if the EA flag is set, and skip the read (since the resource has already been set). > +static int pci_ea_find_ent_with_x(struct pci_dev *dev, > + bool (*matcher)(u32, int), int m) > +{ > + int ent_offset; > + u8 num_ent; > + u32 dw0; > + > + if (!dev->ea_cap) > + return -ENOENT; > + > + /* determine the number of entries */ > + pci_read_config_byte(dev, dev->ea_cap + PCI_EA_NUM_ENT, &num_ent); > + num_ent &= PCI_EA_NUM_ENT_MASK; > + > + ent_offset = dev->ea_cap + PCI_EA_FIRST_ENT; > + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) > + ent_offset += 4; > + > + while (num_ent) { > + pci_read_config_dword(dev, ent_offset, &dw0); > + > + if (matcher(dw0, m)) > + return ent_offset; > + > + /* Entry size field indicates DWORDs after 1st */ > + ent_offset += ((dw0 & PCI_EA_ES) + 1) << 2; > + num_ent--; > + } > + return -ENOENT; > +} > + > +static bool pci_ea_match_prop(u32 dw0, int prop) > +{ > + return PCI_EA_PP(dw0) == prop || PCI_EA_SP(dw0) == prop; > +} > + > +int pci_ea_find_ent_with_prop(struct pci_dev *dev, int prop) > +{ > + return pci_ea_find_ent_with_x(dev, pci_ea_match_prop, prop); > +} > + > +static bool pci_ea_match_bei(u32 dw0, int bei) > +{ > + return PCI_EA_BEI(dw0) == bei; > +} > + > +int pci_ea_find_ent_with_bei(struct pci_dev *dev, int bei) > +{ > + return pci_ea_find_ent_with_x(dev, pci_ea_match_bei, bei); > +} > + > +/* Read an Enhanced Allocation (EA) entry */ > +static int pci_ea_read(struct pci_dev *dev, int offset) > +{ > + struct resource *res; > + int ent_offset = offset; > + int ent_size; > + u32 dw0; > + > + pci_read_config_dword(dev, ent_offset, &dw0); > + ent_offset += 4; > + > + /* Entry size field indicates DWORDs after 1st */ > + ent_size = ((dw0 & PCI_EA_ES) + 1) << 2; > + > + switch (PCI_EA_PP(dw0)) { > + case PCI_EA_P_MEM: > + case PCI_EA_P_MEM_PREFETCH: > + case PCI_EA_P_IO: > + break; > + default: > + switch (PCI_EA_SP(dw0)) { > + case PCI_EA_P_MEM: > + case PCI_EA_P_MEM_PREFETCH: > + case PCI_EA_P_IO: > + break; > + default: > + goto out; > + } > + } > + if (!(dw0 & PCI_EA_ENABLE)) /* Entry not enabled */ > + goto out; > + > + res = pci_ea_get_resource(dev, PCI_EA_BEI(dw0)); > + if (!res) { > + dev_err(&dev->dev, "%s: Unsupported EA entry BEI\n", __func__); > + goto out; > + } > + > + pci_ea_decode_ent(dev, offset, res); > +out: > + return offset + ent_size; > +} > + > +/* Enhanced Allocation Initalization */ > +void pci_ea_init(struct pci_dev *dev) > +{ > + int ea; > + u8 num_ent; > + int offset; > + int i; > + > + /* find PCI EA capability in list */ > + ea = pci_find_capability(dev, PCI_CAP_ID_EA); > + if (!ea) > + return; > + > + dev->ea_cap = ea; > + > + /* determine the number of entries */ > + pci_read_config_byte(dev, ea + PCI_EA_NUM_ENT, &num_ent); > + num_ent &= PCI_EA_NUM_ENT_MASK; > + > + offset = ea + PCI_EA_FIRST_ENT; > + > + /* Skip DWORD 2 for type 1 functions */ > + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) > + offset += 4; > + > + /* parse each EA entry */ > + for (i = 0; i < num_ent; ++i) > + offset = pci_ea_read(dev, offset); > +} > + > static void pci_add_saved_cap(struct pci_dev *pci_dev, > struct pci_cap_saved_state *new_cap) > { > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 24ba9dc..80542ac 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -78,6 +78,10 @@ bool pci_dev_keep_suspended(struct pci_dev *dev); > void pci_config_pm_runtime_get(struct pci_dev *dev); > void pci_config_pm_runtime_put(struct pci_dev *dev); > void pci_pm_init(struct pci_dev *dev); > +void pci_ea_init(struct pci_dev *dev); > +int pci_ea_decode_ent(struct pci_dev *dev, int offset, struct resource *res); > +int pci_ea_find_ent_with_prop(struct pci_dev *dev, int prop); > +int pci_ea_find_ent_with_bei(struct pci_dev *dev, int prop); > void pci_allocate_cap_save_buffers(struct pci_dev *dev); > void pci_free_cap_save_buffers(struct pci_dev *dev); > > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c > index 0b2be17..52ae57d 100644 > --- a/drivers/pci/probe.c > +++ b/drivers/pci/probe.c > @@ -338,6 +338,7 @@ static void pci_read_bridge_io(struct pci_bus *child) > unsigned long io_mask, io_granularity, base, limit; > struct pci_bus_region region; > struct resource *res; > + int ea_ent; > > io_mask = PCI_IO_RANGE_MASK; > io_granularity = 0x1000; > @@ -348,6 +349,12 @@ static void pci_read_bridge_io(struct pci_bus *child) > } > > res = child->resource[0]; > + ea_ent = pci_ea_find_ent_with_prop(dev, PCI_EA_P_BRIDGE_IO); > + if (ea_ent > 0 && !pci_ea_decode_ent(dev, ea_ent, res)) { Wouldn't this make EA entries for bridges get decoded (& requested) twice (Here & when the EA entry is initially parsed)? Happens below too for memory entries... > + dev_dbg(&dev->dev, " bridge window %pR via EA\n", res); > + return; > + } > + > pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo); > pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo); Don't we want to use the EA window (instead of whatever is written in base limit)? > base = (io_base_lo & io_mask) << 8; > @@ -378,8 +385,14 @@ static void pci_read_bridge_mmio(struct pci_bus *child) > unsigned long base, limit; > struct pci_bus_region region; > struct resource *res; > + int ea_ent; > > res = child->resource[1]; > + ea_ent = pci_ea_find_ent_with_prop(dev, PCI_EA_P_BRIDGE_MEM); > + if (ea_ent > 0 && !pci_ea_decode_ent(dev, ea_ent, res)) { > + dev_dbg(&dev->dev, " bridge window %pR via EA\n", res); > + return; > + } > pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo); > pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo); Don't we want to use the EA window (instead of whatever is written in base limit)? > base = ((unsigned long) mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; > @@ -401,8 +414,14 @@ static void pci_read_bridge_mmio_pref(struct pci_bus *child) > pci_bus_addr_t base, limit; > struct pci_bus_region region; > struct resource *res; > + int ea_ent; > > res = child->resource[2]; > + ea_ent = pci_ea_find_ent_with_prop(dev, PCI_EA_P_BRIDGE_MEM_PREFETCH); > + if (ea_ent > 0 && !pci_ea_decode_ent(dev, ea_ent, res)) { > + dev_dbg(&dev->dev, " bridge window %pR via EA\n", res); > + return; > + } > pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo); > pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo); Don't we want to use the EA window (instead of whatever is written in base limit)? > base64 = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16; > @@ -801,8 +820,24 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) > > pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses); > primary = buses & 0xFF; > - secondary = (buses >> 8) & 0xFF; > - subordinate = (buses >> 16) & 0xFF; > + if (dev->ea_cap) { > + u32 sdw; > + > + pci_read_config_dword(dev, dev->ea_cap + 4, &sdw); > + if (sdw & 0xFF) > + secondary = sdw & 0xFF; > + else > + secondary = (buses >> 8) & 0xFF; > + > + sdw >>= 8; > + if (sdw & 0xFF) > + subordinate = sdw & 0xFF; > + else > + subordinate = (buses >> 16) & 0xFF; > + } else { > + secondary = (buses >> 8) & 0xFF; > + subordinate = (buses >> 16) & 0xFF; > + } We could refactor this to avoid duplicate assignment lines. What if we did something like this: secondary = (buses >> 8) & 0xFF; subordinate = (buses >> 16) & 0xFF; if (dev->ea_cap) { u32 sdw; pci_read_config_dword(dev, dev->ea_cap + 4, &sdw); if (sdw & 0xFF) secondary = sdw & 0xFF; sdw >>= 8; if (sdw & 0xFF) subordinate = sdw & 0xFF; } > > dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n", > secondary, subordinate, pass); > @@ -1598,6 +1633,9 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) > > static void pci_init_capabilities(struct pci_dev *dev) > { > + /* Enhanced Allocation */ > + pci_ea_init(dev); > + > /* MSI/MSI-X list */ > pci_msi_init_pci_dev(dev); > > diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c > index 508cc56..1158c71 100644 > --- a/drivers/pci/setup-bus.c > +++ b/drivers/pci/setup-bus.c > @@ -1026,6 +1026,9 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, > if (!b_res) > return -ENOSPC; > > + if (b_res->flags & IORESOURCE_PCI_FIXED) > + return 0; /* Fixed from EA, we cannot change it */ > + Other things set the IORESOURCE_PCI_FIXED flag, we can't assume that it was because of EA. > memset(aligns, 0, sizeof(aligns)); > max_order = 0; > size = 0; > diff --git a/include/linux/pci.h b/include/linux/pci.h > index b505b50..41b85ed 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -384,6 +384,7 @@ struct pci_dev { > phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */ > size_t romlen; /* Length of ROM if it's not from the BAR */ > char *driver_override; /* Driver name to force a match */ > + u8 ea_cap; /* Enhanced Allocation capability offset */ > }; > > static inline struct pci_dev *pci_physfn(struct pci_dev *dev) > -- > 1.9.1 > -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html