On Sat, May 09, 2015 at 08:24:14PM +1000, Alexey Kardashevskiy wrote: >On 05/01/2015 04:02 PM, Gavin Shan wrote: >>We're having the hardware or enforced (on P7IOC) limitation: M64 > >I would think if it is enforced, then it is enforced by hardware but you say >"hardware OR enforced" :) > PHB3 doesn't have M64DT from hardware. P7IOC supports that, but I don't utilize the capability. So I called it's enforced. Maybe it may be more clear to have "software enforced" ? :-) > >>segment#x can only be assigned to PE#x. IO and M32 segment can be >>mapped to arbitrary PE# via IODT and M32DT. It means the PE number >>should be x if M64 segment#x has been assigned to the PE. Also, each >>PE own one M64 segment at most. Currently, we are reserving PE# >>according to root port's M64 window. It won't be reliable once we >>extend M64 windows of root port, or the upstream port of the PCIE >>switch behind root port to PHB's M64 window, in order to support >>PCI hotplug in future. >> >>The patch reserves PE# for M64 segments according to the M64 resources >>of the PCI devices (not bridges) contained in the PE. Besides, it's >>always worthy to trace the M64 segments consumed by the PE, which can >>be released at PCI unplugging time. >> >>Signed-off-by: Gavin Shan <gwshan@xxxxxxxxxxxxxxxxxx> >>--- >> arch/powerpc/platforms/powernv/pci-ioda.c | 190 ++++++++++++++++++------------ >> arch/powerpc/platforms/powernv/pci.h | 10 +- >> 2 files changed, 122 insertions(+), 78 deletions(-) >> >>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c >>index 646962f..a994882 100644 >>--- a/arch/powerpc/platforms/powernv/pci-ioda.c >>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c >>@@ -283,28 +283,78 @@ fail: >> return -EIO; >> } >> >>-static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb) >>+/* We extend the M64 window of root port, or the upstream bridge port >>+ * of the PCIE switch behind root port. So we shouldn't reserve PEs >>+ * for M64 resources because there are no (normal) PCI devices consuming > >"PCI devices"? Not "root ports or PCI bridges"? > I have "no (normal) PCI devices" here, which means root port and PCI bridges are excluded. >>+ * M64 resources on the PCI buses leading from root port, or the upstream >>+ * bridge port.The function returns true if the indicated PCI bus needs >>+ * reserved PEs because of M64 resources in advance. Otherwise, the >>+ * function returns false. >>+ */ >>+static bool pnv_ioda_need_m64_pe(struct pnv_phb *phb, >>+ struct pci_bus *bus) >> { >>- resource_size_t sgsz = phb->ioda.m64_segsize; >>+ /* Root bus */ > >The comment is too obvious as the call below is called "pci_is_root_bus" :) > Indeed, it will be dropped in next revision. >>+ if (!bus || pci_is_root_bus(bus)) >>+ return false; >>+ >>+ /* Bus leading from root port. We need check what types of PCI >>+ * devices on the bus. If it's connecting PCI bridge, we don't >>+ * need reserve M64 PEs for it. Otherwise, we still need to do >>+ * that. >>+ */ >>+ if (pci_is_root_bus(bus->self->bus)) { >>+ struct pci_dev *pdev; >>+ >>+ list_for_each_entry(pdev, &bus->devices, bus_list) { >>+ if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) >>+ return true; >>+ } >>+ >>+ return false; >>+ } >>+ >>+ /* Bus leading from the upstream bridge port on top level */ >>+ if (pci_is_root_bus(bus->self->bus->self->bus)) > > >Is it for second level bridges? Like root->bridge->bridge? And for 3 levels >you will need a PE? > It's for upstream port of PCIe switch behind root port (a bit complicated). Yes, the bus leaded from the downstream port will need a PE as you said. >>+ return false; >>+ >>+ return true; >>+} >>+ >>+static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb, >>+ struct pci_bus *bus) >>+{ >>+ resource_size_t segsz = phb->ioda.m64_segsize; >> struct pci_dev *pdev; >> struct resource *r; >>- int base, step, i; >>+ unsigned long pe_no, limit; >>+ int i; >> >>- /* >>- * Root bus always has full M64 range and root port has >>- * M64 range used in reality. So we're checking root port >>- * instead of root bus. >>+ if (!pnv_ioda_need_m64_pe(phb, bus)) >>+ return; >>+ >>+ /* The bridge's M64 window might have been extended to the >>+ * PHB's M64 window in order to support PCI hotplug. So the >>+ * bridge's M64 window isn't reliable to be used for picking >>+ * PE# for its leading PCI bus. We have to check the M64 >>+ * resources consumed by the PCI devices, which seat on the >>+ * PCI bus. >> */ >>- list_for_each_entry(pdev, &phb->hose->bus->devices, bus_list) { >>- for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { >>- r = &pdev->resource[PCI_BRIDGE_RESOURCES + i]; >>- if (!r->parent || >>- !pnv_pci_is_mem_pref_64(r->flags)) >>+ list_for_each_entry(pdev, &bus->devices, bus_list) { >>+ for (i = 0; i < PCI_NUM_RESOURCES; i++) { >>+#ifdef CONFIG_PCI_IOV >>+ if (i >= PCI_IOV_RESOURCES && i <= PCI_IOV_RESOURCE_END) >>+ continue; >>+#endif >>+ r = &pdev->resource[i]; >>+ if (!r->flags || r->start >= r->end || >>+ !r->parent || !pnv_pci_is_mem_pref_64(r->flags)) >> continue; >> >>- base = (r->start - phb->ioda.m64_base) / sgsz; >>- for (step = 0; step < resource_size(r) / sgsz; step++) >>- pnv_ioda_reserve_pe(phb, base + step); >>+ pe_no = (r->start - phb->ioda.m64_base) / segsz; >>+ limit = ALIGN(r->end - phb->ioda.m64_base, segsz) / segsz; >>+ for (; pe_no < limit; pe_no++) >>+ pnv_ioda_reserve_pe(phb, pe_no); >> } >> } >> } >>@@ -316,85 +366,64 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb, >> struct pci_dev *pdev; >> struct resource *r; >> struct pnv_ioda_pe *master_pe, *pe; >>- unsigned long size, *pe_alloc; >>- bool found; >>- int start, i, j; >>- >>- /* Root bus shouldn't use M64 */ >>- if (pci_is_root_bus(bus)) >>- return IODA_INVALID_PE; >>- >>- /* We support only one M64 window on each bus */ >>- found = false; >>- pci_bus_for_each_resource(bus, r, i) { >>- if (r && r->parent && >>- pnv_pci_is_mem_pref_64(r->flags)) { >>- found = true; >>- break; >>- } >>- } >>+ unsigned long size, *pe_bitsmap; > >s/pe_bitsmap/pe_bitmap/ > Yeah, will fix it up. Thanks! >>+ unsigned long pe_no, limit; >>+ int i; >> >>- /* No M64 window found ? */ >>- if (!found) >>+ if (!pnv_ioda_need_m64_pe(phb, bus)) >> return IODA_INVALID_PE; >> >>- /* Allocate bitmap */ >>+ /* Allocate bitmap */ >> size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); >>- pe_alloc = kzalloc(size, GFP_KERNEL); >>- if (!pe_alloc) { >>- pr_warn("%s: Out of memory !\n", >>- __func__); >>+ pe_bitsmap = kzalloc(size, GFP_KERNEL); >>+ if (!pe_bitsmap) { >>+ pr_warn("%s: Out of memory !\n", __func__); >> return IODA_INVALID_PE; >> } >> >>- /* >>- * Figure out reserved PE numbers by the PE >>- * the its child PEs. >>- */ >>- start = (r->start - phb->ioda.m64_base) / segsz; >>- for (i = 0; i < resource_size(r) / segsz; i++) >>- set_bit(start + i, pe_alloc); >>- >>- if (all) >>- goto done; >>- >>- /* >>- * If the PE doesn't cover all subordinate buses, >>- * we need subtract from reserved PEs for children. >>+ /* The bridge's M64 window might be extended to PHB's M64 >>+ * window by intention to support PCI hotplug. So we have >>+ * to check the M64 resources consumed by the PCI devices >>+ * on the PCI bus. >> */ >> list_for_each_entry(pdev, &bus->devices, bus_list) { >>- if (!pdev->subordinate) >>- continue; >>+ for (i = 0; i < PCI_NUM_RESOURCES; i++) { >>+#ifdef CONFIG_PCI_IOV >>+ if (i >= PCI_IOV_RESOURCES && >>+ i <= PCI_IOV_RESOURCE_END) >>+ continue; >>+#endif >>+ /* Don't scan bridge's window if the PE >>+ * doesn't contain its subordinate bus. >>+ */ >>+ if (!all && i >= PCI_BRIDGE_RESOURCES && >>+ i <= PCI_BRIDGE_RESOURCE_END) >>+ continue; >> >>- pci_bus_for_each_resource(pdev->subordinate, r, i) { >>- if (!r || !r->parent || >>- !pnv_pci_is_mem_pref_64(r->flags)) >>+ r = &pdev->resource[i]; >>+ if (!r->flags || r->start >= r->end || >>+ !r->parent || !pnv_pci_is_mem_pref_64(r->flags)) >> continue; >> >>- start = (r->start - phb->ioda.m64_base) / segsz; >>- for (j = 0; j < resource_size(r) / segsz ; j++) >>- clear_bit(start + j, pe_alloc); >>- } >>- } >>+ pe_no = (r->start - phb->ioda.m64_base) / segsz; >>+ limit = ALIGN(r->end - phb->ioda.m64_base, segsz) / segsz; >>+ for (; pe_no < limit; pe_no++) >>+ set_bit(pe_no, pe_bitsmap); >>+ } >>+ } >> >>- /* >>- * the current bus might not own M64 window and that's all >>- * contributed by its child buses. For the case, we needn't >>- * pick M64 dependent PE#. >>- */ >>- if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { >>- kfree(pe_alloc); >>+ /* No M64 window found ? */ >>+ if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) { >>+ kfree(pe_bitsmap); >> return IODA_INVALID_PE; >> } >> >>- /* >>- * Figure out the master PE and put all slave PEs to master >>- * PE's list to form compound PE. >>+ /* Figure out the master PE and put all slave PEs >>+ * to master PE's list to form compound PE. >> */ >>-done: >> master_pe = NULL; >> i = -1; >>- while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < >>+ while ((i = find_next_bit(pe_bitsmap, phb->ioda.total_pe, i + 1)) < >> phb->ioda.total_pe) { >> pe = &phb->ioda.pe_array[i]; >> >>@@ -408,6 +437,13 @@ done: >> list_add_tail(&pe->list, &master_pe->slaves); >> } >> >>+ /* Pick the M64 segment, which should be available. Also, > >test_and_set_bit() does not pick or choose, it just marks PE#pe_number used. > It's true. I will replace "Pick" with "Reserve" M64 segment in next revision. If that's still not what you're suggesting, please let me know :-) >>+ * those M64 segments consumed by slave PEs are contributed >>+ * to the master PE. >>+ */ >>+ BUG_ON(test_and_set_bit(pe->pe_number, phb->ioda.m64_segmap)); >>+ BUG_ON(test_and_set_bit(pe->pe_number, master_pe->m64_segmap)); >>+ >> /* P7IOC supports M64DT, which helps mapping M64 segment >> * to one particular PE#. Unfortunately, PHB3 has fixed >> * mapping between M64 segment and PE#. In order for same >>@@ -431,7 +467,7 @@ done: >> } >> } >> >>- kfree(pe_alloc); >>+ kfree(pe_bitsmap); >> return master_pe->pe_number; >> } >> >>@@ -1233,7 +1269,7 @@ static void pnv_pci_ioda_setup_PEs(void) >> >> /* M64 layout might affect PE allocation */ >> if (phb->reserve_m64_pe) >>- phb->reserve_m64_pe(phb); >>+ phb->reserve_m64_pe(phb, phb->hose->bus); >> >> pnv_ioda_setup_PEs(hose->bus); >> } >>diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h >>index 070ee88..19022cf 100644 >>--- a/arch/powerpc/platforms/powernv/pci.h >>+++ b/arch/powerpc/platforms/powernv/pci.h >>@@ -49,6 +49,13 @@ struct pnv_ioda_pe { >> /* PE number */ >> unsigned int pe_number; >> >>+ /* IO/M32/M64 segments consumed by the PE. Each PE can >>+ * have one M64 segment at most, but M64 segments consumed >>+ * by slave PEs will be contributed to the master PE. One >>+ * PE can own multiple IO and M32 segments. >>+ */ >>+ unsigned long m64_segmap[8]; > > >Why 8? 64*8 = 512 segments? s'8'512/sizeof(unsigned long)' may be? > There are 128 M64 segments for P7IOC, but 256 M64 segments for PHB3. 512 is number bigger than 128 and 256. I still prefer m64_segmap[8] :-) > >>+ >> /* "Weight" assigned to the PE for the sake of DMA resource >> * allocations >> */ >>@@ -114,7 +121,7 @@ struct pnv_phb { >> u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); >> void (*shutdown)(struct pnv_phb *phb); >> int (*init_m64)(struct pnv_phb *phb); >>- void (*reserve_m64_pe)(struct pnv_phb *phb); >>+ void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus); >> int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); >> int (*get_pe_state)(struct pnv_phb *phb, int pe_no); >> void (*freeze_pe)(struct pnv_phb *phb, int pe_no); >>@@ -153,6 +160,7 @@ struct pnv_phb { >> struct mutex pe_alloc_mutex; >> >> /* M32 & IO segment maps */ >>+ unsigned long m64_segmap[8]; >> unsigned int *m32_segmap; >> unsigned int *io_segmap; >> struct pnv_ioda_pe *pe_array; >> Thanks, Gavin -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html