On Thu, Apr 30, 2020 at 11:15 PM Max Gurtovoy <maxg@xxxxxxxxxxxx> wrote: > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c > index 57d3a6a..9ecc576 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -3706,18 +3706,208 @@ static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) > } > } > > +#ifdef CONFIG_PCI_P2PDMA > +static DEFINE_MUTEX(p2p_mutex); > + > +static bool pnv_pci_controller_owns_addr(struct pci_controller *hose, > + phys_addr_t addr, size_t size) > +{ > + int i; > + > + /* > + * It seems safe to assume the full range is under the same PHB, so we > + * can ignore the size. > + */ > + for (i = 0; i < ARRAY_SIZE(hose->mem_resources); i++) { > + struct resource *res = &hose->mem_resources[i]; > + > + if (res->flags && addr >= res->start && addr < res->end) > + return true; > + } > + return false; > +} > + > +/* > + * find the phb owning a mmio address if not owned locally > + */ > +static struct pnv_phb *pnv_pci_find_owning_phb(struct pci_dev *pdev, > + phys_addr_t addr, size_t size) > +{ > + struct pci_controller *hose; > + > + /* fast path */ > + if (pnv_pci_controller_owns_addr(pdev->bus->sysdata, addr, size)) > + return NULL; Do we actually need this fast path? It's going to be slow either way. Also if a device is doing p2p to another device under the same PHB then it should not be happening via the root complex. Is this a case you've tested? > + list_for_each_entry(hose, &hose_list, list_node) { > + struct pnv_phb *phb = hose->private_data; > + > + if (phb->type != PNV_PHB_NPU_NVLINK && > + phb->type != PNV_PHB_NPU_OCAPI) { > + if (pnv_pci_controller_owns_addr(hose, addr, size)) > + return phb; > + } > + } > + return NULL; > +} > + > +static u64 pnv_pci_dma_dir_to_opal_p2p(enum dma_data_direction dir) > +{ > + if (dir == DMA_TO_DEVICE) > + return OPAL_PCI_P2P_STORE; > + else if (dir == DMA_FROM_DEVICE) > + return OPAL_PCI_P2P_LOAD; > + else if (dir == DMA_BIDIRECTIONAL) > + return OPAL_PCI_P2P_LOAD | OPAL_PCI_P2P_STORE; > + else > + return 0; > +} > + > +static int pnv_pci_ioda_enable_p2p(struct pci_dev *initiator, > + struct pnv_phb *phb_target, > + enum dma_data_direction dir) > +{ > + struct pci_controller *hose; > + struct pnv_phb *phb_init; > + struct pnv_ioda_pe *pe_init; > + u64 desc; > + int rc; > + > + if (!opal_check_token(OPAL_PCI_SET_P2P)) > + return -ENXIO; > + > + hose = pci_bus_to_host(initiator->bus); > + phb_init = hose->private_data; You can use the pci_bus_to_pnvhb() helper > + > + pe_init = pnv_ioda_get_pe(initiator); > + if (!pe_init) > + return -ENODEV; > + > + if (!pe_init->tce_bypass_enabled) > + return -EINVAL; > + > + /* > + * Configuring the initiator's PHB requires to adjust its TVE#1 > + * setting. Since the same device can be an initiator several times for > + * different target devices, we need to keep a reference count to know > + * when we can restore the default bypass setting on its TVE#1 when > + * disabling. Opal is not tracking PE states, so we add a reference > + * count on the PE in linux. > + * > + * For the target, the configuration is per PHB, so we keep a > + * target reference count on the PHB. > + */ This irks me a bit because configuring the DMA address limits for the TVE is the kernel's job. What we really should be doing is using opal_pci_map_pe_dma_window_real() to set the bypass-mode address limit for the TVE to something large enough to hit the MMIO ranges rather than having set_p2p do it as a side effect. Unfortunately, for some reason skiboot doesn't implement support for enabling 56bit addressing using opal_pci_map_pe_dma_window_real() and we do need to support older kernel's which used this stuff so I guess we're stuck with it for now. It'd be nice if we could fix this in the longer term though... > + mutex_lock(&p2p_mutex); > + > + desc = OPAL_PCI_P2P_ENABLE | pnv_pci_dma_dir_to_opal_p2p(dir); > + /* always go to opal to validate the configuration */ > + rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, desc, > + pe_init->pe_number); > + if (rc != OPAL_SUCCESS) { > + rc = -EIO; > + goto out; > + } > + > + pe_init->p2p_initiator_count++; > + phb_target->p2p_target_count++; > + > + rc = 0; > +out: > + mutex_unlock(&p2p_mutex); > + return rc; > +} > + > +static int pnv_pci_dma_map_resource(struct pci_dev *pdev, > + phys_addr_t phys_addr, size_t size, > + enum dma_data_direction dir) > +{ > + struct pnv_phb *target_phb; > + > + target_phb = pnv_pci_find_owning_phb(pdev, phys_addr, size); > + if (!target_phb) > + return 0; > + > + return pnv_pci_ioda_enable_p2p(pdev, target_phb, dir); > +} > + > +static int pnv_pci_ioda_disable_p2p(struct pci_dev *initiator, > + struct pnv_phb *phb_target) > +{ > + struct pci_controller *hose; > + struct pnv_phb *phb_init; > + struct pnv_ioda_pe *pe_init; > + int rc; > + > + if (!opal_check_token(OPAL_PCI_SET_P2P)) > + return -ENXIO; This should probably have a WARN_ON() since we can't hit this path unless the initial map succeeds. > + hose = pci_bus_to_host(initiator->bus); > + phb_init = hose->private_data; pci_bus_to_pnvhb() > + pe_init = pnv_ioda_get_pe(initiator); > + if (!pe_init) > + return -ENODEV; > + > + mutex_lock(&p2p_mutex); > + > + if (!pe_init->p2p_initiator_count || !phb_target->p2p_target_count) { > + rc = -EINVAL; > + goto out; > + } > + > + if (--pe_init->p2p_initiator_count == 0) > + pnv_pci_ioda2_set_bypass(pe_init, true); > + > + if (--phb_target->p2p_target_count == 0) { > + rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, > + 0, pe_init->pe_number); > + if (rc != OPAL_SUCCESS) { > + rc = -EIO; > + goto out; > + } > + } > + > + rc = 0; > +out: > + mutex_unlock(&p2p_mutex); > + return rc; > +} > + > +static void pnv_pci_dma_unmap_resource(struct pci_dev *pdev, > + dma_addr_t addr, size_t size, > + enum dma_data_direction dir) > +{ > + struct pnv_phb *target_phb; > + int rc; > + > + target_phb = pnv_pci_find_owning_phb(pdev, addr, size); > + if (!target_phb) > + return; > + > + rc = pnv_pci_ioda_disable_p2p(pdev, target_phb); > + if (rc) > + dev_err(&pdev->dev, "Failed to undo PCI peer-to-peer setup for address %llx: %d\n", > + addr, rc); Use pci_err() or pe_err().