On 19/11/2018 12:12, David Gibson wrote: > On Tue, Nov 13, 2018 at 07:28:19PM +1100, Alexey Kardashevskiy wrote: >> At the moment powernv registers an IOMMU group for each PE. There is >> an exception though - NPU (an emulated PCI bridge representing an NVLink); >> powernv attaches these bridges to the GPU IOMMU group which becomes >> a master. >> >> Now we have POWER9 systems with GPUs connected to each other directly, >> bypassing PCI. At the moment powernv does not control these links so >> it has to put such interconnected GPUs to the same IOMMU group which >> means that the old scheme with a GPU as a master won't work - there will >> be up to 3 GPUs in such group. >> >> This introduces a npu_comp struct which represents a compound IOMMU >> group made of multiple PEs. This converts the existing NVLink1 code to >> use the new scheme. From now on, each PE must have a valid >> iommu_table_group_ops which will either be called directly (a single PE >> group) or indirectly from a compound group. >> >> This moves IOMMU group registration for NPU-connected GPUs to npu-dma.c. >> For POWER8, this stores a new compound group pointer in a PE (so a GPU >> is still a master); for POWER9 the new group pointer is stored in an NPU. >> >> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> >> --- >> arch/powerpc/include/asm/pci.h | 1 + >> arch/powerpc/platforms/powernv/pci.h | 7 + >> arch/powerpc/platforms/powernv/npu-dma.c | 286 ++++++++++++++++++++-- >> arch/powerpc/platforms/powernv/pci-ioda.c | 173 +++---------- >> 4 files changed, 308 insertions(+), 159 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h >> index baf2886..0c72f18 100644 >> --- a/arch/powerpc/include/asm/pci.h >> +++ b/arch/powerpc/include/asm/pci.h >> @@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); >> extern int pnv_npu2_init(struct pci_controller *hose); >> extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, >> unsigned long msr); >> +extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev); >> >> #endif /* __ASM_POWERPC_PCI_H */ >> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h >> index cf9f748..aef4bb5 100644 >> --- a/arch/powerpc/platforms/powernv/pci.h >> +++ b/arch/powerpc/platforms/powernv/pci.h >> @@ -62,6 +62,7 @@ struct pnv_ioda_pe { >> >> /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ >> struct iommu_table_group table_group; >> + struct npu_comp *npucomp; >> >> /* 64-bit TCE bypass region */ >> bool tce_bypass_enabled; >> @@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); >> extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); >> extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); >> extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); >> +extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, >> + __u64 window_size, __u32 levels); >> extern int pnv_eeh_post_init(void); >> >> extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, >> @@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, >> extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); >> extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); >> extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); >> +extern struct iommu_table_group *pnv_try_setup_npu_table_group( >> + struct pnv_ioda_pe *pe); >> +extern struct iommu_table_group *pnv_npu_compound_attach( >> + struct pnv_ioda_pe *pe); >> >> /* pci-ioda-tce.c */ >> #define POWERNV_IOMMU_DEFAULT_LEVELS 1 >> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c >> index 1792c7e..2231f4c 100644 >> --- a/arch/powerpc/platforms/powernv/npu-dma.c >> +++ b/arch/powerpc/platforms/powernv/npu-dma.c >> @@ -317,31 +317,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = { >> .unset_window = pnv_npu_unset_window, >> .take_ownership = pnv_npu_take_ownership, >> }; >> - >> -struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) >> -{ >> - struct pnv_phb *phb = npe->phb; >> - struct pci_bus *pbus = phb->hose->bus; >> - struct pci_dev *npdev, *gpdev = NULL, *gptmp; >> - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); >> - >> - if (!gpe || !gpdev) >> - return NULL; >> - >> - npe->table_group.ops = &pnv_pci_npu_ops; >> - >> - list_for_each_entry(npdev, &pbus->devices, bus_list) { >> - gptmp = pnv_pci_get_gpu_dev(npdev); >> - >> - if (gptmp != gpdev) >> - continue; >> - >> - pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); >> - iommu_group_add_device(gpe->table_group.group, &npdev->dev); >> - } >> - >> - return gpe; >> -} >> #endif /* !CONFIG_IOMMU_API */ >> >> /* >> @@ -349,6 +324,17 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) >> */ >> /* Maximum possible number of ATSD MMIO registers per NPU */ >> #define NV_NMMU_ATSD_REGS 8 >> +#define NV_NPU_MAX_PE_NUM 16 >> + >> +/* >> + * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or >> + * up to 3 x (GPU + 2xNPUs) (POWER9). >> + */ >> +struct npu_comp { >> + struct iommu_table_group table_group; >> + int pe_num; >> + struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM]; >> +}; >> >> /* An NPU descriptor, valid for POWER9 only */ >> struct npu { >> @@ -365,6 +351,8 @@ struct npu { >> struct list_head next; >> >> struct pci_controller *hose; >> + >> + struct npu_comp npucomp; >> }; > > I'm confused by this. The comment simply there are multiple NPUs in a > single composite-group, but the np_comp structure is embedded in the > npu structure, implying there's a copy per-NPU. Yeah, there is a naming confusion. NPU is a big chunk in the CPU with 6 links, and this is what the "struct npu" above describes. And there are 6 NPU emulated bridge devices which you can see in lspci with the "ibmnpu" driver bound to them. I guess from now on I will refer to the big NPU as "NPU" and to the emulated bridge device as "NVLink2" or "NVlink2 emulated device" unless you have a better suggestion (Alistair does not though). -- Alexey