IODA2 systems (POWER8/9) allow DMA windows at 2 fixed locations - 0 and 0x800.0000.0000.0000==1<<59, stored in the TVT as TVE0/1. PHB4 on POWER9 has a "TVT Select 'GTE4GB' Option" which allows mapping both windows at 0 and selecting one based on the IOBA address - accesses below 4GB go via TVE0 and accesses above 4GB go via TVE1. Note that the TVE1 window still has to allocate TCEs for the space below 4GB. This changes the iommu=iommu_bypass mode to move the second window to 4GB if possible. When TVE1_4GB is enabled, this creates a small (typically 2GB) 32-bit window as there is no need to cover as much of the lower DMA space - the 4GB+ window does it better anyway. As the physical TCE table from TVE1 maps PCI space from 0 and we want it to look like a 1:1 mapping with a fixed 4GB offset, this adds an iommu_table::it_tceoff field which is the number of reserved TCEs covering the first 4GB of DMA space. This keeps the existing behavior by default as the TVE1_4GB flag is set per PHB but device assignment is done on a per-PE basis, and managing both modes dynamically might get nasty.
Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/include/asm/opal-api.h | 9 ++- arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/platforms/powernv/opal-call.c | 2 + arch/powerpc/platforms/powernv/pci-ioda.c | 66 +++++++++++++++++++--- 5 files changed, 70 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index acf64a73ead1..b9c4af9f129c 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -97,6 +97,7 @@ struct iommu_table { unsigned long it_level_size; unsigned long it_allocated_size; unsigned long it_offset; /* Offset into global table */ + unsigned long it_tceoff; unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ unsigned long it_type; /* type: PCI or Virtual Bus */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index c1f25a760eb1..7873754f5ea6 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -214,7 +214,9 @@ #define OPAL_SECVAR_GET 176 #define OPAL_SECVAR_GET_NEXT 177 #define OPAL_SECVAR_ENQUEUE_UPDATE 178 -#define OPAL_LAST 178 +#define OPAL_PHB_SET_OPTION 179 +#define OPAL_PHB_GET_OPTION 180 +#define OPAL_LAST 180 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ @@ -437,6 +439,11 @@ enum OpalSlotLedState { OPAL_SLOT_LED_STATE_ON = 1 /* LED is ON */ }; +enum OpalPhbOption { + OPAL_PHB_OPTION_TVE1_4GB = 0x1, + OPAL_PHB_OPTION_MMIO_EEH_DISABLE = 0x2, +}; + /* * Address cycle types for LPC accesses. 
These also correspond * to the content of the first cell of the "reg" property for diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 9986ac34b8e2..89b712288cdd 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -142,6 +142,8 @@ int64_t opal_pci_map_pe_dma_window(uint64_t phb_id, uint16_t pe_number, uint16_t int64_t opal_pci_map_pe_dma_window_real(uint64_t phb_id, uint16_t pe_number, uint16_t dma_window_number, uint64_t pci_start_addr, uint64_t pci_mem_size); +int64_t opal_phb_set_option(uint64_t phb_id, uint64_t opt, uint64_t setting); +int64_t opal_phb_get_option(uint64_t phb_id, uint64_t opt, uint64_t *setting); int64_t opal_pci_reset(uint64_t id, uint8_t reset_scope, uint8_t assert_state); int64_t opal_pci_get_hub_diag_data(uint64_t hub_id, void *diag_buffer, diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 5cd0f52d258f..3130d5a41570 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -293,3 +293,5 @@ OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); OPAL_CALL(opal_secvar_get, OPAL_SECVAR_GET); OPAL_CALL(opal_secvar_get_next, OPAL_SECVAR_GET_NEXT); OPAL_CALL(opal_secvar_enqueue_update, OPAL_SECVAR_ENQUEUE_UPDATE); +OPAL_CALL(opal_phb_set_option, OPAL_PHB_SET_OPTION); +OPAL_CALL(opal_phb_get_option, OPAL_PHB_GET_OPTION); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 27a505a5edb4..cba2cb2e1119 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2367,7 +2367,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, } static long pnv_pci_ioda2_set_bypass_iommu(struct pnv_ioda_pe *pe, - unsigned long bus_offset) + unsigned long bus_offset, unsigned long tbl_offset) { struct pnv_phb *phb = pe->phb; long rc; @@ -2376,6 +2376,8 @@ static long 
pnv_pci_ioda2_set_bypass_iommu(struct pnv_ioda_pe *pe, pgsizes = pnv_ioda_parse_tce_sizes(phb); + /* Filter sizes to have round number of TCEs to cover 0..tbl_offset */ + pgsizes &= tbl_offset | (tbl_offset - 1); if (!pgsizes) return -1; @@ -2386,17 +2388,19 @@ static long pnv_pci_ioda2_set_bypass_iommu(struct pnv_ioda_pe *pe, 1 /* window number */, bus_offset, __fls(pgsizes), - roundup_pow_of_two(memory_hotplug_max()), + roundup_pow_of_two(memory_hotplug_max() + + tbl_offset), 2 /* levels */, false /* userspace cache */, &tbl); if (rc) return -1; + tbl->it_tceoff = tbl_offset >> tbl->it_page_shift; for_each_memblock(memory, r) pnv_ioda2_tce_build(tbl, (r->base >> tbl->it_page_shift) + - tbl->it_offset, + tbl->it_offset + tbl->it_tceoff, r->size >> tbl->it_page_shift, (unsigned long) __va(r->base), DMA_BIDIRECTIONAL, @@ -2438,8 +2442,22 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) } if (pnv_iommu_bypass_mode == PNV_IOMMU_TCE_BYPASS) { + if (!opal_phb_set_option(phb->opal_id, + OPAL_PHB_OPTION_TVE1_4GB, 1)) { + pe->table_group.tce64_start = SZ_4G; + if (!pnv_pci_ioda2_set_bypass_iommu(pe, + pe->table_group.tce64_start, SZ_4G)) { + pe->tce_bypass_enabled = true; + pe_info(pe, "Enabled 64-bit IOMMU bypass at %llx\n", + pe->table_group.tce64_start); + return; + } + pe_err(pe, "Enabled TVE1_4GB but failed to configure TCE table"); + opal_phb_set_option(phb->opal_id, + OPAL_PHB_OPTION_TVE1_4GB, 0); + } if (!pnv_pci_ioda2_set_bypass_iommu(pe, - pe->table_group.tce64_start)) { + pe->table_group.tce64_start, 0)) { pe->tce_bypass_enabled = true; pe_info(pe, "Enabled 64-bit IOMMU bypass at %llx\n", pe->table_group.tce64_start); @@ -2450,6 +2468,10 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) } if (pnv_iommu_bypass_mode == PNV_IOMMU_NO_TRANSLATE) { + /* + * FIXME: if we enable dynamic switch, here we need to disable + * OPAL_PCI_PHB_FLAG_TVE1_4GB + */ top = roundup_pow_of_two(memblock_end_of_DRAM()); if 
(!opal_pci_map_pe_dma_window_real(phb->opal_id, pe->pe_number, window_id, @@ -2521,6 +2543,15 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) */ /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ window_size = min((maxblock * 8) << tceshift, max_memory); + + /* + * If we get TVE#1_4GB on, there is no point in having a huge default + * DMA window. + */ + if (pnv_iommu_bypass_mode == PNV_IOMMU_TCE_BYPASS) + window_size = min_t(u64, pe->table_group.tce32_size, + window_size); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ tces_order = ilog2(window_size >> tceshift); tcelevel_order = ilog2(maxblock >> 3); @@ -2611,6 +2642,9 @@ unsigned long pnv_pci_ioda2_get_table_size(int num, __u32 page_shift, !is_power_of_2(window_size)) return 0; + if (pnv_iommu_bypass_mode == PNV_IOMMU_TCE_BYPASS && num == 1) + window_size = roundup_pow_of_two(window_size + SZ_4G); + /* Calculate a direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; table_shift = entries_shift + 3; @@ -2636,15 +2670,29 @@ static long pnv_pci_ioda2_create_table_userspace( { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); - __u64 bus_offset = num ? 
- pe->table_group.tce64_start : table_group->tce32_start; - long ret = pnv_pci_ioda2_create_table(pe->phb->hose->node, - num, bus_offset, page_shift, window_size, levels, true, + __u64 bus_offset, tce_offset = 0, win_size = window_size; + long ret; + + if (num == 0) { + bus_offset = table_group->tce32_start; + } else if (table_group->tce64_start == SZ_4G) { + bus_offset = table_group->tce32_start; + tce_offset = SZ_4G; + win_size = roundup_pow_of_two(window_size + tce_offset); + } else { + bus_offset = table_group->tce64_start; + } + + ret = pnv_pci_ioda2_create_table(pe->phb->hose->node, + num, bus_offset, page_shift, win_size, levels, true, ptbl); - if (!ret) + if (!ret) { (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size( num, page_shift, window_size, levels); + (*ptbl)->it_tceoff = tce_offset >> page_shift; + } + return ret; } -- 2.17.1