Dear All,

On 11.08.2024 09:09, Baruch Siach wrote:
> From: Catalin Marinas <catalin.marinas@xxxxxxx>
>
> Hardware DMA limit might not be a power of 2. When the RAM range starts
> above 0, say at 4GB, a DMA limit of 30 bits should end at 5GB. A single
> high bit cannot encode this limit.
>
> Use a plain address for the DMA zone limit.
>
> Since the DMA zone can now potentially span beyond the 4GB physical
> limit of DMA32, make sure to use the DMA zone for GFP_DMA32 allocations
> in that case.
>
> Signed-off-by: Catalin Marinas <catalin.marinas@xxxxxxx>
> Co-developed-by: Baruch Siach <baruch@xxxxxxxxxx>
> Signed-off-by: Baruch Siach <baruch@xxxxxxxxxx>
> ---

This patch landed recently in linux-next as commit ba0fb44aed47
("dma-mapping: replace zone_dma_bits by zone_dma_limit"). During my tests
I found that it introduces the following warning on the ARM64/Rockchip
based Odroid M1 board (arch/arm64/boot/dts/rockchip/rk3568-odroid-m1.dts):

------------[ cut here ]------------
dwmmc_rockchip fe2b0000.mmc: swiotlb addr 0x00000001faf00000+4096 overflow (mask ffffffff, bus limit 0).
WARNING: CPU: 3 PID: 1 at kernel/dma/swiotlb.c:1594 swiotlb_map+0x2f0/0x308
Modules linked in:
CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc4+ #15278
Hardware name: Hardkernel ODROID-M1 (DT)
pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : swiotlb_map+0x2f0/0x308
lr : swiotlb_map+0x2f0/0x308
...
Call trace:
 swiotlb_map+0x2f0/0x308
 dma_direct_map_sg+0x9c/0x2e4
 __dma_map_sg_attrs+0x28/0x94
 dma_map_sg_attrs+0x10/0x24
 dw_mci_pre_dma_transfer+0xb8/0xf4
 dw_mci_pre_req+0x50/0x68
 mmc_blk_mq_issue_rq+0x3e0/0x964
 mmc_mq_queue_rq+0x118/0x2b4
 blk_mq_dispatch_rq_list+0x21c/0x714
 __blk_mq_sched_dispatch_requests+0x490/0x58c
 blk_mq_sched_dispatch_requests+0x30/0x6c
 blk_mq_run_hw_queue+0x284/0x40c
 blk_mq_flush_plug_list.part.0+0x190/0x974
 blk_mq_flush_plug_list+0x1c/0x2c
 __blk_flush_plug+0xe4/0x140
 blk_finish_plug+0x38/0x4c
 __ext4_get_inode_loc+0x22c/0x654
 __ext4_get_inode_loc_noinmem+0x40/0xa8
 __ext4_iget+0x154/0xcc0
 ext4_get_journal_inode+0x30/0x110
 ext4_load_and_init_journal+0x9c/0xaf0
 ext4_fill_super+0x1fec/0x2d90
 get_tree_bdev+0x140/0x1d8
 ext4_get_tree+0x18/0x24
 vfs_get_tree+0x28/0xe8
 path_mount+0x3e8/0xb7c
 init_mount+0x68/0xac
 do_mount_root+0x108/0x1dc
 mount_root_generic+0x100/0x330
 mount_root+0x160/0x2d0
 initrd_load+0x1f0/0x2a0
 prepare_namespace+0x4c/0x29c
 kernel_init_freeable+0x4b4/0x50c
 kernel_init+0x20/0x1d8
 ret_from_fork+0x10/0x20
irq event stamp: 1305682
hardirqs last enabled at (1305681): [<ffff8000800e332c>] console_unlock+0x124/0x130
hardirqs last disabled at (1305682): [<ffff80008124e684>] el1_dbg+0x24/0x8c
softirqs last enabled at (1305678): [<ffff80008005be1c>] handle_softirqs+0x4cc/0x4e4
softirqs last disabled at (1305665): [<ffff8000800105b0>] __do_softirq+0x14/0x20
---[ end trace 0000000000000000 ]---

This "bus limit 0" seems a bit suspicious to me, as does the fact that
swiotlb is used for the MMC DMA. I will investigate this further tomorrow.
The board boots fine though.
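For what it's worth, my rough reading of the check that fires here: after
bouncing the buffer, swiotlb_map() verifies that the bounce address is
still reachable by the device, along the lines of the simplified sketch
below (a paraphrase of the dma_capable() logic, not verbatim kernel code;
the helper name is made up):

        static bool reachable(struct device *dev, dma_addr_t addr, size_t size)
        {
                dma_addr_t end = addr + size - 1;

                /*
                 * bus_dma_limit == 0 is treated as "no extra bus limit",
                 * so only the device's dma_mask matters here. In the
                 * warning above end = 0x1faf00fff, which is beyond the
                 * 32-bit mask.
                 */
                return end <= min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
        }

If that reading is right, the "bus limit 0" by itself is probably
harmless; the more interesting question is why a bounce buffer above 4GB
gets handed to a device with a 32-bit DMA mask in the first place.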
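Coming back to the changelog's example before the diff below, spelled out
with concrete (made-up) numbers, since this is exactly the case a single
bit count cannot express:

        /*
         * Illustrative only: RAM starts at 4GB and the device has a
         * 30-bit DMA limit (1GB of reach), so the DMA zone has to end
         * at 5GB.
         */
        phys_addr_t dma_zone_end = 0x140000000ULL;      /* 5GB */

        /* Old scheme: encode the limit as a number of bits. */
        int bits = fls64(dma_zone_end - 1);             /* 33 */
        u64 as_mask = DMA_BIT_MASK(bits);               /* 0x1ffffffff = 8GB - 1, overshoots */

        /* New scheme: keep the plain address, nothing gets rounded up. */
        u64 as_limit = dma_zone_end - 1;                /* exactly 5GB - 1 */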
>  arch/arm64/mm/init.c       | 30 +++++++++++++++---------------
>  arch/powerpc/mm/mem.c      |  5 ++++-
>  arch/s390/mm/init.c        |  2 +-
>  include/linux/dma-direct.h |  2 +-
>  kernel/dma/direct.c        |  6 +++---
>  kernel/dma/pool.c          |  4 ++--
>  kernel/dma/swiotlb.c       |  6 +++---
>  7 files changed, 29 insertions(+), 26 deletions(-)
>
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9b5ab6818f7f..c45e2152ca9e 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -115,35 +115,35 @@ static void __init arch_reserve_crashkernel(void)
>  }
>
>  /*
> - * Return the maximum physical address for a zone accessible by the given bits
> - * limit. If DRAM starts above 32-bit, expand the zone to the maximum
> + * Return the maximum physical address for a zone given its limit.
> + * If DRAM starts above 32-bit, expand the zone to the maximum
>   * available memory, otherwise cap it at 32-bit.
>   */
> -static phys_addr_t __init max_zone_phys(unsigned int zone_bits)
> +static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
>  {
> -        phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits);
>          phys_addr_t phys_start = memblock_start_of_DRAM();
>
>          if (phys_start > U32_MAX)
> -                zone_mask = PHYS_ADDR_MAX;
> -        else if (phys_start > zone_mask)
> -                zone_mask = U32_MAX;
> +                zone_limit = PHYS_ADDR_MAX;
> +        else if (phys_start > zone_limit)
> +                zone_limit = U32_MAX;
>
> -        return min(zone_mask, memblock_end_of_DRAM() - 1) + 1;
> +        return min(zone_limit, memblock_end_of_DRAM() - 1) + 1;
>  }
>
>  static void __init zone_sizes_init(void)
>  {
>          unsigned long max_zone_pfns[MAX_NR_ZONES] = {0};
> -        unsigned int __maybe_unused acpi_zone_dma_bits;
> -        unsigned int __maybe_unused dt_zone_dma_bits;
> -        phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32);
> +        phys_addr_t __maybe_unused acpi_zone_dma_limit;
> +        phys_addr_t __maybe_unused dt_zone_dma_limit;
> +        phys_addr_t __maybe_unused dma32_phys_limit =
> +                max_zone_phys(DMA_BIT_MASK(32));
>
>  #ifdef CONFIG_ZONE_DMA
> -        acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address());
> -        dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL));
> -        zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits);
> -        arm64_dma_phys_limit = max_zone_phys(zone_dma_bits);
> +        acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
> +        dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
> +        zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
> +        arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
>          max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
>  #endif
>  #ifdef CONFIG_ZONE_DMA32
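To double-check my reading of the reworked max_zone_phys() above, a quick
walk-through with hypothetical memory layouts (the numbers are made up,
this is not part of the patch):

        /*
         * zone_dma_limit = 0x3fffffff (1GB - 1), DRAM 0x0..0x7fffffff:
         *   phys_start (0) is below the limit, so the limit is kept and
         *   the zone ends at min(limit, end of DRAM - 1) + 1 = 0x40000000.
         *
         * zone_dma_limit = 0x3fffffff (1GB - 1), DRAM 0x80000000..0xffffffff:
         *   phys_start is above the limit but still below U32_MAX, so the
         *   limit falls back to U32_MAX and ZONE_DMA ends at 4GB.
         *
         * zone_dma_limit = 0x13fffffff (5GB - 1), DRAM starting at 4GB:
         *   phys_start > U32_MAX, the limit is widened to PHYS_ADDR_MAX
         *   and ZONE_DMA is expanded to cover all of RAM.
         */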
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index d325217ab201..05b7f702b3f7 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -216,7 +216,7 @@ static int __init mark_nonram_nosave(void)
>   * everything else. GFP_DMA32 page allocations automatically fall back to
>   * ZONE_DMA.
>   *
> - * By using 31-bit unconditionally, we can exploit zone_dma_bits to inform the
> + * By using 31-bit unconditionally, we can exploit zone_dma_limit to inform the
>   * generic DMA mapping code. 32-bit only devices (if not handled by an IOMMU
>   * anyway) will take a first dip into ZONE_NORMAL and get otherwise served by
>   * ZONE_DMA.
> @@ -230,6 +230,7 @@ void __init paging_init(void)
>  {
>          unsigned long long total_ram = memblock_phys_mem_size();
>          phys_addr_t top_of_ram = memblock_end_of_DRAM();
> +        int zone_dma_bits;
>
>  #ifdef CONFIG_HIGHMEM
>          unsigned long v = __fix_to_virt(FIX_KMAP_END);
> @@ -256,6 +257,8 @@ void __init paging_init(void)
>          else
>                  zone_dma_bits = 31;
>
> +        zone_dma_limit = DMA_BIT_MASK(zone_dma_bits);
> +
>  #ifdef CONFIG_ZONE_DMA
>          max_zone_pfns[ZONE_DMA] = min(max_low_pfn,
>                                        1UL << (zone_dma_bits - PAGE_SHIFT));
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index ddcd39ef4346..91fc2b91adfc 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -97,7 +97,7 @@ void __init paging_init(void)
>
>          vmem_map_init();
>          sparse_init();
> -        zone_dma_bits = 31;
> +        zone_dma_limit = DMA_BIT_MASK(31);
>          memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
>          max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS);
>          max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index edbe13d00776..d7e30d4f7503 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -12,7 +12,7 @@
>  #include <linux/mem_encrypt.h>
>  #include <linux/swiotlb.h>
>
> -extern unsigned int zone_dma_bits;
> +extern u64 zone_dma_limit;
>
>  /*
>   * Record the mapping of CPU physical to DMA addresses for a given region.
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 4480a3cd92e0..f2ba074a6a54 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,7 @@
>   * it for entirely different regions. In that case the arch code needs to
>   * override the variable below for dma-direct to work properly.
>   */
> -unsigned int zone_dma_bits __ro_after_init = 24;
> +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);
>
>  static inline dma_addr_t phys_to_dma_direct(struct device *dev,
>                  phys_addr_t phys)
> @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit)
>           * zones.
>           */
>          *phys_limit = dma_to_phys(dev, dma_limit);
> -        if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +        if (*phys_limit <= zone_dma_limit)
>                  return GFP_DMA;
>          if (*phys_limit <= DMA_BIT_MASK(32))
>                  return GFP_DMA32;
> @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
>           * part of the check.
>           */
>          if (IS_ENABLED(CONFIG_ZONE_DMA))
> -                min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
> +                min_mask = min_t(u64, min_mask, zone_dma_limit);
>          return mask >= phys_to_dma_unencrypted(dev, min_mask);
>  }
>
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index d10613eb0f63..7b04f7575796 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp)
>          /* CMA can't cross zone boundaries, see cma_activate_area() */
>          end = cma_get_base(cma) + size - 1;
>          if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
> -                return end <= DMA_BIT_MASK(zone_dma_bits);
> +                return end <= zone_dma_limit;
>          if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
> -                return end <= DMA_BIT_MASK(32);
> +                return end <= max(DMA_BIT_MASK(32), zone_dma_limit);
>          return true;
>  }
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index df68d29740a0..abcf3fa63a56 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
>          if (!remap)
>                  io_tlb_default_mem.can_grow = true;
>          if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA))
> -                io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits);
> +                io_tlb_default_mem.phys_limit = zone_dma_limit;
>          else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32))
> -                io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32);
> +                io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit);
>          else
>                  io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1);
>  #endif
> @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
>  }
>
>          gfp &= ~GFP_ZONEMASK;
> -        if (phys_limit <= DMA_BIT_MASK(zone_dma_bits))
> +        if (phys_limit <= zone_dma_limit)
>                  gfp |= __GFP_DMA;
>          else if (phys_limit <= DMA_BIT_MASK(32))
>                  gfp |= __GFP_DMA32;

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland