Hello,

This is looking better, but I still have some concerns.

On Thu, Jul 16, 2015 at 10:04:32AM +0100, Yong Wu wrote:
> This patch is for ARM Short Descriptor Format.
> 
> Signed-off-by: Yong Wu <yong.wu@xxxxxxxxxxxx>
> ---
>  drivers/iommu/Kconfig                |  18 +
>  drivers/iommu/Makefile               |   1 +
>  drivers/iommu/io-pgtable-arm-short.c | 742 ++++++++++++++++++++++++++++++++++
>  drivers/iommu/io-pgtable-arm.c       |   3 -
>  drivers/iommu/io-pgtable.c           |   4 +
>  drivers/iommu/io-pgtable.h           |  13 +
>  6 files changed, 778 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/iommu/io-pgtable-arm-short.c

[...]

> +#define ARM_SHORT_PGDIR_SHIFT 20
> +#define ARM_SHORT_PAGE_SHIFT 12
> +#define ARM_SHORT_PTRS_PER_PTE \
> +	(1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
> +#define ARM_SHORT_BYTES_PER_PTE \
> +	(ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
> +
> +/* level 1 pagetable */
> +#define ARM_SHORT_PGD_TYPE_PGTABLE BIT(0)
> +#define ARM_SHORT_PGD_SECTION_XN (BIT(0) | BIT(4))
> +#define ARM_SHORT_PGD_TYPE_SECTION BIT(1)
> +#define ARM_SHORT_PGD_PGTABLE_XN BIT(2)

This should be PXN, but I'm not even sure why we care for a table at
level 1.

> +#define ARM_SHORT_PGD_B BIT(2)
> +#define ARM_SHORT_PGD_C BIT(3)
> +#define ARM_SHORT_PGD_PGTABLE_NS BIT(3)
> +#define ARM_SHORT_PGD_IMPLE BIT(9)
> +#define ARM_SHORT_PGD_TEX0 BIT(12)
> +#define ARM_SHORT_PGD_S BIT(16)
> +#define ARM_SHORT_PGD_nG BIT(17)
> +#define ARM_SHORT_PGD_SUPERSECTION BIT(18)
> +#define ARM_SHORT_PGD_SECTION_NS BIT(19)
> +
> +#define ARM_SHORT_PGD_TYPE_SUPERSECTION \
> +	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_SECTION_TYPE_MSK \
> +	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK \
> +	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd) \
> +	(((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd) \
> +	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
> +#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd) \
> +	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
> +	ARM_SHORT_PGD_TYPE_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_MSK 0xfffffc00
> +#define ARM_SHORT_PGD_SECTION_MSK (~(SZ_1M - 1))
> +#define ARM_SHORT_PGD_SUPERSECTION_MSK (~(SZ_16M - 1))
> +
> +/* level 2 pagetable */
> +#define ARM_SHORT_PTE_TYPE_LARGE BIT(0)
> +#define ARM_SHORT_PTE_SMALL_XN BIT(0)
> +#define ARM_SHORT_PTE_TYPE_SMALL BIT(1)
> +#define ARM_SHORT_PTE_B BIT(2)
> +#define ARM_SHORT_PTE_C BIT(3)
> +#define ARM_SHORT_PTE_SMALL_TEX0 BIT(6)
> +#define ARM_SHORT_PTE_IMPLE BIT(9)

This is AP[2] for small pages.

> +#define ARM_SHORT_PTE_S BIT(10)
> +#define ARM_SHORT_PTE_nG BIT(11)
> +#define ARM_SHORT_PTE_LARGE_TEX0 BIT(12)
> +#define ARM_SHORT_PTE_LARGE_XN BIT(15)
> +#define ARM_SHORT_PTE_LARGE_MSK (~(SZ_64K - 1))
> +#define ARM_SHORT_PTE_SMALL_MSK (~(SZ_4K - 1))
> +#define ARM_SHORT_PTE_TYPE_MSK \
> +	(ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
> +#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte) \
> +	(((((pte) & ARM_SHORT_PTE_TYPE_MSK) >> 1) << 1)\
> +	== ARM_SHORT_PTE_TYPE_SMALL)

I see what you're trying to do here, but the shifting is confusing. I
think it's better doing something like:

  ((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL

or even just:

  (pte) & ARM_SHORT_PTE_TYPE_SMALL
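That works because bit 1 is set for every small page descriptor, and the
XN bit living at bit 0 is what the shifting was trying to mask out. So
(untested, keeping your naming):

  #define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte) \
	((pte) & ARM_SHORT_PTE_TYPE_SMALL)

The IS_LARGEPAGE check below is fine as it is, since large page
descriptors keep bit 0 as part of the type field.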
> +#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte) \
> +	(((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
> +
> +#define ARM_SHORT_PGD_IDX(a) ((a) >> ARM_SHORT_PGDIR_SHIFT)
> +#define ARM_SHORT_PTE_IDX(a) \
> +	(((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
> +
> +#define ARM_SHORT_GET_PTE_VA(pgd) \
> +	(phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))

Maybe better named as ARM_SHORT_GET_PGTABLE_VA?

> +
> +#define ARM_SHORT_PTE_LARGE_GET_PROT(pte) \
> +	(((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
> +
> +#define ARM_SHORT_PGD_GET_PROT(pgd) \
> +	(((pgd) & (~ARM_SHORT_PGD_SECTION_MSK)) & ~ARM_SHORT_PGD_SUPERSECTION)
> +
> +static bool selftest_running;
> +
> +static arm_short_iopte *
> +arm_short_get_pte_in_pgd(arm_short_iopte pgd, unsigned int iova)
> +{
> +	arm_short_iopte *pte;
> +
> +	pte = ARM_SHORT_GET_PTE_VA(pgd);
> +	pte += ARM_SHORT_PTE_IDX(iova);
> +	return pte;
> +}
> +
> +static void _arm_short_free_pgtable(struct arm_short_io_pgtable *data,
> +				    arm_short_iopte *pgd)
> +{
> +	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +	arm_short_iopte *pte;
> +	int i;
> +
> +	pte = ARM_SHORT_GET_PTE_VA(*pgd);
> +	for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
> +		if (pte[i] != 0)
> +			return;
> +	}

Do you actually need this loop if you're not warning or returning an
error?

> +
> +	/* Free whole pte and set pgd to zero while all pte is unmap */
> +	kmem_cache_free(data->ptekmem, pte);
> +	*pgd = 0;

I still don't think this is safe. What stops the page table walker from
speculatively walking freed memory?

> +	tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> +{
> +	arm_short_iopte pteprot;
> +
> +	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> +	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +			   ARM_SHORT_PTE_TYPE_SMALL;
> +	if (prot & IOMMU_CACHE)
> +		pteprot |= ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> +	if (prot & IOMMU_WRITE)
> +		pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> +				   ARM_SHORT_PTE_SMALL_TEX0;

This doesn't make any sense. TEX[2:0] is all about memory attributes, not
permissions, so you're making the mapping write-back, write-allocate but
that's not what the IOMMU_* values are about.
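If IOMMU_WRITE is meant to control write permission, that belongs in the
AP bits instead. A rough sketch of what I mean -- note the AP defines
below are hypothetical (they're not in your patch): AP[1:0] sits at bits
[5:4] and AP[2] at bit 9 for small and large pages, i.e. the bit your
ARM_SHORT_PTE_IMPLE currently occupies:

  /* hypothetical defines, not in this patch */
  #define ARM_SHORT_PTE_AP0	BIT(4)	/* AP[0] */
  #define ARM_SHORT_PTE_AP1	BIT(5)	/* AP[1] */
  #define ARM_SHORT_PTE_RDONLY	BIT(9)	/* AP[2]: disable writes */

	/* grant access... */
	pteprot |= ARM_SHORT_PTE_AP0 | ARM_SHORT_PTE_AP1;
	/* ...and make it read-only unless IOMMU_WRITE was requested */
	if (!(prot & IOMMU_WRITE))
		pteprot |= ARM_SHORT_PTE_RDONLY;

For sections you'd want the equivalent bits (AP[1:0] at bits [11:10],
AP[2] at bit 15).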
> +	if (prot & IOMMU_NOEXEC)
> +		pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +				   ARM_SHORT_PTE_SMALL_XN;
> +	return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
> +{
> +	arm_short_iopte pgdprot;
> +
> +	pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
> +	pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
> +			   ARM_SHORT_PGD_TYPE_SECTION;
> +	if (prot & IOMMU_CACHE)
> +		pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
> +	if (prot & IOMMU_WRITE)
> +		pgdprot |= ARM_SHORT_PGD_TEX0;

Likewise.

> +	if (prot & IOMMU_NOEXEC)
> +		pgdprot |= ARM_SHORT_PGD_SECTION_XN;
> +	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> +		pgdprot |= ARM_SHORT_PGD_SECTION_NS;
> +	return pgdprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
> +			   arm_short_iopte pgdprot,
> +			   arm_short_iopte pteprot_large,
> +			   bool large)
> +{
> +	arm_short_iopte pteprot = 0;
> +
> +	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> +	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +			   ARM_SHORT_PTE_TYPE_SMALL;
> +	/* section to pte prot */
> +	if (pgdprot & ARM_SHORT_PGD_C)
> +		pteprot |= ARM_SHORT_PTE_C;
> +	if (pgdprot & ARM_SHORT_PGD_B)
> +		pteprot |= ARM_SHORT_PTE_B;
> +	if (pgdprot & ARM_SHORT_PGD_TEX0)
> +		pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> +				   ARM_SHORT_PTE_SMALL_TEX0;
> +	if (pgdprot & ARM_SHORT_PGD_nG)
> +		pteprot |= ARM_SHORT_PTE_nG;
> +	if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
> +		pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +				   ARM_SHORT_PTE_SMALL_XN;
> +
> +	/* large page to small page pte prot. Only large page may split */
> +	if (pteprot_large && !large) {
> +		if (pteprot_large & ARM_SHORT_PTE_LARGE_TEX0)
> +			pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
> +		if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
> +			pteprot |= ARM_SHORT_PTE_SMALL_XN;
> +	}
> +	return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgtable_prot(struct arm_short_io_pgtable *data, bool noexec)
> +{
> +	arm_short_iopte pgdprot = 0;
> +
> +	pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
> +	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> +		pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
> +	if (noexec)
> +		pgdprot |= ARM_SHORT_PGD_PGTABLE_XN;

I don't think you need to worry about XN bits for PGTABLEs.

> +	return pgdprot;
> +}
> +
> +static int
> +_arm_short_map(struct arm_short_io_pgtable *data,
> +	       unsigned int iova, phys_addr_t paddr,
> +	       arm_short_iopte pgdprot, arm_short_iopte pteprot,
> +	       bool large)
> +{
> +	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +	arm_short_iopte *pgd = data->pgd, *pte;
> +	void *cookie = data->iop.cookie, *pte_va;
> +	unsigned int ptenr = large ? 16 : 1;
> +	int i, quirk = data->iop.cfg.quirks;
> +	bool ptenew = false;
> +
> +	pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +	if (!pteprot) { /* section or supersection */
> +		if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
> +			pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
> +		pte = pgd;
> +		pteprot = pgdprot;
> +	} else { /* page or largepage */
> +		if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> +			if (large) { /* special Bit */

This definitely needs a better comment! What exactly are you doing here
and what is that quirk all about?

> +				if (pteprot & ARM_SHORT_PTE_LARGE_TEX0) {
> +					pteprot &= ~ARM_SHORT_PTE_LARGE_TEX0;
> +					pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
> +				}
> +				pteprot &= ~ARM_SHORT_PTE_LARGE_XN;
> +			} else {
> +				pteprot &= ~ARM_SHORT_PTE_SMALL_XN;
> +			}
> +		}
> +
> +		if (!(*pgd)) {
> +			pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
> +			if (unlikely(!pte_va))
> +				return -ENOMEM;
> +			ptenew = true;
> +			*pgd = virt_to_phys(pte_va) | pgdprot;
> +			kmemleak_ignore(pte_va);
> +			tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);

I think you need to flush this before it becomes visible to the walker.
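In other words, flush the newly zeroed table first and only then publish
the pgd entry pointing at it. Something like this (sketch only, reusing
the hooks you already have):

	pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
	if (unlikely(!pte_va))
		return -ENOMEM;
	ptenew = true;
	kmemleak_ignore(pte_va);

	/* Flush the zeroed table before the walker can reach it... */
	tlb->flush_pgtable(pte_va, ARM_SHORT_BYTES_PER_PTE, cookie);

	/* ...and only then install (and flush) the pgd entry */
	*pgd = virt_to_phys(pte_va) | pgdprot;
	tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);

Otherwise the walker can follow the new pgd entry and see stale data for
the table itself.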
> +		}
> +		pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +	}
> +
> +	pteprot |= (arm_short_iopte)paddr;
> +	for (i = 0; i < ptenr; i++) {
> +		if (pte[i]) {/* Someone else may have allocated for this pte */
> +			WARN_ON(!selftest_running);
> +			goto err_exist_pte;
> +		}
> +		pte[i] = pteprot;
> +	}
> +	tlb->flush_pgtable(pte, ptenr * sizeof(*pte), cookie);
> +
> +	return 0;
> +
> +err_exist_pte:
> +	while (i--)
> +		pte[i] = 0;
> +	if (ptenew)
> +		kmem_cache_free(data->ptekmem, pte_va);
> +	return -EEXIST;
> +}
> +
> +static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
> +			 phys_addr_t paddr, size_t size, int prot)
> +{
> +	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +	int ret;
> +	arm_short_iopte pgdprot = 0, pteprot = 0;
> +	bool large;
> +
> +	/* If no access, then nothing to do */
> +	if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
> +		return 0;
> +
> +	switch (size) {
> +	case SZ_4K:
> +	case SZ_64K:
> +		large = (size == SZ_64K) ? true : false;
> +		pteprot = __arm_short_pte_prot(data, prot, large);
> +		pgdprot = __arm_short_pgtable_prot(data, prot & IOMMU_NOEXEC);
> +		break;
> +
> +	case SZ_1M:
> +	case SZ_16M:
> +		large = (size == SZ_16M) ? true : false;
> +		pgdprot = __arm_short_pgd_prot(data, prot, large);
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (WARN_ON((iova | paddr) & (size - 1)))
> +		return -EINVAL;
> +
> +	ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> +
> +	tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> +	tlb->tlb_sync(data->iop.cookie);

In _arm_short_map, it looks like you can only go from invalid -> valid,
so why do you need to flush the TLB here?

> +	return ret;
> +}
> +
> +static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
> +					  unsigned long iova)
> +{
> +	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	arm_short_iopte *pte, *pgd = data->pgd;
> +	phys_addr_t pa = 0;
> +
> +	pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +	if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> +		pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +
> +		if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
> +			pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
> +			pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
> +		} else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
> +			pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
> +			pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
> +		}
> +	} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> +		pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
> +		pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
> +	} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> +		pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
> +		pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
> +	}
> +
> +	return pa;
> +}
> +
> +static int
> +arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
> +			  phys_addr_t paddr, size_t size,
> +			  arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
> +			  size_t blk_size)
> +{
> +	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +	struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +	unsigned long *pgbitmap = &cfg->pgsize_bitmap;
> +	unsigned int blk_base, blk_start, blk_end;
> +	arm_short_iopte pgdprot, pteprot;
> +	size_t mapsize = 0, nextmapsize;
> +	phys_addr_t blk_paddr;
> +	int ret;
> +	unsigned int i;
> +
> +	/* find the nearest mapsize */
> +	for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
> +	     i < BITS_PER_LONG && ((1 << i) < blk_size) &&
> +	     IS_ALIGNED(size, 1 << i);
> +	     i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
> +		mapsize = 1 << i;
> +
> +	if (WARN_ON(!mapsize))
> +		return 0; /* Bytes unmapped */
> +	nextmapsize = 1 << i;
> +
> +	blk_base = iova & ~(blk_size - 1);
> +	blk_start = blk_base;
> +	blk_end = blk_start + blk_size;
> +	blk_paddr = paddr;
> +
> +	for (; blk_start < blk_end;
> +	     blk_start += mapsize, blk_paddr += mapsize) {
> +		/* Unmap! */
> +		if (blk_start == iova)
> +			continue;
> +
> +		/* Try to upper map */
> +		if (blk_base != blk_start &&
> +		    IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
> +		    mapsize != nextmapsize) {
> +			mapsize = nextmapsize;
> +			i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
> +			if (i < BITS_PER_LONG)
> +				nextmapsize = 1 << i;
> +		}
> +
> +		if (mapsize == SZ_1M) {
> +			pgdprot = pgdprotup;
> +			pgdprot |= __arm_short_pgd_prot(data, 0, false);
> +			pteprot = 0;
> +		} else { /* small or large page */
> +			bool noexec = (blk_size == SZ_64K) ?
> +				(pteprotup & ARM_SHORT_PTE_LARGE_XN) :
> +				(pgdprotup & ARM_SHORT_PGD_SECTION_XN);
> +
> +			pteprot = __arm_short_pte_prot_split(
> +					data, pgdprotup, pteprotup,
> +					mapsize == SZ_64K);
> +			pgdprot = __arm_short_pgtable_prot(data, noexec);
> +		}
> +
> +		ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
> +				     pteprot, mapsize == SZ_64K);
> +		if (ret < 0) {
> +			/* Free the table we allocated */
> +			arm_short_iopte *pgd = data->pgd, *pte;
> +
> +			pgd += ARM_SHORT_PGD_IDX(blk_base);
> +			if (*pgd) {
> +				pte = ARM_SHORT_GET_PTE_VA(*pgd);
> +				kmem_cache_free(data->ptekmem, pte);
> +				*pgd = 0;
> +				tlb->flush_pgtable(pgd, sizeof(*pgd),
> +						   data->iop.cookie);

Again, the ordering looks totally wrong here. You need to:

  (1) Zero the pgd pointer
  (2) Flush the pointer, so the walker can no longer follow the page table
  (3) Invalidate the TLB, so we don't have any cached intermediate walks
  (4) Sync the TLB
  (5) Free the memory
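Using the hooks you already have, that would look roughly like this
(sketch only, with pte fetched from *pgd beforehand as you already do;
note the non-leaf invalidation for step 3):

	*pgd = 0;					/* (1) */
	tlb->flush_pgtable(pgd, sizeof(*pgd),
			   data->iop.cookie);		/* (2) */
	tlb->tlb_add_flush(blk_base, blk_size, false,
			   data->iop.cookie);		/* (3) */
	tlb->tlb_sync(data->iop.cookie);		/* (4) */
	kmem_cache_free(data->ptekmem, pte);		/* (5) */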
That said, I don't fully follow the logic here.

> +			}
> +			return 0;/* Bytes unmapped */
> +		}
> +		tlb->tlb_add_flush(blk_start, mapsize, true, data->iop.cookie);
> +		tlb->tlb_sync(data->iop.cookie);

Why are you doing this here for every iteration?

Will