Add functions to manipulate first level page tables, which can be used by
an IOMMU unit that supports scalable mode.

Cc: Ashok Raj <ashok.raj@xxxxxxxxx>
Cc: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Cc: Liu Yi L <yi.l.liu@xxxxxxxxx>
Cc: Yi Sun <yi.y.sun@xxxxxxxxxxxxxxx>
Signed-off-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
---
 drivers/iommu/Makefile             |   2 +-
 drivers/iommu/intel-iommu.c        |  33 +++
 drivers/iommu/intel-pgtable.c      | 376 +++++++++++++++++++++++++++++
 include/linux/intel-iommu.h        |  33 ++-
 include/trace/events/intel_iommu.h |  60 +++++
 5 files changed, 502 insertions(+), 2 deletions(-)
 create mode 100644 drivers/iommu/intel-pgtable.c

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 35d17094fe3b..aa04f4c3ae26 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -18,7 +18,7 @@ obj-$(CONFIG_ARM_SMMU) += arm-smmu.o arm-smmu-impl.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-trace.o intel-pgtable.o
 obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += intel-iommu-debugfs.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 66f76f6df2c2..a314892ee72b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1670,6 +1670,37 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
 #endif
 }
 
+/* First level 5-level paging support */
+static bool first_lvl_5lp_support(void)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	static int first_level_5lp_supported = -1;
+
+	if (likely(first_level_5lp_supported != -1))
+		return first_level_5lp_supported;
+
+	first_level_5lp_supported = 1;
+#ifdef CONFIG_X86
+	/* Match IOMMU first level and CPU paging mode */
+	if (!cpu_feature_enabled(X86_FEATURE_LA57)) {
+		first_level_5lp_supported = 0;
+		return first_level_5lp_supported;
+	}
+#endif /* #ifdef CONFIG_X86 */
+
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		if (!cap_5lp_support(iommu->cap)) {
+			first_level_5lp_supported = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return first_level_5lp_supported;
+}
+
 static struct dmar_domain *alloc_domain(int flags)
 {
 	struct dmar_domain *domain;
@@ -1683,6 +1714,8 @@ static struct dmar_domain *alloc_domain(int flags)
 	domain->flags = flags;
 	domain->has_iotlb_device = false;
 	domain->ops = &second_lvl_pgtable_ops;
+	domain->first_lvl_5lp = first_lvl_5lp_support();
+	spin_lock_init(&domain->page_table_lock);
 	INIT_LIST_HEAD(&domain->devices);
 
 	return domain;
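For context, the first level page table follows the CPU page table format,
so the usable IOVA width tracks the paging mode negotiated above: 4-level
paging covers a 48-bit address space and 5-level paging covers 57 bits. A
minimal sketch of that relationship (illustrative only; the helper below is
not part of this patch):

#include <linux/intel-iommu.h>

/*
 * Illustrative only: first_lvl_5lp is the flag cached by
 * first_lvl_5lp_support() and copied into the domain in alloc_domain().
 */
static inline unsigned int first_lvl_iova_width(struct dmar_domain *domain)
{
	/* 4-level paging: 48-bit IOVA; 5-level paging: 57-bit IOVA */
	return domain->first_lvl_5lp ? 57 : 48;
}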
diff --git a/drivers/iommu/intel-pgtable.c b/drivers/iommu/intel-pgtable.c
new file mode 100644
index 000000000000..4a26d08a7570
--- /dev/null
+++ b/drivers/iommu/intel-pgtable.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pgtable.c - Intel IOMMU page table manipulation library
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
+ */
+
+#define pr_fmt(fmt)	"DMAR: " fmt
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <linux/intel-iommu.h>
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <trace/events/intel_iommu.h>
+
+/*
+ * first_lvl_map: Map a range of IO virtual addresses to physical addresses.
+ */
+#ifdef CONFIG_X86
+#define pgtable_populate(domain, nm)					\
+do {									\
+	void *__new = alloc_pgtable_page(domain->nid);			\
+	if (!__new)							\
+		return -ENOMEM;						\
+	smp_wmb();							\
+	spin_lock(&(domain)->page_table_lock);				\
+	if (nm ## _present(*nm)) {					\
+		free_pgtable_page(__new);				\
+	} else {							\
+		set_##nm(nm, __##nm(__pa(__new) | _PAGE_TABLE));	\
+		domain_flush_cache(domain, nm, sizeof(nm##_t));		\
+	}								\
+	spin_unlock(&(domain)->page_table_lock);			\
+} while (0)
+
+static int
+first_lvl_map_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+			unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
+{
+	pte_t *pte, *first_pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	if (unlikely(pmd_none(*pmd)))
+		pgtable_populate(domain, pmd);
+
+	first_pte = pte = pte_offset_kernel(pmd, addr);
+
+	do {
+		if (pte_present(*pte))
+			pr_crit("ERROR: PTE for vPFN 0x%llx already set to 0x%llx\n",
+				pfn, (unsigned long long)pte_val(*pte));
+		set_pte(pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	return 0;
+}
+
+static int
+first_lvl_map_pmd_range(struct dmar_domain *domain, pud_t *pud,
+			unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pmd_t *pmd;
+
+	if (unlikely(pud_none(*pud)))
+		pgtable_populate(domain, pud);
+	pmd = pmd_offset(pud, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pmd_addr_end(addr, end);
+		if (first_lvl_map_pte_range(domain, pmd, addr, next,
+					    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pmd++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+first_lvl_map_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+			unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	pud_t *pud;
+
+	if (unlikely(p4d_none(*p4d)))
+		pgtable_populate(domain, p4d);
+
+	pud = pud_offset(p4d, addr);
+
+	phys_addr -= addr;
+	do {
+		next = pud_addr_end(addr, end);
+		if (first_lvl_map_pmd_range(domain, pud, addr, next,
+					    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pud++, addr = next, addr != end);
+
+	return 0;
+}
+
+static int
+first_lvl_map_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+			unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot)
+{
+	unsigned long next;
+	p4d_t *p4d;
+
+	if (domain->first_lvl_5lp && unlikely(pgd_none(*pgd)))
+		pgtable_populate(domain, pgd);
+
+	p4d = p4d_offset(pgd, addr);
+
+	phys_addr -= addr;
+	do {
+		next = p4d_addr_end(addr, end);
+		if (first_lvl_map_pud_range(domain, p4d, addr, next,
+					    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (p4d++, addr = next, addr != end);
+
+	return 0;
+}
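Because pgtable_populate() contains a bare "return -ENOMEM", it returns
directly from whichever first_lvl_map_*_range() caller invokes it, which
is why it has to be a macro. Written out as a plain function for the pmd
case, it would look roughly like the sketch below (illustration only, not
code added by this patch; the function name is made up):

/*
 * Hand-expanded sketch of pgtable_populate(domain, pmd).  The new table
 * page is allocated before taking the lock; if another CPU installed a
 * table in the meantime, the local page is freed instead of overwriting
 * the winner.
 */
static int populate_pmd_sketch(struct dmar_domain *domain, pmd_t *pmd)
{
	void *new = alloc_pgtable_page(domain->nid);

	if (!new)
		return -ENOMEM;

	smp_wmb();
	spin_lock(&domain->page_table_lock);
	if (pmd_present(*pmd)) {
		/* Lost the race: another thread populated this pmd. */
		free_pgtable_page(new);
	} else {
		set_pmd(pmd, __pmd(__pa(new) | _PAGE_TABLE));
		domain_flush_cache(domain, pmd, sizeof(pmd_t));
	}
	spin_unlock(&domain->page_table_lock);

	return 0;
}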
+
+int first_lvl_map_range(struct dmar_domain *domain, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	unsigned long next;
+	pgprot_t prot;
+	pgd_t *pgd;
+
+	trace_domain_mm_map(domain, addr, end, phys_addr);
+
+	/*
+	 * There is no PAGE_KERNEL_WO for a pte entry, so let's use RW
+	 * for a pte that requires write operation.
+	 */
+	prot = dma_prot & DMA_PTE_WRITE ? PAGE_KERNEL : PAGE_KERNEL_RO;
+	if (WARN_ON(addr >= end))
+		return -EINVAL;
+
+	phys_addr -= addr;
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (first_lvl_map_p4d_range(domain, pgd, addr, next,
+					    phys_addr + addr, prot))
+			return -ENOMEM;
+	} while (pgd++, addr = next, addr != end);
+
+	return 0;
+}
+
+/*
+ * first_lvl_unmap: Unmap an existing mapping between a range of IO virtual
+ * addresses and physical addresses.
+ */
+static struct page *
+first_lvl_unmap_pte_range(struct dmar_domain *domain, pmd_t *pmd,
+			  unsigned long addr, unsigned long end,
+			  struct page *freelist)
+{
+	unsigned long start;
+	pte_t *pte, *first_pte;
+
+	start = addr;
+	pte = pte_offset_kernel(pmd, addr);
+	first_pte = pte;
+	do {
+		set_pte(pte, __pte(0));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte);
+
+	/*
+	 * Reclaim pmd page, lock is unnecessary here if it owns
+	 * the whole range.
+	 */
+	if (start != end && IS_ALIGNED(start | end, PMD_SIZE)) {
+		struct page *pte_page;
+
+		pte_page = pmd_page(*pmd);
+		pte_page->freelist = freelist;
+		freelist = pte_page;
+		pmd_clear(pmd);
+		domain_flush_cache(domain, pmd, sizeof(pmd_t));
+	}
+
+	return freelist;
+}
+
+static struct page *
+first_lvl_unmap_pmd_range(struct dmar_domain *domain, pud_t *pud,
+			  unsigned long addr, unsigned long end,
+			  struct page *freelist)
+{
+	pmd_t *pmd;
+	unsigned long start, next;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		freelist = first_lvl_unmap_pte_range(domain, pmd,
+						     addr, next, freelist);
+	} while (pmd++, addr = next, addr != end);
+
+	/*
+	 * Reclaim pud page, lock is unnecessary here if it owns
+	 * the whole range.
+	 */
+	if (start != end && IS_ALIGNED(start | end, PUD_SIZE)) {
+		struct page *pmd_page;
+
+		pmd_page = pud_page(*pud);
+		pmd_page->freelist = freelist;
+		freelist = pmd_page;
+		pud_clear(pud);
+		domain_flush_cache(domain, pud, sizeof(pud_t));
+	}
+
+	return freelist;
+}
+
+static struct page *
+first_lvl_unmap_pud_range(struct dmar_domain *domain, p4d_t *p4d,
+			  unsigned long addr, unsigned long end,
+			  struct page *freelist)
+{
+	pud_t *pud;
+	unsigned long start, next;
+
+	start = addr;
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		freelist = first_lvl_unmap_pmd_range(domain, pud,
+						     addr, next, freelist);
+	} while (pud++, addr = next, addr != end);
+
+	/*
+	 * Reclaim p4d page, lock is unnecessary here if it owns
+	 * the whole range.
+	 */
+	if (start != end && IS_ALIGNED(start | end, P4D_SIZE)) {
+		struct page *pud_page;
+
+		pud_page = p4d_page(*p4d);
+		pud_page->freelist = freelist;
+		freelist = pud_page;
+		p4d_clear(p4d);
+		domain_flush_cache(domain, p4d, sizeof(p4d_t));
+	}
+
+	return freelist;
+}
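The unmap helpers above do not free page-table pages directly; reclaimed
pages are chained through page->freelist and handed back to the caller,
so they can be released only after the IOTLB has been invalidated. A
possible release loop could look like this (illustrative sketch, not part
of this patch; intel-iommu.c follows the same pattern in
dma_free_pagelist() for the second level path):

/*
 * Illustrative helper: release a list of page-table pages returned by
 * first_lvl_unmap_range().  Must only run after the IOTLB (and any
 * device TLBs) have been invalidated for the unmapped range.
 */
static void first_lvl_free_pagelist_sketch(struct page *freelist)
{
	struct page *pg;

	while (freelist) {
		pg = freelist;
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}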
+
+static struct page *
+first_lvl_unmap_p4d_range(struct dmar_domain *domain, pgd_t *pgd,
+			  unsigned long addr, unsigned long end,
+			  struct page *freelist)
+{
+	p4d_t *p4d;
+	unsigned long start, next;
+
+	start = addr;
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		freelist = first_lvl_unmap_pud_range(domain, p4d,
+						     addr, next, freelist);
+	} while (p4d++, addr = next, addr != end);
+
+	/*
+	 * Reclaim pgd page, lock is unnecessary here if it owns
+	 * the whole range.
+	 */
+	if (domain->first_lvl_5lp && start != end &&
+	    IS_ALIGNED(start | end, PGDIR_SIZE)) {
+		struct page *p4d_page;
+
+		p4d_page = pgd_page(*pgd);
+		p4d_page->freelist = freelist;
+		freelist = p4d_page;
+		pgd_clear(pgd);
+		domain_flush_cache(domain, pgd, sizeof(pgd_t));
+	}
+
+	return freelist;
+}
+
+struct page *first_lvl_unmap_range(struct dmar_domain *domain,
+				   unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	struct page *freelist = NULL;
+
+	trace_domain_mm_unmap(domain, addr, end);
+
+	if (WARN_ON(addr >= end))
+		return NULL;
+
+	pgd = pgd_offset_pgd(domain->pgd, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		freelist = first_lvl_unmap_p4d_range(domain, pgd,
+						     addr, next, freelist);
+	} while (pgd++, addr = next, addr != end);
+
+	return freelist;
+}
+
+static pte_t *iova_to_pte(struct dmar_domain *domain, unsigned long iova)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (WARN_ON_ONCE(!IS_ALIGNED(iova, PAGE_SIZE)))
+		return NULL;
+
+	pgd = pgd_offset_pgd(domain->pgd, iova);
+	if (pgd_none_or_clear_bad(pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, iova);
+	if (p4d_none_or_clear_bad(p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, iova);
+	if (pud_none_or_clear_bad(pud))
+		return NULL;
+
+	pmd = pmd_offset(pud, iova);
+	if (pmd_none_or_clear_bad(pmd))
+		return NULL;
+
+	return pte_offset_kernel(pmd, iova);
+}
+
+phys_addr_t
+first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long iova)
+{
+	pte_t *pte = iova_to_pte(domain, iova & PAGE_MASK);
+
+	if (!pte || !pte_present(*pte))
+		return 0;
+
+	return (pte_val(*pte) & PTE_PFN_MASK) | (iova & ~PAGE_MASK);
+}
+#endif /* CONFIG_X86 */
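Taken together, the three entry points are expected to be used roughly as
in the sketch below (a hedged example, not code from this series; the
function name is made up, and error handling and IOTLB invalidation are
elided):

/*
 * Possible calling sequence (sketch): map one page writable, look it up,
 * then unmap it.  iova and paddr are assumed to be page aligned; a real
 * caller must invalidate the IOTLB before freeing the returned pages.
 */
static int first_lvl_example(struct dmar_domain *domain,
			     unsigned long iova, phys_addr_t paddr)
{
	struct page *freelist;
	int ret;

	ret = first_lvl_map_range(domain, iova, iova + PAGE_SIZE, paddr,
				  DMA_PTE_READ | DMA_PTE_WRITE);
	if (ret)
		return ret;

	WARN_ON(first_lvl_iova_to_phys(domain, iova) != paddr);

	freelist = first_lvl_unmap_range(domain, iova, iova + PAGE_SIZE);
	/* ... flush the IOTLB for [iova, iova + PAGE_SIZE), then free freelist ... */

	return 0;
}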
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 9b259756057b..9273e3f59078 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -540,9 +540,11 @@ struct dmar_domain {
 	struct iova_domain iovad;	/* iova's that belong to this domain */
 
 	/* page table used by this domain */
-	struct dma_pte	*pgd;		/* virtual address */
+	void		*pgd;		/* virtual address */
+	spinlock_t	page_table_lock; /* Protects page tables */
 	int		gaw;		/* max guest address width */
 	const struct pgtable_ops *ops;	/* page table ops */
+	bool		first_lvl_5lp;	/* First level 5-level paging support */
 
 	/* adjusted guest address width, 0 is level 2 30-bit */
 	int		agaw;
@@ -708,6 +710,35 @@ int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 void iommu_flush_write_buffer(struct intel_iommu *iommu);
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev);
+#ifdef CONFIG_X86
+int first_lvl_map_range(struct dmar_domain *domain, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, int dma_prot);
+struct page *first_lvl_unmap_range(struct dmar_domain *domain,
+				   unsigned long addr, unsigned long end);
+phys_addr_t first_lvl_iova_to_phys(struct dmar_domain *domain,
+				   unsigned long iova);
+#else
+static inline int
+first_lvl_map_range(struct dmar_domain *domain, unsigned long addr,
+		    unsigned long end, phys_addr_t phys_addr, int dma_prot)
+{
+	return -ENODEV;
+}
+
+static inline struct page *
+first_lvl_unmap_range(struct dmar_domain *domain,
+		      unsigned long addr, unsigned long end)
+{
+	return NULL;
+}
+
+static inline phys_addr_t
+first_lvl_iova_to_phys(struct dmar_domain *domain, unsigned long iova)
+{
+	return 0;
+}
+#endif /* CONFIG_X86 */
+
 #ifdef CONFIG_INTEL_IOMMU_SVM
 extern void intel_svm_check(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
diff --git a/include/trace/events/intel_iommu.h b/include/trace/events/intel_iommu.h
index 54e61d456cdf..e8c95290fd13 100644
--- a/include/trace/events/intel_iommu.h
+++ b/include/trace/events/intel_iommu.h
@@ -99,6 +99,66 @@ DEFINE_EVENT(dma_unmap, bounce_unmap_single,
 	TP_ARGS(dev, dev_addr, size)
 );
 
+DECLARE_EVENT_CLASS(domain_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+		__field(phys_addr_t, phys_addr)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+		__entry->phys_addr = phys_addr;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx phys_addr=0x%llx",
+		  __entry->domain, __entry->addr, __entry->end,
+		  (unsigned long long)__entry->phys_addr)
+);
+
+DEFINE_EVENT(domain_map, domain_mm_map,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end, phys_addr_t phys_addr),
+
+	TP_ARGS(domain, addr, end, phys_addr)
+);
+
+DECLARE_EVENT_CLASS(domain_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end),
+
+	TP_STRUCT__entry(
+		__field(struct dmar_domain *, domain)
+		__field(unsigned long, addr)
+		__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+		__entry->domain = domain;
+		__entry->addr = addr;
+		__entry->end = end;
+	),
+
+	TP_printk("domain=%p addr=0x%lx end=0x%lx",
+		  __entry->domain, __entry->addr, __entry->end)
+);
+
+DEFINE_EVENT(domain_unmap, domain_mm_unmap,
+	TP_PROTO(struct dmar_domain *domain, unsigned long addr,
+		 unsigned long end),
+
+	TP_ARGS(domain, addr, end)
+);
 #endif /* _TRACE_INTEL_IOMMU_H */
 
 /* This part must be outside protection */
-- 
2.17.1