To be able to efficiently free empty page table levels, count the number
of entries in each page table by incrementing and decrementing a refcount
every time a PTE is inserted into or removed from the page table.

For this to work correctly, add two helper functions, dma_clear_pte() and
dma_set_pte(), in which the counting is performed. Also, modify the code so
that every page table entry is always updated using the two new functions.

Signed-off-by: Pasha Tatashin <pasha.tatashin@xxxxxxxxxx>
---
 drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
 drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
 2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 897159dba47d..4688ef797161 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -949,7 +949,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 			if (domain->use_first_level)
 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 
-			if (cmpxchg64(&pte->val, 0ULL, pteval))
+			if (dma_set_pte(pte, pteval))
 				/* Someone else set it while we were thinking; use theirs. */
 				free_pgtable_page(tmp_page);
 			else
@@ -1021,7 +1021,8 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
 			continue;
 		}
 		do {
-			dma_clear_pte(pte);
+			if (dma_pte_present(pte))
+				dma_clear_pte(pte);
 			start_pfn += lvl_to_nr_pages(large_page);
 			pte++;
 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
@@ -1062,7 +1063,8 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
 		 */
 		if (level < retain_level && !(start_pfn > level_pfn ||
 		      last_pfn < level_pfn + level_size(level) - 1)) {
-			dma_clear_pte(pte);
+			if (dma_pte_present(pte))
+				dma_clear_pte(pte);
 			domain_flush_cache(domain, pte, sizeof(*pte));
 			free_pgtable_page(level_pte);
 		}
@@ -1093,12 +1095,13 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
 	}
 }
 
-/* When a page at a given level is being unlinked from its parent, we don't
-   need to *modify* it at all. All we need to do is make a list of all the
-   pages which can be freed just as soon as we've flushed the IOTLB and we
-   know the hardware page-walk will no longer touch them.
-   The 'pte' argument is the *parent* PTE, pointing to the page that is to
-   be freed. */
+/*
+ * A given page at a given level is being unlinked from its parent.
+ * We need to make a list of all the pages which can be freed just as soon as
+ * we've flushed the IOTLB and we know the hardware page-walk will no longer
+ * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
+ * that is to be freed.
+ */
 static void dma_pte_list_pagetables(struct dmar_domain *domain,
 				    int level, struct dma_pte *pte,
 				    struct list_head *freelist)
@@ -1106,17 +1109,20 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
 	struct page *pg;
 
 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
-	list_add_tail(&pg->lru, freelist);
-
-	if (level == 1)
-		return;
-
 	pte = page_address(pg);
+
 	do {
-		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
-			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+		if (dma_pte_present(pte)) {
+			if (level > 1 && !dma_pte_superpage(pte)) {
+				dma_pte_list_pagetables(domain, level - 1, pte,
+							freelist);
+			}
+			dma_clear_pte(pte);
+		}
 		pte++;
 	} while (!first_pte_in_page(pte));
+
+	list_add_tail(&pg->lru, freelist);
 }
 
 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
@@ -2244,7 +2250,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 			/* We don't need lock here, nobody else
 			 * touches the iova range
 			 */
-			tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+			tmp = dma_set_pte(pte, pteval);
 			if (tmp) {
 				static int dumps = 5;
 				pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index ce030c5b5772..f1ea508f45bd 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -802,11 +802,6 @@ struct dma_pte {
 	u64 val;
 };
 
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-	pte->val = 0;
-}
-
 static inline u64 dma_pte_addr(struct dma_pte *pte)
 {
 #ifdef CONFIG_64BIT
@@ -818,9 +813,43 @@ static inline u64 dma_pte_addr(struct dma_pte *pte)
 #endif
 }
 
+#define DMA_PTEVAL_PRESENT(pteval) (((pteval) & 3) != 0)
 static inline bool dma_pte_present(struct dma_pte *pte)
 {
-	return (pte->val & 3) != 0;
+	return DMA_PTEVAL_PRESENT(pte->val);
+}
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+	u64 old_pteval;
+
+	old_pteval = xchg(&pte->val, 0ULL);
+	if (DMA_PTEVAL_PRESENT(old_pteval)) {
+		struct page *pg = virt_to_page(pte);
+		int rc = page_ref_dec_return(pg);
+
+		WARN_ON_ONCE(rc > 512 || rc < 1);
+	} else {
+		/* Ensure that we cleared a valid entry from the page table */
+		WARN_ON(1);
+	}
+}
+
+static inline u64 dma_set_pte(struct dma_pte *pte, u64 pteval)
+{
+	u64 old_pteval;
+
+	/* Ensure we are about to set a valid entry in the page table */
+	WARN_ON(!DMA_PTEVAL_PRESENT(pteval));
+	old_pteval = cmpxchg64(&pte->val, 0ULL, pteval);
+	if (old_pteval == 0) {
+		struct page *pg = virt_to_page(pte);
+		int rc = page_ref_inc_return(pg);
+
+		WARN_ON_ONCE(rc > 513 || rc < 2);
+	}
+
+	return old_pteval;
 }
 
 static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
-- 
2.43.0.472.g3155946c3a-goog
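
P.S. For reviewers who want to see the counting invariant in isolation, here is a
minimal userspace sketch (not part of the patch; the toy_* names and the plain
int counter are hypothetical stand-ins for struct page and its page refcount).
It models the rule the two helpers enforce: a table's count starts at 1 for the
page itself, each present PTE adds 1, so an empty level is detectable in O(1)
as count == 1, matching the 1..513 bounds the WARN_ON_ONCE checks assume.

/* toy_refcount.c - userspace model of the per-table entry counting.
 * Build: cc -o toy_refcount toy_refcount.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES_PER_TABLE 512

struct toy_table {
	uint64_t pte[ENTRIES_PER_TABLE];
	int refcount;	/* 1 (the table page itself) + number of present PTEs */
};

static struct toy_table *toy_table_alloc(void)
{
	struct toy_table *t = calloc(1, sizeof(*t));

	assert(t);
	t->refcount = 1;	/* analogous to a fresh page's refcount of 1 */
	return t;
}

/* Mirrors dma_set_pte(): install only into an empty slot, count on success. */
static uint64_t toy_set_pte(struct toy_table *t, int idx, uint64_t pteval)
{
	uint64_t old = t->pte[idx];

	if (old == 0) {
		t->pte[idx] = pteval;
		t->refcount++;
	}
	return old;
}

/* Mirrors dma_clear_pte(): clear a present entry and drop the count. */
static void toy_clear_pte(struct toy_table *t, int idx)
{
	assert(t->pte[idx] != 0);
	t->pte[idx] = 0;
	t->refcount--;
}

/* The payoff: an empty level is detected in O(1), no 512-entry scan needed. */
static int toy_table_empty(struct toy_table *t)
{
	return t->refcount == 1;
}

int main(void)
{
	struct toy_table *t = toy_table_alloc();

	toy_set_pte(t, 0, 0x1003);	/* pretend-present PTEs (low bits set) */
	toy_set_pte(t, 7, 0x2003);
	printf("empty after map:   %d\n", toy_table_empty(t));	/* 0 */

	toy_clear_pte(t, 0);
	toy_clear_pte(t, 7);
	printf("empty after unmap: %d\n", toy_table_empty(t));	/* 1 */

	if (toy_table_empty(t))
		free(t);	/* analogous to free_pgtable_page() */
	return 0;
}

The same reasoning explains why dma_pte_list_pagetables() in the patch now
clears each child PTE before putting the page on the freelist: every removal
must go through dma_clear_pte() or the parent's count would leak.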