On Mon, Sep 10, 2018 at 06:29:07PM +1000, Alexey Kardashevskiy wrote:
> At the moment the real mode handler of H_PUT_TCE calls iommu_tce_xchg_rm()
> which in turn reads the old TCE and if it was a valid entry - marks
> the physical page dirty if it was mapped for writing. Since it is
> the real mode, realmode_pfn_to_page() is used instead of pfn_to_page()
> to get the page struct. However SetPageDirty() itself reads the compound
> page head and returns a virtual address for the head page struct and
> setting dirty bit for that kills the system.
>
> This adds additional dirty bit tracking into the MM/IOMMU API for use
> in the real mode. Note that this does not change how VFIO and
> KVM (in virtual mode) set this bit. The KVM (real mode) changes include:
> - use the lowest bit of the cached host phys address to carry
> the dirty bit;
> - mark pages dirty when they are unpinned which happens when
> the preregistered memory is released which always happens in virtual
> mode;
> - add mm_iommu_ua_mark_dirty_rm() helper to set delayed dirty bit;
> - change iommu_tce_xchg_rm() to take the kvm struct for the mm to use
> in the new mm_iommu_ua_mark_dirty_rm() helper;
> - move iommu_tce_xchg_rm() to book3s_64_vio_hv.c (which is the only
> caller anyway) to reduce the real mode KVM and IOMMU knowledge
> across different subsystems.
>
> This removes realmode_pfn_to_page() as it is not used anymore.
>
> While we at it, remove some EXPORT_SYMBOL_GPL() as that code is for
> the real mode only and modules cannot call it anyway.
>
> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
> Changes:
> v2:
> * only do delaying dirtying for the real mode
> * no change in VFIO IOMMU SPAPR TCE driver is needed anymore
> * inverted MM_IOMMU_TABLE_GROUP_PAGE_MASK
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h |  1 -
>  arch/powerpc/include/asm/iommu.h             |  2 --
>  arch/powerpc/include/asm/mmu_context.h       |  1 +
>  arch/powerpc/kernel/iommu.c                  | 25 --------------
>  arch/powerpc/kvm/book3s_64_vio_hv.c          | 39 +++++++++++++++++-----
>  arch/powerpc/mm/init_64.c                    | 49 ----------------------------
>  arch/powerpc/mm/mmu_context_iommu.c          | 34 ++++++++++++++++---
>  7 files changed, 62 insertions(+), 89 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 13a688f..2fdc865 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1051,7 +1051,6 @@ static inline void vmemmap_remove_mapping(unsigned long start,
>  	return hash__vmemmap_remove_mapping(start, page_size);
>  }
>  #endif
> -struct page *realmode_pfn_to_page(unsigned long pfn);
>
>  static inline pte_t pmd_pte(pmd_t pmd)
>  {
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index ab3a4fb..3d4b88c 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -220,8 +220,6 @@ extern void iommu_del_device(struct device *dev);
>  extern int __init tce_iommu_bus_notifier_init(void);
>  extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
>  		unsigned long *hpa, enum dma_data_direction *direction);
> -extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction);
>  #else
>  static inline void iommu_register_group(struct iommu_table_group *table_group,
>  				int pci_domain_number,
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index b2f89b6..b694d6a 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -38,6 +38,7 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
>  extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  		unsigned long ua, unsigned int pageshift, unsigned long *hpa);
> +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
>  extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
>  #endif
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index af7a20d..19b4c62 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -1013,31 +1013,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
>  }
>  EXPORT_SYMBOL_GPL(iommu_tce_xchg);
>
> -#ifdef CONFIG_PPC_BOOK3S_64
> -long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
> -		unsigned long *hpa, enum dma_data_direction *direction)
> -{
> -	long ret;
> -
> -	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
> -
> -	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> -			(*direction == DMA_BIDIRECTIONAL))) {
> -		struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
> -
> -		if (likely(pg)) {
> -			SetPageDirty(pg);
> -		} else {
> -			tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
> -			ret = -EFAULT;
> -		}
> -	}
> -
> -	return ret;
> -}
> -EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
> -#endif
> -
>  int iommu_take_ownership(struct iommu_table *tbl)
>  {
>  	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 506a4d4..6821ead 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -187,12 +187,35 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
>  EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
>
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry)
> +static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
> +		unsigned long entry, unsigned long *hpa,
> +		enum dma_data_direction *direction)
> +{
> +	long ret;
> +
> +	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
> +
> +	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
> +			(*direction == DMA_BIDIRECTIONAL))) {
> +		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
> +		/*
> +		 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
> +		 * calling this so we still get here a valid UA.
> +		 */
> +		if (pua && *pua)
> +			mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua));
> +	}
> +
> +	return ret;
> +}
> +
> +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
> +		unsigned long entry)
>  {
>  	unsigned long hpa = 0;
>  	enum dma_data_direction dir = DMA_NONE;
>
> -	iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
> +	iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
>  }
>
>  static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
> @@ -224,7 +247,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
>  	unsigned long hpa = 0;
>  	long ret;
>
> -	if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir))
> +	if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir))
>  		/*
>  		 * real mode xchg can fail if struct page crosses
>  		 * a page boundary
> @@ -236,7 +259,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm,
>
>  	ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
>  	if (ret)
> -		iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
> +		iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
>
>  	return ret;
>  }
> @@ -282,7 +305,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
>  	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
>  		return H_CLOSED;
>
> -	ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir);
> +	ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
>  	if (ret) {
>  		mm_iommu_mapped_dec(mem);
>  		/*
> @@ -371,7 +394,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  			return ret;
>
>  		WARN_ON_ONCE_RM(1);
> -		kvmppc_rm_clear_tce(stit->tbl, entry);
> +		kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
>  	}
>
>  	kvmppc_tce_put(stt, entry, tce);
> @@ -520,7 +543,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  			goto unlock_exit;
>
>  		WARN_ON_ONCE_RM(1);
> -		kvmppc_rm_clear_tce(stit->tbl, entry);
> +		kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
>  	}
>
>  	kvmppc_tce_put(stt, entry + i, tce);
> @@ -571,7 +594,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  			return ret;
>
>  		WARN_ON_ONCE_RM(1);
> -		kvmppc_rm_clear_tce(stit->tbl, entry);
> +		kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
>  	}
>  }
>
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index 51ce091..7a9886f 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -308,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>  {
>  }
>
> -/*
> - * We do not have access to the sparsemem vmemmap, so we fallback to
> - * walking the list of sparsemem blocks which we already maintain for
> - * the sake of crashdump. In the long run, we might want to maintain
> - * a tree if performance of that linear walk becomes a problem.
> - *
> - * realmode_pfn_to_page functions can fail due to:
> - * 1) As real sparsemem blocks do not lay in RAM continously (they
> - * are in virtual address space which is not available in the real mode),
> - * the requested page struct can be split between blocks so get_page/put_page
> - * may fail.
> - * 2) When huge pages are used, the get_page/put_page API will fail
> - * in real mode as the linked addresses in the page struct are virtual
> - * too.
> - */
> -struct page *realmode_pfn_to_page(unsigned long pfn)
> -{
> -	struct vmemmap_backing *vmem_back;
> -	struct page *page;
> -	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
> -	unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
> -
> -	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
> -		if (pg_va < vmem_back->virt_addr)
> -			continue;
> -
> -		/* After vmemmap_list entry free is possible, need check all */
> -		if ((pg_va + sizeof(struct page)) <=
> -				(vmem_back->virt_addr + page_size)) {
> -			page = (struct page *) (vmem_back->phys + pg_va -
> -				vmem_back->virt_addr);
> -			return page;
> -		}
> -	}
> -
> -	/* Probably that page struct is split between real pages */
> -	return NULL;
> -}
> -EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
> -
> -#else
> -
> -struct page *realmode_pfn_to_page(unsigned long pfn)
> -{
> -	struct page *page = pfn_to_page(pfn);
> -	return page;
> -}
> -EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
> -
>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>
>  #ifdef CONFIG_PPC_BOOK3S_64
> diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
> index c9ee9e2..56c2234 100644
> --- a/arch/powerpc/mm/mmu_context_iommu.c
> +++ b/arch/powerpc/mm/mmu_context_iommu.c
> @@ -18,11 +18,15 @@
>  #include <linux/migrate.h>
>  #include <linux/hugetlb.h>
>  #include <linux/swap.h>
> +#include <linux/sizes.h>
>  #include <asm/mmu_context.h>
>  #include <asm/pte-walk.h>
>
>  static DEFINE_MUTEX(mem_list_mutex);
>
> +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY	0x1
> +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK	~(SZ_4K - 1)
> +
>  struct mm_iommu_table_group_mem_t {
>  	struct list_head next;
>  	struct rcu_head rcu;
> @@ -263,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
>  		if (!page)
>  			continue;
>
> +		if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
> +			SetPageDirty(page);
> +
>  		put_page(page);
>  		mem->hpas[i] = 0;
>  	}
> @@ -360,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
>
>  	return ret;
>  }
> -EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
>
>  struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
>  		unsigned long ua, unsigned long entries)
> @@ -390,7 +396,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  	if (pageshift > mem->pageshift)
>  		return -EFAULT;
>
> -	*hpa = *va | (ua & ~PAGE_MASK);
> +	*hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
>
>  	return 0;
>  }
> @@ -413,11 +419,31 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  	if (!pa)
>  		return -EFAULT;
>
> -	*hpa = *pa | (ua & ~PAGE_MASK);
> +	*hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK);
>
>  	return 0;
>  }
> -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
> +
> +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua)
> +{
> +	struct mm_iommu_table_group_mem_t *mem;
> +	long entry;
> +	void *va;
> +	unsigned long *pa;
> +
> +	mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE);
> +	if (!mem)
> +		return;
> +
> +	entry = (ua - mem->ua) >> PAGE_SHIFT;
> +	va = &mem->hpas[entry];
> +
> +	pa = (void *) vmalloc_to_phys(va);
> +	if (!pa)
> +		return;
> +
> +	*pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY;
> +}
>
>  long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
>  {

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
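A minimal, self-contained C sketch (hypothetical names, not taken from the patch) of the low-bit dirty tracking the commit message describes: the cached host physical addresses are at least 4K aligned, so bit 0 is free to carry a delayed "written by DMA" flag until the pages are unpinned in virtual mode, where SetPageDirty() is safe to call.

/*
 * Illustrative only. cache_entry, mark_dirty, lookup_hpa and unpin are
 * hypothetical; the masks mirror MM_IOMMU_TABLE_GROUP_PAGE_DIRTY and
 * MM_IOMMU_TABLE_GROUP_PAGE_MASK from the patch.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_DIRTY	0x1UL		/* low bit carries the delayed dirty flag */
#define PAGE_MASK_4K	(~(4096UL - 1))	/* cached addresses are 4K aligned */

/* One cached translation: host physical address with the dirty bit folded in. */
struct cache_entry {
	unsigned long hpa;
};

/* Real-mode side: only flips a bit in the cache, never touches struct page. */
static void mark_dirty(struct cache_entry *e)
{
	e->hpa |= PAGE_DIRTY;
}

/* Translation side: always strip the flag before handing the address out. */
static unsigned long lookup_hpa(const struct cache_entry *e, unsigned long offset)
{
	return (e->hpa & PAGE_MASK_4K) | offset;
}

/* Virtual-mode unpin: the point where SetPageDirty() would finally run. */
static void unpin(struct cache_entry *e)
{
	if (e->hpa & PAGE_DIRTY)
		printf("would SetPageDirty(pfn 0x%lx)\n",
		       (e->hpa & PAGE_MASK_4K) >> 12);
	e->hpa = 0;
}

int main(void)
{
	struct cache_entry e = { .hpa = 0x12345000UL };

	mark_dirty(&e);
	printf("hpa for offset 0x10: 0x%lx\n", lookup_hpa(&e, 0x10));
	unpin(&e);
	return 0;
}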