On Tue, Jun 26, 2018 at 03:59:26PM +1000, Alexey Kardashevskiy wrote:
> We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> an IOMMU page is contained in the physical page so the PCI hardware won't
> get access to unassigned host memory.
>
> However we do not have this check in the KVM fastpath (H_PUT_TCE
> accelerated code), so userspace can pin memory backed with 64k pages and
> create a hardware TCE table with a bigger page size. We have been lucky
> so far and have not hit this yet: the very first time a mapping happens
> we do not have tbl::it_userspace allocated yet and fall back to
> userspace, which in turn calls the VFIO IOMMU driver, and that fails
> because of the check in vfio_iommu_spapr_tce.c, which is really the
> sustainable solution.
>
> This stores the smallest preregistered page size in the preregistered
> region descriptor and changes the mm_iommu_xxx API to check this against
> the IOMMU page size.
>
> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
> ---
> Changes:
> v2:
> * explicitly check for compound pages before calling compound_order()
>
> ---
> The bug is: run QEMU _without_ hugepages (no -mempath) and tell it to
> advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> for IOMMU pages without checking the MMU page size and this will fail
> at https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
>
> With the change, mapping will fail in KVM and the guest will print:
>
> mlx5_core 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 18 1f returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
> mlx5_core 0000:00:00.0: created tce table LIOBN 0x80000001 for /pci@800000020000000/ethernet@0
> mlx5_core 0000:00:00.0: failed to map direct window for
> /pci@800000020000000/ethernet@0: -1

[snip]

> @@ -124,7 +125,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		struct mm_iommu_table_group_mem_t **pmem)
>  {
>  	struct mm_iommu_table_group_mem_t *mem;
> -	long i, j, ret = 0, locked_entries = 0;
> +	long i, j, ret = 0, locked_entries = 0, pageshift;
>  	struct page *page = NULL;
>  
>  	mutex_lock(&mem_list_mutex);
> @@ -166,6 +167,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  		goto unlock_exit;
>  	}
>  
> +	mem->pageshift = 30; /* start from 1G pages - the biggest we have */

What about 16G pages on an HPT system?  (One possible tweak is sketched
below the next comment.)

>  	for (i = 0; i < entries; ++i) {
>  		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
>  				1/* pages */, 1/* iswrite */, &page)) {
> @@ -199,6 +202,11 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
>  			}
>  		}
>  populate:
> +		pageshift = PAGE_SHIFT;
> +		if (PageCompound(page))
> +			pageshift += compound_order(compound_head(page));
> +		mem->pageshift = min_t(unsigned int, mem->pageshift, pageshift);

Why not make mem->pageshift and the local pageshift the same type to
avoid the min_t()?
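Something like this would do it (an untested sketch, only to illustrate
the type change: the struct field becomes unsigned int and the local
moves out of the "long i, j, ..." list, so plain min() suffices):

	/* in struct mm_iommu_table_group_mem_t */
	unsigned int pageshift;

	/* in mm_iommu_get(), replacing the long local */
	unsigned int pageshift;
	...
populate:
		pageshift = PAGE_SHIFT;
		if (PageCompound(page))
			pageshift += compound_order(compound_head(page));
		/* both sides unsigned int, no min_t() cast needed */
		mem->pageshift = min(mem->pageshift, pageshift);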
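And on the earlier question about the pageshift = 30 initialisation:
instead of hard-coding the biggest huge page size we currently know
about, one option (again an untested sketch, not something this patch
does) would be to seed the cap from the natural alignment of the
preregistered region, using the @ua and @entries already passed to
mm_iommu_get():

	/*
	 * The region cannot be mapped with IOMMU pages bigger than the
	 * natural alignment of its base address or size, so start the
	 * cap there rather than at an arbitrary huge page size; 16G
	 * pages on an HPT system then fall out automatically.
	 * (Assumes a non-empty region, i.e. the argument is non-zero.)
	 */
	mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));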
> +
>  		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
>  	}
>  
> @@ -349,7 +357,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
>  EXPORT_SYMBOL_GPL(mm_iommu_find);
>  
>  long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> -		unsigned long ua, unsigned long *hpa)
> +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>  	u64 *va = &mem->hpas[entry];
> @@ -357,6 +365,9 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  	if (entry >= mem->entries)
>  		return -EFAULT;
>  
> +	if (pageshift > mem->pageshift)
> +		return -EFAULT;
> +
>  	*hpa = *va | (ua & ~PAGE_MASK);
>  
>  	return 0;
> @@ -364,7 +375,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
>  EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
>  
>  long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> -		unsigned long ua, unsigned long *hpa)
> +		unsigned long ua, unsigned int pageshift, unsigned long *hpa)
>  {
>  	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
>  	void *va = &mem->hpas[entry];
> @@ -373,6 +384,9 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
>  	if (entry >= mem->entries)
>  		return -EFAULT;
>  
> +	if (pageshift > mem->pageshift)
> +		return -EFAULT;
> +
>  	pa = (void *) vmalloc_to_phys(va);
>  	if (!pa)
>  		return -EFAULT;
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
> index 2da5f05..7cd63b0 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -467,7 +467,7 @@ static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
>  	if (!mem)
>  		return -EINVAL;
>  
> -	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
> +	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
>  	if (ret)
>  		return -EINVAL;

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson