On 17/10/2023 09:18, Suthikulpanit, Suravee wrote:
> Hi Joao,
>
> On 9/23/2023 8:25 AM, Joao Martins wrote:
>> ...
>> diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
>> index 2892aa1b4dc1..099ccb04f52f 100644
>> --- a/drivers/iommu/amd/io_pgtable.c
>> +++ b/drivers/iommu/amd/io_pgtable.c
>> @@ -486,6 +486,89 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
>>  	return (__pte & ~offset_mask) | (iova & offset_mask);
>>  }
>> +static bool pte_test_dirty(u64 *ptep, unsigned long size)
>> +{
>> +	bool dirty = false;
>> +	int i, count;
>> +
>> +	/*
>> +	 * 2.2.3.2 Host Dirty Support
>> +	 * When a non-default page size is used, software must OR the
>> +	 * Dirty bits in all of the replicated host PTEs used to map
>> +	 * the page. The IOMMU does not guarantee the Dirty bits are
>> +	 * set in all of the replicated PTEs. Any portion of the page
>> +	 * may have been written even if the Dirty bit is set in only
>> +	 * one of the replicated PTEs.
>> +	 */
>> +	count = PAGE_SIZE_PTE_COUNT(size);
>> +	for (i = 0; i < count; i++) {
>> +		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *) &ptep[i])) {
>> +			dirty = true;
>> +			break;
>> +		}
>> +	}
>> +
>> +	return dirty;
>> +}
>> +
>> +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size)
>> +{
>> +	bool dirty = false;
>> +	int i, count;
>> +
>> +	/*
>> +	 * 2.2.3.2 Host Dirty Support
>> +	 * When a non-default page size is used, software must OR the
>> +	 * Dirty bits in all of the replicated host PTEs used to map
>> +	 * the page. The IOMMU does not guarantee the Dirty bits are
>> +	 * set in all of the replicated PTEs. Any portion of the page
>> +	 * may have been written even if the Dirty bit is set in only
>> +	 * one of the replicated PTEs.
>> +	 */
>> +	count = PAGE_SIZE_PTE_COUNT(size);
>> +	for (i = 0; i < count; i++)
>> +		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
>> +				       (unsigned long *) &ptep[i]))
>> +			dirty = true;
>> +
>> +	return dirty;
>> +}
>
> Can we consolidate the two functions above where we can pass the flag and check
> if IOMMU_DIRTY_NO_CLEAR is set?
>

I guess so, yes -- the split was initially there to keep an efficient tight
loop when checking all the replicated PTEs, but I think I found a way to merge
everything, e.g.:

diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 099ccb04f52f..953f867b4943 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -486,8 +486,10 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
 	return (__pte & ~offset_mask) | (iova & offset_mask);
 }

-static bool pte_test_dirty(u64 *ptep, unsigned long size)
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+				     unsigned long flags)
 {
+	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
 	bool dirty = false;
 	int i, count;

@@ -501,35 +503,20 @@ static bool pte_test_dirty(u64 *ptep, unsigned long size)
 	 * one of the replicated PTEs.
 	 */
 	count = PAGE_SIZE_PTE_COUNT(size);
-	for (i = 0; i < count; i++) {
-		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *) &ptep[i])) {
+	for (i = 0; i < count && test_only; i++) {
+		if (test_bit(IOMMU_PTE_HD_BIT,
+			     (unsigned long *) &ptep[i])) {
 			dirty = true;
 			break;
 		}
 	}

-	return dirty;
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size)
-{
-	bool dirty = false;
-	int i, count;
-
-	/*
-	 * 2.2.3.2 Host Dirty Support
-	 * When a non-default page size is used, software must OR the
-	 * Dirty bits in all of the replicated host PTEs used to map
-	 * the page. The IOMMU does not guarantee the Dirty bits are
-	 * set in all of the replicated PTEs. Any portion of the page
-	 * may have been written even if the Dirty bit is set in only
-	 * one of the replicated PTEs.
-	 */
-	count = PAGE_SIZE_PTE_COUNT(size);
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count && !test_only; i++) {
 		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
-				       (unsigned long *) &ptep[i]))
+				       (unsigned long *) &ptep[i])) {
 			dirty = true;
+		}
+	}

 	return dirty;
 }

@@ -559,9 +546,7 @@ static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
 	 * Mark the whole IOVA range as dirty even if only one of
 	 * the replicated PTEs were marked dirty.
 	 */
-	if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
-	     pte_test_dirty(ptep, pgsize)) ||
-	    pte_test_and_clear_dirty(ptep, pgsize))
+	if (pte_test_and_clear_dirty(ptep, pgsize, flags))
 		iommu_dirty_bitmap_record(dirty, iova, pgsize);
 	iova += pgsize;
 } while (iova < end);

>> +
>> +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
>> +					 unsigned long iova, size_t size,
>> +					 unsigned long flags,
>> +					 struct iommu_dirty_bitmap *dirty)
>> +{
>> +	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
>> +	unsigned long end = iova + size - 1;
>> +
>> +	do {
>> +		unsigned long pgsize = 0;
>> +		u64 *ptep, pte;
>> +
>> +		ptep = fetch_pte(pgtable, iova, &pgsize);
>> +		if (ptep)
>> +			pte = READ_ONCE(*ptep);
>> +		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
>> +			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
>> +			iova += pgsize;
>> +			continue;
>> +		}
>> +
>> +		/*
>> +		 * Mark the whole IOVA range as dirty even if only one of
>> +		 * the replicated PTEs were marked dirty.
>> +		 */
>> +		if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
>> +		     pte_test_dirty(ptep, pgsize)) ||
>> +		    pte_test_and_clear_dirty(ptep, pgsize))
>> +			iommu_dirty_bitmap_record(dirty, iova, pgsize);
>> +		iova += pgsize;
>> +	} while (iova < end);
>> +

Your earlier point made me discover that the test-only case might otherwise end
up clearing the PTEs unnecessarily, but that is addressed in the diff above.

>> +	return 0;
>> +}
>> +
>>  /*
>>   * ----------------------------------------------------
>>   */
>> @@ -527,6 +610,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
>>  	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
>>  	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
>>  	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
>> +	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
>>  	return &pgtable->iop;
>>  }
>> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
>> index af36c627022f..31b333cc6fe1 100644
>> --- a/drivers/iommu/amd/iommu.c
>> +++ b/drivers/iommu/amd/iommu.c
>> ....
>> @@ -2156,11 +2160,17 @@ static inline u64 dma_max_address(void)
>>  	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
>>  }
>> +static bool amd_iommu_hd_support(struct amd_iommu *iommu)
>> +{
>> +	return iommu && (iommu->features & FEATURE_HDSUP);
>> +}
>> +
>
> You can use the newly introduced check_feature(u64 mask) to check the HD support.
>

It appears that check_feature() is logically equivalent to
check_feature_on_all_iommus(), whereas this check is a per-device/per-IOMMU
one, to accommodate the possibility of different IOMMUs having different
features. Being per-IOMMU would allow firmware to not advertise certain IOMMU
features for some devices while still supporting them for others. I understand
this is not a thing on x86, but the UAPI supports it. Having said that, do you
still want me to switch to check_feature()?
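(For context, the semantic difference as I read the tree: check_feature()
tests the system-wide feature mask, i.e. what all IOMMUs advertised in common,
while the helper in the patch tests a single IOMMU's own feature word. A rough
sketch of the two shapes, from memory -- the exact body of check_feature() may
differ:

	/* Global: feature must have been advertised by all IOMMUs. */
	static inline bool check_feature(u64 mask)
	{
		return (amd_iommu_efr & mask);
	}

	/* Per-IOMMU: only this device's IOMMU needs to support it. */
	static bool amd_iommu_hd_support(struct amd_iommu *iommu)
	{
		return iommu && (iommu->features & FEATURE_HDSUP);
	}
)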
I think the iommufd tree's next branch is still on v6.6-rc2, so I am not sure
I can really use check_feature() yet without leading Jason's individual branch
into compile errors. This all eventually gets merged into linux-next daily,
but my impression is that an individual maintainer's next branch is expected
to be compilable on its own? Worst case, I submit a follow-up post-merge
cleanup to switch to check_feature()? [I can't use
check_feature_on_all_iommus() as that's removed by the commit below.]

> (See
> https://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git/commit/?h=next&id=7b7563a93437ef945c829538da28f0095f1603ec)
>
>> ...
>> @@ -2252,6 +2268,9 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
>>  		return 0;
>>  	dev_data->defer_attach = false;
>> +	if (dom->dirty_ops && iommu &&
>> +	    !(iommu->features & FEATURE_HDSUP))
>
> 	if (dom->dirty_ops && !check_feature(FEATURE_HDSUP))
>

OK -- will switch, depending on the above paragraph.

>> +		return -EINVAL;
>>  	if (dev_data->domain)
>>  		detach_device(dev);
>> @@ -2371,6 +2390,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
>>  		return true;
>>  	case IOMMU_CAP_DEFERRED_FLUSH:
>>  		return true;
>> +	case IOMMU_CAP_DIRTY: {
>> +		struct amd_iommu *iommu = rlookup_amd_iommu(dev);
>> +
>> +		return amd_iommu_hd_support(iommu);
>
> 	return check_feature(FEATURE_HDSUP);
>

Likewise.
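To make the "likewise" concrete: if I do switch, my understanding is that both
call sites collapse to the single global check, and the amd_iommu_hd_support()
helper plus the rlookup_amd_iommu() call in capable() go away. Roughly, and
untested:

	case IOMMU_CAP_DIRTY:
		return check_feature(FEATURE_HDSUP);

and in amd_iommu_attach_device():

	if (dom->dirty_ops && !check_feature(FEATURE_HDSUP))
		return -EINVAL;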