Re: [PATCH v3 17/19] iommu/amd: Access/Dirty bit support in IOPTEs

On 17/10/2023 09:18, Suthikulpanit, Suravee wrote:
> Hi Joao,
> 
> On 9/23/2023 8:25 AM, Joao Martins wrote:
>> ...
>> diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
>> index 2892aa1b4dc1..099ccb04f52f 100644
>> --- a/drivers/iommu/amd/io_pgtable.c
>> +++ b/drivers/iommu/amd/io_pgtable.c
>> @@ -486,6 +486,89 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
>>       return (__pte & ~offset_mask) | (iova & offset_mask);
>>   }
>>
>> +static bool pte_test_dirty(u64 *ptep, unsigned long size)
>> +{
>> +    bool dirty = false;
>> +    int i, count;
>> +
>> +    /*
>> +     * 2.2.3.2 Host Dirty Support
>> +     * When a non-default page size is used, software must OR the
>> +     * Dirty bits in all of the replicated host PTEs used to map
>> +     * the page. The IOMMU does not guarantee the Dirty bits are
>> +     * set in all of the replicated PTEs. Any portion of the page
>> +     * may have been written even if the Dirty bit is set in only
>> +     * one of the replicated PTEs.
>> +     */
>> +    count = PAGE_SIZE_PTE_COUNT(size);
>> +    for (i = 0; i < count; i++) {
>> +        if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *) &ptep[i])) {
>> +            dirty = true;
>> +            break;
>> +        }
>> +    }
>> +
>> +    return dirty;
>> +}
>> +
>> +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size)
>> +{
>> +    bool dirty = false;
>> +    int i, count;
>> +
>> +    /*
>> +     * 2.2.3.2 Host Dirty Support
>> +     * When a non-default page size is used, software must OR the
>> +     * Dirty bits in all of the replicated host PTEs used to map
>> +     * the page. The IOMMU does not guarantee the Dirty bits are
>> +     * set in all of the replicated PTEs. Any portion of the page
>> +     * may have been written even if the Dirty bit is set in only
>> +     * one of the replicated PTEs.
>> +     */
>> +    count = PAGE_SIZE_PTE_COUNT(size);
>> +    for (i = 0; i < count; i++)
>> +        if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
>> +                    (unsigned long *) &ptep[i]))
>> +            dirty = true;
>> +
>> +    return dirty;
>> +}
> 
> Can we consolidate the two functions above where we can pass the flag and check
> if IOMMU_DIRTY_NO_CLEAR is set?
> 
I guess so, yes -- the split was originally there to keep an efficient tight
loop when checking all replicated PTEs, but I think I found a way to merge
everything, e.g.:

diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 099ccb04f52f..953f867b4943 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -486,8 +486,10 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
        return (__pte & ~offset_mask) | (iova & offset_mask);
 }

-static bool pte_test_dirty(u64 *ptep, unsigned long size)
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+                                    unsigned long flags)
 {
+       bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
        bool dirty = false;
        int i, count;

@@ -501,35 +503,20 @@ static bool pte_test_dirty(u64 *ptep, unsigned long size)
         * one of the replicated PTEs.
         */
        count = PAGE_SIZE_PTE_COUNT(size);
-       for (i = 0; i < count; i++) {
-               if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *) &ptep[i])) {
+       for (i = 0; i < count && test_only; i++) {
+               if (test_bit(IOMMU_PTE_HD_BIT,
+                            (unsigned long *) &ptep[i])) {
                        dirty = true;
                        break;
                }
        }

-       return dirty;
-}
-
-static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size)
-{
-       bool dirty = false;
-       int i, count;
-
-       /*
-        * 2.2.3.2 Host Dirty Support
-        * When a non-default page size is used, software must OR the
-        * Dirty bits in all of the replicated host PTEs used to map
-        * the page. The IOMMU does not guarantee the Dirty bits are
-        * set in all of the replicated PTEs. Any portion of the page
-        * may have been written even if the Dirty bit is set in only
-        * one of the replicated PTEs.
-        */
-       count = PAGE_SIZE_PTE_COUNT(size);
-       for (i = 0; i < count; i++)
+       for (i = 0; i < count && !test_only; i++) {
                if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
-                                       (unsigned long *) &ptep[i]))
+                                      (unsigned long *) &ptep[i])) {
                        dirty = true;
+               }
+       }

        return dirty;
 }
@@ -559,9 +546,7 @@ static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
                 * Mark the whole IOVA range as dirty even if only one of
                 * the replicated PTEs were marked dirty.
                 */
-               if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
-                               pte_test_dirty(ptep, pgsize)) ||
-                   pte_test_and_clear_dirty(ptep, pgsize))
+               if (pte_test_and_clear_dirty(ptep, pgsize, flags))
                        iommu_dirty_bitmap_record(dirty, iova, pgsize);
                iova += pgsize;
        } while (iova < end);
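
For readability, the consolidated helper ends up looking roughly like this
(just a sketch of the result of applying the diff above, modulo whitespace):

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
                                     unsigned long flags)
{
        bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
        bool dirty = false;
        int i, count;

        /*
         * 2.2.3.2 Host Dirty Support
         * When a non-default page size is used, software must OR the
         * Dirty bits in all of the replicated host PTEs used to map
         * the page. The IOMMU does not guarantee the Dirty bits are
         * set in all of the replicated PTEs. Any portion of the page
         * may have been written even if the Dirty bit is set in only
         * one of the replicated PTEs.
         */
        count = PAGE_SIZE_PTE_COUNT(size);

        /* Test-only: stop at the first replicated PTE found dirty. */
        for (i = 0; i < count && test_only; i++) {
                if (test_bit(IOMMU_PTE_HD_BIT,
                             (unsigned long *) &ptep[i])) {
                        dirty = true;
                        break;
                }
        }

        /* Clearing: every replicated PTE must have its Dirty bit reset. */
        for (i = 0; i < count && !test_only; i++) {
                if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
                                       (unsigned long *) &ptep[i])) {
                        dirty = true;
                }
        }

        return dirty;
}

The two guarded loops keep the test-only case a tight early-break scan, while
the clearing case still visits every replicated PTE so no Dirty bit is left
behind.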

>> +
>> +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
>> +                     unsigned long iova, size_t size,
>> +                     unsigned long flags,
>> +                     struct iommu_dirty_bitmap *dirty)
>> +{
>> +    struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
>> +    unsigned long end = iova + size - 1;
>> +
>> +    do {
>> +        unsigned long pgsize = 0;
>> +        u64 *ptep, pte;
>> +
>> +        ptep = fetch_pte(pgtable, iova, &pgsize);
>> +        if (ptep)
>> +            pte = READ_ONCE(*ptep);
>> +        if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
>> +            pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
>> +            iova += pgsize;
>> +            continue;
>> +        }
>> +
>> +        /*
>> +         * Mark the whole IOVA range as dirty even if only one of
>> +         * the replicated PTEs were marked dirty.
>> +         */
>> +        if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
>> +                pte_test_dirty(ptep, pgsize)) ||
>> +            pte_test_and_clear_dirty(ptep, pgsize))
>> +            iommu_dirty_bitmap_record(dirty, iova, pgsize);
>> +        iova += pgsize;
>> +    } while (iova < end);
>> +

Your earlier point made me discover that the test-only case might end up
clearing the PTE unnecessarily: with IOMMU_DIRTY_NO_CLEAR set, a false return
from pte_test_dirty() makes the || fall through into
pte_test_and_clear_dirty(). I have addressed it in the consolidated version
in the previous comment.
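
Concretely, this is the call site quoted above; a sketch of the problematic
fall-through path:

        /*
         * If IOMMU_DIRTY_NO_CLEAR is set but pte_test_dirty() finds no
         * Dirty bit, the || falls through to pte_test_and_clear_dirty(),
         * which can clear a Dirty bit the IOMMU sets in between the two
         * walks -- despite the caller asking for test-only semantics.
         */
        if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
                        pte_test_dirty(ptep, pgsize)) ||
            pte_test_and_clear_dirty(ptep, pgsize))
                iommu_dirty_bitmap_record(dirty, iova, pgsize);

Passing flags into a single helper avoids ever evaluating both.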

>> +    return 0;
>> +}
>> +
>>   /*
>>    * ----------------------------------------------------
>>    */
>> @@ -527,6 +610,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
>>       pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
>>       pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
>>       pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
>> +    pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
>>         return &pgtable->iop;
>>   }
>> diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
>> index af36c627022f..31b333cc6fe1 100644
>> --- a/drivers/iommu/amd/iommu.c
>> +++ b/drivers/iommu/amd/iommu.c
>> ....
>> @@ -2156,11 +2160,17 @@ static inline u64 dma_max_address(void)
>>       return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
>>   }
>>
>> +static bool amd_iommu_hd_support(struct amd_iommu *iommu)
>> +{
>> +    return iommu && (iommu->features & FEATURE_HDSUP);
>> +}
>> +
> 
> You can use the newly introduced check_feature(u64 mask) to check the HD support.
> 

It appears that check_feature() is logically equivalent to
check_feature_on_all_iommus(), whereas this is a per-device/per-IOMMU check,
to accommodate the possibility of different IOMMUs having different features.
Being per-IOMMU would allow firmware to not advertise certain IOMMU features
on some devices while still supporting them on others. I understand this is
not a thing on x86, but the UAPI supports it. Having said that, do you still
want me to switch to check_feature()?

I think the iommufd tree's next branch is still based on v6.6-rc2, so I am
not sure I can really use check_feature() yet without giving Jason's
individual branch compile errors. This all eventually gets merged into
linux-next daily, but my impression is that each individual maintainer's next
branch should be compilable on its own? Worst case I can submit a follow-up
cleanup post-merge to switch to check_feature(). [I can't use
check_feature_on_all_iommus() as that's removed by the commit below.]
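
To make the trade-off concrete, a sketch of the two variants of the
attach-time check (check_feature() as introduced by the commit linked below;
rlookup_amd_iommu() as used later in this patch):

        /*
         * Global check: FEATURE_HDSUP must be in the feature mask
         * accumulated across all IOMMUs (check_feature(), once the
         * commit below lands).
         */
        if (dom->dirty_ops && !check_feature(FEATURE_HDSUP))
                return -EINVAL;

        /*
         * Per-device check: only the IOMMU actually backing this
         * device needs to advertise Host Dirty support, which is
         * what this patch does today.
         */
        struct amd_iommu *iommu = rlookup_amd_iommu(dev);

        if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
                return -EINVAL;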

> (See
> https://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu.git/commit/?h=next&id=7b7563a93437ef945c829538da28f0095f1603ec)
> 
>> ...
>> @@ -2252,6 +2268,9 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
>>           return 0;
>>         dev_data->defer_attach = false;
>> +    if (dom->dirty_ops && iommu &&
>> +        !(iommu->features & FEATURE_HDSUP))
> 
>     if (dom->dirty_ops && !check_feature(FEATURE_HDSUP))
> 
OK -- will switch, depending on the above paragraphs.

>> +        return -EINVAL;
>>         if (dev_data->domain)
>>           detach_device(dev);
>> @@ -2371,6 +2390,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
>>           return true;
>>       case IOMMU_CAP_DEFERRED_FLUSH:
>>           return true;
>> +    case IOMMU_CAP_DIRTY: {
>> +        struct amd_iommu *iommu = rlookup_amd_iommu(dev);
>> +
>> +        return amd_iommu_hd_support(iommu);
> 
>         return check_feature(FEATURE_HDSUP);
> 
Likewise


