On 20/10/2023 10:34, Joao Martins wrote:
> On 20/10/2023 03:21, Baolu Lu wrote:
>> On 10/19/23 7:58 PM, Joao Martins wrote:
>>> On 19/10/2023 01:17, Joao Martins wrote:
>>>> On 19/10/2023 00:11, Jason Gunthorpe wrote:
>>>>> On Wed, Oct 18, 2023 at 09:27:08PM +0100, Joao Martins wrote:
>>>>>> +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
>>>>>> +					  unsigned long iova, size_t size,
>>>>>> +					  unsigned long flags,
>>>>>> +					  struct iommu_dirty_bitmap *dirty)
>>>>>> +{
>>>>>> +	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
>>>>>> +	unsigned long end = iova + size - 1;
>>>>>> +
>>>>>> +	do {
>>>>>> +		unsigned long pgsize = 0;
>>>>>> +		u64 *ptep, pte;
>>>>>> +
>>>>>> +		ptep = fetch_pte(pgtable, iova, &pgsize);
>>>>>> +		if (ptep)
>>>>>> +			pte = READ_ONCE(*ptep);
>>>>> It is fine for now, but this is so slow for something that is such a
>>>>> fast path. We are optimizing away a TLB invalidation but leaving
>>>>> this???
>>>>>
>>>> More obvious reason is that I'm still working towards the 'faster' page table
>>>> walker. Then map/unmap code needs to do similar lookups so thought of reusing
>>>> the same functions as map/unmap initially. And improve it afterwards or when
>>>> introducing the splitting.
>>>>
>>>>> It is a radix tree, you walk trees by retaining your position at each
>>>>> level as you go (eg in a function per-level call chain or something)
>>>>> then ++ is cheap. Re-searching the entire tree every time is madness.
>>>> I'm aware -- I have an improved page-table walker for AMD[0] (not yet for Intel;
>>>> still in the works),
>>> Sigh, I realized that Intel's pfn_to_dma_pte() (main lookup function for
>>> map/unmap/iova_to_phys) does something a little off when it finds a non-present
>>> PTE. It allocates a page table to it; which is not OK in this specific case (I
>>> would argue it's neither for iova_to_phys but well maybe I misunderstand the
>>> expectation of that API).
>>
>> pfn_to_dma_pte() doesn't allocate page for a non-present PTE if the
>> target_level parameter is set to 0. See below line 932.
>>
>> 913 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
>> 914                                       unsigned long pfn, int *target_level,
>> 915                                       gfp_t gfp)
>> 916 {
>>
>> [...]
>>
>> 927         while (1) {
>> 928                 void *tmp_page;
>> 929
>> 930                 offset = pfn_level_offset(pfn, level);
>> 931                 pte = &parent[offset];
>> 932                 if (!*target_level && (dma_pte_superpage(pte) ||
>>                                            !dma_pte_present(pte)))
>> 933                         break;
>>
>> So both iova_to_phys() and read_and_clear_dirty() are doing things
>> right:
>>
>>         struct dma_pte *pte;
>>         int level = 0;
>>
>>         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
>>                              &level, GFP_KERNEL);
>>         if (pte && dma_pte_present(pte)) {
>>                 /* The PTE is valid, check anything you want! */
>>                 ... ...
>>         }
>>
>> Or, I am overlooking something else?
>
> You're right, thanks for the keeping me straight -- I was already doing the
> right thing. I've forgotten about it in the midst of the other code -- Probably
> worth a comment in the caller to make it obvious.

For what it's worth, this is the improved page-table walker I have in staging
(as a separate patch) alongside the AMD one. It is quite similar: the AMD IOMMU
has a bigger feature set in the page sizes its PTEs can represent, but the crux
of the walk is the same, allowing for the different coding styles of the two
IOMMU drivers.
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 97558b420e35..f6990962af2a 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4889,14 +4889,52 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
 	return ret;
 }
 
+static int walk_dirty_dma_pte_level(struct dmar_domain *domain, int level,
+				    struct dma_pte *pte, unsigned long start_pfn,
+				    unsigned long last_pfn, unsigned long flags,
+				    struct iommu_dirty_bitmap *dirty)
+{
+	unsigned long pfn, page_size;
+
+	pfn = start_pfn;
+	pte = &pte[pfn_level_offset(pfn, level)];
+
+	do {
+		unsigned long level_pfn = pfn & level_mask(level);
+		unsigned long level_last;
+
+		if (!dma_pte_present(pte))
+			goto next;
+
+		if (level > 1 && !dma_pte_superpage(pte)) {
+			level_last = level_pfn + level_size(level) - 1;
+			level_last = min(level_last, last_pfn);
+			walk_dirty_dma_pte_level(domain, level - 1,
+						 phys_to_virt(dma_pte_addr(pte)),
+						 pfn, level_last,
+						 flags, dirty);
+		} else {
+			page_size = level_size(level) << VTD_PAGE_SHIFT;
+
+			if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+				iommu_dirty_bitmap_record(dirty,
+							  pfn << VTD_PAGE_SHIFT,
+							  page_size);
+		}
+next:
+		pfn = level_pfn + level_size(level);
+	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+	return 0;
+}
+
 static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 					    unsigned long iova, size_t size,
 					    unsigned long flags,
 					    struct iommu_dirty_bitmap *dirty)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	unsigned long end = iova + size - 1;
-	unsigned long pgsize;
+	unsigned long start_pfn, last_pfn;
 
 	/*
 	 * IOMMUFD core calls into a dirty tracking disabled domain without an
@@ -4907,24 +4945,14 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 	if (!dmar_domain->dirty_tracking && dirty->bitmap)
 		return -EINVAL;
 
-	do {
-		struct dma_pte *pte;
-		int lvl = 0;
-
-		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
-				     GFP_ATOMIC);
-		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
-		if (!pte || !dma_pte_present(pte)) {
-			iova += pgsize;
-			continue;
-		}
-		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
-			iommu_dirty_bitmap_record(dirty, iova, pgsize);
-		iova += pgsize;
-	} while (iova < end);
+	start_pfn = iova >> VTD_PAGE_SHIFT;
+	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
 
-	return 0;
+	return walk_dirty_dma_pte_level(dmar_domain,
+					agaw_to_level(dmar_domain->agaw),
+					dmar_domain->pgd, start_pfn, last_pfn,
+					flags, dirty);
 }
 
 const struct iommu_dirty_ops intel_dirty_ops = {
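
Side note, in case a bare-bones picture of the difference helps: below is a
stand-alone user-space sketch of the same "retain your position at each level"
walk. It is only an illustration -- nothing in it comes from either driver, and
all the names (struct table, walk_level, and so on) are made up -- but it shows
the shape both patches follow: index into the current table once, recurse into
present non-leaf entries, and otherwise step sideways with ++ instead of
re-searching from the root for every leaf.

#include <stdbool.h>
#include <stdio.h>

#define ENTRIES_PER_TABLE 8			/* 3 index bits per level */
#define LEVEL_SHIFT(level) ((level) * 3)
#define LEVEL_SIZE(level) (1UL << LEVEL_SHIFT(level))

struct table {
	bool present[ENTRIES_PER_TABLE];
	bool dirty[ENTRIES_PER_TABLE];		/* meaningful at level 0 only */
	struct table *child[ENTRIES_PER_TABLE];
};

/* Report and clear every dirty leaf in [start_pfn, last_pfn]. */
static void walk_level(struct table *t, int level,
		       unsigned long start_pfn, unsigned long last_pfn)
{
	unsigned long pfn = start_pfn;
	unsigned int idx = (pfn >> LEVEL_SHIFT(level)) & (ENTRIES_PER_TABLE - 1);

	do {
		unsigned long level_pfn = pfn & ~(LEVEL_SIZE(level) - 1);

		if (!t->present[idx])
			goto next;

		if (level > 0) {
			unsigned long level_last = level_pfn + LEVEL_SIZE(level) - 1;

			if (level_last > last_pfn)
				level_last = last_pfn;
			/* Descend once; the child walk continues from here. */
			walk_level(t->child[idx], level - 1, pfn, level_last);
		} else if (t->dirty[idx]) {
			t->dirty[idx] = false;		/* test-and-clear */
			printf("dirty pfn %lu\n", pfn);
		}
next:
		pfn = level_pfn + LEVEL_SIZE(level);
		idx++;
	} while (idx < ENTRIES_PER_TABLE && pfn <= last_pfn);
}

int main(void)
{
	static struct table leaf, root;		/* zero-initialized */

	leaf.present[2] = leaf.dirty[2] = true;	/* pfn 10, dirty */
	leaf.present[5] = true;			/* pfn 13, clean */
	root.present[1] = true;			/* level-1 entry covering pfns 8-15 */
	root.child[1] = &leaf;

	walk_level(&root, 1, 0, 63);		/* prints: dirty pfn 10 */
	return 0;
}

The real walkers above follow the same shape, just with the actual PTE
accessors, superpage handling and per-level page-size bookkeeping of each
IOMMU.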