Re: [PATCH v4 12/18] iommu/intel: Access/Dirty bit support for SL domains

On 19/10/2023 04:04, Baolu Lu wrote:
> On 10/19/23 4:27 AM, Joao Martins wrote:
>> The IOMMU advertises Access/Dirty bits for the second-stage page table if
>> the extended capability DMAR register reports it (ECAP, mnemonic ECAP.SSADS).
>> The first stage table is compatible with the CPU page table, thus A/D bits
>> are implicitly supported. Relevant Intel IOMMU SDM refs: for the first stage
>> table, "3.6.2 Accessed, Extended Accessed, and Dirty Flags"; for the second
>> stage table, "3.7.2 Accessed and Dirty Flags".
>>
>> First stage page table is enabled by default, so setting dirty tracking is
>> always allowed and no control bits are needed; it just returns 0. To use
>> SSADS, set bit 9 (SSADE) in the scalable-mode PASID table entry and flush
>> the IOTLB via pasid_flush_caches() following the manual. Relevant SDM refs:
>>
>> "3.7.2 Accessed and Dirty Flags"
>> "6.5.3.3 Guidance to Software for Invalidations,
>>   Table 23. Guidance to Software for Invalidations"
>>
>> The PTE dirty bit is bit 9 and it's cached in the IOTLB, so flush the IOTLB
>> to make sure the IOMMU attempts to set the dirty bit again. Note that
>> iommu_dirty_bitmap_record() will add the IOVA to iotlb_gather and thus the
>> caller of the iommu op will flush the IOTLB. The relevant manual chapter on
>> hardware translation is chapter 6, with special mention of:
>>
>> "6.2.3.1 Scalable-Mode PASID-Table Entry Programming Considerations"
>> "6.2.4 IOTLB"
>>
>> Select IOMMUFD_DRIVER only if IOMMUFD is enabled, given that IOMMU dirty
>> tracking requires IOMMUFD.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
>> ---
>>   drivers/iommu/intel/Kconfig |   1 +
>>   drivers/iommu/intel/iommu.c | 104 +++++++++++++++++++++++++++++++++-
>>   drivers/iommu/intel/iommu.h |  17 ++++++
>>   drivers/iommu/intel/pasid.c | 109 ++++++++++++++++++++++++++++++++++++
>>   drivers/iommu/intel/pasid.h |   4 ++
>>   5 files changed, 234 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
>> index 2e56bd79f589..f5348b80652b 100644
>> --- a/drivers/iommu/intel/Kconfig
>> +++ b/drivers/iommu/intel/Kconfig
>> @@ -15,6 +15,7 @@ config INTEL_IOMMU
>>       select DMA_OPS
>>       select IOMMU_API
>>       select IOMMU_IOVA
>> +    select IOMMUFD_DRIVER if IOMMUFD
>>       select NEED_DMA_MAP_STATE
>>       select DMAR_TABLE
>>       select SWIOTLB
>> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
>> index 017aed5813d8..405b459416d5 100644
>> --- a/drivers/iommu/intel/iommu.c
>> +++ b/drivers/iommu/intel/iommu.c
>> @@ -300,6 +300,7 @@ static int iommu_skip_te_disable;
>>   #define IDENTMAP_AZALIA        4
>>     const struct iommu_ops intel_iommu_ops;
>> +const struct iommu_dirty_ops intel_dirty_ops;
>>     static bool translation_pre_enabled(struct intel_iommu *iommu)
>>   {
>> @@ -4077,10 +4078,12 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
>>   static struct iommu_domain *
>>   intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
>>   {
>> +    bool enforce_dirty = (flags & IOMMU_HWPT_ALLOC_ENFORCE_DIRTY);
>>       struct iommu_domain *domain;
>>       struct intel_iommu *iommu;
>>   -    if (flags & (~IOMMU_HWPT_ALLOC_NEST_PARENT))
>> +    if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT|
>> +               IOMMU_HWPT_ALLOC_ENFORCE_DIRTY)))
>>           return ERR_PTR(-EOPNOTSUPP);
>>         iommu = device_to_iommu(dev, NULL, NULL);
>> @@ -4090,6 +4093,9 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
>>       if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap))
>>           return ERR_PTR(-EOPNOTSUPP);
>>   +    if (enforce_dirty && !slads_supported(iommu))
>> +        return ERR_PTR(-EOPNOTSUPP);
>> +
>>       /*
>>        * domain_alloc_user op needs to fully initialize a domain
>>        * before return, so uses iommu_domain_alloc() here for
>> @@ -4098,6 +4104,15 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
>>       domain = iommu_domain_alloc(dev->bus);
>>       if (!domain)
>>           domain = ERR_PTR(-ENOMEM);
>> +
>> +    if (!IS_ERR(domain) && enforce_dirty) {
>> +        if (to_dmar_domain(domain)->use_first_level) {
>> +            iommu_domain_free(domain);
>> +            return ERR_PTR(-EOPNOTSUPP);
>> +        }
>> +        domain->dirty_ops = &intel_dirty_ops;
>> +    }
>> +
>>       return domain;
>>   }
>> @@ -4121,6 +4136,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
>>       if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
>>           return -EINVAL;
>>   +    if (domain->dirty_ops && !slads_supported(iommu))
>> +        return -EINVAL;
>> +
>>       /* check if this iommu agaw is sufficient for max mapped address */
>>       addr_width = agaw_to_width(iommu->agaw);
>>       if (addr_width > cap_mgaw(iommu->cap))
>> @@ -4375,6 +4393,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
>>           return dmar_platform_optin();
>>       case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
>>           return ecap_sc_support(info->iommu->ecap);
>> +    case IOMMU_CAP_DIRTY:
>> +        return slads_supported(info->iommu);
>>       default:
>>           return false;
>>       }
>> @@ -4772,6 +4792,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
>>       if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
>>           return -EOPNOTSUPP;
>>   +    if (domain->dirty_ops)
>> +        return -EINVAL;
>> +
>>       if (context_copied(iommu, info->bus, info->devfn))
>>           return -EBUSY;
>> @@ -4830,6 +4853,85 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
>>       return vtd;
>>   }
>>   +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
>> +                      bool enable)
>> +{
>> +    struct dmar_domain *dmar_domain = to_dmar_domain(domain);
>> +    struct device_domain_info *info;
>> +    int ret = -EINVAL;
>> +
>> +    spin_lock(&dmar_domain->lock);
>> +    if (dmar_domain->dirty_tracking == enable)
>> +        goto out_unlock;
>> +
>> +    list_for_each_entry(info, &dmar_domain->devices, link) {
>> +        ret = intel_pasid_setup_dirty_tracking(info->iommu, info->domain,
>> +                             info->dev, IOMMU_NO_PASID,
>> +                             enable);
>> +        if (ret)
>> +            goto err_unwind;
>> +
>> +    }
>> +
>> +    if (!ret)
>> +        dmar_domain->dirty_tracking = enable;
> 
> We should also support setting dirty tracking even if the domain has not
> been attached to any device?
> 
Considering this rides on hwpt-alloc, which attaches a device at domain
allocation, this shouldn't be possible in practice. But I take it this is to
improve 'future' resilience, as there's nothing bad coming from the change you
suggested below.

> To achieve this, we can remove ret initialization and remove the above
> check. Make the default path a successful one.
> 
>     int ret;
> 
>     [...]
> 
>     dmar_domain->dirty_tracking = enable;
> 
OK, the above makes sense.
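
Something along these lines for v5, then -- an untested sketch with your
suggestion folded in (ret no longer initialized, and the default path is a
successful one even when no device is attached):

	static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
						  bool enable)
	{
		struct dmar_domain *dmar_domain = to_dmar_domain(domain);
		struct device_domain_info *info;
		int ret;

		spin_lock(&dmar_domain->lock);
		if (dmar_domain->dirty_tracking == enable)
			goto out_unlock;

		list_for_each_entry(info, &dmar_domain->devices, link) {
			/* Flip SSADE in every attached device's PASID entry */
			ret = intel_pasid_setup_dirty_tracking(info->iommu,
							       info->domain,
							       info->dev,
							       IOMMU_NO_PASID,
							       enable);
			if (ret)
				goto err_unwind;
		}

		/* Record the new state even if the device list was empty */
		dmar_domain->dirty_tracking = enable;
	out_unlock:
		spin_unlock(&dmar_domain->lock);

		return 0;

	err_unwind:
		/* Restore the previous state on already-updated devices */
		list_for_each_entry(info, &dmar_domain->devices, link)
			intel_pasid_setup_dirty_tracking(info->iommu,
							 dmar_domain, info->dev,
							 IOMMU_NO_PASID,
							 dmar_domain->dirty_tracking);
		spin_unlock(&dmar_domain->lock);
		return ret;
	}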

>> +out_unlock:
>> +    spin_unlock(&dmar_domain->lock);
>> +
>> +    return 0;
>> +
>> +err_unwind:
>> +    list_for_each_entry(info, &dmar_domain->devices, link)
>> +        intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
>> +                      info->dev, IOMMU_NO_PASID,
>> +                      dmar_domain->dirty_tracking);
>> +    spin_unlock(&dmar_domain->lock);
>> +    return ret;
>> +}
>> +
>> +static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
>> +                        unsigned long iova, size_t size,
>> +                        unsigned long flags,
>> +                        struct iommu_dirty_bitmap *dirty)
>> +{
>> +    struct dmar_domain *dmar_domain = to_dmar_domain(domain);
>> +    unsigned long end = iova + size - 1;
>> +    unsigned long pgsize;
>> +
>> +    /*
>> +     * IOMMUFD core calls into a dirty tracking disabled domain without an
>> +     * IOVA bitmap set in order to clean dirty bits in all PTEs that might
>> +     * have occurred when we stopped dirty tracking. This ensures that we
>> +     * never inherit dirtied bits from a previous cycle.
>> +     */
>> +    if (!dmar_domain->dirty_tracking && dirty->bitmap)
>> +        return -EINVAL;
>> +
>> +    do {
>> +        struct dma_pte *pte;
>> +        int lvl = 0;
>> +
>> +        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
>> +                     GFP_ATOMIC);
>> +        pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
>> +        if (!pte || !dma_pte_present(pte)) {
>> +            iova += pgsize;
>> +            continue;
>> +        }
>> +
>> +        if (dma_sl_pte_test_and_clear_dirty(pte, flags))
>> +            iommu_dirty_bitmap_record(dirty, iova, pgsize);
>> +        iova += pgsize;
>> +    } while (iova < end);
>> +
>> +    return 0;
>> +}
>> +
>> +const struct iommu_dirty_ops intel_dirty_ops = {
>> +    .set_dirty_tracking    = intel_iommu_set_dirty_tracking,
>> +    .read_and_clear_dirty   = intel_iommu_read_and_clear_dirty,
>> +};
>> +
>>   const struct iommu_ops intel_iommu_ops = {
>>       .capable        = intel_iommu_capable,
>>       .hw_info        = intel_iommu_hw_info,
>> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
>> index c18fb699c87a..27bcfd3bacdd 100644
>> --- a/drivers/iommu/intel/iommu.h
>> +++ b/drivers/iommu/intel/iommu.h
>> @@ -48,6 +48,9 @@
>>   #define DMA_FL_PTE_DIRTY    BIT_ULL(6)
>>   #define DMA_FL_PTE_XD        BIT_ULL(63)
>>   +#define DMA_SL_PTE_DIRTY_BIT    9
>> +#define DMA_SL_PTE_DIRTY    BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
>> +
>>   #define ADDR_WIDTH_5LEVEL    (57)
>>   #define ADDR_WIDTH_4LEVEL    (48)
>>   @@ -539,6 +542,9 @@ enum {
>>   #define sm_supported(iommu)    (intel_iommu_sm && ecap_smts((iommu)->ecap))
>>   #define pasid_supported(iommu)    (sm_supported(iommu) &&            \
>>                    ecap_pasid((iommu)->ecap))
>> +#define slads_supported(iommu) (sm_supported(iommu) &&                 \
>> +                ecap_slads((iommu)->ecap))
>> +
>>     struct pasid_entry;
>>   struct pasid_state_entry;
>> @@ -592,6 +598,7 @@ struct dmar_domain {
>>                        * otherwise, goes through the second
>>                        * level.
>>                        */
>> +    u8 dirty_tracking:1;        /* Dirty tracking is enabled */
>>         spinlock_t lock;        /* Protect device tracking lists */
>>       struct list_head devices;    /* all devices' list */
>> @@ -781,6 +788,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
>>       return (pte->val & 3) != 0;
>>   }
>>   +static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
>> +                           unsigned long flags)
>> +{
>> +    if (flags & IOMMU_DIRTY_NO_CLEAR)
>> +        return (pte->val & DMA_SL_PTE_DIRTY) != 0;
>> +
>> +    return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
>> +                  (unsigned long *)&pte->val);
>> +}
>> +
>>   static inline bool dma_pte_superpage(struct dma_pte *pte)
>>   {
>>       return (pte->val & DMA_PTE_LARGE_PAGE);
>> diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
>> index 8f92b92f3d2a..785384a59d55 100644
>> --- a/drivers/iommu/intel/pasid.c
>> +++ b/drivers/iommu/intel/pasid.c
>> @@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
>>       WRITE_ONCE(*ptr, (old & ~mask) | bits);
>>   }
>>   +static inline u64 pasid_get_bits(u64 *ptr)
>> +{
>> +    return READ_ONCE(*ptr);
>> +}
>> +
>>   /*
>>    * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
>>    * PASID entry.
>> @@ -335,6 +340,36 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
>>       pasid_set_bits(&pe->val[0], 1 << 1, 0);
>>   }
>>   +/*
>> + * Enable second level A/D bits by setting the SLADE (Second Level
>> + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
>> + * entry.
>> + */
>> +static inline void pasid_set_ssade(struct pasid_entry *pe)
>> +{
>> +    pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
>> +}
>> +
>> +/*
>> + * Enable second level A/D bits by setting the SLADE (Second Level
> 
> nit: Disable second level ....
> 
/me nods
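
i.e. just the copy-paste fixed for v5 (comment only, no functional change):

	/*
	 * Disable second level A/D bits by clearing the SLADE (Second Level
	 * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
	 * entry.
	 */
	static inline void pasid_clear_ssade(struct pasid_entry *pe)
	{
		pasid_set_bits(&pe->val[0], 1 << 9, 0);
	}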

>> + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
>> + * entry.
>> + */
>> +static inline void pasid_clear_ssade(struct pasid_entry *pe)
>> +{
>> +    pasid_set_bits(&pe->val[0], 1 << 9, 0);
>> +}
>> +
>> +/*
>> + * Checks if second level A/D bits by setting the SLADE (Second Level
>> + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
>> + * entry is enabled.
>> + */
>> +static inline bool pasid_get_ssade(struct pasid_entry *pe)
>> +{
>> +    return pasid_get_bits(&pe->val[0]) & (1 << 9);
>> +}
>> +
>>   /*
>>    * Setup the WPE(Write Protect Enable) field (Bit 132) of a
>>    * scalable mode PASID entry.
>> @@ -627,6 +662,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
>>       pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
>>       pasid_set_fault_enable(pte);
>>       pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
>> +    if (domain->dirty_tracking)
>> +        pasid_set_ssade(pte);
>>         pasid_set_present(pte);
>>       spin_unlock(&iommu->lock);
>> @@ -636,6 +673,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
>>       return 0;
>>   }
>>   +/*
>> + * Set up dirty tracking on a second only translation type.
> 
> nit: ... on a second only or nested translation type.
> 
OK -- had changed the check, but not the comment :/
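
i.e. for v5:

	/*
	 * Set up dirty tracking on a second only or nested translation type.
	 */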

>> + */
>> +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
>> +                     struct dmar_domain *domain,
>> +                     struct device *dev, u32 pasid,
>> +                     bool enabled)
>> +{
>> +    struct pasid_entry *pte;
>> +    u16 did, pgtt;
>> +
>> +    spin_lock(&iommu->lock);
>> +
>> +    pte = intel_pasid_get_entry(dev, pasid);
>> +    if (!pte) {
>> +        spin_unlock(&iommu->lock);
>> +        dev_err_ratelimited(dev,
>> +                    "Failed to get pasid entry of PASID %d\n",
>> +                    pasid);
>> +        return -ENODEV;
>> +    }
>> +
>> +    did = domain_id_iommu(domain, iommu);
>> +    pgtt = pasid_pte_get_pgtt(pte);
>> +    if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && pgtt != PASID_ENTRY_PGTT_NESTED) {
>> +        spin_unlock(&iommu->lock);
>> +        dev_err_ratelimited(dev,
>> +                    "Dirty tracking not supported on translation type %d\n",
>> +                    pgtt);
>> +        return -EOPNOTSUPP;
>> +    }
>> +
>> +    if (pasid_get_ssade(pte) == enabled) {
>> +        spin_unlock(&iommu->lock);
>> +        return 0;
>> +    }
>> +
>> +    if (enabled)
>> +        pasid_set_ssade(pte);
>> +    else
>> +        pasid_clear_ssade(pte);
>> +    spin_unlock(&iommu->lock);
>> +
>> +    if (!ecap_coherent(iommu->ecap))
>> +        clflush_cache_range(pte, sizeof(*pte));
>> +
>> +    /*
>> +     * From VT-d spec table 25 "Guidance to Software for Invalidations":
>> +     *
>> +     * - PASID-selective-within-Domain PASID-cache invalidation
>> +     *   If (PGTT=SS or Nested)
>> +     *    - Domain-selective IOTLB invalidation
>> +     *   Else
>> +     *    - PASID-selective PASID-based IOTLB invalidation
>> +     * - If (pasid is RID_PASID)
>> +     *    - Global Device-TLB invalidation to affected functions
>> +     *   Else
>> +     *    - PASID-based Device-TLB invalidation (with S=1 and
>> +     *      Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
>> +     */
>> +    pasid_cache_invalidation_with_pasid(iommu, did, pasid);
>> +
>> +    if (pgtt == PASID_ENTRY_PGTT_SL_ONLY || pgtt == PASID_ENTRY_PGTT_NESTED)
> 
> Above check is unnecessary.
> 
Ah yes, we're validating beforehand above.
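
i.e. dropping the conditional and always doing the domain-selective IOTLB
invalidation after the PASID cache flush (sketch of what v5 would look like):

	pasid_cache_invalidation_with_pasid(iommu, did, pasid);
	iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

	/* Device IOTLB doesn't need to be flushed in caching mode. */
	if (!cap_caching_mode(iommu->cap))
		devtlb_invalidation_with_pasid(iommu, dev, pasid);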

>> +        iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
>> +
>> +    /* Device IOTLB doesn't need to be flushed in caching mode. */
>> +    if (!cap_caching_mode(iommu->cap))
>> +        devtlb_invalidation_with_pasid(iommu, dev, pasid);
>> +
>> +    return 0;
>> +}
>> +
>>   /*
>>    * Set up the scalable mode pasid entry for passthrough translation type.
>>    */
>> diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
>> index 4e9e68c3c388..958050b093aa 100644
>> --- a/drivers/iommu/intel/pasid.h
>> +++ b/drivers/iommu/intel/pasid.h
>> @@ -106,6 +106,10 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
>>   int intel_pasid_setup_second_level(struct intel_iommu *iommu,
>>                      struct dmar_domain *domain,
>>                      struct device *dev, u32 pasid);
>> +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
>> +                     struct dmar_domain *domain,
>> +                     struct device *dev, u32 pasid,
>> +                     bool enabled);
>>   int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
>>                      struct dmar_domain *domain,
>>                      struct device *dev, u32 pasid);
> 
> Others look good to me. Thank you very much!
> 
> With above addressed,
> 
> Reviewed-by: Lu Baolu <baolu.lu@xxxxxxxxxxxxxxx>
> 
Thanks


