On 17/10/2023 03:08, Baolu Lu wrote:
> On 10/17/23 12:00 AM, Joao Martins wrote:
>>>> The iommu_dirty_bitmap is defined in iommu core. The iommu driver has no
>>>> need to understand it and check its member anyway.
>>>>
>>> (...) The iommu driver has no need to understand it.
>>> iommu_dirty_bitmap_record() already makes those checks in case there's
>>> no iova_bitmap to set bits to.
>>>
>> This is all true, but the reason I am checking iommu_dirty_bitmap::bitmap
>> is essentially to not record anything in the IOVA bitmap and just clear
>> the dirty bits from the IOPTEs, all while dirty tracking is technically
>> disabled. This is done internally only when starting dirty tracking, to
>> ensure that we clean up all dirty bits before we enable dirty tracking
>> and thus get a consistent snapshot, as opposed to inheriting dirties
>> from the past.
>
> It's okay since it serves a functional purpose. Can you please add some
> comments around the code to explain the rationale.
>

I added this comment below:

+	/*
+	 * IOMMUFD core calls into a dirty tracking disabled domain without an
+	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+	 * have occurred when we stopped dirty tracking. This ensures that we
+	 * never inherit dirtied bits from a previous cycle.
+	 */

Also fixed an issue where I could theoretically clear the bit even when
IOMMU_DIRTY_NO_CLEAR was set. Essentially I now pass the read_and_clear_dirty()
flags down and let dma_sl_pte_test_and_clear_dirty() either test or
test-and-clear, similar to AMD:

@@ -781,6 +788,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
 	return (pte->val & 3) != 0;
 }

+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+						   unsigned long flags)
+{
+	if (flags & IOMMU_DIRTY_NO_CLEAR)
+		return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+
+	return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+				  (unsigned long *)&pte->val);
+}
+

Anyhow, see below for the full diff against this patch; a few things in my
tree differ from what was submitted here.
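For context, the "clear before enable" sweep that the comment refers to looks
roughly like this from the core side. This is a sketch with assumed names, not
the exact IOMMUFD code; the key point is the NULL iova_bitmap, which makes
iommu_dirty_bitmap_record() a no-op so the walk only clears IOPTE dirty bits:

static int clear_dirty_before_enable(struct iommu_domain *domain,
				     unsigned long iova, size_t size)
{
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	int ret;

	/* NULL bitmap: record nothing, only test-and-clear the IOPTEs */
	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
	ret = domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
						      0, &dirty);
	iommu_iotlb_sync(domain, &gather);
	return ret;
}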
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 2e56bd79f589..dedb8ae3cba8 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -15,6 +15,7 @@ config INTEL_IOMMU
 	select DMA_OPS
 	select IOMMU_API
 	select IOMMU_IOVA
+	select IOMMUFD_DRIVER
 	select NEED_DMA_MAP_STATE
 	select DMAR_TABLE
 	select SWIOTLB
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index fabfe363f1f9..0e3f532f3bca 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4075,11 +4075,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
 	return NULL;
 }

-static bool intel_iommu_slads_supported(struct intel_iommu *iommu)
-{
-	return sm_supported(iommu) && ecap_slads(iommu->ecap);
-}
-
 static struct iommu_domain *
 intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
 {
@@ -4087,6 +4082,10 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
 	struct iommu_domain *domain;
 	struct intel_iommu *iommu;

+	if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+		       IOMMU_HWPT_ALLOC_ENFORCE_DIRTY)))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	iommu = device_to_iommu(dev, NULL, NULL);
 	if (!iommu)
 		return ERR_PTR(-ENODEV);
@@ -4094,15 +4093,26 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags)
 	if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && !ecap_nest(iommu->ecap))
 		return ERR_PTR(-EOPNOTSUPP);

-	if (enforce_dirty &&
-	    !intel_iommu_slads_supported(iommu))
+	if (enforce_dirty && !slads_supported(iommu))
 		return ERR_PTR(-EOPNOTSUPP);

+	/*
+	 * The domain_alloc_user op needs to fully initialize a domain before
+	 * returning, so use iommu_domain_alloc() here for simplicity.
+	 */
 	domain = iommu_domain_alloc(dev->bus);
 	if (!domain)
 		domain = ERR_PTR(-ENOMEM);
-	if (domain && enforce_dirty)
+
+	if (!IS_ERR(domain) && enforce_dirty) {
+		if (to_dmar_domain(domain)->use_first_level) {
+			iommu_domain_free(domain);
+			return ERR_PTR(-EOPNOTSUPP);
+		}
 		domain->dirty_ops = &intel_dirty_ops;
+	}
+
 	return domain;
 }

@@ -4113,7 +4123,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain)
 }

 static int prepare_domain_attach_device(struct iommu_domain *domain,
-					struct device *dev)
+					struct device *dev, ioasid_t pasid)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 	struct intel_iommu *iommu;
@@ -4126,7 +4136,8 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
 		return -EINVAL;

-	if (domain->dirty_ops && !intel_iommu_slads_supported(iommu))
+	if (domain->dirty_ops &&
+	    (!slads_supported(iommu) || pasid != IOMMU_NO_PASID))
 		return -EINVAL;

 	/* check if this iommu agaw is sufficient for max mapped address */
@@ -4164,7 +4175,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain,
 	if (info->domain)
 		device_block_translation(dev);

-	ret = prepare_domain_attach_device(domain, dev);
+	ret = prepare_domain_attach_device(domain, dev, IOMMU_NO_PASID);
 	if (ret)
 		return ret;

@@ -4384,7 +4395,7 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
 	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
 		return ecap_sc_support(info->iommu->ecap);
 	case IOMMU_CAP_DIRTY:
-		return intel_iommu_slads_supported(info->iommu);
+		return slads_supported(info->iommu);
 	default:
 		return false;
 	}
@@ -4785,7 +4796,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
 	if (context_copied(iommu, info->bus, info->devfn))
 		return -EBUSY;

-	ret = prepare_domain_attach_device(domain, dev);
+	ret = prepare_domain_attach_device(domain, dev, pasid);
 	if (ret)
 		return ret;

@@ -4848,31 +4859,31 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
 	int ret = -EINVAL;

 	spin_lock(&dmar_domain->lock);
-	if (!(dmar_domain->dirty_tracking ^ enable) ||
-	    list_empty(&dmar_domain->devices)) {
-		spin_unlock(&dmar_domain->lock);
-		return 0;
-	}
+	if (dmar_domain->dirty_tracking == enable)
+		goto out_unlock;

 	list_for_each_entry(info, &dmar_domain->devices, link) {
-		/* First-level page table always enables dirty bit*/
-		if (dmar_domain->use_first_level) {
-			ret = 0;
-			break;
-		}
-
 		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->domain,
 						       info->dev, IOMMU_NO_PASID,
 						       enable);
 		if (ret)
-			break;
+			goto err_unwind;
 	}

 	if (!ret)
 		dmar_domain->dirty_tracking = enable;
+out_unlock:
 	spin_unlock(&dmar_domain->lock);
+	return 0;
+
+err_unwind:
+	list_for_each_entry(info, &dmar_domain->devices, link)
+		intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+						 info->dev, IOMMU_NO_PASID,
+						 dmar_domain->dirty_tracking);
+	spin_unlock(&dmar_domain->lock);
 	return ret;
 }

@@ -4886,14 +4897,16 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 	unsigned long pgsize;
 	bool ad_enabled;

-	spin_lock(&dmar_domain->lock);
+	/*
+	 * IOMMUFD core calls into a dirty tracking disabled domain without an
+	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
+	 * have occurred when we stopped dirty tracking. This ensures that we
+	 * never inherit dirtied bits from a previous cycle.
+	 */
 	ad_enabled = dmar_domain->dirty_tracking;
-	spin_unlock(&dmar_domain->lock);
-
 	if (!ad_enabled && dirty->bitmap)
 		return -EINVAL;

-	rcu_read_lock();
 	do {
 		struct dma_pte *pte;
 		int lvl = 0;
@@ -4906,14 +4919,10 @@ static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
 			continue;
 		}

-		/* It is writable, set the bitmap */
-		if (((flags & IOMMU_DIRTY_NO_CLEAR) &&
-		     dma_sl_pte_dirty(pte)) ||
-		    dma_sl_pte_test_and_clear_dirty(pte))
+		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
 			iommu_dirty_bitmap_record(dirty, iova, pgsize);
 		iova += pgsize;
 	} while (iova < end);
-	rcu_read_unlock();

 	return 0;
 }
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index bccd44db3316..0b390d9e669b 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -542,6 +542,9 @@ enum {
 #define sm_supported(iommu)	(intel_iommu_sm && ecap_smts((iommu)->ecap))
 #define pasid_supported(iommu)	(sm_supported(iommu) &&			\
 				 ecap_pasid((iommu)->ecap))
+#define slads_supported(iommu)	(sm_supported(iommu) &&			\
+				 ecap_slads((iommu)->ecap))
+
 struct pasid_entry;
 struct pasid_state_entry;
@@ -785,13 +788,12 @@ static inline bool dma_pte_present(struct dma_pte *pte)
 	return (pte->val & 3) != 0;
 }

-static inline bool dma_sl_pte_dirty(struct dma_pte *pte)
+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+						   unsigned long flags)
 {
-	return (pte->val & DMA_SL_PTE_DIRTY) != 0;
-}
+	if (flags & IOMMU_DIRTY_NO_CLEAR)
+		return (pte->val & DMA_SL_PTE_DIRTY) != 0;

-static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte)
-{
 	return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
 				  (unsigned long *)&pte->val);
 }
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 03814942d59c..785384a59d55 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -686,15 +686,29 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,

 	spin_lock(&iommu->lock);

-	did = domain_id_iommu(domain, iommu);
 	pte = intel_pasid_get_entry(dev, pasid);
 	if (!pte) {
 		spin_unlock(&iommu->lock);
-		dev_err(dev, "Failed to get pasid entry of PASID %d\n", pasid);
+		dev_err_ratelimited(dev,
+				    "Failed to get pasid entry of PASID %d\n",
+				    pasid);
 		return -ENODEV;
 	}

+	did = domain_id_iommu(domain, iommu);
 	pgtt = pasid_pte_get_pgtt(pte);

+	if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && pgtt != PASID_ENTRY_PGTT_NESTED) {
+		spin_unlock(&iommu->lock);
+		dev_err_ratelimited(dev,
+				    "Dirty tracking not supported on translation type %d\n",
+				    pgtt);
+		return -EOPNOTSUPP;
+	}
+
+	if (pasid_get_ssade(pte) == enabled) {
+		spin_unlock(&iommu->lock);
+		return 0;
+	}

 	if (enabled)
 		pasid_set_ssade(pte);
@@ -702,6 +716,9 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
 		pasid_clear_ssade(pte);
 	spin_unlock(&iommu->lock);

+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(pte, sizeof(*pte));
+
 	/*
	 * From VT-d spec table 25 "Guidance to Software for Invalidations":
	 *
@@ -720,8 +737,6 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
 	if (pgtt == PASID_ENTRY_PGTT_SL_ONLY || pgtt == PASID_ENTRY_PGTT_NESTED)
 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
-	else
-		qi_flush_piotlb(iommu, did, pasid, 0, -1, 0);

 	/* Device IOTLB doesn't need to be flushed in caching mode. */
 	if (!cap_caching_mode(iommu->cap))
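
And for completeness, the flag semantics that the reworked
dma_sl_pte_test_and_clear_dirty() gives to dirty_ops callers (sketch only,
reusing the dirty bitmap setup from the earlier example):

	/* Peek: report dirty IOVAs without clearing the dirty bits */
	ret = domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
						      IOMMU_DIRTY_NO_CLEAR,
						      &dirty);

	/* Normal cycle: report and atomically clear in one pass */
	ret = domain->dirty_ops->read_and_clear_dirty(domain, iova, size,
						      0, &dirty);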