Hi Yuri, On Tue, Oct 15, 2019 at 05:11:11PM +0200, Yuri Volchkov wrote: > Currently dmar_fault handler only prints a message in the dmesg. This > commit introduces counters - how many faults have happened, and > exposes them via sysfs. Each pci device will have an entry > 'dmar_faults' reading from which will give user 3 lines > remap: xxx > read: xxx > write: xxx I think you should have three files instead of putting all these in a single file. See https://lore.kernel.org/r/20190621072911.GA21600@xxxxxxxxx They should also be documented in Documentation/ABI/ I'm not sure this should be DMAR-specific. Couldn't we count similar events for other IOMMUs as well? > This functionality is targeted for health monitoring daemons. > > Signed-off-by: Yuri Volchkov <volchkov@xxxxxxxxx> > --- > drivers/iommu/dmar.c | 133 +++++++++++++++++++++++++++++++----- > drivers/pci/pci-sysfs.c | 20 ++++++ > include/linux/intel-iommu.h | 3 + > include/linux/pci.h | 11 +++ > 4 files changed, 150 insertions(+), 17 deletions(-) > > diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c > index eecd6a421667..0749873e3e41 100644 > --- a/drivers/iommu/dmar.c > +++ b/drivers/iommu/dmar.c > @@ -1107,6 +1107,7 @@ static void free_iommu(struct intel_iommu *iommu) > } > > if (iommu->irq) { > + destroy_workqueue(iommu->fault_wq); > if (iommu->pr_irq) { > free_irq(iommu->pr_irq, iommu); > dmar_free_hwirq(iommu->pr_irq); > @@ -1672,9 +1673,46 @@ void dmar_msi_read(int irq, struct msi_msg *msg) > raw_spin_unlock_irqrestore(&iommu->register_lock, flag); > } > > -static int dmar_fault_do_one(struct intel_iommu *iommu, int type, > - u8 fault_reason, int pasid, u16 source_id, > - unsigned long long addr) > +struct dmar_fault_info { > + struct work_struct work; > + struct intel_iommu *iommu; > + int type; > + int pasid; > + u16 source_id; > + unsigned long long addr; > + u8 fault_reason; > +}; > + > +static struct kmem_cache *dmar_fault_info_cache; > +int __init dmar_fault_info_cache_init(void) > +{ > + int ret = 0; > + > + dmar_fault_info_cache = > + kmem_cache_create("dmar_fault_info", > + sizeof(struct dmar_fault_info), 0, > + SLAB_HWCACHE_ALIGN, NULL); > + if (!dmar_fault_info_cache) { > + pr_err("Couldn't create dmar_fault_info cache\n"); > + ret = -ENOMEM; > + } > + > + return ret; > +} > + > +static inline void *alloc_dmar_fault_info(void) > +{ > + return kmem_cache_alloc(dmar_fault_info_cache, GFP_ATOMIC); > +} > + > +static inline void free_dmar_fault_info(void *vaddr) > +{ > + kmem_cache_free(dmar_fault_info_cache, vaddr); > +} > + > +static int dmar_fault_dump_one(struct intel_iommu *iommu, int type, > + u8 fault_reason, int pasid, u16 source_id, > + unsigned long long addr) > { > const char *reason; > int fault_type; > @@ -1695,6 +1733,57 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, > return 0; > } > > +static int dmar_fault_handle_one(struct dmar_fault_info *info) > +{ > + struct pci_dev *pdev; > + u8 devfn; > + atomic_t *pcnt; > + > + devfn = PCI_DEVFN(PCI_SLOT(info->source_id), PCI_FUNC(info->source_id)); > + pdev = pci_get_domain_bus_and_slot(info->iommu->segment, > + PCI_BUS_NUM(info->source_id), devfn); I'm sure you've considered this already, but it's not completely clear to me whether these counters should be in the pci_dev (as in this patch) or in something IOMMU-related. The pci_dev is nice because you automatically have counters for each PCI device, and most faults can be tied back to a device. But on the other hand, it's not the PCI device actually detecting and reporting the error. It's the IOMMU reporting the fault, and while it's *likely* there's a pci_dev corresponding to the bus/device/function info in the IOMMU error registers, there's no guarantee: the device may have been hot-removed between reading the IOMMU registers and doing the pci_get_domain_bus_and_slot(), or (I suspect) faults could be caused by corrupted or malicious TLPs. Another possible issue is that if the counts are in the pci_dev, they're lost if the device is removed, which might happen while diagnosing faulty hardware. So I tend to think this is really IOMMU error information and the IOMMU driver should handle it itself, including logging and exposing it via sysfs. > + if (!pdev) > + return -ENXIO; > + > + if (info->fault_reason == INTR_REMAP) > + pcnt = &pdev->faults_cnt.remap; > + else if (info->type) > + pcnt = &pdev->faults_cnt.read; > + else > + pcnt = &pdev->faults_cnt.write; > + > + atomic_inc(pcnt); pci_get_domain_bus_and_slot() increments pdev's reference count, so you would need to release it here. > + return 0; > +} > + > +static void dmar_fault_handle_work(struct work_struct *work) > +{ > + struct dmar_fault_info *info; > + > + info = container_of(work, struct dmar_fault_info, work); > + > + dmar_fault_handle_one(info); > + free_dmar_fault_info(info); > +} > + > +static int dmar_fault_schedule_one(struct intel_iommu *iommu, int type, > + u8 fault_reason, int pasid, u16 source_id, > + unsigned long long addr) > +{ > + struct dmar_fault_info *info = alloc_dmar_fault_info(); > + > + INIT_WORK(&info->work, dmar_fault_handle_work); > + info->iommu = iommu; > + info->type = type; > + info->fault_reason = fault_reason; > + info->pasid = pasid; > + info->source_id = source_id; > + info->addr = addr; > + > + return queue_work(iommu->fault_wq, &info->work); > +} > + > #define PRIMARY_FAULT_REG_LEN (16) > irqreturn_t dmar_fault(int irq, void *dev_id) > { > @@ -1733,20 +1822,18 @@ irqreturn_t dmar_fault(int irq, void *dev_id) > if (!(data & DMA_FRCD_F)) > break; > > - if (!ratelimited) { > - fault_reason = dma_frcd_fault_reason(data); > - type = dma_frcd_type(data); > + fault_reason = dma_frcd_fault_reason(data); > + type = dma_frcd_type(data); > > - pasid = dma_frcd_pasid_value(data); > - data = readl(iommu->reg + reg + > - fault_index * PRIMARY_FAULT_REG_LEN + 8); > - source_id = dma_frcd_source_id(data); > + pasid = dma_frcd_pasid_value(data); > + data = readl(iommu->reg + reg + > + fault_index * PRIMARY_FAULT_REG_LEN + 8); > + source_id = dma_frcd_source_id(data); > > - pasid_present = dma_frcd_pasid_present(data); > - guest_addr = dmar_readq(iommu->reg + reg + > + pasid_present = dma_frcd_pasid_present(data); > + guest_addr = dmar_readq(iommu->reg + reg + > fault_index * PRIMARY_FAULT_REG_LEN); > - guest_addr = dma_frcd_page_addr(guest_addr); > - } > + guest_addr = dma_frcd_page_addr(guest_addr); > > /* clear the fault */ > writel(DMA_FRCD_F, iommu->reg + reg + > @@ -1756,9 +1843,13 @@ irqreturn_t dmar_fault(int irq, void *dev_id) > > if (!ratelimited) > /* Using pasid -1 if pasid is not present */ > - dmar_fault_do_one(iommu, type, fault_reason, > - pasid_present ? pasid : -1, > - source_id, guest_addr); > + dmar_fault_dump_one(iommu, type, fault_reason, > + pasid_present ? pasid : -1, > + source_id, guest_addr); > + if (irq != -1) > + dmar_fault_schedule_one(iommu, type, fault_reason, > + pasid_present ? pasid : -1, > + source_id, guest_addr); > > fault_index++; > if (fault_index >= cap_num_fault_regs(iommu->cap)) > @@ -1784,6 +1875,10 @@ int dmar_set_interrupt(struct intel_iommu *iommu) > if (iommu->irq) > return 0; > > + iommu->fault_wq = alloc_ordered_workqueue("fault_%s", 0, iommu->name); > + if (!iommu->fault_wq) > + return -ENOMEM; > + > irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu); > if (irq > 0) { > iommu->irq = irq; > @@ -1803,6 +1898,9 @@ int __init enable_drhd_fault_handling(void) > struct dmar_drhd_unit *drhd; > struct intel_iommu *iommu; > > + if (dmar_fault_info_cache_init()) > + return -1; > + > /* > * Enable fault control interrupt. > */ > @@ -1813,6 +1911,7 @@ int __init enable_drhd_fault_handling(void) > if (ret) { > pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n", > (unsigned long long)drhd->reg_base_addr, ret); > + kmem_cache_destroy(dmar_fault_info_cache); > return -1; > } > > diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c > index 07bd84c50238..d01514fca6d1 100644 > --- a/drivers/pci/pci-sysfs.c > +++ b/drivers/pci/pci-sysfs.c > @@ -263,6 +263,23 @@ static ssize_t ari_enabled_show(struct device *dev, > } > static DEVICE_ATTR_RO(ari_enabled); > > +#ifdef CONFIG_DMAR_TABLE > +static ssize_t dmar_faults_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct pci_dev *pdev = to_pci_dev(dev); > + int remap, read, write; > + > + remap = atomic_xchg(&pdev->faults_cnt.remap, 0); > + read = atomic_xchg(&pdev->faults_cnt.read, 0); > + write = atomic_xchg(&pdev->faults_cnt.write, 0); Doesn't this clear-on-read approach allow anybody to clear these counters? This looks like it's world-readable, and I'm not sure an unprivileged user should be able to clear the counters. > + > + return sprintf(buf, "remap: %d\nread: %d\nwrite: %d\n", remap, read, > + write); > +} > +static DEVICE_ATTR_RO(dmar_faults); > +#endif > + > static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, > char *buf) > { > @@ -623,6 +640,9 @@ static struct attribute *pci_dev_attrs[] = { > #endif > &dev_attr_driver_override.attr, > &dev_attr_ari_enabled.attr, > +#ifdef CONFIG_DMAR_TABLE > + &dev_attr_dmar_faults.attr, > +#endif > NULL, > }; > > diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h > index ed11ef594378..f8963c833fb0 100644 > --- a/include/linux/intel-iommu.h > +++ b/include/linux/intel-iommu.h > @@ -552,6 +552,9 @@ struct intel_iommu { > struct iommu_device iommu; /* IOMMU core code handle */ > int node; > u32 flags; /* Software defined flags */ > +#ifdef CONFIG_DMAR_TABLE > + struct workqueue_struct *fault_wq; /* Fault handler */ > +#endif > }; > > /* PCI domain-device relationship */ > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 14efa2586a8c..d9cc94fbf0ee 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -286,6 +286,14 @@ struct pci_vpd; > struct pci_sriov; > struct pci_p2pdma; > > +#ifdef CONFIG_DMAR_TABLE > +struct pci_dmar_faults_cnt { > + atomic_t read; /* How many read faults occurred */ > + atomic_t write; /* How many write faults occurred */ > + atomic_t remap; /* How many remap faults occurred */ > +}; > +#endif > + > /* The pci_dev structure describes PCI devices */ > struct pci_dev { > struct list_head bus_list; /* Node in per-bus list */ > @@ -468,6 +476,9 @@ struct pci_dev { > char *driver_override; /* Driver name to force a match */ > > unsigned long priv_flags; /* Private flags for the PCI driver */ > +#ifdef CONFIG_DMAR_TABLE > + struct pci_dmar_faults_cnt faults_cnt; /* Dmar fault statistics */ > +#endif > }; > > static inline struct pci_dev *pci_physfn(struct pci_dev *dev) > -- > 2.23.0 > > > > > Amazon Development Center Germany GmbH > Krausenstr. 38 > 10117 Berlin > Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich > Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B > Sitz: Berlin > Ust-ID: DE 289 237 879 > > >