[ping] Hi Christian or Alex, Do you mind reviewing this change? Thanks, Â Felix On 2017-09-21 08:09 PM, Felix Kuehling wrote: > When many wavefronts cause VM faults at the same time, it can > overwhelm the interrupt handler and cause IH ring overflows before > the driver can notify or kill the faulting application. > > As a workaround I'm introducing limited per-VM fault credit. After > that number of VM faults have occurred, further VM faults are > filtered out at the prescreen stage of processing. > > This depends on the PASID in the interrupt packet, so it currently > only works for KFD contexts. > > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 +++++++++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++- > drivers/gpu/drm/amd/amdgpu/cik_ih.c | 19 +++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/cz_ih.c | 19 +++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 19 +++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 11 +++++++---- > 7 files changed, 112 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 8fcc743..c91d5c7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2682,6 +2682,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > } > > INIT_KFIFO(vm->faults); > + vm->fault_credit = 16; > > return 0; > > @@ -2776,6 +2777,36 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) > } > > /** > + * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID > + * > + * @adev: amdgpu_device pointer > + * @pasid: PASID do identify the VM > + * > + * This function is expected to be called in interrupt context. Returns > + * true if there was fault credit, false otherwise > + */ > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, > + unsigned int pasid) > +{ > + struct amdgpu_vm *vm; > + > + spin_lock(&adev->vm_manager.pasid_lock); > + vm = idr_find(&adev->vm_manager.pasid_idr, pasid); > + spin_unlock(&adev->vm_manager.pasid_lock); > + if (!vm) > + /* VM not found, can't track fault credit */ > + return true; > + > + /* No lock needed. only accessed by IRQ handler */ > + if (!vm->fault_credit) > + /* Too many faults in this VM */ > + return false; > + > + vm->fault_credit--; > + return true; > +} > + > +/** > * amdgpu_vm_manager_init - init the VM manager > * > * @adev: amdgpu_device pointer > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 447ed6e..66efbc2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -165,8 +165,11 @@ struct amdgpu_vm { > /* Flag to indicate ATS support from PTE for GFX9 */ > bool pte_support_ats; > > - /* Up to 128 pending page faults */ > + /* Up to 128 pending retry page faults */ > DECLARE_KFIFO(faults, u64, 128); > + > + /* Limit non-retry fault storms */ > + unsigned int fault_credit; > }; > > struct amdgpu_vm_id { > @@ -244,6 +247,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev); > int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > int vm_context, unsigned int pasid); > void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, > + unsigned int pasid); > void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, > struct list_head *validated, > struct amdgpu_bo_list_entry *entry); > diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c > index 07d3d89..a870b35 100644 > --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c > @@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev) > */ > static bool cik_ih_prescreen_iv(struct amdgpu_device *adev) > { > - /* Process all interrupts */ > - return true; > + u32 ring_index = adev->irq.ih.rptr >> 2; > + u16 pasid; > + > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > + case 146: > + case 147: > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > + return true; > + break; > + default: > + /* Not a VM fault */ > + return true; > + } > + > + adev->irq.ih.rptr += 16; > + return false; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c > index b6cdf4a..fa61d64 100644 > --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c > @@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev) > */ > static bool cz_ih_prescreen_iv(struct amdgpu_device *adev) > { > - /* Process all interrupts */ > - return true; > + u32 ring_index = adev->irq.ih.rptr >> 2; > + u16 pasid; > + > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > + case 146: > + case 147: > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > + return true; > + break; > + default: > + /* Not a VM fault */ > + return true; > + } > + > + adev->irq.ih.rptr += 16; > + return false; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > index 65ed6d3..bd592cb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > @@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev) > */ > static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev) > { > - /* Process all interrupts */ > - return true; > + u32 ring_index = adev->irq.ih.rptr >> 2; > + u16 pasid; > + > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > + case 146: > + case 147: > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > + return true; > + break; > + default: > + /* Not a VM fault */ > + return true; > + } > + > + adev->irq.ih.rptr += 16; > + return false; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > index 5ed0069..aa4e320 100644 > --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > @@ -227,8 +227,23 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev) > */ > static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev) > { > - /* Process all interrupts */ > - return true; > + u32 ring_index = adev->irq.ih.rptr >> 2; > + u16 pasid; > + > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > + case 146: > + case 147: > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > + return true; > + break; > + default: > + /* Not a VM fault */ > + return true; > + } > + > + adev->irq.ih.rptr += 16; > + return false; > } > > /** > diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > index a3b30d8..6973257 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > @@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev) > return true; > } > > - /* Not a retry fault */ > - if (!(dw5 & 0x80)) > - return true; > - > pasid = dw3 & 0xffff; > /* No PASID, can't identify faulting process */ > if (!pasid) > return true; > > + /* Not a retry fault, check fault credit */ > + if (!(dw5 & 0x80)) { > + if (!amdgpu_vm_pasid_fault_credit(adev, pasid)) > + goto ignore_iv; > + return true; > + } > + > addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12); > key = AMDGPU_VM_FAULT(pasid, addr); > r = amdgpu_ih_add_fault(adev, key);