> -----Original Message----- > From: Kuehling, Felix > Sent: Tuesday, September 26, 2017 2:38 PM > To: amd-gfx at lists.freedesktop.org; Koenig, Christian; Deucher, Alexander > Subject: Re: [PATCH] drm/amdgpu: Handle GPUVM fault storms > > [ping] > > Hi Christian or Alex, > > Do you mind reviewing this change? Reviewed-by: Alex Deucher <alexander.deucher at amd.com> > > Thanks, > Â Felix > > > On 2017-09-21 08:09 PM, Felix Kuehling wrote: > > When many wavefronts cause VM faults at the same time, it can > > overwhelm the interrupt handler and cause IH ring overflows before > > the driver can notify or kill the faulting application. > > > > As a workaround I'm introducing limited per-VM fault credit. After > > that number of VM faults have occurred, further VM faults are > > filtered out at the prescreen stage of processing. > > > > This depends on the PASID in the interrupt packet, so it currently > > only works for KFD contexts. > > > > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 > +++++++++++++++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++- > > drivers/gpu/drm/amd/amdgpu/cik_ih.c | 19 +++++++++++++++++-- > > drivers/gpu/drm/amd/amdgpu/cz_ih.c | 19 +++++++++++++++++-- > > drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++-- > > drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 19 +++++++++++++++++-- > > drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 11 +++++++---- > > 7 files changed, 112 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > index 8fcc743..c91d5c7 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > > @@ -2682,6 +2682,7 @@ int amdgpu_vm_init(struct amdgpu_device > *adev, struct amdgpu_vm *vm, > > } > > > > INIT_KFIFO(vm->faults); > > + vm->fault_credit = 16; > > > > return 0; > > > > @@ -2776,6 +2777,36 @@ void amdgpu_vm_fini(struct amdgpu_device > *adev, struct amdgpu_vm *vm) > > } > > > > /** > > + * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID > > + * > > + * @adev: amdgpu_device pointer > > + * @pasid: PASID do identify the VM > > + * > > + * This function is expected to be called in interrupt context. Returns > > + * true if there was fault credit, false otherwise > > + */ > > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, > > + unsigned int pasid) > > +{ > > + struct amdgpu_vm *vm; > > + > > + spin_lock(&adev->vm_manager.pasid_lock); > > + vm = idr_find(&adev->vm_manager.pasid_idr, pasid); > > + spin_unlock(&adev->vm_manager.pasid_lock); > > + if (!vm) > > + /* VM not found, can't track fault credit */ > > + return true; > > + > > + /* No lock needed. only accessed by IRQ handler */ > > + if (!vm->fault_credit) > > + /* Too many faults in this VM */ > > + return false; > > + > > + vm->fault_credit--; > > + return true; > > +} > > + > > +/** > > * amdgpu_vm_manager_init - init the VM manager > > * > > * @adev: amdgpu_device pointer > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > > index 447ed6e..66efbc2 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > > @@ -165,8 +165,11 @@ struct amdgpu_vm { > > /* Flag to indicate ATS support from PTE for GFX9 */ > > bool pte_support_ats; > > > > - /* Up to 128 pending page faults */ > > + /* Up to 128 pending retry page faults */ > > DECLARE_KFIFO(faults, u64, 128); > > + > > + /* Limit non-retry fault storms */ > > + unsigned int fault_credit; > > }; > > > > struct amdgpu_vm_id { > > @@ -244,6 +247,8 @@ void amdgpu_vm_manager_fini(struct > amdgpu_device *adev); > > int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm > *vm, > > int vm_context, unsigned int pasid); > > void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm > *vm); > > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, > > + unsigned int pasid); > > void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, > > struct list_head *validated, > > struct amdgpu_bo_list_entry *entry); > > diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c > b/drivers/gpu/drm/amd/amdgpu/cik_ih.c > > index 07d3d89..a870b35 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c > > +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c > > @@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device > *adev) > > */ > > static bool cik_ih_prescreen_iv(struct amdgpu_device *adev) > > { > > - /* Process all interrupts */ > > - return true; > > + u32 ring_index = adev->irq.ih.rptr >> 2; > > + u16 pasid; > > + > > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > > + case 146: > > + case 147: > > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > > + return true; > > + break; > > + default: > > + /* Not a VM fault */ > > + return true; > > + } > > + > > + adev->irq.ih.rptr += 16; > > + return false; > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c > b/drivers/gpu/drm/amd/amdgpu/cz_ih.c > > index b6cdf4a..fa61d64 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c > > +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c > > @@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device > *adev) > > */ > > static bool cz_ih_prescreen_iv(struct amdgpu_device *adev) > > { > > - /* Process all interrupts */ > > - return true; > > + u32 ring_index = adev->irq.ih.rptr >> 2; > > + u16 pasid; > > + > > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > > + case 146: > > + case 147: > > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > > + return true; > > + break; > > + default: > > + /* Not a VM fault */ > > + return true; > > + } > > + > > + adev->irq.ih.rptr += 16; > > + return false; > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > > index 65ed6d3..bd592cb 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > > +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c > > @@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct > amdgpu_device *adev) > > */ > > static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev) > > { > > - /* Process all interrupts */ > > - return true; > > + u32 ring_index = adev->irq.ih.rptr >> 2; > > + u16 pasid; > > + > > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > > + case 146: > > + case 147: > > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > > + return true; > > + break; > > + default: > > + /* Not a VM fault */ > > + return true; > > + } > > + > > + adev->irq.ih.rptr += 16; > > + return false; > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > > index 5ed0069..aa4e320 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > > +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c > > @@ -227,8 +227,23 @@ static u32 tonga_ih_get_wptr(struct > amdgpu_device *adev) > > */ > > static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev) > > { > > - /* Process all interrupts */ > > - return true; > > + u32 ring_index = adev->irq.ih.rptr >> 2; > > + u16 pasid; > > + > > + switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { > > + case 146: > > + case 147: > > + pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; > > + if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) > > + return true; > > + break; > > + default: > > + /* Not a VM fault */ > > + return true; > > + } > > + > > + adev->irq.ih.rptr += 16; > > + return false; > > } > > > > /** > > diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > > index a3b30d8..6973257 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > > +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > > @@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct > amdgpu_device *adev) > > return true; > > } > > > > - /* Not a retry fault */ > > - if (!(dw5 & 0x80)) > > - return true; > > - > > pasid = dw3 & 0xffff; > > /* No PASID, can't identify faulting process */ > > if (!pasid) > > return true; > > > > + /* Not a retry fault, check fault credit */ > > + if (!(dw5 & 0x80)) { > > + if (!amdgpu_vm_pasid_fault_credit(adev, pasid)) > > + goto ignore_iv; > > + return true; > > + } > > + > > addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12); > > key = AMDGPU_VM_FAULT(pasid, addr); > > r = amdgpu_ih_add_fault(adev, key);