Hmm, that's a clever (and elegant) little data structure. The series is Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> Regards, Felix On 3/7/2019 8:28 AM, Christian König wrote: > Further testing showed that the idea with the chash doesn't work as expected. > Especially we can't predict when we can remove the entries from the hash again. > > So replace the chash with a ring buffer/hash mix where entries in the container > age automatically based on their timestamp. > > v2: use ring buffer / hash mix > > Signed-off-by: Christian König <christian.koenig@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 49 ++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++++++++++++++ > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 60 ++----------------------- > 3 files changed, 86 insertions(+), 57 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > index 5a32a0d2ad31..579cadd16886 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > @@ -240,3 +240,52 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc) > dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n", > mc->agp_size >> 20, mc->agp_start, mc->agp_end); > } > + > +/** > + * amdgpu_gmc_filter_faults - filter VM faults > + * > + * @adev: amdgpu device structure > + * @addr: address of the VM fault > + * @pasid: PASID of the process causing the fault > + * @timestamp: timestamp of the fault > + * > + * Returns: > + * True if the fault was filtered and should not be processed further. > + * False if the fault is a new one and needs to be handled. > + */ > +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, > + uint16_t pasid, uint64_t timestamp) > +{ > + struct amdgpu_gmc *gmc = &adev->gmc; > + > + uint64_t stamp, key = addr << 4 | pasid; > + struct amdgpu_gmc_fault *fault; > + uint32_t hash; > + > + /* If we don't have space left in the ring buffer return immediately */ > + stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) - > + AMDGPU_GMC_FAULT_TIMEOUT; > + if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) > + return true; > + > + /* Try to find the fault in the hash */ > + hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER); > + fault = &gmc->fault_ring[gmc->fault_hash[hash].idx]; > + do { > + if (fault->key == key) > + return true; > + > + stamp = fault->timestamp; > + fault = &gmc->fault_ring[fault->next]; > + } while (fault->timestamp < stamp); > + > + /* Add the fault to the ring */ > + fault = &gmc->fault_ring[gmc->last_fault]; > + fault->key = key; > + fault->timestamp = timestamp; > + > + /* And update the hash */ > + fault->next = gmc->fault_hash[hash].idx; > + gmc->fault_hash[hash].idx = gmc->last_fault++; > + return false; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h > index 6ce45664ff87..071145ac67b5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h > @@ -43,8 +43,34 @@ > */ > #define AMDGPU_GMC_HOLE_MASK 0x0000ffffffffffffULL > > +/* > + * Ring size as power of two for the log of recent faults. > + */ > +#define AMDGPU_GMC_FAULT_RING_ORDER 8 > +#define AMDGPU_GMC_FAULT_RING_SIZE (1 << AMDGPU_GMC_FAULT_RING_ORDER) > + > +/* > + * Hash size as power of two for the log of recent faults > + */ > +#define AMDGPU_GMC_FAULT_HASH_ORDER 8 > +#define AMDGPU_GMC_FAULT_HASH_SIZE (1 << AMDGPU_GMC_FAULT_HASH_ORDER) > + > +/* > + * Number of IH timestamp ticks until a fault is considered handled > + */ > +#define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL > + > struct firmware; > > +/* > + * GMC page fault information > + */ > +struct amdgpu_gmc_fault { > + uint64_t timestamp; > + uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER; > + uint64_t key:52; > +}; > + > /* > * VMHUB structures, functions & helpers > */ > @@ -141,6 +167,12 @@ struct amdgpu_gmc { > struct kfd_vm_fault_info *vm_fault_info; > atomic_t vm_fault_info_updated; > > + struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE]; > + struct { > + uint64_t idx:AMDGPU_GMC_FAULT_RING_ORDER; > + } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE]; > + uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER; > + > const struct amdgpu_gmc_funcs *gmc_funcs; > > struct amdgpu_xgmi xgmi; > @@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, > struct amdgpu_gmc *mc); > void amdgpu_gmc_agp_location(struct amdgpu_device *adev, > struct amdgpu_gmc *mc); > +bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, > + uint16_t pasid, uint64_t timestamp); > > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 84904bd680df..0c37c0afb1bd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -416,62 +416,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev, > return 0; > } > > -/** > - * vega10_ih_prescreen_iv - prescreen an interrupt vector > - * > - * @adev: amdgpu_device pointer > - * > - * Returns true if the interrupt vector should be further processed. > - */ > -static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev, > - struct amdgpu_iv_entry *entry, > - uint64_t addr) > -{ > - struct amdgpu_vm *vm; > - u64 key; > - int r; > - > - /* No PASID, can't identify faulting process */ > - if (!entry->pasid) > - return true; > - > - /* Not a retry fault */ > - if (!(entry->src_data[1] & 0x80)) > - return true; > - > - /* Track retry faults in per-VM fault FIFO. */ > - spin_lock(&adev->vm_manager.pasid_lock); > - vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid); > - if (!vm) { > - /* VM not found, process it normally */ > - spin_unlock(&adev->vm_manager.pasid_lock); > - return true; > - } > - > - key = AMDGPU_VM_FAULT(entry->pasid, addr); > - r = amdgpu_vm_add_fault(vm->fault_hash, key); > - > - /* Hash table is full or the fault is already being processed, > - * ignore further page faults > - */ > - if (r != 0) { > - spin_unlock(&adev->vm_manager.pasid_lock); > - return false; > - } > - /* No locking required with single writer and single reader */ > - r = kfifo_put(&vm->faults, key); > - if (!r) { > - /* FIFO is full. Ignore it until there is space */ > - amdgpu_vm_clear_fault(vm->fault_hash, key); > - spin_unlock(&adev->vm_manager.pasid_lock); > - return false; > - } > - > - spin_unlock(&adev->vm_manager.pasid_lock); > - /* It's the first fault for this address, process it normally */ > - return true; > -} > - > static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, > struct amdgpu_irq_src *source, > struct amdgpu_iv_entry *entry) > @@ -484,9 +428,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, > addr = (u64)entry->src_data[0] << 12; > addr |= ((u64)entry->src_data[1] & 0xf) << 44; > > - if (!gmc_v9_0_prescreen_iv(adev, entry, addr)) > + if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid, > + entry->timestamp)) > return 1; /* This also prevents sending it to KFD */ > > + /* If it's the first fault for this address, process it normally */ > if (!amdgpu_sriov_vf(adev)) { > status = RREG32(hub->vm_l2_pro_fault_status); > WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx