Am 2021-11-24 um 5:58 p.m. schrieb Philip Yang: > IH ring1 is used to process GPU retry fault, overflow is enabled to > drain retry fault because we want receive other interrupts while > handling retry fault to recover range. There is no overflow flag set > when wptr pass rptr. Use timestamp of rptr and wptr to handle overflow > and drain retry fault. > > Add amdgpu_ih_function interface decode_iv_ts for different chips to get > timestamp from IV entry with different iv size and timestamp offset. > amdgpu_ih_decode_iv_ts_helper is used for vega10, vega20, navi10. > > Drain retry fault is done if processed_timestamp is equal to or larger > than checkpoint timestamp. Page fault handler skips retry fault entry if > entry timestamp goes backward. > > Signed-off-by: Philip Yang <Philip.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 58 +++++++++++++------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h | 16 ++++++- > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 5 +++ > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 5 +++ > drivers/gpu/drm/amd/amdgpu/navi10_ih.c | 1 + > drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 1 + > drivers/gpu/drm/amd/amdgpu/vega20_ih.c | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 2 +- > 8 files changed, 56 insertions(+), 33 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c > index 0c7963dfacad..3e043acaab82 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c > @@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv, > } > } > > -/* Waiter helper that checks current rptr matches or passes checkpoint wptr */ > -static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev, > - struct amdgpu_ih_ring *ih, > - uint32_t checkpoint_wptr, > - uint32_t *prev_rptr) > -{ > - uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask); > - > - /* rptr has wrapped. */ > - if (cur_rptr < *prev_rptr) > - cur_rptr += ih->ptr_mask + 1; > - *prev_rptr = cur_rptr; > - > - /* check ring is empty to workaround missing wptr overflow flag */ > - return cur_rptr >= checkpoint_wptr || > - (cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih); > -} > - > /** > - * amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint > + * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint > * > * @adev: amdgpu_device pointer > * @ih: ih ring to process > * > * Used to ensure ring has processed IVs up to the checkpoint write pointer. > */ > -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, > +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev, > struct amdgpu_ih_ring *ih) > { > - uint32_t checkpoint_wptr, rptr; > + uint32_t checkpoint_wptr; > + uint64_t checkpoint_ts; > + long timeout = HZ; > > if (!ih->enabled || adev->shutdown) > return -ENODEV; > > checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); > - /* Order wptr with rptr. */ > + /* Order wptr with ring data. */ > rmb(); > - rptr = READ_ONCE(ih->rptr); > - > - /* wptr has wrapped. */ > - if (rptr > checkpoint_wptr) > - checkpoint_wptr += ih->ptr_mask + 1; > + checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1); > > - return wait_event_interruptible(ih->wait_process, > - amdgpu_ih_has_checkpoint_processed(adev, ih, > - checkpoint_wptr, &rptr)); > + return wait_event_interruptible_timeout(ih->wait_process, > + !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts), > + timeout); > } > > /** > @@ -298,4 +278,22 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev, > > /* wptr/rptr are in bytes! */ > ih->rptr += 32; > + if (ih == &adev->irq.ih1 && > + amdgpu_ih_ts_after(ih->processed_timestamp, entry->timestamp)) > + ih->processed_timestamp = entry->timestamp; I'd call this "latest_decoded_timestamp". At this point it hasn't been processed yet. Also, I think it would be safe and cheap enough to do this on all IH rings, in case someone finds it useful for something else, e.g. using amdgpu_ih_wait_on_checkpoint_process_ts on IH ring 0. > +} > + > +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr, > + signed int offset) > +{ > + uint32_t iv_size = 32; > + uint32_t dw1, dw2; > + uint32_t index; > + > + rptr += iv_size * offset; > + index = (rptr & ih->ptr_mask) >> 2; > + > + dw1 = le32_to_cpu(ih->ring[index + 1]); > + dw2 = le32_to_cpu(ih->ring[index + 2]); > + return dw1 | ((u64)(dw2 & 0xffff) << 32); > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h > index 0649b59830a5..dd1c2eded6b9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h > @@ -68,20 +68,30 @@ struct amdgpu_ih_ring { > > /* For waiting on IH processing at checkpoint. */ > wait_queue_head_t wait_process; > + uint64_t processed_timestamp; > }; > > +/* return true if time stamp t2 is after t1 with 48bit wrap around */ > +#define amdgpu_ih_ts_after(t1, t2) \ > + (((int64_t)((t2) << 16) - (int64_t)((t1) << 16)) > 0LL) > + > /* provided by the ih block */ > struct amdgpu_ih_funcs { > /* ring read/write ptr handling, called from interrupt context */ > u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); > void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, > struct amdgpu_iv_entry *entry); > + uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr, > + signed int offset); > void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); > }; > > #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih)) > #define amdgpu_ih_decode_iv(adev, iv) \ > (adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv)) > +#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \ > + (WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \ > + (adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset))) > #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih)) > > int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, > @@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, > void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); > void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv, > unsigned int num_dw); > -int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, > - struct amdgpu_ih_ring *ih); > +int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev, > + struct amdgpu_ih_ring *ih); > int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); > void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev, > struct amdgpu_ih_ring *ih, > struct amdgpu_iv_entry *entry); > +uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr, > + signed int offset); > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > index 3ec5ff5a6dbe..b129898db433 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c > @@ -119,6 +119,11 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, > return 1; > } > > + /* Stale retry fault if timestamp goes backward */ > + if (entry->ih == &adev->irq.ih1 && > + amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp)) > + return 1; > + This check should go before amdgpu_gmc_filter_faults. Otherwise amdgpu_gmc_filter_faults may later drop a real fault because it added a stale fault in its hash table. > /* Try to handle the recoverable page faults by filling page > * tables > */ > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index cb82404df534..c0d9b254bbb5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -535,6 +535,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, > return 1; > } > > + /* Stale retry fault if timestamp goes backward */ > + if (entry->ih == &adev->irq.ih1 && > + amdgpu_ih_ts_after(entry->timestamp, entry->ih->processed_timestamp)) > + return 1; > + Same as above. Regards, Felix > /* Try to handle the recoverable page faults by filling page > * tables > */ > diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c > index 38241cf0e1f1..8ce5b8ca1fd7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c > @@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = { > static const struct amdgpu_ih_funcs navi10_ih_funcs = { > .get_wptr = navi10_ih_get_wptr, > .decode_iv = amdgpu_ih_decode_iv_helper, > + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, > .set_rptr = navi10_ih_set_rptr > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > index a9ca6988009e..3070466f54e1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c > @@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = { > static const struct amdgpu_ih_funcs vega10_ih_funcs = { > .get_wptr = vega10_ih_get_wptr, > .decode_iv = amdgpu_ih_decode_iv_helper, > + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, > .set_rptr = vega10_ih_set_rptr > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c > index f51dfc38ac65..3b4eb8285943 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c > +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c > @@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = { > static const struct amdgpu_ih_funcs vega20_ih_funcs = { > .get_wptr = vega20_ih_get_wptr, > .decode_iv = amdgpu_ih_decode_iv_helper, > + .decode_iv_ts = amdgpu_ih_decode_iv_ts_helper, > .set_rptr = vega20_ih_set_rptr > }; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > index 10868d5b549f..663489ae56d7 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > @@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) > > pr_debug("drain retry fault gpu %d svms %p\n", i, svms); > > - amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev, > + amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev, > &pdd->dev->adev->irq.ih1); > pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); > }