[AMD Official Use Only - Internal Distribution Only]
I think each kiq operation will call ring_alloc for the package space, so why not just check whether this allocation succeeds or not?
Shaoyun.liu
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
Yintian Tao
Sent: Wednesday, April 22, 2020 10:50 AM
To: Koenig, Christian <Christian.Koenig@xxxxxxx>; Liu, Monk
<Monk.Liu@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx>
Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Tao, Yintian <Yintian.Tao@xxxxxxx>
Subject: [PATCH] drm/amdgpu: protect kiq overrun
Wait for the oldest sequence on the kiq ring to be signaled in order to make sure there will be no kiq overrun.
v2: remove the unused variable and correct the
kiq max_sub_num value
Signed-off-by: Yintian Tao <yttao@xxxxxxx>
---
.../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 6 ++++
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 30 +++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 3 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 ++++
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 7 +++++
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 7 +++++
8 files changed, 71 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..fac8b9713dfc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -325,6 +325,12 @@ static int kgd_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
mec, pipe, queue_id);
spin_lock(&adev->gfx.kiq.ring_lock);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ pr_err("critical bug! too many kiq submission\n");
+ goto out_unlock;
+ }
+
r = amdgpu_ring_alloc(kiq_ring, 7);
if (r) {
pr_err("Failed to alloc KIQ (%d).\n", r); diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..fd42c126510f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -323,6 +323,12 @@ int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
mec, pipe, queue_id);
spin_lock(&adev->gfx.kiq.ring_lock);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ pr_err("critical bug! too many kiq submissions\n");
+ goto out_unlock;
+ }
+
r = amdgpu_ring_alloc(kiq_ring, 7);
if (r) {
pr_err("Failed to alloc KIQ (%d).\n", r); diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a721b0e0ff69..84e66c45df37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -321,6 +321,9 @@ int amdgpu_gfx_kiq_init_ring(struct amdgpu_device *adev,
AMDGPU_RING_PRIO_DEFAULT);
if (r)
dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
+ else
+ kiq->max_sub_num = (ring->ring_size / 4) /
+ (ring->funcs->align_mask + 1);
return r;
}
@@ -663,6 +666,21 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
return 0;
}
+int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq) {
+ uint32_t seq = 0;
+ signed long r = 0;
+
+ seq = abs(kiq->ring.fence_drv.sync_seq - kiq->max_sub_num);
+ if (seq > kiq->max_sub_num) {
+ r = amdgpu_fence_wait_polling(&kiq->ring, seq,
+ MAX_KIQ_REG_WAIT);
+ return r < 1 ? -ETIME : 0;
+ }
+
+ return 0;
+}
+
uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) {
signed long r, cnt = 0;
@@ -674,6 +692,12 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
BUG_ON(!ring->funcs->emit_rreg);
spin_lock_irqsave(&kiq->ring_lock, flags);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock_irqrestore(&kiq->ring_lock, flags);
+ goto failed_kiq_read;
+ }
+
if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
spin_unlock_irqrestore(&kiq->ring_lock, flags);
pr_err("critical bug! too many kiq readers\n"); @@ -728,6 +752,12 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
BUG_ON(!ring->funcs->emit_wreg);
spin_lock_irqsave(&kiq->ring_lock, flags);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock_irqrestore(&kiq->ring_lock, flags);
+ goto failed_kiq_write;
+ }
+
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_wreg(ring, reg, v);
amdgpu_fence_emit_polling(ring, &seq); diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index ee698f0246d8..1ee59a927bd9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -103,6 +103,7 @@ struct amdgpu_kiq {
struct amdgpu_ring ring;
struct amdgpu_irq_src irq;
const struct kiq_pm4_funcs *pmf;
+ uint32_t max_sub_num;
};
/*
@@ -387,4 +388,6 @@ int amdgpu_gfx_cp_ecc_error_irq(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry);
uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg,
uint32_t v);
+
+int amdgpu_gfx_kiq_is_avail(struct amdgpu_kiq *kiq);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 8c10084f44ef..4b027006d072 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -57,6 +57,12 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev,
uint32_t seq;
spin_lock_irqsave(&kiq->ring_lock, flags);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock_irqrestore(&kiq->ring_lock, flags);
+ goto failed_kiq;
+ }
+
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
ref, mask);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5b1549f167b0..a136e2229f7a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4051,6 +4051,12 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
BUG_ON(!ring->funcs->emit_rreg);
spin_lock_irqsave(&kiq->ring_lock, flags);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock_irqrestore(&kiq->ring_lock, flags);
+ goto failed_kiq_read;
+ }
+
if (amdgpu_device_wb_get(adev, &reg_val_offs)) {
spin_unlock_irqrestore(&kiq->ring_lock, flags);
pr_err("critical bug! too many kiq readers\n"); diff --git
a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 30b75d79efdb..77d8bc9c0111 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -423,6 +423,13 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct
amdgpu_device *adev,
if (amdgpu_emu_mode == 0 && ring->sched.ready) {
spin_lock(&adev->gfx.kiq.ring_lock);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock(&kiq->ring_lock);
+ DRM_ERROR("too many kiq submissions\n");
+ return -ETIME;
+ }
+
/* 2 dwords flush + 8 dwords fence */
amdgpu_ring_alloc(ring, kiq->pmf->invalidate_tlbs_size + 8);
kiq->pmf->kiq_invalidate_tlbs(ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index fecdbc471983..c429a2a5fe3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -614,6 +614,13 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
ndw += kiq->pmf->invalidate_tlbs_size;
spin_lock(&adev->gfx.kiq.ring_lock);
+ r = amdgpu_gfx_kiq_is_avail(&adev->gfx.kiq);
+ if (r) {
+ spin_unlock(&kiq->ring_lock);
+ DRM_ERROR("critical bug! too many kiq submissions\n");
+ return -ETIME;
+ }
+
/* 2 dwords flush + 8 dwords fence */
amdgpu_ring_alloc(ring, ndw);
if (vega20_xgmi_wa)
--
2.17.1
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7CSh
aoyun.Liu%40amd.com%7C0cd26307c0a149ebe0bd08d7e6cc84b1%7C3dd8961fe4884
e608e11a82d994e183d%7C0%7C0%7C637231638419034830&sdata=rxkhyMutFwb
e5Nw%2BeBGiESW9wTdflDUo%2F4xEvbCbR6U%3D&reserved=0