RE: [PATCH v2 06/07] drm/amdgpu: add option params to enforce process isolation between graphics and compute

"Yin, ZhenGuo (Chris)" <ZhenGuo.Yin@xxxxxxx> · Thu, 8 Jun 2023 02:58:12 +0000

[AMD Official Use Only - General]

Hi Felix

Yes. This patch is aiming to avoid concurrent running of kernel graphics queue and kernel compute queue.

Previously we might get wrong guilty job if a bad compute job and a good gfx job submitted concurrently.
Like Xorg might be crashed if we submitted a bad compute job in another process at the same time.

Advanced TDR can resolve this issue by the extra job resubmission to identify the real bad job.
After removing advanced TDR(commit 06a2d7cc3f0476be4682ef90eb09a28fa3daed37), we need another method to fix this issue.

As suggested from Christian, we can use the reserved vmid to prevent the concurrency of compute and gfx to enforce isolation.
Here we only focus on KGQ/KCQ, not considering KFD compute since it is using another set of VMID.
Christian, do you have any comments?

Best,
Zhenguo
-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
Sent: Wednesday, June 7, 2023 9:35 PM
To: Li, Chong(Alan) <Chong.Li@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Koenig, Christian <Christian.Koenig@xxxxxxx>; Yin, ZhenGuo (Chris) <ZhenGuo.Yin@xxxxxxx>
Subject: Re: [PATCH v2 06/07] drm/amdgpu: add option params to enforce process isolation between graphics and compute

I can't see the other patches in this series, so I'm missing some context. I don't understand what "process isolation between graphics and compute" means here. It seems to be unrelated to KFD compute. This patch seems to be mostly about handling of reserved VMIDs. Maybe you're trying to avoid running Vulcan graphics and compute concurrently? But this does not prevent concurrency with KFD compute.

Can you clarify what this is for?

Thanks,
   Felix

Am 2023-06-07 um 06:57 schrieb Chong Li:
> enforce process isolation between graphics and compute via using the same reserved vmid.
>
> v2: remove params "struct amdgpu_vm *vm" from
>      amdgpu_vmid_alloc_reserved and amdgpu_vmid_free_reserved.
>
> Signed-off-by: Chong Li <chongli2@xxxxxxx>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 17 +++++++----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h |  6 ++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 22 +++++++++++++++++-----
>   5 files changed, 36 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index ce196badf42d..ef098a7287d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -215,6 +215,7 @@ extern int amdgpu_force_asic_type;
>   extern int amdgpu_smartshift_bias;
>   extern int amdgpu_use_xgmi_p2p;
>   extern int amdgpu_mtype_local;
> +extern bool enforce_isolation;
>   #ifdef CONFIG_HSA_AMD
>   extern int sched_policy;
>   extern bool debug_evictions;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 3d91e123f9bd..fdb6fb8229ab 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -153,7 +153,7 @@ uint amdgpu_pg_mask = 0xffffffff;
>   uint amdgpu_sdma_phase_quantum = 32;
>   char *amdgpu_disable_cu;
>   char *amdgpu_virtual_display;
> -
> +bool enforce_isolation;
>   /*
>    * OverDrive(bit 14) disabled by default
>    * GFX DCS(bit 19) disabled by default @@ -973,6 +973,14 @@
> MODULE_PARM_DESC(
>                                               4 = AMDGPU_CPX_PARTITION_MODE)");
>   module_param_named(user_partt_mode, amdgpu_user_partt_mode, uint,
> 0444);
>
> +
> +/**
> + * DOC: enforce_isolation (bool)
> + * enforce process isolation between graphics and compute via using the same reserved vmid.
> + */
> +module_param(enforce_isolation, bool, 0444);
> +MODULE_PARM_DESC(enforce_isolation, "enforce process isolation
> +between graphics and compute . enforce_isolation = on");
> +
>   /* These devices are not supported by amdgpu.
>    * They are supported by the mach64, r128, radeon drivers
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index c991ca0b7a1c..ff1ea99292fb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -409,7 +409,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>       if (r || !idle)
>               goto error;
>
> -     if (vm->reserved_vmid[vmhub]) {
> +     if (vm->reserved_vmid[vmhub] || (enforce_isolation && (vmhub ==
> +AMDGPU_GFXHUB(0)))) {
>               r = amdgpu_vmid_grab_reserved(vm, ring, job, &id, fence);
>               if (r || !id)
>                       goto error;
> @@ -460,14 +460,11 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>   }
>
>   int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev,
> -                            struct amdgpu_vm *vm,
>                              unsigned vmhub)
>   {
>       struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>
>       mutex_lock(&id_mgr->lock);
> -     if (vm->reserved_vmid[vmhub])
> -             goto unlock;
>
>       ++id_mgr->reserved_use_count;
>       if (!id_mgr->reserved) {
> @@ -479,27 +476,23 @@ int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev,
>               list_del_init(&id->list);
>               id_mgr->reserved = id;
>       }
> -     vm->reserved_vmid[vmhub] = true;
>
> -unlock:
>       mutex_unlock(&id_mgr->lock);
>       return 0;
>   }
>
>   void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
> -                            struct amdgpu_vm *vm,
>                              unsigned vmhub)
>   {
>       struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>
>       mutex_lock(&id_mgr->lock);
> -     if (vm->reserved_vmid[vmhub] &&
> -         !--id_mgr->reserved_use_count) {
> +     if (!--id_mgr->reserved_use_count) {
>               /* give the reserved ID back to normal round robin */
>               list_add(&id_mgr->reserved->list, &id_mgr->ids_lru);
>               id_mgr->reserved = NULL;
>       }
> -     vm->reserved_vmid[vmhub] = false;
> +
>       mutex_unlock(&id_mgr->lock);
>   }
>
> @@ -578,6 +571,10 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev)
>                       list_add_tail(&id_mgr->ids[j].list, &id_mgr->ids_lru);
>               }
>       }
> +     /* alloc a default reserved vmid to enforce isolation */
> +     if (enforce_isolation)
> +             amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0));
> +
>   }
>
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> index d1cc09b45da4..68add23dc87c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> @@ -79,11 +79,9 @@ void amdgpu_pasid_free_delayed(struct dma_resv *resv,
>   bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
>                              struct amdgpu_vmid *id);
>   int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev,
> -                            struct amdgpu_vm *vm,
> -                            unsigned vmhub);
> +                                     unsigned vmhub);
>   void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
> -                            struct amdgpu_vm *vm,
> -                            unsigned vmhub);
> +                                     unsigned vmhub);
>   int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>                    struct amdgpu_job *job, struct dma_fence **fence);
>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ea3d0be152fc..73900ab545c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2358,8 +2358,14 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>       }
>
>       dma_fence_put(vm->last_update);
> -     for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
> -             amdgpu_vmid_free_reserved(adev, vm, i);
> +
> +     for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) {
> +             if (vm->reserved_vmid[i]) {
> +                     amdgpu_vmid_free_reserved(adev, i);
> +                     vm->reserved_vmid[i] = false;
> +             }
> +     }
> +
>   }
>
>   /**
> @@ -2447,13 +2453,19 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>       switch (args->in.op) {
>       case AMDGPU_VM_OP_RESERVE_VMID:
>               /* We only have requirement to reserve vmid from gfxhub */
> -             r = amdgpu_vmid_alloc_reserved(adev, &fpriv->vm,
> -                                            AMDGPU_GFXHUB(0));
> +             if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) {
> +                     r = amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0));
> +                     fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = true;
> +             }
> +
>               if (r)
>                       return r;
>               break;
>       case AMDGPU_VM_OP_UNRESERVE_VMID:
> -             amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB(0));
> +             if (fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) {
> +                     amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(0));
> +                     fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = false;
> +             }
>               break;
>       default:
>               return -EINVAL;