Am 2022-03-17 um 15:37 schrieb Tushar Patel:
---
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
drivers/gpu/drm/amd/amdkfd/kfd_device.c | 21 ++++++++++++---------
2 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4c20c23d6ba0..bda1b5132ee8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -680,7 +680,7 @@ MODULE_PARM_DESC(sched_policy,
* Maximum number of processes that HWS can schedule concurrently. The maximum is the
* number of VMIDs assigned to the HWS, which is also the default.
*/
-int hws_max_conc_proc = 8;
+int hws_max_conc_proc = -1;
module_param(hws_max_conc_proc, int, 0444);
MODULE_PARM_DESC(hws_max_conc_proc,
"Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 339e12c94cff..39073f72fe5f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -483,15 +483,18 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
}
/* Verify module parameters regarding mapped process number*/
- if ((hws_max_conc_proc < 0)
- || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
- dev_err(kfd_device,
- "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
- hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
- kfd->vm_info.vmid_num_kfd);
- kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
- } else
- kfd->max_proc_per_quantum = hws_max_conc_proc;
+ kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
+ if (hws_max_conc_proc != -1) {
+ if ((hws_max_conc_proc > 0)
+ && (hws_max_conc_proc < kfd->vm_info.vmid_num_kfd)) {
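I think this should be <= kfd->vm_info.vmid_num_kfd. With <, a value equal
to the number of VMIDs (which is the documented maximum) would fall into
the error path below.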
+ kfd->max_proc_per_quantum = hws_max_conc_proc;
+ } else {
+ dev_err(kfd_device,
+ "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
+ hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
+ kfd->vm_info.vmid_num_kfd);
I think this error message is the wrong approach. hws_max_conc_proc is a
global setting that affects all GPUs. Different GPUs may have different
numbers of VMIDs. So we can't treat (hws_max_conc_proc >
kfd->vm_info.vmid_num_kfd) as an error. It may be an error on one GPU
but perfectly fine on another.
I think you can simplify this if-else as follows and get rid of the dev_err:
kfd->max_proc_per_quantum = min(hws_max_conc_proc,
kfd->vm_info.vmid_num_kfd);
Regards,
Felix
+ }
+ }
/* calculate max size of mqds needed for queues */
size = max_num_of_queues_per_device *