Re: FW: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

Christian König <christian.koenig@xxxxxxx> · Tue, 28 Jul 2020 11:04:26 +0200

The patch looks totally mangled to me, e.g. some spaces and new lines 
are missing.

Probably because it was forwarded.

Christian.

Am 28.07.20 um 10:59 schrieb Liu, Monk:
[AMD Official Use Only - Internal Distribution Only]

-----Original Message-----
From: Monk Liu <Monk.Liu@xxxxxxx>
Sent: Tuesday, July 28, 2020 2:59 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Liu, Monk <Monk.Liu@xxxxxxx>
Subject: [PATCH] drm/amdgpu: introduce a new parameter to configure how many KCQ we want(v3)

what:
saving and restoring the MQDs of the KCQs (kernel compute queues) costs a lot of clock cycles during a world switch, which significantly hurts multi-VF performance

how:
introduce a parameter to control the number of KCQs, to avoid the performance drop when no kernel compute queue is needed

notes:
this parameter only affects gfx 8/9/10

v2:
refine namings

v3:
choose the queue for each ring so that the queues are spread across the pipes as evenly as possible.

TODO:
in the future we will let the hypervisor driver set this parameter automatically, so that the user no longer needs to configure it through modprobe in the virtual machine

Signed-off-by: Monk Liu <Monk.Liu@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 58 +++++++++++++++---------------
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     | 30 ++++++++--------
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c      | 29 +++++++--------
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 31 ++++++++--------
  7 files changed, 87 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e97c088..de11136 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -201,6 +201,7 @@ extern int amdgpu_si_support;  #ifdef CONFIG_DRM_AMDGPU_CIK  extern int amdgpu_cik_support;  #endif
+extern int amdgpu_num_kcq;

  #define AMDGPU_VM_MAX_NUM_CTX4096
  #define AMDGPU_SG_THRESHOLD(256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac9..cf445bab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1199,6 +1199,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)

  amdgpu_gmc_tmz_set(adev);

+if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
+amdgpu_num_kcq = 8;
+dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid paramter provided by user\n");
+}
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 6291f5f..b545c40 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -150,6 +150,7 @@ int amdgpu_noretry;
  int amdgpu_force_asic_type = -1;
  int amdgpu_tmz = 0;
  int amdgpu_reset_method = -1; /* auto */
+int amdgpu_num_kcq = -1;

  struct amdgpu_mgpu_info mgpu_info = {
  .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@ -765,6 +766,9 @@ module_param_named(tmz, amdgpu_tmz, int, 0444);  MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");  module_param_named(reset_method, amdgpu_reset_method, int, 0444);

+MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to
+setup (8 if set to greater than 8 or less than 0, only affect gfx
+8+)"); module_param_named(num_kcq, amdgpu_num_kcq, int, 0444);
+
  static const struct pci_device_id pciidlist[] = {  #ifdef  CONFIG_DRM_AMDGPU_SI
  {0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI}, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 8eff017..f83a9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -202,40 +202,42 @@ bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,

  void amdgpu_gfx_compute_queue_acquire(struct amdgpu_device *adev)  {
-int i, queue, pipe, mec;
+int i, queue, pipe;
  bool multipipe_policy = amdgpu_gfx_is_multipipe_capable(adev);
+int max_queues_per_mec = min(adev->gfx.mec.num_pipe_per_mec *
+ adev->gfx.mec.num_queue_per_pipe,
+ adev->gfx.num_compute_rings);
+
+if (multipipe_policy) {
+/* policy: make queues evenly cross all pipes on MEC1 only */
+for (i = 0; i < max_queues_per_mec; i++) {
+pipe = i % adev->gfx.mec.num_pipe_per_mec;
+queue = (i / adev->gfx.mec.num_pipe_per_mec) %
+adev->gfx.mec.num_queue_per_pipe;
+
+set_bit(pipe * adev->gfx.mec.num_queue_per_pipe + queue,
+adev->gfx.mec.queue_bitmap);
+}
+} else {
+int mec;

-/* policy for amdgpu compute queue ownership */
-for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
-queue = i % adev->gfx.mec.num_queue_per_pipe;
-pipe = (i / adev->gfx.mec.num_queue_per_pipe)
-% adev->gfx.mec.num_pipe_per_mec;
-mec = (i / adev->gfx.mec.num_queue_per_pipe)
-/ adev->gfx.mec.num_pipe_per_mec;
-
-/* we've run out of HW */
-if (mec >= adev->gfx.mec.num_mec)
-break;
+/* policy: amdgpu owns all queues in the given pipe */
+for (i = 0; i < adev->gfx.num_compute_rings; ++i) {
+queue = i % adev->gfx.mec.num_queue_per_pipe;
+pipe = (i / adev->gfx.mec.num_queue_per_pipe)
+% adev->gfx.mec.num_pipe_per_mec;
+mec = (i / adev->gfx.mec.num_queue_per_pipe)
+/ adev->gfx.mec.num_pipe_per_mec;

-if (multipipe_policy) {
-/* policy: amdgpu owns the first two queues of the first MEC */
-if (mec == 0 && queue < 2)
-set_bit(i, adev->gfx.mec.queue_bitmap);
-} else {
-/* policy: amdgpu owns all queues in the first pipe */
-if (mec == 0 && pipe == 0)
-set_bit(i, adev->gfx.mec.queue_bitmap);
+/* we've run out of HW */
+if (mec >= adev->gfx.mec.num_mec)
+break;
+
+set_bit(i, adev->gfx.mec.queue_bitmap);
  }
  }

-/* update the number of active compute rings */
-adev->gfx.num_compute_rings =
-bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
-
-/* If you hit this case and edited the policy, you probably just
- * need to increase AMDGPU_MAX_COMPUTE_RINGS */
-if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
-adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+dev_info(adev->dev, "mec queue bitmap weight=%d\n",
+bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES));
  }

  void amdgpu_gfx_graphics_queue_acquire(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index db9f1e8..3a93b3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4022,21 +4022,23 @@ static int gfx_v10_0_mec_init(struct amdgpu_device *adev)
  amdgpu_gfx_compute_queue_acquire(adev);
  mec_hpd_size = adev->gfx.num_compute_rings * GFX10_MEC_HPD_SIZE;

-r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
-      AMDGPU_GEM_DOMAIN_GTT,
-      &adev->gfx.mec.hpd_eop_obj,
-      &adev->gfx.mec.hpd_eop_gpu_addr,
-      (void **)&hpd);
-if (r) {
-dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
-gfx_v10_0_mec_fini(adev);
-return r;
-}
+if (mec_hpd_size) {
+r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_GTT,
+  &adev->gfx.mec.hpd_eop_obj,
+  &adev->gfx.mec.hpd_eop_gpu_addr,
+  (void **)&hpd);
+if (r) {
+dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
+gfx_v10_0_mec_fini(adev);
+return r;
+}

-memset(hpd, 0, mec_hpd_size);
+memset(hpd, 0, mec_hpd_size);

-amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
-amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+}

  if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
  mec_hdr = (const struct gfx_firmware_header_v1_0 *)adev->gfx.mec_fw->data; @@ -7159,7 +7161,7 @@ static int gfx_v10_0_early_init(void *handle)
  break;
  }

-adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+adev->gfx.num_compute_rings = amdgpu_num_kcq;

  gfx_v10_0_set_kiq_pm4_funcs(adev);
  gfx_v10_0_set_ring_funcs(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8d72089..eb4b812 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1343,21 +1343,22 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
  amdgpu_gfx_compute_queue_acquire(adev);

  mec_hpd_size = adev->gfx.num_compute_rings * GFX8_MEC_HPD_SIZE;
+if (mec_hpd_size) {
+r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &adev->gfx.mec.hpd_eop_obj,
+  &adev->gfx.mec.hpd_eop_gpu_addr,
+  (void **)&hpd);
+if (r) {
+dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
+return r;
+}

-r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
-      AMDGPU_GEM_DOMAIN_VRAM,
-      &adev->gfx.mec.hpd_eop_obj,
-      &adev->gfx.mec.hpd_eop_gpu_addr,
-      (void **)&hpd);
-if (r) {
-dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
-return r;
-}
-
-memset(hpd, 0, mec_hpd_size);
+memset(hpd, 0, mec_hpd_size);

-amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
-amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+}

  return 0;
  }
@@ -5294,7 +5295,7 @@ static int gfx_v8_0_early_init(void *handle)
  struct amdgpu_device *adev = (struct amdgpu_device *)handle;

  adev->gfx.num_gfx_rings = GFX8_NUM_GFX_RINGS;
-adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+adev->gfx.num_compute_rings = amdgpu_num_kcq;
  adev->gfx.funcs = &gfx_v8_0_gfx_funcs;
  gfx_v8_0_set_ring_funcs(adev);
  gfx_v8_0_set_irq_funcs(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e4e751f..43ad044 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1938,22 +1938,23 @@ static int gfx_v9_0_mec_init(struct amdgpu_device *adev)
  /* take ownership of the relevant compute queues */
  amdgpu_gfx_compute_queue_acquire(adev);
  mec_hpd_size = adev->gfx.num_compute_rings * GFX9_MEC_HPD_SIZE;
+if (mec_hpd_size) {
+r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  &adev->gfx.mec.hpd_eop_obj,
+  &adev->gfx.mec.hpd_eop_gpu_addr,
+  (void **)&hpd);
+if (r) {
+dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
+gfx_v9_0_mec_fini(adev);
+return r;
+}

-r = amdgpu_bo_create_reserved(adev, mec_hpd_size, PAGE_SIZE,
-      AMDGPU_GEM_DOMAIN_VRAM,
-      &adev->gfx.mec.hpd_eop_obj,
-      &adev->gfx.mec.hpd_eop_gpu_addr,
-      (void **)&hpd);
-if (r) {
-dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
-gfx_v9_0_mec_fini(adev);
-return r;
-}
-
-memset(hpd, 0, mec_hpd_size);
+memset(hpd, 0, mec_hpd_size);

-amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
-amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
+amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
+}

  mec_hdr = (const struct gfx_firmware_header_v1_0 *)adev->gfx.mec_fw->data;

@@ -4625,7 +4626,7 @@ static int gfx_v9_0_early_init(void *handle)
  adev->gfx.num_gfx_rings = 0;
  else
  adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
-adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
+adev->gfx.num_compute_rings = amdgpu_num_kcq;
  gfx_v9_0_set_kiq_pm4_funcs(adev);
  gfx_v9_0_set_ring_funcs(adev);
  gfx_v9_0_set_irq_funcs(adev);
--
2.7.4


_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx