[PATCH] drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)

maraeo@xxxxxxxxx (Marek Olšák) · Wed, 24 Aug 2016 15:20:01 +0200

From: Marek OlÅ¡Ã¡k <marek.olsak@xxxxxxx>

The old mechanism used a per-submission limit that didn't take previous
submissions within the same time frame into account. It also filled VRAM
slowly when VRAM usage dropped due to a big eviction or buffer deallocation.

This new method establishes a configurable MBps limit that is obeyed when
VRAM usage is very high. When VRAM usage is not very high, it gives
the driver the freedom to fill it quickly. The result is more consistent
performance.

It can't keep the BO move rate low if lots of evictions are happening due
to VRAM fragmentation, or if a big buffer is being migrated.

The amdgpu.moverate parameter can be used to set a non-default limit.
Measurements can be done to find out which amdgpu.moverate setting gives
the best results.

Mainly APUs and cards with small VRAM will benefit from this. For F1 2015,
anything with 2 GB VRAM or less will benefit.

Some benchmark results - F1 2015 (Tonga 2GB):

Limit      MinFPS AvgFPS
Old code:  14     32.6
128 MB/s:  28     41
64 MB/s:   15.5   43
32 MB/s:   28.7   43.4
8 MB/s:    27.8   44.4
8 MB/s:    21.9   42.8 (different run)

Random drops in Min FPS can still occur (due to fragmented VRAM?), but
the average FPS is much better. 8 MB/s is probably a good limit for this
game & the current VRAM management. The random FPS drops are still to be
tackled.

v2: use a spinlock

Signed-off-by: Marek OlÅ¡Ã¡k <marek.olsak at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   9 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 152 ++++++++++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |   4 +
 4 files changed, 127 insertions(+), 48 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 916da80..5753f0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -58,20 +58,21 @@
 #include "amd_powerplay.h"
 
 #include "gpu_scheduler.h"
 
 /*
  * Modules parameters.
  */
 extern int amdgpu_modeset;
 extern int amdgpu_vram_limit;
 extern int amdgpu_gart_size;
+extern int amdgpu_moverate;
 extern int amdgpu_benchmarking;
 extern int amdgpu_testing;
 extern int amdgpu_audio;
 extern int amdgpu_disp_priority;
 extern int amdgpu_hw_i2c;
 extern int amdgpu_pcie_gen2;
 extern int amdgpu_msi;
 extern int amdgpu_lockup_timeout;
 extern int amdgpu_dpm;
 extern int amdgpu_smc_load_fw;
@@ -2029,20 +2030,28 @@ struct amdgpu_device {
 	struct amdgpu_mman		mman;
 	struct amdgpu_vram_scratch	vram_scratch;
 	struct amdgpu_wb		wb;
 	atomic64_t			vram_usage;
 	atomic64_t			vram_vis_usage;
 	atomic64_t			gtt_usage;
 	atomic64_t			num_bytes_moved;
 	atomic64_t			num_evictions;
 	atomic_t			gpu_reset_counter;
 
+	/* data for buffer migration throttling */
+	struct {
+		spinlock_t		lock;
+		s64			last_update_us;
+		s64			accum_us; /* accumulated microseconds */
+		u32			log2_max_MBps;
+	} mm_stats;
+
 	/* display */
 	bool				enable_virtual_display;
 	struct amdgpu_mode_info		mode_info;
 	/* For pre-DCE11. DCE11 and later are in "struct amdgpu_device->dm" */
 	struct work_struct		hotplug_work;
 	struct amdgpu_irq_src		crtc_irq;
 	struct amdgpu_irq_src		pageflip_irq;
 	struct amdgpu_irq_src		hpd_irq;
 
 	/* rings */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d5d61a7..b0e11ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -229,91 +229,145 @@ free_partial_kdata:
 		drm_free_large(p->chunks[i].kdata);
 	kfree(p->chunks);
 put_ctx:
 	amdgpu_ctx_put(p->ctx);
 free_chunk:
 	kfree(chunk_array);
 
 	return ret;
 }
 
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+	if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	/* Since accum_us is incremented by a million per second, just
+	 * multiply it by the number of MB/s to get the number of bytes.
+	 */
+	return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
  */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-	u64 real_vram_size = adev->mc.real_vram_size;
-	u64 vram_usage = atomic64_read(&adev->vram_usage);
+	s64 time_us, increment_us;
+	u64 max_bytes;
+	u64 free_vram, total_vram, used_vram;
 
-	/* This function is based on the current VRAM usage.
+	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
+	 * throttling.
 	 *
-	 * - If all of VRAM is free, allow relocating the number of bytes that
-	 *   is equal to 1/4 of the size of VRAM for this IB.
+	 * It means that in order to get full max MBps, at least 5 IBs per
+	 * second must be submitted and not more than 200ms apart from each
+	 * other.
+	 */
+	const s64 us_upper_bound = 200000;
 
-	 * - If more than one half of VRAM is occupied, only allow relocating
-	 *   1 MB of data for this IB.
-	 *
-	 * - From 0 to one half of used VRAM, the threshold decreases
-	 *   linearly.
-	 *         __________________
-	 * 1/4 of -|\               |
-	 * VRAM    | \              |
-	 *         |  \             |
-	 *         |   \            |
-	 *         |    \           |
-	 *         |     \          |
-	 *         |      \         |
-	 *         |       \________|1 MB
-	 *         |----------------|
-	 *    VRAM 0 %             100 %
-	 *         used            used
-	 *
-	 * Note: It's a threshold, not a limit. The threshold must be crossed
-	 * for buffer relocations to stop, so any buffer of an arbitrary size
-	 * can be moved as long as the threshold isn't crossed before
-	 * the relocation takes place. We don't want to disable buffer
-	 * relocations completely.
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+	used_vram = atomic64_read(&adev->vram_usage);
+	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+	spin_lock(&adev->mm_stats.lock);
+
+	/* Increase the amount of accumulated us. */
+	time_us = ktime_to_us(ktime_get());
+	increment_us = time_us - adev->mm_stats.last_update_us;
+	adev->mm_stats.last_update_us = time_us;
+	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+                                      us_upper_bound);
+
+	/* This prevents the short period of low performance when the VRAM
+	 * usage is low and the driver is in debt or doesn't have enough
+	 * accumulated us to fill VRAM quickly.
 	 *
-	 * The idea is that buffers should be placed in VRAM at creation time
-	 * and TTM should only do a minimum number of relocations during
-	 * command submission. In practice, you need to submit at least
-	 * a dozen IBs to move all buffers to VRAM if they are in GTT.
+	 * The situation can occur in these cases:
+	 * - a lot of VRAM is freed by userspace
+	 * - the presence of a big buffer causes a lot of evictions
+	 *   (solution: split buffers into smaller ones)
 	 *
-	 * Also, things can get pretty crazy under memory pressure and actual
-	 * VRAM usage can change a lot, so playing safe even at 50% does
-	 * consistently increase performance.
+	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+	 * accum_us to a positive number.
 	 */
+	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+		s64 min_us;
+
+		/* Be more aggresive on dGPUs. Try to fill a portion of free
+		 * VRAM now.
+		 */
+		if (!(adev->flags & AMD_IS_APU))
+			min_us = bytes_to_us(adev, free_vram / 4);
+		else
+			min_us = 0; /* Reset accum_us on APUs. */
+
+		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+	}
 
-	u64 half_vram = real_vram_size >> 1;
-	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-	u64 bytes_moved_threshold = half_free_vram >> 1;
-	return max(bytes_moved_threshold, 1024*1024ull);
+	/* This returns 0 if the driver is in debt to disallow (optional)
+	 * buffer moves.
+	 */
+	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+	spin_unlock(&adev->mm_stats.lock);
+	return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+					 u64 num_bytes)
+{
+	spin_lock(&adev->mm_stats.lock);
+	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+	spin_unlock(&adev->mm_stats.lock);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
 				 struct amdgpu_bo *bo)
 {
 	u64 initial_bytes_moved;
 	uint32_t domain;
 	int r;
 
 	if (bo->pin_count)
 		return 0;
 
-	/* Avoid moving this one if we have moved too many buffers
-	 * for this IB already.
-	 *
-	 * Note that this allows moving at least one buffer of
-	 * any size, because it doesn't take the current "bo"
-	 * into account. We don't want to disallow buffer moves
-	 * completely.
+	/* Don't move this buffer if we have depleted our allowance
+	 * to move it. Don't move anything if the threshold is zero.
 	 */
-	if (p->bytes_moved <= p->bytes_moved_threshold)
+	if (p->bytes_moved < p->bytes_moved_threshold)
 		domain = bo->prefered_domains;
 	else
 		domain = bo->allowed_domains;
 
 retry:
 	amdgpu_ttm_placement_from_domain(bo, domain);
 	initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved);
 	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
 	p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) -
 		initial_bytes_moved;
@@ -488,20 +542,22 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		DRM_ERROR("amdgpu_cs_list_validate(duplicates) failed.\n");
 		goto error_validate;
 	}
 
 	r = amdgpu_cs_list_validate(p, &p->validated);
 	if (r) {
 		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
 		goto error_validate;
 	}
 
+	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
 	fpriv->vm.last_eviction_counter =
 		atomic64_read(&p->adev->num_evictions);
 
 	if (p->bo_list) {
 		struct amdgpu_bo *gds = p->bo_list->gds_obj;
 		struct amdgpu_bo *gws = p->bo_list->gws_obj;
 		struct amdgpu_bo *oa = p->bo_list->oa_obj;
 		struct amdgpu_vm *vm = &fpriv->vm;
 		unsigned i;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2608138..8877291 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1521,20 +1521,21 @@ static bool amdgpu_device_is_virtual(void)
  * Returns 0 for success or an error on failure.
  * Called at driver startup.
  */
 int amdgpu_device_init(struct amdgpu_device *adev,
 		       struct drm_device *ddev,
 		       struct pci_dev *pdev,
 		       uint32_t flags)
 {
 	int r, i;
 	bool runtime = false;
+	u32 max_MBps;
 
 	adev->shutdown = false;
 	adev->dev = &pdev->dev;
 	adev->ddev = ddev;
 	adev->pdev = pdev;
 	adev->flags = flags;
 	adev->asic_type = flags & AMD_ASIC_MASK;
 	adev->is_atom_bios = false;
 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
 	adev->mc.gtt_size = 512 * 1024 * 1024;
@@ -1580,20 +1581,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	/* Registers mapping */
 	/* TODO: block userspace mapping of io register */
 	spin_lock_init(&adev->mmio_idx_lock);
 	spin_lock_init(&adev->smc_idx_lock);
 	spin_lock_init(&adev->pcie_idx_lock);
 	spin_lock_init(&adev->uvd_ctx_idx_lock);
 	spin_lock_init(&adev->didt_idx_lock);
 	spin_lock_init(&adev->gc_cac_idx_lock);
 	spin_lock_init(&adev->audio_endpt_idx_lock);
+	spin_lock_init(&adev->mm_stats.lock);
 
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
 
 	adev->rmmio_base = pci_resource_start(adev->pdev, 5);
 	adev->rmmio_size = pci_resource_len(adev->pdev, 5);
 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
 	if (adev->rmmio == NULL) {
 		return -ENOMEM;
 	}
@@ -1693,20 +1695,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
 	r = amdgpu_init(adev);
 	if (r) {
 		dev_err(adev->dev, "amdgpu_init failed\n");
 		amdgpu_fini(adev);
 		goto failed;
 	}
 
 	adev->accel_working = true;
 
+	/* Initialize the buffer migration limit. */
+	if (amdgpu_moverate >= 0)
+		max_MBps = amdgpu_moverate;
+	else
+		max_MBps = 8; /* Allow 8 MB/s. */
+	/* Get a log2 for easy divisions. */
+	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
+
 	amdgpu_fbdev_init(adev);
 
 	r = amdgpu_ib_pool_init(adev);
 	if (r) {
 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
 		goto failed;
 	}
 
 	r = amdgpu_ib_ring_tests(adev);
 	if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 5bcfeed..fe2d26b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -54,20 +54,21 @@
  *           at the end of IBs.
  * - 3.3.0 - Add VM support for UVD on supported hardware.
  * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS.
  */
 #define KMS_DRIVER_MAJOR	3
 #define KMS_DRIVER_MINOR	4
 #define KMS_DRIVER_PATCHLEVEL	0
 
 int amdgpu_vram_limit = 0;
 int amdgpu_gart_size = -1; /* auto */
+int amdgpu_moverate = -1; /* auto */
 int amdgpu_benchmarking = 0;
 int amdgpu_testing = 0;
 int amdgpu_audio = -1;
 int amdgpu_disp_priority = 0;
 int amdgpu_hw_i2c = 0;
 int amdgpu_pcie_gen2 = -1;
 int amdgpu_msi = -1;
 int amdgpu_lockup_timeout = 0;
 int amdgpu_dpm = -1;
 int amdgpu_smc_load_fw = 1;
@@ -92,20 +93,23 @@ unsigned amdgpu_pcie_lane_cap = 0;
 unsigned amdgpu_cg_mask = 0xffffffff;
 unsigned amdgpu_pg_mask = 0xffffffff;
 char *amdgpu_virtual_display = NULL;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
 
 MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)");
 module_param_named(gartsize, amdgpu_gart_size, int, 0600);
 
+MODULE_PARM_DESC(moverate, "Maximum buffer migration rate in MB/s. (32, 64, etc., -1=auto, 0=1=disabled)");
+module_param_named(moverate, amdgpu_moverate, int, 0600);
+
 MODULE_PARM_DESC(benchmark, "Run benchmark");
 module_param_named(benchmark, amdgpu_benchmarking, int, 0444);
 
 MODULE_PARM_DESC(test, "Run tests");
 module_param_named(test, amdgpu_testing, int, 0444);
 
 MODULE_PARM_DESC(audio, "Audio enable (-1 = auto, 0 = disable, 1 = enable)");
 module_param_named(audio, amdgpu_audio, int, 0444);
 
 MODULE_PARM_DESC(disp_priority, "Display Priority (0 = auto, 1 = normal, 2 = high)");
-- 
2.7.4