Looks very good to me, binding time and bytes is a good idea according to free VRAM. Did you test more games for your algorithm? Regards, David Zhou On 2016年08月24日 06:19, Marek Olšák wrote: > From: Marek Olšák <marek.olsak at amd.com> > > The old mechanism used a per-submission limit that didn't take previous > submissions within the same time frame into account. It also filled VRAM > slowly when VRAM usage dropped due to a big eviction or buffer deallocation. > > This new method establishes a configurable MBps limit that is obeyed when > VRAM usage is very high. When VRAM usage is not very high, it gives > the driver the freedom to fill it quickly. The result is more consistent > performance. > > It can't keep the BO move rate low if lots of evictions are happening due > to VRAM fragmentation, or if a big buffer is being migrated. > > The amdgpu.moverate parameter can be used to set a non-default limit. > Measurements can be done to find out which amdgpu.moverate setting gives > the best results. > > Mainly APUs and cards with small VRAM will benefit from this. For F1 2015, > anything with 2 GB VRAM or less will benefit. > > Some benchmark results - F1 2015 (Tonga 2GB): > > Limit MinFPS AvgFPS > Old code: 14 32.6 > 128 MB/s: 28 41 > 64 MB/s: 15.5 43 > 32 MB/s: 28.7 43.4 > 8 MB/s: 27.8 44.4 > 8 MB/s: 21.9 42.8 (different run) > > Random drops in Min FPS can still occur (due to fragmented VRAM?), but > the average FPS is much better. 8 MB/s is probably a good limit for this > game & the current VRAM management. The random FPS drops are still to be > tackled. 
> > Signed-off-by: Marek Olšák <marek.olsak at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 152 ++++++++++++++++++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 + > 4 files changed, 127 insertions(+), 48 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 916da80..8bde95c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -58,20 +58,21 @@ > #include "amd_powerplay.h" > > #include "gpu_scheduler.h" > > /* > * Modules parameters. > */ > extern int amdgpu_modeset; > extern int amdgpu_vram_limit; > extern int amdgpu_gart_size; > +extern int amdgpu_moverate; > extern int amdgpu_benchmarking; > extern int amdgpu_testing; > extern int amdgpu_audio; > extern int amdgpu_disp_priority; > extern int amdgpu_hw_i2c; > extern int amdgpu_pcie_gen2; > extern int amdgpu_msi; > extern int amdgpu_lockup_timeout; > extern int amdgpu_dpm; > extern int amdgpu_smc_load_fw; > @@ -2029,20 +2030,28 @@ struct amdgpu_device { > struct amdgpu_mman mman; > struct amdgpu_vram_scratch vram_scratch; > struct amdgpu_wb wb; > atomic64_t vram_usage; > atomic64_t vram_vis_usage; > atomic64_t gtt_usage; > atomic64_t num_bytes_moved; > atomic64_t num_evictions; > atomic_t gpu_reset_counter; > > + /* data for buffer migration throttling */ > + struct { > + struct mutex lock; > + s64 last_update_us; > + s64 accum_us; /* accumulated microseconds */ > + u32 log2_max_MBps; > + } mm_stats; > + > /* display */ > bool enable_virtual_display; > struct amdgpu_mode_info mode_info; > /* For pre-DCE11. 
DCE11 and later are in "struct amdgpu_device->dm" */ > struct work_struct hotplug_work; > struct amdgpu_irq_src crtc_irq; > struct amdgpu_irq_src pageflip_irq; > struct amdgpu_irq_src hpd_irq; > > /* rings */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index d5d61a7..c6b4946 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -229,91 +229,145 @@ free_partial_kdata: > drm_free_large(p->chunks[i].kdata); > kfree(p->chunks); > put_ctx: > amdgpu_ctx_put(p->ctx); > free_chunk: > kfree(chunk_array); > > return ret; > } > > -/* Returns how many bytes TTM can move per IB. > +/* Convert microseconds to bytes. */ > +static u64 us_to_bytes(struct amdgpu_device *adev, s64 us) > +{ > + if (us <= 0 || !adev->mm_stats.log2_max_MBps) > + return 0; > + > + /* Since accum_us is incremented by a million per second, just > + * multiply it by the number of MB/s to get the number of bytes. > + */ > + return us << adev->mm_stats.log2_max_MBps; > +} > + > +static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes) > +{ > + if (!adev->mm_stats.log2_max_MBps) > + return 0; > + > + return bytes >> adev->mm_stats.log2_max_MBps; > +} > + > +/* Returns how many bytes TTM can move right now. If no bytes can be moved, > + * it returns 0. If it returns non-zero, it's OK to move at least one buffer, > + * which means it can go over the threshold once. If that happens, the driver > + * will be in debt and no other buffer migrations can be done until that debt > + * is repaid. > + * > + * This approach allows moving a buffer of any size (it's important to allow > + * that). > + * > + * The currency is simply time in microseconds and it increases as the clock > + * ticks. The accumulated microseconds (us) are converted to bytes and > + * returned. 
> */ > static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev) > { > - u64 real_vram_size = adev->mc.real_vram_size; > - u64 vram_usage = atomic64_read(&adev->vram_usage); > + s64 time_us, increment_us; > + u64 max_bytes; > + u64 free_vram, total_vram, used_vram; > > - /* This function is based on the current VRAM usage. > + /* Allow a maximum of 200 accumulated ms. This is basically per-IB > + * throttling. > * > - * - If all of VRAM is free, allow relocating the number of bytes that > - * is equal to 1/4 of the size of VRAM for this IB. > + * It means that in order to get full max MBps, at least 5 IBs per > + * second must be submitted and not more than 200ms apart from each > + * other. > + */ > + const s64 us_upper_bound = 200000; > > - * - If more than one half of VRAM is occupied, only allow relocating > - * 1 MB of data for this IB. > - * > - * - From 0 to one half of used VRAM, the threshold decreases > - * linearly. > - * __________________ > - * 1/4 of -|\ | > - * VRAM | \ | > - * | \ | > - * | \ | > - * | \ | > - * | \ | > - * | \ | > - * | \________|1 MB > - * |----------------| > - * VRAM 0 % 100 % > - * used used > - * > - * Note: It's a threshold, not a limit. The threshold must be crossed > - * for buffer relocations to stop, so any buffer of an arbitrary size > - * can be moved as long as the threshold isn't crossed before > - * the relocation takes place. We don't want to disable buffer > - * relocations completely. > + if (!adev->mm_stats.log2_max_MBps) > + return 0; > + > + total_vram = adev->mc.real_vram_size - adev->vram_pin_size; > + used_vram = atomic64_read(&adev->vram_usage); > + free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram; > + > + mutex_lock(&adev->mm_stats.lock); > + > + /* Increase the amount of accumulated us. 
*/ > + time_us = ktime_to_us(ktime_get()); > + increment_us = time_us - adev->mm_stats.last_update_us; > + adev->mm_stats.last_update_us = time_us; > + adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us, > + us_upper_bound); > + > + /* This prevents the short period of low performance when the VRAM > + * usage is low and the driver is in debt or doesn't have enough > + * accumulated us to fill VRAM quickly. > * > - * The idea is that buffers should be placed in VRAM at creation time > - * and TTM should only do a minimum number of relocations during > - * command submission. In practice, you need to submit at least > - * a dozen IBs to move all buffers to VRAM if they are in GTT. > + * The situation can occur in these cases: > + * - a lot of VRAM is freed by userspace > + * - the presence of a big buffer causes a lot of evictions > + * (solution: split buffers into smaller ones) > * > - * Also, things can get pretty crazy under memory pressure and actual > - * VRAM usage can change a lot, so playing safe even at 50% does > - * consistently increase performance. > + * If 128 MB or 1/8th of VRAM is free, start filling it now by setting > + * accum_us to a positive number. > */ > + if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) { > + s64 min_us; > + > + /* Be more aggresive on dGPUs. Try to fill a portion of free > + * VRAM now. > + */ > + if (!(adev->flags & AMD_IS_APU)) > + min_us = bytes_to_us(adev, free_vram / 4); > + else > + min_us = 0; /* Reset accum_us on APUs. */ > + > + adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us); > + } > > - u64 half_vram = real_vram_size >> 1; > - u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage; > - u64 bytes_moved_threshold = half_free_vram >> 1; > - return max(bytes_moved_threshold, 1024*1024ull); > + /* This returns 0 if the driver is in debt to disallow (optional) > + * buffer moves. 
> + */ > + max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us); > + > + mutex_unlock(&adev->mm_stats.lock); > + return max_bytes; > +} > + > +/* Report how many bytes have really been moved for the last command > + * submission. This can result in a debt that can stop buffer migrations > + * temporarily. > + */ > +static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, > + u64 num_bytes) > +{ > + mutex_lock(&adev->mm_stats.lock); > + adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes); > + mutex_unlock(&adev->mm_stats.lock); > } > > static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p, > struct amdgpu_bo *bo) > { > u64 initial_bytes_moved; > uint32_t domain; > int r; > > if (bo->pin_count) > return 0; > > - /* Avoid moving this one if we have moved too many buffers > - * for this IB already. > - * > - * Note that this allows moving at least one buffer of > - * any size, because it doesn't take the current "bo" > - * into account. We don't want to disallow buffer moves > - * completely. > + /* Don't move this buffer if we have depleted our allowance > + * to move it. Don't move anything if the threshold is zero. 
> */ > - if (p->bytes_moved <= p->bytes_moved_threshold) > + if (p->bytes_moved < p->bytes_moved_threshold) > domain = bo->prefered_domains; > else > domain = bo->allowed_domains; > > retry: > amdgpu_ttm_placement_from_domain(bo, domain); > initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved); > r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); > p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) - > initial_bytes_moved; > @@ -488,20 +542,22 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, > DRM_ERROR("amdgpu_cs_list_validate(duplicates) failed.\n"); > goto error_validate; > } > > r = amdgpu_cs_list_validate(p, &p->validated); > if (r) { > DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n"); > goto error_validate; > } > > + amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved); > + > fpriv->vm.last_eviction_counter = > atomic64_read(&p->adev->num_evictions); > > if (p->bo_list) { > struct amdgpu_bo *gds = p->bo_list->gds_obj; > struct amdgpu_bo *gws = p->bo_list->gws_obj; > struct amdgpu_bo *oa = p->bo_list->oa_obj; > struct amdgpu_vm *vm = &fpriv->vm; > unsigned i; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 2608138..9eae198 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1521,20 +1521,21 @@ static bool amdgpu_device_is_virtual(void) > * Returns 0 for success or an error on failure. > * Called at driver startup. 
> */ > int amdgpu_device_init(struct amdgpu_device *adev, > struct drm_device *ddev, > struct pci_dev *pdev, > uint32_t flags) > { > int r, i; > bool runtime = false; > + u32 max_MBps; > > adev->shutdown = false; > adev->dev = &pdev->dev; > adev->ddev = ddev; > adev->pdev = pdev; > adev->flags = flags; > adev->asic_type = flags & AMD_ASIC_MASK; > adev->is_atom_bios = false; > adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; > adev->mc.gtt_size = 512 * 1024 * 1024; > @@ -1566,20 +1567,21 @@ int amdgpu_device_init(struct amdgpu_device *adev, > pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); > > /* mutex initialization are all done here so we > * can recall function without having locking issues */ > mutex_init(&adev->vm_manager.lock); > atomic_set(&adev->irq.ih.lock, 0); > mutex_init(&adev->pm.mutex); > mutex_init(&adev->gfx.gpu_clock_mutex); > mutex_init(&adev->srbm_mutex); > mutex_init(&adev->grbm_idx_mutex); > + mutex_init(&adev->mm_stats.lock); > mutex_init(&adev->mn_lock); > hash_init(adev->mn_hash); > > amdgpu_check_arguments(adev); > > /* Registers mapping */ > /* TODO: block userspace mapping of io register */ > spin_lock_init(&adev->mmio_idx_lock); > spin_lock_init(&adev->smc_idx_lock); > spin_lock_init(&adev->pcie_idx_lock); > @@ -1693,20 +1695,28 @@ int amdgpu_device_init(struct amdgpu_device *adev, > > r = amdgpu_init(adev); > if (r) { > dev_err(adev->dev, "amdgpu_init failed\n"); > amdgpu_fini(adev); > goto failed; > } > > adev->accel_working = true; > > + /* Initialize the buffer migration limit. */ > + if (amdgpu_moverate >= 0) > + max_MBps = amdgpu_moverate; > + else > + max_MBps = 8; /* Allow 8 MB/s. */ > + /* Get a log2 for easy divisions. 
*/ > + adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); > + > amdgpu_fbdev_init(adev); > > r = amdgpu_ib_pool_init(adev); > if (r) { > dev_err(adev->dev, "IB initialization failed (%d).\n", r); > goto failed; > } > > r = amdgpu_ib_ring_tests(adev); > if (r) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 5bcfeed..fe2d26b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -54,20 +54,21 @@ > * at the end of IBs. > * - 3.3.0 - Add VM support for UVD on supported hardware. > * - 3.4.0 - Add AMDGPU_INFO_NUM_EVICTIONS. > */ > #define KMS_DRIVER_MAJOR 3 > #define KMS_DRIVER_MINOR 4 > #define KMS_DRIVER_PATCHLEVEL 0 > > int amdgpu_vram_limit = 0; > int amdgpu_gart_size = -1; /* auto */ > +int amdgpu_moverate = -1; /* auto */ > int amdgpu_benchmarking = 0; > int amdgpu_testing = 0; > int amdgpu_audio = -1; > int amdgpu_disp_priority = 0; > int amdgpu_hw_i2c = 0; > int amdgpu_pcie_gen2 = -1; > int amdgpu_msi = -1; > int amdgpu_lockup_timeout = 0; > int amdgpu_dpm = -1; > int amdgpu_smc_load_fw = 1; > @@ -92,20 +93,23 @@ unsigned amdgpu_pcie_lane_cap = 0; > unsigned amdgpu_cg_mask = 0xffffffff; > unsigned amdgpu_pg_mask = 0xffffffff; > char *amdgpu_virtual_display = NULL; > > MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes"); > module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); > > MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)"); > module_param_named(gartsize, amdgpu_gart_size, int, 0600); > > +MODULE_PARM_DESC(moverate, "Maximum buffer migration rate in MB/s. 
(32, 64, etc., -1=auto, 0=1=disabled)"); > +module_param_named(moverate, amdgpu_moverate, int, 0600); > + > MODULE_PARM_DESC(benchmark, "Run benchmark"); > module_param_named(benchmark, amdgpu_benchmarking, int, 0444); > > MODULE_PARM_DESC(test, "Run tests"); > module_param_named(test, amdgpu_testing, int, 0444); > > MODULE_PARM_DESC(audio, "Audio enable (-1 = auto, 0 = disable, 1 = enable)"); > module_param_named(audio, amdgpu_audio, int, 0444); > > MODULE_PARM_DESC(disp_priority, "Display Priority (0 = auto, 1 = normal, 2 = high)");