On Thu, Jul 06, 2017 at 07:51:27PM +0900, Michel Dänzer wrote: > From: John Brooks <john at fastquake.com> > > The BO move throttling code is designed to allow VRAM to fill quickly if it > is relatively empty. However, this does not take into account situations > where the visible VRAM is smaller than total VRAM, and total VRAM may not > be close to full but the visible VRAM segment is under pressure. In such > situations, visible VRAM would experience unrestricted swapping and > performance would drop. > > Add a separate counter specifically for moves involving visible VRAM, and > check it before moving BOs there. > > v2: Only perform calculations for separate counter if visible VRAM is > smaller than total VRAM. (Michel Dänzer) > v3: [Michel Dänzer] > * Use BO's location rather than the AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED > flag to determine whether to account a move for visible VRAM in most > cases. > * Use a single > > if (adev->mc.visible_vram_size < adev->mc.real_vram_size) { > > block in amdgpu_cs_get_threshold_for_moves. > > Fixes: 95844d20ae02 (drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)) > Signed-off-by: John Brooks <john at fastquake.com> > Signed-off-by: Michel Dänzer <michel.daenzer at amd.com> Changes look good. You can have my Reviewed-by: John Brooks <john at fastquake.com> if you want it. > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 92 ++++++++++++++++++++++++------ > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 12 +++- > 3 files changed, 87 insertions(+), 23 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index b95c1074d42c..463d6c241157 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1162,7 +1162,9 @@ struct amdgpu_cs_parser { > struct list_head validated; > struct dma_fence *fence; > uint64_t bytes_moved_threshold; > + uint64_t bytes_moved_vis_threshold; > uint64_t bytes_moved; > + uint64_t bytes_moved_vis; > struct amdgpu_bo_list_entry *evictable; > > /* user fence */ > @@ -1596,6 +1598,7 @@ struct amdgpu_device { > spinlock_t lock; > s64 last_update_us; > s64 accum_us; /* accumulated microseconds */ > + s64 accum_us_vis; /* for visible VRAM */ > u32 log2_max_MBps; > } mm_stats; > > @@ -1892,7 +1895,8 @@ void amdgpu_pci_config_reset(struct amdgpu_device *adev); > bool amdgpu_need_post(struct amdgpu_device *adev); > void amdgpu_update_display_priority(struct amdgpu_device *adev); > > -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes); > +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes, > + u64 num_vis_bytes); > void amdgpu_ttm_placement_from_domain(struct amdgpu_bo *abo, u32 domain); > bool amdgpu_ttm_bo_is_amdgpu_bo(struct ttm_buffer_object *bo); > int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 82131d70a06b..44ec11d4d733 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -220,10 +220,11 @@ static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes) > * ticks. The accumulated microseconds (us) are converted to bytes and > * returned. > */ > -static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev) > +static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev, > + u64 *max_bytes, > + u64 *max_vis_bytes) > { > s64 time_us, increment_us; > - u64 max_bytes; > u64 free_vram, total_vram, used_vram; > > /* Allow a maximum of 200 accumulated ms. This is basically per-IB > @@ -235,8 +236,11 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev) > */ > const s64 us_upper_bound = 200000; > > - if (!adev->mm_stats.log2_max_MBps) > - return 0; > + if (!adev->mm_stats.log2_max_MBps) { > + *max_bytes = 0; > + *max_vis_bytes = 0; > + return; > + } > > total_vram = adev->mc.real_vram_size - adev->vram_pin_size; > used_vram = atomic64_read(&adev->vram_usage); > @@ -277,23 +281,45 @@ static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev) > adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us); > } > > - /* This returns 0 if the driver is in debt to disallow (optional) > + /* This is set to 0 if the driver is in debt to disallow (optional) > * buffer moves. > */ > - max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us); > + *max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us); > + > + /* Do the same for visible VRAM if half of it is free */ > + if (adev->mc.visible_vram_size < adev->mc.real_vram_size) { > + u64 total_vis_vram = adev->mc.visible_vram_size; > + u64 used_vis_vram = atomic64_read(&adev->vram_vis_usage); > + > + if (used_vis_vram < total_vis_vram) { > + u64 free_vis_vram = total_vis_vram - used_vis_vram; > + adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis + > + increment_us, us_upper_bound); > + > + if (free_vis_vram >= total_vis_vram / 2) > + adev->mm_stats.accum_us_vis = > + max(bytes_to_us(adev, free_vis_vram / 2), > + adev->mm_stats.accum_us_vis); > + } > + > + *max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis); > + } else { > + *max_vis_bytes = 0; > + } > > spin_unlock(&adev->mm_stats.lock); > - return max_bytes; > } > > /* Report how many bytes have really been moved for the last command > * submission. This can result in a debt that can stop buffer migrations > * temporarily. > */ > -void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes) > +void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes, > + u64 num_vis_bytes) > { > spin_lock(&adev->mm_stats.lock); > adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes); > + adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes); > spin_unlock(&adev->mm_stats.lock); > } > > @@ -301,7 +327,7 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p, > struct amdgpu_bo *bo) > { > struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); > - u64 initial_bytes_moved; > + u64 initial_bytes_moved, bytes_moved; > uint32_t domain; > int r; > > @@ -311,17 +337,35 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p, > /* Don't move this buffer if we have depleted our allowance > * to move it. Don't move anything if the threshold is zero. > */ > - if (p->bytes_moved < p->bytes_moved_threshold) > - domain = bo->prefered_domains; > - else > + if (p->bytes_moved < p->bytes_moved_threshold) { > + if (adev->mc.visible_vram_size < adev->mc.real_vram_size && > + (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) { > + /* And don't move a CPU_ACCESS_REQUIRED BO to limited > + * visible VRAM if we've depleted our allowance to do > + * that. > + */ > + if (p->bytes_moved_vis < p->bytes_moved_vis_threshold) > + domain = bo->prefered_domains; > + else > + domain = bo->allowed_domains; > + } else { > + domain = bo->prefered_domains; > + } > + } else { > domain = bo->allowed_domains; > + } > > retry: > amdgpu_ttm_placement_from_domain(bo, domain); > initial_bytes_moved = atomic64_read(&adev->num_bytes_moved); > r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); > - p->bytes_moved += atomic64_read(&adev->num_bytes_moved) - > - initial_bytes_moved; > + bytes_moved = atomic64_read(&adev->num_bytes_moved) - > + initial_bytes_moved; > + p->bytes_moved += bytes_moved; > + if (adev->mc.visible_vram_size < adev->mc.real_vram_size && > + bo->tbo.mem.mem_type == TTM_PL_VRAM && > + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT) > + p->bytes_moved_vis += bytes_moved; > > if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { > domain = bo->allowed_domains; > @@ -347,7 +391,8 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p, > struct amdgpu_bo_list_entry *candidate = p->evictable; > struct amdgpu_bo *bo = candidate->robj; > struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); > - u64 initial_bytes_moved; > + u64 initial_bytes_moved, bytes_moved; > + bool update_bytes_moved_vis; > uint32_t other; > > /* If we reached our current BO we can forget it */ > @@ -367,10 +412,17 @@ static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p, > > /* Good we can try to move this BO somewhere else */ > amdgpu_ttm_placement_from_domain(bo, other); > + update_bytes_moved_vis = > + adev->mc.visible_vram_size < adev->mc.real_vram_size && > + bo->tbo.mem.mem_type == TTM_PL_VRAM && > + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT; > initial_bytes_moved = atomic64_read(&adev->num_bytes_moved); > r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); > - p->bytes_moved += atomic64_read(&adev->num_bytes_moved) - > + bytes_moved = atomic64_read(&adev->num_bytes_moved) - > initial_bytes_moved; > + p->bytes_moved += bytes_moved; > + if (update_bytes_moved_vis) > + p->bytes_moved_vis += bytes_moved; > > if (unlikely(r)) > break; > @@ -550,8 +602,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, > list_splice(&need_pages, &p->validated); > } > > - p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev); > + amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold, > + &p->bytes_moved_vis_threshold); > p->bytes_moved = 0; > + p->bytes_moved_vis = 0; > p->evictable = list_last_entry(&p->validated, > struct amdgpu_bo_list_entry, > tv.head); > @@ -575,8 +629,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, > goto error_validate; > } > > - amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved); > - > + amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved, > + p->bytes_moved_vis); > fpriv->vm.last_eviction_counter = > atomic64_read(&p->adev->num_evictions); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index a85e75327456..e429829ae93d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -322,7 +322,7 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev, > struct amdgpu_bo *bo; > enum ttm_bo_type type; > unsigned long page_align; > - u64 initial_bytes_moved; > + u64 initial_bytes_moved, bytes_moved; > size_t acc_size; > int r; > > @@ -398,8 +398,14 @@ int amdgpu_bo_create_restricted(struct amdgpu_device *adev, > r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type, > &bo->placement, page_align, !kernel, NULL, > acc_size, sg, resv, &amdgpu_ttm_bo_destroy); > - amdgpu_cs_report_moved_bytes(adev, > - atomic64_read(&adev->num_bytes_moved) - initial_bytes_moved); > + bytes_moved = atomic64_read(&adev->num_bytes_moved) - > + initial_bytes_moved; > + if (adev->mc.visible_vram_size < adev->mc.real_vram_size && > + bo->tbo.mem.mem_type == TTM_PL_VRAM && > + bo->tbo.mem.start < adev->mc.visible_vram_size >> PAGE_SHIFT) > + amdgpu_cs_report_moved_bytes(adev, bytes_moved, bytes_moved); > + else > + amdgpu_cs_report_moved_bytes(adev, bytes_moved, 0); > > if (unlikely(r != 0)) > return r; > -- > 2.13.2 >