On Sat, May 10, 2014 at 6:17 AM, Christian König <deathsimple@xxxxxxxxxxx> wrote: > From: Christian König <christian.koenig@xxxxxxx> > > This patch implements support for VRAM page table entry compression. > PTE construction is enhanced to identify physically contiguous page > ranges and mark them in the PTE fragment field. L1/L2 TLB support is > enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments, significantly > improving TLB utilization for VRAM allocations. > > Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn. > Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS > on default settings at 1920x1200 resolution with vsync disabled. > > See main comment in radeon_vm.c for a technical description. > > v2 (chk): rebased and simplified. > v3 (chk): add missing hw setup > v4 (chk): rebased on current drm-fixes-3.15 > v5 (chk): fix comments and commit text > > Signed-off-by: Jay Cornwall <jay@xxxxxxxxxxxx> > Signed-off-by: Christian König <christian.koenig@xxxxxxx> Applied both to my 3.16 tree. Thanks! Alex > --- > drivers/gpu/drm/radeon/cik.c | 4 +- > drivers/gpu/drm/radeon/ni.c | 2 + > drivers/gpu/drm/radeon/radeon.h | 5 +++ > drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++--- > drivers/gpu/drm/radeon/si.c | 5 ++- > 5 files changed, 98 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c > index 5143e0b..ce26b2a 100644 > --- a/drivers/gpu/drm/radeon/cik.c > +++ b/drivers/gpu/drm/radeon/cik.c > @@ -5329,6 +5329,7 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) > WREG32(MC_VM_MX_L1_TLB_CNTL, > (0xA << 7) | > ENABLE_L1_TLB | > + ENABLE_L1_FRAGMENT_PROCESSING | > SYSTEM_ACCESS_MODE_NOT_IN_SYS | > ENABLE_ADVANCED_DRIVER_MODEL | > SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); > @@ -5341,7 +5342,8 @@ static int cik_pcie_gart_enable(struct radeon_device *rdev) > CONTEXT1_IDENTITY_ACCESS_MODE(1)); > WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); > WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | > - L2_CACHE_BIGK_FRAGMENT_SIZE(6)); > + BANK_SELECT(4) | > + L2_CACHE_BIGK_FRAGMENT_SIZE(4)); > /* setup context0 */ > WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); > WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); > diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c > index d246e04..5e8db9b 100644 > --- a/drivers/gpu/drm/radeon/ni.c > +++ b/drivers/gpu/drm/radeon/ni.c > @@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct radeon_device *rdev) > SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); > /* Setup L2 cache */ > WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | > + ENABLE_L2_FRAGMENT_PROCESSING | > ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | > ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | > EFFECTIVE_L2_QUEUE_SIZE(7) | > CONTEXT1_IDENTITY_ACCESS_MODE(1)); > WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); > WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | > + BANK_SELECT(6) | > L2_CACHE_BIGK_FRAGMENT_SIZE(6)); > /* setup context0 */ > WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); > diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h > index 91be3d6..5c4742c 100644 > --- a/drivers/gpu/drm/radeon/radeon.h > +++ b/drivers/gpu/drm/radeon/radeon.h > @@ -854,6 +854,11 @@ struct radeon_mec { > #define R600_PTE_READABLE (1 << 5) > #define R600_PTE_WRITEABLE (1 << 6) > > +/* PTE (Page Table Entry) fragment field for different page sizes */ > +#define R600_PTE_FRAG_4KB (0 << 7) > +#define R600_PTE_FRAG_64KB (4 << 7) > +#define R600_PTE_FRAG_256KB (6 << 7) > + > struct radeon_vm_pt { > struct radeon_bo *bo; > uint64_t addr; > diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c > index 2aae6ce..f8d5b65 100644 > --- a/drivers/gpu/drm/radeon/radeon_vm.c > +++ b/drivers/gpu/drm/radeon/radeon_vm.c > @@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct radeon_device *rdev, > } > > /** > + * radeon_vm_frag_ptes - add fragment information to PTEs > + * > + * @rdev: radeon_device pointer > + * @ib: IB for the update > + * @pe_start: first PTE to handle > + * @pe_end: last PTE to handle > + * @addr: addr those PTEs should point to > + * @flags: hw mapping flags > + * > + * Global and local mutex must be locked! > + */ > +static void radeon_vm_frag_ptes(struct radeon_device *rdev, > + struct radeon_ib *ib, > + uint64_t pe_start, uint64_t pe_end, > + uint64_t addr, uint32_t flags) > +{ > + /** > + * The MC L1 TLB supports variable sized pages, based on a fragment > + * field in the PTE. When this field is set to a non-zero value, page > + * granularity is increased from 4KB to (1 << (12 + frag)). The PTE > + * flags are considered valid for all PTEs within the fragment range > + * and corresponding mappings are assumed to be physically contiguous. > + * > + * The L1 TLB can store a single PTE for the whole fragment, > + * significantly increasing the space available for translation > + * caching. This leads to large improvements in throughput when the > + * TLB is under pressure. > + * > + * The L2 TLB distributes small and large fragments into two > + * asymmetric partitions. The large fragment cache is significantly > + * larger. Thus, we try to use large fragments wherever possible. > + * Userspace can support this by aligning virtual base address and > + * allocation size to the fragment size. > + */ > + > + /* NI is optimized for 256KB fragments, SI and newer for 64KB */ > + uint64_t frag_flags = rdev->family == CHIP_CAYMAN ? > + R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB; > + uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80; > + > + uint64_t frag_start = ALIGN(pe_start, frag_align); > + uint64_t frag_end = pe_end & ~(frag_align - 1); > + > + unsigned count; > + > + /* system pages are non continuously */ > + if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) || > + (frag_start >= frag_end)) { > + > + count = (pe_end - pe_start) / 8; > + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, > + RADEON_GPU_PAGE_SIZE, flags); > + return; > + } > + > + /* handle the 4K area at the beginning */ > + if (pe_start != frag_start) { > + count = (frag_start - pe_start) / 8; > + radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count, > + RADEON_GPU_PAGE_SIZE, flags); > + addr += RADEON_GPU_PAGE_SIZE * count; > + } > + > + /* handle the area in the middle */ > + count = (frag_end - frag_start) / 8; > + radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count, > + RADEON_GPU_PAGE_SIZE, flags | frag_flags); > + > + /* handle the 4K area at the end */ > + if (frag_end != pe_end) { > + addr += RADEON_GPU_PAGE_SIZE * count; > + count = (pe_end - frag_end) / 8; > + radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count, > + RADEON_GPU_PAGE_SIZE, flags); > + } > +} > + > +/** > * radeon_vm_update_ptes - make sure that page tables are valid > * > * @rdev: radeon_device pointer > @@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, > if ((last_pte + 8 * count) != pte) { > > if (count) { > - radeon_asic_vm_set_page(rdev, ib, last_pte, > - last_dst, count, > - RADEON_GPU_PAGE_SIZE, > - flags); > + radeon_vm_frag_ptes(rdev, ib, last_pte, > + last_pte + 8 * count, > + last_dst, flags); > } > > count = nptes; > @@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct radeon_device *rdev, > } > > if (count) { > - radeon_asic_vm_set_page(rdev, ib, last_pte, > - last_dst, count, > - RADEON_GPU_PAGE_SIZE, flags); > + radeon_vm_frag_ptes(rdev, ib, last_pte, > + last_pte + 8 * count, > + last_dst, flags); > } > } > > diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c > index 22a63c9..dece3be 100644 > --- a/drivers/gpu/drm/radeon/si.c > +++ b/drivers/gpu/drm/radeon/si.c > @@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct radeon_device *rdev) > WREG32(MC_VM_MX_L1_TLB_CNTL, > (0xA << 7) | > ENABLE_L1_TLB | > + ENABLE_L1_FRAGMENT_PROCESSING | > SYSTEM_ACCESS_MODE_NOT_IN_SYS | > ENABLE_ADVANCED_DRIVER_MODEL | > SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU); > /* Setup L2 cache */ > WREG32(VM_L2_CNTL, ENABLE_L2_CACHE | > + ENABLE_L2_FRAGMENT_PROCESSING | > ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE | > ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE | > EFFECTIVE_L2_QUEUE_SIZE(7) | > CONTEXT1_IDENTITY_ACCESS_MODE(1)); > WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE); > WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY | > - L2_CACHE_BIGK_FRAGMENT_SIZE(0)); > + BANK_SELECT(4) | > + L2_CACHE_BIGK_FRAGMENT_SIZE(4)); > /* setup context0 */ > WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12); > WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12); > -- > 1.9.1 > > _______________________________________________ > dri-devel mailing list > dri-devel@xxxxxxxxxxxxxxxxxxxxx > http://lists.freedesktop.org/mailman/listinfo/dri-devel _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/dri-devel