Re: [PATCH 1/2] drm/radeon: add large PTE support for NI, SI and CIK v4

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 01.05.2014 19:29, schrieb Jay Cornwall:
On 2014-05-01 11:52, Christian König wrote:

Some minor comment fixes inline. I've been using v3 of this patch on SI for quite a while, with no visible failures.

Thanks for the notes. I've added them to my v5 of the patch and also updated the file name in the commit message (we have moved that stuff to radeon_vm.c in the meantime).

Christian.


Thanks for pushing this.

From: Christian König <christian.koenig@xxxxxxx>

This patch implements support for VRAM page table entry compression.
PTE construction is enhanced to identify physically contiguous page
ranges and mark them in the PTE fragment field. L1 TLB and L2 cache
^^^^^^^^^^^^^^^^^^^
This should read L1/L2 TLB. HW spec refers to the L2 TLB as the VM L2 "cache", which confused the draft comments.

support is enabled for 64KB (SI/CIK) and 256KB (NI) PTE fragments,
significantly improving TLB utilization for VRAM allocations.

Linear store bandwidth is improved from 60GB/s to 125GB/s on Pitcairn.
Unigine Heaven 3.0 sees an average improvement from 24.7 to 27.7 FPS
on default settings at 1920x1200 resolution with vsync disabled.

See the main comment in radeon_gart.c for a technical description.

v2 (chk): rebased and simplified.
v3 (chk): add missing hw setup
v4 (chk): rebased on current drm-fixes-3.15

Signed-off-by: Jay Cornwall <jay@xxxxxxxxxxxx>
Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
 drivers/gpu/drm/radeon/cik.c       |  4 +-
 drivers/gpu/drm/radeon/ni.c        |  2 +
 drivers/gpu/drm/radeon/radeon.h    |  5 +++
drivers/gpu/drm/radeon/radeon_vm.c | 91 +++++++++++++++++++++++++++++++++++---
 drivers/gpu/drm/radeon/si.c        |  5 ++-
 5 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 38f3fcc..38da9f3 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -5395,6 +5395,7 @@ static int cik_pcie_gart_enable(struct
radeon_device *rdev)
     WREG32(MC_VM_MX_L1_TLB_CNTL,
            (0xA << 7) |
            ENABLE_L1_TLB |
+           ENABLE_L1_FRAGMENT_PROCESSING |
            SYSTEM_ACCESS_MODE_NOT_IN_SYS |
            ENABLE_ADVANCED_DRIVER_MODEL |
            SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
@@ -5407,7 +5408,8 @@ static int cik_pcie_gart_enable(struct
radeon_device *rdev)
            CONTEXT1_IDENTITY_ACCESS_MODE(1));
     WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
     WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
-           L2_CACHE_BIGK_FRAGMENT_SIZE(6));
+           BANK_SELECT(4) |
+           L2_CACHE_BIGK_FRAGMENT_SIZE(4));
     /* setup context0 */
WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
     WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c
index d246e04..5e8db9b 100644
--- a/drivers/gpu/drm/radeon/ni.c
+++ b/drivers/gpu/drm/radeon/ni.c
@@ -1228,12 +1228,14 @@ static int cayman_pcie_gart_enable(struct
radeon_device *rdev)
            SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
     /* Setup L2 cache */
     WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
+           ENABLE_L2_FRAGMENT_PROCESSING |
            ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
            ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
            EFFECTIVE_L2_QUEUE_SIZE(7) |
            CONTEXT1_IDENTITY_ACCESS_MODE(1));
     WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
     WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
+           BANK_SELECT(6) |
            L2_CACHE_BIGK_FRAGMENT_SIZE(6));
     /* setup context0 */
     WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 6852861..e3d6be3 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -854,6 +854,11 @@ struct radeon_mec {
 #define R600_PTE_READABLE    (1 << 5)
 #define R600_PTE_WRITEABLE    (1 << 6)

+/* PTE (Page Table Entry) fragment field for different page sizes */
+#define R600_PTE_FRAG_4KB    (0 << 7)
+#define R600_PTE_FRAG_64KB    (4 << 7)
+#define R600_PTE_FRAG_256KB    (6 << 7)
+
 struct radeon_vm_pt {
     struct radeon_bo        *bo;
     uint64_t            addr;
diff --git a/drivers/gpu/drm/radeon/radeon_vm.c
b/drivers/gpu/drm/radeon/radeon_vm.c
index 2aae6ce..6bf656e 100644
--- a/drivers/gpu/drm/radeon/radeon_vm.c
+++ b/drivers/gpu/drm/radeon/radeon_vm.c
@@ -658,6 +658,84 @@ int radeon_vm_update_page_directory(struct
radeon_device *rdev,
 }

 /**
+ * radeon_vm_frag_ptes - add fragment information to PTEs
+ *
+ * @rdev: radeon_device pointer
+ * @ib: IB for the update
+ * @pe_start: first PTE to handle
+ * @pe_end: last PTE to handle
+ * @addr: addr those PTEs should point to
+ * @flags: hw mapping flags
+ *
+ * Global and local mutex must be locked!
+ */
+static void radeon_vm_frag_ptes(struct radeon_device *rdev,
+                struct radeon_ib *ib,
+                uint64_t pe_start, uint64_t pe_end,
+                uint64_t addr, uint32_t flags)
+{
+    /**
+     * The MC L1 TLB supports variable sized pages, based on a fragment
+ * field in the PTE. When this field is set to a non-zero value, page
+     * granularity is increased from 4KB to (1 << (12 + frag)). The PTE
+     * flags are considered valid for all PTEs within the fragment range
+     * and corresponding mappings are assumed to be physically contiguous.
+     *
+     * The L1 TLB can store a single PTE for the whole fragment,
+     * significantly increasing the space available for translation
+     * caching. This leads to large improvements in throughput when the
+     * TLB is under pressure.
+     *
+     * The L2 cache distributes small and large fragments into two
                  ^^^^^
Again, L2 TLB.

+     * asymmetric partitions. The large fragment cache is significantly
+     * larger. Thus, we try to use large fragments wherever possible.
+     * Userspace can support this by aligning virtual base address and
+     * allocation size to the fragment size.
+     */
+
+    /* NI is optimized for 256KB fragments, SI and newer for 64KB */
+    uint64_t frag_flags = rdev->family == CHIP_CAYMAN ?
+            R600_PTE_FRAG_256KB : R600_PTE_FRAG_64KB;
+    uint64_t frag_align = rdev->family == CHIP_CAYMAN ? 0x200 : 0x80;
+
+    uint64_t frag_start = ALIGN(pe_start, frag_align);
+    uint64_t frag_end = pe_end & ~(frag_align - 1);
+
+    unsigned count;
+
+    /* system pages are not contiguous */
+    if ((flags & R600_PTE_SYSTEM) || !(flags & R600_PTE_VALID) ||
+        (frag_start >= frag_end)) {
+
+        count = (pe_end - pe_start) / 8;
+        radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
+                    RADEON_GPU_PAGE_SIZE, flags);
+        return;
+    }
+
+    /* handle the 4K area at the beginning */
+    if (pe_start != frag_start) {
+        count = (frag_start - pe_start) / 8;
+        radeon_asic_vm_set_page(rdev, ib, pe_start, addr, count,
+                    RADEON_GPU_PAGE_SIZE, flags);
+        addr += RADEON_GPU_PAGE_SIZE * count;
+    }
+
+    /* handle the area in the middle */
+    count = (frag_end - frag_start) / 8;
+    radeon_asic_vm_set_page(rdev, ib, frag_start, addr, count,
+                RADEON_GPU_PAGE_SIZE, flags | frag_flags);
+
+    /* handle the 4K area at the end */
+    if (frag_end != pe_end) {
+        addr += RADEON_GPU_PAGE_SIZE * count;
+        count = (pe_end - frag_end) / 8;
+        radeon_asic_vm_set_page(rdev, ib, frag_end, addr, count,
+                    RADEON_GPU_PAGE_SIZE, flags);
+    }
+}
+
+/**
  * radeon_vm_update_ptes - make sure that page tables are valid
  *
  * @rdev: radeon_device pointer
@@ -703,10 +781,9 @@ static void radeon_vm_update_ptes(struct
radeon_device *rdev,
         if ((last_pte + 8 * count) != pte) {

             if (count) {
-                radeon_asic_vm_set_page(rdev, ib, last_pte,
-                            last_dst, count,
-                            RADEON_GPU_PAGE_SIZE,
-                            flags);
+                radeon_vm_frag_ptes(rdev, ib, last_pte,
+                            last_pte + 8 * count,
+                            last_dst, flags);
             }

             count = nptes;
@@ -721,9 +798,9 @@ static void radeon_vm_update_ptes(struct
radeon_device *rdev,
     }

     if (count) {
-        radeon_asic_vm_set_page(rdev, ib, last_pte,
-                    last_dst, count,
-                    RADEON_GPU_PAGE_SIZE, flags);
+        radeon_vm_frag_ptes(rdev, ib, last_pte,
+                    last_pte + 8 * count,
+                    last_dst, flags);
     }
 }

diff --git a/drivers/gpu/drm/radeon/si.c b/drivers/gpu/drm/radeon/si.c
index 22a63c9..dece3be 100644
--- a/drivers/gpu/drm/radeon/si.c
+++ b/drivers/gpu/drm/radeon/si.c
@@ -4044,18 +4044,21 @@ static int si_pcie_gart_enable(struct
radeon_device *rdev)
     WREG32(MC_VM_MX_L1_TLB_CNTL,
            (0xA << 7) |
            ENABLE_L1_TLB |
+           ENABLE_L1_FRAGMENT_PROCESSING |
            SYSTEM_ACCESS_MODE_NOT_IN_SYS |
            ENABLE_ADVANCED_DRIVER_MODEL |
            SYSTEM_APERTURE_UNMAPPED_ACCESS_PASS_THRU);
     /* Setup L2 cache */
     WREG32(VM_L2_CNTL, ENABLE_L2_CACHE |
+           ENABLE_L2_FRAGMENT_PROCESSING |
            ENABLE_L2_PTE_CACHE_LRU_UPDATE_BY_WRITE |
            ENABLE_L2_PDE0_CACHE_LRU_UPDATE_BY_WRITE |
            EFFECTIVE_L2_QUEUE_SIZE(7) |
            CONTEXT1_IDENTITY_ACCESS_MODE(1));
     WREG32(VM_L2_CNTL2, INVALIDATE_ALL_L1_TLBS | INVALIDATE_L2_CACHE);
     WREG32(VM_L2_CNTL3, L2_CACHE_BIGK_ASSOCIATIVITY |
-           L2_CACHE_BIGK_FRAGMENT_SIZE(0));
+           BANK_SELECT(4) |
+           L2_CACHE_BIGK_FRAGMENT_SIZE(4));
     /* setup context0 */
WREG32(VM_CONTEXT0_PAGE_TABLE_START_ADDR, rdev->mc.gtt_start >> 12);
     WREG32(VM_CONTEXT0_PAGE_TABLE_END_ADDR, rdev->mc.gtt_end >> 12);
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/dri-devel

_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/dri-devel





[Index of Archives]     [Linux DRI Users]     [Linux Intel Graphics]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux