On Fri, Oct 05, 2018 at 06:38:35PM +0530, Sharat Masetty wrote: > The last level system cache can be partitioned to 32 different slices > of which GPU has two slices preallocated. One slice is used for caching GPU > buffers and the other slice is used for caching the GPU SMMU pagetables. > This patch talks to the core system cache driver to acquire the slice handles, > configure the SCID's to those slices and activates and deactivates the slices > upon GPU power collapse and restore. > > Some support from the IOMMU driver is also needed to make use of the > system cache. IOMMU_SYS_CACHE is a buffer protection flag which enables > caching GPU data buffers in the system cache with memory attributes such > as outer cacheable, read-allocate, write-allocate for buffers. The GPU > then has the ability to override a few cacheability parameters which it > does to override write-allocate to write-no-allocate as the GPU hardware > does not benefit much from it. > > Similarly DOMAIN_ATTR_USE_SYS_CACHE is another domain level attribute > used by the IOMMU driver to set the right attributes to cache the hardware > pagetables into the system cache. > > Signed-off-by: Sharat Masetty <smasetty@xxxxxxxxxxxxxx> > --- > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 159 +++++++++++++++++++++++++++++++++- > drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 9 ++ > drivers/gpu/drm/msm/msm_iommu.c | 13 +++ > drivers/gpu/drm/msm/msm_mmu.h | 3 + > 4 files changed, 183 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 177dbfc..1790dde 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -8,6 +8,7 @@ > #include "a6xx_gmu.xml.h" > > #include <linux/devfreq.h> > +#include <linux/soc/qcom/llcc-qcom.h> > > static inline bool _a6xx_check_idle(struct msm_gpu *gpu) > { > @@ -674,6 +675,151 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu) > ~0 > }; > > +#define A6XX_LLC_NUM_GPU_SCIDS 5 > +#define A6XX_GPU_LLC_SCID_NUM_BITS 5 > + > +#define A6XX_GPU_LLC_SCID_MASK \ > + ((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1) > + > +#define A6XX_GPUHTW_LLC_SCID_SHIFT 25 > +#define A6XX_GPUHTW_LLC_SCID_MASK \ > + (((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT) > + > +static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc, > + u32 reg, u32 mask, u32 or) > +{ > + msm_rmw(llc->mmio + (reg << 2), mask, or); > +} > + > +static void a6xx_llc_deactivate(struct msm_gpu *gpu) > +{ > + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); > + struct a6xx_llc *llc = &a6xx_gpu->llc; > + > + llcc_slice_deactivate(llc->gpu_llc_slice); > + llcc_slice_deactivate(llc->gpuhtw_llc_slice); > +} > + > +static void a6xx_llc_activate(struct msm_gpu *gpu) > +{ > + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); > + struct a6xx_llc *llc = &a6xx_gpu->llc; > + > + if (!llc->mmio) > + return; > + > + /* > + * If the LLCC_GPU slice activated, program the sub-cache ID for all > + * GPU blocks > + */ > + if (!llcc_slice_activate(llc->gpu_llc_slice)) > + a6xx_gpu_cx_rmw(llc, > + REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1, > + A6XX_GPU_LLC_SCID_MASK, > + (llc->cntl1_regval & > + A6XX_GPU_LLC_SCID_MASK)); > + > + /* > + * If the LLCC_GPUHTW slice activated, program the sub-cache ID for the > + * GPU pagetables > + */ > + if (!llcc_slice_activate(llc->gpuhtw_llc_slice)) > + a6xx_gpu_cx_rmw(llc, > + REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1, > + A6XX_GPUHTW_LLC_SCID_MASK, > + (llc->cntl1_regval & > + A6XX_GPUHTW_LLC_SCID_MASK)); > + > + /* Program cacheability overrides */ > + a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, > + llc->cntl0_regval); > +} > + > +void a6xx_llc_slices_destroy(struct a6xx_llc *llc) > +{ > + if (llc->mmio) { > + iounmap(llc->mmio); > + llc->mmio = NULL; > + } > + > + llcc_slice_putd(llc->gpu_llc_slice); > + llc->gpu_llc_slice = NULL; I don't think these need to be put back to NULL - we shouldn't touch them again after this point. > + > + llcc_slice_putd(llc->gpuhtw_llc_slice); > + llc->gpuhtw_llc_slice = NULL; > +} > + > +static int a6xx_llc_slices_init(struct platform_device *pdev, > + struct a6xx_llc *llc) > +{ > + int i; > + > + /* Map registers */ > + llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx"); > + if (IS_ERR(llc->mmio)) { > + llc->mmio = NULL; > + return -1; Return a valid error code here even if we don't care what it is. -ENODEV maybe. And in fact, if we don't care what it is (LLCC is very optional) then just don't return anything at all. > + } > + > + /* Get the system cache slice descriptor for GPU and GPUHTWs */ > + llc->gpu_llc_slice = llcc_slice_getd(LLCC_GPU); > + llc->gpuhtw_llc_slice = llcc_slice_getd(LLCC_GPUHTW); > + if (IS_ERR(llc->gpu_llc_slice) && IS_ERR(llc->gpuhtw_llc_slice)) > + return -1; > + > + /* > + * Setup GPU system cache CNTL0 and CNTL1 register values. > + * These values will be programmed everytime GPU comes out > + * of power collapse as these are non-retention registers. > + */ > + > + /* > + * CNTL0 provides options to override the settings for the > + * read and write allocation policies for the LLC. These > + * overrides are global for all memory transactions from > + * the GPU. > + * > + * 0x3: read-no-alloc-overridden = 0 > + * read-no-alloc = 0 - Allocate lines on read miss > + * write-no-alloc-overridden = 1 > + * write-no-alloc = 1 - Do not allocates lines on write miss > + */ > + llc->cntl0_regval = 0x03; > + > + /* > + * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC > + * FLAG cache) GPU blocks. This value will be passed along with > + * the address for any memory transaction from GPU to identify > + * the sub-cache for that transaction. > + * > + * Currently there is only one SCID allocated for all GPU blocks > + * Hence set same SCID for all the blocks. This last sentence is not needed > + */ > + > + if (!IS_ERR(llc->gpu_llc_slice)) { > + u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice); > + > + for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++) > + llc->cntl1_regval |= > + gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i); > + } > + > + /* > + * Set SCID for GPU IOMMU. This will be used to access > + * page tables that are cached in LLC. > + */ > + if (!IS_ERR(llc->gpuhtw_llc_slice)) { > + u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice); > + > + llc->cntl1_regval |= > + gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT; > + } > + > + return 0; > +} > + > static int a6xx_pm_resume(struct msm_gpu *gpu) > { > struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > @@ -686,6 +832,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu) > > msm_gpu_resume_devfreq(gpu); > > + /* Activate LLC slices */ > + a6xx_llc_activate(gpu); > + > return ret; > } > > @@ -694,6 +843,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) > struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); > > + /* Deactivate LLC slices */ > + a6xx_llc_deactivate(gpu); > + > devfreq_suspend_device(gpu->devfreq.devfreq); > > /* > @@ -753,6 +905,8 @@ static void a6xx_destroy(struct msm_gpu *gpu) > drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo); > } > > + a6xx_llc_slices_destroy(&a6xx_gpu->llc); > + > a6xx_gmu_remove(a6xx_gpu); > > adreno_gpu_cleanup(adreno_gpu); > @@ -819,7 +973,10 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev) > adreno_gpu->registers = a6xx_registers; > adreno_gpu->reg_offsets = a6xx_register_offsets; > > - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, 0); > + ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc); Yep - there is no reason to take a ret and not deal with it. > + > + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1, > + ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE); > if (ret) { > a6xx_destroy(&(a6xx_gpu->base.base)); > return ERR_PTR(ret); > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > index 4127dce..86353e8 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > @@ -12,6 +12,14 @@ > > extern bool hang_debug; > > +struct a6xx_llc { > + void __iomem *mmio; > + void *gpu_llc_slice; > + void *gpuhtw_llc_slice; > + u32 cntl0_regval; > + u32 cntl1_regval; > +}; > + > struct a6xx_gpu { > struct adreno_gpu base; > > @@ -21,6 +29,7 @@ struct a6xx_gpu { > struct msm_ringbuffer *cur_ring; > > struct a6xx_gmu gmu; > + struct a6xx_llc llc; > }; > > #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base) > diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c > index e80c79b..66612c4 100644 > --- a/drivers/gpu/drm/msm/msm_iommu.c > +++ b/drivers/gpu/drm/msm/msm_iommu.c > @@ -38,6 +38,16 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char * const *names, > int cnt) > { > struct msm_iommu *iommu = to_msm_iommu(mmu); > + int gpu_htw_llc = 1; > + > + /* > + * This allows GPU to set the bus attributes required > + * to use system cache on behalf of the iommu page table > + * walker. > + */ > + if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE)) > + iommu_domain_set_attr(iommu->domain, > + DOMAIN_ATTR_USE_SYS_CACHE, &gpu_htw_llc); > > return iommu_attach_device(iommu->domain, mmu->dev); > } > @@ -56,6 +66,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova, > struct msm_iommu *iommu = to_msm_iommu(mmu); > size_t ret; > > + if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE)) > + prot |= IOMMU_SYS_CACHE; > + > ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot); > WARN_ON(ret < 0); > > diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h > index 9b9f43f..524790b 100644 > --- a/drivers/gpu/drm/msm/msm_mmu.h > +++ b/drivers/gpu/drm/msm/msm_mmu.h > @@ -49,6 +49,9 @@ struct msm_mmu_funcs { > bool (*is_domain_secure)(struct msm_mmu *mmu); > }; > > +/* MMU features */ > +#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0) > + > struct msm_mmu { > const struct msm_mmu_funcs *funcs; > struct device *dev; > -- > 1.9.1 > -- The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project