On Fri, Jul 11, 2014 at 12:50:11AM +0300, Oded Gabbay wrote: > This patch adds the code base of the scheduler, which handles queue > creation, deletion and scheduling on the CP of the GPU. > > Signed-off-by: Oded Gabbay <oded.gabbay@xxxxxxx> I would rather see all this squashed; this gives the feeling that the driver can access registers which are later removed. I know juggling with patch squashing can be daunting, but really it makes reviewing hard here because I have to jump back and forth to see if the thing I am looking at really matters in the final version. Cheers, Jérôme > --- > drivers/gpu/hsa/radeon/Makefile | 3 +- > drivers/gpu/hsa/radeon/cik_regs.h | 213 +++++++ > drivers/gpu/hsa/radeon/kfd_device.c | 1 + > drivers/gpu/hsa/radeon/kfd_registers.c | 50 ++ > drivers/gpu/hsa/radeon/kfd_sched_cik_static.c | 800 ++++++++++++++++++++++++++ > drivers/gpu/hsa/radeon/kfd_vidmem.c | 61 ++ > 6 files changed, 1127 insertions(+), 1 deletion(-) > create mode 100644 drivers/gpu/hsa/radeon/cik_regs.h > create mode 100644 drivers/gpu/hsa/radeon/kfd_registers.c > create mode 100644 drivers/gpu/hsa/radeon/kfd_sched_cik_static.c > create mode 100644 drivers/gpu/hsa/radeon/kfd_vidmem.c > > diff --git a/drivers/gpu/hsa/radeon/Makefile b/drivers/gpu/hsa/radeon/Makefile > index 989518a..28da10c 100644 > --- a/drivers/gpu/hsa/radeon/Makefile > +++ b/drivers/gpu/hsa/radeon/Makefile > @@ -4,6 +4,7 @@ > > radeon_kfd-y := kfd_module.o kfd_device.o kfd_chardev.o \ > kfd_pasid.o kfd_topology.o kfd_process.o \ > - kfd_doorbell.o > + kfd_doorbell.o kfd_sched_cik_static.o kfd_registers.o \ > + kfd_vidmem.o > > obj-$(CONFIG_HSA_RADEON) += radeon_kfd.o > diff --git a/drivers/gpu/hsa/radeon/cik_regs.h b/drivers/gpu/hsa/radeon/cik_regs.h > new file mode 100644 > index 0000000..d0cdc57 > --- /dev/null > +++ b/drivers/gpu/hsa/radeon/cik_regs.h > @@ -0,0 +1,213 @@ > +/* > + * Copyright 2014 Advanced Micro Devices, Inc. 
> + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. 
> + */ > + > +#ifndef CIK_REGS_H > +#define CIK_REGS_H > + > +#define BIF_DOORBELL_CNTL 0x530Cu > + > +#define SRBM_GFX_CNTL 0xE44 > +#define PIPEID(x) ((x) << 0) > +#define MEID(x) ((x) << 2) > +#define VMID(x) ((x) << 4) > +#define QUEUEID(x) ((x) << 8) > + > +#define SQ_CONFIG 0x8C00 > + > +#define SH_MEM_BASES 0x8C28 > +/* if PTR32, these are the bases for scratch and lds */ > +#define PRIVATE_BASE(x) ((x) << 0) /* scratch */ > +#define SHARED_BASE(x) ((x) << 16) /* LDS */ > +#define SH_MEM_APE1_BASE 0x8C2C > +/* if PTR32, this is the base location of GPUVM */ > +#define SH_MEM_APE1_LIMIT 0x8C30 > +/* if PTR32, this is the upper limit of GPUVM */ > +#define SH_MEM_CONFIG 0x8C34 > +#define PTR32 (1 << 0) > +#define ALIGNMENT_MODE(x) ((x) << 2) > +#define SH_MEM_ALIGNMENT_MODE_DWORD 0 > +#define SH_MEM_ALIGNMENT_MODE_DWORD_STRICT 1 > +#define SH_MEM_ALIGNMENT_MODE_STRICT 2 > +#define SH_MEM_ALIGNMENT_MODE_UNALIGNED 3 > +#define DEFAULT_MTYPE(x) ((x) << 4) > +#define APE1_MTYPE(x) ((x) << 7) > + > +/* valid for both DEFAULT_MTYPE and APE1_MTYPE */ > +#define MTYPE_NONCACHED 3 > + > + > +#define SH_STATIC_MEM_CONFIG 0x9604u > + > +#define TC_CFG_L1_LOAD_POLICY0 0xAC68 > +#define TC_CFG_L1_LOAD_POLICY1 0xAC6C > +#define TC_CFG_L1_STORE_POLICY 0xAC70 > +#define TC_CFG_L2_LOAD_POLICY0 0xAC74 > +#define TC_CFG_L2_LOAD_POLICY1 0xAC78 > +#define TC_CFG_L2_STORE_POLICY0 0xAC7C > +#define TC_CFG_L2_STORE_POLICY1 0xAC80 > +#define TC_CFG_L2_ATOMIC_POLICY 0xAC84 > +#define TC_CFG_L1_VOLATILE 0xAC88 > +#define TC_CFG_L2_VOLATILE 0xAC8C > + > +#define CP_PQ_WPTR_POLL_CNTL 0xC20C > +#define WPTR_POLL_EN (1 << 31) > + > +#define CP_ME1_PIPE0_INT_CNTL 0xC214 > +#define CP_ME1_PIPE1_INT_CNTL 0xC218 > +#define CP_ME1_PIPE2_INT_CNTL 0xC21C > +#define CP_ME1_PIPE3_INT_CNTL 0xC220 > +#define CP_ME2_PIPE0_INT_CNTL 0xC224 > +#define CP_ME2_PIPE1_INT_CNTL 0xC228 > +#define CP_ME2_PIPE2_INT_CNTL 0xC22C > +#define CP_ME2_PIPE3_INT_CNTL 0xC230 > +#define DEQUEUE_REQUEST_INT_ENABLE (1 << 13) 
> +#define WRM_POLL_TIMEOUT_INT_ENABLE (1 << 17) > +#define PRIV_REG_INT_ENABLE (1 << 23) > +#define TIME_STAMP_INT_ENABLE (1 << 26) > +#define GENERIC2_INT_ENABLE (1 << 29) > +#define GENERIC1_INT_ENABLE (1 << 30) > +#define GENERIC0_INT_ENABLE (1 << 31) > +#define CP_ME1_PIPE0_INT_STATUS 0xC214 > +#define CP_ME1_PIPE1_INT_STATUS 0xC218 > +#define CP_ME1_PIPE2_INT_STATUS 0xC21C > +#define CP_ME1_PIPE3_INT_STATUS 0xC220 > +#define CP_ME2_PIPE0_INT_STATUS 0xC224 > +#define CP_ME2_PIPE1_INT_STATUS 0xC228 > +#define CP_ME2_PIPE2_INT_STATUS 0xC22C > +#define CP_ME2_PIPE3_INT_STATUS 0xC230 > +#define DEQUEUE_REQUEST_INT_STATUS (1 << 13) > +#define WRM_POLL_TIMEOUT_INT_STATUS (1 << 17) > +#define PRIV_REG_INT_STATUS (1 << 23) > +#define TIME_STAMP_INT_STATUS (1 << 26) > +#define GENERIC2_INT_STATUS (1 << 29) > +#define GENERIC1_INT_STATUS (1 << 30) > +#define GENERIC0_INT_STATUS (1 << 31) > + > +#define CP_HPD_EOP_BASE_ADDR 0xC904 > +#define CP_HPD_EOP_BASE_ADDR_HI 0xC908 > +#define CP_HPD_EOP_VMID 0xC90C > +#define CP_HPD_EOP_CONTROL 0xC910 > +#define EOP_SIZE(x) ((x) << 0) > +#define EOP_SIZE_MASK (0x3f << 0) > +#define CP_MQD_BASE_ADDR 0xC914 > +#define CP_MQD_BASE_ADDR_HI 0xC918 > +#define CP_HQD_ACTIVE 0xC91C > +#define CP_HQD_VMID 0xC920 > + > +#define CP_HQD_PERSISTENT_STATE 0xC924u > +#define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) > + > +#define CP_HQD_PIPE_PRIORITY 0xC928u > +#define CP_HQD_QUEUE_PRIORITY 0xC92Cu > +#define CP_HQD_QUANTUM 0xC930u > +#define QUANTUM_EN 1U > +#define QUANTUM_SCALE_1MS (1U << 4) > +#define QUANTUM_DURATION(x) ((x) << 8) > + > +#define CP_HQD_PQ_BASE 0xC934 > +#define CP_HQD_PQ_BASE_HI 0xC938 > +#define CP_HQD_PQ_RPTR 0xC93C > +#define CP_HQD_PQ_RPTR_REPORT_ADDR 0xC940 > +#define CP_HQD_PQ_RPTR_REPORT_ADDR_HI 0xC944 > +#define CP_HQD_PQ_WPTR_POLL_ADDR 0xC948 > +#define CP_HQD_PQ_WPTR_POLL_ADDR_HI 0xC94C > +#define CP_HQD_PQ_DOORBELL_CONTROL 0xC950 > +#define DOORBELL_OFFSET(x) ((x) << 2) > +#define DOORBELL_OFFSET_MASK 
(0x1fffff << 2) > +#define DOORBELL_SOURCE (1 << 28) > +#define DOORBELL_SCHD_HIT (1 << 29) > +#define DOORBELL_EN (1 << 30) > +#define DOORBELL_HIT (1 << 31) > +#define CP_HQD_PQ_WPTR 0xC954 > +#define CP_HQD_PQ_CONTROL 0xC958 > +#define QUEUE_SIZE(x) ((x) << 0) > +#define QUEUE_SIZE_MASK (0x3f << 0) > +#define RPTR_BLOCK_SIZE(x) ((x) << 8) > +#define RPTR_BLOCK_SIZE_MASK (0x3f << 8) > +#define MIN_AVAIL_SIZE(x) ((x) << 20) > +#define PQ_ATC_EN (1 << 23) > +#define PQ_VOLATILE (1 << 26) > +#define NO_UPDATE_RPTR (1 << 27) > +#define UNORD_DISPATCH (1 << 28) > +#define ROQ_PQ_IB_FLIP (1 << 29) > +#define PRIV_STATE (1 << 30) > +#define KMD_QUEUE (1 << 31) > + > +#define DEFAULT_RPTR_BLOCK_SIZE RPTR_BLOCK_SIZE(5) > +#define DEFAULT_MIN_AVAIL_SIZE MIN_AVAIL_SIZE(3) > + > +#define CP_HQD_IB_BASE_ADDR 0xC95Cu > +#define CP_HQD_IB_BASE_ADDR_HI 0xC960u > +#define CP_HQD_IB_RPTR 0xC964u > +#define CP_HQD_IB_CONTROL 0xC968u > +#define IB_ATC_EN (1U << 23) > +#define DEFAULT_MIN_IB_AVAIL_SIZE (3U << 20) > + > +#define CP_HQD_DEQUEUE_REQUEST 0xC974 > +#define DEQUEUE_REQUEST_DRAIN 1 > + > +#define CP_HQD_SEMA_CMD 0xC97Cu > +#define CP_HQD_MSG_TYPE 0xC980u > +#define CP_HQD_ATOMIC0_PREOP_LO 0xC984u > +#define CP_HQD_ATOMIC0_PREOP_HI 0xC988u > +#define CP_HQD_ATOMIC1_PREOP_LO 0xC98Cu > +#define CP_HQD_ATOMIC1_PREOP_HI 0xC990u > +#define CP_HQD_HQ_SCHEDULER0 0xC994u > +#define CP_HQD_HQ_SCHEDULER1 0xC998u > + > + > +#define CP_MQD_CONTROL 0xC99C > +#define MQD_VMID(x) ((x) << 0) > +#define MQD_VMID_MASK (0xf << 0) > +#define MQD_CONTROL_PRIV_STATE_EN (1U << 8) > + > +#define GRBM_GFX_INDEX 0x30800 > +#define INSTANCE_INDEX(x) ((x) << 0) > +#define SH_INDEX(x) ((x) << 8) > +#define SE_INDEX(x) ((x) << 16) > +#define SH_BROADCAST_WRITES (1 << 29) > +#define INSTANCE_BROADCAST_WRITES (1 << 30) > +#define SE_BROADCAST_WRITES (1 << 31) > + > +#define SQC_CACHES 0x30d20 > +#define SQC_POLICY 0x8C38u > +#define SQC_VOLATILE 0x8C3Cu > + > +#define CP_PERFMON_CNTL 0x36020 > + > +#define 
ATC_VMID0_PASID_MAPPING 0x339Cu > +#define ATC_VMID_PASID_MAPPING_UPDATE_STATUS 0x3398u > +#define ATC_VMID_PASID_MAPPING_VALID (1U << 31) > + > +#define ATC_VM_APERTURE0_CNTL 0x3310u > +#define ATS_ACCESS_MODE_NEVER 0 > +#define ATS_ACCESS_MODE_ALWAYS 1 > + > +#define ATC_VM_APERTURE0_CNTL2 0x3318u > +#define ATC_VM_APERTURE0_HIGH_ADDR 0x3308u > +#define ATC_VM_APERTURE0_LOW_ADDR 0x3300u > +#define ATC_VM_APERTURE1_CNTL 0x3314u > +#define ATC_VM_APERTURE1_CNTL2 0x331Cu > +#define ATC_VM_APERTURE1_HIGH_ADDR 0x330Cu > +#define ATC_VM_APERTURE1_LOW_ADDR 0x3304u > + > +#endif > diff --git a/drivers/gpu/hsa/radeon/kfd_device.c b/drivers/gpu/hsa/radeon/kfd_device.c > index 4e9fe6c..465c822 100644 > --- a/drivers/gpu/hsa/radeon/kfd_device.c > +++ b/drivers/gpu/hsa/radeon/kfd_device.c > @@ -28,6 +28,7 @@ > #include "kfd_scheduler.h" > > static const struct kfd_device_info bonaire_device_info = { > + .scheduler_class = &radeon_kfd_cik_static_scheduler_class, > .max_pasid_bits = 16, > }; > > diff --git a/drivers/gpu/hsa/radeon/kfd_registers.c b/drivers/gpu/hsa/radeon/kfd_registers.c > new file mode 100644 > index 0000000..223debd > --- /dev/null > +++ b/drivers/gpu/hsa/radeon/kfd_registers.c > @@ -0,0 +1,50 @@ > +/* > + * Copyright 2014 Advanced Micro Devices, Inc. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. 
> + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. > + */ > + > +#include <linux/io.h> > +#include "kfd_priv.h" > + > +/* In KFD, "reg" is the byte offset of the register. */ > +static void __iomem *reg_address(struct kfd_dev *dev, uint32_t reg) > +{ > + return dev->regs + reg; > +} > + > +void radeon_kfd_write_reg(struct kfd_dev *dev, uint32_t reg, uint32_t value) > +{ > + writel(value, reg_address(dev, reg)); > +} > + > +uint32_t radeon_kfd_read_reg(struct kfd_dev *dev, uint32_t reg) > +{ > + return readl(reg_address(dev, reg)); > +} > + > +void radeon_kfd_lock_srbm_index(struct kfd_dev *dev) > +{ > + kfd2kgd->lock_srbm_gfx_cntl(dev->kgd); > +} > + > +void radeon_kfd_unlock_srbm_index(struct kfd_dev *dev) > +{ > + kfd2kgd->unlock_srbm_gfx_cntl(dev->kgd); > +} > diff --git a/drivers/gpu/hsa/radeon/kfd_sched_cik_static.c b/drivers/gpu/hsa/radeon/kfd_sched_cik_static.c > new file mode 100644 > index 0000000..b986ff9 > --- /dev/null > +++ b/drivers/gpu/hsa/radeon/kfd_sched_cik_static.c > @@ -0,0 +1,800 @@ > +/* > + * Copyright 2014 Advanced Micro Devices, Inc. 
> + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. > + */ > + > +#include <linux/log2.h> > +#include <linux/mutex.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/uaccess.h> > +#include "kfd_priv.h" > +#include "kfd_scheduler.h" > +#include "cik_regs.h" > + > +/* CIK CP hardware is arranged with 8 queues per pipe and 4 pipes per MEC (microengine for compute). > + * The first MEC is ME 1 with the GFX ME as ME 0. > + * We split the CP with the KGD: they take the first N pipes and we take the rest. 
> + */ > +#define CIK_QUEUES_PER_PIPE 8 > +#define CIK_PIPES_PER_MEC 4 > + > +#define CIK_MAX_PIPES (2 * CIK_PIPES_PER_MEC) > + > +#define CIK_NUM_VMID 16 > + > +#define CIK_HPD_SIZE_LOG2 11 > +#define CIK_HPD_SIZE (1U << CIK_HPD_SIZE_LOG2) > +#define CIK_HPD_ALIGNMENT 256 > +#define CIK_MQD_ALIGNMENT 4 > + > +#pragma pack(push, 4) > + > +struct cik_hqd_registers { > + u32 cp_mqd_base_addr; > + u32 cp_mqd_base_addr_hi; > + u32 cp_hqd_active; > + u32 cp_hqd_vmid; > + u32 cp_hqd_persistent_state; > + u32 cp_hqd_pipe_priority; > + u32 cp_hqd_queue_priority; > + u32 cp_hqd_quantum; > + u32 cp_hqd_pq_base; > + u32 cp_hqd_pq_base_hi; > + u32 cp_hqd_pq_rptr; > + u32 cp_hqd_pq_rptr_report_addr; > + u32 cp_hqd_pq_rptr_report_addr_hi; > + u32 cp_hqd_pq_wptr_poll_addr; > + u32 cp_hqd_pq_wptr_poll_addr_hi; > + u32 cp_hqd_pq_doorbell_control; > + u32 cp_hqd_pq_wptr; > + u32 cp_hqd_pq_control; > + u32 cp_hqd_ib_base_addr; > + u32 cp_hqd_ib_base_addr_hi; > + u32 cp_hqd_ib_rptr; > + u32 cp_hqd_ib_control; > + u32 cp_hqd_iq_timer; > + u32 cp_hqd_iq_rptr; > + u32 cp_hqd_dequeue_request; > + u32 cp_hqd_dma_offload; > + u32 cp_hqd_sema_cmd; > + u32 cp_hqd_msg_type; > + u32 cp_hqd_atomic0_preop_lo; > + u32 cp_hqd_atomic0_preop_hi; > + u32 cp_hqd_atomic1_preop_lo; > + u32 cp_hqd_atomic1_preop_hi; > + u32 cp_hqd_hq_scheduler0; > + u32 cp_hqd_hq_scheduler1; > + u32 cp_mqd_control; > +}; > + > +struct cik_mqd { > + u32 header; > + u32 dispatch_initiator; > + u32 dimensions[3]; > + u32 start_idx[3]; > + u32 num_threads[3]; > + u32 pipeline_stat_enable; > + u32 perf_counter_enable; > + u32 pgm[2]; > + u32 tba[2]; > + u32 tma[2]; > + u32 pgm_rsrc[2]; > + u32 vmid; > + u32 resource_limits; > + u32 static_thread_mgmt01[2]; > + u32 tmp_ring_size; > + u32 static_thread_mgmt23[2]; > + u32 restart[3]; > + u32 thread_trace_enable; > + u32 reserved1; > + u32 user_data[16]; > + u32 vgtcs_invoke_count[2]; > + struct cik_hqd_registers queue_state; > + u32 dequeue_cntr; > + u32 interrupt_queue[64]; > +}; 
> + > +struct cik_mqd_padded { > + struct cik_mqd mqd; > + u8 padding[1024 - sizeof(struct cik_mqd)]; /* Pad MQD out to 1KB. (HW requires 4-byte alignment.) */ > +}; > + > +#pragma pack(pop) > + > +struct cik_static_private { > + struct kfd_dev *dev; > + > + struct mutex mutex; > + > + unsigned int first_pipe; > + unsigned int num_pipes; > + > + unsigned long free_vmid_mask; /* unsigned long to make set/clear_bit happy */ > + > + /* Everything below here is offset by first_pipe. E.g. bit 0 in > + * free_queues is queue 0 in pipe first_pipe > + */ > + > + /* Queue q on pipe p is at bit QUEUES_PER_PIPE * p + q. */ > + unsigned long free_queues[DIV_ROUND_UP(CIK_MAX_PIPES * CIK_QUEUES_PER_PIPE, BITS_PER_LONG)]; > + > + kfd_mem_obj hpd_mem; /* Single allocation for HPDs for all KFD pipes. */ > + kfd_mem_obj mqd_mem; /* Single allocation for all MQDs for all KFD > + * pipes. This is actually struct cik_mqd_padded. */ > + uint64_t hpd_addr; /* GPU address for hpd_mem. */ > + uint64_t mqd_addr; /* GPU address for mqd_mem. */ > + /* > + * Pointer for mqd_mem. > + * We keep this mapped because multiple processes may need to access it > + * in parallel and this is simpler than controlling concurrent kmaps > + */ > + struct cik_mqd_padded *mqds; > +}; > + > +struct cik_static_process { > + unsigned int vmid; > + pasid_t pasid; > +}; > + > +struct cik_static_queue { > + unsigned int queue; /* + first_pipe * QUEUES_PER_PIPE */ > + > + uint64_t mqd_addr; > + struct cik_mqd *mqd; > + > + void __user *pq_addr; > + void __user *rptr_address; > + doorbell_t __user *wptr_address; > + uint32_t doorbell_index; > + > + uint32_t queue_size_encoded; /* CP_HQD_PQ_CONTROL.QUEUE_SIZE takes the queue size as log2(size) - 3. 
*/ > +}; > + > +static uint32_t lower_32(uint64_t x) > +{ > + return (uint32_t)x; > +} > + > +static uint32_t upper_32(uint64_t x) > +{ > + return (uint32_t)(x >> 32); > +} > + > +/* SRBM_GFX_CNTL provides the MEC/pipe/queue and vmid for many registers that are instanced. > + * In particular, CP_HQD_* and CP_MQD_* are instanced for each queue. CP_HPD_* are instanced for each pipe. > + * SH_MEM_* are instanced per-VMID. > + * > + * We provide queue_select, pipe_select and vmid_select helpers that should be used before accessing > + * registers from those groups. Note that these overwrite each other, e.g. after vmid_select the currently > + * selected MEC/pipe/queue is undefined. > + * > + * SRBM_GFX_CNTL and the registers it indexes are shared with KGD. You must be holding the srbm_gfx_cntl > + * lock via lock_srbm_index before setting SRBM_GFX_CNTL or accessing any of the instanced registers. > + */ > +static uint32_t make_srbm_gfx_cntl_mpqv(unsigned int me, unsigned int pipe, unsigned int queue, unsigned int vmid) > +{ > + return QUEUEID(queue) | VMID(vmid) | MEID(me) | PIPEID(pipe); > +} > + > +static void pipe_select(struct cik_static_private *priv, unsigned int pipe) > +{ > + unsigned int pipe_in_mec = (pipe + priv->first_pipe) % CIK_PIPES_PER_MEC; > + unsigned int mec = (pipe + priv->first_pipe) / CIK_PIPES_PER_MEC; > + > + WRITE_REG(priv->dev, SRBM_GFX_CNTL, make_srbm_gfx_cntl_mpqv(mec+1, pipe_in_mec, 0, 0)); > +} > + > +static void queue_select(struct cik_static_private *priv, unsigned int queue) > +{ > + unsigned int queue_in_pipe = queue % CIK_QUEUES_PER_PIPE; > + unsigned int pipe = queue / CIK_QUEUES_PER_PIPE + priv->first_pipe; > + unsigned int pipe_in_mec = pipe % CIK_PIPES_PER_MEC; > + unsigned int mec = pipe / CIK_PIPES_PER_MEC; > + > +#if 0 > + dev_err(radeon_kfd_chardev(), "queue select %d = %u/%u/%u = 0x%08x\n", queue, mec+1, pipe_in_mec, queue_in_pipe, > + make_srbm_gfx_cntl_mpqv(mec+1, pipe_in_mec, queue_in_pipe, 0)); > +#endif > + > + WRITE_REG(priv->dev, 
SRBM_GFX_CNTL, make_srbm_gfx_cntl_mpqv(mec+1, pipe_in_mec, queue_in_pipe, 0)); > +} > + > +static void vmid_select(struct cik_static_private *priv, unsigned int vmid) > +{ > + WRITE_REG(priv->dev, SRBM_GFX_CNTL, make_srbm_gfx_cntl_mpqv(0, 0, 0, vmid)); > +} > + > +static void lock_srbm_index(struct cik_static_private *priv) > +{ > + radeon_kfd_lock_srbm_index(priv->dev); > +} > + > +static void unlock_srbm_index(struct cik_static_private *priv) > +{ > + WRITE_REG(priv->dev, SRBM_GFX_CNTL, 0); /* Be nice to KGD, reset indexed CP registers to the GFX pipe. */ > + radeon_kfd_unlock_srbm_index(priv->dev); > +} > + > +/* One-time setup for all compute pipes. They need to be programmed with the address & size of the HPD EOP buffer. */ > +static void init_pipes(struct cik_static_private *priv) > +{ > + unsigned int i; > + > + lock_srbm_index(priv); > + > + for (i = 0; i < priv->num_pipes; i++) { > + uint64_t pipe_hpd_addr = priv->hpd_addr + i * CIK_HPD_SIZE; > + > + pipe_select(priv, i); > + > + WRITE_REG(priv->dev, CP_HPD_EOP_BASE_ADDR, lower_32(pipe_hpd_addr >> 8)); > + WRITE_REG(priv->dev, CP_HPD_EOP_BASE_ADDR_HI, upper_32(pipe_hpd_addr >> 8)); > + WRITE_REG(priv->dev, CP_HPD_EOP_VMID, 0); > + WRITE_REG(priv->dev, CP_HPD_EOP_CONTROL, CIK_HPD_SIZE_LOG2 - 1); > + } > + > + unlock_srbm_index(priv); > +} > + > +/* Program the VMID -> PASID mapping for one VMID. > + * PASID 0 is special: it means to associate no PASID with that VMID. > + * This function waits for the VMID/PASID mapping to complete. > + */ > +static void set_vmid_pasid_mapping(struct cik_static_private *priv, unsigned int vmid, pasid_t pasid) > +{ > + /* We have to assume that there is no outstanding mapping. > + * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because a mapping > + * is in progress or because a mapping finished and the SW cleared it. > + * So the protocol is to always wait & clear. > + */ > + > + uint32_t pasid_mapping = (pasid == 0) ? 
0 : (uint32_t)pasid | ATC_VMID_PASID_MAPPING_VALID; > + > + WRITE_REG(priv->dev, ATC_VMID0_PASID_MAPPING + vmid*sizeof(uint32_t), pasid_mapping); > + > + while (!(READ_REG(priv->dev, ATC_VMID_PASID_MAPPING_UPDATE_STATUS) & (1U << vmid))) > + cpu_relax(); > + WRITE_REG(priv->dev, ATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid); > +} > + > +static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) > +{ > + /* In 64-bit mode, we can only control the top 3 bits of the LDS, scratch and GPUVM apertures. > + * The hardware fills in the remaining 59 bits according to the following pattern: > + * LDS: X0000000'00000000 - X0000001'00000000 (4GB) > + * Scratch: X0000001'00000000 - X0000002'00000000 (4GB) > + * GPUVM: Y0010000'00000000 - Y0020000'00000000 (1TB) > + * > + * (where X/Y is the configurable nybble with the low-bit 0) > + * > + * LDS and scratch will have the same top nybble programmed in the top 3 bits of SH_MEM_BASES.PRIVATE_BASE. > + * GPUVM can have a different top nybble programmed in the top 3 bits of SH_MEM_BASES.SHARED_BASE. > + * We don't bother to support different top nybbles for LDS/Scratch and GPUVM. > + */ > + > + BUG_ON((top_address_nybble & 1) || top_address_nybble > 0xE); > + > + return PRIVATE_BASE(top_address_nybble << 12) | SHARED_BASE(top_address_nybble << 12); > +} > + > +/* Initial programming for all ATS registers. 
> + * - enable ATS for all compute VMIDs > + * - clear the VMID/PASID mapping for all compute VMIDS > + * - program the shader core flat address settings: > + * -- 64-bit mode > + * -- unaligned access allowed > + * -- noncached (this is the only CPU-coherent mode in CIK) > + * -- APE 1 disabled > + */ > +static void init_ats(struct cik_static_private *priv) > +{ > + unsigned int i; > + > + /* Enable self-ringing doorbell recognition and direct the BIF to send > + * untranslated writes to the IOMMU before comparing to the aperture.*/ > + WRITE_REG(priv->dev, BIF_DOORBELL_CNTL, 0); > + > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_CNTL, ATS_ACCESS_MODE_ALWAYS); > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_CNTL2, priv->free_vmid_mask); > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_LOW_ADDR, 0); > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_HIGH_ADDR, 0); > + > + WRITE_REG(priv->dev, ATC_VM_APERTURE1_CNTL, 0); > + WRITE_REG(priv->dev, ATC_VM_APERTURE1_CNTL2, 0); > + WRITE_REG(priv->dev, ATC_VM_APERTURE1_LOW_ADDR, 0); > + WRITE_REG(priv->dev, ATC_VM_APERTURE1_HIGH_ADDR, 0); > + > + lock_srbm_index(priv); > + > + for (i = 0; i < CIK_NUM_VMID; i++) { > + if (priv->free_vmid_mask & (1U << i)) { > + uint32_t sh_mem_config; > + > + set_vmid_pasid_mapping(priv, i, 0); > + > + vmid_select(priv, i); > + > + sh_mem_config = ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED); > + sh_mem_config |= DEFAULT_MTYPE(MTYPE_NONCACHED); > + > + WRITE_REG(priv->dev, SH_MEM_CONFIG, sh_mem_config); > + > + /* Configure apertures: > + * LDS: 0x60000000'00000000 - 0x60000001'00000000 (4GB) > + * Scratch: 0x60000001'00000000 - 0x60000002'00000000 (4GB) > + * GPUVM: 0x60010000'00000000 - 0x60020000'00000000 (1TB) > + */ > + WRITE_REG(priv->dev, SH_MEM_BASES, compute_sh_mem_bases_64bit(6)); > + > + /* Scratch aperture is not supported for now. */ > + WRITE_REG(priv->dev, SH_STATIC_MEM_CONFIG, 0); > + > + /* APE1 disabled for now. 
*/ > + WRITE_REG(priv->dev, SH_MEM_APE1_BASE, 1); > + WRITE_REG(priv->dev, SH_MEM_APE1_LIMIT, 0); > + } > + } > + > + unlock_srbm_index(priv); > +} > + > +static void exit_ats(struct cik_static_private *priv) > +{ > + unsigned int i; > + > + for (i = 0; i < CIK_NUM_VMID; i++) > + if (priv->free_vmid_mask & (1U << i)) > + set_vmid_pasid_mapping(priv, i, 0); > + > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_CNTL, ATS_ACCESS_MODE_NEVER); > + WRITE_REG(priv->dev, ATC_VM_APERTURE0_CNTL2, 0); > +} > + > +static struct cik_static_private *kfd_scheduler_to_private(struct kfd_scheduler *scheduler) > +{ > + return (struct cik_static_private *)scheduler; > +} > + > +static struct cik_static_process *kfd_process_to_private(struct kfd_scheduler_process *process) > +{ > + return (struct cik_static_process *)process; > +} > + > +static struct cik_static_queue *kfd_queue_to_private(struct kfd_scheduler_queue *queue) > +{ > + return (struct cik_static_queue *)queue; > +} > + > +static int cik_static_create(struct kfd_dev *dev, struct kfd_scheduler **scheduler) > +{ > + struct cik_static_private *priv; > + unsigned int i; > + int err; > + void *hpdptr; > + > + priv = kmalloc(sizeof(*priv), GFP_KERNEL); > + if (priv == NULL) > + return -ENOMEM; > + > + mutex_init(&priv->mutex); > + > + priv->dev = dev; > + > + priv->first_pipe = dev->shared_resources.first_compute_pipe; > + priv->num_pipes = dev->shared_resources.compute_pipe_count; > + > + for (i = 0; i < priv->num_pipes * CIK_QUEUES_PER_PIPE; i++) > + __set_bit(i, priv->free_queues); > + > + priv->free_vmid_mask = dev->shared_resources.compute_vmid_bitmap; > + > + /* > + * Allocate memory for the HPDs. This is hardware-owned per-pipe data. > + * The driver never accesses this memory after zeroing it. It doesn't even have > + * to be saved/restored on suspend/resume because it contains no data when there > + * are no active queues. 
> + */ > + err = radeon_kfd_vidmem_alloc(dev, > + CIK_HPD_SIZE * priv->num_pipes * 2, > + PAGE_SIZE, > + KFD_MEMPOOL_SYSTEM_WRITECOMBINE, > + &priv->hpd_mem); > + if (err) > + goto err_hpd_alloc; > + > + err = radeon_kfd_vidmem_kmap(dev, priv->hpd_mem, &hpdptr); > + if (err) > + goto err_hpd_kmap; > + memset(hpdptr, 0, CIK_HPD_SIZE * priv->num_pipes); > + radeon_kfd_vidmem_unkmap(dev, priv->hpd_mem); > + > + /* > + * Allocate memory for all the MQDs. > + * These are per-queue data that is hardware owned but with driver init. > + * The driver has to copy this data into HQD registers when a > + * pipe is (re)activated. > + */ > + err = radeon_kfd_vidmem_alloc(dev, > + sizeof(struct cik_mqd_padded) * priv->num_pipes * CIK_QUEUES_PER_PIPE, > + PAGE_SIZE, > + KFD_MEMPOOL_SYSTEM_CACHEABLE, > + &priv->mqd_mem); > + if (err) > + goto err_mqd_alloc; > + radeon_kfd_vidmem_kmap(dev, priv->mqd_mem, (void **)&priv->mqds); > + if (err) > + goto err_mqd_kmap; > + > + *scheduler = (struct kfd_scheduler *)priv; > + > + return 0; > + > +err_mqd_kmap: > + radeon_kfd_vidmem_free(dev, priv->mqd_mem); > +err_mqd_alloc: > +err_hpd_kmap: > + radeon_kfd_vidmem_free(dev, priv->hpd_mem); > +err_hpd_alloc: > + mutex_destroy(&priv->mutex); > + kfree(priv); > + return err; > +} > + > +static void cik_static_destroy(struct kfd_scheduler *scheduler) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + > + radeon_kfd_vidmem_unkmap(priv->dev, priv->mqd_mem); > + radeon_kfd_vidmem_free(priv->dev, priv->mqd_mem); > + radeon_kfd_vidmem_free(priv->dev, priv->hpd_mem); > + > + mutex_destroy(&priv->mutex); > + > + kfree(priv); > +} > + > +static void cik_static_start(struct kfd_scheduler *scheduler) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + > + radeon_kfd_vidmem_gpumap(priv->dev, priv->hpd_mem, &priv->hpd_addr); > + radeon_kfd_vidmem_gpumap(priv->dev, priv->mqd_mem, &priv->mqd_addr); > + > + init_pipes(priv); > + init_ats(priv); > 
+} > + > +static void cik_static_stop(struct kfd_scheduler *scheduler) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + > + exit_ats(priv); > + > + radeon_kfd_vidmem_ungpumap(priv->dev, priv->hpd_mem); > + radeon_kfd_vidmem_ungpumap(priv->dev, priv->mqd_mem); > +} > + > +static bool allocate_vmid(struct cik_static_private *priv, unsigned int *vmid) > +{ > + bool ok = false; > + > + mutex_lock(&priv->mutex); > + > + if (priv->free_vmid_mask != 0) { > + unsigned int v = __ffs64(priv->free_vmid_mask); > + > + clear_bit(v, &priv->free_vmid_mask); > + *vmid = v; > + > + ok = true; > + } > + > + mutex_unlock(&priv->mutex); > + > + return ok; > +} > + > +static void release_vmid(struct cik_static_private *priv, unsigned int vmid) > +{ > + /* It's okay to race against allocate_vmid because this only adds bits to free_vmid_mask. > + * And set_bit/clear_bit are atomic wrt each other. */ > + set_bit(vmid, &priv->free_vmid_mask); > +} > + > +static void setup_vmid_for_process(struct cik_static_private *priv, struct cik_static_process *p) > +{ > + set_vmid_pasid_mapping(priv, p->vmid, p->pasid); > + > + /* > + * SH_MEM_CONFIG and others need to be programmed differently > + * for 32/64-bit processes. And maybe other reasons. 
> + */ > +} > + > +static int > +cik_static_register_process(struct kfd_scheduler *scheduler, struct kfd_process *process, > + struct kfd_scheduler_process **scheduler_process) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + > + struct cik_static_process *hwp; > + > + hwp = kmalloc(sizeof(*hwp), GFP_KERNEL); > + if (hwp == NULL) > + return -ENOMEM; > + > + if (!allocate_vmid(priv, &hwp->vmid)) { > + kfree(hwp); > + return -ENOMEM; > + } > + > + hwp->pasid = process->pasid; > + > + setup_vmid_for_process(priv, hwp); > + > + *scheduler_process = (struct kfd_scheduler_process *)hwp; > + > + return 0; > +} > + > +static void cik_static_deregister_process(struct kfd_scheduler *scheduler, > + struct kfd_scheduler_process *scheduler_process) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + struct cik_static_process *pp = kfd_process_to_private(scheduler_process); > + > + release_vmid(priv, pp->vmid); > + kfree(pp); > +} > + > +static bool allocate_hqd(struct cik_static_private *priv, unsigned int *queue) > +{ > + bool ok = false; > + unsigned int q; > + > + mutex_lock(&priv->mutex); > + > + q = find_first_bit(priv->free_queues, priv->num_pipes * CIK_QUEUES_PER_PIPE); > + > + if (q != priv->num_pipes * CIK_QUEUES_PER_PIPE) { > + clear_bit(q, priv->free_queues); > + *queue = q; > + > + ok = true; > + } > + > + mutex_unlock(&priv->mutex); > + > + return ok; > +} > + > +static void release_hqd(struct cik_static_private *priv, unsigned int queue) > +{ > + /* It's okay to race against allocate_hqd because this only adds bits to free_queues. > + * And set_bit/clear_bit are atomic wrt each other. 
*/ > + set_bit(queue, priv->free_queues); > +} > + > +static void init_mqd(const struct cik_static_queue *queue, const struct cik_static_process *process) > +{ > + struct cik_mqd *mqd = queue->mqd; > + > + memset(mqd, 0, sizeof(*mqd)); > + > + mqd->header = 0xC0310800; > + mqd->pipeline_stat_enable = 1; > + mqd->static_thread_mgmt01[0] = 0xffffffff; > + mqd->static_thread_mgmt01[1] = 0xffffffff; > + mqd->static_thread_mgmt23[0] = 0xffffffff; > + mqd->static_thread_mgmt23[1] = 0xffffffff; > + > + mqd->queue_state.cp_mqd_base_addr = lower_32(queue->mqd_addr); > + mqd->queue_state.cp_mqd_base_addr_hi = upper_32(queue->mqd_addr); > + mqd->queue_state.cp_mqd_control = MQD_CONTROL_PRIV_STATE_EN; > + > + mqd->queue_state.cp_hqd_pq_base = lower_32((uintptr_t)queue->pq_addr >> 8); > + mqd->queue_state.cp_hqd_pq_base_hi = upper_32((uintptr_t)queue->pq_addr >> 8); > + mqd->queue_state.cp_hqd_pq_control = QUEUE_SIZE(queue->queue_size_encoded) | DEFAULT_RPTR_BLOCK_SIZE > + | DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; > + mqd->queue_state.cp_hqd_pq_rptr_report_addr = lower_32((uintptr_t)queue->rptr_address); > + mqd->queue_state.cp_hqd_pq_rptr_report_addr_hi = upper_32((uintptr_t)queue->rptr_address); > + mqd->queue_state.cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(queue->doorbell_index) | DOORBELL_EN; > + mqd->queue_state.cp_hqd_vmid = process->vmid; > + mqd->queue_state.cp_hqd_active = 1; > + > + mqd->queue_state.cp_hqd_persistent_state = DEFAULT_CP_HQD_PERSISTENT_STATE; > + > + /* The values for these 3 are from WinKFD. */ > + mqd->queue_state.cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | QUANTUM_DURATION(10); > + mqd->queue_state.cp_hqd_pipe_priority = 1; > + mqd->queue_state.cp_hqd_queue_priority = 15; > + > + mqd->queue_state.cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; > +} > + > +/* Write the HQD registers and activate the queue. > + * Requires that SRBM_GFX_CNTL has already been programmed for the queue. 
> + */ > +static void load_hqd(struct cik_static_private *priv, struct cik_static_queue *queue) > +{ > + struct kfd_dev *dev = priv->dev; > + const struct cik_hqd_registers *qs = &queue->mqd->queue_state; > + > + WRITE_REG(dev, CP_MQD_BASE_ADDR, qs->cp_mqd_base_addr); > + WRITE_REG(dev, CP_MQD_BASE_ADDR_HI, qs->cp_mqd_base_addr_hi); > + WRITE_REG(dev, CP_MQD_CONTROL, qs->cp_mqd_control); > + > + WRITE_REG(dev, CP_HQD_PQ_BASE, qs->cp_hqd_pq_base); > + WRITE_REG(dev, CP_HQD_PQ_BASE_HI, qs->cp_hqd_pq_base_hi); > + WRITE_REG(dev, CP_HQD_PQ_CONTROL, qs->cp_hqd_pq_control); > + /* DOORBELL_CONTROL before WPTR because WPTR writes are dropped if DOORBELL_HIT is set. */ > + WRITE_REG(dev, CP_HQD_PQ_DOORBELL_CONTROL, qs->cp_hqd_pq_doorbell_control); > + WRITE_REG(dev, CP_HQD_PQ_WPTR, qs->cp_hqd_pq_wptr); > + WRITE_REG(dev, CP_HQD_PQ_RPTR, qs->cp_hqd_pq_rptr); > + WRITE_REG(dev, CP_HQD_PQ_RPTR_REPORT_ADDR, qs->cp_hqd_pq_rptr_report_addr); > + WRITE_REG(dev, CP_HQD_PQ_RPTR_REPORT_ADDR_HI, qs->cp_hqd_pq_rptr_report_addr_hi); > + > + WRITE_REG(dev, CP_HQD_VMID, qs->cp_hqd_vmid); > + WRITE_REG(dev, CP_HQD_PERSISTENT_STATE, qs->cp_hqd_persistent_state); > + WRITE_REG(dev, CP_HQD_QUANTUM, qs->cp_hqd_quantum); > + WRITE_REG(dev, CP_HQD_PIPE_PRIORITY, qs->cp_hqd_pipe_priority); > + WRITE_REG(dev, CP_HQD_QUEUE_PRIORITY, qs->cp_hqd_queue_priority); > + > + WRITE_REG(dev, CP_HQD_IB_CONTROL, qs->cp_hqd_ib_control); > + WRITE_REG(dev, CP_HQD_IB_BASE_ADDR, qs->cp_hqd_ib_base_addr); > + WRITE_REG(dev, CP_HQD_IB_BASE_ADDR_HI, qs->cp_hqd_ib_base_addr_hi); > + WRITE_REG(dev, CP_HQD_IB_RPTR, qs->cp_hqd_ib_rptr); > + WRITE_REG(dev, CP_HQD_SEMA_CMD, qs->cp_hqd_sema_cmd); > + WRITE_REG(dev, CP_HQD_MSG_TYPE, qs->cp_hqd_msg_type); > + WRITE_REG(dev, CP_HQD_ATOMIC0_PREOP_LO, qs->cp_hqd_atomic0_preop_lo); > + WRITE_REG(dev, CP_HQD_ATOMIC0_PREOP_HI, qs->cp_hqd_atomic0_preop_hi); > + WRITE_REG(dev, CP_HQD_ATOMIC1_PREOP_LO, qs->cp_hqd_atomic1_preop_lo); > + WRITE_REG(dev, CP_HQD_ATOMIC1_PREOP_HI, 
qs->cp_hqd_atomic1_preop_hi); > + WRITE_REG(dev, CP_HQD_HQ_SCHEDULER0, qs->cp_hqd_hq_scheduler0); > + WRITE_REG(dev, CP_HQD_HQ_SCHEDULER1, qs->cp_hqd_hq_scheduler1); > + > + WRITE_REG(dev, CP_HQD_ACTIVE, 1); > +} > + > +static void activate_queue(struct cik_static_private *priv, struct cik_static_queue *queue) > +{ > + bool wptr_shadow_valid; > + doorbell_t wptr_shadow; > + > + /* Avoid sleeping while holding the SRBM lock. */ > + wptr_shadow_valid = !get_user(wptr_shadow, queue->wptr_address); > + > + lock_srbm_index(priv); > + queue_select(priv, queue->queue); > + > + load_hqd(priv, queue); > + > + /* Doorbell and wptr are special because there is a race when reactivating a queue. > + * Since doorbell writes to deactivated queues are ignored by hardware, the application > + * shadows the doorbell into memory at queue->wptr_address. > + * > + * We want the queue to automatically resume processing as if it were always active, > + * so we want to copy from queue->wptr_address into the wptr/doorbell. > + * > + * The race is that the app could write a new wptr into the doorbell before we > + * write the shadowed wptr, resulting in an old wptr written later. > + * > + * The hardware solves this by ignoring CP_HQD_WPTR writes after a doorbell write. > + * So the KFD can activate the doorbell then write the shadow wptr to CP_HQD_WPTR > + * knowing it will be ignored if the user has written a more-recent doorbell. 
> + */ > + if (wptr_shadow_valid) > + WRITE_REG(priv->dev, CP_HQD_PQ_WPTR, wptr_shadow); > + > + unlock_srbm_index(priv); > +} > + > +static void drain_hqd(struct cik_static_private *priv) > +{ > + WRITE_REG(priv->dev, CP_HQD_DEQUEUE_REQUEST, DEQUEUE_REQUEST_DRAIN); > +} > + > +static void wait_hqd_inactive(struct cik_static_private *priv) > +{ > + while (READ_REG(priv->dev, CP_HQD_ACTIVE) != 0) > + cpu_relax(); > +} > + > +static void deactivate_queue(struct cik_static_private *priv, struct cik_static_queue *queue) > +{ > + lock_srbm_index(priv); > + queue_select(priv, queue->queue); > + > + drain_hqd(priv); > + wait_hqd_inactive(priv); > + > + unlock_srbm_index(priv); > +} > + > +#define BIT_MASK_64(high, low) (((1ULL << (high)) - 1) & ~((1ULL << (low)) - 1)) > +#define RING_ADDRESS_BAD_BIT_MASK (~BIT_MASK_64(48, 8)) > +#define RWPTR_ADDRESS_BAD_BIT_MASK (~BIT_MASK_64(48, 2)) > + > +#define MAX_QUEUE_SIZE (1ULL << 32) > +#define MIN_QUEUE_SIZE (1ULL << 10) > + > +static int > +cik_static_create_queue(struct kfd_scheduler *scheduler, > + struct kfd_scheduler_process *process, > + struct kfd_scheduler_queue *queue, > + void __user *ring_address, > + uint64_t ring_size, > + void __user *rptr_address, > + void __user *wptr_address, > + unsigned int doorbell) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + struct cik_static_process *hwp = kfd_process_to_private(process); > + struct cik_static_queue *hwq = kfd_queue_to_private(queue); > + > + if ((uint64_t)ring_address & RING_ADDRESS_BAD_BIT_MASK > + || (uint64_t)rptr_address & RWPTR_ADDRESS_BAD_BIT_MASK > + || (uint64_t)wptr_address & RWPTR_ADDRESS_BAD_BIT_MASK) > + return -EINVAL; > + > + if (ring_size > MAX_QUEUE_SIZE || ring_size < MIN_QUEUE_SIZE || !is_power_of_2(ring_size)) > + return -EINVAL; > + > + if (!allocate_hqd(priv, &hwq->queue)) > + return -ENOMEM; > + > + hwq->mqd_addr = priv->mqd_addr + sizeof(struct cik_mqd_padded) * hwq->queue; > + hwq->mqd = 
&priv->mqds[hwq->queue].mqd; > + hwq->pq_addr = ring_address; > + hwq->rptr_address = rptr_address; > + hwq->wptr_address = wptr_address; > + hwq->doorbell_index = doorbell; > + hwq->queue_size_encoded = ilog2(ring_size) - 3; > + > + init_mqd(hwq, hwp); > + activate_queue(priv, hwq); > + > + return 0; > +} > + > +static void > +cik_static_destroy_queue(struct kfd_scheduler *scheduler, struct kfd_scheduler_queue *queue) > +{ > + struct cik_static_private *priv = kfd_scheduler_to_private(scheduler); > + struct cik_static_queue *hwq = kfd_queue_to_private(queue); > + > + deactivate_queue(priv, hwq); > + > + release_hqd(priv, hwq->queue); > +} > + > +const struct kfd_scheduler_class radeon_kfd_cik_static_scheduler_class = { > + .name = "CIK static scheduler", > + .create = cik_static_create, > + .destroy = cik_static_destroy, > + .start = cik_static_start, > + .stop = cik_static_stop, > + .register_process = cik_static_register_process, > + .deregister_process = cik_static_deregister_process, > + .queue_size = sizeof(struct cik_static_queue), > + .create_queue = cik_static_create_queue, > + .destroy_queue = cik_static_destroy_queue, > +}; > diff --git a/drivers/gpu/hsa/radeon/kfd_vidmem.c b/drivers/gpu/hsa/radeon/kfd_vidmem.c > new file mode 100644 > index 0000000..c8d3770 > --- /dev/null > +++ b/drivers/gpu/hsa/radeon/kfd_vidmem.c > @@ -0,0 +1,61 @@ > +/* > + * Copyright 2014 Advanced Micro Devices, Inc. 
> + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE. 
> + */ > + > +#include "kfd_priv.h" > + > +int radeon_kfd_vidmem_alloc(struct kfd_dev *kfd, size_t size, size_t alignment, > + enum kfd_mempool pool, kfd_mem_obj *mem_obj) > +{ > + return kfd2kgd->allocate_mem(kfd->kgd, > + size, > + alignment, > + (enum kgd_memory_pool)pool, > + (struct kgd_mem **)mem_obj); > +} > + > +void radeon_kfd_vidmem_free(struct kfd_dev *kfd, kfd_mem_obj mem_obj) > +{ > + kfd2kgd->free_mem(kfd->kgd, (struct kgd_mem *)mem_obj); > +} > + > +int radeon_kfd_vidmem_gpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj, > + uint64_t *vmid0_address) > +{ > + return kfd2kgd->gpumap_mem(kfd->kgd, > + (struct kgd_mem *)mem_obj, > + vmid0_address); > +} > + > +void radeon_kfd_vidmem_ungpumap(struct kfd_dev *kfd, kfd_mem_obj mem_obj) > +{ > + kfd2kgd->ungpumap_mem(kfd->kgd, (struct kgd_mem *)mem_obj); > +} > + > +int radeon_kfd_vidmem_kmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj, void **ptr) > +{ > + return kfd2kgd->kmap_mem(kfd->kgd, (struct kgd_mem *)mem_obj, ptr); > +} > + > +void radeon_kfd_vidmem_unkmap(struct kfd_dev *kfd, kfd_mem_obj mem_obj) > +{ > + kfd2kgd->unkmap_mem(kfd->kgd, (struct kgd_mem *)mem_obj); > +} > -- > 1.9.1 > _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/dri-devel