On 3/19/2025 10:37 AM, jesse.zhang@xxxxxxx wrote:
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
>
> This commit updates the VM flush implementation for the SDMA engine.
>
> - Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the
>   VM_INVALIDATE_ENG0_REQ register value for the specified VMID and flush
>   type. This function ensures that all relevant page table cache levels
>   (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.
>
> - Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new
>   `sdma_v4_4_2_get_invalidate_req` function. The updated function emits the
>   necessary register writes and waits to perform a VM flush for the
>   specified VMID. It updates the PTB address registers and issues a VM
>   invalidation request using the specified VM invalidation engine.
>
> - Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide
>   access to the required register definitions.
>
> v2: vm flush by the vm invalidation packet (Lijo)
> v3: code style and define the macro for the vm invalidation packet (Christian)
> v4: format the definition of the sdma vm invalidate packet (Lijo)
>
> Suggested-by: Lijo Lazar <lijo.lazar@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c      | 77 +++++++++++++++----
>  .../gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h | 54 +++++++++++++
>  2 files changed, 117 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index fd34dc138081..06ce0c98ef5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -31,6 +31,7 @@
>  #include "amdgpu_ucode.h"
>  #include "amdgpu_trace.h"
>  #include "amdgpu_reset.h"
> +#include "gc/gc_9_0_sh_mask.h"
>  
>  #include "sdma/sdma_4_4_2_offset.h"
>  #include "sdma/sdma_4_4_2_sh_mask.h"
> @@ -1292,21 +1293,71 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>  			       seq, 0xffffffff, 4);
>  }
>  
> -
> -/**
> - * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA
> +/*
> + * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ register value
> + * @vmid: The VMID to invalidate
> + * @flush_type: The type of flush (0 = legacy, 1 = lightweight, 2 = heavyweight)
>   *
> - * @ring: amdgpu_ring pointer
> - * @vmid: vmid number to use
> - * @pd_addr: address
> + * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the
> + * specified VMID and flush type. It ensures that all relevant page table
> + * cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated.
> + */
> +static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid,
> +					       uint32_t flush_type)
> +{
> +	u32 req = 0;
> +
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
> +			    PER_VMID_INVALIDATE_REQ, 1 << vmid);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, flush_type);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1);
> +	req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ,
> +			    CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0);
> +
> +	return req;
> +}
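
Aside: a minimal user-space sketch of what this helper computes, assuming the
GC 9.x VM_INVALIDATE_ENG0_REQ bit layout from gc_9_0_sh_mask.h, i.e.
PER_VMID_INVALIDATE_REQ in bits 15:0, FLUSH_TYPE in bits 18:16, and one
invalidate enable per cache level starting at bit 19. The snippet is
illustrative only, not part of the patch; CLEAR_PROTECTION_FAULT_STATUS_ADDR
stays 0 and is therefore omitted.

#include <stdio.h>
#include <stdint.h>

/* Shifts assumed from the GC 9.x VM_INVALIDATE_ENG0_REQ layout */
#define PER_VMID_INVALIDATE_REQ_SHIFT	0	/* bits 15:0 */
#define FLUSH_TYPE_SHIFT		16	/* bits 18:16 */
#define INVALIDATE_L2_PTES_SHIFT	19
#define INVALIDATE_L2_PDE0_SHIFT	20
#define INVALIDATE_L2_PDE1_SHIFT	21
#define INVALIDATE_L2_PDE2_SHIFT	22
#define INVALIDATE_L1_PTES_SHIFT	23

/* Mirrors the value the REG_SET_FIELD() sequence above builds up */
static uint32_t get_invalidate_req(unsigned int vmid, uint32_t flush_type)
{
	return (1u << vmid) << PER_VMID_INVALIDATE_REQ_SHIFT |
	       flush_type << FLUSH_TYPE_SHIFT |
	       1u << INVALIDATE_L2_PTES_SHIFT |
	       1u << INVALIDATE_L2_PDE0_SHIFT |
	       1u << INVALIDATE_L2_PDE1_SHIFT |
	       1u << INVALIDATE_L2_PDE2_SHIFT |
	       1u << INVALIDATE_L1_PTES_SHIFT;
}

int main(void)
{
	/* vmid 1, legacy flush: prints req = 0x00f80002 */
	printf("req = 0x%08x\n", get_invalidate_req(1, 0));
	return 0;
}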
> +
> +/*
> + * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA
> + * @ring: The SDMA ring
> + * @vmid: The VMID to flush
> + * @pd_addr: The page directory address
>   *
> - * Update the page table base and flush the VM TLB
> - * using sDMA.
> + * This function emits the necessary register writes and waits to perform a
> + * VM flush for the specified VMID. It updates the PTB address registers and
> + * issues a VM invalidation request using the specified VM invalidation engine.
>   */
>  static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring,
> -					   unsigned vmid, uint64_t pd_addr)
> +					   unsigned int vmid, uint64_t pd_addr)
>  {
> -	amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
> +	struct amdgpu_device *adev = ring->adev;
> +	uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0);
> +	unsigned int eng = ring->vm_inv_eng;
> +	struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub];
> +
> +	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 +
> +			      (hub->ctx_addr_distance * vmid),
> +			      lower_32_bits(pd_addr));
> +
> +	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 +
> +			      (hub->ctx_addr_distance * vmid),
> +			      upper_32_bits(pd_addr));
> +
> +	/* Construct and emit the VM invalidation packet */
> +	amdgpu_ring_write(ring,
> +			  SDMA_PKT_VM_INVALIDATE_HEADER_OP(SDMA_OP_VM_INVALIDATE) |
> +			  SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(SDMA_SUBOP_VM_INVALIDATE) |
> +			  SDMA_PKT_VM_INVALIDATE_HEADER_XCC0_ENGINE_ID(0x1f) |
> +			  SDMA_PKT_VM_INVALIDATE_HEADER_XCC1_ENGINE_ID(0x1f) |
> +			  SDMA_PKT_VM_INVALIDATE_HEADER_MMHUB_ENGINE_ID(eng));
> +	amdgpu_ring_write(ring, VM_INVALIDATE_REQ_INVALIDATE(req));
> +	amdgpu_ring_write(ring, 0);
> +	amdgpu_ring_write(ring, VM_INVALIDATE_ADDR_RANGE_INVALIDATE_ACK(BIT(vmid)));
>  }
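
For readers new to this packet: the sequence above emits exactly four DWords.
Below is a sketch of the layout implied by the vega10_sdma_pkt_open.h
additions later in this patch; the struct is hypothetical and only for
illustration, the hardware packet is written DWord by DWord as above.

#include <stdint.h>

/* Hypothetical view of the 4-DWord SDMA VM invalidate packet */
struct sdma_pkt_vm_invalidate {
	uint32_t header;		/* DW0: op, sub_op, xcc0/xcc1/mmhub engine ids */
	uint32_t invalidatereq;		/* DW1: the VM_INVALIDATE_ENGx_REQ value to issue */
	uint32_t addressrangelo;	/* DW2: invalidation range base, bits 31:0 */
	uint32_t addressrangehi;	/* DW3: range high bits plus per-VMID ack mask */
};

This four-DWord packet plus the two register writes (3 DWords each on SDMA)
is where the "4 + 2 * 3" in the ring-size hunks below comes from.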
>  
>  static void sdma_v4_4_2_ring_emit_wreg(struct amdgpu_ring *ring,
> @@ -2115,8 +2166,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
>  		3 + /* hdp invalidate */
>  		6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
>  		/* sdma_v4_4_2_ring_emit_vm_flush */
> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		4 + 2 * 3 +
>  		10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
>  	.emit_ib = sdma_v4_4_2_ring_emit_ib,
> @@ -2148,8 +2198,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
>  		3 + /* hdp invalidate */
>  		6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */
>  		/* sdma_v4_4_2_ring_emit_vm_flush */
> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		4 + 2 * 3 +
>  		10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */
>  	.emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */
>  	.emit_ib = sdma_v4_4_2_ring_emit_ib,
> diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h b/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> index 8de4ccce5e38..2da2e2443c87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> +++ b/drivers/gpu/drm/amd/amdgpu/vega10_sdma_pkt_open.h
> @@ -64,6 +64,9 @@
>  #define HEADER_BARRIER 5
>  #define SDMA_OP_AQL_COPY  0
>  #define SDMA_OP_AQL_BARRIER_OR  0
> +/* vm invalidation is only available for GC9.4.3/GC9.4.4/GC9.5.0 */
> +#define SDMA_OP_VM_INVALIDATE 8
> +#define SDMA_SUBOP_VM_INVALIDATE 4
>  
>  /*define for op field*/
>  #define SDMA_PKT_HEADER_op_offset 0
> @@ -3331,5 +3334,56 @@
>  #define SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_shift 0
>  #define SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_COMPLETION_SIGNAL_63_32(x) (((x) & SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_mask) << SDMA_AQL_PKT_BARRIER_OR_COMPLETION_SIGNAL_HI_completion_signal_63_32_shift)
>  
> +/*
> +** Definitions for SDMA_VM_INVALIDATION packet
> +*/
> +
> +/* Define for HEADER word (DW0) */
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_offset 0
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_mask 0x000000FF
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_op_shift 0
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_OP(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_op_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_op_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_offset 8

By consistent format, I meant -

  _offset = DWORD offset
  _mask   = mask of the field
  _shift  = shift required for the field within that DWORD

Besides, all defines take the form

  SDMA_PKT_<packet name>_<dword name>_<field name>_offset
  SDMA_PKT_<packet name>_<dword name>_<field name>_mask
  SDMA_PKT_<packet name>_<dword name>_<field name>_shift

Or, better, approach the design/verif team to generate the header and copy it directly.
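
For illustration, sub_op under that convention would read like the below (the
field lives in DWORD 0, bits 15:8, the same pattern the existing
SDMA_PKT_HEADER_sub_op defines in this header use):

#define SDMA_PKT_VM_INVALIDATE_HEADER_sub_op_offset 0
#define SDMA_PKT_VM_INVALIDATE_HEADER_sub_op_mask   0x000000FF
#define SDMA_PKT_VM_INVALIDATE_HEADER_sub_op_shift  8
#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_sub_op_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_sub_op_shift)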
Thanks,
Lijo

> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_mask 0x000000FF
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_shift 8
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_SUB_OP(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_SUB_op_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_offset 16
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_mask 0x0000001F
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_shift 16
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_XCC0_ENGINE_ID(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_xcc0_engine_id_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_offset 21
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_mask 0x0000003E
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_shift 21
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_XCC1_ENGINE_ID(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_xcc1_engine_id_shift)
> +
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_offset 26
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_mask 0x0000007C
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_shift 26
> +#define SDMA_PKT_VM_INVALIDATE_HEADER_MMHUB_ENGINE_ID(x) (((x) & SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_mask) << SDMA_PKT_VM_INVALIDATE_HEADER_mmhub_engine_id_shift)
> +
> +/* Define for INVALIDATEREQ word (DW1) */
> +#define VM_INVALIDATE_req_invalidate_offset 0
> +#define VM_INVALIDATE_req_invalidate_mask 0xFFFFFFFF
> +#define VM_INVALIDATE_req_invalidate_shift 0
> +#define VM_INVALIDATE_REQ_INVALIDATE(x) (((x) & VM_INVALIDATE_req_invalidate_mask) << VM_INVALIDATE_req_invalidate_shift)
> +
> +/* Define for ADDRESSRANGELO word (DW2) */
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_offset 0
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_mask 0xFFFFFFFF
> +#define VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_shift 0
> +#define VM_INVALIDATE_ADDR_RANGE_LO_ADDR_31_0(x) (((x) & VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_mask) << VM_INVALIDATE_ADDR_RANGE_LO_addr_31_0_shift)
> +
> +/* Define for ADDRESSRANGEHI and INVALIDATEACK word (DW3) */
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_offset 16
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_mask 0x0000001F
> +#define VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_shift 16
> +#define VM_INVALIDATE_ADDR_RANGE_HI_ADDR_64_32(x) (((x) & VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_mask) << VM_INVALIDATE_ADDR_RANGE_HI_addr_64_32_shift)
> +
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_offset 0
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_mask 0x0000FFFF
> +#define VM_INVALIDATE_ADDR_RANGE_invalidate_ack_shift 0
> +#define VM_INVALIDATE_ADDR_RANGE_INVALIDATE_ACK(x) (((x) & VM_INVALIDATE_ADDR_RANGE_invalidate_ack_mask) << VM_INVALIDATE_ADDR_RANGE_invalidate_ack_shift)
>  #endif /* __SDMA_PKT_OPEN_H_ */