Am 27.02.25 um 12:47 schrieb Jesse.zhang@xxxxxxx: > This commit updates the VM flush implementation for the SDMA engine. > > - Added a new function `sdma_v4_4_2_get_invalidate_req` to construct the VM_INVALIDATE_ENG0_REQ > register value for the specified VMID and flush type. This function ensures that all relevant > page table cache levels (L1 PTEs, L2 PTEs, and L2 PDEs) are invalidated. > > - Modified the `sdma_v4_4_2_ring_emit_vm_flush` function to use the new `sdma_v4_4_2_get_invalidate_req` > function. The updated function emits the necessary register writes and waits to perform a VM flush > for the specified VMID. It updates the PTB address registers and issues a VM invalidation request > using the specified VM invalidation engine. > > - Included the necessary header file `gc/gc_9_0_sh_mask.h` to provide access to the required register > definitions. > > v2: vm flush by the vm invalidation packet (Lijo) > > Suggested-by: Lijo Lazar <lijo.lazar@xxxxxxx> > Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 81 ++++++++++++++++++++---- > 1 file changed, 67 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > index ba43c8f46f45..a9e46a4ed7a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > @@ -31,6 +31,7 @@ > #include "amdgpu_ucode.h" > #include "amdgpu_trace.h" > #include "amdgpu_reset.h" > +#include "gc/gc_9_0_sh_mask.h" > > #include "sdma/sdma_4_4_2_offset.h" > #include "sdma/sdma_4_4_2_sh_mask.h" > @@ -1292,21 +1293,75 @@ static void sdma_v4_4_2_ring_emit_pipeline_sync(struct amdgpu_ring *ring) > seq, 0xffffffff, 4); > } > > - > -/** > - * sdma_v4_4_2_ring_emit_vm_flush - vm flush using sDMA > +/* > + * sdma_v4_4_2_get_invalidate_req - Construct the VM_INVALIDATE_ENG0_REQ register value > + * @vmid: The VMID to invalidate > + * @flush_type: The type of flush (0 = legacy, 1 = 
lightweight, 2 = heavyweight) > * > - * @ring: amdgpu_ring pointer > - * @vmid: vmid number to use > - * @pd_addr: address > + * This function constructs the VM_INVALIDATE_ENG0_REQ register value for the specified VMID > + * and flush type. It ensures that all relevant page table cache levels (L1 PTEs, L2 PTEs, and > + * L2 PDEs) are invalidated. > + */ > +static uint32_t sdma_v4_4_2_get_invalidate_req(unsigned int vmid, > + uint32_t flush_type) > +{ > + u32 req = 0; > + > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, > + PER_VMID_INVALIDATE_REQ, 1 << vmid); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, FLUSH_TYPE, flush_type); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PTES, 1); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE0, 1); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE1, 1); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L2_PDE2, 1); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, INVALIDATE_L1_PTES, 1); > + req = REG_SET_FIELD(req, VM_INVALIDATE_ENG0_REQ, > + CLEAR_PROTECTION_FAULT_STATUS_ADDR, 0); > + > + return req; > +} > +/* The VM invalidate packet is only available for GC9.4.3/GC9.4.4/GC9.5.0 */ > +#define SDMA_OP_VM_INVALIDATE 0x8 > +#define SDMA_SUBOP_VM_INVALIDATE 0x4 That needs to be in a header. > + > +/* > + * sdma_v4_4_2_ring_emit_vm_flush - Emit VM flush commands for SDMA > + * @ring: The SDMA ring > + * @vmid: The VMID to flush > + * @pd_addr: The page directory address > * > - * Update the page table base and flush the VM TLB > - * using sDMA. > + * This function emits the necessary register writes and waits to perform a VM flush for the > + * specified VMID. It updates the PTB address registers and issues a VM invalidation request > + * using the specified VM invalidation engine. 
> */ > static void sdma_v4_4_2_ring_emit_vm_flush(struct amdgpu_ring *ring, > - unsigned vmid, uint64_t pd_addr) > + unsigned int vmid, uint64_t pd_addr) > { > - amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr); > + struct amdgpu_device *adev = ring->adev; > + uint32_t req = sdma_v4_4_2_get_invalidate_req(vmid, 0); > + unsigned int eng = ring->vm_inv_eng; > + struct amdgpu_vmhub *hub = &adev->vmhub[ring->vm_hub]; > + > + amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_lo32 + > + (hub->ctx_addr_distance * vmid), > + lower_32_bits(pd_addr)); > + > + amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + > + (hub->ctx_addr_distance * vmid), > + upper_32_bits(pd_addr)); That is unnecessary. > + /* > + * Construct and emit the VM invalidation packet: > + * DW0: OP, Sub OP, Engine IDs (XCC0, XCC1, MMHUB) > + * DW1: Invalidation request > + * DW2: Lower 32 bits of page directory address > + * DW3: Upper 32 bits of page directory address and INVALIDATEACK How are upper bits and invalidateack mixed together here? > + */ > + amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_VM_INVALIDATE) | > + SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_VM_INVALIDATE) | > + (0x1f << 16) | (0x1f << 21) | (eng << 26)); What do those magic numbers mean? > + amdgpu_ring_write(ring, req); > + amdgpu_ring_write(ring, 0x0); > + amdgpu_ring_write(ring, (0x1 << vmid)); Either drop the () and the 0x or even better use the BIT() macro. And it looks like you completely missed the upper and lower bits of the page directory. Christian. 
> } > > static void sdma_v4_4_2_ring_emit_wreg(struct amdgpu_ring *ring, > @@ -2112,8 +2167,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = { > 3 + /* hdp invalidate */ > 6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */ > /* sdma_v4_4_2_ring_emit_vm_flush */ > - SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 + > + 4 + 2 * 3 + > 10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */ > .emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */ > .emit_ib = sdma_v4_4_2_ring_emit_ib, > @@ -2145,8 +2199,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = { > 3 + /* hdp invalidate */ > 6 + /* sdma_v4_4_2_ring_emit_pipeline_sync */ > /* sdma_v4_4_2_ring_emit_vm_flush */ > - SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + > - SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 + > + 4 + 2 * 3 + > 10 + 10 + 10, /* sdma_v4_4_2_ring_emit_fence x3 for user fence, vm fence */ > .emit_ib_size = 7 + 6, /* sdma_v4_4_2_ring_emit_ib */ > + .emit_ib = sdma_v4_4_2_ring_emit_ib,