Some notes:
- Follows the Microsoft x64 (AMD64) calling convention: the first three
  integer arguments arrive in rcx, rdx and r8, and rsi/rdi are
  callee-saved.
- Assembled with 64-bit MASM (ml64.exe).
- Dropped by the next patches in the series because it turned out to be
  too slow.
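
For reviewers who prefer C to MASM, here is a rough user-space sketch of
what the aligned path does, written with SSE2 intrinsics. It is
illustrative only and not part of the patch: the function name is made up,
the 128-byte unroll is collapsed to 64 bytes, and a plain memcpy() stands
in for the movsd/movsb tail.

    /* Illustrative sketch only; assumes dest and src are 16-byte aligned. */
    #include <emmintrin.h>   /* SSE2: __m128i, _mm_load_si128, _mm_stream_si128 */
    #include <xmmintrin.h>   /* _mm_prefetch, _mm_sfence */
    #include <stddef.h>
    #include <string.h>

    static void stream_copy_aligned(void *dest, const void *src, size_t len)
    {
        char *d = (char *)dest;
        const char *s = (const char *)src;

        while (len >= 64) {
            /* Prefetch the next chunk with a non-temporal hint so the
             * source data passes through with minimal cache pollution. */
            _mm_prefetch(s + 64, _MM_HINT_NTA);

            __m128i x0 = _mm_load_si128((const __m128i *)(s +  0));
            __m128i x1 = _mm_load_si128((const __m128i *)(s + 16));
            __m128i x2 = _mm_load_si128((const __m128i *)(s + 32));
            __m128i x3 = _mm_load_si128((const __m128i *)(s + 48));

            /* Non-temporal stores (movntdq) go to the destination
             * without filling the cache with the copied pixels. */
            _mm_stream_si128((__m128i *)(d +  0), x0);
            _mm_stream_si128((__m128i *)(d + 16), x1);
            _mm_stream_si128((__m128i *)(d + 32), x2);
            _mm_stream_si128((__m128i *)(d + 48), x3);

            s += 64;
            d += 64;
            len -= 64;
        }

        memcpy(d, s, len);   /* small tail, like the movsd/movsb epilogue */

        /* Order the non-temporal stores before any later stores,
         * matching the sfence at the end of the asm routines. */
        _mm_sfence();
    }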
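
On the res.c side, the copies are bracketed with SaveFPU()/RestoreFPU()
because a display driver runs in kernel mode and may not clobber XMM state
behind the kernel's back. A minimal sketch of the call pattern, assuming
FPUSave resolves to a 16-byte-aligned save area (the buffer and the
wrapper function below are hypothetical stand-ins, not driver code):

    #include <stddef.h>

    typedef struct PDev PDev;   /* opaque here */

    void SaveFPU(PDev *pdev, size_t aligned_addr);
    void RestoreFPU(PDev *pdev, size_t aligned_addr);
    void fast_memcpy_aligned(void *dest, const void *src, size_t len);

    static void copy_with_xmm_guard(PDev *pdev, void *dst, const void *src,
                                    size_t len)
    {
        /* Hypothetical 16-byte-aligned save area for xmm0-xmm3; the
         * movdqa in SaveFPU/RestoreFPU faults on unaligned addresses. */
        __declspec(align(16)) static unsigned char xmm_save[4 * 16];

        SaveFPU(pdev, (size_t)xmm_save);      /* stash xmm0-xmm3 */
        fast_memcpy_aligned(dst, src, len);   /* clobbers xmm0-xmm3 */
        RestoreFPU(pdev, (size_t)xmm_save);   /* put them back */
    }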
---
 xddm/display/amd64/x64.asm | 236 +++++++++++++++++++++++++++++++++++++++++++++
 xddm/display/res.c         |  13 +--
 2 files changed, 241 insertions(+), 8 deletions(-)

diff --git a/xddm/display/amd64/x64.asm b/xddm/display/amd64/x64.asm
index 36971d3..bb45d33 100644
--- a/xddm/display/amd64/x64.asm
+++ b/xddm/display/amd64/x64.asm
@@ -11,4 +11,240 @@ CheckAndSetSSE2 proc
     ret
 CheckAndSetSSE2 endp
 
+RestoreFPU proc
+; rcx PDev *pdev
+; rdx size_t aligned_addr (16-byte aligned save area)
+    movdqa xmm0, [rdx]
+    movdqa xmm1, [rdx + 16]
+    movdqa xmm2, [rdx + 32]
+    movdqa xmm3, [rdx + 48]
+    ret
+RestoreFPU endp
+
+SaveFPU proc
+; rcx PDev *pdev
+; rdx size_t aligned_addr (16-byte aligned save area)
+    movdqa [rdx], xmm0
+    movdqa [rdx + 16], xmm1
+    movdqa [rdx + 32], xmm2
+    movdqa [rdx + 48], xmm3
+    ret
+SaveFPU endp
+
+fast_memcpy_aligned proc
+; rcx void *dest (16-byte aligned)
+; rdx const void *src (16-byte aligned)
+; r8 size_t len
+    ; Save rsi and rdi (callee-saved; r9/r10 are free since no calls are made)
+    mov r9, rsi
+    mov r10, rdi
+
+    mov rsi, rdx
+    mov rdi, rcx
+    mov rcx, r8
+
+    cmp rcx, 128
+    jb try_to_copy64
+
+    prefetchnta [rsi]
+copy_128:
+    prefetchnta [rsi + 64]
+
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+    movdqa xmm2, [rsi + 32]
+    movdqa xmm3, [rsi + 48]
+
+    prefetchnta [rsi + 128]
+
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+    movntdq [rdi + 32], xmm2
+    movntdq [rdi + 48], xmm3
+
+    movdqa xmm0, [rsi + 64]
+    movdqa xmm1, [rsi + 80]
+    movdqa xmm2, [rsi + 96]
+    movdqa xmm3, [rsi + 112]
+
+    movntdq [rdi + 64], xmm0
+    movntdq [rdi + 80], xmm1
+    movntdq [rdi + 96], xmm2
+    movntdq [rdi + 112], xmm3
+
+    add rdi, 128
+    add rsi, 128
+    sub rcx, 128
+    cmp rcx, 128
+    jae copy_128
+
+try_to_copy64:
+    cmp rcx, 64
+    jb try_to_copy32
+
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+    movdqa xmm2, [rsi + 32]
+    movdqa xmm3, [rsi + 48]
+
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+    movntdq [rdi + 32], xmm2
+    movntdq [rdi + 48], xmm3
+
+    add rdi, 64
+    add rsi, 64
+    sub rcx, 64
+    prefetchnta [rsi]
+
+try_to_copy32:
+    cmp rcx, 32
+    jb try_to_copy16
+
+    movdqa xmm0, [rsi]
+    movdqa xmm1, [rsi + 16]
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+
+    add rdi, 32
+    add rsi, 32
+    sub rcx, 32
+
+try_to_copy16:
+    cmp rcx, 16
+    jb try_to_copy4
+
+    movdqa xmm0, [rsi]
+    movntdq [rdi], xmm0
+
+    add rdi, 16
+    add rsi, 16
+    sub rcx, 16
+
+try_to_copy4:
+    cmp rcx, 4
+    jb try_to_copy_1
+    movsd
+    sub rcx, 4
+    jmp try_to_copy4
+
+try_to_copy_1:
+    rep movsb
+
+    sfence
+    ; Restore rsi and rdi
+    mov rsi, r9
+    mov rdi, r10
+    ret
+fast_memcpy_aligned endp
+
+fast_memcpy_unaligned proc
+; rcx void *dest (must still be 16-byte aligned, movntdq requires it)
+; rdx const void *src (may be unaligned)
+; r8 size_t len
+    ; Save rsi and rdi (callee-saved; r9/r10 are free since no calls are made)
+    mov r9, rsi
+    mov r10, rdi
+
+    mov rsi, rdx
+    mov rdi, rcx
+    mov rcx, r8
+
+    cmp rcx, 128
+    jb try_to_copy64
+
+    prefetchnta [rsi]
+copy_128:
+    prefetchnta [rsi + 64]
+
+    movdqu xmm0, [rsi]
+    movdqu xmm1, [rsi + 16]
+    movdqu xmm2, [rsi + 32]
+    movdqu xmm3, [rsi + 48]
+
+    prefetchnta [rsi + 128]
+
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+    movntdq [rdi + 32], xmm2
+    movntdq [rdi + 48], xmm3
+
+    movdqu xmm0, [rsi + 64]
+    movdqu xmm1, [rsi + 80]
+    movdqu xmm2, [rsi + 96]
+    movdqu xmm3, [rsi + 112]
+
+    movntdq [rdi + 64], xmm0
+    movntdq [rdi + 80], xmm1
+    movntdq [rdi + 96], xmm2
+    movntdq [rdi + 112], xmm3
+
+    add rdi, 128
+    add rsi, 128
+    sub rcx, 128
+    cmp rcx, 128
+    jae copy_128
+
+try_to_copy64:
+    cmp rcx, 64
+    jb try_to_copy32
+
+    movdqu xmm0, [rsi]
+    movdqu xmm1, [rsi + 16]
+    movdqu xmm2, [rsi + 32]
+    movdqu xmm3, [rsi + 48]
+
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+    movntdq [rdi + 32], xmm2
+    movntdq [rdi + 48], xmm3
+
+    add rdi, 64
+    add rsi, 64
+    sub rcx, 64
+    prefetchnta [rsi]
+
+try_to_copy32:
+    cmp rcx, 32
+    jb try_to_copy16
+
+    movdqu xmm0, [rsi]
+    movdqu xmm1, [rsi + 16]
+    movntdq [rdi], xmm0
+    movntdq [rdi + 16], xmm1
+
+    add rdi, 32
+    add rsi, 32
+    sub rcx, 32
+
+try_to_copy16:
+    cmp rcx, 16
+    jb try_to_copy4
+
+    movdqu xmm0, [rsi]
+    movntdq [rdi], xmm0
+
+    add rdi, 16
+    add rsi, 16
+    sub rcx, 16
+
+try_to_copy4:
+    cmp rcx, 4
+    jb try_to_copy_1
+    movsd
+    sub rcx, 4
+    jmp try_to_copy4
+
+try_to_copy_1:
+    rep movsb
+
+    sfence
+    ; Restore rsi and rdi
+    mov rsi, r9
+    mov rdi, r10
+    ret
+fast_memcpy_unaligned endp
+
 end
\ No newline at end of file
diff --git a/xddm/display/res.c b/xddm/display/res.c
index 9320dd1..60e9bcb 100644
--- a/xddm/display/res.c
+++ b/xddm/display/res.c
@@ -36,6 +36,11 @@
 #include "devioctl.h"
 #include "ntddvdeo.h"
 
+void SaveFPU(PDev *pdev, size_t aligned_addr);
+void RestoreFPU(PDev *pdev, size_t aligned_addr);
+void fast_memcpy_unaligned(void *dest, const void *src, size_t len);
+void fast_memcpy_aligned(void *dest, const void *src, size_t len);
+
 static _inline QXLPHYSICAL PA(PDev *pdev, PVOID virt, UINT8 slot_id)
 {
     PMemSlot *p_slot = &pdev->mem_slots[slot_id];
@@ -1312,7 +1317,6 @@ static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_pt
         NEW_DATA_CHUNK(page_counter, aligned_size);
         cp_size = (int)MIN(end - now, size);
     }
-#ifndef _WIN64
     if (use_sse) {
         offset = (size_t)now & SSE_MASK;
         if (offset) {
@@ -1341,9 +1345,6 @@ static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_pt
     } else {
         RtlCopyMemory(now, src, cp_size);
     }
-#else
-    RtlCopyMemory(now, src, cp_size);
-#endif
     src += cp_size;
     now += cp_size;
     chunk->data_size += cp_size;
@@ -1905,21 +1906,17 @@ static _inline Resource *GetBitmapImage(PDev *pdev, SURFOBJ *surf, XLATEOBJ *col
     dest_end = (UINT8 *)image_res + alloc_size;
     alloc_size = height * line_size;
 
-#ifndef _WIN64
     if (have_sse2 && alloc_size >= 1024) {
         use_sse = TRUE;
         SaveFPU(pdev, FPUSave);
     }
-#endif
 
     for (; src != src_end; src -= surf->lDelta, alloc_size -= line_size) {
         PutBytesAlign(pdev, &chunk, &dest, &dest_end, src, line_size, &pdev->num_bits_pages, alloc_size, line_size, use_sse);
     }
-#ifndef _WIN64
     if (use_sse) {
         RestoreFPU(pdev, FPUSave);
     }
-#endif
 
     GetPallette(pdev, &internal->image.bitmap, color_trans);
     DEBUG_PRINT((pdev, 13, "%s: done\n", __FUNCTION__));
--
1.9.0