hello, I am trying to package a program for Debian, http://tango-controls.org . Using the Debian tools I found a problem of non-PIC code in the libtango library. I attached the result of a scanelf -qT; as you can see there is a problem with, for example, the _128mm variable. To show what I mean I also attach the offending files. For example, in jpeg_color_mmx.cpp: "psubw mm2,_128mm \n" "psubw mm3,_128mm \n" I tried to replace this by "psubw mm2, %0 \n" "psubw mm3, %0 \n" . . . : /* no output */ : "rm"(_128mm), ... but I get this sort of error: jpeg_color_mmx.cpp: Assembler messages: jpeg_color_mmx.cpp:487: Error: junk `(%ebx)' after expression jpeg_color_mmx.cpp:488: Error: junk `(%ebx)' after expression So how must I modify these files to avoid this non-PIC problem? Thanks for your help, Frederic ps: The attached color_mmx file already contains my modification _128mm -> %0. I tried the same with the other _rxxx variables, but the only result was more junk `(%ebx)' errors. ps2: I already checked with the upstream author, but he was not really aware of this PIC/non-PIC issue. -- GPG public key 1024D/A59B1171 2009-08-11 fingerprint = 1688 A3D6 F0BD E4DF 2E6B 06AA B6A9 BA6A A59B 1171 uid Picca Frédéric-Emmanuel <picca@xxxxxxxxxxxxxxxxxxxxx>
Attachment:
scanelf_output
Description: Binary data
///============================================================================= // // file : jpeg_color_mmx.cpp // // description : Simple jpeg coding/decoding library // Color space conversion (MMX routines) // // project : TANGO // // author(s) : JL Pons // // Copyright (C) : 2004,2005,2006,2007,2008,2009 // European Synchrotron Radiation Facility // BP 220, Grenoble 38043 // FRANCE // // This file is part of Tango. // // Tango is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // Tango is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with Tango. If not, see <http://www.gnu.org/licenses/>. // // $Revision: 1.6 $ // // $Log: jpeg_color_mmx.cpp,v $ // Revision 1.6 2009/04/20 14:55:58 jlpons // Added GPL header, changed memory allocation to C++ fashion. 
// //============================================================================= //------------------------------------------------------------------------------ // // (YCbCr to RGB) Y,Cb and Cr range in [0,255] // R = Y + 1.402 * (Cr-128) // G = Y - 0.34414 * (Cb-128) - 0.71414*(Cr-128) // B = Y + 1.772 * (Cb-128) // // (RGB to YCbCr) R,G,B range in [0,255] // Y = 0.299 * R + 0.587 * G + 0.114 * B // Cb = -0.16874 * R - 0.33126 * G + 0.5 * B + 128 // Cr = 0.5 * R - 0.41869 * G - 0.08131 * B + 128 // //------------------------------------------------------------------------------ #include "jpeg_memory.h" #include "jpeg_lib.h" #include <string.h> #ifdef JPG_USE_ASM #ifdef _WINDOWS // Disable "has no EMMS instruction" warning #pragma warning( disable : 4799 ) #endif #define FIX14(x) ((short) ((x) * (1L<<14) + 0.5)) #define FIX15(x) ((short) ((x) * (1L<<15) + 0.5)) #define FIX16(x) ((short) ((x) * (1L<<16) + 0.5)) #ifdef _WINDOWS // Visual C++ align directive #define ALIGN8 __declspec(align(8)) #else // gcc align directive #define ALIGN8 __attribute__ ((aligned (8))) #endif // MMX constant (YCbCr to RGB) ALIGN8 short _cbgmm[] = { -FIX14(0.34414),-FIX14(0.34414),-FIX14(0.34414),-FIX14(0.34414) }; ALIGN8 short _cbbmm[] = { FIX14(1.772) , FIX14(1.772) , FIX14(1.772) , FIX14(1.772) }; ALIGN8 short _crgmm[] = { -FIX14(0.71414),-FIX14(0.71414),-FIX14(0.71414),-FIX14(0.71414) }; ALIGN8 short _crrmm[] = { FIX14(1.402) , FIX14(1.402) , FIX14(1.402) , FIX14(1.402) }; ALIGN8 short _128mm[] = { 128,128,128,128 }; // RGB to YCbCr ALIGN8 short _rymm[] = { FIX15(0.299), FIX15(0.299), FIX15(0.299), FIX15(0.299) }; ALIGN8 short _gymm[] = { FIX15(0.587), FIX15(0.587), FIX15(0.587), FIX15(0.587) }; ALIGN8 short _bymm[] = { FIX15(0.114), FIX15(0.114), FIX15(0.114), FIX15(0.114) }; ALIGN8 short _offymm[] = { -127,-127,-127,-127 }; // +1 for rounding ALIGN8 short _rcbcrmm[] = {-FIX15(0.16874),-FIX15(0.16874), FIX15(0.5) , FIX15(0.5) }; ALIGN8 short _gcbcrmm[] = 
{-FIX15(0.33126),-FIX15(0.33126),-FIX15(0.41869),-FIX15(0.41869)}; ALIGN8 short _bcbcrmm[] = { FIX15(0.5) , FIX15(0.5) ,-FIX15(0.08131),-FIX15(0.08131)}; ALIGN8 short _rcmm[] = { 1,1,1,1 }; //------------------------------------------------------------------------------ // MCU16x16 YCbCr H2V2 (2x2:1:1, 6 blocks per MCU) to 32-bit RGB //------------------------------------------------------------------------------ void jpeg_yh2v2_to_rgb32_mmx(unsigned char *block,long width,unsigned char *rgb) { unsigned char *y = block; unsigned char *cb = block+64*4; long j,y0; // ! Due to wrong gcc stack code (does not detect push), y0 is cleared during asm ! for(j=0;j<8;j++) { y0 = ((j&4)<<5) + ((j&3)<<4); #ifdef _WINDOWS // Visual C++ inline assembly _asm { mov edi,rgb mov esi,y mov eax,width mov ebx,cb mov edx,y0 xor ecx,ecx shl eax,2 __blrow_h2v2: // Y block offset add edx,ecx // -- 00 -- movd mm1,[esi+edx] // [0000][y11][y10][y01][y00] movd mm2,[ebx] // [0000][cb3][cb2][cb1][cb0] movd mm3,[ebx+64] // [0000][cr3][cr2][cr1][cr0] pxor mm0,mm0 punpcklbw mm1,mm0 // [y11][y10][y01][y00] punpcklbw mm2,mm0 // [cb3][cb2][cb1][cb0] punpcklbw mm3,mm0 // [cr3][cr2][cr1][cr0] psubw mm2,_128mm psubw mm3,_128mm psllw mm2,2 psllw mm3,2 movq mm4,mm2 movq mm5,mm3 pmulhw mm2,_cbgmm pmulhw mm4,_cbbmm pmulhw mm3,_crgmm pmulhw mm5,_crrmm movq mm6,mm5 punpcklwd mm6,mm6 // [cr1*crr][cr1*crr][cr0*crr][cr0*crr] paddw mm6,mm1 // R3R2R1R0 movq mm0,mm2 movq mm7,mm3 punpcklwd mm0,mm0 // [cb1*cbg][cb1*cbg][cb0*cbg][cb0*cbg] punpcklwd mm7,mm7 // [cr1*crg][cr1*crg][cr0*crg][cr0*crg] paddw mm0,mm1 paddw mm0,mm7 // G3G2G1G0 movq mm7,mm4 punpcklwd mm7,mm7 // [cb1*cbb][cb1*cbb][cb0*cbb][cb0*cbb] paddw mm7,mm1 // B3B2B1B0 pxor mm1,mm1 packuswb mm6,mm1 // [0000]R3R2R1R0 packuswb mm0,mm1 // [0000]G3G2G1G0 packuswb mm7,mm1 // [0000]B3B2B1B0 punpcklbw mm6,mm0 // G3R3G2R2G1R1G0R0 punpcklbw mm7,mm1 // 00B300B200B100B0 movq mm1,mm6 punpcklwd mm1,mm7 // 00B1G1R100B0G0R0 punpckhwd mm6,mm7 // 00B3G3R300B2G2R2 movq [edi] 
,mm1 movq [edi+8],mm6 // -- 01 -- movd mm1,[esi+edx+4] // [0000][y31][y30][y21][y20] pxor mm0,mm0 punpcklbw mm1,mm0 movq mm6,mm5 punpckhwd mm6,mm6 // [cr3*crr][cr3*crr][cr2*crr][cr2*crr] paddw mm6,mm1 // R3R2R1R0 movq mm0,mm2 movq mm7,mm3 punpckhwd mm0,mm0 // [cb3*cbg][cb3*cbg][cb2*cbg][cb2*cbg] punpckhwd mm7,mm7 // [cr3*crg][cr3*crg][cr2*crg][cr2*crg] paddw mm0,mm1 paddw mm0,mm7 // G3G2G1G0 movq mm7,mm4 punpckhwd mm7,mm7 // [cb3*cbb][cb3*cbb][cb2*cbb][cb2*cbb] paddw mm7,mm1 // B3B2B1B0 pxor mm1,mm1 packuswb mm6,mm1 // [0000]R3R2R1R0 packuswb mm0,mm1 // [0000]G3G2G1G0 packuswb mm7,mm1 // [0000]B3B2B1B0 punpcklbw mm6,mm0 // G3R3G2R2G1R1G0R0 punpcklbw mm7,mm1 // 00B300B200B100B0 movq mm1,mm6 punpcklwd mm1,mm7 // 00B1G1R100B0G0R0 punpckhwd mm6,mm7 // 00B3G3R300B2G2R2 movq [edi+16],mm1 movq [edi+24],mm6 // -- 10 -- movd mm1,[esi+edx+8] // [0000][y11][y10][y01][y00] pxor mm0,mm0 punpcklbw mm1,mm0 movq mm6,mm5 punpcklwd mm6,mm6 // [cr1*crr][cr1*crr][cr0*crr][cr0*crr] paddw mm6,mm1 // R3R2R1R0 movq mm0,mm2 movq mm7,mm3 punpcklwd mm0,mm0 // [cb1*cbg][cb1*cbg][cb0*cbg][cb0*cbg] punpcklwd mm7,mm7 // [cr1*crg][cr1*crg][cr0*crg][cr0*crg] paddw mm0,mm1 paddw mm0,mm7 // G3G2G1G0 movq mm7,mm4 punpcklwd mm7,mm7 // [cb1*cbb][cb1*cbb][cb0*cbb][cb0*cbb] paddw mm7,mm1 // B3B2B1B0 pxor mm1,mm1 packuswb mm6,mm1 // [0000]R3R2R1R0 packuswb mm0,mm1 // [0000]G3G2G1G0 packuswb mm7,mm1 // [0000]B3B2B1B0 punpcklbw mm6,mm0 // G3R3G2R2G1R1G0R0 punpcklbw mm7,mm1 // 00B300B200B100B0 movq mm1,mm6 punpcklwd mm1,mm7 // 00B1G1R100B0G0R0 punpckhwd mm6,mm7 // 00B3G3R300B2G2R2 movq [edi+eax] ,mm1 movq [edi+eax+8],mm6 // -- 11 -- movd mm1,[esi+edx+12] // [0000][y31][y30][y21][y20] pxor mm0,mm0 punpcklbw mm1,mm0 movq mm6,mm5 punpckhwd mm6,mm6 // [cr3*crr][cr3*crr][cr2*crr][cr2*crr] paddw mm6,mm1 // R3R2R1R0 movq mm0,mm2 movq mm7,mm3 punpckhwd mm0,mm0 // [cb3*cbg][cb3*cbg][cb2*cbg][cb2*cbg] punpckhwd mm7,mm7 // [cr3*crg][cr3*crg][cr2*crg][cr2*crg] paddw mm0,mm1 paddw mm0,mm7 // G3G2G1G0 movq mm7,mm4 
punpckhwd mm7,mm7 // [cb3*cbb][cb3*cbb][cb2*cbb][cb2*cbb] paddw mm7,mm1 // B3B2B1B0 pxor mm1,mm1 packuswb mm6,mm1 // [0000]R3R2R1R0 packuswb mm0,mm1 // [0000]G3G2G1G0 packuswb mm7,mm1 // [0000]B3B2B1B0 punpcklbw mm6,mm0 // G3R3G2R2G1R1G0R0 punpcklbw mm7,mm1 // 00B300B200B100B0 movq mm1,mm6 punpcklwd mm1,mm7 // 00B1G1R100B0G0R0 punpckhwd mm6,mm7 // 00B3G3R300B2G2R2 movq [edi+eax+16],mm1 movq [edi+eax+24],mm6 sub edx,ecx // Restore edx add edi,32 add ebx,4 add ecx,64 cmp ecx,128 jl __blrow_h2v2 } #else // GCC inline assembly code __asm__ ( ".intel_syntax noprefix \n" #ifdef _64BITS "push rbx \n" "mov rbx,rcx \n" "xor rcx,rcx \n" "shl rax,2 \n" "__blrow_h2v2: \n" "add rdx,rcx \n" "movd mm1,[rsi+rdx] \n" "movd mm2,[rbx] \n" "movd mm3,[rbx+64] \n" #else "push ebx \n" "mov ebx,ecx \n" "xor ecx,ecx \n" "shl eax,2 \n" "__blrow_h2v2: \n" "add edx,ecx \n" "movd mm1,[esi+edx] \n" "movd mm2,[ebx] \n" "movd mm3,[ebx+64] \n" #endif "pxor mm0,mm0 \n" "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "psubw mm2,%0 \n" "psubw mm3,%0 \n" "psllw mm2,2 \n" "psllw mm3,2 \n" "movq mm4,mm2 \n" "movq mm5,mm3 \n" "pmulhw mm2,_cbgmm \n" "pmulhw mm4,_cbbmm \n" "pmulhw mm3,_crgmm \n" "pmulhw mm5,_crrmm \n" "movq mm6,mm5 \n" "punpcklwd mm6,mm6 \n" "paddw mm6,mm1 \n" "movq mm0,mm2 \n" "movq mm7,mm3 \n" "punpcklwd mm0,mm0 \n" "punpcklwd mm7,mm7 \n" "paddw mm0,mm1 \n" "paddw mm0,mm7 \n" "movq mm7,mm4 \n" "punpcklwd mm7,mm7 \n" "paddw mm7,mm1 \n" "pxor mm1,mm1 \n" "packuswb mm6,mm1 \n" "packuswb mm0,mm1 \n" "packuswb mm7,mm1 \n" "punpcklbw mm6,mm0 \n" "punpcklbw mm7,mm1 \n" "movq mm1,mm6 \n" "punpcklwd mm1,mm7 \n" "punpckhwd mm6,mm7 \n" #ifdef _64BITS "movq [rdi] ,mm1 \n" "movq [rdi+8],mm6 \n" "movd mm1,[rsi+rdx+4] \n" #else "movq [edi] ,mm1 \n" "movq [edi+8],mm6 \n" "movd mm1,[esi+edx+4] \n" #endif "pxor mm0,mm0 \n" "punpcklbw mm1,mm0 \n" "movq mm6,mm5 \n" "punpckhwd mm6,mm6 \n" "paddw mm6,mm1 \n" "movq mm0,mm2 \n" "movq mm7,mm3 \n" "punpckhwd mm0,mm0 \n" "punpckhwd mm7,mm7 \n" 
"paddw mm0,mm1 \n" "paddw mm0,mm7 \n" "movq mm7,mm4 \n" "punpckhwd mm7,mm7 \n" "paddw mm7,mm1 \n" "pxor mm1,mm1 \n" "packuswb mm6,mm1 \n" "packuswb mm0,mm1 \n" "packuswb mm7,mm1 \n" "punpcklbw mm6,mm0 \n" "punpcklbw mm7,mm1 \n" "movq mm1,mm6 \n" "punpcklwd mm1,mm7 \n" "punpckhwd mm6,mm7 \n" #ifdef _64BITS "movq [rdi+16],mm1 \n" "movq [rdi+24],mm6 \n" "movd mm1,[rsi+rdx+8] \n" #else "movq [edi+16],mm1 \n" "movq [edi+24],mm6 \n" "movd mm1,[esi+edx+8] \n" #endif "pxor mm0,mm0 \n" "punpcklbw mm1,mm0 \n" "movq mm6,mm5 \n" "punpcklwd mm6,mm6 \n" "paddw mm6,mm1 \n" "movq mm0,mm2 \n" "movq mm7,mm3 \n" "punpcklwd mm0,mm0 \n" "punpcklwd mm7,mm7 \n" "paddw mm0,mm1 \n" "paddw mm0,mm7 \n" "movq mm7,mm4 \n" "punpcklwd mm7,mm7 \n" "paddw mm7,mm1 \n" "pxor mm1,mm1 \n" "packuswb mm6,mm1 \n" "packuswb mm0,mm1 \n" "packuswb mm7,mm1 \n" "punpcklbw mm6,mm0 \n" "punpcklbw mm7,mm1 \n" "movq mm1,mm6 \n" "punpcklwd mm1,mm7 \n" "punpckhwd mm6,mm7 \n" #ifdef _64BITS "movq [rdi+rax] ,mm1 \n" "movq [rdi+rax+8],mm6 \n" "movd mm1,[rsi+rdx+12] \n" #else "movq [edi+eax] ,mm1 \n" "movq [edi+eax+8],mm6 \n" "movd mm1,[esi+edx+12] \n" #endif "pxor mm0,mm0 \n" "punpcklbw mm1,mm0 \n" "movq mm6,mm5 \n" "punpckhwd mm6,mm6 \n" "paddw mm6,mm1 \n" "movq mm0,mm2 \n" "movq mm7,mm3 \n" "punpckhwd mm0,mm0 \n" "punpckhwd mm7,mm7 \n" "paddw mm0,mm1 \n" "paddw mm0,mm7 \n" "movq mm7,mm4 \n" "punpckhwd mm7,mm7 \n" "paddw mm7,mm1 \n" "pxor mm1,mm1 \n" "packuswb mm6,mm1 \n" "packuswb mm0,mm1 \n" "packuswb mm7,mm1 \n" "punpcklbw mm6,mm0 \n" "punpcklbw mm7,mm1 \n" "movq mm1,mm6 \n" "punpcklwd mm1,mm7 \n" "punpckhwd mm6,mm7 \n" #ifdef _64BITS "movq [rdi+rax+16],mm1 \n" "movq [rdi+rax+24],mm6 \n" "sub rdx,rcx \n" "add rdi,32 \n" "add rbx,4 \n" "add rcx,64 \n" "cmp rcx,128 \n" "jl __blrow_h2v2 \n" "pop rbx \n" #else "movq [edi+eax+16],mm1 \n" "movq [edi+eax+24],mm6 \n" "sub edx,ecx \n" "add edi,32 \n" "add ebx,4 \n" "add ecx,64 \n" "cmp ecx,128 \n" "jl __blrow_h2v2 \n" "pop ebx \n" #endif ".att_syntax \n" : /* no output */ 
: "rm"(_128mm), "D"(rgb),"S"(y),"a"(width),"c"(cb),"d"(y0) : "memory","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); #endif cb += 8; rgb += 8*width; } } //------------------------------------------------------------------------------ // MCU8x8 YCbCr H1V1 (1x1:1:1, 3 blocks per MCU) to 32-bit RGB //------------------------------------------------------------------------------ void jpeg_yh1v1_to_rgb32_mmx(unsigned char *block,long width,unsigned char *rgb) { #ifdef _WINDOWS // Visual C++ inline assembly _asm { mov edi,rgb mov esi,block mov eax,width shl eax,2 mov ecx,8 pxor mm0,mm0 __blcol_h1v1: // ----- movd mm1,[esi] // [0000][y03][y02][y01][y00] movd mm2,[esi+64] // [0000][cb3][cb2][cb1][cb0] movd mm3,[esi+128] // [0000][cr3][cr2][cr1][cr0] punpcklbw mm1,mm0 // [y03][y02][y01][y00] punpcklbw mm2,mm0 // [cb3][cb2][cb1][cb0] punpcklbw mm3,mm0 // [cr3][cr2][cr1][cr0] psubw mm2,_128mm psubw mm3,_128mm psllw mm2,2 psllw mm3,2 movq mm4,mm2 movq mm5,mm3 pmulhw mm2,_cbgmm // [cb3*cbg][cb2*cbg][cb1*cbg][cb0*cbg] pmulhw mm4,_cbbmm // [cb3*cbb][cb2*cbb][cb1*cbb][cb0*cbb] pmulhw mm3,_crgmm // [cb3*crg][cb2*crg][cb1*crg][cb0*crg] pmulhw mm5,_crrmm // [cb3*crr][cb2*crr][cb1*crr][cb0*crr] paddw mm5,mm1 // R3R2R1R0 paddw mm2,mm1 paddw mm2,mm3 // G3G2G1G0 paddw mm4,mm1 // B3B2B1B0 packuswb mm5,mm0 // [0000]R3R2R1R0 packuswb mm2,mm0 // [0000]G3G2G1G0 packuswb mm4,mm0 // [0000]B3B2B1B0 punpcklbw mm5,mm2 // G3R3G2R2G1R1G0R0 punpcklbw mm4,mm0 // 00B300B200B100B0 movq mm1,mm5 punpcklwd mm1,mm4 // 00B1G1R100B0G0R0 punpckhwd mm5,mm4 // 00B3G3R300B2G2R2 movq [edi],mm1 movq [edi+8],mm5 // ----- movd mm1,[esi+4] // [0000][y03][y02][y01][y00] movd mm2,[esi+68] // [0000][cb3][cb2][cb1][cb0] movd mm3,[esi+132] // [0000][cr3][cr2][cr1][cr0] punpcklbw mm1,mm0 // [y03][y02][y01][y00] punpcklbw mm2,mm0 // [cb3][cb2][cb1][cb0] punpcklbw mm3,mm0 // [cr3][cr2][cr1][cr0] psubw mm2,_128mm psubw mm3,_128mm psllw mm2,2 psllw mm3,2 movq mm4,mm2 movq mm5,mm3 pmulhw mm2,_cbgmm // 
[cb3*cbg][cb2*cbg][cb1*cbg][cb0*cbg] pmulhw mm4,_cbbmm // [cb3*cbb][cb2*cbb][cb1*cbb][cb0*cbb] pmulhw mm3,_crgmm // [cb3*crg][cb2*crg][cb1*crg][cb0*crg] pmulhw mm5,_crrmm // [cb3*crr][cb2*crr][cb1*crr][cb0*crr] paddw mm5,mm1 // R3R2R1R0 paddw mm2,mm1 paddw mm2,mm3 // G3G2G1G0 paddw mm4,mm1 // B3B2B1B0 packuswb mm5,mm0 // [0000]R3R2R1R0 packuswb mm2,mm0 // [0000]G3G2G1G0 packuswb mm4,mm0 // [0000]B3B2B1B0 punpcklbw mm5,mm2 // G3R3G2R2G1R1G0R0 punpcklbw mm4,mm0 // 00B300B200B100B0 movq mm1,mm5 punpcklwd mm1,mm4 // 00B1G1R100B0G0R0 punpckhwd mm5,mm4 // 00B3G3R300B2G2R2 movq [edi+16],mm1 movq [edi+24],mm5 add esi,8 add edi,eax dec ecx jnz __blcol_h1v1 } #else // GCC inline assembly code __asm__ ( ".intel_syntax noprefix \n" #ifdef _64BITS "shl rax,2 \n" "mov rcx,8 \n" "pxor mm0,mm0 \n" "__blcol_h1v1: \n" "movd mm1,[rsi] \n" "movd mm2,[rsi+64] \n" "movd mm3,[rsi+128] \n" #else "shl eax,2 \n" "mov ecx,8 \n" "pxor mm0,mm0 \n" "__blcol_h1v1: \n" "movd mm1,[esi] \n" "movd mm2,[esi+64] \n" "movd mm3,[esi+128] \n" #endif "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "psubw mm2,_128mm \n" "psubw mm3,_128mm \n" "psllw mm2,2 \n" "psllw mm3,2 \n" "movq mm4,mm2 \n" "movq mm5,mm3 \n" "pmulhw mm2,_cbgmm \n" "pmulhw mm4,_cbbmm \n" "pmulhw mm3,_crgmm \n" "pmulhw mm5,_crrmm \n" "paddw mm5,mm1 \n" "paddw mm2,mm1 \n" "paddw mm2,mm3 \n" "paddw mm4,mm1 \n" "packuswb mm5,mm0 \n" "packuswb mm2,mm0 \n" "packuswb mm4,mm0 \n" "punpcklbw mm5,mm2 \n" "punpcklbw mm4,mm0 \n" "movq mm1,mm5 \n" "punpcklwd mm1,mm4 \n" "punpckhwd mm5,mm4 \n" #ifdef _64BITS "movq [rdi],mm1 \n" "movq [rdi+8],mm5 \n" "movd mm1,[rsi+4] \n" "movd mm2,[rsi+68] \n" "movd mm3,[rsi+132] \n" #else "movq [edi],mm1 \n" "movq [edi+8],mm5 \n" "movd mm1,[esi+4] \n" "movd mm2,[esi+68] \n" "movd mm3,[esi+132] \n" #endif "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "psubw mm2,_128mm \n" "psubw mm3,_128mm \n" "psllw mm2,2 \n" "psllw mm3,2 \n" "movq mm4,mm2 \n" "movq mm5,mm3 \n" "pmulhw 
mm2,_cbgmm \n" "pmulhw mm4,_cbbmm \n" "pmulhw mm3,_crgmm \n" "pmulhw mm5,_crrmm \n" "paddw mm5,mm1 \n" "paddw mm2,mm1 \n" "paddw mm2,mm3 \n" "paddw mm4,mm1 \n" "packuswb mm5,mm0 \n" "packuswb mm2,mm0 \n" "packuswb mm4,mm0 \n" "punpcklbw mm5,mm2 \n" "punpcklbw mm4,mm0 \n" "movq mm1,mm5 \n" "punpcklwd mm1,mm4 \n" "punpckhwd mm5,mm4 \n" #ifdef _64BITS "movq [rdi+16],mm1 \n" "movq [rdi+24],mm5 \n" "add rsi,8 \n" "add rdi,rax \n" "dec rcx \n" "jnz __blcol_h1v1 \n" #else "movq [edi+16],mm1 \n" "movq [edi+24],mm5 \n" "add esi,8 \n" "add edi,eax \n" "dec ecx \n" "jnz __blcol_h1v1 \n" #endif ".att_syntax \n" : /* no output */ : "D"(rgb),"S"(block),"a"(width) #ifdef _64BITS : "memory","rcx","mm0","mm1","mm2","mm3","mm4","mm5" #else : "memory","ecx","mm0","mm1","mm2","mm3","mm4","mm5" #endif ); #endif } // Convert 8x8 GRAY8 pixel map to (1xY) block void conv_block_GRAY8Y_mmx(long width,unsigned char *g,short *y) { #ifdef _WINDOWS // Visual C++ inline assembly _asm { mov esi,g mov edi,y mov eax,width mov ecx,8 pxor mm0,mm0 __blrow_gray8: movd mm1,[esi] movd mm2,[esi+4] punpcklbw mm1,mm0 punpcklbw mm2,mm0 psubw mm1,_128mm psubw mm2,_128mm movq [edi] ,mm1 movq [edi+8],mm2 add esi,eax add edi,16 dec ecx jnz __blrow_gray8 } #else // GCC inline assembly code __asm__ ( ".intel_syntax noprefix \n" #ifdef _64BITS "mov rcx,8 \n" "pxor mm0,mm0 \n" "__blrow_gray8: \n" "movd mm1,[rsi] \n" "movd mm2,[rsi+4] \n" "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "psubw mm1,_128mm \n" "psubw mm2,_128mm \n" "movq [rdi] ,mm1 \n" "movq [rdi+8],mm2 \n" "add rsi,rax \n" "add rdi,16 \n" "dec rcx \n" "jnz __blrow_gray8 \n" ".att_syntax \n" : /* no output */ : "D"(y),"S"(g),"a"(width) : "memory","rcx","mm0","mm1","mm2" #else "mov ecx,8 \n" "pxor mm0,mm0 \n" "__blrow_gray8: \n" "movd mm1,[esi] \n" "movd mm2,[esi+4] \n" "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "psubw mm1,_128mm \n" "psubw mm2,_128mm \n" "movq [edi] ,mm1 \n" "movq [edi+8],mm2 \n" "add esi,eax \n" "add edi,16 \n" "dec ecx \n" "jnz 
__blrow_gray8 \n" ".att_syntax \n" : /* no output */ : "D"(y),"S"(g),"a"(width) : "memory","ecx","mm0","mm1","mm2" #endif ); #endif } // Convert 16x16 RGB24 pixel map to (4xY 1xCb 1xCr) block void conv_block_RGB24H2V2_mmx(long width,unsigned char *rgb,short *y,short *cb,short *cr) { long i,j,y0,pitch; short *yB; // ! Due to wrong gcc stack code (does not detect push), yB is cleared during asm ! pitch = 6*width-48; for(j=0;j<8;j++) { y0 = (((j&4)<<5) + ((j&3)<<4)); for(i=0;i<4;i++) { yB = y + (y0 + (((i&2)<<5) + ((i&1)<<2))); #ifdef _WINDOWS // Visual C++ inline assembly _asm { mov esi,rgb mov edi,yB mov eax,width mov ebx,cb mov edx,cr mov ecx,eax shl eax,1 add eax,ecx pxor mm0,mm0 // Y 1st row movd mm1,[esi] // [0000]xxB0G0R0 movd mm3,[esi+6] // [0000]xxB2G2R2 movd mm2,[esi+3] // [0000]xxB1G1R1 movd mm4,[esi+8] // [0000]B3G3R3xx psrlq mm4,8 // [0000]xxB3G3R3 punpcklbw mm1,mm0 // xxB0G0R0 punpcklbw mm2,mm0 // xxB1G1R1 punpcklbw mm3,mm0 // xxB2G2R2 punpcklbw mm4,mm0 // xxB3G3R3 movq mm6,mm3 movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 punpcklwd mm3,mm4 // G3G2R3R2 movq mm5,mm1 punpckldq mm1,mm3 // R3R2R1R0 punpckhdq mm5,mm3 // G3G2G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckhwd mm6,mm4 // xxxxB3B2 punpckldq mm7,mm6 // B3B2B1B0 psllw mm1,1 psllw mm5,1 psllw mm7,1 pmulhw mm1,_rymm pmulhw mm5,_gymm pmulhw mm7,_bymm paddw mm1,mm5 paddw mm1,mm7 // Y3Y2Y1Y0 paddw mm1,_offymm movq [edi],mm1 // 2nd row movd mm1,[esi+eax] // [0000]xxB0G0R0 movd mm3,[esi+eax+6] // [0000]xxB2G2R2 movd mm2,[esi+eax+3] // [0000]xxB1G1R1 movd mm4,[esi+eax+8] // [0000]B3G3R3xx psrlq mm4,8 // [0000]xxB3G3R3 punpcklbw mm1,mm0 // xxB0G0R0 punpcklbw mm2,mm0 // xxB1G1R1 punpcklbw mm3,mm0 // xxB2G2R2 punpcklbw mm4,mm0 // xxB3G3R3 movq mm6,mm3 movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 punpcklwd mm3,mm4 // G3G2R3R2 movq mm5,mm1 punpckldq mm1,mm3 // R3R2R1R0 punpckhdq mm5,mm3 // G3G2G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckhwd mm6,mm4 // xxxxB3B2 punpckldq mm7,mm6 // B3B2B1B0 psllw mm1,1 psllw mm5,1 psllw mm7,1 
pmulhw mm1,_rymm pmulhw mm5,_gymm pmulhw mm7,_bymm paddw mm1,mm5 paddw mm1,mm7 // Y3Y2Y1Y0 paddw mm1,_offymm movq [edi+16],mm1 // CbCr (2x downsampling) movd mm1,[esi] // [0000]xxB00G00R00 movd mm3,[esi+eax] // [0000]xxB01G01R01 movd mm2,[esi+3] // [0000]xxB10G10R10 movd mm4,[esi+eax+3] // [0000]xxB11G11R11 punpcklbw mm1,mm0 punpcklbw mm2,mm0 punpcklbw mm3,mm0 punpcklbw mm4,mm0 paddw mm1,mm2 // xx[B00+B10][G00+G10][R00+R10] paddw mm3,mm4 // xx[B01+B11][G01+G11][R01+R11] paddw mm1,mm3 psrlw mm1,1 // xx[B0][G0][R0] movd mm2,[esi+6] // [0000]xxB00G00R00 movd mm4,[esi+eax+6] // [0000]B01G01R01xx movd mm3,[esi+8] // [0000]xxB10G10R10 movd mm5,[esi+eax+8] // [0000]B11G11R11xx psrlq mm3,8 // [0000]xxB01G01R01 psrlq mm5,8 // [0000]xxB11G11R11 punpcklbw mm2,mm0 punpcklbw mm3,mm0 punpcklbw mm4,mm0 punpcklbw mm5,mm0 paddw mm2,mm3 // xx[B00+B10][G00+G10][R00+R10] paddw mm4,mm5 // xx[B01+B11][G01+G11][R01+R11] paddw mm2,mm4 psrlw mm2,1 // xx[B1][G1][R1] movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 movq mm5,mm1 punpckldq mm1,mm1 // R1R0R1R0 punpckhdq mm5,mm5 // G1G0G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckldq mm7,mm7 // B1B0B1B0 pmulhw mm1,_rcbcrmm pmulhw mm5,_gcbcrmm pmulhw mm7,_bcbcrmm paddw mm1,mm5 paddw mm1,mm7 // cb1cb0cr1cr0 paddw mm1,_rcmm movd [ebx],mm1 psrlq mm1,32 movd [edx],mm1 } // end asm #else // GCC inline assembly code __asm__ ( ".intel_syntax noprefix \n" #ifdef _64BITS "push rbx \n" "mov rbx,rcx \n" "mov rcx,rax \n" "shl rax,1 \n" "add rax,rcx \n" "pxor mm0,mm0 \n" "movd mm1,[rsi] \n" "movd mm3,[rsi+6] \n" "movd mm2,[rsi+3] \n" "movd mm4,[rsi+8] \n" #else "push ebx \n" "mov ebx,ecx \n" "mov ecx,eax \n" "shl eax,1 \n" "add eax,ecx \n" "pxor mm0,mm0 \n" "movd mm1,[esi] \n" "movd mm3,[esi+6] \n" "movd mm2,[esi+3] \n" "movd mm4,[esi+8] \n" #endif "psrlq mm4,8 \n" "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "movq mm6,mm3 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "punpcklwd mm3,mm4 \n" "movq mm5,mm1 \n" "punpckldq 
mm1,mm3 \n" "punpckhdq mm5,mm3 \n" "punpckhwd mm7,mm2 \n" "punpckhwd mm6,mm4 \n" "punpckldq mm7,mm6 \n" "psllw mm1,1 \n" "psllw mm5,1 \n" "psllw mm7,1 \n" "pmulhw mm1,_rymm \n" "pmulhw mm5,_gymm \n" "pmulhw mm7,_bymm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_offymm \n" #ifdef _64BITS "movq [rdi],mm1 \n" "movd mm1,[rsi+rax] \n" "movd mm3,[rsi+rax+6] \n" "movd mm2,[rsi+rax+3] \n" "movd mm4,[rsi+rax+8] \n" #else "movq [edi],mm1 \n" "movd mm1,[esi+eax] \n" "movd mm3,[esi+eax+6] \n" "movd mm2,[esi+eax+3] \n" "movd mm4,[esi+eax+8] \n" #endif "psrlq mm4,8 \n" "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "movq mm6,mm3 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "punpcklwd mm3,mm4 \n" "movq mm5,mm1 \n" "punpckldq mm1,mm3 \n" "punpckhdq mm5,mm3 \n" "punpckhwd mm7,mm2 \n" "punpckhwd mm6,mm4 \n" "punpckldq mm7,mm6 \n" "psllw mm1,1 \n" "psllw mm5,1 \n" "psllw mm7,1 \n" "pmulhw mm1,_rymm \n" "pmulhw mm5,_gymm \n" "pmulhw mm7,_bymm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_offymm \n" #ifdef _64BITS "movq [rdi+16],mm1 \n" "movd mm1,[rsi] \n" "movd mm3,[rsi+rax] \n" "movd mm2,[rsi+3] \n" "movd mm4,[rsi+rax+3] \n" #else "movq [edi+16],mm1 \n" "movd mm1,[esi] \n" "movd mm3,[esi+eax] \n" "movd mm2,[esi+3] \n" "movd mm4,[esi+eax+3] \n" #endif "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "paddw mm1,mm2 \n" "paddw mm3,mm4 \n" "paddw mm1,mm3 \n" "psrlw mm1,1 \n" #ifdef _64BITS "movd mm2,[rsi+6] \n" "movd mm4,[rsi+rax+6] \n" "movd mm3,[rsi+8] \n" "movd mm5,[rsi+rax+8] \n" #else "movd mm2,[esi+6] \n" "movd mm4,[esi+eax+6] \n" "movd mm3,[esi+8] \n" "movd mm5,[esi+eax+8] \n" #endif "psrlq mm3,8 \n" "psrlq mm5,8 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "punpcklbw mm5,mm0 \n" "paddw mm2,mm3 \n" "paddw mm4,mm5 \n" "paddw mm2,mm4 \n" "psrlw mm2,1 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "movq mm5,mm1 \n" "punpckldq mm1,mm1 \n" "punpckhdq 
mm5,mm5 \n" "punpckhwd mm7,mm2 \n" "punpckldq mm7,mm7 \n" "pmulhw mm1,_rcbcrmm \n" "pmulhw mm5,_gcbcrmm \n" "pmulhw mm7,_bcbcrmm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_rcmm \n" #ifdef _64BITS "movd [rbx],mm1 \n" "psrlq mm1,32 \n" "movd [rdx],mm1 \n" "pop rbx \n" #else "movd [ebx],mm1 \n" "psrlq mm1,32 \n" "movd [edx],mm1 \n" "pop ebx \n" #endif ".att_syntax \n" : /* no output */ : "D"(yB),"S"(rgb),"a"(width),"c"(cb),"d"(cr) : "memory","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); #endif cb+=2; cr+=2; rgb+=12; } rgb+=pitch; } } // Convert 16x16 RGB32 pixel map to (4xY 1xCb 1xCr) block void conv_block_RGB32H2V2_mmx(long width,unsigned char *rgb,short *y,short *cb,short *cr) { long i,j,y0,pitch; short *yB; // ! Due to wrong gcc stack code (does not detect push), yB is cleared during asm ! pitch = 8*width-64; for(j=0;j<8;j++) { y0 = (((j&4)<<5) + ((j&3)<<4)); for(i=0;i<4;i++) { yB = y + (y0 + (((i&2)<<5) + ((i&1)<<2))); #ifdef _WINDOWS // Visual C++ inline assembly _asm { mov esi,rgb mov edi,yB mov eax,width mov ebx,cb mov edx,cr pxor mm0,mm0 shl eax,2 // Y 1st row movd mm1,[esi] // [0000]xxB0G0R0 movd mm3,[esi+8] // [0000]xxB2G2R2 movd mm2,[esi+4] // [0000]xxB1G1R1 movd mm4,[esi+12] // [0000]xxB3G3R3 punpcklbw mm1,mm0 // xxB0G0R0 punpcklbw mm2,mm0 // xxB1G1R1 punpcklbw mm3,mm0 // xxB2G2R2 punpcklbw mm4,mm0 // xxB3G3R3 movq mm6,mm3 movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 punpcklwd mm3,mm4 // G3G2R3R2 movq mm5,mm1 punpckldq mm1,mm3 // R3R2R1R0 punpckhdq mm5,mm3 // G3G2G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckhwd mm6,mm4 // xxxxB3B2 punpckldq mm7,mm6 // B3B2B1B0 psllw mm1,1 psllw mm5,1 psllw mm7,1 pmulhw mm1,_rymm pmulhw mm5,_gymm pmulhw mm7,_bymm paddw mm1,mm5 paddw mm1,mm7 // Y3Y2Y1Y0 paddw mm1,_offymm movq [edi],mm1 // 2nd row movd mm1,[esi+eax] // [0000]xxB0G0R0 movd mm3,[esi+eax+8] // [0000]xxB2G2R2 movd mm2,[esi+eax+4] // [0000]xxB1G1R1 movd mm4,[esi+eax+12] // [0000]xxB3G3R3 punpcklbw mm1,mm0 // xxB0G0R0 punpcklbw mm2,mm0 // xxB1G1R1 
punpcklbw mm3,mm0 // xxB2G2R2 punpcklbw mm4,mm0 // xxB3G3R3 movq mm6,mm3 movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 punpcklwd mm3,mm4 // G3G2R3R2 movq mm5,mm1 punpckldq mm1,mm3 // R3R2R1R0 punpckhdq mm5,mm3 // G3G2G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckhwd mm6,mm4 // xxxxB3B2 punpckldq mm7,mm6 // B3B2B1B0 psllw mm1,1 psllw mm5,1 psllw mm7,1 pmulhw mm1,_rymm pmulhw mm5,_gymm pmulhw mm7,_bymm paddw mm1,mm5 paddw mm1,mm7 // Y3Y2Y1Y0 paddw mm1,_offymm movq [edi+16],mm1 // CbCr (2x downsampling) movd mm1,[esi] // [0000]xxB00G00R00 movd mm3,[esi+eax] // [0000]xxB01G01R01 movd mm2,[esi+4] // [0000]xxB10G10R10 movd mm4,[esi+eax+4] // [0000]xxB11G11R11 punpcklbw mm1,mm0 punpcklbw mm2,mm0 punpcklbw mm3,mm0 punpcklbw mm4,mm0 paddw mm1,mm2 // xx[B00+B10][G00+G10][R00+R10] paddw mm3,mm4 // xx[B01+B11][G01+G11][R01+R11] paddw mm1,mm3 psrlw mm1,1 // xx[B0][G0][R0] movd mm2,[esi+8] // [0000]xxB00G00R00 movd mm4,[esi+eax+8] // [0000]B01G01R01xx movd mm3,[esi+12] // [0000]xxB10G10R10 movd mm5,[esi+eax+12] // [0000]xxB11G11R11 punpcklbw mm2,mm0 punpcklbw mm3,mm0 punpcklbw mm4,mm0 punpcklbw mm5,mm0 paddw mm2,mm3 // xx[B00+B10][G00+G10][R00+R10] paddw mm4,mm5 // xx[B01+B11][G01+G11][R01+R11] paddw mm2,mm4 psrlw mm2,1 // xx[B1][G1][R1] movq mm7,mm1 punpcklwd mm1,mm2 // G1G0R1R0 movq mm5,mm1 punpckldq mm1,mm1 // R1R0R1R0 punpckhdq mm5,mm5 // G1G0G1G0 punpckhwd mm7,mm2 // xxxxB1B0 punpckldq mm7,mm7 // B1B0B1B0 pmulhw mm1,_rcbcrmm pmulhw mm5,_gcbcrmm pmulhw mm7,_bcbcrmm paddw mm1,mm5 paddw mm1,mm7 // cb1cb0cr1cr0 paddw mm1,_rcmm movd [ebx],mm1 psrlq mm1,32 movd [edx],mm1 } // end asm #else // GCC inline assembly code __asm__ ( ".intel_syntax noprefix \n" #ifdef _64BITS "push rbx \n" "mov rbx,rcx \n" "pxor mm0,mm0 \n" "shl rax,2 \n" "movd mm1,[rsi] \n" "movd mm3,[rsi+8] \n" "movd mm2,[rsi+4] \n" "movd mm4,[rsi+12] \n" #else "push ebx \n" "mov ebx,ecx \n" "pxor mm0,mm0 \n" "shl eax,2 \n" "movd mm1,[esi] \n" "movd mm3,[esi+8] \n" "movd mm2,[esi+4] \n" "movd mm4,[esi+12] \n" #endif 
"punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "movq mm6,mm3 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "punpcklwd mm3,mm4 \n" "movq mm5,mm1 \n" "punpckldq mm1,mm3 \n" "punpckhdq mm5,mm3 \n" "punpckhwd mm7,mm2 \n" "punpckhwd mm6,mm4 \n" "punpckldq mm7,mm6 \n" "psllw mm1,1 \n" "psllw mm5,1 \n" "psllw mm7,1 \n" "pmulhw mm1,_rymm \n" "pmulhw mm5,_gymm \n" "pmulhw mm7,_bymm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_offymm \n" #ifdef _64BITS "movq [rdi],mm1 \n" "movd mm1,[rsi+rax] \n" "movd mm3,[rsi+rax+8] \n" "movd mm2,[rsi+rax+4] \n" "movd mm4,[rsi+rax+12] \n" #else "movq [edi],mm1 \n" "movd mm1,[esi+eax] \n" "movd mm3,[esi+eax+8] \n" "movd mm2,[esi+eax+4] \n" "movd mm4,[esi+eax+12] \n" #endif "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "movq mm6,mm3 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "punpcklwd mm3,mm4 \n" "movq mm5,mm1 \n" "punpckldq mm1,mm3 \n" "punpckhdq mm5,mm3 \n" "punpckhwd mm7,mm2 \n" "punpckhwd mm6,mm4 \n" "punpckldq mm7,mm6 \n" "psllw mm1,1 \n" "psllw mm5,1 \n" "psllw mm7,1 \n" "pmulhw mm1,_rymm \n" "pmulhw mm5,_gymm \n" "pmulhw mm7,_bymm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_offymm \n" #ifdef _64BITS "movq [rdi+16],mm1 \n" "movd mm1,[rsi] \n" "movd mm3,[rsi+rax] \n" "movd mm2,[rsi+4] \n" "movd mm4,[rsi+rax+4] \n" #else "movq [edi+16],mm1 \n" "movd mm1,[esi] \n" "movd mm3,[esi+eax] \n" "movd mm2,[esi+4] \n" "movd mm4,[esi+eax+4] \n" #endif "punpcklbw mm1,mm0 \n" "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "paddw mm1,mm2 \n" "paddw mm3,mm4 \n" "paddw mm1,mm3 \n" "psrlw mm1,1 \n" #ifdef _64BITS "movd mm2,[rsi+8] \n" "movd mm4,[rsi+rax+8] \n" "movd mm3,[rsi+12] \n" "movd mm5,[rsi+rax+12] \n" #else "movd mm2,[esi+8] \n" "movd mm4,[esi+eax+8] \n" "movd mm3,[esi+12] \n" "movd mm5,[esi+eax+12] \n" #endif "punpcklbw mm2,mm0 \n" "punpcklbw mm3,mm0 \n" "punpcklbw mm4,mm0 \n" "punpcklbw mm5,mm0 \n" "paddw mm2,mm3 
\n" "paddw mm4,mm5 \n" "paddw mm2,mm4 \n" "psrlw mm2,1 \n" "movq mm7,mm1 \n" "punpcklwd mm1,mm2 \n" "movq mm5,mm1 \n" "punpckldq mm1,mm1 \n" "punpckhdq mm5,mm5 \n" "punpckhwd mm7,mm2 \n" "punpckldq mm7,mm7 \n" "pmulhw mm1,_rcbcrmm \n" "pmulhw mm5,_gcbcrmm \n" "pmulhw mm7,_bcbcrmm \n" "paddw mm1,mm5 \n" "paddw mm1,mm7 \n" "paddw mm1,_rcmm \n" #ifdef _64BITS "movd [rbx],mm1 \n" "psrlq mm1,32 \n" "movd [rdx],mm1 \n" "pop rbx \n" #else "movd [ebx],mm1 \n" "psrlq mm1,32 \n" "movd [edx],mm1 \n" "pop ebx \n" #endif ".att_syntax \n" : /* no output */ : "D"(yB),"S"(rgb),"a"(width),"c"(cb),"d"(cr) : "memory","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); #endif cb+=2; cr+=2; rgb+=16; } rgb+=pitch; } } #endif /* JPG_USE_ASM */
///============================================================================= // // file : jpeg_dct_mmx.cpp // // description : Simple jpeg coding/decoding library // Discrete Cosine Transform (8x8) MMX code // // project : TANGO // // author(s) : JL Pons // // Copyright (C) : 2004,2005,2006,2007,2008,2009 // European Synchrotron Radiation Facility // BP 220, Grenoble 38043 // FRANCE // // This file is part of Tango. // // Tango is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // Tango is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with Tango. If not, see <http://www.gnu.org/licenses/>. // // $Revision: 1.6 $ // // $Log: jpeg_dct_mmx.cpp,v $ // Revision 1.6 2009/04/20 14:55:58 jlpons // Added GPL header, changed memory allocation to C++ fashion. 
//
//=============================================================================
// MMX implementation has been provided by Intel at AP-922

#include "jpeg_lib.h"

#ifdef JPG_USE_ASM

// 8-byte alignment for the MMX constant tables below (movq loads 8 bytes at
// a time, so every table must sit on an 8-byte boundary).
#ifdef _WINDOWS
// Visual C++ align directive
#define ALIGN8 __declspec(align(8))
#else
// gcc align directive
#define ALIGN8 __attribute__ ((aligned (8)))
#endif

#define BITS_FRW_ACC 3 //// 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 14)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))

// MMX constants
//
// NOTE(review): every __jpmm_* symbol is referenced *by name* from inside the
// inline-asm blocks below ("movq mm1, __jpmm_tg_2_16", "lea ebx,
// __jpmm_row_tab_frw", ...). On 32-bit ELF this emits absolute relocations in
// the text section (TEXTREL / "non-PIC" warnings when building a shared
// library); passing the table addresses in as asm operands would avoid it —
// TODO confirm against the distribution's scanelf report.
// NOTE(review): identifiers beginning with a double underscore are reserved
// for the implementation by the C/C++ standards; a jpmm_ prefix would be safer.
ALIGN8 short __jpmm_one_corr[] = {1,1,1,1};
ALIGN8 long __jpmm_round_frw_row[] = {RND_FRW_ROW,RND_FRW_ROW};

ALIGN8 short __jpmm_tg_1_16[] = { 13036, 13036, 13036, 13036 }; //tg * (2<<16) + 0.5
ALIGN8 short __jpmm_tg_2_16[] = { 27146, 27146, 27146, 27146 }; //tg * (2<<16) + 0.5
ALIGN8 short __jpmm_tg_3_16[] = { -21746, -21746, -21746, -21746 }; //tg * (2<<16) + 0.5
ALIGN8 short __jpmm_cos_4_16[] = { -19195, -19195, -19195, -19195 }; //cos * (2<<16) + 0.5
ALIGN8 short __jpmm_ocos_4_16[] = { 23170, 23170, 23170, 23170 }; //cos * (2<<15) + 0.5

// Forward-DCT coefficient tables, one 64-byte table (32 shorts) per row —
// the row pass steps through this array with "add ebx,64" per iteration.
ALIGN8 short __jpmm_row_tab_frw[] = { // forward_dct coeff table
  //row0
  16384, 16384, 21407, -8867,    // w09 w01 w08 w00
  16384, 16384, 8867, -21407,    // w13 w05 w12 w04
  16384, -16384, 8867, 21407,    // w11 w03 w10 w02
  -16384, 16384, -21407, -8867,  // w15 w07 w14 w06
  22725, 12873, 19266, -22725,   // w22 w20 w18 w16
  19266, 4520, -4520, -12873,    // w23 w21 w19 w17
  12873, 4520, 4520, 19266,      // w30 w28 w26 w24
  -22725, 19266, -12873, -22725, // w31 w29 w27 w25
  //row1
  22725, 22725, 29692, -12299,   // w09 w01 w08 w00
  22725, 22725, 12299, -29692,   // w13 w05 w12 w04
  22725, -22725, 12299, 29692,   // w11 w03 w10 w02
  -22725, 22725, -29692, -12299, // w15 w07 w14 w06
  31521, 17855, 26722, -31521,   // w22 w20 w18 w16
  26722, 6270, -6270, -17855,    // w23 w21 w19 w17
  17855, 6270, 6270, 26722,      // w30 w28 w26 w24
  -31521, 26722, -17855, -31521, // w31 w29 w27 w25
  //row2
  21407, 21407, 27969, -11585,   // w09 w01 w08 w00
  21407, 21407, 11585, -27969,   // w13 w05 w12 w04
  21407, -21407, 11585, 27969,   // w11 w03 w10 w02
  -21407, 21407, -27969, -11585, // w15 w07 w14 w06
  29692, 16819, 25172, -29692,   // w22 w20 w18 w16
  25172, 5906, -5906, -16819,    // w23 w21 w19 w17
  16819, 5906, 5906, 25172,      // w30 w28 w26 w24
  -29692, 25172, -16819, -29692, // w31 w29 w27 w25
  //row3
  19266, 19266, 25172, -10426,   // w09 w01 w08 w00
  19266, 19266, 10426, -25172,   // w13 w05 w12 w04
  19266, -19266, 10426, 25172,   // w11 w03 w10 w02
  -19266, 19266, -25172, -10426, // w15 w07 w14 w06,
  26722, 15137, 22654, -26722,   // w22 w20 w18 w16
  22654, 5315, -5315, -15137,    // w23 w21 w19 w17
  15137, 5315, 5315, 22654,      // w30 w28 w26 w24
  -26722, 22654, -15137, -26722, // w31 w29 w27 w25,
  //row4
  16384, 16384, 21407, -8867,    // w09 w01 w08 w00
  16384, 16384, 8867, -21407,    // w13 w05 w12 w04
  16384, -16384, 8867, 21407,    // w11 w03 w10 w02
  -16384, 16384, -21407, -8867,  // w15 w07 w14 w06
  22725, 12873, 19266, -22725,   // w22 w20 w18 w16
  19266, 4520, -4520, -12873,    // w23 w21 w19 w17
  12873, 4520, 4520, 19266,      // w30 w28 w26 w24
  -22725, 19266, -12873, -22725, // w31 w29 w27 w25
  //row5
  19266, 19266, 25172, -10426,   // w09 w01 w08 w00
  19266, 19266, 10426, -25172,   // w13 w05 w12 w04
  19266, -19266, 10426, 25172,   // w11 w03 w10 w02
  -19266, 19266, -25172, -10426, // w15 w07 w14 w06
  26722, 15137, 22654, -26722,   // w22 w20 w18 w16
  22654, 5315, -5315, -15137,    // w23 w21 w19 w17
  15137, 5315, 5315, 22654,      // w30 w28 w26 w24
  -26722, 22654, -15137, -26722, // w31 w29 w27 w25
  //row6
  21407, 21407, 27969, -11585,   // w09 w01 w08 w00
  21407, 21407, 11585, -27969,   // w13 w05 w12 w04
  21407, -21407, 11585, 27969,   // w11 w03 w10 w02
  -21407, 21407, -27969, -11585, // w15 w07 w14 w06,
  29692, 16819, 25172, -29692,   // w22 w20 w18 w16
  25172, 5906, -5906, -16819,    // w23 w21 w19 w17
  16819, 5906, 5906, 25172,      // w30 w28 w26 w24
  -29692, 25172, -16819, -29692, // w31 w29 w27 w25,
  //row7
  22725, 22725, 29692, -12299,   // w09 w01 w08 w00
  22725, 22725, 12299, -29692,   // w13 w05 w12 w04
  22725, -22725,
  12299, 29692,                  // w11 w03 w10 w02
  -22725, 22725, -29692, -12299, // w15 w07 w14 w06,
  31521, 17855, 26722, -31521,   // w22 w20 w18 w16
  26722, 6270, -6270, -17855,    // w23 w21 w19 w17
  17855, 6270, 6270, 26722,      // w30 w28 w26 w24
  -31521, 26722, -17855, -31521  // w31 w29 w27 w25
};

//-----------------------------------------------------------------------------
// jpeg_fdct_mmx
//
// In-place forward 8x8 DCT of one block of 64 shorts: a column pass (two
// passes of 4 columns each, results written back over the block) followed by
// a row pass driven by the per-row coefficient tables in __jpmm_row_tab_frw.
// Three equivalent implementations are selected at compile time: MSVC inline
// asm (_WINDOWS), gcc 64-bit (_64BITS) and gcc 32-bit. The gcc variants are
// the same instruction stream as the annotated MSVC version, with the shift
// macros hard-coded (SHIFT_FRW_COL = 3, SHIFT_FRW_ROW = 17).
//
// NOTE(review): both gcc variants reference the __jpmm_* tables by absolute
// symbol name from inside the asm, which is the source of the non-PIC
// (TEXTREL) relocations reported against libtango on 32-bit; the constants
// would have to be passed as operands (or addressed through a register) to
// make this PIC-clean — TODO confirm with a scanelf run after any change.
//-----------------------------------------------------------------------------
void jpeg_fdct_mmx( short *block )
{
#ifdef _WINDOWS

  // Visual C++ inline assembly code
  __asm {

    // Columns
    mov eax, block
    lea ebx, __jpmm_row_tab_frw
    mov ecx, eax

    movq mm0, [eax + 1*16]   //; 0 ; x1
    movq mm1, [eax + 6*16]   //; 1 ; x6
    movq mm2, mm0            //; 2 ; x1
    movq mm3, [eax + 2*16]   //; 3 ; x2
    paddsw mm0, mm1          //; t1 = x[1] + x[6]
    movq mm4, [eax + 5*16]   //; 4 ; x5
    psllw mm0, SHIFT_FRW_COL //; t1
    movq mm5, [eax + 0*16]   //; 5 ; x0
    paddsw mm4, mm3          //; t2 = x[2] + x[5]
    paddsw mm5, [eax + 7*16] // t0 = x[0] + x[7]
    psllw mm4, SHIFT_FRW_COL // t2
    movq mm6, mm0            // 6 ; t1
    psubsw mm2, mm1          // 1 ; t6 = x[1] - x[6]
    movq mm1, __jpmm_tg_2_16 // 1 ; tg_2_16
    psubsw mm0, mm4          // tm12 = t1 - t2
    movq mm7, [eax + 3*16]   // 7 ; x3
    pmulhw mm1, mm0          // tm12*tg_2_16
    paddsw mm7, [eax + 4*16] // t3 = x[3] + x[4]
    psllw mm5, SHIFT_FRW_COL // t0
    paddsw mm6, mm4          // 4 ; tp12 = t1 + t2
    psllw mm7, SHIFT_FRW_COL // t3
    movq mm4, mm5            // 4 ; t0
    psubsw mm5, mm7          // tm03 = t0 - t3
    paddsw mm1, mm5          // y2 = tm03 + tm12*tg_2_16
    paddsw mm4, mm7          // 7 ; tp03 = t0 + t3
    por mm1, __jpmm_one_corr // correction y2 +0.5
    psllw mm2, SHIFT_FRW_COL+1 // t6
    pmulhw mm5, __jpmm_tg_2_16 // tm03*tg_2_16
    movq mm7, mm4            // 7 // tp03
    psubsw mm3, [eax + 5*16] // t5 = x[2] - x[5]
    psubsw mm4, mm6          // y4 = tp03 - tp12
    movq [ecx + 2*16], mm1   // 1 // save y2
    paddsw mm7, mm6          // 6 // y0 = tp03 + tp12
    movq mm1, [eax + 3*16]   // 1 // x3
    psllw mm3, SHIFT_FRW_COL+1 // t5
    psubsw mm1, [eax + 4*16] // t4 = x[3] - x[4]
    movq mm6, mm2            // 6 // t6
    movq [ecx + 4*16], mm4   // 4 // save y4
    paddsw mm2, mm3          // t6 + t5
    pmulhw mm2, __jpmm_ocos_4_16 // tp65 = (t6 + t5)*cos_4_16
    psubsw mm6, mm3          // 3 // t6 - t5
    pmulhw mm6, __jpmm_ocos_4_16 // tm65 = (t6 - t5)*cos_4_16
    psubsw mm5, mm0          // 0 // y6 = tm03*tg_2_16 - tm12
    por mm5, __jpmm_one_corr // correction y6 +0.5
    psllw mm1, SHIFT_FRW_COL // t4
    por mm2, __jpmm_one_corr // correction tp65 +0.5
    movq mm4, mm1            // 4 // t4
    movq mm3, [eax + 0*16]   // 3 // x0
    paddsw mm1, mm6          // tp465 = t4 + tm65
    psubsw mm3, [eax + 7*16] // t7 = x[0] - x[7]
    psubsw mm4, mm6          // 6 // tm465 = t4 - tm65
    movq mm0, __jpmm_tg_1_16 // 0 // tg_1_16
    psllw mm3, SHIFT_FRW_COL // t7
    movq mm6, __jpmm_tg_3_16 // 6 // tg_3_16
    pmulhw mm0, mm1          // tp465*tg_1_16
    movq [ecx + 0*16], mm7   // 7 // save y0
    pmulhw mm6, mm4          // tm465*tg_3_16
    movq [ecx + 6*16], mm5   // 5 // save y6
    movq mm7, mm3            // 7 // t7
    movq mm5, __jpmm_tg_3_16 // 5 // tg_3_16
    psubsw mm7, mm2          // tm765 = t7 - tp65
    paddsw mm3, mm2          // 2 // tp765 = t7 + tp65
    pmulhw mm5, mm7          // tm765*tg_3_16
    paddsw mm0, mm3          // y1 = tp765 + tp465*tg_1_16
    paddsw mm6, mm4          // tm465*tg_3_16
    pmulhw mm3, __jpmm_tg_1_16 // tp765*tg_1_16
    por mm0, __jpmm_one_corr // correction y1 +0.5
    paddsw mm5, mm7          // tm765*tg_3_16
    psubsw mm7, mm6          // 6 // y3 = tm765 - tm465*tg_3_16
    add eax, 0x08            // // increment pointer
    movq [ecx + 1*16], mm0   // 0 // save y1
    paddsw mm5, mm4          // 4 // y5 = tm765*tg_3_16 + tm465
    movq [ecx + 3*16], mm7   // 7 // save y3
    psubsw mm3, mm1          // 1 // y7 = tp765*tg_1_16 - tp465
    movq [ecx + 5*16], mm5   // 5 // save y5

    // Second half: columns 4-7 (same schedule, outputs at +8 bytes)
    movq mm0, [eax + 1*16]   // 0 // x1
    movq [ecx + 7*16], mm3   // 3 // save y7 (columns 0-4)
    movq mm1, [eax + 6*16]   // 1 // x6
    movq mm2, mm0            // 2 // x1
    movq mm3, [eax + 2*16]   // 3 // x2
    paddsw mm0, mm1          // t1 = x[1] + x[6]
    movq mm4, [eax + 5*16]   // 4 // x5
    psllw mm0, SHIFT_FRW_COL // t1
    movq mm5, [eax + 0*16]   // 5 // x0
    paddsw mm4, mm3          // t2 = x[2] + x[5]
    paddsw mm5, [eax + 7*16] // t0 = x[0] + x[7]
    psllw mm4, SHIFT_FRW_COL // t2
    movq mm6, mm0            // 6 // t1
    psubsw mm2, mm1          // 1 // t6 = x[1] - x[6]
    movq mm1, __jpmm_tg_2_16 // 1 // tg_2_16
    psubsw mm0, mm4          // tm12 = t1 - t2
    movq mm7, [eax + 3*16]   // 7 // x3
    pmulhw mm1, mm0          // tm12*tg_2_16
    paddsw mm7, [eax + 4*16] // t3 = x[3] + x[4]
    psllw mm5, SHIFT_FRW_COL // t0
    paddsw mm6, mm4          // 4 // tp12 = t1 + t2
    psllw mm7, SHIFT_FRW_COL // t3
    movq mm4, mm5            // 4 // t0
    psubsw mm5, mm7          // tm03 = t0 - t3
    paddsw mm1, mm5          // y2 = tm03 + tm12*tg_2_16
    paddsw mm4, mm7          // 7 // tp03 = t0 + t3
    por mm1, __jpmm_one_corr // correction y2 +0.5
    psllw mm2, SHIFT_FRW_COL+1 // t6
    pmulhw mm5, __jpmm_tg_2_16 // tm03*tg_2_16
    movq mm7, mm4            // 7 // tp03
    psubsw mm3, [eax + 5*16] // t5 = x[2] - x[5]
    psubsw mm4, mm6          // y4 = tp03 - tp12
    movq [ecx + 2*16+8], mm1 // 1 // save y2
    paddsw mm7, mm6          // 6 // y0 = tp03 + tp12
    movq mm1, [eax + 3*16]   // 1 // x3
    psllw mm3, SHIFT_FRW_COL+1 // t5
    psubsw mm1, [eax + 4*16] // t4 = x[3] - x[4]
    movq mm6, mm2            // 6 // t6
    movq [ecx + 4*16+8], mm4 // 4 // save y4
    paddsw mm2, mm3          // t6 + t5
    pmulhw mm2, __jpmm_ocos_4_16 // tp65 = (t6 + t5)*cos_4_16
    psubsw mm6, mm3          // 3 // t6 - t5
    pmulhw mm6, __jpmm_ocos_4_16 // tm65 = (t6 - t5)*cos_4_16
    psubsw mm5, mm0          // 0 // y6 = tm03*tg_2_16 - tm12
    por mm5, __jpmm_one_corr // correction y6 +0.5
    psllw mm1, SHIFT_FRW_COL // t4
    por mm2, __jpmm_one_corr // correction tp65 +0.5
    movq mm4, mm1            // 4 // t4
    movq mm3, [eax + 0*16]   // 3 // x0
    paddsw mm1, mm6          // tp465 = t4 + tm65
    psubsw mm3, [eax + 7*16] // t7 = x[0] - x[7]
    psubsw mm4, mm6          // 6 // tm465 = t4 - tm65
    movq mm0, __jpmm_tg_1_16 // 0 // tg_1_16
    psllw mm3, SHIFT_FRW_COL // t7
    movq mm6, __jpmm_tg_3_16 // 6 // tg_3_16
    pmulhw mm0, mm1          // tp465*tg_1_16
    movq [ecx +8], mm7       // 7 // save y0
    pmulhw mm6, mm4          // tm465*tg_3_16
    movq [ecx + 6*16+8], mm5 // 5 // save y6
    movq mm7, mm3            // 7 // t7
    movq mm5, __jpmm_tg_3_16 // 5 // tg_3_16
    psubsw mm7, mm2          // tm765 = t7 - tp65
    paddsw mm3, mm2          // 2 // tp765 = t7 + tp65
    pmulhw mm5, mm7          // tm765*tg_3_16
    paddsw mm0, mm3          // y1 = tp765 + tp465*tg_1_16
    paddsw mm6, mm4          // tm465*tg_3_16
    pmulhw mm3, __jpmm_tg_1_16 // tp765*tg_1_16
    por mm0, __jpmm_one_corr // correction y1 +0.5
    paddsw mm5, mm7          // tm765*tg_3_16
    psubsw mm7, mm6          // 6 // y3 = tm765 - tm465*tg_3_16
    movq [ecx + 1*16+8], mm0 // 0 // save y1
    paddsw mm5, mm4          // 4 // y5 = tm765*tg_3_16 + tm465
    movq [ecx + 3*16+8], mm7 // 7 // save y3
    psubsw mm3, mm1          // 1 // y7 = tp765*tg_1_16 - tp465
    movq [ecx + 5*16+8], mm5 // 5 // save y5
    movq [ecx + 7*16+8], mm3 // 3 // save y7

    // Rows -----------------------------------------------------------------
    mov eax, block
    mov edi, 0x08

lp_mmx_fdct_row1:
    movd mm5, dword ptr [eax+12]//  // mm5 = 7 6
    punpcklwd mm5, qword ptr [eax+8] // mm5 = 5 7 4 6
    movq mm2, mm5//                 // mm2 = 5 7 4 6
    psrlq mm5, 32//                 // mm5 = _ _ 5 7
    movq mm0, qword ptr [eax]//     // mm0 = 3 2 1 0
    punpcklwd mm5, mm2////             mm5 = 4 5 6 7
    movq mm1, mm0//                 // mm1 = 3 2 1 0
    paddsw mm0, mm5//               // mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0)
    psubsw mm1, mm5//               // mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4)
    movq mm2, mm0//                 // mm2 = [ xt3 xt2 xt1 xt0 ]
    punpcklwd mm0, mm1////             mm0 = [ xt5 xt1 xt4 xt0 ]
    punpckhwd mm2, mm1////             mm2 = [ xt7 xt3 xt6 xt2 ]
    movq mm1, mm2//                 // mm1

    //// shuffle bytes around
    movq mm2, mm0                   // 2 // x3 x2 x1 x0
    movq mm3, qword ptr [ebx]       // 3 // w06 w04 w02 w00
    punpcklwd mm0, mm1              // x5 x1 x4 x0
    movq mm5, mm0                   // 5 // x5 x1 x4 x0
    punpckldq mm0, mm0              // x4 x0 x4 x0 [ xt2 xt0 xt2 xt0 ]
    movq mm4, qword ptr [ebx+8]     // 4 // w07 w05 w03 w01
    punpckhwd mm2, mm1              // 1 // x7 x3 x6 x2
    pmaddwd mm3, mm0                // x4*w06+x0*w04 x4*w02+x0*w00
    movq mm6, mm2                   // 6 // x7 x3 x6 x2
    movq mm1, qword ptr [ebx+32]    // 1 // w22 w20 w18 w16
    punpckldq mm2, mm2              // x6 x2 x6 x2 [ xt3 xt1 xt3 xt1 ]
    pmaddwd mm4, mm2                // x6*w07+x2*w05 x6*w03+x2*w01
    punpckhdq mm5, mm5              // x5 x1 x5 x1 [ xt6 xt4 xt6 xt4 ]
    pmaddwd mm0, qword ptr [ebx+16] // x4*w14+x0*w12 x4*w10+x0*w08
    punpckhdq mm6, mm6              // x7 x3 x7 x3 [ xt7 xt5 xt7 xt5 ]
    movq mm7, qword ptr [ebx+40]    // 7 // w23 w21 w19 w17
    pmaddwd mm1, mm5                // x5*w22+x1*w20 x5*w18+x1*w16
    paddd mm3, __jpmm_round_frw_row // +rounder (y2,y0)
    pmaddwd mm7, mm6                // x7*w23+x3*w21 x7*w19+x3*w17
    pmaddwd mm2, qword ptr [ebx+24] // x6*w15+x2*w13 x6*w11+x2*w09
    paddd mm3, mm4                  // 4 // a1=sum(even1) a0=sum(even0) // now ( y2, y0)
    pmaddwd mm5, qword ptr [ebx+48] // x5*w30+x1*w28 x5*w26+x1*w24
    pmaddwd mm6, qword ptr [ebx+56] // x7*w31+x3*w29 x7*w27+x3*w25
    paddd mm1, mm7                  // 7 // b1=sum(odd1) b0=sum(odd0) // now ( y3, y1)
    paddd mm0, __jpmm_round_frw_row // +rounder (y6,y4)
    psrad mm3, SHIFT_FRW_ROW        // (y2, y0)
    paddd mm1, __jpmm_round_frw_row // +rounder (y3,y1)
    paddd mm0, mm2                  // 2 // a3=sum(even3) a2=sum(even2) // now (y6, y4)
    paddd mm5, __jpmm_round_frw_row // +rounder (y7,y5)
    psrad mm1, SHIFT_FRW_ROW        // y1=a1+b1 y0=a0+b0
    paddd mm5, mm6                  // 6 // b3=sum(odd3) b2=sum(odd2) // now ( y7, y5)
    psrad mm0, SHIFT_FRW_ROW        //y3=a3+b3 y2=a2+b2
    add ecx, 16//                   // increment row-output address by 1 row
    psrad mm5, SHIFT_FRW_ROW        // y4=a3-b3 y5=a2-b2
    add eax, 16//                   // increment row-address by 1 row
    packssdw mm3, mm0               // 0 // y6 y4 y2 y0
    packssdw mm1, mm5               // 3 // y7 y5 y3 y1
    movq mm6, mm3//                 // mm0 = y6 y4 y2 y0
    punpcklwd mm3, mm1//            // y3 y2 y1 y0
    sub edi, 0x01//                 // i = i - 1
    punpckhwd mm6, mm1//            // y7 y6 y5 y4
    add ebx,64//                    // increment to next table
    movq qword ptr [ecx-16], mm3    // 1 // save y3 y2 y1 y0
    movq qword ptr [ecx-8], mm6     // 7 // save y7 y6 y5 y4
    cmp edi, 0x00//
    jg lp_mmx_fdct_row1//           // begin fdct processing on next row
    emms//
  }

#else

#ifdef _64BITS

  // gcc inline assembly code (64bits)
  // Same instruction stream as the commented Visual C++ version above;
  // SHIFT_FRW_COL(=3) and SHIFT_FRW_ROW(=17) are hard-coded in the strings.
  // Columns
  __asm__ (
    ".intel_syntax noprefix \n"
    "mov rcx, rax \n"
    "movq mm0, [rax + 1*16] \n"
    "movq mm1, [rax + 6*16] \n"
    "movq mm2, mm0 \n"
    "movq mm3, [rax + 2*16] \n"
    "paddsw mm0, mm1 \n"
    "movq mm4, [rax + 5*16] \n"
    "psllw mm0, 3 \n"
    "movq mm5, [rax + 0*16] \n"
    "paddsw mm4, mm3 \n"
    "paddsw mm5, [rax + 7*16] \n"
    "psllw mm4, 3 \n"
    "movq mm6, mm0 \n"
    "psubsw mm2, mm1 \n"
    "movq mm1, __jpmm_tg_2_16 \n"
    "psubsw mm0, mm4 \n"
    "movq mm7, [rax + 3*16] \n"
    "pmulhw mm1, mm0 \n"
    "paddsw mm7, [rax + 4*16] \n"
    "psllw mm5, 3 \n"
    "paddsw mm6, mm4 \n"
    "psllw mm7, 3 \n"
    "movq mm4, mm5 \n"
    "psubsw mm5, mm7 \n"
    "paddsw mm1, mm5 \n"
    "paddsw mm4, mm7 \n"
    "por mm1, __jpmm_one_corr \n"
    "psllw mm2, 4 \n"
    "pmulhw mm5, __jpmm_tg_2_16 \n"
    "movq mm7, mm4 \n"
    "psubsw mm3, [rax + 5*16] \n"
    "psubsw mm4, mm6 \n"
    "movq [rcx + 2*16], mm1 \n"
    "paddsw mm7, mm6 \n"
    "movq mm1, [rax + 3*16] \n"
    "psllw mm3, 4 \n"
    "psubsw mm1, [rax + 4*16] \n"
    "movq mm6, mm2 \n"
    "movq [rcx + 4*16], mm4 \n"
    "paddsw mm2, mm3 \n"
    "pmulhw mm2, __jpmm_ocos_4_16 \n"
    "psubsw mm6, mm3 \n"
    "pmulhw mm6, __jpmm_ocos_4_16 \n"
    "psubsw mm5, mm0 \n"
    "por mm5, __jpmm_one_corr \n"
    "psllw mm1, 3 \n"
    "por mm2, __jpmm_one_corr \n"
    "movq mm4, mm1 \n"
    "movq mm3, [rax + 0*16] \n"
    "paddsw mm1, mm6 \n"
    "psubsw mm3, [rax + 7*16] \n"
    "psubsw mm4, mm6 \n"
    "movq mm0, __jpmm_tg_1_16 \n"
    "psllw mm3, 3 \n"
    "movq mm6, __jpmm_tg_3_16 \n"
    "pmulhw mm0, mm1 \n"
    "movq [rcx + 0*16], mm7 \n"
    "pmulhw mm6, mm4 \n"
    "movq [rcx + 6*16], mm5 \n"
    "movq mm7, mm3 \n"
    "movq mm5, __jpmm_tg_3_16 \n"
    "psubsw mm7, mm2 \n"
    "paddsw mm3, mm2 \n"
    "pmulhw mm5, mm7 \n"
    "paddsw mm0, mm3 \n"
    "paddsw mm6, mm4 \n"
    "pmulhw mm3, __jpmm_tg_1_16 \n"
    "por mm0, __jpmm_one_corr \n"
    "paddsw mm5, mm7 \n"
    "psubsw mm7, mm6 \n"
    "add rax, 0x08 \n"
    "movq [rcx + 1*16], mm0 \n"
    "paddsw mm5, mm4 \n"
    "movq [rcx + 3*16], mm7 \n"
    "psubsw mm3, mm1 \n"
    "movq [rcx + 5*16], mm5 \n"
    // second half: columns 4-7, outputs at +8 bytes
    "movq mm0, [rax + 1*16] \n"
    "movq [rcx + 7*16], mm3 \n"
    "movq mm1, [rax + 6*16] \n"
    "movq mm2, mm0 \n"
    "movq mm3, [rax + 2*16] \n"
    "paddsw mm0, mm1 \n"
    "movq mm4, [rax + 5*16] \n"
    "psllw mm0, 3 \n"
    "movq mm5, [rax + 0*16] \n"
    "paddsw mm4, mm3 \n"
    "paddsw mm5, [rax + 7*16] \n"
    "psllw mm4, 3 \n"
    "movq mm6, mm0 \n"
    "psubsw mm2, mm1 \n"
    "movq mm1, __jpmm_tg_2_16 \n"
    "psubsw mm0, mm4 \n"
    "movq mm7, [rax + 3*16] \n"
    "pmulhw mm1, mm0 \n"
    "paddsw mm7, [rax + 4*16] \n"
    "psllw mm5, 3 \n"
    "paddsw mm6, mm4 \n"
    "psllw mm7, 3 \n"
    "movq mm4, mm5 \n"
    "psubsw mm5, mm7 \n"
    "paddsw mm1, mm5 \n"
    "paddsw mm4, mm7 \n"
    "por mm1, __jpmm_one_corr \n"
    "psllw mm2, 4 \n"
    "pmulhw mm5, __jpmm_tg_2_16 \n"
    "movq mm7, mm4 \n"
    "psubsw mm3, [rax + 5*16] \n"
    "psubsw mm4, mm6 \n"
    "movq [rcx + 2*16+8], mm1 \n"
    "paddsw mm7, mm6 \n"
    "movq mm1, [rax + 3*16] \n"
    "psllw mm3, 3+1 \n"
    "psubsw mm1, [rax + 4*16] \n"
    "movq mm6, mm2 \n"
    "movq [rcx + 4*16+8], mm4 \n"
    "paddsw mm2, mm3 \n"
    "pmulhw mm2, __jpmm_ocos_4_16 \n"
    "psubsw mm6, mm3 \n"
    "pmulhw mm6, __jpmm_ocos_4_16 \n"
    "psubsw mm5, mm0 \n"
    "por mm5, __jpmm_one_corr \n"
    "psllw mm1, 3 \n"
    "por mm2, __jpmm_one_corr \n"
    "movq mm4, mm1 \n"
    "movq mm3, [rax + 0*16] \n"
    "paddsw mm1, mm6 \n"
    "psubsw mm3, [rax + 7*16] \n"
    "psubsw mm4, mm6 \n"
    "movq mm0, __jpmm_tg_1_16 \n"
    "psllw mm3, 3 \n"
    "movq mm6, __jpmm_tg_3_16 \n"
    "pmulhw mm0, mm1 \n"
    "movq [rcx +8], mm7 \n"
    "pmulhw mm6, mm4 \n"
    "movq [rcx + 6*16+8], mm5 \n"
    "movq mm7, mm3 \n"
    "movq mm5, __jpmm_tg_3_16 \n"
    "psubsw mm7, mm2 \n"
    "paddsw mm3, mm2 \n"
    "pmulhw mm5, mm7 \n"
    "paddsw mm0, mm3 \n"
    "paddsw mm6, mm4 \n"
    "pmulhw mm3, __jpmm_tg_1_16 \n"
    "por mm0, __jpmm_one_corr \n"
    "paddsw mm5, mm7 \n"
    "psubsw mm7, mm6 \n"
    "movq [rcx + 1*16+8], mm0 \n"
    "paddsw mm5, mm4 \n"
    "movq [rcx + 3*16+8], mm7 \n"
    "psubsw mm3, mm1 \n"
    "movq [rcx + 5*16+8], mm5 \n"
    "movq [rcx + 7*16+8], mm3 \n"
    ".att_syntax \n"
    : /* no output */
    : "a"(block)
    : "memory","rcx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" );

  // Rows
  // rbx is saved/restored by hand because it is not in the clobber list.
  __asm__ (
    ".intel_syntax noprefix \n"
    "push rbx \n"
    "lea rbx,__jpmm_row_tab_frw \n"
    "mov rcx, rax \n"
    "mov rdi, 0x08 \n"
    "lp_mmx_fdct_row1: \n"
    "movd mm5, [rax+12] \n"
    "punpcklwd mm5, [rax+8] \n"
    "movq mm2, mm5 \n"
    "psrlq mm5, 32 \n"
    "movq mm0, [rax] \n"
    "punpcklwd mm5, mm2 \n"
    "movq mm1, mm0 \n"
    "paddsw mm0, mm5 \n"
    "psubsw mm1, mm5 \n"
    "movq mm2, mm0 \n"
    "punpcklwd mm0, mm1 \n"
    "punpckhwd mm2, mm1 \n"
    "movq mm1, mm2 \n"
    "movq mm2, mm0 \n"
    "movq mm3, [rbx] \n"
    "punpcklwd mm0, mm1 \n"
    "movq mm5, mm0 \n"
    "punpckldq mm0, mm0 \n"
    "movq mm4, [rbx+8] \n"
    "punpckhwd mm2, mm1 \n"
    "pmaddwd mm3, mm0 \n"
    "movq mm6, mm2 \n"
    "movq mm1, [rbx+32] \n"
    "punpckldq mm2, mm2 \n"
    "pmaddwd mm4, mm2 \n"
    "punpckhdq mm5, mm5 \n"
    "pmaddwd mm0, [rbx+16] \n"
    "punpckhdq mm6, mm6 \n"
    "movq mm7, [rbx+40] \n"
    "pmaddwd mm1, mm5 \n"
    "paddd mm3, __jpmm_round_frw_row \n"
    "pmaddwd mm7, mm6 \n"
    "pmaddwd mm2, [rbx+24] \n"
    "paddd mm3, mm4 \n"
    "pmaddwd mm5, [rbx+48] \n"
    "pmaddwd mm6, [rbx+56] \n"
    "paddd mm1, mm7 \n"
    "paddd mm0, __jpmm_round_frw_row \n"
    "psrad mm3, 17 \n"
    "paddd mm1, __jpmm_round_frw_row \n"
    "paddd mm0, mm2 \n"
    "paddd mm5, __jpmm_round_frw_row \n"
    "psrad mm1, 17 \n"
    "paddd mm5, mm6 \n"
    "psrad mm0, 17 \n"
    "add rcx, 16 \n"
    "psrad mm5, 17 \n"
    "add rax, 16 \n"
    "packssdw mm3, mm0 \n"
    "packssdw mm1, mm5 \n"
    "movq mm6, mm3 \n"
    "punpcklwd mm3, mm1 \n"
    "sub rdi, 0x01 \n"
    "punpckhwd mm6, mm1 \n"
    "add rbx,64 \n"
    "movq [rcx-16], mm3 \n"
    "movq [rcx-8], mm6 \n"
    "cmp rdi, 0x00 \n"
    "jg lp_mmx_fdct_row1 \n"
    "pop rbx \n"
    "emms \n"
    ".att_syntax \n"
    : /* no output */
    : "a"(block)
    : "memory","rcx","rdi","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" );

#else

  // gcc inline assembly code (32bits)
  // Same instruction stream as the commented Visual C++ version above.
  // Columns
  __asm__ (
    ".intel_syntax noprefix \n"
    "mov ecx, eax \n"
    "movq mm0, [eax + 1*16] \n"
    "movq mm1, [eax + 6*16] \n"
    "movq mm2, mm0 \n"
    "movq mm3, [eax + 2*16] \n"
    "paddsw mm0, mm1 \n"
    "movq mm4, [eax + 5*16] \n"
    "psllw mm0, 3 \n"
    "movq mm5, [eax + 0*16] \n"
    "paddsw mm4, mm3 \n"
    "paddsw mm5, [eax + 7*16] \n"
    "psllw mm4, 3 \n"
    "movq mm6, mm0 \n"
    "psubsw mm2, mm1 \n"
    "movq mm1, __jpmm_tg_2_16 \n"
    "psubsw mm0, mm4 \n"
    "movq mm7, [eax + 3*16] \n"
    "pmulhw mm1, mm0 \n"
    "paddsw mm7, [eax + 4*16] \n"
    "psllw mm5, 3 \n"
    "paddsw mm6, mm4 \n"
    "psllw mm7, 3 \n"
    "movq mm4, mm5 \n"
    "psubsw mm5, mm7 \n"
    "paddsw mm1, mm5 \n"
    "paddsw mm4, mm7 \n"
    "por mm1, __jpmm_one_corr \n"
    "psllw mm2, 4 \n"
    "pmulhw mm5, __jpmm_tg_2_16 \n"
    "movq mm7, mm4 \n"
    "psubsw mm3, [eax + 5*16] \n"
    "psubsw mm4, mm6 \n"
    "movq [ecx + 2*16], mm1 \n"
    "paddsw mm7, mm6 \n"
    "movq mm1, [eax + 3*16] \n"
    "psllw mm3, 3+1 \n"
    "psubsw mm1, [eax + 4*16] \n"
    "movq mm6, mm2 \n"
    "movq [ecx + 4*16], mm4 \n"
    "paddsw mm2, mm3 \n"
    "pmulhw mm2, __jpmm_ocos_4_16 \n"
    "psubsw mm6, mm3 \n"
    "pmulhw mm6, __jpmm_ocos_4_16 \n"
    "psubsw mm5, mm0 \n"
    "por mm5, __jpmm_one_corr \n"
    "psllw mm1, 3 \n"
    "por mm2, __jpmm_one_corr \n"
    "movq mm4, mm1 \n"
    "movq mm3, [eax + 0*16] \n"
    "paddsw mm1, mm6 \n"
    "psubsw mm3, [eax + 7*16] \n"
    "psubsw mm4, mm6 \n"
    "movq mm0, __jpmm_tg_1_16 \n"
    "psllw mm3, 3 \n"
    "movq mm6, __jpmm_tg_3_16 \n"
    "pmulhw mm0, mm1 \n"
    "movq [ecx + 0*16], mm7 \n"
    "pmulhw mm6, mm4 \n"
    "movq [ecx + 6*16], mm5 \n"
    "movq mm7, mm3 \n"
    "movq mm5, __jpmm_tg_3_16 \n"
    "psubsw mm7, mm2 \n"
    "paddsw mm3, mm2 \n"
    "pmulhw mm5, mm7 \n"
    "paddsw mm0, mm3 \n"
    "paddsw mm6, mm4 \n"
    "pmulhw mm3, __jpmm_tg_1_16 \n"
    "por mm0, __jpmm_one_corr \n"
    "paddsw mm5, mm7 \n"
    "psubsw mm7, mm6 \n"
    "add eax, 0x08 \n"
    "movq [ecx + 1*16], mm0 \n"
    "paddsw mm5, mm4 \n"
    "movq [ecx + 3*16], mm7 \n"
    "psubsw mm3, mm1 \n"
    "movq [ecx + 5*16], mm5 \n"
    // second half: columns 4-7, outputs at +8 bytes
    "movq mm0, [eax + 1*16] \n"
    "movq [ecx + 7*16], mm3 \n"
    "movq mm1, [eax + 6*16] \n"
    "movq mm2, mm0 \n"
    "movq mm3, [eax + 2*16] \n"
    "paddsw mm0, mm1 \n"
    "movq mm4, [eax + 5*16] \n"
    "psllw mm0, 3 \n"
    "movq mm5, [eax + 0*16] \n"
    "paddsw mm4, mm3 \n"
    "paddsw mm5, [eax + 7*16] \n"
    "psllw mm4, 3 \n"
    "movq mm6, mm0 \n"
    "psubsw mm2, mm1 \n"
    "movq mm1, __jpmm_tg_2_16 \n"
    "psubsw mm0, mm4 \n"
    "movq mm7, [eax + 3*16] \n"
    "pmulhw mm1, mm0 \n"
    "paddsw mm7, [eax + 4*16] \n"
    "psllw mm5, 3 \n"
    "paddsw mm6, mm4 \n"
    "psllw mm7, 3 \n"
    "movq mm4, mm5 \n"
    "psubsw mm5, mm7 \n"
    "paddsw mm1, mm5 \n"
    "paddsw mm4, mm7 \n"
    "por mm1, __jpmm_one_corr \n"
    "psllw mm2, 4 \n"
    "pmulhw mm5, __jpmm_tg_2_16 \n"
    "movq mm7, mm4 \n"
    "psubsw mm3, [eax + 5*16] \n"
    "psubsw mm4, mm6 \n"
    "movq [ecx + 2*16+8], mm1 \n"
    "paddsw mm7, mm6 \n"
    "movq mm1, [eax + 3*16] \n"
    "psllw mm3, 3+1 \n"
    "psubsw mm1, [eax + 4*16] \n"
    "movq mm6, mm2 \n"
    "movq [ecx + 4*16+8], mm4 \n"
    "paddsw mm2, mm3 \n"
    "pmulhw mm2, __jpmm_ocos_4_16 \n"
    "psubsw mm6, mm3 \n"
    "pmulhw mm6, __jpmm_ocos_4_16 \n"
    "psubsw mm5, mm0 \n"
    "por mm5, __jpmm_one_corr \n"
    "psllw mm1, 3 \n"
    "por mm2, __jpmm_one_corr \n"
    "movq mm4, mm1 \n"
    "movq mm3, [eax + 0*16] \n"
    "paddsw mm1, mm6 \n"
    "psubsw mm3, [eax + 7*16] \n"
    "psubsw mm4, mm6 \n"
    "movq mm0, __jpmm_tg_1_16 \n"
    "psllw mm3, 3 \n"
    "movq mm6, __jpmm_tg_3_16 \n"
    "pmulhw mm0, mm1 \n"
    "movq [ecx +8], mm7 \n"
    "pmulhw mm6, mm4 \n"
    "movq [ecx + 6*16+8], mm5 \n"
    "movq mm7, mm3 \n"
    "movq mm5, __jpmm_tg_3_16 \n"
    "psubsw mm7, mm2 \n"
    "paddsw mm3, mm2 \n"
    "pmulhw mm5, mm7 \n"
    "paddsw mm0, mm3 \n"
    "paddsw mm6, mm4 \n"
    "pmulhw mm3, __jpmm_tg_1_16 \n"
    "por mm0, __jpmm_one_corr \n"
    "paddsw mm5, mm7 \n"
    "psubsw mm7, mm6 \n"
    "movq [ecx + 1*16+8], mm0 \n"
    "paddsw mm5, mm4 \n"
    "movq [ecx + 3*16+8], mm7 \n"
    "psubsw mm3, mm1 \n"
    "movq [ecx + 5*16+8], mm5 \n"
    "movq [ecx + 7*16+8], mm3 \n"
    ".att_syntax \n"
    : /* no output */
    : "a"(block)
    : "memory","ecx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" );

  // Rows
  // ebx is saved/restored by hand: on 32-bit PIC builds gcc reserves ebx as
  // the GOT pointer, so it cannot appear in the clobber list.
  __asm__ (
    ".intel_syntax noprefix \n"
    "push ebx \n"
    "lea ebx,__jpmm_row_tab_frw \n"
    "mov edi, 0x08 \n"
    "mov ecx, eax \n"
    "lp_mmx_fdct_row1: \n"
    "movd mm5, [eax+12] \n"
    "punpcklwd mm5, [eax+8] \n"
    "movq mm2, mm5 \n"
    "psrlq mm5, 32 \n"
    "movq mm0, [eax] \n"
    "punpcklwd mm5, mm2 \n"
    "movq mm1, mm0 \n"
    "paddsw mm0, mm5 \n"
    "psubsw mm1, mm5 \n"
    "movq mm2, mm0 \n"
    "punpcklwd mm0, mm1 \n"
    "punpckhwd mm2, mm1 \n"
    "movq mm1, mm2 \n"
    "movq mm2, mm0 \n"
    "movq mm3, [ebx] \n"
    "punpcklwd mm0, mm1 \n"
    "movq mm5, mm0 \n"
    "punpckldq mm0, mm0 \n"
    "movq mm4, [ebx+8] \n"
    "punpckhwd mm2, mm1 \n"
    "pmaddwd mm3, mm0 \n"
    "movq mm6, mm2 \n"
    "movq mm1, [ebx+32] \n"
    "punpckldq mm2, mm2 \n"
    "pmaddwd mm4, mm2 \n"
    "punpckhdq mm5, mm5 \n"
    "pmaddwd mm0, [ebx+16] \n"
    "punpckhdq mm6, mm6 \n"
    "movq mm7, [ebx+40] \n"
    "pmaddwd mm1, mm5 \n"
    "paddd mm3, __jpmm_round_frw_row \n"
    "pmaddwd mm7, mm6 \n"
    "pmaddwd mm2, [ebx+24] \n"
    "paddd mm3, mm4 \n"
    "pmaddwd mm5, [ebx+48] \n"
    "pmaddwd mm6, [ebx+56] \n"
    "paddd mm1, mm7 \n"
    "paddd mm0, __jpmm_round_frw_row \n"
    "psrad mm3, 17 \n"
    "paddd mm1, __jpmm_round_frw_row \n"
    "paddd mm0, mm2 \n"
    "paddd mm5, __jpmm_round_frw_row \n"
    "psrad mm1, 17 \n"
    "paddd mm5, mm6 \n"
    "psrad mm0, 17 \n"
    "add ecx, 16 \n"
    "psrad mm5, 17 \n"
    "add eax, 16 \n"
    "packssdw mm3, mm0 \n"
    "packssdw mm1, mm5 \n"
    "movq mm6, mm3 \n"
    "punpcklwd mm3, mm1 \n"
    "sub edi, 0x01 \n"
    "punpckhwd mm6, mm1 \n"
    "add ebx,64 \n"
    "movq [ecx-16], mm3 \n"
    "movq [ecx-8], mm6 \n"
    "cmp edi, 0x00 \n"
    "jg lp_mmx_fdct_row1 \n"
    "pop ebx \n"
    "emms \n"
    ".att_syntax \n"
    : /* no output */
    : "a"(block)
    : "memory","ecx","edi","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" );

#endif /* _64BITS */
#endif /* _WINDOWS */
}

// Inverse-DCT shift amounts
#define SHIFT_INV_ROW 11
#define SHIFT_INV_COL 6

// Inverse-DCT per-row coefficient tables (one 64-byte table per row; the
// array continues past this point with the remainder of row 5 and rows 6-7).
ALIGN8 short __jpmm_row_tabs[] = {
  // Table for rows 0 - constants are multiplied by cos_4_16
  16384, 16384, 16384, -16384,
  21407, 8867, 8867, -21407,
  16384, -16384, 16384, 16384,
  -8867, 21407, -21407, -8867,
  22725, 12873, 19266, -22725,
  19266, 4520, -4520, -12873,
  12873, 4520, 4520, 19266,
  -22725, 19266, -12873, -22725,
  // Table for rows 1 - constants are multiplied by cos_1_16
  22725, 22725, 22725, -22725,
  29692, 12299, 12299, -29692,
  22725, -22725, 22725, 22725,
  -12299, 29692, -29692, -12299,
  31521, 17855, 26722, -31521,
  26722, 6270, -6270, -17855,
  17855, 6270, 6270, 26722,
  -31521, 26722, -17855, -31521,
  // Table for rows 2 - constants are multiplied by cos_2_16
  21407, 21407, 21407, -21407,
  27969, 11585, 11585, -27969,
  21407, -21407, 21407, 21407,
  -11585, 27969, -27969, -11585,
  29692, 16819, 25172, -29692,
  25172, 5906, -5906, -16819,
  16819, 5906, 5906, 25172,
  -29692, 25172, -16819, -29692,
  // Table for rows 3 - constants are multiplied by cos_3_16
  19266, 19266, 19266, -19266,
  25172, 10426, 10426, -25172,
  19266, -19266, 19266, 19266,
  -10426, 25172, -25172, -10426,
  26722, 15137, 22654, -26722,
  22654, 5315, -5315, -15137,
  15137, 5315, 5315, 22654,
  -26722, 22654, -15137, -26722,
  // Table for rows 4 - constants are multiplied by cos_4_16
  16384, 16384, 16384, -16384,
  21407, 8867, 8867, -21407,
  16384, -16384, 16384, 16384,
  -8867, 21407, -21407, -8867,
  22725, 12873, 19266, -22725,
  19266, 4520, -4520, -12873,
  12873, 4520, 4520, 19266,
  -22725, 19266, -12873, -22725,
  // Table for rows 5 - constants are multiplied by cos_3_16
  19266, 19266, 19266, -19266,
  25172, 10426, 10426, -25172,
  19266, -19266, 19266, 19266,
  -10426, 25172, -25172, -10426,
  26722,
15137, 22654, -26722, 22654, 5315, -5315, -15137, 15137, 5315, 5315, 22654, -26722, 22654, -15137, -26722, // Table for rows 6 - constants are multiplied by cos_2_16 21407, 21407, 21407, -21407, 27969, 11585, 11585, -27969, 21407, -21407, 21407, 21407, -11585, 27969, -27969, -11585, 29692, 16819, 25172, -29692, 25172, 5906, -5906, -16819, 16819, 5906, 5906, 25172, -29692, 25172, -16819, -29692, // Table for rows 7 - constants are multiplied by cos_1_16 22725, 22725, 22725, -22725, 29692, 12299, 12299, -29692, 22725, -22725, 22725, 22725, -12299, 29692, -29692, -12299, 31521, 17855, 26722, -31521, 26722, 6270, -6270, -17855, 17855, 6270, 6270, 26722, -31521, 26722, -17855, -31521 }; // Rounding ALIGN8 long __jpmm_rounder[] = { 65536, 65536 , 3597, 3597 , 2260, 2260 , 1203, 1203 , 0, 0 , 120, 120 , 512, 512 , 512, 512 }; // Offset ALIGN8 short __jpmm_offset128[] = { 128,128,128,128 }; void jpeg_idct_mmx(short *block, unsigned char *dest) { short innerBuff[64]; long scratch[4]; #ifdef _WINDOWS // Visual C++ inline assembly code __asm { // Rows ------------------- mov esi, block lea edi, innerBuff lea eax, __jpmm_row_tabs lea ebx, __jpmm_rounder mov ecx, 8 __mmx_idct_rows: movq mm0, [esi] ; 0 ; x3 x2 x1 x0 movq mm1, [esi+8] ; 1 ; x7 x6 x5 x4 movq mm2, mm0 ; 2 ; x3 x2 x1 x0 movq mm3, [eax] ; 3 ; w06 w04 w02 w00 punpcklwd mm0, mm1 ; x5 x1 x4 x0 movq mm5, mm0 ; 5 ; x5 x1 x4 x0 punpckldq mm0, mm0 ; x4 x0 x4 x0 movq mm4, [eax+8] ; 4 ; w07 w05 w03 w01 punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 movq mm6, mm2 ; 6 ; x7 x3 x6 x2 movq mm1, [eax+32] ; 1 ; w22 w20 w18 w16 punpckldq mm2, mm2 ; x6 x2 x6 x2 pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 punpckhdq mm5, mm5 ; x5 x1 x5 x1 pmaddwd mm0, [eax+16] ; x4*w14+x0*w12 x4*w10+x0*w08 punpckhdq mm6, mm6 ; x7 x3 x7 x3 movq mm7, [eax+40] ; 7 ; w23 w21 w19 w17 pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 paddd mm3, [ebx] ; +rounder pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 pmaddwd mm2, 
[eax+24] ; x6*w15+x2*w13 x6*w11+x2*w09 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) pmaddwd mm5, [eax+48] ; x5*w30+x1*w28 x5*w26+x1*w24 movq mm4, mm3 ; 4 ; a1 a0 pmaddwd mm6, [eax+56] ; x7*w31+x3*w29 x7*w27+x3*w25 paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) paddd mm0, [ebx] ; +rounder psubd mm3, mm1 ; a1-b1 a0-b0 psrad mm3, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 paddd mm1, mm4 ; 4 ; a1+b1 a0+b0 paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2) psrad mm1, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2) movq mm4, mm0 ; 4 ; a3 a2 paddd mm0, mm5 ; a3+b3 a2+b2 psubd mm4, mm5 ; 5 ; a3-b3 a2-b2 psrad mm0, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 psrad mm4, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0 packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5 movq mm7, mm4 ; 7 ; y6 y7 y4 y5 psrld mm4, 16 ; 0 y6 0 y4 pslld mm7, 16 ; y7 0 y5 0 movq [edi], mm1 ; 1 ; save y3 y2 y1 y0 por mm7, mm4 ; 4 ; y7 y6 y5 y4 movq [edi+8], mm7 ; 7 ; save y7 y6 y5 y4 add esi, 16 add edi, 16 add eax, 64 add ebx, 8 dec ecx jnz __mmx_idct_rows // Columns ------------------- lea esi, innerBuff mov edi, dest lea eax, scratch mov ecx, 2 __mmx_idct_cols: movq mm0, __jpmm_tg_3_16 movq mm3, [esi+16*3] movq mm1, mm0 ; tg_3_16 movq mm5, [esi+16*5] pmulhw mm0, mm3 ; x3*(tg_3_16-1) movq mm4, __jpmm_tg_1_16 pmulhw mm1, mm5 ; x5*(tg_3_16-1) movq mm7, [esi+16*7] movq mm2, mm4 ; tg_1_16 movq mm6, [esi+16*1] pmulhw mm4, mm7 ; x7*tg_1_16 paddsw mm0, mm3 ; x3*tg_3_16 pmulhw mm2, mm6 ; x1*tg_1_16 paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 movq mm3, __jpmm_ocos_4_16 paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 movq mm5, mm4 ; tp17 movq mm6, mm2 ; tm17 paddsw mm5, mm1 ; tp17+tp35 = b0 psubsw mm6, mm0 ; tm17-tm35 = b3 psubsw mm4, mm1 ; tp17-tp35 = t1 paddsw mm2, mm0 ; tm17+tm35 = t2 movq mm7, __jpmm_tg_2_16 movq mm1, mm4 ; t1 movq [eax+0], mm5 ; save b0 paddsw mm1, mm2 ; t1+t2 movq 
[eax+8], mm6 ; save b3 psubsw mm4, mm2 ; t1-t2 movq mm5, [esi+2*16] movq mm0, mm7 ; tg_2_16 movq mm6, [esi+6*16] pmulhw mm0, mm5 ; x2*tg_2_16 pmulhw mm7, mm6 ; x6*tg_2_16 pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2 movq mm2, [esi+0*16] pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2 psubsw mm0, mm6 ; t2*tg_2_16-x6 = tm26 movq mm3, mm2 ; x0 movq mm6, [esi+4*16] paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26 paddsw mm2, mm6 ; x0+x4 = tp04 psubsw mm3, mm6 ; x0-x4 = tm04 movq mm5, mm2 ; tp04 movq mm6, mm3 ; tm04 psubsw mm2, mm7 ; tp04-tp26 = a3 paddsw mm3, mm0 ; tm04+tm26 = a1 paddsw mm1, mm1 ; b1 paddsw mm4, mm4 ; b2 paddsw mm5, mm7 ; tp04+tp26 = a0 psubsw mm6, mm0 ; tm04-tm26 = a2 movq mm7, mm3 ; a1 movq mm0, mm6 ; a2 paddsw mm3, mm1 ; a1+b1 paddsw mm6, mm4 ; a2+b2 psraw mm3, SHIFT_INV_COL ; dst1 psubsw mm7, mm1 ; a1-b1 psraw mm6, SHIFT_INV_COL ; dst2 psubsw mm0, mm4 ; a2-b2 movq mm1, [eax+0] ; load b0 psraw mm7, SHIFT_INV_COL ; dst6 movq mm4, mm5 ; a0 psraw mm0, SHIFT_INV_COL ; dst5 paddw mm3,__jpmm_offset128 packuswb mm3,mm0 movd [edi+1*8], mm3 // R1 paddsw mm5, mm1 ; a0+b0 paddw mm6,__jpmm_offset128 packuswb mm6,mm0 movd [edi+2*8], mm6 // R2 psubsw mm4, mm1 ; a0-b0 movq mm3, [eax+8] ; load b3 psraw mm5, SHIFT_INV_COL ; dst0 movq mm6, mm2 ; a3 psraw mm4, SHIFT_INV_COL ; dst7 paddw mm0,__jpmm_offset128 packuswb mm0,mm1 movd [edi+5*8], mm0 // R5 paddsw mm2, mm3 ; a3+b3 paddw mm7,__jpmm_offset128 packuswb mm7,mm0 movd [edi+6*8], mm7 // R6 psubsw mm6, mm3 ; a3-b3 paddw mm5,__jpmm_offset128 packuswb mm5,mm0 movd [edi+0*8], mm5 // R0 psraw mm2, SHIFT_INV_COL ; dst3 paddw mm4,__jpmm_offset128 packuswb mm4,mm0 movd [edi+7*8], mm4 // R7 psraw mm6, SHIFT_INV_COL ; dst4 paddw mm2,__jpmm_offset128 packuswb mm2,mm0 movd [edi+3*8], mm2 // R3 paddw mm6,__jpmm_offset128 packuswb mm6,mm0 movd [edi+4*8], mm6 // R4 add edi,4 add esi,8 dec ecx jnz __mmx_idct_cols emms } #else #ifdef _64BITS // gcc inline assembly code (64bits) // Rows __asm__ ( ".intel_syntax noprefix \n" "push rbx \n" "mov rcx, 8 
\n" "lea rax, __jpmm_row_tabs \n" "lea rbx, __jpmm_rounder \n" "__mmx_idct_rows: \n" "movq mm0, [rsi] \n" "movq mm1, [rsi+8] \n" "movq mm2, mm0 \n" "movq mm3, [rax] \n" "punpcklwd mm0, mm1 \n" "movq mm5, mm0 \n" "punpckldq mm0, mm0 \n" "movq mm4, [rax+8] \n" "punpckhwd mm2, mm1 \n" "pmaddwd mm3, mm0 \n" "movq mm6, mm2 \n" "movq mm1, [rax+32] \n" "punpckldq mm2, mm2 \n" "pmaddwd mm4, mm2 \n" "punpckhdq mm5, mm5 \n" "pmaddwd mm0, [rax+16] \n" "punpckhdq mm6, mm6 \n" "movq mm7, [rax+40] \n" "pmaddwd mm1, mm5 \n" "paddd mm3, [rbx] \n" "pmaddwd mm7, mm6 \n" "pmaddwd mm2, [rax+24] \n" "paddd mm3, mm4 \n" "pmaddwd mm5, [rax+48] \n" "movq mm4, mm3 \n" "pmaddwd mm6, [rax+56] \n" "paddd mm1, mm7 \n" "paddd mm0, [rbx] \n" "psubd mm3, mm1 \n" "psrad mm3, 11 \n" "paddd mm1, mm4 \n" "paddd mm0, mm2 \n" "psrad mm1, 11 \n" "paddd mm5, mm6 \n" "movq mm4, mm0 \n" "paddd mm0, mm5 \n" "psubd mm4, mm5 \n" "psrad mm0, 11 \n" "psrad mm4, 11 \n" "packssdw mm1, mm0 \n" "packssdw mm4, mm3 \n" "movq mm7, mm4 \n" "psrld mm4, 16 \n" "pslld mm7, 16 \n" "movq [rdi], mm1 \n" "por mm7, mm4 \n" "movq [rdi+8], mm7 \n" "add rsi, 16 \n" "add rdi, 16 \n" "add rax, 64 \n" "add rbx, 8 \n" "dec rcx \n" "jnz __mmx_idct_rows \n" "pop rbx \n" ".att_syntax \n" : /* no output */ : "D"(innerBuff),"S"(block) : "memory","rax","rcx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); // Columns __asm__ ( ".intel_syntax noprefix \n" "mov rcx, 2 \n" "__mmx_idct_cols: \n" "movq mm0, __jpmm_tg_3_16 \n" "movq mm3, [rsi+16*3] \n" "movq mm1, mm0 \n" "movq mm5, [rsi+16*5] \n" "pmulhw mm0, mm3 \n" "movq mm4, __jpmm_tg_1_16 \n" "pmulhw mm1, mm5 \n" "movq mm7, [rsi+16*7] \n" "movq mm2, mm4 \n" "movq mm6, [rsi+16*1] \n" "pmulhw mm4, mm7 \n" "paddsw mm0, mm3 \n" "pmulhw mm2, mm6 \n" "paddsw mm1, mm3 \n" "psubsw mm0, mm5 \n" "movq mm3, __jpmm_ocos_4_16 \n" "paddsw mm1, mm5 \n" "paddsw mm4, mm6 \n" "psubsw mm2, mm7 \n" "movq mm5, mm4 \n" "movq mm6, mm2 \n" "paddsw mm5, mm1 \n" "psubsw mm6, mm0 \n" "psubsw mm4, mm1 \n" "paddsw 
mm2, mm0 \n" "movq mm7, __jpmm_tg_2_16 \n" "movq mm1, mm4 \n" "movq [rax+0], mm5 \n" "paddsw mm1, mm2 \n" "movq [rax+8], mm6 \n" "psubsw mm4, mm2 \n" "movq mm5, [rsi+2*16] \n" "movq mm0, mm7 \n" "movq mm6, [rsi+6*16] \n" "pmulhw mm0, mm5 \n" "pmulhw mm7, mm6 \n" "pmulhw mm1, mm3 \n" "movq mm2, [rsi+0*16] \n" "pmulhw mm4, mm3 \n" "psubsw mm0, mm6 \n" "movq mm3, mm2 \n" "movq mm6, [rsi+4*16] \n" "paddsw mm7, mm5 \n" "paddsw mm2, mm6 \n" "psubsw mm3, mm6 \n" "movq mm5, mm2 \n" "movq mm6, mm3 \n" "psubsw mm2, mm7 \n" "paddsw mm3, mm0 \n" "paddsw mm1, mm1 \n" "paddsw mm4, mm4 \n" "paddsw mm5, mm7 \n" "psubsw mm6, mm0 \n" "movq mm7, mm3 \n" "movq mm0, mm6 \n" "paddsw mm3, mm1 \n" "paddsw mm6, mm4 \n" "psraw mm3, 6 \n" "psubsw mm7, mm1 \n" "psraw mm6, 6 \n" "psubsw mm0, mm4 \n" "movq mm1, [rax+0] \n" "psraw mm7, 6 \n" "movq mm4, mm5 \n" "psraw mm0, 6 \n" "paddw mm3,__jpmm_offset128 \n" "packuswb mm3,mm0 \n" "movd [rdi+1*8], mm3 \n" "paddsw mm5, mm1 \n" "paddw mm6,__jpmm_offset128 \n" "packuswb mm6,mm0 \n" "movd [rdi+2*8], mm6 \n" "psubsw mm4, mm1 \n" "movq mm3, [rax+8] \n" "psraw mm5, 6 \n" "movq mm6, mm2 \n" "psraw mm4, 6 \n" "paddw mm0,__jpmm_offset128 \n" "packuswb mm0,mm1 \n" "movd [rdi+5*8], mm0 \n" "paddsw mm2, mm3 \n" "paddw mm7,__jpmm_offset128 \n" "packuswb mm7,mm0 \n" "movd [rdi+6*8], mm7 \n" "psubsw mm6, mm3 \n" "paddw mm5,__jpmm_offset128 \n" "packuswb mm5,mm0 \n" "movd [rdi+0*8], mm5 \n" "psraw mm2, 6 \n" "paddw mm4,__jpmm_offset128 \n" "packuswb mm4,mm0 \n" "movd [rdi+7*8], mm4 \n" "psraw mm6, 6 \n" "paddw mm2,__jpmm_offset128 \n" "packuswb mm2,mm0 \n" "movd [rdi+3*8], mm2 \n" "paddw mm6,__jpmm_offset128 \n" "packuswb mm6,mm0 \n" "movd [rdi+4*8], mm6 \n" "add rdi,4 \n" "add rsi,8 \n" "dec rcx \n" "jnz __mmx_idct_cols \n" "emms \n" ".att_syntax \n" : /* no output */ : "S"(innerBuff),"D"(dest),"a"(scratch) : "memory","rcx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); #else // gcc inline assembly code (32bits) // Rows __asm__ ( ".intel_syntax noprefix \n" 
"push ebx \n" "mov ecx, 8 \n" "lea eax, __jpmm_row_tabs \n" "lea ebx, __jpmm_rounder \n" "__mmx_idct_rows: \n" "movq mm0, [esi] \n" "movq mm1, [esi+8] \n" "movq mm2, mm0 \n" "movq mm3, [eax] \n" "punpcklwd mm0, mm1 \n" "movq mm5, mm0 \n" "punpckldq mm0, mm0 \n" "movq mm4, [eax+8] \n" "punpckhwd mm2, mm1 \n" "pmaddwd mm3, mm0 \n" "movq mm6, mm2 \n" "movq mm1, [eax+32] \n" "punpckldq mm2, mm2 \n" "pmaddwd mm4, mm2 \n" "punpckhdq mm5, mm5 \n" "pmaddwd mm0, [eax+16] \n" "punpckhdq mm6, mm6 \n" "movq mm7, [eax+40] \n" "pmaddwd mm1, mm5 \n" "paddd mm3, [ebx] \n" "pmaddwd mm7, mm6 \n" "pmaddwd mm2, [eax+24] \n" "paddd mm3, mm4 \n" "pmaddwd mm5, [eax+48] \n" "movq mm4, mm3 \n" "pmaddwd mm6, [eax+56] \n" "paddd mm1, mm7 \n" "paddd mm0, [ebx] \n" "psubd mm3, mm1 \n" "psrad mm3, 11 \n" "paddd mm1, mm4 \n" "paddd mm0, mm2 \n" "psrad mm1, 11 \n" "paddd mm5, mm6 \n" "movq mm4, mm0 \n" "paddd mm0, mm5 \n" "psubd mm4, mm5 \n" "psrad mm0, 11 \n" "psrad mm4, 11 \n" "packssdw mm1, mm0 \n" "packssdw mm4, mm3 \n" "movq mm7, mm4 \n" "psrld mm4, 16 \n" "pslld mm7, 16 \n" "movq [edi], mm1 \n" "por mm7, mm4 \n" "movq [edi+8], mm7 \n" "add esi, 16 \n" "add edi, 16 \n" "add eax, 64 \n" "add ebx, 8 \n" "dec ecx \n" "jnz __mmx_idct_rows \n" "pop ebx \n" ".att_syntax \n" : /* no output */ : "D"(innerBuff),"S"(block) : "memory","eax","ecx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); // Columns __asm__ ( ".intel_syntax noprefix \n" "mov ecx, 2 \n" "__mmx_idct_cols: \n" "movq mm0, __jpmm_tg_3_16 \n" "movq mm3, [esi+16*3] \n" "movq mm1, mm0 \n" "movq mm5, [esi+16*5] \n" "pmulhw mm0, mm3 \n" "movq mm4, __jpmm_tg_1_16 \n" "pmulhw mm1, mm5 \n" "movq mm7, [esi+16*7] \n" "movq mm2, mm4 \n" "movq mm6, [esi+16*1] \n" "pmulhw mm4, mm7 \n" "paddsw mm0, mm3 \n" "pmulhw mm2, mm6 \n" "paddsw mm1, mm3 \n" "psubsw mm0, mm5 \n" "movq mm3, __jpmm_ocos_4_16 \n" "paddsw mm1, mm5 \n" "paddsw mm4, mm6 \n" "psubsw mm2, mm7 \n" "movq mm5, mm4 \n" "movq mm6, mm2 \n" "paddsw mm5, mm1 \n" "psubsw mm6, mm0 \n" 
"psubsw mm4, mm1 \n" "paddsw mm2, mm0 \n" "movq mm7, __jpmm_tg_2_16 \n" "movq mm1, mm4 \n" "movq [eax+0], mm5 \n" "paddsw mm1, mm2 \n" "movq [eax+8], mm6 \n" "psubsw mm4, mm2 \n" "movq mm5, [esi+2*16] \n" "movq mm0, mm7 \n" "movq mm6, [esi+6*16] \n" "pmulhw mm0, mm5 \n" "pmulhw mm7, mm6 \n" "pmulhw mm1, mm3 \n" "movq mm2, [esi+0*16] \n" "pmulhw mm4, mm3 \n" "psubsw mm0, mm6 \n" "movq mm3, mm2 \n" "movq mm6, [esi+4*16] \n" "paddsw mm7, mm5 \n" "paddsw mm2, mm6 \n" "psubsw mm3, mm6 \n" "movq mm5, mm2 \n" "movq mm6, mm3 \n" "psubsw mm2, mm7 \n" "paddsw mm3, mm0 \n" "paddsw mm1, mm1 \n" "paddsw mm4, mm4 \n" "paddsw mm5, mm7 \n" "psubsw mm6, mm0 \n" "movq mm7, mm3 \n" "movq mm0, mm6 \n" "paddsw mm3, mm1 \n" "paddsw mm6, mm4 \n" "psraw mm3, 6 \n" "psubsw mm7, mm1 \n" "psraw mm6, 6 \n" "psubsw mm0, mm4 \n" "movq mm1, [eax+0] \n" "psraw mm7, 6 \n" "movq mm4, mm5 \n" "psraw mm0, 6 \n" "paddw mm3,__jpmm_offset128 \n" "packuswb mm3,mm0 \n" "movd [edi+1*8], mm3 \n" "paddsw mm5, mm1 \n" "paddw mm6,__jpmm_offset128 \n" "packuswb mm6,mm0 \n" "movd [edi+2*8], mm6 \n" "psubsw mm4, mm1 \n" "movq mm3, [eax+8] \n" "psraw mm5, 6 \n" "movq mm6, mm2 \n" "psraw mm4, 6 \n" "paddw mm0,__jpmm_offset128 \n" "packuswb mm0,mm1 \n" "movd [edi+5*8], mm0 \n" "paddsw mm2, mm3 \n" "paddw mm7,__jpmm_offset128 \n" "packuswb mm7,mm0 \n" "movd [edi+6*8], mm7 \n" "psubsw mm6, mm3 \n" "paddw mm5,__jpmm_offset128 \n" "packuswb mm5,mm0 \n" "movd [edi+0*8], mm5 \n" "psraw mm2, 6 \n" "paddw mm4,__jpmm_offset128 \n" "packuswb mm4,mm0 \n" "movd [edi+7*8], mm4 \n" "psraw mm6, 6 \n" "paddw mm2,__jpmm_offset128 \n" "packuswb mm2,mm0 \n" "movd [edi+3*8], mm2 \n" "paddw mm6,__jpmm_offset128 \n" "packuswb mm6,mm0 \n" "movd [edi+4*8], mm6 \n" "add edi,4 \n" "add esi,8 \n" "dec ecx \n" "jnz __mmx_idct_cols \n" "emms \n" ".att_syntax \n" : /* no output */ : "S"(innerBuff),"D"(dest),"a"(scratch) : "memory","ecx","mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" ); #endif /* _64BITS */ #endif /* _WINDOWS */ } #endif /* 
JPG_USE_ASM */
Attachment:
signature.asc
Description: PGP signature