/*
 * fdct_mmx.c
 * Source code: libtheora
 */

;//==========================================================================
;//
;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
;//  PURPOSE.
;//
;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
;//
;//--------------------------------------------------------------------------

#include "theora/theora.h"
#include "codec_internal.h"
#include "dsp.h"


/*
 * Fixed-point multipliers used by Fdct_mmx.
 *
 * Every constant holds one 16-bit word repeated in all four lanes of a
 * 64-bit MMX operand.  Read as an unsigned fraction of 65536 the word is
 * approximately cos(n*pi/16) for the "Cn" in the name (equivalently
 * sin(m*pi/16) for the "Sm"); e.g. 0xB505 / 65536 ~= cos(pi/4).
 *
 * The words are fed to pmulhw, which multiplies signed 16-bit lanes.  Words
 * with the top bit set (xC1S7 .. xC5S3) therefore yield "c*x - x", and the
 * code adds x back afterwards; xC6S2 and xC7S1 are below 0x8000 and need no
 * such correction.
 */
static const ogg_int64_t xC1S7 = 0xFB15FB15FB15FB15;  /* 4 x 0xFB15 */
static const ogg_int64_t xC2S6 = 0xEC83EC83EC83EC83;  /* 4 x 0xEC83 */
static const ogg_int64_t xC3S5 = 0xD4DBD4DBD4DBD4DB;  /* 4 x 0xD4DB */
static const ogg_int64_t xC4S4 = 0xB505B505B505B505;  /* 4 x 0xB505 */
static const ogg_int64_t xC5S3 = 0x8E3A8E3A8E3A8E3A;  /* 4 x 0x8E3A */
static const ogg_int64_t xC6S2 = 0x61F861F861F861F8;  /* 4 x 0x61F8 */
static const ogg_int64_t xC7S1 = 0x31F131F131F131F1;  /* 4 x 0x31F1 */


/*
 * Transpose two independent 4x4 tiles of 16-bit values with MMX.
 *
 * Each tile is read as four 8-byte rows located at byte offsets 0, 16, 32
 * and 48 from its input pointer, and the transposed tile is written with
 * the same 16-byte row pitch to its output pointer (assuming ogg_int16_t is
 * 16 bits wide, that pitch is one row of an 8-column block).
 *
 *   InputData1 -> OutputData1   (rows labelled a, b, c, d below)
 *   InputData2 -> OutputData2   (rows labelled e, f, g, h below)
 *
 * Rows 0 and 1 of OutputData1 are used as scratch before their final values
 * are stored.  Every input row except the last row of the second tile is
 * loaded before the first store; the caller in this file relies on that to
 * pass the same pointer as input and output.
 *
 * Uses eax, ebx, ecx, edx and mm0-mm7.  No emms is issued here.
 */
static __inline void Transpose_mmx( ogg_int16_t *InputData1, ogg_int16_t *OutputData1,
                                 ogg_int16_t *InputData2, ogg_int16_t *OutputData2)
{

    __asm {
        align 16
            mov     eax, InputData1
            mov     ebx, InputData2
            mov     ecx, OutputData1
            mov     edx, OutputData2


        /* Load both tiles; rows a and b are parked in OutputData1 to free registers. */
        movq    mm0, [eax]    ; /* mm0 = a0 a1 a2 a3 */
        movq    mm4, [ebx]    ; /* mm4 = e4 e5 e6 e7 */
        movq    mm1, [16 + eax]   ; /* mm1 = b0 b1 b2 b3 */
        movq    mm5, [16 + ebx]   ; /* mm5 = f4 f5 f6 f7 */
        movq    mm2, [32 + eax]   ; /* mm2 = c0 c1 c2 c3 */
        movq    mm6, [32 + ebx]   ; /* mm6 = g4 g5 g6 g7 */
        movq    mm3, [48 + eax]   ; /* mm3 = d0 d1 d2 d3 */
        movq    [16 + ecx], mm1   ; /* save  b0 b1 b2 b3 */
        movq    mm7, [48 + ebx]   ; /* mm7 = h4 h5 h6 h7 */
          ; /* Transpose 2x8 block */
        /* From here on lanes are listed high word first, and the second
           tile's lanes are renumbered 0..3 relative to its own start. */
        movq    mm1, mm4    ; /* mm1 = e3 e2 e1 e0 */
        punpcklwd   mm4, mm5    ; /* mm4 = f1 e1 f0 e0 */
        movq    [ecx], mm0    ; /* save a3 a2 a1 a0  */
        punpckhwd   mm1, mm5    ; /* mm1 = f3 e3 f2 e2 */
        movq    mm0, mm6    ; /* mm0 = g3 g2 g1 g0 */
        punpcklwd   mm6, mm7    ; /* mm6 = h1 g1 h0 g0 */
        movq    mm5, mm4    ; /* mm5 = f1 e1 f0 e0 */
        punpckldq   mm4, mm6    ; /* mm4 = h0 g0 f0 e0 = MM4 */
        punpckhdq   mm5, mm6    ; /* mm5 = h1 g1 f1 e1 = MM5 */
        movq    mm6, mm1    ; /* mm6 = f3 e3 f2 e2 */
        movq    [edx], mm4    ; /* second tile, output row 0 */
        punpckhwd   mm0, mm7    ; /* mm0 = h3 g3 h2 g2 */
        movq    [16 + edx], mm5   ; /* second tile, output row 1 */
        punpckhdq   mm6, mm0    ; /* mm6 = h3 g3 f3 e3 = MM7 */
        movq    mm4, [ecx]    ; /* mm4 = a3 a2 a1 a0 */
        punpckldq   mm1, mm0    ; /* mm1 = h2 g2 f2 e2 = MM6 */
        movq    mm5, [16 + ecx]   ; /* mm5 = b3 b2 b1 b0 */
        movq    mm0, mm4    ; /* mm0 = a3 a2 a1 a0 */
        movq    [48 + edx], mm6   ; /* second tile, output row 3 */
        punpcklwd   mm0, mm5    ; /* mm0 = b1 a1 b0 a0 */
        movq    [32 + edx], mm1   ; /* second tile, output row 2 */
        punpckhwd   mm4, mm5    ; /* mm4 = b3 a3 b2 a2 */
        movq    mm5, mm2    ; /* mm5 = c3 c2 c1 c0 */
        punpcklwd   mm2, mm3    ; /* mm2 = d1 c1 d0 c0 */
        movq    mm1, mm0    ; /* mm1 = b1 a1 b0 a0 */
        punpckldq   mm0, mm2    ; /* mm0 = d0 c0 b0 a0 = MM0 */
        punpckhdq   mm1, mm2    ; /* mm1 = d1 c1 b1 a1 = MM1 */
        movq    mm2, mm4    ; /* mm2 = b3 a3 b2 a2 */
        movq    [ecx], mm0    ; /* first tile, output row 0 */
        punpckhwd   mm5, mm3    ; /* mm5 = d3 c3 d2 c2 */
        movq    [16 + ecx], mm1   ; /* first tile, output row 1 */
        punpckhdq   mm4, mm5    ; /* mm4 = d3 c3 b3 a3 = MM3 */
        punpckldq   mm2, mm5    ; /* mm2 = d2 c2 b2 a2 = MM2 */
        movq    [48 + ecx], mm4   ; /* first tile, output row 3 */
        movq    [32 + ecx], mm2   ; /* first tile, output row 2 */

    };


}

/*
 * One 8-point forward-DCT pass over four lanes at once (MMX), in place.
 *
 * The eight inputs ip0..ip7 are 8-byte, four-lane vectors:
 *   ip0..ip3 at InputData1 + 0, 16, 32, 48 bytes
 *   ip4..ip7 at InputData2 + 0, 16, 32, 48 bytes
 * Each result is stored back into the slot of the same index, so the
 * "ipN" labels on the stores below name the slot that receives output N.
 *
 * temp: the first 8 bytes are used as scratch for (is07 - is34).
 *
 * Constant multiplies use pmulhw (high 16 bits of a signed 16x16 product).
 * For xC1S7..xC5S3 the constant word has its top bit set, so pmulhw gives
 * "c*x - x" and x is added back.  In every case the sign bit of x
 * (psrlw x, 15) is then added as a correction for negative inputs; the
 * comments call this step "Truncated".
 *
 * Uses eax, ebx, ecx and mm0-mm7.  No emms is issued here.
 */
static __inline void Fdct_mmx( ogg_int16_t *InputData1, ogg_int16_t *InputData2, ogg_int16_t *temp)
{

    __asm {
        align 16


                mov     eax, InputData1
                mov     ebx, InputData2
                mov     ecx, temp
        movq    mm0, [eax]    ; /* mm0 = ip0 */
        movq    mm1, [16 + eax]   ; /* mm1 = ip1 */
        movq    mm2, [48 + eax]   ; /* mm2 = ip3 */
        movq    mm3, [16 + ebx]   ; /* mm3 = ip5 */
        movq    mm4, mm0    ; /* mm4 = ip0 */
        movq    mm5, mm1    ; /* mm5 = ip1 */
        movq    mm6, mm2    ; /* mm6 = ip3 */
        movq    mm7, mm3    ; /* mm7 = ip5 */
                ;
        paddsw    mm0, [48 + ebx]   ; /* mm0 = ip0 + ip7 = is07 */
        paddsw    mm1, [32 + eax]   ; /* mm1 = ip1 + ip2 = is12 */
        paddsw    mm2, [ebx]    ; /* mm2 = ip3 + ip4 = is34 */
        paddsw    mm3, [32 + ebx]   ; /* mm3 = ip5 + ip6 = is56 */
        psubsw    mm4, [48 + ebx]   ; /* mm4 = ip0 - ip7 = id07 */
        psubsw    mm5, [32 + eax]   ; /* mm5 = ip1 - ip2 = id12 */
                ;
        psubsw    mm0, mm2    ; /* mm0 = is07 - is34 */
                ;
        paddsw    mm2, mm2    ; /* mm2 = 2 * is34 */
                ;
        psubsw    mm6, [ebx]    ; /* mm6 = ip3 - ip4 = id34 */
                ;
        paddsw    mm2, mm0    ; /* mm2 = is07 + is34 = is0734 */
        psubsw    mm1, mm3    ; /* mm1 = is12 - is56 */
        movq    [ecx], mm0    ; /* Save is07 - is34 to free mm0; */
        paddsw    mm3, mm3    ; /* mm3 = 2 * is56 */
        paddsw    mm3, mm1    ; /* mm3 = is12 + is56  = is1256 */
                ;
        psubsw    mm7, [32 + ebx]   ; /* mm7 = ip5 - ip6 = id56 */
          ; /* ------------------------------------------------------------------- */
        psubsw    mm5, mm7    ; /* mm5 = id12 - id56 */
        paddsw    mm7, mm7    ; /* mm7 = 2 * id56 */
        paddsw    mm7, mm5    ; /* mm7 = id12 + id56 */
          ; /* ------------------------------------------------------------------- */
        psubsw    mm2, mm3    ; /* mm2 = is0734 - is1256 */
        paddsw    mm3, mm3    ; /* mm3 = 2 * is1256 */
                ;
        movq    mm0, mm2    ; /* make a copy */
        paddsw    mm3, mm2    ; /* mm3 = is0734 + is1256 */
                ;
        pmulhw    mm0, xC4S4    ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
        paddw   mm0, mm2    ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
        psrlw   mm2, 15   ; /* mm2 = sign bit of ( is0734 - is1256 ) */
        paddw   mm0, mm2    ; /* Truncate mm0, now it is op[4] */
                ;
        movq    mm2, mm3    ; /* mm2 = is0734 + is1256 */
        movq    [ebx], mm0    ; /* store op[4] in the ip4 slot, now mm0,mm2 are free */
                ;
        movq    mm0, mm3    ; /* mm0 = is0734 + is1256 */
        pmulhw    mm3, xC4S4    ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
                ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of ( is0734 + is1256 ) */
        paddw   mm3, mm0    ; /* mm3 = xC4S4 * ( is0734 +is1256 )  */
        paddw   mm3, mm2    ; /* Truncate mm3, now it is op[0] */
                ;
        movq    [eax], mm3    ; /* store op[0] in the ip0 slot */
          ; /* ------------------------------------------------------------------- */
        /* Below: irot_input_y = is07 - is34 (reloaded from temp),
                  irot_input_x = id12 - id56 (still in mm5). */
        movq    mm3, [ecx]    ; /* mm3 = irot_input_y */
        pmulhw    mm3, xC2S6  ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
                ;
        movq    mm2, [ecx]    ; /* mm2 = irot_input_y */
        movq    mm0, mm2    ; /* mm0 = irot_input_y */
                ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_y */
        paddw   mm3, mm0    ; /* mm3 = xC2S6 * irot_input_y */
                ;
        paddw   mm3, mm2    ; /* Truncated */
        movq    mm0, mm5    ; /* mm0 = irot_input_x */
                ;
        movq    mm2, mm5    ; /* mm2 = irot_input_x */
        pmulhw    mm0, xC6S2    ; /* mm0 = xC6S2 * irot_input_x */
                ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_x */
        paddw   mm0, mm2    ; /* Truncated */
                ;
        paddsw    mm3, mm0    ; /* ip[2] */
        movq    [32 + eax], mm3   ; /* Save ip2 */
                ;
        movq    mm0, mm5    ; /* mm0 = irot_input_x */
        movq    mm2, mm5    ; /* mm2 = irot_input_x */
                ;
        pmulhw    mm5, xC2S6    ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_x */
                ;
        movq    mm3, [ecx]    ; /* mm3 = irot_input_y */
        paddw   mm5, mm0    ; /* mm5 = xC2S6 * irot_input_x */
                ;
        paddw   mm5, mm2    ; /* Truncated */
        movq    mm2, mm3    ; /* mm2 = irot_input_y */
                ;
        pmulhw    mm3, xC6S2    ; /* mm3 = xC6S2 * irot_input_y */
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_y */
                ;
        paddw   mm3, mm2    ; /* Truncated */
        psubsw    mm3, mm5    ; /* mm3 = xC6S2 * irot_input_y - xC2S6 * irot_input_x */
                ;
        movq    [32 + ebx], mm3   ; /* store in the ip6 slot */
          ; /* ------------------------------------------------------------------- */
        movq    mm0, xC4S4    ;
        movq    mm2, mm1    ; /* mm2 = is12 - is56 */
        movq    mm3, mm1    ; /* mm3 = is12 - is56 */
                ;
        pmulhw    mm1, mm0    ; /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
        psrlw   mm2, 15   ; /* mm2 = sign bit of ( is12 - is56 ) */
                ;
        paddw   mm1, mm3    ; /* mm1 = xC4S4 * ( is12 - is56 ) */
        paddw   mm1, mm2    ; /* Truncate mm1, now it is icommon_product1 */
                ;
        movq    mm2, mm7    ; /* mm2 = id12 + id56 */
        movq    mm3, mm7    ; /* mm3 = id12 + id56 */
                ;
        pmulhw    mm7, mm0    ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
        psrlw   mm2, 15   ; /* mm2 = sign bit of ( id12 + id56 ) */
                ;
        paddw   mm7, mm3    ; /* mm7 = xC4S4 * ( id12 + id56 ) */
        paddw   mm7, mm2    ; /* Truncate mm7, now it is icommon_product2 */
          ; /* ------------------------------------------------------------------- */
        pxor    mm0, mm0    ; /* Clear mm0 */
        psubsw    mm0, mm6    ; /* mm0 = - id34 */
                ;
        psubsw    mm0, mm7    ; /* mm0 = - ( id34 + icommon_product2 ) */
        paddsw    mm6, mm6    ; /* mm6 = 2 * id34 */
        paddsw    mm6, mm0    ; /* mm6 = id34 - icommon_product2 */
                ;
        psubsw    mm4, mm1    ; /* mm4 = id07 - icommon_product1 */
        paddsw    mm1, mm1    ; /* mm1 = 2 * icommon_product1 */
        paddsw    mm1, mm4    ; /* mm1 = id07 + icommon_product1 */
          ; /* ------------------------------------------------------------------- */
        /* Below: irot_input_x = id07 + icommon_product1 (mm1),
                  irot_input_y = - ( id34 + icommon_product2 ) (mm0). */
        movq    mm7, xC1S7    ;
        movq    mm2, mm1    ; /* mm2 = irot_input_x */
                ;
        movq    mm3, mm1    ; /* mm3 = irot_input_x */
        pmulhw    mm1, mm7    ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
                ;
        movq    mm7, xC7S1    ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_x */
                ;
        paddw   mm1, mm3    ; /* mm1 = xC1S7 * irot_input_x */
        paddw   mm1, mm2    ; /* Truncated */
                ;
        pmulhw    mm3, mm7    ; /* mm3 = xC7S1 * irot_input_x */
        paddw   mm3, mm2    ; /* Truncated */
                ;
        movq    mm5, mm0    ; /* mm5 = irot_input_y */
        movq    mm2, mm0    ; /* mm2 = irot_input_y */
                ;
        movq    mm7, xC1S7    ;
        pmulhw    mm0, mm7    ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
                ;
        movq    mm7, xC7S1    ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_y */
                ;
        paddw   mm0, mm5    ; /* mm0 = xC1S7 * irot_input_y */
        paddw   mm0, mm2    ; /* Truncated */
                ;
        pmulhw    mm5, mm7    ; /* mm5 = xC7S1 * irot_input_y */
        paddw   mm5, mm2    ; /* Truncated */
                ;
        psubsw    mm1, mm5    ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
        paddsw    mm3, mm0    ; /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */
                ;
        movq    [16 + eax], mm1   ; /* store in the ip1 slot */
        movq    [48 + ebx], mm3   ; /* store in the ip7 slot */
          ; /* ------------------------------------------------------------------- */
        /* Below: irot_input_x = id07 - icommon_product1 (mm4),
                  irot_input_y = id34 - icommon_product2 (mm6). */
        movq    mm0, xC3S5    ;
        movq    mm1, xC5S3    ;
                ;
        movq    mm5, mm6    ; /* mm5 = irot_input_y */
        movq    mm7, mm6    ; /* mm7 = irot_input_y */
                ;
        movq    mm2, mm4    ; /* mm2 = irot_input_x */
        movq    mm3, mm4    ; /* mm3 = irot_input_x */
                ;
        pmulhw    mm4, mm0    ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
        pmulhw    mm6, mm1    ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
                ;
        psrlw   mm2, 15   ; /* mm2 = sign bit of irot_input_x */
        psrlw   mm5, 15   ; /* mm5 = sign bit of irot_input_y */
                ;
        paddw   mm4, mm3    ; /* mm4 = xC3S5 * irot_input_x */
        paddw   mm6, mm7    ; /* mm6 = xC5S3 * irot_input_y */
                ;
        paddw   mm4, mm2    ; /* Truncated */
        paddw   mm6, mm5    ; /* Truncated */
                ;
        psubsw    mm4, mm6    ; /* ip3 */
        movq    [48 + eax], mm4   ; /* store in the ip3 slot */
                ;
        movq    mm4, mm3    ; /* mm4 = irot_input_x */
        movq    mm6, mm7    ; /* mm6 = irot_input_y */
                ;
        pmulhw    mm3, mm1    ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
        pmulhw    mm7, mm0    ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
                ;
        paddw   mm4, mm2    ; /* mm4 = irot_input_x + its sign bit */
        paddw   mm6, mm5    ; /* mm6 = irot_input_y + its sign bit */
                ;
        paddw   mm3, mm4    ; /* mm3 = xC5S3 * irot_input_x */
        paddw   mm7, mm6    ; /* mm7 = xC3S5 * irot_input_y */
                ;
        paddw   mm3, mm7    ; /* ip5 */
        movq    [16 + ebx], mm3   ; /* store in the ip5 slot */

};

}


/*
 * MMX forward DCT of one 8x8 block of 16-bit values.
 *
 * InputData  - 64 source values (8 rows of 8); read only by the first two
 *              transpose steps.
 * OutputData - 64 values that receive the result; also used as the working
 *              buffer for every Fdct_mmx pass.
 *
 * The work is four transpose + Fdct_mmx pairs: the first two process the
 * upper and lower four rows of InputData into OutputData, the last two then
 * process the left and right four columns of OutputData in place.
 * emms is executed before returning so the caller may use x87 code again.
 */
static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
{
  /* Scratch area for Fdct_mmx.  It is deliberately an automatic variable:
     Fdct_mmx stores an intermediate value here and reloads it several times,
     so a buffer shared between calls (e.g. a static one) would let two
     threads corrupt each other's transform. */
  ogg_int16_t tmp[32];

  /* Advance to the next 16-byte boundary (0..15 bytes forward).  Only the
     low four address bits matter, so an unsigned integer cast is enough. */
  ogg_int16_t *align_tmp =
    (ogg_int16_t *)((unsigned char *)tmp + ((16 - (unsigned long)tmp) & 15));

  /* Rows 0..3 of the source. */
  Transpose_mmx(InputData, OutputData, InputData + 4, OutputData + 4);
  Fdct_mmx(OutputData, OutputData + 4, align_tmp);

  /* Rows 4..7 of the source. */
  Transpose_mmx(InputData + 32, OutputData + 32, InputData + 36, OutputData + 36);
  Fdct_mmx(OutputData+32, OutputData + 36, align_tmp);

  /* Columns 0..3 of the intermediate result, in place. */
  Transpose_mmx(OutputData, OutputData, OutputData + 32, OutputData + 32);
  Fdct_mmx(OutputData, OutputData + 32, align_tmp);

  /* Columns 4..7 of the intermediate result, in place. */
  Transpose_mmx(OutputData + 4, OutputData + 4, OutputData + 36, OutputData + 36);
  Fdct_mmx(OutputData + 4, OutputData + 36, align_tmp);

  /* Leave MMX state so later floating-point code works. */
  __asm     emms

}

/*
 * Install the MMX forward DCT into a DSP function table.
 *
 * funcs - table whose fdct_short entry is overwritten; other entries are
 *         left untouched.
 */
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
  /* Route fdct_short calls to the MMX implementation in this file. */
  funcs->fdct_short = &fdct_short__mmx;
}

/* Generated by Doxygen 1.6.0 */