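/* recon_mmx.c -- part of the libtheora source package.
   (Text recovered from a web source-browser listing; page navigation
   residue has been folded into this comment so the file remains valid C.) */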

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $

 ********************************************************************/

#include "codec_internal.h"


/* 0x80 in each of the eight bytes.  recon_intra8x8__mmx loads this into mm0
   and XORs it into packed signed bytes, which flips the top bit of every
   byte (equivalent to adding 128 modulo 256). */
static const unsigned __int64 V128 = 0x8080808080808080LL;

/* Copy an 8x8 block of bytes from src to dest using MMX 64-bit moves.
 *
 *   src    - address of the first source row; 8 bytes are read per row
 *   dest   - address of the first destination row; 8 bytes are written per row
 *   stride - byte distance between consecutive rows; the same value is
 *            applied to both src and dest
 *
 * The block is handled as two groups of four rows: four rows are loaded into
 * mm0..mm3, then stored, then both pointers advance by 4 * stride.
 *
 * The asm block modifies eax, ebx, ecx, edi and mm0..mm3, and does not
 * execute emms.
 * NOTE(review): presumably the caller clears MMX state before any x87 code
 * runs, and the compiler saves/restores ebx/edi around the __asm block --
 * confirm against the toolchain documentation and the callers.
 */
static void copy8x8__mmx (unsigned char *src,
                      unsigned char *dest,
                      unsigned int stride)
{

    //Is this even the fastest way to do this?
    __asm {
        align 16        

        /* eax = source row pointer, ebx = destination row pointer,
           ecx = row stride in bytes */
        mov         eax, src
        mov         ebx, dest
        mov         ecx, stride

        /* edi = 3 * stride, so rows 0..3 are reachable as
           [p], [p + ecx], [p + ecx*2], [p + edi] */
        lea           edi, [ecx + ecx * 2]
        /* rows 0..3: load all four source rows ... */
        movq            mm0, [eax]
        movq            mm1, [eax + ecx]
        movq            mm2, [eax + ecx * 2]
        movq            mm3, [eax + edi]
        /* ... step the source pointer down four rows ... */
        lea           eax, [eax + ecx * 4]
        /* ... and store the four rows to the destination */
        movq            [ebx], mm0
        movq            [ebx + ecx], mm1
        movq            [ebx + ecx * 2], mm2
        movq            [ebx + edi], mm3
        /* step the destination pointer down four rows */
        lea           ebx, [ebx + ecx * 4]
        /* rows 4..7: same load/store pattern; the pointers are not
           advanced again because nothing follows */
        movq            mm0, [eax]
        movq            mm1, [eax + ecx]
        movq            mm2, [eax + ecx * 2]
        movq            mm3, [eax + edi]
        movq            [ebx], mm0
        movq            [ebx + ecx], mm1
        movq            [ebx + ecx * 2], mm2
        movq            [ebx + edi], mm3

    };

}

/* Write an 8x8 block of bytes to ReconPtr computed from 16-bit values at
 * ChangePtr: each value is saturated to a signed byte and then biased by 128.
 *
 *   ReconPtr  - address of the first output row; 8 bytes are written per row
 *   ChangePtr - input words; the loop reads 128 bytes in total, 16 bytes
 *               (eight 16-bit words) per row
 *   LineStep  - byte distance between consecutive output rows
 *
 * The loop runs until the input pointer reaches ChangePtr + 128, i.e. eight
 * iterations of 16 bytes each.
 *
 * The asm block modifies eax, ebx, ecx, edi, mm0 and mm2, and does not
 * execute emms.
 * NOTE(review): presumably the caller clears MMX state before any x87 code
 * runs -- confirm.
 */
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
                  ogg_uint32_t LineStep)
{

    __asm {
        align 16

        /* eax = output row pointer, ebx = input word pointer,
           ecx = output row stride in bytes */
        mov         eax, ReconPtr
        mov         ebx, ChangePtr
        mov         ecx, LineStep

        /* mm0 = 0x80 in every byte (see V128) */
        movq            mm0, V128

        /* edi = end-of-input sentinel: 128 bytes past ChangePtr */
        lea           edi, [128 + ebx]
    loop_start:   
        /* mm2 = first four words of the current row */
        movq            mm2, [ebx]

        /* pack those four words plus the next four (at [ebx + 8]) into
           eight bytes with signed saturation */
        packsswb  mm2, [8 + ebx]
        /* NOTE(review): OR-ing mm0 with itself leaves mm0 unchanged; this
           looks like a leftover/scheduling filler -- confirm before removing */
        por           mm0, mm0
        /* flip the top bit of each byte: maps signed -128..127 onto
           unsigned 0..255 (same as adding 128) */
        pxor            mm2, mm0
        /* advance the input by one row (16 bytes) and compare against the
           sentinel; carry is set while ebx is below edi (unsigned) */
        lea           ebx, [16 + ebx]
        cmp           ebx, edi

        /* store the eight result bytes; movq and lea do not touch EFLAGS,
           so the carry produced by cmp above is still intact at the jc */
        movq            [eax], mm2



        lea           eax, [eax + ecx]
        jc            loop_start


    };
    
}





/* Write an 8x8 block of bytes to ReconPtr formed by adding the 16-bit values
 * at ChangePtr to the bytes at RefPtr, clamping each result to 0..255.
 *
 *   ReconPtr  - address of the first output row; 8 bytes are written per row
 *   RefPtr    - address of the first reference row; 8 bytes are read per row
 *   ChangePtr - input words; the loop reads 128 bytes in total, 16 bytes
 *               (eight 16-bit words) per row
 *   LineStep  - byte distance between consecutive rows; the same value is
 *               applied to both ReconPtr and RefPtr
 *
 * The loop runs until the input pointer reaches ChangePtr + 128, i.e. eight
 * iterations of 16 bytes each.
 *
 * The asm block modifies eax, ebx, ecx, edx, edi, mm0 and mm2..mm5, and does
 * not execute emms.
 * NOTE(review): presumably the caller clears MMX state before any x87 code
 * runs -- confirm.
 */
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
                  ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{

    __asm {

        align 16

        /* eax = output row pointer, ebx = input word pointer,
           ecx = row stride in bytes, edx = reference row pointer */
        mov         eax, ReconPtr
        mov         ebx, ChangePtr
        mov         ecx, LineStep
        mov         edx, RefPtr
    
        /* mm0 = 0, used to zero-extend bytes to words when unpacking */
        pxor            mm0, mm0
        /* edi = end-of-input sentinel: 128 bytes past ChangePtr */
        lea           edi, [128 + ebx]

    loop_start:
        /* mm2 = eight reference bytes of the current row */
        movq            mm2, [edx]

        /* mm4/mm5 = low/high four change words; mm3 = copy of the
           reference bytes for the high half */
        movq            mm4, [ebx]
        movq            mm3, mm2
        movq            mm5, [8 + ebx]
        /* low four reference bytes -> words, then add the change words
           with signed saturation */
        punpcklbw mm2, mm0
        paddsw          mm2, mm4
        /* high four reference bytes -> words, then add likewise */
        punpckhbw mm3, mm0
        paddsw          mm3, mm5
        /* advance the reference pointer to the next row */
        add           edx, ecx
        /* pack the eight words back to bytes with unsigned saturation
           (values below 0 become 0, above 255 become 255) */
        packuswb  mm2, mm3
        /* advance the input by one row (16 bytes) and compare against the
           sentinel; carry is set while ebx is below edi (unsigned) */
        lea           ebx, [16 + ebx]
        cmp           ebx, edi

        /* store the row; movq and lea do not touch EFLAGS, so the carry
           produced by cmp above is still intact at the jc */
        movq            [eax], mm2

        lea           eax, [eax + ecx]
        jc            loop_start

    };
}




/* Write an 8x8 block of bytes to ReconPtr formed by averaging the bytes at
 * RefPtr1 and RefPtr2, adding the 16-bit values at ChangePtr, and clamping
 * each result to 0..255.
 *
 *   ReconPtr  - address of the first output row; 8 bytes are written per row
 *   RefPtr1   - first reference block; 8 bytes are read per row
 *   RefPtr2   - second reference block; 8 bytes are read per row
 *   ChangePtr - input words; the loop reads 128 bytes in total, 16 bytes
 *               (eight 16-bit words) per row
 *   LineStep  - byte distance between consecutive rows; the same value is
 *               applied to ReconPtr, RefPtr1 and RefPtr2
 *
 * The average is (a + b) >> 1, i.e. it truncates rather than rounds.  The
 * change words are added with wrapping paddw (not the saturating paddsw that
 * recon_inter8x8__mmx uses) before the final unsigned-saturating pack.
 *
 * The asm block modifies eax, ebx, ecx, edx, edi, mm0 and mm2..mm7, and does
 * not execute emms.
 * NOTE(review): presumably the caller clears MMX state before any x87 code
 * runs -- confirm.
 */
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
                       unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
                     ogg_uint32_t LineStep)
{
    __asm {
        align 16

        /* eax = output row pointer, ebx = input word pointer,
           ecx/edx = the two reference row pointers.  LineStep is not held
           in a register; the add instructions below use it as a memory
           operand. */
        mov     eax, ReconPtr
        mov     ebx, ChangePtr
        mov     ecx, RefPtr1
        mov     edx, RefPtr2
                
        /* mm0 = 0, used to zero-extend bytes to words when unpacking */
        pxor            mm0, mm0
        /* edi = end-of-input sentinel: 128 bytes past ChangePtr */
        lea       edi, [128 + ebx]

    loop_start:
        /* mm2 = eight bytes from reference 1, mm4 = eight from reference 2 */
        movq            mm2, [ecx]
        movq            mm4, [edx]

        /* widen both references to words: mm2/mm3 = low/high halves of
           reference 1, mm4/mm5 = low/high halves of reference 2.
           mm6/mm7 = low/high four change words, loaded in between. */
        movq            mm3, mm2
        punpcklbw       mm2, mm0
        movq            mm5, mm4
        movq            mm6, [ebx]
        punpckhbw       mm3, mm0
        movq            mm7, [8 + ebx]
        punpcklbw       mm4, mm0
        punpckhbw       mm5, mm0
        /* sum the two references (each word is at most 510) ... */
        paddw           mm2, mm4
        paddw           mm3, mm5
        /* ... and halve with a logical shift: truncating average */
        psrlw           mm2, 1
        psrlw           mm3, 1
        /* add the change words (wrapping 16-bit add) */
        paddw           mm2, mm6
        paddw           mm3, mm7
        /* advance the input by one row (16 bytes) */
        lea       ebx, [16 + ebx]
        /* pack the eight words back to bytes with unsigned saturation */
        packuswb        mm2, mm3
        /* advance both reference pointers to the next row */
        add       ecx, LineStep
        add       edx, LineStep
        /* store the row and advance the output pointer */
        movq            [eax], mm2
        add       eax, LineStep
        /* loop while the input pointer is below the sentinel (unsigned);
           the cmp is placed after the adds because add overwrites EFLAGS */
        cmp       ebx, edi
        jc        loop_start

    };

}




/* Point the reconstruction and copy entries of the function table at the
   MMX implementations defined in this file.  Emits a debug message first. */
void dsp_mmx_recon_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");

  /* 8x8 reconstruction entry points */
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
  funcs->recon_inter8x8      = recon_inter8x8__mmx;
  funcs->recon_intra8x8      = recon_intra8x8__mmx;

  /* 8x8 block copy */
  funcs->copy8x8             = copy8x8__mmx;
}


/* Listing generated by Doxygen 1.6.0. */