/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "memory.h"
#include "preproc.h"
#include "pragmas.h"

/****************************************************************************
*  Macros
****************************************************************************/
/* Number of per-frame slots kept for every pixel group in the filter's
 * frame buffer (the filters below iterate over exactly this many slots). */
#define FRAMECOUNT 7

/* Round X up to the next multiple of 32, returning an unsigned long.
 * The argument is parenthesised so that expressions with low-precedence
 * operators (e.g. ROUNDUP32(a | b)) are evaluated as a whole before the
 * cast, and the mask is built from ~31UL so that it has the full width of
 * unsigned long instead of being fixed at 32 bits. */
#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 ) & ~31UL )

/****************************************************************************
*  Imports
****************************************************************************/
/* Defined in another translation unit and not called in the part of the
 * file shown here. Judging by the parameter names it presumably reports,
 * through the three int pointers, whether MMX / XMM / WMT instruction
 * support is available -- TODO confirm against the definition. */
extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);

/****************************************************************************
*  Exported Global Variables
****************************************************************************/
/* Pointer to a temporal-filter routine. Its signature matches
 * temp_filter_wmt and temp_filter_mmx defined below. It is not assigned in
 * the part of the file shown here; presumably it is pointed at one of those
 * routines during initialisation -- TODO confirm against the init code. */
void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);

/****************************************************************************
 *
 *  ROUTINE       : temp_filter_wmt
 *
 *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
 *                  unsigned char *s     : Pointer to source frame.
 *                  unsigned char *d     : Pointer to destination frame.
 *                  int bytes            : Number of bytes to filter.
 *                  int strength         : Strength of filter to apply.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs a closeness-adjusted temporal blur
 *
 *  SPECIAL NOTES : Destination frame can be same as source frame.
 *
 ****************************************************************************/
void temp_filter_wmt
(
    pre_proc_instance *ppi,
    unsigned char *s,
    unsigned char *d,
    int bytes,
    int strength
)
{
    /*
     * SSE2 variant: works on groups of 8 pixels.
     *
     * Frame buffer layout used here: for every group of 8 pixels the buffer
     * holds FRAMECOUNT consecutive 8-byte slots, one per remembered frame.
     *
     * On the first call (ppi->frame == 0) every slot is seeded with the
     * source pixels and the source is copied to the destination unchanged.
     * On later calls the slot (ppi->frame % FRAMECOUNT) is overwritten with
     * the current source pixels and the output is a weighted average over
     * all slots, where each slot's per-pixel weight is
     *     16 - 3 * ((slot - source)^2 >> strength), saturated at 0.
     *
     * The loops advance 8 pixels at a time, so a trailing partial group is
     * still processed as a full 8 pixels; callers are expected to pass a
     * byte count that is a positive multiple of 8.
     */
    int byte = 0;
    unsigned char *frameptr = ppi->frame_buffer;

    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};

    /* Nothing to filter: the do/while loops below would otherwise still
     * process one full group and touch memory past the (empty) buffers.
     * The frame counter is left alone so that the next real call still
     * sees the same ppi->frame value. */
    if (bytes <= 0)
        return;

    if (ppi->frame == 0)
    {
        /* First frame: seed every history slot with the source pixels. */
        do
        {
            int i;
            int frame = 0;

            do
            {
                for (i = 0; i < 8; i++)
                {
                    *frameptr = s[byte+i];
                    ++frameptr;
                }

                ++frame;
            }
            while (frame < FRAMECOUNT);

            /* No history yet, so the output is just the input. */
            for (i = 0; i < 8; i++)
                d[byte+i] = s[byte+i];

            byte += 8;

        }
        while (byte < bytes);
    }
    else
    {
        int i;
        int offset2 = (ppi->frame % FRAMECOUNT);    /* slot replaced by the current frame */

        do
        {
            __declspec(align(16)) unsigned short counts[8];    /* per-pixel sum of weights */
            __declspec(align(16)) unsigned short sums[8];      /* per-pixel weighted sum + rounding */
            __asm
            {
                mov         eax, offset2
                mov         edi, s                  // source pixels
                pxor        xmm1, xmm1              // accumulator

                pxor        xmm7, xmm7              // zero, used for byte -> word unpack

                mov         esi, frameptr           // history slots for this pixel group
                pxor        xmm2, xmm2              // count

                movq        xmm3, QWORD PTR [edi]

                movq        QWORD PTR [esi+8*eax], xmm3 // store current pixels into slot offset2

                punpcklbw   xmm3, xmm2              // xmm3 source pixels as words (xmm2 is zero here)
                mov         ecx,  FRAMECOUNT

                next_frame:
                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
                movdqa      xmm6, xmm4              // save the pixel values
                psubsw      xmm4, xmm3              // subtracted pixel values
                pmullw      xmm4, xmm4              // square xmm4
                movd        xmm5, strength
                psrlw       xmm4, xmm5              // squared difference >> strength
                pmullw      xmm4, threes            // 3 * modifier
                movdqa      xmm5, sixteens          // 16s
                psubusw     xmm5, xmm4              // 16 - modifiers (saturates at 0)
                movdqa      xmm4, xmm5              // save the modifiers
                pmullw      xmm4, xmm6              // multiplier values
                paddusw     xmm1, xmm4              // accumulator
                paddusw     xmm2, xmm5              // count
                add         esi, 8                  // next frame slot
                dec         ecx                     // one fewer slot left for this group
                jnz         next_frame

                movdqa      counts, xmm2
                psrlw       xmm2, 1                 // divide count by 2 for rounding
                paddusw     xmm1, xmm2              // rounding added in

                mov         frameptr, esi           // now points at the next group's slots

                movdqa      sums, xmm1
            }

            /* Divide each weighted sum by its weight total using the
             * ppi->fixed_divide table followed by a 16-bit right shift
             * (presumably a 16.16 reciprocal table -- TODO confirm). */
            for (i = 0; i < 8; i++)
            {
                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
                blurvalue >>= 16;
                d[i] = blurvalue;
            }

            s += 8;
            d += 8;
            byte += 8;
        }
        while (byte < bytes);
    }

    ++ppi->frame;
    /* Kept from the original; the assembly above uses only XMM registers. */
    __asm emms
}

/****************************************************************************
 *
 *  ROUTINE       : temp_filter_mmx
 *
 *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
 *                  unsigned char *s     : Pointer to source frame.
 *                  unsigned char *d     : Pointer to destination frame.
 *                  int bytes            : Number of bytes to filter.
 *                  int strength         : Strength of filter to apply.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Performs a closeness-adjusted temporal blur
 *
 *  SPECIAL NOTES : Destination frame can be same as source frame.
 *
 ****************************************************************************/
void temp_filter_mmx
(
    pre_proc_instance *ppi,
    unsigned char *s,
    unsigned char *d,
    int bytes,
    int strength
)
{
    /*
     * MMX variant: works on groups of 4 pixels.
     *
     * Frame buffer layout used here: for every group of 4 pixels the buffer
     * holds FRAMECOUNT consecutive 4-byte slots, one per remembered frame.
     * (This differs from the 8-byte slots used by temp_filter_wmt.)
     *
     * On the first call (ppi->frame == 0) every slot is seeded with the
     * source pixels and the source is copied to the destination unchanged.
     * On later calls the slot (ppi->frame % FRAMECOUNT) is overwritten with
     * the current source pixels and the output is a weighted average over
     * all slots, where each slot's per-pixel weight is
     *     16 - 3 * ((slot - source)^2 >> strength), saturated at 0.
     *
     * The loops advance 4 pixels at a time, so a trailing partial group is
     * still processed as a full 4 pixels; callers are expected to pass a
     * byte count that is a positive multiple of 4.
     */
    int byte = 0;
    unsigned char *frameptr = ppi->frame_buffer;

    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};

    /* Nothing to filter: the do/while loops below would otherwise still
     * process one full group and touch memory past the (empty) buffers.
     * The frame counter is left alone so that the next real call still
     * sees the same ppi->frame value. No MMX state has been touched yet,
     * so no emms is needed on this path. */
    if (bytes <= 0)
        return;

    if (ppi->frame == 0)
    {
        /* First frame: seed every history slot with the source pixels. */
        do
        {
            int i;
            int frame = 0;

            do
            {
                for (i = 0; i < 4; i++)
                {
                    *frameptr = s[byte+i];
                    ++frameptr;
                }

                ++frame;
            }
            while (frame < FRAMECOUNT);

            /* No history yet, so the output is just the input. */
            for (i = 0; i < 4; i++)
                d[byte+i] = s[byte+i];

            byte += 4;

        }
        while (byte < bytes);
    }
    else
    {
        int i;
        int offset2 = (ppi->frame % FRAMECOUNT);    /* slot replaced by the current frame */

        do
        {
            /* Only the first 4 entries of each array are written (movq
             * stores 8 bytes) and read by the loop below. */
            __declspec(align(16)) unsigned short counts[8];    /* per-pixel sum of weights */
            __declspec(align(16)) unsigned short sums[8];      /* per-pixel weighted sum + rounding */
            __asm
            {

                mov         eax, offset2
                mov         edi, s                  // source pixels
                pxor        mm1, mm1                // accumulator
                pxor        mm7, mm7                // zero, used for byte -> word unpack

                mov         esi, frameptr           // history slots for this pixel group
                pxor        mm2, mm2                // count

                movd        mm3, DWORD PTR [edi]
                movd        DWORD PTR [esi+4*eax], mm3  // store current pixels into slot offset2

                punpcklbw   mm3, mm2                // mm3 source pixels as words (mm2 is zero here)
                mov         ecx,  FRAMECOUNT

                next_frame:
                movd        mm4, DWORD PTR [esi]    // get frame buffer values
                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
                movq        mm6, mm4                // save the pixel values
                psubsw      mm4, mm3                // subtracted pixel values
                pmullw      mm4, mm4                // square mm4
                movd        mm5, strength
                psrlw       mm4, mm5                // squared difference >> strength
                pmullw      mm4, threes             // 3 * modifier
                movq        mm5, sixteens           // 16s
                psubusw     mm5, mm4                // 16 - modifiers (saturates at 0)
                movq        mm4, mm5                // save the modifiers
                pmullw      mm4, mm6                // multiplier values
                paddusw     mm1, mm4                // accumulator
                paddusw     mm2, mm5                // count
                add         esi, 4                  // next frame slot
                dec         ecx                     // one fewer slot left for this group
                jnz         next_frame

                movq        counts, mm2
                psrlw       mm2, 1                  // divide count by 2 for rounding
                paddusw     mm1, mm2                // rounding added in

                mov         frameptr, esi           // now points at the next group's slots

                movq        sums, mm1

            }

            /* Divide each weighted sum by its weight total using the
             * ppi->fixed_divide table followed by a 16-bit right shift
             * (presumably a 16.16 reciprocal table -- TODO confirm). */
            for (i = 0; i < 4; i++)
            {
                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
                blurvalue >>= 16;
                d[i] = blurvalue;
            }

            s += 4;
            d += 4;
            byte += 4;
        }
        while (byte < bytes);
    }

    ++ppi->frame;
    /* Leave MMX state so later x87 floating-point code works. */
    __asm emms
}