summaryrefslogtreecommitdiff
path: root/vpx_scale/x86_64/scaleopt.c
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_scale/x86_64/scaleopt.c')
-rw-r--r--vpx_scale/x86_64/scaleopt.c1750
1 files changed, 0 insertions, 1750 deletions
diff --git a/vpx_scale/x86_64/scaleopt.c b/vpx_scale/x86_64/scaleopt.c
deleted file mode 100644
index 101f5ff60..000000000
--- a/vpx_scale/x86_64/scaleopt.c
+++ /dev/null
@@ -1,1750 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : scaleopt.cpp
-*
-* Description : Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-
-
-/****************************************************************************
-* Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short one_fifth[] = { 51, 51, 51, 51 };
-__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
-__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
-__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
-__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102, 51 };
-__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
-__declspec(align(16)) const static unsigned char mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
-__declspec(align(16)) const static unsigned short const35_2[] = { 154, 51, 205, 102 };
-__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205, 51, 154 };
-
-
-
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*
-* ROUTINE : horizontal_line_3_5_scale_mmx
-*
-* INPUTS : const unsigned char *source :
-* unsigned int source_width :
-* unsigned char *dest :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 3 to 5 up-scaling of a horizontal line of pixels.
-*
-* SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_3_5_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- (void) dest_width;
-
- __asm
- {
-
- push rbx
-
- mov rsi, source
- mov rdi, dest
-
- mov ecx, source_width
- lea rdx, [rsi+rcx-3];
-
- movq mm5, const35_1 // mm5 = 66 xx cd xx 33 xx 9a xx
- movq mm6, const35_2 // mm6 = 9a xx 33 xx cd xx 66 xx
-
- movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
- pxor mm7, mm7 // clear mm7
-
- horiz_line_3_5_loop:
-
- mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
- mov ebx, eax
-
- and ebx, 0xffff00 // ebx = xx 01 02 xx
- mov ecx, eax // ecx = 00 01 02 03
-
- and eax, 0xffff0000 // eax = xx xx 02 03
- xor ecx, eax // ecx = 00 01 xx xx
-
- shr ebx, 8 // ebx = 01 02 xx xx
- or eax, ebx // eax = 01 02 02 03
-
- shl ebx, 16 // ebx = xx xx 01 02
- movd mm1, eax // mm1 = 01 02 02 03 xx xx xx xx
-
- or ebx, ecx // ebx = 00 01 01 02
- punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 03 xx
-
- movd mm0, ebx // mm0 = 00 01 01 02
- pmullw mm1, mm6 //
-
- punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
- pmullw mm0, mm5 //
-
- mov [rdi], ebx // writeoutput 00 xx xx xx
- add rsi, 3
-
- add rdi, 5
- paddw mm0, mm1
-
- paddw mm0, mm4
- psrlw mm0, 8
-
- cmp rsi, rdx
- packuswb mm0, mm7
-
- movd DWORD Ptr [rdi-4], mm0
- jl horiz_line_3_5_loop
-
-//Exit:
- mov eax, DWORD PTR [rsi] // eax = 00 01 02 03
- mov ebx, eax
-
- and ebx, 0xffff00 // ebx = xx 01 02 xx
- mov ecx, eax // ecx = 00 01 02 03
-
- and eax, 0xffff0000 // eax = xx xx 02 03
- xor ecx, eax // ecx = 00 01 xx xx
-
- shr ebx, 8 // ebx = 01 02 xx xx
- or eax, ebx // eax = 01 02 02 03
-
- shl eax, 8 // eax = xx 01 02 02
- and eax, 0xffff0000 // eax = xx xx 02 02
-
- or eax, ebx // eax = 01 02 02 02
-
- shl ebx, 16 // ebx = xx xx 01 02
- movd mm1, eax // mm1 = 01 02 02 02 xx xx xx xx
-
- or ebx, ecx // ebx = 00 01 01 02
- punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 02 xx 02 xx
-
- movd mm0, ebx // mm0 = 00 01 01 02
- pmullw mm1, mm6 //
-
- punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 01 xx 02 xx
- pmullw mm0, mm5 //
-
- mov [rdi], ebx // writeoutput 00 xx xx xx
- paddw mm0, mm1
-
- paddw mm0, mm4
- psrlw mm0, 8
-
- packuswb mm0, mm7
- movd DWORD Ptr [rdi+1], mm0
-
- pop rbx
-
- }
-
-}
-
-
-/****************************************************************************
-*
-* ROUTINE : horizontal_line_4_5_scale_mmx
-*
-* INPUTS : const unsigned char *source :
-* unsigned int source_width :
-* unsigned char *dest :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 4 to 5 up-scaling of a horizontal line of pixels.
-*
-* SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_4_5_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- (void)dest_width;
-
- __asm
- {
-
- mov rsi, source
- mov rdi, dest
-
- mov ecx, source_width
- lea rdx, [rsi+rcx-8];
-
- movq mm5, const45_1 // mm5 = 33 xx 66 xx 9a xx cd xx
- movq mm6, const45_2 // mm6 = cd xx 9a xx 66 xx 33 xx
-
- movq mm4, round_values // mm4 = 80 xx 80 xx 80 xx 80 xx
- pxor mm7, mm7 // clear mm7
-
- horiz_line_4_5_loop:
-
- movq mm0, QWORD PTR [rsi] // mm0 = 00 01 02 03 04 05 06 07
- movq mm1, QWORD PTR [rsi+1]; // mm1 = 01 02 03 04 05 06 07 08
-
- movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
- movq mm3, mm1 // mm3 = 01 02 03 04 05 06 07 08
-
- movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
- punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
-
- punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
- pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
-
- pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
- punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
-
- movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
- pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
-
- punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
- pmullw mm3, mm6 // 05*205 06*154 07*102 08* 51
-
- paddw mm0, mm1 // added round values
- paddw mm0, mm4
-
- psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
- packuswb mm0, mm7
-
- movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
- add rdi, 10
-
- add rsi, 8
- paddw mm2, mm3 //
-
- paddw mm2, mm4 // added round values
- cmp rsi, rdx
-
- psrlw mm2, 8
- packuswb mm2, mm7
-
- movd DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
- jl horiz_line_4_5_loop
-
-//Exit:
- movq mm0, [rsi] // mm0 = 00 01 02 03 04 05 06 07
- movq mm1, mm0 // mm1 = 00 01 02 03 04 05 06 07
-
- movq mm2, mm0 // mm2 = 00 01 02 03 04 05 06 07
- psrlq mm1, 8 // mm1 = 01 02 03 04 05 06 07 00
-
- movq mm3, mask45 // mm3 = 00 00 00 00 00 00 ff 00
- pand mm3, mm1 // mm3 = 00 00 00 00 00 00 07 00
-
- psllq mm3, 8 // mm3 = 00 00 00 00 00 00 00 07
- por mm1, mm3 // mm1 = 01 02 03 04 05 06 07 07
-
- movq mm3, mm1
-
- movd DWORD PTR [rdi], mm0 // write output 00 xx xx xx
- punpcklbw mm0, mm7 // mm0 = 00 xx 01 xx 02 xx 03 xx
-
- punpcklbw mm1, mm7 // mm1 = 01 xx 02 xx 03 xx 04 xx
- pmullw mm0, mm5 // 00* 51 01*102 02*154 03*205
-
- pmullw mm1, mm6 // 01*205 02*154 03*102 04* 51
- punpckhbw mm2, mm7 // mm2 = 04 xx 05 xx 06 xx 07 xx
-
- movd DWORD PTR [rdi+5], mm2 // write ouput 05 xx xx xx
- pmullw mm2, mm5 // 04* 51 05*102 06*154 07*205
-
- punpckhbw mm3, mm7 // mm3 = 05 xx 06 xx 07 xx 08 xx
- pmullw mm3, mm6 // 05*205 06*154 07*102 07* 51
-
- paddw mm0, mm1 // added round values
- paddw mm0, mm4
-
- psrlw mm0, 8 // output: 01 xx 02 xx 03 xx 04 xx
- packuswb mm0, mm7 // 01 02 03 04 xx xx xx xx
-
- movd DWORD PTR [rdi+1], mm0 // write output 01 02 03 04
- paddw mm2, mm3 //
-
- paddw mm2, mm4 // added round values
- psrlw mm2, 8
-
- packuswb mm2, mm7
- movd DWORD PTR [rdi+6], mm2 // writeoutput 06 07 08 09
-
-
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : vertical_band_4_5_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 4 to 5 up-scaling of a 4 pixel high band of pixels.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has a "C" only
-* version.
-*
-****************************************************************************/
-static
-void vertical_band_4_5_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
-
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- lea rdi, [rsi+rcx*2] // tow lines below
- add rdi, rcx // three lines below
-
- pxor mm7, mm7 // clear out mm7
- mov edx, dest_width // Loop counter
-
- vs_4_5_loop:
-
- movq mm0, QWORD ptr [rsi] // src[0];
- movq mm1, QWORD ptr [rsi+rcx] // src[1];
-
- movq mm2, mm0 // Make a copy
- punpcklbw mm0, mm7 // unpack low to word
-
- movq mm5, one_fifth
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm0, mm5 // a * 1/5
-
- movq mm3, mm1 // make a copy
- punpcklbw mm1, mm7 // unpack low to word
-
- pmullw mm2, mm5 // a * 1/5
- movq mm6, four_fifths // constan
-
- movq mm4, mm1 // copy of low b
- pmullw mm4, mm6 // b * 4/5
-
- punpckhbw mm3, mm7 // unpack high to word
- movq mm5, mm3 // copy of high b
-
- pmullw mm5, mm6 // b * 4/5
- paddw mm0, mm4 // a * 1/5 + b * 4/5
-
- paddw mm2, mm5 // a * 1/5 + b * 4/5
- paddw mm0, round_values // + 128
-
- paddw mm2, round_values // + 128
- psrlw mm0, 8
-
- psrlw mm2, 8
- packuswb mm0, mm2 // des [1]
-
- movq QWORD ptr [rsi+rcx], mm0 // write des[1]
- movq mm0, [rsi+rcx*2] // mm0 = src[2]
-
- // mm1, mm3 --- Src[1]
- // mm0 --- Src[2]
- // mm7 for unpacking
-
- movq mm5, two_fifths
- movq mm2, mm0 // make a copy
-
- pmullw mm1, mm5 // b * 2/5
- movq mm6, three_fifths
-
-
- punpcklbw mm0, mm7 // unpack low to word
- pmullw mm3, mm5 // b * 2/5
-
- movq mm4, mm0 // make copy of c
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm4, mm6 // c * 3/5
- movq mm5, mm2
-
- pmullw mm5, mm6 // c * 3/5
- paddw mm1, mm4 // b * 2/5 + c * 3/5
-
- paddw mm3, mm5 // b * 2/5 + c * 3/5
- paddw mm1, round_values // + 128
-
- paddw mm3, round_values // + 128
- psrlw mm1, 8
-
- psrlw mm3, 8
- packuswb mm1, mm3 // des[2]
-
- movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
- movq mm1, [rdi] // mm1=Src[3];
-
- // mm0, mm2 --- Src[2]
- // mm1 --- Src[3]
- // mm6 --- 3/5
- // mm7 for unpacking
-
- pmullw mm0, mm6 // c * 3/5
- movq mm5, two_fifths // mm5 = 2/5
-
- movq mm3, mm1 // make a copy
- pmullw mm2, mm6 // c * 3/5
-
- punpcklbw mm1, mm7 // unpack low
- movq mm4, mm1 // make a copy
-
- punpckhbw mm3, mm7 // unpack high
- pmullw mm4, mm5 // d * 2/5
-
- movq mm6, mm3 // make a copy
- pmullw mm6, mm5 // d * 2/5
-
- paddw mm0, mm4 // c * 3/5 + d * 2/5
- paddw mm2, mm6 // c * 3/5 + d * 2/5
-
- paddw mm0, round_values // + 128
- paddw mm2, round_values // + 128
-
- psrlw mm0, 8
- psrlw mm2, 8
-
- packuswb mm0, mm2 // des[3]
- movq QWORD ptr [rdi], mm0 // write des[3]
-
- // mm1, mm3 --- Src[3]
- // mm7 -- cleared for unpacking
-
- movq mm0, [rdi+rcx*2] // mm0, Src[0] of the next group
-
- movq mm5, four_fifths // mm5 = 4/5
- pmullw mm1, mm5 // d * 4/5
-
- movq mm6, one_fifth // mm6 = 1/5
- movq mm2, mm0 // make a copy
-
- pmullw mm3, mm5 // d * 4/5
- punpcklbw mm0, mm7 // unpack low
-
- pmullw mm0, mm6 // an * 1/5
- punpckhbw mm2, mm7 // unpack high
-
- paddw mm1, mm0 // d * 4/5 + an * 1/5
- pmullw mm2, mm6 // an * 1/5
-
- paddw mm3, mm2 // d * 4/5 + an * 1/5
- paddw mm1, round_values // + 128
-
- paddw mm3, round_values // + 128
- psrlw mm1, 8
-
- psrlw mm3, 8
- packuswb mm1, mm3 // des[4]
-
- movq QWORD ptr [rdi+rcx], mm1 // write des[4]
-
- add rdi, 8
- add rsi, 8
-
- sub rdx, 8
- jg vs_4_5_loop
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : last_vertical_band_4_5_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : None
-*
-* FUNCTION : 4 to 5 up-scaling of the last 4-pixel high band in an image.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has an "C" only
-* version.
-*
-****************************************************************************/
-static
-void last_vertical_band_4_5_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- lea rdi, [rsi+rcx*2] // tow lines below
- add rdi, rcx // three lines below
-
- pxor mm7, mm7 // clear out mm7
- mov edx, dest_width // Loop counter
-
- last_vs_4_5_loop:
-
- movq mm0, QWORD ptr [rsi] // src[0];
- movq mm1, QWORD ptr [rsi+rcx] // src[1];
-
- movq mm2, mm0 // Make a copy
- punpcklbw mm0, mm7 // unpack low to word
-
- movq mm5, one_fifth
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm0, mm5 // a * 1/5
-
- movq mm3, mm1 // make a copy
- punpcklbw mm1, mm7 // unpack low to word
-
- pmullw mm2, mm5 // a * 1/5
- movq mm6, four_fifths // constan
-
- movq mm4, mm1 // copy of low b
- pmullw mm4, mm6 // b * 4/5
-
- punpckhbw mm3, mm7 // unpack high to word
- movq mm5, mm3 // copy of high b
-
- pmullw mm5, mm6 // b * 4/5
- paddw mm0, mm4 // a * 1/5 + b * 4/5
-
- paddw mm2, mm5 // a * 1/5 + b * 4/5
- paddw mm0, round_values // + 128
-
- paddw mm2, round_values // + 128
- psrlw mm0, 8
-
- psrlw mm2, 8
- packuswb mm0, mm2 // des [1]
-
- movq QWORD ptr [rsi+rcx], mm0 // write des[1]
- movq mm0, [rsi+rcx*2] // mm0 = src[2]
-
- // mm1, mm3 --- Src[1]
- // mm0 --- Src[2]
- // mm7 for unpacking
-
- movq mm5, two_fifths
- movq mm2, mm0 // make a copy
-
- pmullw mm1, mm5 // b * 2/5
- movq mm6, three_fifths
-
-
- punpcklbw mm0, mm7 // unpack low to word
- pmullw mm3, mm5 // b * 2/5
-
- movq mm4, mm0 // make copy of c
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm4, mm6 // c * 3/5
- movq mm5, mm2
-
- pmullw mm5, mm6 // c * 3/5
- paddw mm1, mm4 // b * 2/5 + c * 3/5
-
- paddw mm3, mm5 // b * 2/5 + c * 3/5
- paddw mm1, round_values // + 128
-
- paddw mm3, round_values // + 128
- psrlw mm1, 8
-
- psrlw mm3, 8
- packuswb mm1, mm3 // des[2]
-
- movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
- movq mm1, [rdi] // mm1=Src[3];
-
- movq QWORD ptr [rdi+rcx], mm1 // write des[4];
-
- // mm0, mm2 --- Src[2]
- // mm1 --- Src[3]
- // mm6 --- 3/5
- // mm7 for unpacking
-
- pmullw mm0, mm6 // c * 3/5
- movq mm5, two_fifths // mm5 = 2/5
-
- movq mm3, mm1 // make a copy
- pmullw mm2, mm6 // c * 3/5
-
- punpcklbw mm1, mm7 // unpack low
- movq mm4, mm1 // make a copy
-
- punpckhbw mm3, mm7 // unpack high
- pmullw mm4, mm5 // d * 2/5
-
- movq mm6, mm3 // make a copy
- pmullw mm6, mm5 // d * 2/5
-
- paddw mm0, mm4 // c * 3/5 + d * 2/5
- paddw mm2, mm6 // c * 3/5 + d * 2/5
-
- paddw mm0, round_values // + 128
- paddw mm2, round_values // + 128
-
- psrlw mm0, 8
- psrlw mm2, 8
-
- packuswb mm0, mm2 // des[3]
- movq QWORD ptr [rdi], mm0 // write des[3]
-
- // mm1, mm3 --- Src[3]
- // mm7 -- cleared for unpacking
- add rdi, 8
- add rsi, 8
-
- sub rdx, 8
- jg last_vs_4_5_loop
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : vertical_band_3_5_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has an "C" only
-* version.
-*
-****************************************************************************/
-static
-void vertical_band_3_5_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- lea rdi, [rsi+rcx*2] // two lines below
- add rdi, rcx // three lines below
-
- pxor mm7, mm7 // clear out mm7
- mov edx, dest_width // Loop counter
-
- vs_3_5_loop:
-
- movq mm0, QWORD ptr [rsi] // src[0];
- movq mm1, QWORD ptr [rsi+rcx] // src[1];
-
- movq mm2, mm0 // Make a copy
- punpcklbw mm0, mm7 // unpack low to word
-
- movq mm5, two_fifths // mm5 = 2/5
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm0, mm5 // a * 2/5
-
- movq mm3, mm1 // make a copy
- punpcklbw mm1, mm7 // unpack low to word
-
- pmullw mm2, mm5 // a * 2/5
- movq mm6, three_fifths // mm6 = 3/5
-
- movq mm4, mm1 // copy of low b
- pmullw mm4, mm6 // b * 3/5
-
- punpckhbw mm3, mm7 // unpack high to word
- movq mm5, mm3 // copy of high b
-
- pmullw mm5, mm6 // b * 3/5
- paddw mm0, mm4 // a * 2/5 + b * 3/5
-
- paddw mm2, mm5 // a * 2/5 + b * 3/5
- paddw mm0, round_values // + 128
-
- paddw mm2, round_values // + 128
- psrlw mm0, 8
-
- psrlw mm2, 8
- packuswb mm0, mm2 // des [1]
-
- movq QWORD ptr [rsi+rcx], mm0 // write des[1]
- movq mm0, [rsi+rcx*2] // mm0 = src[2]
-
- // mm1, mm3 --- Src[1]
- // mm0 --- Src[2]
- // mm7 for unpacking
-
- movq mm4, mm1 // b low
- pmullw mm1, four_fifths // b * 4/5 low
-
- movq mm5, mm3 // b high
- pmullw mm3, four_fifths // b * 4/5 high
-
- movq mm2, mm0 // c
- pmullw mm4, one_fifth // b * 1/5
-
- punpcklbw mm0, mm7 // c low
- pmullw mm5, one_fifth // b * 1/5
-
- movq mm6, mm0 // make copy of c low
- punpckhbw mm2, mm7 // c high
-
- pmullw mm6, one_fifth // c * 1/5 low
- movq mm7, mm2 // make copy of c high
-
- pmullw mm7, one_fifth // c * 1/5 high
- paddw mm1, mm6 // b * 4/5 + c * 1/5 low
-
- paddw mm3, mm7 // b * 4/5 + c * 1/5 high
- movq mm6, mm0 // make copy of c low
-
- pmullw mm6, four_fifths // c * 4/5 low
- movq mm7, mm2 // make copy of c high
-
- pmullw mm7, four_fifths // c * 4/5 high
-
- paddw mm4, mm6 // b * 1/5 + c * 4/5 low
- paddw mm5, mm7 // b * 1/5 + c * 4/5 high
-
- paddw mm1, round_values // + 128
- paddw mm3, round_values // + 128
-
- psrlw mm1, 8
- psrlw mm3, 8
-
- packuswb mm1, mm3 // des[2]
- movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
-
- paddw mm4, round_values // + 128
- paddw mm5, round_values // + 128
-
- psrlw mm4, 8
- psrlw mm5, 8
-
- packuswb mm4, mm5 // des[3]
- movq QWORD ptr [rdi], mm4 // write des[3]
-
- // mm0, mm2 --- Src[3]
-
- pxor mm7, mm7 // clear mm7 for unpacking
- movq mm1, [rdi+rcx*2] // mm1 = Src[0] of the next group
-
- movq mm5, three_fifths // mm5 = 3/5
- pmullw mm0, mm5 // d * 3/5
-
- movq mm6, two_fifths // mm6 = 2/5
- movq mm3, mm1 // make a copy
-
- pmullw mm2, mm5 // d * 3/5
- punpcklbw mm1, mm7 // unpack low
-
- pmullw mm1, mm6 // an * 2/5
- punpckhbw mm3, mm7 // unpack high
-
- paddw mm0, mm1 // d * 3/5 + an * 2/5
- pmullw mm3, mm6 // an * 2/5
-
- paddw mm2, mm3 // d * 3/5 + an * 2/5
- paddw mm0, round_values // + 128
-
- paddw mm2, round_values // + 128
- psrlw mm0, 8
-
- psrlw mm2, 8
- packuswb mm0, mm2 // des[4]
-
- movq QWORD ptr [rdi+rcx], mm0 // write des[4]
-
- add rdi, 8
- add rsi, 8
-
- sub rdx, 8
- jg vs_3_5_loop
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : last_vertical_band_3_5_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 3 to 5 up-scaling of a 3-pixel high band of pixels.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has an "C" only
-* version.
-*
-****************************************************************************/
-static
-void last_vertical_band_3_5_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- lea rdi, [rsi+rcx*2] // tow lines below
- add rdi, rcx // three lines below
-
- pxor mm7, mm7 // clear out mm7
- mov edx, dest_width // Loop counter
-
-
- last_vs_3_5_loop:
-
- movq mm0, QWORD ptr [rsi] // src[0];
- movq mm1, QWORD ptr [rsi+rcx] // src[1];
-
- movq mm2, mm0 // Make a copy
- punpcklbw mm0, mm7 // unpack low to word
-
- movq mm5, two_fifths // mm5 = 2/5
- punpckhbw mm2, mm7 // unpack high to word
-
- pmullw mm0, mm5 // a * 2/5
-
- movq mm3, mm1 // make a copy
- punpcklbw mm1, mm7 // unpack low to word
-
- pmullw mm2, mm5 // a * 2/5
- movq mm6, three_fifths // mm6 = 3/5
-
- movq mm4, mm1 // copy of low b
- pmullw mm4, mm6 // b * 3/5
-
- punpckhbw mm3, mm7 // unpack high to word
- movq mm5, mm3 // copy of high b
-
- pmullw mm5, mm6 // b * 3/5
- paddw mm0, mm4 // a * 2/5 + b * 3/5
-
- paddw mm2, mm5 // a * 2/5 + b * 3/5
- paddw mm0, round_values // + 128
-
- paddw mm2, round_values // + 128
- psrlw mm0, 8
-
- psrlw mm2, 8
- packuswb mm0, mm2 // des [1]
-
- movq QWORD ptr [rsi+rcx], mm0 // write des[1]
- movq mm0, [rsi+rcx*2] // mm0 = src[2]
-
-
-
- // mm1, mm3 --- Src[1]
- // mm0 --- Src[2]
- // mm7 for unpacking
-
- movq mm4, mm1 // b low
- pmullw mm1, four_fifths // b * 4/5 low
-
- movq QWORD ptr [rdi+rcx], mm0 // write des[4]
-
- movq mm5, mm3 // b high
- pmullw mm3, four_fifths // b * 4/5 high
-
- movq mm2, mm0 // c
- pmullw mm4, one_fifth // b * 1/5
-
- punpcklbw mm0, mm7 // c low
- pmullw mm5, one_fifth // b * 1/5
-
- movq mm6, mm0 // make copy of c low
- punpckhbw mm2, mm7 // c high
-
- pmullw mm6, one_fifth // c * 1/5 low
- movq mm7, mm2 // make copy of c high
-
- pmullw mm7, one_fifth // c * 1/5 high
- paddw mm1, mm6 // b * 4/5 + c * 1/5 low
-
- paddw mm3, mm7 // b * 4/5 + c * 1/5 high
- movq mm6, mm0 // make copy of c low
-
- pmullw mm6, four_fifths // c * 4/5 low
- movq mm7, mm2 // make copy of c high
-
- pmullw mm7, four_fifths // c * 4/5 high
-
- paddw mm4, mm6 // b * 1/5 + c * 4/5 low
- paddw mm5, mm7 // b * 1/5 + c * 4/5 high
-
- paddw mm1, round_values // + 128
- paddw mm3, round_values // + 128
-
- psrlw mm1, 8
- psrlw mm3, 8
-
- packuswb mm1, mm3 // des[2]
- movq QWORD ptr [rsi+rcx*2], mm1 // write des[2]
-
- paddw mm4, round_values // + 128
- paddw mm5, round_values // + 128
-
- psrlw mm4, 8
- psrlw mm5, 8
-
- packuswb mm4, mm5 // des[3]
- movq QWORD ptr [rdi], mm4 // write des[3]
-
- // mm0, mm2 --- Src[3]
-
- add rdi, 8
- add rsi, 8
-
- sub rdx, 8
- jg last_vs_3_5_loop
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : vertical_band_1_2_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 1 to 2 up-scaling of a band of pixels.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has an "C" only
-* version.
-*
-****************************************************************************/
-static
-void vertical_band_1_2_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
-
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- pxor mm7, mm7 // clear out mm7
- mov edx, dest_width // Loop counter
-
- vs_1_2_loop:
-
- movq mm0, [rsi] // get Src[0]
- movq mm1, [rsi + rcx * 2] // get Src[1]
-
- movq mm2, mm0 // make copy before unpack
- movq mm3, mm1 // make copy before unpack
-
- punpcklbw mm0, mm7 // low Src[0]
- movq mm6, four_ones // mm6= 1, 1, 1, 1
-
- punpcklbw mm1, mm7 // low Src[1]
- paddw mm0, mm1 // low (a + b)
-
- punpckhbw mm2, mm7 // high Src[0]
- paddw mm0, mm6 // low (a + b + 1)
-
- punpckhbw mm3, mm7
- paddw mm2, mm3 // high (a + b )
-
- psraw mm0, 1 // low (a + b +1 )/2
- paddw mm2, mm6 // high (a + b + 1)
-
- psraw mm2, 1 // high (a + b + 1)/2
- packuswb mm0, mm2 // pack results
-
- movq [rsi+rcx], mm0 // write out eight bytes
- add rsi, 8
-
- sub rdx, 8
- jg vs_1_2_loop
- }
-
-}
-
-/****************************************************************************
-*
-* ROUTINE : last_vertical_band_1_2_scale_mmx
-*
-* INPUTS : unsigned char *dest :
-* unsigned int dest_pitch :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 1 to 2 up-scaling of band of pixels.
-*
-* SPECIAL NOTES : The routine uses the first line of the band below
-* the current band. The function also has an "C" only
-* version.
-*
-****************************************************************************/
-static
-void last_vertical_band_1_2_scale_mmx
-(
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
- mov rsi, dest // Get the source and destination pointer
- mov ecx, dest_pitch // Get the pitch size
-
- mov edx, dest_width // Loop counter
-
- last_vs_1_2_loop:
-
- movq mm0, [rsi] // get Src[0]
- movq [rsi+rcx], mm0 // write out eight bytes
-
- add rsi, 8
- sub rdx, 8
-
- jg last_vs_1_2_loop
- }
-}
-
-/****************************************************************************
-*
-* ROUTINE : horizontal_line_1_2_scale
-*
-* INPUTS : const unsigned char *source :
-* unsigned int source_width :
-* unsigned char *dest :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
-*
-* SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_1_2_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- (void) dest_width;
-
- __asm
- {
- mov rsi, source
- mov rdi, dest
-
- pxor mm7, mm7
- movq mm6, four_ones
-
- mov ecx, source_width
-
- hs_1_2_loop:
-
- movq mm0, [rsi]
- movq mm1, [rsi+1]
-
- movq mm2, mm0
- movq mm3, mm1
-
- movq mm4, mm0
- punpcklbw mm0, mm7
-
- punpcklbw mm1, mm7
- paddw mm0, mm1
-
- paddw mm0, mm6
- punpckhbw mm2, mm7
-
- punpckhbw mm3, mm7
- paddw mm2, mm3
-
- paddw mm2, mm6
- psraw mm0, 1
-
- psraw mm2, 1
- packuswb mm0, mm2
-
- movq mm2, mm4
- punpcklbw mm2, mm0
-
- movq [rdi], mm2
- punpckhbw mm4, mm0
-
- movq [rdi+8], mm4
- add rsi, 8
-
- add rdi, 16
- sub rcx, 8
-
- cmp rcx, 8
- jg hs_1_2_loop
-
-// last eight pixel
-
- movq mm0, [rsi]
- movq mm1, mm0
-
- movq mm2, mm0
- movq mm3, mm1
-
- psrlq mm1, 8
- psrlq mm3, 56
-
- psllq mm3, 56
- por mm1, mm3
-
- movq mm3, mm1
- movq mm4, mm0
-
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-
- paddw mm0, mm1
- paddw mm0, mm6
-
- punpckhbw mm2, mm7
- punpckhbw mm3, mm7
-
- paddw mm2, mm3
- paddw mm2, mm6
-
- psraw mm0, 1
- psraw mm2, 1
-
- packuswb mm0, mm2
- movq mm2, mm4
-
- punpcklbw mm2, mm0
- movq [rdi], mm2
-
- punpckhbw mm4, mm0
- movq [rdi+8], mm4
- }
-}
-
-
-
-
-
-__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
-
-
-/****************************************************************************
-*
-* ROUTINE : horizontal_line_5_4_scale_mmx
-*
-* INPUTS : const unsigned char *source : Pointer to source data.
-* unsigned int source_width : Stride of source.
-* unsigned char *dest : Pointer to destination data.
-* unsigned int dest_width : Stride of destination (NOT USED).
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : Copies horizontal line of pixels from source to
-* destination scaling up by 4 to 5.
-*
-* SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- /*
- unsigned i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
- (void) dest_width;
-
- for ( i=0; i<source_width; i+=5 )
- {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
-
- des[0] = a;
- des[1] = ((b*192 + c* 64 + 128)>>8);
- des[2] = ((c*128 + d*128 + 128)>>8);
- des[3] = ((d* 64 + e*192 + 128)>>8);
-
- src += 5;
- des += 4;
- }
- */
- __asm
- {
-
- mov rsi, source ;
- mov rdi, dest ;
-
- mov ecx, source_width ;
- movq mm5, const54_1 ;
-
- pxor mm7, mm7 ;
- movq mm6, const54_2 ;
-
- movq mm4, round_values ;
- lea rdx, [rsi+rcx] ;
- horizontal_line_5_4_loop:
-
- movq mm0, QWORD PTR [rsi] ;
- 00 01 02 03 04 05 06 07
- movq mm1, mm0 ;
- 00 01 02 03 04 05 06 07
-
- psrlq mm0, 8 ;
- 01 02 03 04 05 06 07 xx
- punpcklbw mm1, mm7 ;
- xx 00 xx 01 xx 02 xx 03
-
- punpcklbw mm0, mm7 ;
- xx 01 xx 02 xx 03 xx 04
- pmullw mm1, mm5
-
- pmullw mm0, mm6
- add rsi, 5
-
- add rdi, 4
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp rsi, rdx
- packuswb mm1, mm7
-
- movd DWORD PTR [rdi-4], mm1
-
- jl horizontal_line_5_4_loop
-
- }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
-__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx
-(
- unsigned char *source,
- unsigned int src_pitch,
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
-
- __asm
- {
-
- mov rsi, source // Get the source and destination pointer
- mov ecx, src_pitch // Get the pitch size
-
- mov rdi, dest // tow lines below
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Loop counter
- mov ebx, dest_width
-
- vs_5_4_loop:
-
- movd mm0, DWORD ptr [rsi] // src[0];
- movd mm1, DWORD ptr [rsi+rcx] // src[1];
-
- movd mm2, DWORD ptr [rsi+rcx*2]
- lea rax, [rsi+rcx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- movq mm3, mm2
- pmullw mm1, three_fourths
-
- pmullw mm2, one_fourths
- movd mm4, [rax+rcx]
-
- pmullw mm3, two_fourths
- punpcklbw mm4, mm7
-
- movq mm5, mm4
- pmullw mm4, two_fourths
-
- paddw mm1, mm2
- movd mm6, [rax+rcx*2]
-
- pmullw mm5, one_fourths
- paddw mm1, round_values;
-
- paddw mm3, mm4
- psrlw mm1, 8
-
- punpcklbw mm6, mm7
- paddw mm3, round_values
-
- pmullw mm6, three_fourths
- psrlw mm3, 8
-
- packuswb mm1, mm7
- packuswb mm3, mm7
-
- movd DWORD PTR [rdi], mm0
- movd DWORD PTR [rdi+rdx], mm1
-
-
- paddw mm5, mm6
- movd DWORD PTR [rdi+rdx*2], mm3
-
- lea rax, [rdi+rdx*2]
- paddw mm5, round_values
-
- psrlw mm5, 8
- add rdi, 4
-
- packuswb mm5, mm7
- movd DWORD PTR [rax+rdx], mm5
-
- add rsi, 4
- sub rbx, 4
-
- jg vs_5_4_loop
- }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- __asm
- {
-
- mov rsi, source ;
- mov rdi, dest ;
-
- mov ecx, source_width ;
- movq mm5, const53_1 ;
-
- pxor mm7, mm7 ;
- movq mm6, const53_2 ;
-
- movq mm4, round_values ;
- lea rdx, [rsi+rcx-5] ;
- horizontal_line_5_3_loop:
-
- movq mm0, QWORD PTR [rsi] ;
- 00 01 02 03 04 05 06 07
- movq mm1, mm0 ;
- 00 01 02 03 04 05 06 07
-
- psllw mm0, 8 ;
- xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8 ;
- 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8 ;
- 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16 ;
- xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- add rsi, 5
-
- add rdi, 3
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp rsi, rdx
- packuswb mm1, mm7
-
- movd DWORD PTR [rdi-3], mm1
- jl horizontal_line_5_3_loop
-
-//exit condition
- movq mm0, QWORD PTR [rsi] ;
- 00 01 02 03 04 05 06 07
- movq mm1, mm0 ;
- 00 01 02 03 04 05 06 07
-
- psllw mm0, 8 ;
- xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8 ;
- 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8 ;
- 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16 ;
- xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- packuswb mm1, mm7
- movd rax, mm1
-
- mov rdx, rax
- shr rdx, 16
-
- mov WORD PTR[rdi], ax
- mov BYTE PTR[rdi+2], dl
-
- }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx
-(
- unsigned char *source,
- unsigned int src_pitch,
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
-
- __asm
- {
-
- mov rsi, source // Get the source and destination pointer
- mov ecx, src_pitch // Get the pitch size
-
- mov rdi, dest // tow lines below
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Loop counter
- movq mm5, one_thirds
-
- movq mm6, two_thirds
- mov ebx, dest_width;
-
- vs_5_3_loop:
-
- movd mm0, DWORD ptr [rsi] // src[0];
- movd mm1, DWORD ptr [rsi+rcx] // src[1];
-
- movd mm2, DWORD ptr [rsi+rcx*2]
- lea rax, [rsi+rcx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- pmullw mm1, mm5
- pmullw mm2, mm6
-
- movd mm3, DWORD ptr [rax+rcx]
- movd mm4, DWORD ptr [rax+rcx*2]
-
- punpcklbw mm3, mm7
- punpcklbw mm4, mm7
-
- pmullw mm3, mm6
- pmullw mm4, mm5
-
-
- movd DWORD PTR [rdi], mm0
- paddw mm1, mm2
-
- paddw mm1, round_values
- psrlw mm1, 8
-
- packuswb mm1, mm7
- paddw mm3, mm4
-
- paddw mm3, round_values
- movd DWORD PTR [rdi+rdx], mm1
-
- psrlw mm3, 8
- packuswb mm3, mm7
-
- movd DWORD PTR [rdi+rdx*2], mm3
-
-
- add rdi, 4
- add rsi, 4
-
- sub rbx, 4
- jg vs_5_3_loop
- }
-}
-
-
-
-
-/****************************************************************************
-*
-* ROUTINE : horizontal_line_2_1_scale
-*
-* INPUTS : const unsigned char *source :
-* unsigned int source_width :
-* unsigned char *dest :
-* unsigned int dest_width :
-*
-* OUTPUTS : None.
-*
-* RETURNS : void
-*
-* FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
-*
-* SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-)
-{
- (void) dest_width;
-
- __asm
- {
- mov rsi, source
- mov rdi, dest
-
- pxor mm7, mm7
- mov ecx, dest_width
-
- xor rdx, rdx
- hs_2_1_loop:
-
- movq mm0, [rsi+rdx*2]
- psllw mm0, 8
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD Ptr [rdi+rdx], mm0;
- add rdx, 4
-
- cmp rdx, rcx
- jl hs_2_1_loop
-
- }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx
-(
- unsigned char *source,
- unsigned int src_pitch,
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width)
-{
- vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx
-(
- unsigned char *source,
- unsigned int src_pitch,
- unsigned char *dest,
- unsigned int dest_pitch,
- unsigned int dest_width
-)
-{
- __asm
- {
- mov rsi, source
- mov rdi, dest
-
- mov eax, src_pitch
- mov edx, dest_width
-
- pxor mm7, mm7
- sub rsi, rax //back one line
-
-
- lea rcx, [rsi+rdx];
- movq mm6, round_values;
-
- movq mm5, three_sixteenths;
- movq mm4, ten_sixteenths;
-
- vs_2_1_i_loop:
- movd mm0, [rsi] //
- movd mm1, [rsi+rax] //
-
- movd mm2, [rsi+rax*2] //
- punpcklbw mm0, mm7
-
- pmullw mm0, mm5
- punpcklbw mm1, mm7
-
- pmullw mm1, mm4
- punpcklbw mm2, mm7
-
- pmullw mm2, mm5
- paddw mm0, round_values
-
- paddw mm1, mm2
- paddw mm0, mm1
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD PTR [rdi], mm0
- add rsi, 4
-
- add rdi, 4;
- cmp rsi, rcx
- jl vs_2_1_i_loop
-
- }
-}
-
-
-
-void
-register_mmxscalers(void)
-{
- vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_mmx;
- vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_mmx;
- vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_mmx;
- vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_mmx;
- vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_mmx;
- vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_mmx;
- vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_mmx;
- vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_mmx;
- vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_mmx;
-
- vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
- vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
- vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
- vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
- vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
- vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
- vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
-}