summaryrefslogtreecommitdiff
path: root/vp8/decoder/x86
diff options
context:
space:
mode:
authorJeff Muizelaar <jmuizelaar@mozilla.com>2010-06-03 10:16:07 -0400
committerFritz Koenig <frkoenig@google.com>2010-07-23 15:21:36 -0400
commit98fcccfe9751894ace9693a39ba0609fe5ea904d (patch)
tree030e0720292e315d0e90cf70fbeb73a47cf7713d /vp8/decoder/x86
parentb2fa74ac18c4a333f7913a346588b87087989202 (diff)
downloadlibvpx-98fcccfe9751894ace9693a39ba0609fe5ea904d.tar
libvpx-98fcccfe9751894ace9693a39ba0609fe5ea904d.tar.gz
libvpx-98fcccfe9751894ace9693a39ba0609fe5ea904d.tar.bz2
libvpx-98fcccfe9751894ace9693a39ba0609fe5ea904d.zip
Change the x86 idct functions to do reconstruction at the same time
Change-Id: I896fe6f9664e6849c7cee2cc6bb4e045eb42540f
Diffstat (limited to 'vp8/decoder/x86')
-rw-r--r--vp8/decoder/x86/dequantize_mmx.asm97
-rw-r--r--vp8/decoder/x86/dequantize_x86.h8
-rw-r--r--vp8/decoder/x86/x86_dsystemdependent.c2
3 files changed, 82 insertions, 25 deletions
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 2dcf548f6..7ad9289cc 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -50,12 +50,12 @@ sym(vp8_dequantize_b_impl_mmx):
ret
-;void dequant_idct_mmx(short *input, short *dq, short *output, int pitch)
-global sym(vp8_dequant_idct_mmx)
-sym(vp8_dequant_idct_mmx):
+;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+global sym(vp8_dequant_idct_add_mmx)
+sym(vp8_dequant_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
+ SHADOW_ARGS_TO_STACK 6
GET_GOT rbx
push rsi
push rdi
@@ -77,7 +77,8 @@ sym(vp8_dequant_idct_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(2) ;output
+ mov rdx, arg(3) ;dest
+ mov rsi, arg(2) ;pred
pxor mm7, mm7
@@ -88,7 +89,8 @@ sym(vp8_dequant_idct_mmx):
movq [rax+24],mm7
- movsxd rax, dword ptr arg(3) ;pitch
+ movsxd rax, dword ptr arg(4) ;pitch
+ movsxd rdi, dword ptr arg(5) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -207,13 +209,34 @@ sym(vp8_dequant_idct_mmx):
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
- movq [rdx], mm0
+ pxor mm7, mm7
- movq [rdx+rax], mm1
- movq [rdx+rax*2], mm2
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
- add rdx, rax
- movq [rdx+rax*2], mm5
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
@@ -224,12 +247,12 @@ sym(vp8_dequant_idct_mmx):
ret
-;void dequant_dc_idct_mmx(short *input, short *dq, short *output, int pitch, int Dc)
-global sym(vp8_dequant_dc_idct_mmx)
-sym(vp8_dequant_dc_idct_mmx):
+;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+global sym(vp8_dequant_dc_idct_add_mmx)
+sym(vp8_dequant_dc_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
+ SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
@@ -238,7 +261,7 @@ sym(vp8_dequant_dc_idct_mmx):
mov rax, arg(0) ;input
mov rdx, arg(1) ;dq
- movsxd rcx, dword ptr arg(4) ;Dc
+ movsxd rcx, dword ptr arg(6) ;Dc
movq mm0, [rax ]
pmullw mm0, [rdx]
@@ -252,7 +275,8 @@ sym(vp8_dequant_dc_idct_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(2) ;output
+ mov rdx, arg(3) ;dest
+ mov rsi, arg(2) ;pred
pxor mm7, mm7
@@ -262,8 +286,10 @@ sym(vp8_dequant_dc_idct_mmx):
movq [rax+16],mm7
movq [rax+24],mm7
+
pinsrw mm0, rcx, 0
- movsxd rax, dword ptr arg(3) ;pitch
+ movsxd rax, dword ptr arg(4) ;pitch
+ movsxd rdi, dword ptr arg(5) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -382,13 +408,34 @@ sym(vp8_dequant_dc_idct_mmx):
punpckldq mm2, mm4 ; 32 22 12 02
punpckhdq mm5, mm4 ; 33 23 13 03
- movq [rdx], mm0
-
- movq [rdx+rax], mm1
- movq [rdx+rax*2], mm2
-
- add rdx, rax
- movq [rdx+rax*2], mm5
+ pxor mm7, mm7
+
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
+
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
+
+ add rdx, rdi
+ add rsi, rax
+
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
; begin epilog
pop rdi
diff --git a/vp8/decoder/x86/dequantize_x86.h b/vp8/decoder/x86/dequantize_x86.h
index 563d58460..32e777994 100644
--- a/vp8/decoder/x86/dequantize_x86.h
+++ b/vp8/decoder/x86/dequantize_x86.h
@@ -21,12 +21,20 @@
*/
#if HAVE_MMX
extern prototype_dequant_block(vp8_dequantize_b_mmx);
+extern prototype_dequant_idct_add(vp8_dequant_idct_add_mmx);
+extern prototype_dequant_idct_dc_add(vp8_dequant_dc_idct_add_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_mmx
+#undef vp8_dequant_idct_add
+#define vp8_dequant_idct_add vp8_dequant_idct_add_mmx
+
+#undef vp8_dequant_idct_dc
+#define vp8_dequant_idct_add_dc vp8_dequant_dc_idct_add_mmx
+
#endif
#endif
diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index bcd2acc75..d7bed0873 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c
@@ -43,6 +43,8 @@ void vp8_arch_x86_decode_init(VP8D_COMP *pbi)
if (flags & HAS_MMX)
{
pbi->dequant.block = vp8_dequantize_b_mmx;
+ pbi->dequant.idct_add = vp8_dequant_idct_add_mmx;
+ pbi->dequant.idct_dc_add = vp8_dequant_dc_idct_add_mmx;
}
#endif