summary | refs | log | tree | commit | diff
path: root/vp9
diff options
context:
space:
mode:
author: Yunqing Wang <yunqingwang@google.com> 2013-11-19 14:29:25 -0800
committer: Yunqing Wang <yunqingwang@google.com> 2013-11-19 15:10:04 -0800
commit: 3d50da5397d20abc932d81453b26cde758293a40 (patch)
tree: e15143338fe82217be5618ecfac1b1574f4bcfd2 /vp9
parent: 69541e1decbc193b0b7af328f48e25c96c8e49eb (diff)
download: libvpx-3d50da5397d20abc932d81453b26cde758293a40.tar
libvpx-3d50da5397d20abc932d81453b26cde758293a40.tar.gz
libvpx-3d50da5397d20abc932d81453b26cde758293a40.tar.bz2
libvpx-3d50da5397d20abc932d81453b26cde758293a40.zip
Fix decoder mismatch with ssse3 enabled
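This patch fixes issue 661: "Decoder produces mismatched outputs with ssse3 enabled and disabled." In the sub-pixel filters, each pixel value is multiplied by a filter coefficient and the products are summed. Because the sums are accumulated with saturating 16-bit additions (paddsw), the order in which the products are added must be arranged carefully so that an intermediate sum does not saturate incorrectly. The patch adds the two outer-tap product pairs first, then the smaller of the two middle product pairs (pminsw), and finally the larger (pmaxsw). Change-Id: Id08af4200fea9e1b896fc40157b8651c2c7e80f2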
Diffstat (limited to 'vp9')
-rw-r--r-- vp9/common/x86/vp9_subpixel_8t_ssse3.asm 56
1 file changed, 37 insertions, 19 deletions
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index 7a5cca056..dbc17ec0f 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -11,17 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-
%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
@@ -81,11 +70,14 @@
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
+ movdqa xmm1, xmm2
paddsw xmm0, xmm6
- paddsw xmm0, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
paddsw xmm0, xmm4
- paddsw xmm0, krd
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
@@ -538,14 +530,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa %2, %1
pshufb %1, [GLOBAL(shuf_t0t1)]
pshufb %2, [GLOBAL(shuf_t2t3)]
- pmaddubsw %1, xmm6
- pmaddubsw %2, xmm7
+ pmaddubsw %1, k0k1k4k5
+ pmaddubsw %2, k2k3k6k7
- paddsw %1, %2
- movdqa %2, %1
+ movdqa xmm4, %1
+ movdqa xmm5, %2
+ psrldq %1, 8
psrldq %2, 8
- paddsw %1, %2
- paddsw %1, xmm5
+ movdqa xmm6, xmm5
+
+ paddsw xmm4, %2
+ pmaxsw xmm5, %1
+ pminsw %1, xmm6
+ paddsw %1, xmm4
+ paddsw %1, xmm5
+
+ paddsw %1, krd
psraw %1, 7
packuswb %1, %1
%endm
@@ -565,6 +565,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
pshufd xmm5, xmm5, 0 ;rounding
+ movdqa k0k1k4k5, xmm6
+ movdqa k2k3k6k7, xmm7
+ movdqa krd, xmm5
+
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
@@ -826,8 +830,15 @@ sym(vp9_filter_block1d4_h8_ssse3):
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
HORIZx4 0
+ add rsp, 16 * 3
; begin epilog
pop rdi
pop rsi
@@ -932,8 +943,15 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
HORIZx4 1
+ add rsp, 16 * 3
; begin epilog
pop rdi
pop rsi