summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYunqing Wang <yunqingwang@google.com>2011-03-08 16:25:06 -0500
committerYunqing Wang <yunqingwang@google.com>2011-03-08 16:25:06 -0500
commit419f638910245f5501fcad4eede1efcab0bd22ee (patch)
tree792ed3a0eea79f5264392df4f15d419ec0241be3
parent95adf3df77439740526b73bca7242f7096687d82 (diff)
downloadlibvpx-419f638910245f5501fcad4eede1efcab0bd22ee.tar
libvpx-419f638910245f5501fcad4eede1efcab0bd22ee.tar.gz
libvpx-419f638910245f5501fcad4eede1efcab0bd22ee.tar.bz2
libvpx-419f638910245f5501fcad4eede1efcab0bd22ee.zip
Improve SSE2 half-pixel filter funtions
Rewrote these functions to process 16 pixels once instead of 8. Change-Id: Ic67e80124467a446a3df4cfecfb76a4248602adb
-rw-r--r--vp8/encoder/x86/variance_impl_sse2.asm355
-rw-r--r--vp8/encoder/x86/variance_sse2.c116
-rw-r--r--vp8/encoder/x86/variance_ssse3.c34
3 files changed, 391 insertions, 114 deletions
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 5d1a17d44..c2c30deb2 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -790,7 +790,7 @@ filter_block2d_bil_variance:
ret
-;void vp8_half_horiz_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -800,8 +800,8 @@ filter_block2d_bil_variance:
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -835,7 +835,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2):
add rsi, r8
%endif
-vp8_half_horiz_vert_variance16x_h_1:
+vp8_half_horiz_vert_variance8x_h_1:
movq xmm1, QWORD PTR [rsi] ;
movq xmm2, QWORD PTR [rsi+1] ;
@@ -863,7 +863,7 @@ vp8_half_horiz_vert_variance16x_h_1:
%endif
sub rcx, 1 ;
- jnz vp8_half_horiz_vert_variance16x_h_1 ;
+ jnz vp8_half_horiz_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -910,8 +910,123 @@ vp8_half_horiz_vert_variance16x_h_1:
pop rbp
ret
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
-;void vp8_half_vert_variance16x_h_sse2
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse eaccumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ movdqu xmm3, XMMWORD PTR [rsi+1]
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+ lea rsi, [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+ movdqu xmm1, XMMWORD PTR [rsi] ;
+ movdqu xmm2, XMMWORD PTR [rsi+1] ;
+ pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+ pavgb xmm5, xmm1 ; xmm = vertical average of the above
+
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm4, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+
+ movq xmm3, QWORD PTR [rdi+8]
+ punpcklbw xmm3, xmm0
+ psubw xmm4, xmm3
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm1 ; save xmm1 for use on the next row
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_vert_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -921,8 +1036,8 @@ vp8_half_horiz_vert_variance16x_h_1:
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_vert_variance16x_h_sse2)
-sym(vp8_half_vert_variance16x_h_sse2):
+global sym(vp8_half_vert_variance8x_h_sse2)
+sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -945,7 +1060,7 @@ sym(vp8_half_vert_variance16x_h_sse2):
movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
pxor xmm0, xmm0 ;
-vp8_half_vert_variance16x_h_1:
+vp8_half_vert_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9
@@ -969,7 +1084,7 @@ vp8_half_vert_variance16x_h_1:
%endif
sub rcx, 1 ;
- jnz vp8_half_vert_variance16x_h_1 ;
+ jnz vp8_half_vert_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -1016,8 +1131,115 @@ vp8_half_vert_variance16x_h_1:
pop rbp
ret
+;void vp8_half_vert_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
-;void vp8_half_horiz_variance16x_h_sse2
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse eaccumulator
+ mov rsi, arg(0) ;ref_ptr
+
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ movdqu xmm5, XMMWORD PTR [rsi]
+ lea rsi, [rsi + rax ]
+ pxor xmm0, xmm0
+
+vp8_half_vert_variance16x_h_1:
+ movdqu xmm3, XMMWORD PTR [rsi]
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ movdqa xmm4, xmm5
+ punpcklbw xmm5, xmm0
+ punpckhbw xmm4, xmm0
+
+ movq xmm2, QWORD PTR [rdi]
+ punpcklbw xmm2, xmm0
+ psubw xmm5, xmm2
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+ psubw xmm4, xmm2
+
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm4
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm4, xmm4
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm4
+
+ movdqa xmm5, xmm3
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1
+ jnz vp8_half_vert_variance16x_h_1
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
;(
; unsigned char *ref_ptr,
; int ref_pixels_per_line,
@@ -1027,8 +1249,8 @@ vp8_half_vert_variance16x_h_1:
; int *sum,
; unsigned int *sumsquared
;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
-sym(vp8_half_horiz_variance16x_h_sse2):
+global sym(vp8_half_horiz_variance8x_h_sse2)
+sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
@@ -1050,7 +1272,7 @@ sym(vp8_half_horiz_variance16x_h_sse2):
movsxd rcx, dword ptr arg(4) ;Height ;
pxor xmm0, xmm0 ;
-vp8_half_horiz_variance16x16_1:
+vp8_half_horiz_variance8x_h_1:
movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8
movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9
@@ -1073,7 +1295,7 @@ vp8_half_horiz_variance16x16_1:
add rdi, r9
%endif
sub rcx, 1 ;
- jnz vp8_half_horiz_variance16x16_1 ;
+ jnz vp8_half_horiz_variance8x_h_1 ;
movdq2q mm6, xmm6 ;
movdq2q mm7, xmm7 ;
@@ -1120,6 +1342,109 @@ vp8_half_horiz_variance16x16_1:
pop rbp
ret
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+; unsigned char *ref_ptr,
+; int ref_pixels_per_line,
+; unsigned char *src_ptr,
+; int src_pixels_per_line,
+; unsigned int Height,
+; int *sum,
+; unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ pxor xmm6, xmm6 ; error accumulator
+ pxor xmm7, xmm7 ; sse eaccumulator
+ mov rsi, arg(0) ;ref_ptr ;
+
+ mov rdi, arg(2) ;src_ptr ;
+ movsxd rcx, dword ptr arg(4) ;Height ;
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
+
+ pxor xmm0, xmm0 ;
+
+vp8_half_horiz_variance16x_h_1:
+ movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
+ movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
+
+ pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3)
+ movdqa xmm1, xmm5
+ punpcklbw xmm5, xmm0 ; xmm5 = words of above
+ punpckhbw xmm1, xmm0
+
+ movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
+ punpcklbw xmm3, xmm0 ; xmm3 = words of above
+ movq xmm2, QWORD PTR [rdi+8]
+ punpcklbw xmm2, xmm0
+
+ psubw xmm5, xmm3 ; xmm5 -= xmm3
+ psubw xmm1, xmm2
+ paddw xmm6, xmm5 ; xmm6 += accumulated column differences
+ paddw xmm6, xmm1
+ pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
+ pmaddwd xmm1, xmm1
+ paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
+ paddd xmm7, xmm1
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+
+ sub rcx, 1 ;
+ jnz vp8_half_horiz_variance16x_h_1 ;
+
+ pxor xmm1, xmm1
+ pxor xmm5, xmm5
+
+ punpcklwd xmm0, xmm6
+ punpckhwd xmm1, xmm6
+ psrad xmm0, 16
+ psrad xmm1, 16
+ paddd xmm0, xmm1
+ movdqa xmm1, xmm0
+
+ movdqa xmm6, xmm7
+ punpckldq xmm6, xmm5
+ punpckhdq xmm7, xmm5
+ paddd xmm6, xmm7
+
+ punpckldq xmm0, xmm5
+ punpckhdq xmm1, xmm5
+ paddd xmm0, xmm1
+
+ movdqa xmm7, xmm6
+ movdqa xmm1, xmm0
+
+ psrldq xmm7, 8
+ psrldq xmm1, 8
+
+ paddd xmm6, xmm7
+ paddd xmm0, xmm1
+
+ mov rsi, arg(5) ;[Sum]
+ mov rdi, arg(6) ;[SSE]
+
+ movd [rsi], xmm0
+ movd [rdi], xmm6
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 7cf6a6308..4612a6711 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -81,6 +81,16 @@ void vp8_filter_block2d_bil_var_sse2
int *sum,
unsigned int *sumsquared
);
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_horiz_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@ void vp8_half_horiz_vert_variance16x_h_sse2
int *sum,
unsigned int *sumsquared
);
+void vp8_half_horiz_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_horiz_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@ void vp8_half_horiz_variance16x_h_sse2
int *sum,
unsigned int *sumsquared
);
+void vp8_half_vert_variance8x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
void vp8_half_vert_variance16x_h_sse2
(
const unsigned char *ref_ptr,
@@ -262,21 +292,21 @@ unsigned int vp8_sub_pixel_variance8x8_wmt
if (xoffset == 4 && yoffset == 0)
{
- vp8_half_horiz_variance16x_h_sse2(
+ vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
- vp8_half_vert_variance16x_h_sse2(
+ vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
- vp8_half_horiz_vert_variance16x_h_sse2(
+ vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum, &xxsum);
@@ -317,11 +347,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -329,11 +354,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -341,11 +361,6 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
}
else
{
@@ -356,17 +371,16 @@ unsigned int vp8_sub_pixel_variance16x16_wmt
&xsum0, &xxsum0
);
-
vp8_filter_block2d_bil_var_sse2(
src_ptr + 8, src_pixels_per_line,
dst_ptr + 8, dst_pixels_per_line, 16,
xoffset, yoffset,
&xsum1, &xxsum1
);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
}
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -406,11 +420,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -418,11 +427,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -430,11 +434,6 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 8,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- &xsum1, &xxsum1);
}
else
{
@@ -449,11 +448,10 @@ unsigned int vp8_sub_pixel_variance16x8_wmt
dst_ptr + 8, dst_pixels_per_line, 8,
xoffset, yoffset,
&xsum1, &xxsum1);
+ xsum0 += xsum1;
+ xxsum0 += xxsum1;
}
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
@@ -474,21 +472,21 @@ unsigned int vp8_sub_pixel_variance8x16_wmt
if (xoffset == 4 && yoffset == 0)
{
- vp8_half_horiz_variance16x_h_sse2(
+ vp8_half_horiz_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 0 && yoffset == 4)
{
- vp8_half_vert_variance16x_h_sse2(
+ vp8_half_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
}
else if (xoffset == 4 && yoffset == 4)
{
- vp8_half_horiz_vert_variance16x_h_sse2(
+ vp8_half_horiz_vert_variance8x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum, &xxsum);
@@ -589,21 +587,14 @@ unsigned int vp8_variance_halfpixvar16x16_h_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_horiz_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -616,21 +607,13 @@ unsigned int vp8_variance_halfpixvar16x16_v_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
@@ -643,21 +626,14 @@ unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
int dst_pixels_per_line,
unsigned int *sse)
{
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
+ int xsum0;
+ unsigned int xxsum0;
vp8_half_horiz_vert_variance16x_h_sse2(
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
*sse = xxsum0;
return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
index 750ae8b86..d50ae3ade 100644
--- a/vp8/encoder/x86/variance_ssse3.c
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -87,14 +87,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else if (xoffset == 0 && yoffset == 4)
{
@@ -102,14 +94,6 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else if (xoffset == 4 && yoffset == 4)
{
@@ -117,22 +101,14 @@ unsigned int vp8_sub_pixel_variance16x16_ssse3
src_ptr, src_pixels_per_line,
dst_ptr, dst_pixels_per_line, 16,
&xsum0, &xxsum0);
-
- vp8_half_horiz_vert_variance16x_h_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- &xsum1, &xxsum1);
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
}
else
{
- vp8_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
+ vp8_filter_block2d_bil_var_ssse3(
+ src_ptr, src_pixels_per_line,
+ dst_ptr, dst_pixels_per_line, 16,
+ xoffset, yoffset,
+ &xsum0, &xxsum0);
}
*sse = xxsum0;