summaryrefslogtreecommitdiff
path: root/vp9/encoder/x86
diff options
context:
space:
mode:
authorJingning Han <jingning@google.com>2013-06-13 11:07:12 -0700
committerJingning Han <jingning@google.com>2013-06-13 16:18:18 -0700
commit15f50e7b4211fe50c5d0eac675d64afa09e5041f (patch)
treef8505283ea43bb908a0406b033ddc44abd0e5488 /vp9/encoder/x86
parent1a5bb3cc76304bf46a2ca8d1c1ec9ec95e060759 (diff)
downloadlibvpx-15f50e7b4211fe50c5d0eac675d64afa09e5041f.tar
libvpx-15f50e7b4211fe50c5d0eac675d64afa09e5041f.tar.gz
libvpx-15f50e7b4211fe50c5d0eac675d64afa09e5041f.tar.bz2
libvpx-15f50e7b4211fe50c5d0eac675d64afa09e5041f.zip
Enable sse2 version of sad8x4/4x8
Encoding time for the "bus" sequence at CIF resolution drops from 661s to 625s. This commit also enables the unit tests for sad8x4/4x8 in sad_test.cc. Change-Id: If3d10ebb56bda584bdb69bcf056599d580b12cb1
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r--vp9/encoder/x86/vp9_sad_sse2.asm57
1 files changed, 37 insertions, 20 deletions
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm
index ea92377ee..8fb7d4118 100644
--- a/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/vp9/encoder/x86/vp9_sad_sse2.asm
@@ -166,29 +166,46 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \
INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
-; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-INIT_MMX sse
-cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride
+; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1
+cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
 movsxdifnidn src_strideq, src_strided
 movsxdifnidn ref_strideq, ref_strided
- movd m0, [refq]
- movd m1, [refq+ref_strideq]
+ lea src_stride3q, [src_strideq*3] ; 3*src_stride: offset of the 4th row in each 4-row group
+ lea ref_stride3q, [ref_strideq*3] ; 3*ref_stride: same for the reference block
+ mov n_rowsd, %1/4 ; %1 = block height in rows; each loop pass consumes 4 rows
+ pxor m0, m0 ; m0 = running SAD total, starts at zero
+
+.loop:
+ movd m1, [refq] ; 4 bytes of ref row 0
+ movd m2, [refq+ref_strideq] ; 4 bytes of ref row 1
+ movd m3, [refq+ref_strideq*2] ; 4 bytes of ref row 2
+ movd m4, [refq+ref_stride3q] ; 4 bytes of ref row 3
+ punpckldq m1, m2 ; m1 = ref rows 0 and 1 packed side by side (8 bytes)
+ punpckldq m3, m4 ; m3 = ref rows 2 and 3 packed side by side (8 bytes)
 movd m2, [srcq]
- movd m3, [srcq+src_strideq]
- lea refq, [refq+ref_strideq*2]
- lea srcq, [srcq+src_strideq*2]
- movd m4, [refq]
- movd m5, [refq+ref_strideq]
- movd m6, [srcq]
- movd m7, [srcq+src_strideq]
- punpckldq m0, m1
- punpckldq m2, m3
- punpckldq m4, m5
- punpckldq m6, m7
- psadbw m0, m2
- psadbw m4, m6
- paddd m0, m4
+ movd m5, [srcq+src_strideq] ; 4 bytes of src row 1 (row 0 was just loaded into m2)
+ movd m4, [srcq+src_strideq*2] ; 4 bytes of src row 2 (m4 is free again after the ref unpack)
+ movd m6, [srcq+src_stride3q] ; 4 bytes of src row 3
+ punpckldq m2, m5 ; m2 = src rows 0 and 1 packed side by side
+ punpckldq m4, m6 ; m4 = src rows 2 and 3 packed side by side
+ psadbw m1, m2 ; m1 = sum of absolute differences over rows 0-1
+ psadbw m3, m4 ; m3 = sum of absolute differences over rows 2-3
+ lea refq, [refq+ref_strideq*4] ; advance ref by 4 rows (lea leaves flags untouched)
+ paddd m0, m1 ; accumulate rows 0-1
+ lea srcq, [srcq+src_strideq*4] ; advance src by 4 rows
+ paddd m0, m3 ; accumulate rows 2-3; low dword of m0 is the value returned below
+ dec n_rowsd ; one 4-row group done
+ jg .loop ; repeat while groups remain (counter still > 0)
+
 movd eax, m0
 RET
+%endmacro
+
+INIT_MMX sse
+SAD4XN 8 ; sad4x8_sse
+SAD4XN 4 ; sad4x4_sse