diff options
author | Jingning Han <jingning@google.com> | 2013-06-13 11:07:12 -0700 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2013-06-14 09:19:28 -0700 |
commit | c43af9a8a3adc7bd3888e746ce7b7bd581c476ae (patch) | |
tree | f8505283ea43bb908a0406b033ddc44abd0e5488 /vp9/encoder | |
parent | 1a5bb3cc76304bf46a2ca8d1c1ec9ec95e060759 (diff) | |
download | libvpx-c43af9a8a3adc7bd3888e746ce7b7bd581c476ae.tar libvpx-c43af9a8a3adc7bd3888e746ce7b7bd581c476ae.tar.gz libvpx-c43af9a8a3adc7bd3888e746ce7b7bd581c476ae.tar.bz2 libvpx-c43af9a8a3adc7bd3888e746ce7b7bd581c476ae.zip |
Enable sse2 version of sad8x4 and sse version of sad4x8
The encoding time for the bus clip at CIF resolution goes from 661s to 625s.
This commit also enables the unit tests for sad8x4/4x8 in sad_test.cc.
Change-Id: If3d10ebb56bda584bdb69bcf056599d580b12cb1
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/x86/vp9_sad_sse2.asm | 57 |
1 files changed, 37 insertions, 20 deletions
diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm index ea92377ee..8fb7d4118 100644 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ b/vp9/encoder/x86/vp9_sad_sse2.asm @@ -166,29 +166,46 @@ cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \ INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 -; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -INIT_MMX sse -cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride +; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1 +cglobal sad4x%1, 4, 7, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided - movd m0, [refq] - movd m1, [refq+ref_strideq] + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 movd m2, [srcq] - movd m3, [srcq+src_strideq] - lea refq, [refq+ref_strideq*2] - lea srcq, [srcq+src_strideq*2] - movd m4, [refq] - movd m5, [refq+ref_strideq] - movd m6, [srcq] - movd m7, [srcq+src_strideq] - punpckldq m0, m1 - punpckldq m2, m3 - punpckldq m4, m5 - punpckldq m6, m7 - psadbw m0, m2 - psadbw m4, m6 - paddd m0, m4 + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m6, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m6 + psadbw m1, m2 + psadbw m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + movd eax, m0 RET +%endmacro + +INIT_MMX sse +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse |