summary | refs | log | tree | commit | diff
path: root/vpx_dsp
diff options
context:
space:
mode:
author: Jian Zhou <zhoujian@google.com> 2015-11-19 18:57:36 -0800
committer: Jian Zhou <zhoujian@google.com> 2015-11-24 16:07:06 -0800
commit: f4621c5c8d733cfd47d0396e7e50677857998770 (patch)
tree: bc583c3f6cecd8c48cdebc8f5ed97391706b2159 /vpx_dsp
parent: f3f6b6fe3e960959489db2568d9942aeca261daa (diff)
download: libvpx-f4621c5c8d733cfd47d0396e7e50677857998770.tar
libvpx-f4621c5c8d733cfd47d0396e7e50677857998770.tar.gz
libvpx-f4621c5c8d733cfd47d0396e7e50677857998770.tar.bz2
libvpx-f4621c5c8d733cfd47d0396e7e50677857998770.zip
Speed up tm_predictor_8x8
Left neighbor read from memory only once. Speed up by ~20% in ./test_intra_pred_speed. Change-Id: Ia1388630df6fed0dce9a6eeded6cb855bbc43505
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/x86/intrapred_sse2.asm | 36
1 files changed, 17 insertions, 19 deletions
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 04b39a583..62c2d2974 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -545,33 +545,31 @@ cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
RET
INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
pxor m1, m1
movd m2, [aboveq-1]
movq m0, [aboveq]
punpcklbw m2, m1
- punpcklbw m0, m1
- pshuflw m2, m2, 0x0
+ punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+ pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word]
DEFINE_ARGS dst, stride, line, left
mov lineq, -4
- punpcklqdq m2, m2
- add leftq, 8
- psubw m0, m2
-.loop:
- movd m2, [leftq+lineq*2]
- movd m3, [leftq+lineq*2+1]
- punpcklbw m2, m1
- punpcklbw m3, m1
- pshuflw m2, m2, 0x0
- pshuflw m3, m3, 0x0
- punpcklqdq m2, m2
- punpcklqdq m3, m3
- paddw m2, m0
+ punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word]
+ psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word]
+ movq m2, [leftq]
+ punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+ pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word]
+ pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word]
+ punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+ punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+ paddw m4, m0
paddw m3, m0
- packuswb m2, m3
- movq [dstq ], m2
- movhps [dstq+strideq], m2
+ packuswb m4, m3
+ movq [dstq ], m4
+ movhps [dstq+strideq], m4
lea dstq, [dstq+strideq*2]
+ psrldq m2, 4
inc lineq
jnz .loop
REP_RET