summary | refs | log | tree | commit | diff
path: root/vp9
diff options
context:
space:
mode:
author Yunqing Wang <yunqingwang@google.com> 2013-10-01 12:49:25 -0700
committer Yunqing Wang <yunqingwang@google.com> 2013-10-01 12:49:25 -0700
commit df8e1564324f28e6e3bd17413b589a21647a07c2 (patch)
tree f49cd589664958e575873f7a68eca40c2da12602 /vp9
parent 682c27239f711cb0dbdcb329d4070f62c76f1f5f (diff)
download libvpx-df8e1564324f28e6e3bd17413b589a21647a07c2.tar
libvpx-df8e1564324f28e6e3bd17413b589a21647a07c2.tar.gz
libvpx-df8e1564324f28e6e3bd17413b589a21647a07c2.tar.bz2
libvpx-df8e1564324f28e6e3bd17413b589a21647a07c2.zip
Modify HORIZx16 macro in subpixel filter functions
Interleaved the instructions, reduced register dependencies, and prefetched the source data. This improved the decoder speed by 0.6% - 2%.

Change-Id: I568067aa0c629b2e58219326899c82aedf7eccca
Diffstat (limited to 'vp9')
-rw-r--r-- vp9/common/x86/vp9_subpixel_8t_ssse3.asm 75
1 files changed, 34 insertions, 41 deletions
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index bbf9888ca..277902fc9 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -705,60 +705,53 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movsxd rcx, dword ptr arg(4) ;output_height
.loop:
- movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+ prefetcht0 [rsi + 2 * rax -3]
- movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
- punpcklqdq xmm0, xmm3
+ movq xmm0, [rsi - 3] ;load src data
+ movq xmm4, [rsi + 5]
+ movq xmm7, [rsi + 13]
+ punpcklqdq xmm0, xmm4
+ punpcklqdq xmm4, xmm7
movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm0, k0k1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
- movdqa xmm2, xmm1
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
+ pshufb xmm3, [GLOBAL(shuf_t6t7)]
+ pshufb xmm4, [GLOBAL(shuf_t0t1)]
+ pshufb xmm5, [GLOBAL(shuf_t2t3)]
+ pshufb xmm6, [GLOBAL(shuf_t4t5)]
+ pshufb xmm7, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm1, k2k3
+ pmaddubsw xmm2, k4k5
+ pmaddubsw xmm3, k6k7
+ pmaddubsw xmm4, k0k1
+ pmaddubsw xmm5, k2k3
+ pmaddubsw xmm6, k4k5
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
- paddsw xmm0, xmm4
+ paddsw xmm0, xmm3
paddsw xmm0, xmm2
+ paddsw xmm4, xmm5
+ paddsw xmm4, xmm7
+ paddsw xmm4, xmm6
+
paddsw xmm0, krd
+ paddsw xmm4, krd
psraw xmm0, 7
+ psraw xmm4, 7
packuswb xmm0, xmm0
-
-
- movq xmm3, [rsi + 5]
- movq xmm7, [rsi + 13]
- punpcklqdq xmm3, xmm7
-
- movdqa xmm1, xmm3
- pshufb xmm3, [GLOBAL(shuf_t0t1)]
- pmaddubsw xmm3, k0k1
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pmaddubsw xmm1, k2k3
-
- movdqa xmm4, xmm2
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pmaddubsw xmm2, k4k5
-
- pshufb xmm4, [GLOBAL(shuf_t6t7)]
- pmaddubsw xmm4, k6k7
-
- paddsw xmm3, xmm1
- paddsw xmm3, xmm4
- paddsw xmm3, xmm2
- paddsw xmm3, krd
- psraw xmm3, 7
- packuswb xmm3, xmm3
- punpcklqdq xmm0, xmm3
+ packuswb xmm4, xmm4
+ punpcklqdq xmm0, xmm4
%if %1
movdqa xmm1, [rdi]
pavgb xmm0, xmm1