diff options
author | levytamar82 <levytamar82@gmail.com> | 2013-11-21 15:49:29 -0700 |
---|---|---|
committer | levytamar82 <levytamar82@gmail.com> | 2014-02-14 15:08:42 -0700 |
commit | 3068d7d94428d32e0c33a5d3061ba8e362838a41 (patch) | |
tree | 945a47822c6a8db9123b3db4ab6dcfc7de44a9a8 /vp9/common/x86/vp9_asm_stubs.c | |
parent | bb07de7ccea40c145548e8d49752bcccdd08c248 (diff) | |
download | libvpx-3068d7d94428d32e0c33a5d3061ba8e362838a41.tar libvpx-3068d7d94428d32e0c33a5d3061ba8e362838a41.tar.gz libvpx-3068d7d94428d32e0c33a5d3061ba8e362838a41.tar.bz2 libvpx-3068d7d94428d32e0c33a5d3061ba8e362838a41.zip |
SSSE3 convolution optimization
Optimizing all SSSE3 assembly for convolution:
1. vp9_filter_block1d4_h8_sse2
2. vp9_filter_block1d8_h8_sse2
3. vp9_filter_block1d16_h8_sse2
4. vp9_filter_block1d4_v8_sse2
5. vp9_filter_block1d8_v8_sse2
6. vp9_filter_block1d16_v8_sse2
My optimizations include:
- processing 2x8 elements in one 128-bit register instead of processing
8 elements in one 128-bit register.
- removing unnecessary loads.
This optimization gives a user-level gain of 2.4% for 480p input
and a user-level gain of 1.6% for 720p input.
This optimization is done only for 64-bit builds.
Change-Id: Ic07fce2f9360329b4f2d956efda1480ae958766b
Diffstat (limited to 'vp9/common/x86/vp9_asm_stubs.c')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 31 |
1 files changed, 27 insertions, 4 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index a2cf910a4..1b4904c39 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -142,20 +142,29 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ #if HAVE_AVX2 filter8_1dfunction vp9_filter_block1d16_v8_avx2; filter8_1dfunction vp9_filter_block1d16_h8_avx2; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 +#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 +#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 +#endif filter8_1dfunction vp9_filter_block1d16_v2_ssse3; filter8_1dfunction vp9_filter_block1d16_h2_ssse3; filter8_1dfunction vp9_filter_block1d8_v2_ssse3; filter8_1dfunction vp9_filter_block1d8_h2_ssse3; filter8_1dfunction vp9_filter_block1d4_v2_ssse3; filter8_1dfunction vp9_filter_block1d4_h2_ssse3; -#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3 -#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3 #define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3 -#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3 #define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3 #define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3 #define vp9_filter_block1d8_v2_avx2 
vp9_filter_block1d8_v2_ssse3 @@ -183,12 +192,26 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); FUN_CONV_2D(, avx2); #endif #if HAVE_SSSE3 +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; +#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3 +#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3 +#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3 +#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3 +#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3 +#else filter8_1dfunction vp9_filter_block1d16_v8_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#endif filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |