diff options
author | levytamar82 <levytamar82@gmail.com> | 2013-11-21 15:49:29 -0700 |
---|---|---|
committer | levytamar82 <levytamar82@gmail.com> | 2014-01-09 12:27:51 -0700 |
commit | 511d218c60b9b6c1ab9383db746815e907af0359 (patch) | |
tree | a7cbf64477adac2433384293d88d08f27c373fec /vp9/common/x86/vp9_asm_stubs.c | |
parent | a622ed554f7072268e4c8d0b8f26d2e8865c2b3b (diff) | |
download | libvpx-511d218c60b9b6c1ab9383db746815e907af0359.tar libvpx-511d218c60b9b6c1ab9383db746815e907af0359.tar.gz libvpx-511d218c60b9b6c1ab9383db746815e907af0359.tar.bz2 libvpx-511d218c60b9b6c1ab9383db746815e907af0359.zip |
SSSE3 convolution optimization
Optimizing all SSSE3 assembly for convolution:
1. vp9_filter_block1d4_h8_sse2
2. vp9_filter_block1d8_h8_sse2
3. vp9_filter_block1d16_h8_sse2
4. vp9_filter_block1d4_v8_sse2
5. vp9_filter_block1d8_v8_sse2
6. vp9_filter_block1d16_v8_sse2
my optimization include:
-processing 2x8 elements in one 128 bit register instead of processing
8 elements in one 128 bit register.
-removing unecessary loads.
This optimization gives between 2.4% user level gain for 480p input
and 1.6% user level gain for 720p.
This Optimization done only for 64bit.
Change-Id: Icb586dc0c938b56699864fcee6c52fd43b36b969
Diffstat (limited to 'vp9/common/x86/vp9_asm_stubs.c')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 100 |
1 files changed, 93 insertions, 7 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 106e6d426..b4ab4c306 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -45,20 +45,105 @@ typedef void filter8_1dfunction ( const short *filter ); -#if HAVE_SSSE3 -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; +#if (HAVE_SSSE3) filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; +#if (ARCH_X86_64) +filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; + +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Ensure the filter can be compressed to int16_t. */ + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +#else +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -135,6 +220,7 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, w, h); } } +#endif void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, |