diff options
author | Yunqing Wang <yunqingwang@google.com> | 2014-02-03 12:40:18 -0800 |
---|---|---|
committer | Yunqing Wang <yunqingwang@google.com> | 2014-02-04 08:01:55 -0800 |
commit | d1961e6fbfd5642cf2250e17e65d6499d8819e8e (patch) | |
tree | 3b0bba205cdb9791062cc08915183fe1fd0cedd4 /vp9/common/x86/vp9_asm_stubs.c | |
parent | 2b7338aca4c820010a4fbad114b13ba27a6e2556 (diff) | |
download | libvpx-d1961e6fbfd5642cf2250e17e65d6499d8819e8e.tar libvpx-d1961e6fbfd5642cf2250e17e65d6499d8819e8e.tar.gz libvpx-d1961e6fbfd5642cf2250e17e65d6499d8819e8e.tar.bz2 libvpx-d1961e6fbfd5642cf2250e17e65d6499d8819e8e.zip |
Optimize bilinear sub-pixel filters in ssse3
This patch adds SSSE3 optimizations for the bilinear sub-pixel filters.
The real-time encoder was sped up by ~1%.
Change-Id: Ie82e98976f411183cb8c61ab8d2ba0276e55a338
Diffstat (limited to 'vp9/common/x86/vp9_asm_stubs.c')
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 57 |
1 files changed, 28 insertions, 29 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8ab5fb1bc..feb00985a 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -23,8 +23,8 @@ typedef void filter8_1dfunction ( const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ -void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ uint8_t *dst, ptrdiff_t dst_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ @@ -32,7 +32,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ if (step_q4 == 16 && filter[3] != 128) { \ if (filter[0] || filter[1] || filter[2]) { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 16; \ @@ -40,7 +40,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 8; \ @@ -48,7 +48,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \ + vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \ dst, dst_stride, \ h, filter); \ src += 4; \ @@ -57,7 +57,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ } \ } else { \ while (w >= 16) { \ - vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d16_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 
16; \ @@ -65,7 +65,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 16; \ } \ while (w >= 8) { \ - vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d8_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 8; \ @@ -73,7 +73,7 @@ void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ w -= 8; \ } \ while (w >= 4) { \ - vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ + vp9_filter_block1d4_##dir##2_##avg##opt(src, src_stride, \ dst, dst_stride, \ h, filter); \ src += 4; \ @@ -136,18 +136,18 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -filter8_1dfunction vp9_filter_block1d16_v2_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_sse2; -filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_ssse3; +filter8_1dfunction vp9_filter_block1d4_h2_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction 
vp9_filter_block1d4_h2_avg_ssse3; // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -169,11 +169,11 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3, sse2); + ssse3); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -236,11 +236,10 @@ filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, - sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, |