diff options
author | Kyle Siefring <kylesiefring@gmail.com> | 2017-10-07 16:02:02 -0400 |
---|---|---|
committer | Kyle Siefring <kylesiefring@gmail.com> | 2017-10-07 23:37:48 -0400 |
commit | 9ca06bcdd21dc0e3c2baa80c4c7fd0e7e8637e59 (patch) | |
tree | 29c5626341649a9b997c41926fd5c1679d269163 | |
parent | 807248ec81188ce12d7039e9b3c9d770e57fba5b (diff) | |
download | libvpx-9ca06bcdd21dc0e3c2baa80c4c7fd0e7e8637e59.tar libvpx-9ca06bcdd21dc0e3c2baa80c4c7fd0e7e8637e59.tar.gz libvpx-9ca06bcdd21dc0e3c2baa80c4c7fd0e7e8637e59.tar.bz2 libvpx-9ca06bcdd21dc0e3c2baa80c4c7fd0e7e8637e59.zip |
Add AVX2 version of vpx_convolve8_avg.
vpx_convolve8_avg works by first running a normal horizontal filter, then a
vertical filter that averages at the end.
The added vpx_convolve8_avg_avx2 calls pre-existing AVX2 code for the
horizontal step.
vpx_convolve8_avg_vert_avx2 is also added, but it only reuses the existing SSSE3 code (no AVX2-specific vertical kernels yet).
Change-Id: If5160c0c8e778e10de61ee9bf42ee4be5975c983
-rw-r--r-- | test/convolve_test.cc | 25 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 24 |
3 files changed, 50 insertions, 3 deletions
diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 954975c54..f61e09cea 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -603,6 +603,29 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) { UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); } +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); @@ -1178,7 +1201,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 5cb17e167..43c506cd4 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -364,13 +364,13 @@ add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride specialize 
qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon/; diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 6eafe9aaf..c10d626c5 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -539,6 +539,12 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +#define vpx_filter_block1d16_v8_avg_avx2 vpx_filter_block1d16_v8_avg_ssse3 
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -552,6 +558,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, @@ -562,13 +574,25 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t 
src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); FUN_CONV_2D(, avx2); +FUN_CONV_2D(avg_, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 |