diff options
author | Scott LaVarnway <slavarnway@google.com> | 2017-11-10 10:19:52 -0800 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2017-11-10 12:24:12 -0800 |
commit | 8e6022844fdf3e97cfe10659f386299d716736ab (patch) | |
tree | 9e2bd1befda164e20fc8fbfc81dc5131ec188324 /vpx_dsp | |
parent | 8c7213bc00b56143b4374b1b8b8e1300331475b5 (diff) | |
download | libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar.gz libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar.bz2 libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.zip |
vpx: [x86] add vpx_satd_avx2()
SSE2 instrinsic vs AVX2 intrinsic speed gains:
blocksize 16: ~1.33
blocksize 64: ~1.51
blocksize 256: ~3.03
blocksize 1024: ~3.71
Change-Id: I79b28cba82d21f9dd765e79881aa16d24fd0cb58
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_intrin_avx2.c | 24 |
2 files changed, 26 insertions, 2 deletions
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8ae847c3d..e117b9d15 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -773,7 +773,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon/; + specialize qw/vpx_satd avx2 sse2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; @@ -782,7 +782,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon msa/; + specialize qw/vpx_satd avx2 sse2 neon msa/; } add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index 4dc759bb5..ff19ea647 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -171,3 +171,27 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, t_coeff += 16; } } + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} |