summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2017-11-10 10:19:52 -0800
committerScott LaVarnway <slavarnway@google.com>2017-11-10 12:24:12 -0800
commit8e6022844fdf3e97cfe10659f386299d716736ab (patch)
tree9e2bd1befda164e20fc8fbfc81dc5131ec188324 /vpx_dsp
parent8c7213bc00b56143b4374b1b8b8e1300331475b5 (diff)
downloadlibvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar
libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar.gz
libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.tar.bz2
libvpx-8e6022844fdf3e97cfe10659f386299d716736ab.zip
vpx: [x86] add vpx_satd_avx2()
SSE2 instrinsic vs AVX2 intrinsic speed gains: blocksize 16: ~1.33 blocksize 64: ~1.51 blocksize 256: ~3.03 blocksize 1024: ~3.71 Change-Id: I79b28cba82d21f9dd765e79881aa16d24fd0cb58
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl4
-rw-r--r--vpx_dsp/x86/avg_intrin_avx2.c24
2 files changed, 26 insertions, 2 deletions
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 8ae847c3d..e117b9d15 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -773,7 +773,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon/;
+ specialize qw/vpx_satd avx2 sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
@@ -782,7 +782,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon msa/;
+ specialize qw/vpx_satd avx2 sse2 neon msa/;
}
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index 4dc759bb5..ff19ea647 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -171,3 +171,27 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
t_coeff += 16;
}
}
+
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 16) {
+ const __m256i src_line = load_tran_low(coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}