summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorScott LaVarnway <slavarnway@google.com>2018-07-23 08:12:19 -0700
committerScott LaVarnway <slavarnway@google.com>2018-07-23 12:49:50 -0700
commita83d11f9c44343df9585afa6f13545701d79adfb (patch)
treec3ee1ab2e63b6c393bde458f74edeee25c20f8e8 /vpx_dsp
parente858863dda2e242ede57916dae4086a991f618dd (diff)
downloadlibvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar
libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar.gz
libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.tar.bz2
libvpx-a83d11f9c44343df9585afa6f13545701d79adfb.zip
VPX: Add vpx_hadamard_32x32_avx2
BUG=webm:1546
Change-Id: I64629ed83cb7acd0f2ac49b9c31f369d17a1aed2
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl4
-rw-r--r--vpx_dsp/x86/avg_intrin_avx2.c35
2 files changed, 37 insertions, 2 deletions
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 13b83e1f5..2350bc6e8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -783,7 +783,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
@@ -795,7 +795,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon msa/;
diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c
index ff19ea647..7cbb54b51 100644
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -172,6 +172,41 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride,
}
}
+// AVX2 32x32 Hadamard transform: builds the 32x32 transform out of four
+// 16x16 Hadamard transforms plus one cross-quadrant butterfly stage.
+// src_diff/src_stride: 32x32 block of pixel-domain residuals; coeff: output
+// buffer of 1024 transform coefficients.
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ // Stage 1: 16x16 Hadamard on each of the four quadrants (top-left,
+ // top-right, bottom-left, bottom-right), each writing 256 coefficients.
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ vpx_hadamard_16x16_avx2(src_ptr, src_stride, coeff + idx * 256);
+ }
+
+ // Stage 2: combine the four quadrant outputs with a 4-point butterfly,
+ // 16 coefficients (one __m256i of int16) per iteration.
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = load_tran_low(coeff);
+ const __m256i coeff1 = load_tran_low(coeff + 256);
+ const __m256i coeff2 = load_tran_low(coeff + 512);
+ const __m256i coeff3 = load_tran_low(coeff + 768);
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ // Arithmetic >>2 rescales the intermediate sums before the second
+ // add/sub so the result stays within int16 range; presumably chosen to
+ // match the C reference's scaling — confirm against vpx_hadamard_32x32_c.
+ b0 = _mm256_srai_epi16(b0, 2);
+ b1 = _mm256_srai_epi16(b1, 2);
+ b2 = _mm256_srai_epi16(b2, 2);
+ b3 = _mm256_srai_epi16(b3, 2);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ }
+}
+
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
const __m256i one = _mm256_set1_epi16(1);
__m256i accum = _mm256_setzero_si256();