diff options
author | Scott LaVarnway <slavarnway@google.com> | 2017-11-08 16:06:29 -0800 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2017-11-09 05:02:31 -0800 |
commit | 62ab5e99c1aa13704716ca056b8c806b22544a6b (patch) | |
tree | 4b0d59e2c5bd22fa0b08fc28fb87c249081ffb71 /vp9/encoder | |
parent | acb9460929ac31ec221102c5d2cdb400a92f4e6f (diff) | |
download | libvpx-62ab5e99c1aa13704716ca056b8c806b22544a6b.tar libvpx-62ab5e99c1aa13704716ca056b8c806b22544a6b.tar.gz libvpx-62ab5e99c1aa13704716ca056b8c806b22544a6b.tar.bz2 libvpx-62ab5e99c1aa13704716ca056b8c806b22544a6b.zip |
vpx: [x86] add vp9_block_error_fp_avx2()
SSE2 asm vs AVX2 intrinsics speed gains:
blocksize 16: ~1.00
blocksize 64: ~1.17
blocksize 256: ~1.67
blocksize 1024: ~1.81
Change-Id: I2a86db239cf57e3ff617890ccb2d236aba83ad5e
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/x86/vp9_error_avx2.c | 54 |
1 file changed, 54 insertions, 0 deletions
diff --git a/vp9/encoder/x86/vp9_error_avx2.c b/vp9/encoder/x86/vp9_error_avx2.c index e228bd8b7..be414359a 100644 --- a/vp9/encoder/x86/vp9_error_avx2.c +++ b/vp9/encoder/x86/vp9_error_avx2.c @@ -105,3 +105,57 @@ int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, _mm_storel_epi64((__m128i *)(ssz), ssz_128); return sse; } + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). 
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. + sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} |