diff options
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 53 |
1 file changed, 27 insertions, 26 deletions
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 123255a72..482fa3da3 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -90,6 +90,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, s0 = _mm_adds_epu16(s0, t0); s1 = _mm_adds_epu16(s1, t1); + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + _mm_store_si128((__m128i *)hbuf, s0); hbuf += 8; _mm_store_si128((__m128i *)hbuf, s1); @@ -112,51 +115,49 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { s1 = _mm_srli_si128(s0, 8); s0 = _mm_adds_epu16(s0, s1); - return _mm_extract_epi16(s0, 0); + return (_mm_extract_epi16(s0, 0)) >> 5; } -int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, - const int width) { +int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, + const int bwl) { int idx; - __m128i zero = _mm_setzero_si128(); - __m128i sum; + int width = 4 << bwl; + int16_t mean; __m128i v0 = _mm_loadu_si128((const __m128i *)ref); __m128i v1 = _mm_load_si128((const __m128i *)src); __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sign = _mm_srai_epi16(diff, 15); - - diff = _mm_xor_si128(diff, sign); - sum = _mm_sub_epi16(diff, sign); + __m128i sum = diff; + __m128i sse = _mm_madd_epi16(diff, diff); ref += 8; src += 8; - v0 = _mm_unpacklo_epi16(sum, zero); - v1 = _mm_unpackhi_epi16(sum, zero); - sum = _mm_add_epi32(v0, v1); - for (idx = 8; idx < width; idx += 8) { v0 = _mm_loadu_si128((const __m128i *)ref); v1 = _mm_load_si128((const __m128i *)src); diff = _mm_subs_epi16(v0, v1); - sign = _mm_srai_epi16(diff, 15); - diff = _mm_xor_si128(diff, sign); - diff = _mm_sub_epi16(diff, sign); - v0 = _mm_unpacklo_epi16(diff, zero); - v1 = _mm_unpackhi_epi16(diff, zero); - - sum = _mm_add_epi32(sum, v0); - sum = _mm_add_epi32(sum, v1); + sum = _mm_add_epi16(sum, diff); + v0 = _mm_madd_epi16(diff, diff); + sse = _mm_add_epi32(sse, v0); ref += 8; src += 8; } - v0 = 
_mm_srli_si128(sum, 8); - sum = _mm_add_epi32(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi32(sum, v0); + v0 = _mm_srli_si128(sum, 8); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi64(sum, 32); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi32(sum, 16); + sum = _mm_add_epi16(sum, v0); + + v1 = _mm_srli_si128(sse, 8); + sse = _mm_add_epi32(sse, v1); + v1 = _mm_srli_epi64(sse, 32); + sse = _mm_add_epi32(sse, v1); + + mean = _mm_extract_epi16(sum, 0); - return _mm_cvtsi128_si32(sum); + return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); } |