diff options
author | Jingning Han <jingning@google.com> | 2015-02-27 13:35:22 -0800 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2015-03-01 10:42:56 -0800 |
commit | 1790d45252efb29903baae3d1776bb24cee76558 (patch) | |
tree | c78441dc2aef61b80f9b027634c927540e1a4912 /vp9/encoder/x86 | |
parent | f4e0eb17e85eb84c162ac0218cf32a9eccece353 (diff) | |
download | libvpx-1790d45252efb29903baae3d1776bb24cee76558.tar libvpx-1790d45252efb29903baae3d1776bb24cee76558.tar.gz libvpx-1790d45252efb29903baae3d1776bb24cee76558.tar.bz2 libvpx-1790d45252efb29903baae3d1776bb24cee76558.zip |
Use variance metric for integral projection vector match
This commit replaces SAD with variance as the metric for the
integral projection vector match. It improves the search accuracy
in the presence of slight lighting changes. The average speed -6
compression performance for the rtc set is improved by 1.7%. No speed
changes are observed for the test clips.
Change-Id: I71c1d27e42de2aa429fb3564e6549bba1c7d6d4d
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_avg_intrin_sse2.c | 53 |
1 files changed, 27 insertions, 26 deletions
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 123255a72..482fa3da3 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -90,6 +90,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, s0 = _mm_adds_epu16(s0, t0); s1 = _mm_adds_epu16(s1, t1); + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + _mm_store_si128((__m128i *)hbuf, s0); hbuf += 8; _mm_store_si128((__m128i *)hbuf, s1); @@ -112,51 +115,49 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { s1 = _mm_srli_si128(s0, 8); s0 = _mm_adds_epu16(s0, s1); - return _mm_extract_epi16(s0, 0); + return (_mm_extract_epi16(s0, 0)) >> 5; } -int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src, - const int width) { +int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, + const int bwl) { int idx; - __m128i zero = _mm_setzero_si128(); - __m128i sum; + int width = 4 << bwl; + int16_t mean; __m128i v0 = _mm_loadu_si128((const __m128i *)ref); __m128i v1 = _mm_load_si128((const __m128i *)src); __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sign = _mm_srai_epi16(diff, 15); - - diff = _mm_xor_si128(diff, sign); - sum = _mm_sub_epi16(diff, sign); + __m128i sum = diff; + __m128i sse = _mm_madd_epi16(diff, diff); ref += 8; src += 8; - v0 = _mm_unpacklo_epi16(sum, zero); - v1 = _mm_unpackhi_epi16(sum, zero); - sum = _mm_add_epi32(v0, v1); - for (idx = 8; idx < width; idx += 8) { v0 = _mm_loadu_si128((const __m128i *)ref); v1 = _mm_load_si128((const __m128i *)src); diff = _mm_subs_epi16(v0, v1); - sign = _mm_srai_epi16(diff, 15); - diff = _mm_xor_si128(diff, sign); - diff = _mm_sub_epi16(diff, sign); - v0 = _mm_unpacklo_epi16(diff, zero); - v1 = _mm_unpackhi_epi16(diff, zero); - - sum = _mm_add_epi32(sum, v0); - sum = _mm_add_epi32(sum, v1); + sum = _mm_add_epi16(sum, diff); + v0 = _mm_madd_epi16(diff, diff); + sse = _mm_add_epi32(sse, v0); ref += 8; src += 8; } - v0 = 
_mm_srli_si128(sum, 8); - sum = _mm_add_epi32(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi32(sum, v0); + v0 = _mm_srli_si128(sum, 8); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi64(sum, 32); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi32(sum, 16); + sum = _mm_add_epi16(sum, v0); + + v1 = _mm_srli_si128(sse, 8); + sse = _mm_add_epi32(sse, v1); + v1 = _mm_srli_epi64(sse, 32); + sse = _mm_add_epi32(sse, v1); + + mean = _mm_extract_epi16(sum, 0); - return _mm_cvtsi128_si32(sum); + return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); } |