author    Jingning Han <jingning@google.com>  2015-02-27 13:35:22 -0800
committer Jingning Han <jingning@google.com>  2015-03-01 10:42:56 -0800
commit    1790d45252efb29903baae3d1776bb24cee76558 (patch)
tree      c78441dc2aef61b80f9b027634c927540e1a4912 /vp9/encoder/x86
parent    f4e0eb17e85eb84c162ac0218cf32a9eccece353 (diff)
Use variance metric for integral projection vector match
This commit replaces SAD with variance as the metric for the integral projection vector match. It improves the search accuracy in the presence of slight lighting changes. The average speed -6 compression performance for the rtc set is improved by 1.7%. No speed changes are observed for the test clips.

Change-Id: I71c1d27e42de2aa429fb3564e6549bba1c7d6d4d
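For reference, the two metrics differ only in how the element-wise difference d[i] = ref[i] - src[i] is aggregated. A minimal scalar sketch of the idea (illustrative only; the function names and plain-C form are not part of the patch):

#include <stdint.h>
#include <stdlib.h>

/* SAD: sum of absolute differences. A constant brightness offset
 * between ref and src adds |offset| * n to the cost. */
static int vector_sad(const int16_t *ref, const int16_t *src, int n) {
  int i, sad = 0;
  for (i = 0; i < n; ++i) sad += abs(ref[i] - src[i]);
  return sad;
}

/* Variance-style metric: sum of squared differences minus the squared
 * sum divided by n, i.e. n times the variance of d. The mean (DC)
 * component of d cancels out, which is what makes the match robust to
 * slight lighting changes. */
static int vector_var(const int16_t *ref, const int16_t *src, int n) {
  int i, sum = 0, sse = 0;
  for (i = 0; i < n; ++i) {
    const int d = ref[i] - src[i];
    sum += d;
    sse += d * d;
  }
  return sse - sum * sum / n;
}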
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r--  vp9/encoder/x86/vp9_avg_intrin_sse2.c  53
1 file changed, 27 insertions, 26 deletions
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 123255a72..482fa3da3 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -90,6 +90,9 @@ void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
s0 = _mm_adds_epu16(s0, t0);
s1 = _mm_adds_epu16(s1, t1);
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+
_mm_store_si128((__m128i *)hbuf, s0);
hbuf += 8;
_mm_store_si128((__m128i *)hbuf, s1);
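The shift added above (and the matching >> 5 in vp9_int_pro_col_sse2 below) divides the accumulated projection sums by 32 before they are used. A scalar view of the same step (a sketch; the loop form is illustrative, and reading 32 as the normalization factor for the block dimension handled by this kernel is an assumption, not stated in the patch):

/* Normalize the 16 projection sums stored by this function so the
 * later variance computation on int16_t data stays in range. */
for (i = 0; i < 16; ++i)
  hbuf[i] >>= 5;  /* divide each accumulated sum by 32 */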
@@ -112,51 +115,49 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
s1 = _mm_srli_si128(s0, 8);
s0 = _mm_adds_epu16(s0, s1);
- return _mm_extract_epi16(s0, 0);
+ return (_mm_extract_epi16(s0, 0)) >> 5;
}
-int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
- const int width) {
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+ const int bwl) {
int idx;
- __m128i zero = _mm_setzero_si128();
- __m128i sum;
+ int width = 4 << bwl;
+ int16_t mean;
__m128i v0 = _mm_loadu_si128((const __m128i *)ref);
__m128i v1 = _mm_load_si128((const __m128i *)src);
__m128i diff = _mm_subs_epi16(v0, v1);
- __m128i sign = _mm_srai_epi16(diff, 15);
-
- diff = _mm_xor_si128(diff, sign);
- sum = _mm_sub_epi16(diff, sign);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
ref += 8;
src += 8;
- v0 = _mm_unpacklo_epi16(sum, zero);
- v1 = _mm_unpackhi_epi16(sum, zero);
- sum = _mm_add_epi32(v0, v1);
-
for (idx = 8; idx < width; idx += 8) {
v0 = _mm_loadu_si128((const __m128i *)ref);
v1 = _mm_load_si128((const __m128i *)src);
diff = _mm_subs_epi16(v0, v1);
- sign = _mm_srai_epi16(diff, 15);
- diff = _mm_xor_si128(diff, sign);
- diff = _mm_sub_epi16(diff, sign);
- v0 = _mm_unpacklo_epi16(diff, zero);
- v1 = _mm_unpackhi_epi16(diff, zero);
-
- sum = _mm_add_epi32(sum, v0);
- sum = _mm_add_epi32(sum, v1);
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
ref += 8;
src += 8;
}
- v0 = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi32(sum, v0);
- v0 = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi32(sum, v0);
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
- return _mm_cvtsi128_si32(sum);
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
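Taken together, the rewritten kernel computes the difference vector once, accumulates its sum and its sum of squares in parallel, reduces both horizontally, and returns the sum of squares minus the squared sum scaled by the vector length. A plain-C sketch of the same computation (an illustration, not the project's fallback; it assumes width = 4 << bwl int16_t elements, so (sum * sum) >> (bwl + 2) equals sum^2 / width, and the function name is hypothetical):

#include <stdint.h>

static int vector_var_scalar(const int16_t *ref, const int16_t *src,
                             const int bwl) {
  const int width = 4 << bwl;  /* number of int16_t elements compared */
  int i, sum = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    sum += diff;         /* what the SSE2 code extracts into 'mean' */
    sse += diff * diff;  /* accumulated via _mm_madd_epi16 above */
  }
  /* width times the variance of the difference vector, in the same
   * fixed-point form as the SSE2 return statement. */
  return sse - ((sum * sum) >> (bwl + 2));
}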