Merge "Refactor integral projection based motion estimation"

author: Jingning Han <jingning@google.com> 2015-02-27 09:49:30 -0800
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> 2015-02-27 09:49:30 -0800
commit: 89ee460ee4cebcef775109d8267d5475d03408db (patch)
tree: a741f7d4cfd2e79a1f0693359de047c002b43a37 /vp9/encoder
parent: c4cb8059ff921e5318d018557ac557348e76a64f (diff)
parent: 73a00d3219b474279c169f779d29dd7f5ec5cf24 (diff)
download: libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar
libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar.gz
libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar.bz2
libvpx-89ee460ee4cebcef775109d8267d5475d03408db.zip
2 files changed, 40 insertions, 48 deletions
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ff12bf8a2..a5e9ed616 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -519,13 +519,14 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
 #endif
 
 #if GLOBAL_MOTION
-static int vector_match(int16_t *ref, int16_t *src) {
+static int vector_match(int16_t *ref, int16_t *src, int length) {
   int best_sad = INT_MAX;
   int this_sad;
   int d;
   int center, offset = 0;
-  for (d = 0; d <= 64; d += 16) {
-    this_sad = vp9_vector_sad(&ref[d], src, 64);
+  int bw = length;  // redundant variable, to be changed in the experiments.
+  for (d = 0; d <= bw; d += 16) {
+    this_sad = vp9_vector_sad(&ref[d], src, length);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -536,9 +537,9 @@ static int vector_match(int16_t *ref, int16_t *src) {
   for (d = -8; d <= 8; d += 16) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64)
+    if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -549,9 +550,9 @@ static int vector_match(int16_t *ref, int16_t *src) {
   for (d = -4; d <= 4; d += 8) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64)
+    if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -562,9 +563,9 @@ static int vector_match(int16_t *ref, int16_t *src) {
   for (d = -2; d <= 2; d += 4) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64)
+    if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -575,16 +576,16 @@ static int vector_match(int16_t *ref, int16_t *src) {
   for (d = -1; d <= 1; d += 2) {
     int this_pos = offset + d;
     // check limit
-    if (this_pos < 0 || this_pos > 64)
+    if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    this_sad = vp9_vector_sad(&ref[this_pos], src, length);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
     }
   }
 
-  return (center - 32);
+  return (center - (bw >> 1));
 }
 
 static const MV search_pos[9] = {
@@ -592,16 +593,18 @@ static const MV search_pos[9] = {
   {1, -1}, {1, 0}, {1, 1},
 };
 
-static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x) {
+static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   DECLARE_ALIGNED(16, int16_t, hbuf[128]);
   DECLARE_ALIGNED(16, int16_t, vbuf[128]);
   DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
   DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
   int idx;
-  const int stride = 64;
-  const int search_width = 128;
-  const int search_height = 128;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int search_width = bw << 1;
+  const int search_height = bh << 1;
   const int src_stride = x->plane[0].src.stride;
   const int ref_stride = xd->plane[0].pre[0].stride;
   uint8_t const *ref_buf, *src_buf;
@@ -610,34 +613,33 @@ static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x) {
   MV this_mv;
 
   // Set up prediction 1-D reference set
-  ref_buf = xd->plane[0].pre[0].buf + (-32);
+  ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, 64);
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
-  ref_buf = xd->plane[0].pre[0].buf + (-32) * ref_stride;
+  ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp9_int_pro_col(ref_buf, 64);
+    vbuf[idx] = vp9_int_pro_col(ref_buf, bw);
     ref_buf += ref_stride;
   }
 
   // Set up src 1-D reference set
-  for (idx = 0; idx < stride; idx += 16) {
+  for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, 64);
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
-  for (idx = 0; idx < stride; ++idx) {
-    src_vbuf[idx] = vp9_int_pro_col(src_buf, 64);
+  for (idx = 0; idx < bh; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw);
     src_buf += src_stride;
   }
 
   // Find the best match per 1-D search
-
-  tmp_mv->col = vector_match(hbuf, src_hbuf);
-  tmp_mv->row = vector_match(vbuf, src_vbuf);
+  tmp_mv->col = vector_match(hbuf, src_hbuf, bw);
+  tmp_mv->row = vector_match(vbuf, src_vbuf, bh);
 
   best_sad = INT_MAX;
   this_mv = *tmp_mv;
@@ -648,8 +650,8 @@ static void motion_estimation(VP9_COMP *cpi, MACROBLOCK *x) {
         (search_pos[idx].row + this_mv.row) * ref_stride +
         (search_pos[idx].col + this_mv.col);
 
-    this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src_buf, src_stride,
-                                            ref_buf, ref_stride);
+    this_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+                                      ref_buf, ref_stride);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       tmp_mv->row = search_pos[idx].row + this_mv.row;
@@ -717,7 +719,7 @@ static void choose_partitioning(VP9_COMP *cpi,
     mbmi->interp_filter = BILINEAR;
 
 #if GLOBAL_MOTION
-    motion_estimation(cpi, x);
+    motion_estimation(cpi, x, BLOCK_64X64);
 #endif
 
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 0a105629f..123255a72 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -100,22 +100,14 @@ int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
   __m128i src_line = _mm_load_si128((const __m128i *)ref);
   __m128i s0 = _mm_sad_epu8(src_line, zero);
   __m128i s1;
-  (void) width;  // width = 64
+  int i;
 
-  ref += 16;
-  src_line = _mm_load_si128((const __m128i *)ref);
-  s1 = _mm_sad_epu8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  ref += 16;
-  src_line = _mm_load_si128((const __m128i *)ref);
-  s1 = _mm_sad_epu8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  ref += 16;
-  src_line = _mm_load_si128((const __m128i *)ref);
-  s1 = _mm_sad_epu8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, s1);
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_load_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
 
   s1 = _mm_srli_si128(s0, 8);
   s0 = _mm_adds_epu16(s0, s1);
@@ -136,8 +128,6 @@ int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
   diff = _mm_xor_si128(diff, sign);
   sum = _mm_sub_epi16(diff, sign);
 
-  (void) width;  // width = 64;
-
   ref += 8;
   src += 8;
 
@@ -145,7 +135,7 @@ int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
   v1 = _mm_unpackhi_epi16(sum, zero);
   sum = _mm_add_epi32(v0, v1);
 
-  for (idx = 1; idx < 8; ++idx) {
+  for (idx = 8; idx < width; idx += 8) {
     v0 = _mm_loadu_si128((const __m128i *)ref);
     v1 = _mm_load_si128((const __m128i *)src);
     diff = _mm_subs_epi16(v0, v1);
author	Jingning Han <jingning@google.com>	2015-02-27 09:49:30 -0800
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	2015-02-27 09:49:30 -0800
commit	89ee460ee4cebcef775109d8267d5475d03408db (patch)
tree	a741f7d4cfd2e79a1f0693359de047c002b43a37 /vp9/encoder
parent	c4cb8059ff921e5318d018557ac557348e76a64f (diff)
parent	73a00d3219b474279c169f779d29dd7f5ec5cf24 (diff)
download	libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar.gz libvpx-89ee460ee4cebcef775109d8267d5475d03408db.tar.bz2 libvpx-89ee460ee4cebcef775109d8267d5475d03408db.zip