5 files changed, 275 insertions, 9 deletions
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index f8fa7d2e8..8d6cf0667 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,36 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
   return (sum + 8) >> 4;
 }
 
+// Integer projection onto row vectors.
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    for (i = 0; i < height; ++i)
+      hbuf[idx] += ref[i * ref_stride];
+    ++ref;
+  }
+}
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  for (idx = 0; idx < width; ++idx)
+    sum += ref[idx];
+  return sum;
+}
+
+int vp9_vector_sad_c(int16_t const *ref, int16_t const *src,
+                     const int width) {
+  int i;
+  int this_sad = 0;
+  for (i = 0; i < width; ++i)
+    this_sad += abs(ref[i] - src[i]);
+  return this_sad;
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 99bb9300e..39dedf6cb 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -507,6 +507,119 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
   }
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+#define GLOBAL_MOTION 0
+#else
+#define GLOBAL_MOTION 1
+#endif
+
+#if GLOBAL_MOTION
+static int vector_match(int16_t *ref, int16_t *src) {
+  int best_sad = INT_MAX;
+  int this_sad;
+  int d;
+  int center, offset = 0;
+  for (d = 0; d <= 64; d += 16) {
+    this_sad = vp9_vector_sad(&ref[d], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      offset = d;
+    }
+  }
+  center = offset;
+
+  for (d = -8; d <= 8; d += 4) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -4; d <= 4; d += 2) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+  offset = center;
+
+  for (d = -2; d <= 2; d += 1) {
+    int this_pos = offset + d;
+    // check limit
+    if (this_pos < 0 || this_pos > 64 || this_pos == 32)
+      continue;
+    this_sad = vp9_vector_sad(&ref[this_pos], src, 64);
+    if (this_sad < best_sad) {
+      best_sad = this_sad;
+      center = this_pos;
+    }
+  }
+
+  return (center - 32);
+}
+
+static void motion_estimation(MACROBLOCK *x) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  int idx;
+  const int stride = 64;
+  const int search_width = 128;
+  const int search_height = 128;
+  const int src_stride = x->plane[0].src.stride;
+  const int ref_stride = xd->plane[0].pre[0].stride;
+  uint8_t const *ref_buf, *src_buf;
+  MV *tmp_mv = &xd->mi[0].src_mi->mbmi.mv[0].as_mv;
+
+  // Set up prediction 1-D reference set
+  ref_buf = xd->plane[0].pre[0].buf + (-32);
+  for (idx = 0; idx < search_width; idx += 16) {
+    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, 64);
+    ref_buf += 16;
+  }
+
+  ref_buf = xd->plane[0].pre[0].buf + (-32) * ref_stride;
+  for (idx = 0; idx < search_height; ++idx) {
+    vbuf[idx] = vp9_int_pro_col(ref_buf, 64);
+    ref_buf += ref_stride;
+  }
+
+  // Set up src 1-D reference set
+  for (idx = 0; idx < stride; idx += 16) {
+    src_buf = x->plane[0].src.buf + idx;
+    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, 64);
+  }
+
+  src_buf = x->plane[0].src.buf;
+  for (idx = 0; idx < stride; ++idx) {
+    src_vbuf[idx] = vp9_int_pro_col(src_buf, 64);
+    src_buf += src_stride;
+  }
+
+  // Find the best match per 1-D search
+
+  tmp_mv->col = vector_match(hbuf, src_hbuf);
+  tmp_mv->row = vector_match(vbuf, src_vbuf);
+
+  tmp_mv->row *= 8;
+  tmp_mv->col *= 8;
+
+  x->pred_mv[LAST_FRAME] = *tmp_mv;
+}
+#endif
 
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for downs-sampled inputs.
@@ -551,6 +664,11 @@ static void choose_partitioning(VP9_COMP *cpi,
     mbmi->ref_frame[1] = NONE;
     mbmi->sb_type = BLOCK_64X64;
     mbmi->mv[0].as_int = 0;
+
+#if GLOBAL_MOTION
+    motion_estimation(x);
+#endif
+
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
     for (i = 1; i <= 2; ++i) {
@@ -3129,7 +3247,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                  bsize, pc_tree);
   }
 
-  if (bsize == BLOCK_64X64) {
+  if (bsize == BLOCK_64X64 && do_recon) {
     assert(tp_orig < *tp);
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 366aeb193..4c45a7926 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -110,9 +110,9 @@ static void output_stats(FIRSTPASS_STATS *stats,
     FILE *fpfile;
     fpfile = fopen("firstpass.stt", "a");
 
-    fprintf(fpfile, "%12.0f %12.4f %12.0f %12.0f %12.0f %12.4f %12.4f"
-            "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
-            "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+    fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+            "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+            "%12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
             stats->frame,
             stats->weight,
             stats->intra_error,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 46a354700..71cea0e45 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -216,6 +216,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   int64_t dist;
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
+  const int64_t dc_thr = p->quant_thred[0] >> 6;
+  const int64_t ac_thr = p->quant_thred[1] >> 6;
   const uint32_t dc_quant = pd->dequant[0];
   const uint32_t ac_quant = pd->dequant[1];
   unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
@@ -223,12 +225,14 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *var_y = var;
   *sse_y = sse;
 
-  if (sse < dc_quant * dc_quant >> 6)
-    x->skip_txfm[0] = 1;
-  else if (var < ac_quant * ac_quant >> 6)
+  x->skip_txfm[0] = 0;
+  // Check if all ac coefficients can be quantized to zero.
+  if (var < ac_thr || var == 0) {
     x->skip_txfm[0] = 2;
-  else
-    x->skip_txfm[0] = 0;
+    // Check if dc coefficient can be quantized to zero.
+    if (sse - var < dc_thr || sse == var)
+      x->skip_txfm[0] = 1;
+  }
 
   if (cpi->common.tx_mode == TX_MODE_SELECT) {
     if (sse > (var << 2))
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4c3495b05..0a105629f 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -56,3 +56,117 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
   avg = _mm_extract_epi16(s0, 0);
   return (avg + 8) >> 4;
 }
+
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_load_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_load_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_load_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+
+  _mm_store_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_store_si128((__m128i *)hbuf, s1);
+}
+
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  (void) width;  // width = 64
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  ref += 16;
+  src_line = _mm_load_si128((const __m128i *)ref);
+  s1 = _mm_sad_epu8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+
+int vp9_vector_sad_sse2(int16_t const *ref, int16_t const *src,
+                        const int width) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i sum;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+  __m128i v1 = _mm_load_si128((const __m128i *)src);
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sign = _mm_srai_epi16(diff, 15);
+
+  diff = _mm_xor_si128(diff, sign);
+  sum = _mm_sub_epi16(diff, sign);
+
+  (void) width;  // width = 64;
+
+  ref += 8;
+  src += 8;
+
+  v0 = _mm_unpacklo_epi16(sum, zero);
+  v1 = _mm_unpackhi_epi16(sum, zero);
+  sum = _mm_add_epi32(v0, v1);
+
+  for (idx = 1; idx < 8; ++idx) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+    sign = _mm_srai_epi16(diff, 15);
+    diff = _mm_xor_si128(diff, sign);
+    diff = _mm_sub_epi16(diff, sign);
+
+    v0 = _mm_unpacklo_epi16(diff, zero);
+    v1 = _mm_unpackhi_epi16(diff, zero);
+
+    sum = _mm_add_epi32(sum, v0);
+    sum = _mm_add_epi32(sum, v1);
+
+    ref += 8;
+    src += 8;
+  }
+
+  v0 = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi32(sum, v0);
+  v0 = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi32(sum, v0);
+
+  return _mm_cvtsi128_si32(sum);
+}