6 files changed, 202 insertions, 208 deletions
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c2df83ed6..66a3956a1 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -305,15 +305,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
 $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
 
 add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_neon_asm=vp9_convolve8_neon;
 
 add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
 
 add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon_asm dspr2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/;
 $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
 
 add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -447,10 +447,10 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
 specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance64x64/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -477,10 +477,10 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_
 specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
 
 add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -653,7 +653,7 @@ add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const
 specialize qw/vp9_sad4x4x8 sse4/;
 
 add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2/;
 
 add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad32x64x4d sse2/;
@@ -668,7 +668,7 @@ add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, co
 specialize qw/vp9_sad16x32x4d sse2/;
 
 add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2/;
 
 add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad16x16x4d sse2/;
@@ -739,7 +739,7 @@ add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int strid
 specialize qw/vp9_fht8x8 sse2 avx2/;
 
 add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2 avx2/;
+specialize qw/vp9_fht16x16 sse2/;
 
 add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
@@ -760,7 +760,7 @@ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int
 specialize qw/vp9_fdct16x16_1 sse2/;
 
 add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2 avx2/;
+specialize qw/vp9_fdct16x16 sse2/;
 
 add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
 specialize qw/vp9_fdct32x32_1 sse2/;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 75416f2aa..537500294 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -697,6 +697,38 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                      x->e_mbd.plane[i].subsampling_y);
 }
 
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
+                                   int64_t *dist, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  INTERP_FILTER filter_ref;
+
+  if (xd->up_available)
+    filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+  else if (xd->left_available)
+    filter_ref = xd->mi[-1]->mbmi.interp_filter;
+  else
+    filter_ref = EIGHTTAP;
+
+  mbmi->sb_type = bsize;
+  mbmi->mode = ZEROMV;
+  mbmi->tx_size = MIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[tx_mode]);
+  mbmi->skip = 1;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = LAST_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  mbmi->mv[0].as_int = 0;
+  mbmi->interp_filter = filter_ref;
+
+  xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
+  x->skip = 1;
+  x->skip_encode = 1;
+
+  *rate = 0;
+  *dist = 0;
+}
+
 static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
                              int mi_row, int mi_col,
                              int *totalrate, int64_t *totaldist,
@@ -2442,17 +2474,21 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  xd->mi[0]->mbmi.sb_type = bsize;
+  mbmi = &xd->mi[0]->mbmi;
+  mbmi->sb_type = bsize;
 
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
-    if (xd->mi[0]->mbmi.segment_id && x->in_static_area)
+    if (mbmi->segment_id && x->in_static_area)
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
   if (!frame_is_intra_only(cm)) {
-    vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col,
-                        rate, dist, bsize);
+    if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+    else
+      vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize);
   } else {
     set_mode_info(&xd->mi[0]->mbmi, bsize, DC_PRED);
   }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index a49fe3df5..d008c63d0 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -597,73 +597,91 @@ void vp9_first_pass(VP9_COMP *cpi) {
       if (cm->current_video_frame > 0) {
         int tmp_err, motion_error;
         int_mv mv, tmp_mv;
+        int raw_motion_error;
+        struct buf_2d unscaled_last_source_buf_2d;
 
         xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
         motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                             &xd->plane[0].pre[0]);
-        // Assume 0,0 motion with no mv overhead.
-        mv.as_int = tmp_mv.as_int = 0;
-
-        // Test last reference frame using the previous best mv as the
-        // starting point (best reference) for the search.
-        first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
-                                 &motion_error);
-        if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-          vp9_clear_system_state();
-          motion_error = (int)(motion_error * error_weight);
-        }
 
-        // If the current best reference mv is not centered on 0,0 then do a 0,0
-        // based search as well.
-        if (best_ref_mv.as_int) {
-          tmp_err = INT_MAX;
-          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
-                                   &tmp_err);
+        // compute the motion error of the zero motion vector using the last
+        // source frame as the reference
+        // skip the further motion search on reconstructed frame
+        // if this error is small
+        unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer
+                                  + recon_yoffset;
+        unscaled_last_source_buf_2d.stride =
+                                          cpi->unscaled_last_source->y_stride;
+        raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                &unscaled_last_source_buf_2d);
+
+        // TODO(pengchong): Replace the hard-coded threshold
+        if (raw_motion_error > 25) {
+          // Assume 0,0 motion with no mv overhead.
+          mv.as_int = tmp_mv.as_int = 0;
+
+          // Test last reference frame using the previous best mv as the
+          // starting point (best reference) for the search.
+          first_pass_motion_search(cpi, x, &best_ref_mv.as_mv, &mv.as_mv,
+                                   &motion_error);
           if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
             vp9_clear_system_state();
-            tmp_err = (int)(tmp_err * error_weight);
+            motion_error = (int)(motion_error * error_weight);
           }
 
-          if (tmp_err < motion_error) {
-            motion_error = tmp_err;
-            mv.as_int = tmp_mv.as_int;
+          // If the current best reference mv is not centered on 0,0
+          // then do a 0,0
+          // based search as well.
+          if (best_ref_mv.as_int) {
+            tmp_err = INT_MAX;
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+                                     &tmp_err);
+            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+              vp9_clear_system_state();
+              tmp_err = (int)(tmp_err * error_weight);
+            }
+
+            if (tmp_err < motion_error) {
+              motion_error = tmp_err;
+              mv.as_int = tmp_mv.as_int;
+            }
           }
-        }
 
-        // Search in an older reference frame.
-        if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
-          // Assume 0,0 motion with no mv overhead.
-          int gf_motion_error;
+          // Search in an older reference frame.
+          if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+            // Assume 0,0 motion with no mv overhead.
+            int gf_motion_error;
 
-          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                 &xd->plane[0].pre[0]);
+            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+            gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                   &xd->plane[0].pre[0]);
 
-          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
-                                   &gf_motion_error);
-          if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
-            vp9_clear_system_state();
-            gf_motion_error = (int)(gf_motion_error * error_weight);
-          }
+            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
+                                     &gf_motion_error);
+            if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+              vp9_clear_system_state();
+              gf_motion_error = (int)(gf_motion_error * error_weight);
+            }
 
-          if (gf_motion_error < motion_error && gf_motion_error < this_error)
-            ++second_ref_count;
-
-          // Reset to last frame as reference buffer.
-          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
-
-          // In accumulating a score for the older reference frame take the
-          // best of the motion predicted score and the intra coded error
-          // (just as will be done for) accumulation of "coded_error" for
-          // the last frame.
-          if (gf_motion_error < this_error)
-            sr_coded_error += gf_motion_error;
-          else
-            sr_coded_error += this_error;
-        } else {
-          sr_coded_error += motion_error;
+            if (gf_motion_error < motion_error && gf_motion_error < this_error)
+              ++second_ref_count;
+
+            // Reset to last frame as reference buffer.
+            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as will be done for) accumulation of "coded_error" for
+            // the last frame.
+            if (gf_motion_error < this_error)
+              sr_coded_error += gf_motion_error;
+            else
+              sr_coded_error += this_error;
+          } else {
+            sr_coded_error += motion_error;
+          }
         }
         // Start by assuming that intra mode is best.
         best_ref_mv.as_int = 0;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f68aa2738..f8acf5b7a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -994,21 +994,13 @@ static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
     return;
   }
 
-  if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
-    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-      model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
-                           &r[tx_size][0], &d[tx_size], &s[tx_size]);
-    choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
-                                  skip, sse, ref_best_rd, bs);
-  } else {
-    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
-      txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
-                       &s[tx_size], &sse[tx_size],
-                       ref_best_rd, 0, bs, tx_size,
-                       cpi->sf.use_fast_coef_costing);
-    choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
-                             skip, txfm_cache, bs);
-  }
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+    txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size], &s[tx_size],
+                     &sse[tx_size], ref_best_rd, 0, bs, tx_size,
+                     cpi->sf.use_fast_coef_costing);
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
+                           skip, txfm_cache, bs);
+
   if (psse)
     *psse = sse[mbmi->tx_size];
 }
@@ -2810,7 +2802,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     *rate2 += vp9_get_switchable_rate(cpi);
 
   if (!is_comp_pred) {
-    if (!x->in_active_map) {
+    if (!x->in_active_map ||
+        vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       if (psse)
         *psse = 0;
       *distortion = 0;
@@ -3127,9 +3120,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // If the segment skip feature is enabled....
   // then do nothing if the current mode is not allowed..
   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
-    const int inter_non_zero_mode_mask = 0x1F7F7;
-    mode_skip_mask |= inter_non_zero_mode_mask;
-    mode_skip_mask &= ~(1 << THR_ZEROMV);
+    mode_skip_mask = ~(1 << THR_ZEROMV);
     inter_mode_mask = (1 << ZEROMV);
   }
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 0a88499af..4a1a13bc0 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -56,8 +56,6 @@ typedef enum {
 
 typedef enum {
   USE_FULL_RD = 0,
-  USE_LARGESTINTRA,
-  USE_LARGESTINTRA_MODELINTER,
   USE_LARGESTALL,
   USE_TX_8X8
 } TX_SIZE_SEARCH_METHOD;
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index ae2f976af..ce1c83297 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -12,141 +12,92 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp9_get8x8var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-extern unsigned int vp9_get4x4var_mmx
-(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_variance4x4_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  vp9_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 4));
+unsigned int vp9_get8x8var_mmx(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse, int *sum);
+
+unsigned int vp9_get4x4var_mmx(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *SSE, int *sum);
+
+unsigned int vp9_variance4x4_mmx(const uint8_t *src, int src_stride,
+                                 const uint8_t *ref, int ref_stride,
+                                 unsigned int *sse) {
+  int sum;
+  vp9_get4x4var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 4);
 }
 
-unsigned int vp9_variance8x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
+unsigned int vp9_variance8x8_mmx(const uint8_t *src, int src_stride,
+                                 const uint8_t *ref, int ref_stride,
+                                 unsigned int *sse) {
+  int sum;
+  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 6);
+}
+
+unsigned int vp9_mse16x16_mmx(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3;
+  int sum0, sum1, sum2, sum3;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
-  *sse = var;
+  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
+                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
 
-  return (var - (((unsigned int)avg * avg) >> 6));
+  *sse = sse0 + sse1 + sse2 + sse3;
+  return *sse;
 }
 
-unsigned int vp9_mse16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3;
 
+unsigned int vp9_variance16x16_mmx(const uint8_t *src, int src_stride,
+                                   const uint8_t *ref, int ref_stride,
+                                   unsigned int *sse) {
+  unsigned int sse0, sse1, sse2, sse3;
+  int sum0, sum1, sum2, sum3, sum;
 
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                    &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
-                    &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
-                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
-                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
+  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+                    ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
+  vp9_get8x8var_mmx(src + 8 * src_stride + 8, src_stride,
+                    ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);
 
-  var = sse0 + sse1 + sse2 + sse3;
-  *sse = var;
-  return var;
+  *sse = sse0 + sse1 + sse2 + sse3;
+  sum = sum0 + sum1 + sum2 + sum3;
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
+unsigned int vp9_variance16x8_mmx(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  unsigned int sse0, sse1;
+  int sum0, sum1, sum;
 
-unsigned int vp9_variance16x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, sse2, sse3, var;
-  int sum0, sum1, sum2, sum3, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                    &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
-                    &sse1, &sum1);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
-                    ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride,
-                    ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-  var = sse0 + sse1 + sse2 + sse3;
-  avg = sum0 + sum1 + sum2 + sum3;
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
-}
+  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
 
-unsigned int vp9_variance16x8_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                    &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
-                    &sse1, &sum1);
-
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 7));
+  *sse = sse0 + sse1;
+  sum = sum0 + sum1;
+  return *sse - (((unsigned int)sum * sum) >> 7);
 }
 
 
-unsigned int vp9_variance8x16_mmx(
-  const unsigned char *src_ptr,
-  int  source_stride,
-  const unsigned char *ref_ptr,
-  int  recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0, sse1, var;
-  int sum0, sum1, avg;
-
-  vp9_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                    &sum0);
-  vp9_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
-                    ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);
+unsigned int vp9_variance8x16_mmx(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  unsigned int sse0, sse1;
+  int sum0, sum1, sum;
 
-  var = sse0 + sse1;
-  avg = sum0 + sum1;
-  *sse = var;
+  vp9_get8x8var_mmx(src, src_stride, ref, ref_stride, &sse0, &sum0);
+  vp9_get8x8var_mmx(src + 8 * src_stride, src_stride,
+                    ref + 8 * ref_stride, ref_stride, &sse1, &sum1);
 
-  return (var - (((unsigned int)avg * avg) >> 7));
+  *sse = sse0 + sse1;
+  sum = sum0 + sum1;
+  return *sse - (((unsigned int)sum * sum) >> 7);
 }