8 files changed, 121 insertions, 210 deletions
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index b685d813b..c04cc8f05 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 #include "./vp9_rtcd.h"
@@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) {
   state->last_noise = a;
 }
 
-void vp9_plane_add_noise_c(uint8_t *start, char *noise,
-                           char blackclamp[16],
-                           char whiteclamp[16],
-                           char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  // TODO(jbb): why does simd code use both but c doesn't,  normalize and
-  // fix..
-  (void) bothclamp;
-  for (i = 0; i < height; i++) {
-    uint8_t *pos = start + i * pitch;
-    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; j++) {
-      if (pos[j] < blackclamp[0])
-        pos[j] = blackclamp[0];
-
-      if (pos[j] > 255 + whiteclamp[0])
-        pos[j] = 255 + whiteclamp[0];
-
-      pos[j] += ref[j];
-    }
-  }
-}
-
 static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
   // Current mip will be the prev_mip for the next frame.
   MODE_INFO *temp = cm->postproc_state.prev_mip;
@@ -726,8 +701,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
         ppstate->last_noise != noise_level) {
       fillrd(ppstate, 63 - q, noise_level);
     }
-
-    vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
+    vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp,
                         ppstate->whiteclamp, ppstate->bothclamp,
                         ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride);
   }
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1cf636c1d..d7f5a2113 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -70,10 +70,6 @@ add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8
 specialize qw/vp9_post_proc_down_and_across sse2/;
 $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
 
-add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-specialize qw/vp9_plane_add_noise sse2/;
-$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
-
 add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
 specialize qw/vp9_filter_by_weight16x16 sse2 msa/;
 
@@ -169,9 +165,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
     add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
     specialize qw/vp9_highbd_post_proc_down_and_across/;
-
-    add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
-    specialize qw/vp9_highbd_plane_add_noise/;
   }
 
   #
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index ec8bfdb18..430762815 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,68 +624,6 @@ sym(vp9_mbpost_proc_across_ip_xmm):
 %undef flimit4
 
 
-;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
-;                            unsigned char blackclamp[16],
-;                            unsigned char whiteclamp[16],
-;                            unsigned char bothclamp[16],
-;                            unsigned int width, unsigned int height, int pitch)
-global sym(vp9_plane_add_noise_wmt) PRIVATE
-sym(vp9_plane_add_noise_wmt):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-.addnoise_loop:
-    call sym(LIBVPX_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    ; we rely on the fact that the clamping vectors are stored contiguously
-    ; in black/white/both order. Note that we have to reload this here because
-    ; rdx could be trashed by rand()
-    mov     rdx, arg(2) ; blackclamp
-
-
-            mov     rdi, rcx
-            movsxd  rcx, dword arg(5) ;[Width]
-            mov     rsi, arg(0) ;Pos
-            xor         rax,rax
-
-.addnoise_nextset:
-            movdqu      xmm1,[rsi+rax]         ; get the source
-
-            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, [rdx+32] ;bothclamp
-            psubusb     xmm1, [rdx+16] ;whiteclamp
-
-            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-            paddb       xmm1,xmm2              ; add it in
-            movdqu      [rsi+rax],xmm1         ; store the result
-
-            add         rax,16                 ; move to the next line
-
-            cmp         rax, rcx
-            jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd42:
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index ae5ca7dd5..e96c96cde 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -662,12 +662,79 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
+// Check if most of the superblock is skin content, and if so, force split to
+// 32x32, and set x->sb_is_skin for use in mode selection.
+static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res,
+                         int mi_row, int mi_col, int *force_split) {
+  VP9_COMMON * const cm = &cpi->common;
+  // Avoid checking superblocks on/near boundary and avoid low resolutions.
+  // Note superblock may still pick 64X64 if y_sad is very small
+  // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+  if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 &&
+      mi_row + 8 < cm->mi_rows)) {
+    int num_16x16_skin = 0;
+    int num_16x16_nonskin = 0;
+    uint8_t *ysignal = x->plane[0].src.buf;
+    uint8_t *usignal = x->plane[1].src.buf;
+    uint8_t *vsignal = x->plane[2].src.buf;
+    int sp = x->plane[0].src.stride;
+    int spuv = x->plane[1].src.stride;
+    const int block_index = mi_row * cm->mi_cols + mi_col;
+    const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+    const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+    const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+    const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+    // Loop through the 16x16 sub-blocks.
+    int i, j;
+    for (i = 0; i < ymis; i+=2) {
+      for (j = 0; j < xmis; j+=2) {
+        int bl_index = block_index + i * cm->mi_cols + j;
+        int bl_index1 = bl_index + 1;
+        int bl_index2 = bl_index + cm->mi_cols;
+        int bl_index3 = bl_index2 + 1;
+        int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index1],
+                                   VPXMIN(cpi->consec_zero_mv[bl_index2],
+                                   cpi->consec_zero_mv[bl_index3])));
+        int is_skin = vp9_compute_skin_block(ysignal,
+                                             usignal,
+                                             vsignal,
+                                             sp,
+                                             spuv,
+                                             BLOCK_16X16,
+                                             consec_zeromv,
+                                             0);
+        num_16x16_skin += is_skin;
+        num_16x16_nonskin += (1 - is_skin);
+        if (num_16x16_nonskin > 3) {
+          // Exit loop if at least 4 of the 16x16 blocks are not skin.
+          i = ymis;
+          break;
+        }
+        ysignal += 16;
+        usignal += 8;
+        vsignal += 8;
+      }
+      ysignal += (sp << 4) - 64;
+      usignal += (spuv << 3) - 32;
+      vsignal += (spuv << 3) - 32;
+    }
+    if (num_16x16_skin > 12) {
+      *force_split = 1;
+      return 1;
+    }
+  }
+  return 0;
+}
+#endif
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static int choose_partitioning(VP9_COMP *cpi,
-                                const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
+                               const TileInfo *const tile,
+                               MACROBLOCK *x,
+                               int mi_row, int mi_col) {
   VP9_COMMON * const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   int i, j, k, m;
@@ -771,70 +838,13 @@ static int choose_partitioning(VP9_COMP *cpi,
     set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
 
-    // Check if most of the superblock is skin content, and if so, force split
-    // to 32x32, and set x->sb_is_skin for use in mode selection.
-    // Avoid checking superblocks on/near boundary and avoid low resolutions.
-    // Note superblock may still pick 64X64 if y_sad is very small
-    // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
     x->sb_is_skin = 0;
 #if !CONFIG_VP9_HIGHBITDEPTH
-    if (cpi->use_skin_detection && !low_res && (mi_col >= 8 &&
-        mi_col + 8 < cm->mi_cols && mi_row >= 8 && mi_row + 8 < cm->mi_rows)) {
-      int bl_index1, bl_index2, bl_index3;
-      int num_16x16_skin = 0;
-      int num_16x16_nonskin = 0;
-      int is_skin = 0;
-      int consec_zeromv = 0;
-      uint8_t *ysignal = x->plane[0].src.buf;
-      uint8_t *usignal = x->plane[1].src.buf;
-      uint8_t *vsignal = x->plane[2].src.buf;
-      int spuv = x->plane[1].src.stride;
-      const int block_index = mi_row * cm->mi_cols + mi_col;
-      const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-      const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
-      const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
-      const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
-      // Loop through the 16x16 sub-blocks.
-      int j, i;
-      for (i = 0; i < ymis; i+=2) {
-        for (j = 0; j < xmis; j+=2) {
-          int bl_index = block_index + i * cm->mi_cols + j;
-          bl_index1 = bl_index + 1;
-          bl_index2 = bl_index + cm->mi_cols;
-          bl_index3 = bl_index2 + 1;
-          consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
-                                 VPXMIN(cpi->consec_zero_mv[bl_index1],
-                                 VPXMIN(cpi->consec_zero_mv[bl_index2],
-                                 cpi->consec_zero_mv[bl_index3])));
-          is_skin = vp9_compute_skin_block(ysignal,
-                                           usignal,
-                                           vsignal,
-                                           sp,
-                                           spuv,
-                                           BLOCK_16X16,
-                                           consec_zeromv,
-                                           0);
-          num_16x16_skin += is_skin;
-          num_16x16_nonskin += (1 - is_skin);
-          if (num_16x16_nonskin > 3) {
-            // Exit loop if at least 4 of the 16x16 blocks are not skin.
-            i = ymis;
-            j = xmis;
-          }
-          ysignal += 16;
-          usignal += 8;
-          vsignal += 8;
-        }
-        ysignal += (sp << 4) - 64;
-        usignal += (spuv << 3) - 32;
-        vsignal += (spuv << 3) - 32;
-      }
-      if (num_16x16_skin > 12) {
-        x->sb_is_skin = 1;
-        force_split[0] = 1;
-      }
-    }
+    if (cpi->use_skin_detection)
+      x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col,
+                                    &force_split[0]);
 #endif
+
     for (i = 1; i <= 2; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index def9b8c1d..441280cb1 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -20,8 +20,8 @@
 
 /* Return the buffer at the given absolute index and increment the index */
 static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
-                                   unsigned int *idx) {
-  unsigned int index = *idx;
+                                   int *idx) {
+  int index = *idx;
   struct lookahead_entry *buf = ctx->buf + index;
 
   assert(index < ctx->max_sz);
@@ -35,7 +35,7 @@ static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
 void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
   if (ctx) {
     if (ctx->buf) {
-      unsigned int i;
+      int i;
 
       for (i = 0; i < ctx->max_sz; i++)
         vpx_free_frame_buffer(&ctx->buf[i].img);
@@ -221,9 +221,9 @@ struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
 
   if (index >= 0) {
     // Forward peek
-    if (index < (int)ctx->sz) {
+    if (index < ctx->sz) {
       index += ctx->read_idx;
-      if (index >= (int)ctx->max_sz)
+      if (index >= ctx->max_sz)
         index -= ctx->max_sz;
       buf = ctx->buf + index;
     }
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 13820380f..db0fd1cde 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -36,10 +36,10 @@ struct lookahead_entry {
 #define MAX_PRE_FRAMES 1
 
 struct lookahead_ctx {
-  unsigned int max_sz;         /* Absolute size of the queue */
-  unsigned int sz;             /* Number of buffers currently in the queue */
-  unsigned int read_idx;       /* Read index */
-  unsigned int write_idx;      /* Write index */
+  int max_sz;                  /* Absolute size of the queue */
+  int sz;                      /* Number of buffers currently in the queue */
+  int read_idx;                /* Read index */
+  int write_idx;               /* Write index */
   struct lookahead_entry *buf; /* Buffer list */
 };
 
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ebe28b8bf..2ba2750e4 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -143,8 +143,8 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
@@ -211,8 +211,8 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
 
       for (idy = -1; idy <= 1; ++idy) {
         for (idx = -1; idx <= 1; ++idx) {
-          int row = i + idy;
-          int col = j + idx;
+          int row = (int)i + idy;
+          int col = (int)j + idx;
 
           if (row >= 0 && row < (int)block_height &&
               col >= 0 && col < (int)block_width) {
diff --git a/vp9/encoder/x86/vp9_denoiser_sse2.c b/vp9/encoder/x86/vp9_denoiser_sse2.c
index f4a149d9e..883507af3 100644
--- a/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -147,8 +147,9 @@ static int vp9_denoiser_NxM_sse2_small(
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
 
-  for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+  for (r = 0; r < b_height; ++r) {
     memcpy(sig_buffer[r], sig, width);
     memcpy(sig_buffer[r] + width, sig + sig_stride, width);
     memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -188,8 +189,8 @@ static int vp9_denoiser_NxM_sse2_small(
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+        running_avg_y -= avg_y_stride * (b_height << 1);
+        for (r = 0; r < b_height; ++r) {
           acc_diff = vp9_denoiser_adj_16x1_sse2(
               sig_buffer[r], mc_running_buffer[r], running_buffer[r],
               k_0, k_delta, acc_diff);
@@ -235,38 +236,37 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
   const __m128i l32 = _mm_set1_epi8(2);
   // Difference between level 2 and level 1 is 1.
   const __m128i l21 = _mm_set1_epi8(1);
+  const int b_width = (4 << b_width_log2_lookup[bs]);
+  const int b_height = (4 << b_height_log2_lookup[bs]);
+  const int b_width_shift4 = b_width >> 4;
 
-  for (c = 0; c < 4; ++c) {
-    for (r = 0; r < 4; ++r) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
       acc_diff[c][r] = _mm_setzero_si128();
     }
   }
 
-  for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-    for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-      acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
+  for (r = 0; r < b_height; ++r) {
+    for (c = 0; c < b_width_shift4; ++c) {
+      acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
           sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
-          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+          &k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
       // Update pointers for next iteration.
       sig += 16;
       mc_running_avg_y += 16;
       running_avg_y += 16;
     }
 
-    if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-      for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-        sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+      for (c = 0; c < b_width_shift4; ++c) {
+        sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
       }
     }
 
     // Update pointers for next iteration.
-    sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-    mc_running_avg_y = mc_running_avg_y -
-                       16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                       mc_avg_y_stride;
-    running_avg_y = running_avg_y -
-                    16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                    avg_y_stride;
+    sig = sig - b_width + sig_stride;
+    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+    running_avg_y = running_avg_y - b_width + avg_y_stride;
   }
 
   {
@@ -278,33 +278,29 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
       // Only apply the adjustment for max delta up to 3.
       if (delta < 4) {
         const __m128i k_delta = _mm_set1_epi8(delta);
-        sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
-        mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
-        running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
+        sig -= sig_stride * b_height;
+        mc_running_avg_y -= mc_avg_y_stride * b_height;
+        running_avg_y -= avg_y_stride * b_height;
         sum_diff = 0;
-        for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
-          for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-            acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
+        for (r = 0; r < b_height; ++r) {
+          for (c = 0; c < b_width_shift4; ++c) {
+            acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
                 sig, mc_running_avg_y, running_avg_y, k_0,
-                k_delta, acc_diff[c>>4][r>>4]);
+                k_delta, acc_diff[c][r>>4]);
             // Update pointers for next iteration.
             sig += 16;
             mc_running_avg_y += 16;
             running_avg_y += 16;
           }
 
-          if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
-            for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
-              sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
+          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+            for (c = 0; c < b_width_shift4; ++c) {
+              sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
             }
           }
-          sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
-          mc_running_avg_y = mc_running_avg_y -
-                             16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                             mc_avg_y_stride;
-          running_avg_y = running_avg_y -
-                          16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
-                          avg_y_stride;
+          sig = sig - b_width + sig_stride;
+          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+          running_avg_y = running_avg_y - b_width + avg_y_stride;
         }
         if (abs(sum_diff) > sum_diff_thresh) {
           return COPY_BLOCK;