8 files changed, 170 insertions, 135 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 3ff86b432..3d26fb2b5 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -158,6 +158,9 @@ typedef struct macroblockd {
   MODE_INFO *left_mi;
   MODE_INFO *above_mi;
 
+  unsigned int max_blocks_wide;
+  unsigned int max_blocks_high;
+
   const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
 
   /* Distance of MB away from frame edges */
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index d6a0ce96d..1cf636c1d 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -276,7 +276,7 @@ $vp9_full_search_sad_sse3=vp9_full_search_sadx3;
 $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
 
 add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+specialize qw/vp9_diamond_search_sad/;
 
 add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2 msa/;
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 32c72194d..02962c8d5 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -883,6 +883,9 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
       const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
           0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+      xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+      xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
       for (row = 0; row < max_blocks_high; row += step)
         for (col = 0; col < max_blocks_wide; col += step)
           predict_and_reconstruct_intra_block(xd, r, mi, plane,
@@ -911,6 +914,9 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
         const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
+        xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide;
+        xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high;
+
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
             eobtotal += reconstruct_inter_block(xd, r, mi, plane, row, col,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 596427c1e..f304de3da 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -353,11 +353,36 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   }
 }
 
+// TODO(slavarnway): Move this decoder version of
+// vp9_get_pred_context_switchable_interp() to vp9_pred_common.h and update the
+// encoder.
+//
+// Returns a context number for the given MB prediction signal
+static int dec_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const int left_type = left_mi ? left_mi->interp_filter : SWITCHABLE_FILTERS;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const int above_type = above_mi ? above_mi->interp_filter
+                             : SWITCHABLE_FILTERS;
+
+  if (left_type == above_type)
+    return left_type;
+  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+    return above_type;
+  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+    return left_type;
+  else
+    return SWITCHABLE_FILTERS;
+}
 
 static INLINE INTERP_FILTER read_switchable_interp_filter(
     VP9_COMMON *const cm, MACROBLOCKD *const xd,
     vpx_reader *r) {
-  const int ctx = vp9_get_pred_context_switchable_interp(xd);
+  const int ctx = dec_get_pred_context_switchable_interp(xd);
   const INTERP_FILTER type =
       (INTERP_FILTER)vpx_read_tree(r, vp9_switchable_interp_tree,
                                    cm->fc->switchable_interp_prob[ctx]);
@@ -373,9 +398,6 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm,
   const BLOCK_SIZE bsize = mi->sb_type;
   int i;
 
-  mi->ref_frame[0] = INTRA_FRAME;
-  mi->ref_frame[1] = NONE;
-
   switch (bsize) {
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
@@ -399,6 +421,13 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm,
   }
 
   mi->uv_mode = read_intra_mode_uv(cm, xd, r, mi->mode);
+
+  // Initialize interp_filter here so we do not have to check for inter block
+  // modes in dec_get_pred_context_switchable_interp()
+  mi->interp_filter = SWITCHABLE_FILTERS;
+
+  mi->ref_frame[0] = INTRA_FRAME;
+  mi->ref_frame[1] = NONE;
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index dcc75b9d2..47dc107fe 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -152,65 +152,73 @@ static int decode_coefs(const MACROBLOCKD *xd,
   return c;
 }
 
-// TODO(slavarnway): Decode version of vp9_set_context.  Modify vp9_set_context
-// after testing is complete, then delete this version.
-static
-void dec_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      TX_SIZE tx_size, int has_eob,
-                      int aoff, int loff) {
-  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
-  ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_size_in_blocks = 1 << tx_size;
-
-  // above
-  if (has_eob && xd->mb_to_right_edge < 0) {
-    int i;
-    const int blocks_wide = pd->n4_w +
-                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_size_in_blocks;
-    if (above_contexts + aoff > blocks_wide)
-      above_contexts = blocks_wide - aoff;
-
-    for (i = 0; i < above_contexts; ++i)
-      a[i] = has_eob;
-    for (i = above_contexts; i < tx_size_in_blocks; ++i)
-      a[i] = 0;
-  } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l,
+                          int x, int y, unsigned int tx_size_in_blocks) {
+  if (xd->max_blocks_wide) {
+    if (tx_size_in_blocks + x > xd->max_blocks_wide)
+      *ctx_shift_a = (tx_size_in_blocks - (xd->max_blocks_wide - x)) * 8;
   }
-
-  // left
-  if (has_eob && xd->mb_to_bottom_edge < 0) {
-    int i;
-    const int blocks_high = pd->n4_h +
-                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_size_in_blocks;
-    if (left_contexts + loff > blocks_high)
-      left_contexts = blocks_high - loff;
-
-    for (i = 0; i < left_contexts; ++i)
-      l[i] = has_eob;
-    for (i = left_contexts; i < tx_size_in_blocks; ++i)
-      l[i] = 0;
-  } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  if (xd->max_blocks_high) {
+    if (tx_size_in_blocks + y > xd->max_blocks_high)
+      *ctx_shift_l = (tx_size_in_blocks - (xd->max_blocks_high - y)) * 8;
   }
 }
 
-int vp9_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, const scan_order *sc,
-                            int x, int y,
-                            TX_SIZE tx_size, vpx_reader *r,
+int vp9_decode_block_tokens(MACROBLOCKD *xd, int plane, const scan_order *sc,
+                            int x, int y, TX_SIZE tx_size, vpx_reader *r,
                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
-  const int ctx = get_entropy_context(tx_size, pd->above_context + x,
-                                               pd->left_context + y);
-  const int eob = decode_coefs(xd, get_plane_type(plane),
-                               pd->dqcoeff, tx_size,
-                               dequant, ctx, sc->scan, sc->neighbors, r);
-  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
+  int eob;
+  ENTROPY_CONTEXT *a = pd->above_context + x;
+  ENTROPY_CONTEXT *l = pd->left_context + y;
+  int ctx;
+  int ctx_shift_a = 0;
+  int ctx_shift_l = 0;
+
+  switch (tx_size) {
+    case TX_4X4:
+      ctx  = a[0] != 0;
+      ctx += l[0] != 0;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      a[0] = l[0] = (eob > 0);
+      break;
+    case TX_8X8:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_8X8);
+      ctx  = !!*(const uint16_t *)a;
+      ctx += !!*(const uint16_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint16_t *)a = ((eob > 0) * 0x0101) >> ctx_shift_a;
+      *(uint16_t *)l = ((eob > 0) * 0x0101) >> ctx_shift_l;
+      break;
+    case TX_16X16:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_16X16);
+      ctx  = !!*(const uint32_t *)a;
+      ctx += !!*(const uint32_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint32_t *)a = ((eob > 0) * 0x01010101) >> ctx_shift_a;
+      *(uint32_t *)l = ((eob > 0) * 0x01010101) >> ctx_shift_l;
+      break;
+    case TX_32X32:
+      get_ctx_shift(xd, &ctx_shift_a, &ctx_shift_l, x, y, 1 << TX_32X32);
+      // NOTE: casting to uint64_t here is safe because the default memory
+      // alignment is at least 8 bytes and the TX_32X32 is aligned on 8 byte
+      // boundaries.
+      ctx  = !!*(const uint64_t *)a;
+      ctx += !!*(const uint64_t *)l;
+      eob = decode_coefs(xd, get_plane_type(plane), pd->dqcoeff, tx_size,
+                         dequant, ctx, sc->scan, sc->neighbors, r);
+      *(uint64_t *)a = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_a;
+      *(uint64_t *)l = ((eob > 0) * 0x0101010101010101ULL) >> ctx_shift_l;
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      eob = 0;
+      break;
+  }
+
   return eob;
 }
-
-
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 9eca2a229..f40d8ab8f 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -183,7 +183,7 @@ int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
 
 static uint8_t *block_start(uint8_t *framebuf, int stride,
                             int mi_row, int mi_col) {
-  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+  return framebuf + (stride * mi_row  << 3) + (mi_col << 3);
 }
 
 static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
@@ -195,24 +195,27 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                                                          PICK_MODE_CONTEXT *ctx,
                                                          int motion_magnitude,
                                                          int is_skin,
-                                                         int *zeromv_filter) {
-  int mv_col, mv_row;
+                                                         int *zeromv_filter,
+                                                         int consec_zeromv) {
   int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
   MV_REFERENCE_FRAME frame;
   MACROBLOCKD *filter_mbd = &mb->e_mbd;
   MODE_INFO *mi = filter_mbd->mi[0];
   MODE_INFO saved_mi;
-  int i, j;
+  int i;
   struct buf_2d saved_dst[MAX_MB_PLANE];
-  struct buf_2d saved_pre[MAX_MB_PLANE][2];  // 2 pre buffers
+  struct buf_2d saved_pre[MAX_MB_PLANE];
 
-  mv_col = ctx->best_sse_mv.as_mv.col;
-  mv_row = ctx->best_sse_mv.as_mv.row;
   frame = ctx->best_reference_frame;
-
   saved_mi = *mi;
 
-  if (is_skin && motion_magnitude > 0)
+  if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4))
+    return COPY_BLOCK;
+
+  // Avoid denoising for small block (unless motion is small).
+  // Small blocks are selected in variance partition (before encoding) and
+  // will typically lie on moving areas.
+  if (motion_magnitude > 16 && bs <= BLOCK_8X8)
     return COPY_BLOCK;
 
   // If the best reference frame uses inter-prediction and there is enough of a
@@ -255,34 +258,31 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
 
   // We will restore these after motion compensation.
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      saved_pre[i][j] = filter_mbd->plane[i].pre[j];
-    }
+    saved_pre[i] = filter_mbd->plane[i].pre[0];
     saved_dst[i] = filter_mbd->plane[i].dst;
   }
 
   // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
   // struct.
-  for (j = 0; j < 2; ++j) {
-    filter_mbd->plane[0].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].y_buffer,
-                    denoiser->running_avg_y[frame].y_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[0].pre[j].stride =
-        denoiser->running_avg_y[frame].y_stride;
-    filter_mbd->plane[1].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].u_buffer,
-                    denoiser->running_avg_y[frame].uv_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[1].pre[j].stride =
-        denoiser->running_avg_y[frame].uv_stride;
-    filter_mbd->plane[2].pre[j].buf =
-        block_start(denoiser->running_avg_y[frame].v_buffer,
-                    denoiser->running_avg_y[frame].uv_stride,
-                    mi_row, mi_col);
-    filter_mbd->plane[2].pre[j].stride =
-        denoiser->running_avg_y[frame].uv_stride;
-  }
+  filter_mbd->plane[0].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].y_buffer,
+                  denoiser->running_avg_y[frame].y_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[0].pre[0].stride =
+      denoiser->running_avg_y[frame].y_stride;
+  filter_mbd->plane[1].pre[0].buf =
+       block_start(denoiser->running_avg_y[frame].u_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[1].pre[0].stride =
+      denoiser->running_avg_y[frame].uv_stride;
+  filter_mbd->plane[2].pre[0].buf =
+      block_start(denoiser->running_avg_y[frame].v_buffer,
+                  denoiser->running_avg_y[frame].uv_stride,
+                  mi_row, mi_col);
+  filter_mbd->plane[2].pre[0].stride =
+      denoiser->running_avg_y[frame].uv_stride;
+
   filter_mbd->plane[0].dst.buf =
       block_start(denoiser->mc_running_avg_y.y_buffer,
                   denoiser->mc_running_avg_y.y_stride,
@@ -299,20 +299,15 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
                   mi_row, mi_col);
   filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride;
 
-  vp9_build_inter_predictors_sby(filter_mbd, mv_row, mv_col, bs);
+  vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
 
   // Restore everything to its original state
   *mi = saved_mi;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (j = 0; j < 2; ++j) {
-      filter_mbd->plane[i].pre[j] = saved_pre[i][j];
-    }
+    filter_mbd->plane[i].pre[0] = saved_pre[i];
     filter_mbd->plane[i].dst = saved_dst[i];
   }
 
-  mv_row = ctx->best_sse_mv.as_mv.row;
-  mv_col = ctx->best_sse_mv.as_mv.col;
-
   return FILTER_BLOCK;
 }
 
@@ -332,6 +327,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
                                           mi_row, mi_col);
   struct buf_2d src = mb->plane[0].src;
   int is_skin = 0;
+  int consec_zeromv = 0;
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
   motion_magnitude = mv_row * mv_row + mv_col * mv_col;
@@ -343,7 +339,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
     // If motion for current block is small/zero, compute consec_zeromv for
     // skin detection (early exit in skin detection is done for large
     // consec_zeromv when current block has small/zero motion).
-    int consec_zeromv = 0;
+    consec_zeromv = 0;
     if (motion_level == 0) {
       CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
       VP9_COMMON * const cm = &cpi->common;
@@ -361,8 +357,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
           consec_zeromv = VPXMIN(cr->consec_zero_mv[bl_index], consec_zeromv);
           // No need to keep checking 8x8 blocks if any of the sub-blocks
           // has small consec_zeromv (since threshold for no_skin based on
-          // zero/small motion in skin detection is high, i.e, > 5).
-          if (consec_zeromv < 5) {
+          // zero/small motion in skin detection is high, i.e, > 4).
+          if (consec_zeromv < 4) {
             i = ymis;
             j = xmis;
           }
@@ -393,7 +389,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb,
                                            mi_row, mi_col, ctx,
                                            motion_magnitude,
                                            is_skin,
-                                           &zeromv_filter);
+                                           &zeromv_filter,
+                                           consec_zeromv);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride,
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index d505629a5..10ea01012 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -91,7 +91,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
   } else {
     if (ne->value > ne->thresh)
       noise_level = kMedium;
-    else if (ne->value > (ne->thresh >> 1))
+    else if (ne->value > ((9 * ne->thresh) >> 4))
       noise_level = kLow;
     else
       noise_level = kLowLow;
diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c
index 63f0ce213..307a1123a 100644
--- a/vp9/encoder/vp9_resize.c
+++ b/vp9/encoder/vp9_resize.c
@@ -446,7 +446,7 @@ static void resize_multistep(const uint8_t *const input,
                              int length,
                              uint8_t *output,
                              int olength,
-                             uint8_t *buf) {
+                             uint8_t *otmp) {
   int steps;
   if (length == olength) {
     memcpy(output, input, sizeof(output[0]) * length);
@@ -457,16 +457,10 @@ static void resize_multistep(const uint8_t *const input,
   if (steps > 0) {
     int s;
     uint8_t *out = NULL;
-    uint8_t *tmpbuf = NULL;
-    uint8_t *otmp, *otmp2;
+    uint8_t *otmp2;
     int filteredlength = length;
-    if (!tmpbuf) {
-      tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length);
-      if (tmpbuf == NULL) return;
-      otmp = tmpbuf;
-    } else {
-      otmp = buf;
-    }
+
+    assert(otmp != NULL);
     otmp2 = otmp + get_down2_length(length, 1);
     for (s = 0; s < steps; ++s) {
       const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -484,8 +478,6 @@ static void resize_multistep(const uint8_t *const input,
     if (filteredlength != olength) {
       interpolate(out, filteredlength, output, olength);
     }
-    if (tmpbuf)
-      free(tmpbuf);
   } else {
     interpolate(input, length, output, olength);
   }
@@ -521,25 +513,29 @@ void vp9_resize_plane(const uint8_t *const input,
   uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
   uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
                                       (width < height ? height : width));
-  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
-  if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error;
+  uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height);
+  uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2);
+  if (intbuf == NULL || tmpbuf == NULL ||
+      arrbuf == NULL || arrbuf2 == NULL)
+    goto Error;
   assert(width > 0);
   assert(height > 0);
   assert(width2 > 0);
   assert(height2 > 0);
   for (i = 0; i < height; ++i)
     resize_multistep(input + in_stride * i, width,
-                        intbuf + width2 * i, width2, tmpbuf);
+                     intbuf + width2 * i, width2, tmpbuf);
   for (i = 0; i < width2; ++i) {
     fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
-    fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+    resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+    fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
   }
 
  Error:
   free(intbuf);
   free(tmpbuf);
   free(arrbuf);
+  free(arrbuf2);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -742,7 +738,7 @@ static void highbd_resize_multistep(const uint16_t *const input,
                                     int length,
                                     uint16_t *output,
                                     int olength,
-                                    uint16_t *buf,
+                                    uint16_t *otmp,
                                     int bd) {
   int steps;
   if (length == olength) {
@@ -754,16 +750,10 @@ static void highbd_resize_multistep(const uint16_t *const input,
   if (steps > 0) {
     int s;
     uint16_t *out = NULL;
-    uint16_t *tmpbuf = NULL;
-    uint16_t *otmp, *otmp2;
+    uint16_t *otmp2;
     int filteredlength = length;
-    if (!tmpbuf) {
-      tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
-      if (tmpbuf == NULL) return;
-      otmp = tmpbuf;
-    } else {
-      otmp = buf;
-    }
+
+    assert(otmp != NULL);
     otmp2 = otmp + get_down2_length(length, 1);
     for (s = 0; s < steps; ++s) {
       const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -781,8 +771,6 @@ static void highbd_resize_multistep(const uint16_t *const input,
     if (filteredlength != olength) {
       highbd_interpolate(out, filteredlength, output, olength, bd);
     }
-    if (tmpbuf)
-      free(tmpbuf);
   } else {
     highbd_interpolate(input, length, output, olength, bd);
   }
@@ -821,24 +809,28 @@ void vp9_highbd_resize_plane(const uint8_t *const input,
   uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
   uint16_t *tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) *
                                         (width < height ? height : width));
-  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
-  if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL) goto Error;
+  uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height);
+  uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
+  if (intbuf == NULL || tmpbuf == NULL ||
+      arrbuf == NULL || arrbuf2 == NULL)
+    goto Error;
   for (i = 0; i < height; ++i) {
     highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
                             intbuf + width2 * i, width2, tmpbuf, bd);
   }
   for (i = 0; i < width2; ++i) {
     highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
-    highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
+    highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf,
                             bd);
     highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
-                           arrbuf + height);
+                           arrbuf2);
   }
 
  Error:
   free(intbuf);
   free(tmpbuf);
   free(arrbuf);
+  free(arrbuf2);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH