13 files changed, 167 insertions, 190 deletions
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
index 93d3af301..b41f5661b 100644
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
@@ -576,6 +576,7 @@
     vld1.s16        {q14,q15}, [r0]!
 
     push            {r0-r10}
+    vpush           {d8-d15}
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -636,6 +637,7 @@ iadst_iadst
     IADST8X8_1D
 
 end_vp9_iht8x8_64_add_neon
+    vpop           {d8-d15}
     pop            {r0-r10}
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6086323f6..07ba385d2 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -328,13 +328,6 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
 
-
-static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
-                             TX_SIZE tx_size) {
-  const int eob_max = 16 << (tx_size << 1);
-  return vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 4031bda55..83ee69b7e 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -683,7 +683,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
+prototype int64_t vp9_block_error "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
 specialize vp9_block_error $sse2_x86inc
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 80340b51a..e52b3f759 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -15,6 +15,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -39,10 +40,6 @@
 #include "vp9/decoder/vp9_reader.h"
 #include "vp9/decoder/vp9_thread.h"
 
-static int read_be32(const uint8_t *p) {
-  return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
-}
-
 static int is_compound_reference_allowed(const VP9_COMMON *cm) {
   int i;
   for (i = 1; i < REFS_PER_FRAME; ++i)
@@ -837,7 +834,7 @@ static size_t get_tile(const uint8_t *const data_end,
       vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
                          "Truncated packet or corrupt tile length");
 
-    size = read_be32(*data);
+    size = mem_get_be32(*data);
     *data += 4;
 
     if (size > (size_t)(data_end - *data))
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 06a2f0071..14600e829 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -14,6 +14,7 @@
 
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -61,13 +62,6 @@ static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
                   &inter_mode_encodings[INTER_OFFSET(mode)]);
 }
 
-static INLINE void write_be32(uint8_t *p, int value) {
-  p[0] = value >> 24;
-  p[1] = value >> 16;
-  p[2] = value >> 8;
-  p[3] = value;
-}
-
 static void encode_unsigned_max(struct vp9_write_bit_buffer *wb,
                                 int data, int max) {
   vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
@@ -198,45 +192,40 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
 }
 
 // This function encodes the reference frame
-static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mi = &xd->mi_8x8[0]->mbmi;
-  const int segment_id = mi->segment_id;
-  int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
-                                             SEG_LVL_REF_FRAME);
+static void write_ref_frames(const VP9_COMP *cpi, vp9_writer *w) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  const int segment_id = mbmi->segment_id;
+
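-  // If segment level coding of this signal is disabled...
-  // or the segment allows multiple reference frame options
+  // If the segment fixes the reference frame, nothing is written here;
+  // otherwise the reference frame choice is coded explicitly below.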
-  if (!seg_ref_active) {
+  if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    assert(!is_compound);
+    assert(mbmi->ref_frame[0] ==
+               vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+  } else {
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
-                vp9_get_reference_mode_prob(cm, xd));
+      vp9_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
     } else {
-      assert((mi->ref_frame[1] <= INTRA_FRAME) ==
-             (cm->reference_mode == SINGLE_REFERENCE));
+      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
-    if (mi->ref_frame[1] > INTRA_FRAME) {
-      vp9_write(bc, mi->ref_frame[0] == GOLDEN_FRAME,
+    if (is_compound) {
+      vp9_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
                 vp9_get_pred_prob_comp_ref_p(cm, xd));
     } else {
-      vp9_write(bc, mi->ref_frame[0] != LAST_FRAME,
-                vp9_get_pred_prob_single_ref_p1(cm, xd));
-      if (mi->ref_frame[0] != LAST_FRAME)
-        vp9_write(bc, mi->ref_frame[0] != GOLDEN_FRAME,
-                  vp9_get_pred_prob_single_ref_p2(cm, xd));
+      const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
+      vp9_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
+        vp9_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+      }
     }
-  } else {
-    assert(mi->ref_frame[1] <= INTRA_FRAME);
-    assert(vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ==
-           mi->ref_frame[0]);
   }
-
-  // If using the prediction model we have nothing further to do because
-  // the reference frame is fully coded by the segment.
 }
 
 static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
@@ -302,7 +291,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
     write_intra_mode(bc, mi->uv_mode, cm->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob *mv_ref_p;
-    encode_ref_frame(cpi, bc);
+    write_ref_frames(cpi, bc);
     mv_ref_p = cm->fc.inter_mode_probs[mi->mode_context[ref0]];
 
 #ifdef ENTROPY_STATS
@@ -1003,7 +992,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
       vp9_stop_encode(&residual_bc);
       if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
         // size of this tile
-        write_be32(data_ptr + total_size, residual_bc.pos);
+        mem_put_be32(data_ptr + total_size, residual_bc.pos);
         total_size += 4;
       }
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 57865138d..84b7cef55 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -539,10 +539,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
@@ -1121,10 +1121,10 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       if (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV) {
-        int_mv best_mv[2];
+        MV best_mv[2];
         for (i = 0; i < 1 + has_second_ref(mbmi); ++i)
-          best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int;
-        vp9_update_mv_count(cpi, x, best_mv);
+          best_mv[i] = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+        vp9_update_mv_count(cm, xd, best_mv);
       }
 
       if (cm->interp_filter == SWITCHABLE) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index cc5821c93..13eabe05d 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -311,17 +311,6 @@ static void optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   *a = *l = (final_eob > 0);
 }
 
-static void optimize_init_b(int plane, BLOCK_SIZE bsize,
-                            struct encode_b_args *args) {
-  const MACROBLOCKD *xd = &args->x->e_mbd;
-  const struct macroblockd_plane* const pd = &xd->plane[plane];
-  const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
-
-  vp9_get_entropy_contexts(bsize, tx_size, pd,
-                           args->ctx->ta[plane], args->ctx->tl[plane]);
-}
-
 static INLINE void fdct32x32(int rd_transform,
                              const int16_t *src, int16_t *dst, int src_stride) {
   if (rd_transform)
@@ -474,8 +463,12 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
     if (!x->skip_recode)
       vp9_subtract_plane(x, bsize, plane);
 
-    if (x->optimize && (!x->skip_recode || !x->skip_optimize))
-      optimize_init_b(plane, bsize, &arg);
+    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+      const struct macroblockd_plane* const pd = &xd->plane[plane];
+      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
+      vp9_get_entropy_contexts(bsize, tx_size, pd,
+                               ctx.ta[plane], ctx.tl[plane]);
+    }
 
     vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
                                            &arg);
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index be6abc2a1..507969951 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -231,21 +231,22 @@ void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
-static void inc_mvs(int_mv mv[2], int_mv ref[2], int is_compound,
+static void inc_mvs(const int_mv mv[2], const MV ref[2], int is_compound,
                     nmv_context_counts *counts) {
   int i;
   for (i = 0; i < 1 + is_compound; ++i) {
-    const MV diff = { mv[i].as_mv.row - ref[i].as_mv.row,
-                      mv[i].as_mv.col - ref[i].as_mv.col };
+    const MV diff = { mv[i].as_mv.row - ref[i].row,
+                      mv[i].as_mv.col - ref[i].col };
     vp9_inc_mv(&diff, counts);
   }
 }
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
-  MODE_INFO *mi = x->e_mbd.mi_8x8[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]) {
+  const MODE_INFO *mi = xd->mi_8x8[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const int is_compound = has_second_ref(mbmi);
-  nmv_context_counts *counts = &cpi->common.counts.mv;
+  nmv_context_counts *counts = &cm->counts.mv;
 
   if (mbmi->sb_type < BLOCK_8X8) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi->sb_type];
diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h
index 7f997ff37..f16b2c17c 100644
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -28,7 +28,8 @@ void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context* mvctx, int usehp);
 
-void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]);
+void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd,
+                         const MV best_ref_mv[2]);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 0b92df002..87f20fa1c 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -98,8 +98,15 @@ static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps, 1,
-                         &cpi->fn_ptr[bsize], &ref_mv.as_mv, &tmp_mv->as_mv);
+  if (cpi->sf.search_method == FAST_HEX) {
+    vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, &cpi->fn_ptr[bsize],
+                        1, &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else {
+    vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps,
+                           1, &cpi->fn_ptr[bsize], &ref_mv.as_mv,
+                           &tmp_mv->as_mv);
+  }
+
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index dc6c11816..89aa82140 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -8,23 +8,24 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
-#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 
 #define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 3a423953a..b57b94806 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -36,6 +36,7 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -272,18 +273,12 @@ static void set_block_thresholds(VP9_COMP *cpi) {
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCK *x = &cpi->mb;
-  int qindex, i;
+  int i;
 
   vp9_clear_system_state();
 
-  // Further tests required to see if optimum is different
-  // for key frames, golden frames and arf frames.
-  // if (cpi->common.refresh_golden_frame ||
-  //     cpi->common.refresh_alt_ref_frame)
-  qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
-
   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
-  cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
+  cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
   x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
   x->errorperbit += (x->errorperbit == 0);
@@ -525,15 +520,15 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum = dist_sum << 4;
 }
 
-int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                           intptr_t block_size, int64_t *ssz) {
   int i;
   int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
-    int this_diff = coeff[i] - dqcoeff[i];
-    error += (unsigned)this_diff * this_diff;
-    sqcoeff += (unsigned) coeff[i] * coeff[i];
+    const int diff = coeff[i] - dqcoeff[i];
+    error += (int64_t)diff * diff;  // widen first: diff^2 may exceed INT_MAX
+    sqcoeff += coeff[i] * coeff[i];
   }
 
   *ssz = sqcoeff;
@@ -1037,10 +1032,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      int64_t *bestdistortion,
                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int64_t best_rd = rd_thresh;
-  int rate = 0;
-  int64_t distortion;
+
   struct macroblock_plane *p = &x->plane[0];
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
@@ -1049,8 +1043,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                                             src_stride)];
   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
                                                        dst_stride)];
-  int16_t *src_diff, *coeff;
-
   ENTROPY_CONTEXT ta[2], tempa[2];
   ENTROPY_CONTEXT tl[2], templ[2];
 
@@ -1068,6 +1060,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     int64_t this_rd;
     int ratey = 0;
+    int64_t distortion = 0;
+    int rate = bmode_costs[mode];
 
     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
       continue;
@@ -1079,55 +1073,50 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           continue;
     }
 
-    rate = bmode_costs[mode];
-    distortion = 0;
-
     vpx_memcpy(tempa, ta, sizeof(ta));
     vpx_memcpy(templ, tl, sizeof(tl));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
-        int64_t ssz;
-        const scan_order *so;
-        const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
-        uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
         const int block = ib + idy * 2 + idx;
-        TX_TYPE tx_type;
+        const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+        uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+        int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
+                                                            p->src_diff);
+        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi_8x8[0]->bmi[block].as_mode = mode;
-        src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         vp9_predict_intra_block(xd, block, 1,
                                 TX_4X4, mode,
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8,
-                           src, src_stride,
-                           dst, dst_stride);
-
-        tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
-        so = &vp9_scan_orders[TX_4X4][tx_type];
-
-        if (tx_type != DCT_DCT)
+        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+        if (xd->lossless) {
+          const scan_order *so = &vp9_default_scan_orders[TX_4X4];
+          vp9_fwht4x4(src_diff, coeff, 8);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+                          p->eobs[block]);
+        } else {
+          int64_t unused;
+          const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+          const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
-        else
-          x->fwd_txm4x4(src_diff, coeff, 8);
-
-        vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-
-        ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                             so->scan, so->neighbors);
-        distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                                      16, &ssz) >> 2;
-        if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-          goto next;
-
-        if (tx_type != DCT_DCT)
-          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
-                               dst, pd->dst.stride, tx_type);
-        else
-          xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
-                       16);
+          vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+                               so->scan, so->neighbors);
+          distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+                                        16, &unused) >> 2;
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
+          vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+                         dst, dst_stride, p->eobs[block]);
+        }
       }
     }
 
@@ -3217,25 +3206,69 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // All modes from vp9_mode_order that use this frame as any ref
     static const int ref_frame_mask_all[] = {
-        0x123291, 0x25c444, 0x39b722
+        0x0, 0x123291, 0x25c444, 0x39b722
     };
     // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
     // this frame as their primary ref
     static const int ref_frame_mask_fixedmv[] = {
-        0x121281, 0x24c404, 0x080102
+        0x0, 0x121281, 0x24c404, 0x080102
     };
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
       // Skip modes for missing references
-      mode_skip_mask |= ref_frame_mask_all[ref_frame - LAST_FRAME];
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
     } else if (cpi->sf.reference_masking) {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame - LAST_FRAME];
+          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
           break;
         }
       }
     }
+    // If the segment reference frame feature is enabled, skip all modes
+    // that use this ref frame unless it is the one the segment specifies.
+    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+    }
+  }
+
+  // If the segment skip feature is enabled, add the modes in
+  // inter_non_zero_mode_mask to the set of modes to be skipped.
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+    const int inter_non_zero_mode_mask = 0x1F7F7;
+    mode_skip_mask |= inter_non_zero_mode_mask;
+  }
+
+  // Disable this drop out case if the ref frame
+  // segment level feature is enabled for this segment. This is to
+  // prevent the possibility that we end up unable to pick any mode.
+  if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+    // unless ARNR filtering is enabled in which case we want
+    // an unfiltered alternative. We allow near/nearest as well
+    // because they may result in zero-zero MVs but be cheaper.
+    if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+      const int altref_zero_mask =
+          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      mode_skip_mask |= altref_zero_mask;
+      if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARA);
+      if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask |= (1 << THR_NEARESTA);
+    }
+  }
+
+  // TODO(JBB): This is to make up for the fact that we don't have sad
+  // functions that work when the block size reads outside the umv.  We
+  // should fix this either by making the motion search just work on
+  // a representative block in the boundary ( first ) and then implement a
+  // function that does sads when inside the border..
+  if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
+    const int new_modes_mask =
+        (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
+        (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
+    mode_skip_mask |= new_modes_mask;
   }
 
   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
@@ -3287,11 +3320,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
     comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
-      // Do not allow compound prediction if the segment level reference
-      // frame feature is in use as in this case there can only be one
-      // reference.
-      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-        continue;
       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
           vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
         continue;
@@ -3307,47 +3335,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
     }
 
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
-      continue;
-    // Disable this drop out case if the ref frame
-    // segment level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative. We allow near/nearest as well
-      // because they may result in zero-zero MVs but be cheaper.
-      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != ZEROMV &&
-             !(this_mode == NEARMV &&
-               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == NEARESTMV &&
-               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
-            ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-    // TODO(JBB): This is to make up for the fact that we don't have sad
-    // functions that work when the block size reads outside the umv.  We
-    // should fix this either by making the motion search just work on
-    // a representative block in the boundary ( first ) and then implement a
-    // function that does sads when inside the border..
-    if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == NEWMV) {
-      continue;
-    }
-
     if (ref_frame == INTRA_FRAME) {
       // Disable intra modes other than DC_PRED for blocks with low variance
       // Threshold for intra skipping based on source variance
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 7ae110707..e8179f302 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -199,6 +199,12 @@ static INLINE void add_token_no_extra(TOKENEXTRA **t,
   ++counts[token];
 }
 
+static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
+                             TX_SIZE tx_size) {
+  if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  return 16 << (tx_size << 1);  // i.e. 16 * 4^tx_size
+}
+
 static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;