32 files changed, 365 insertions, 386 deletions
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 33aa4e001..0b9fc09ab 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,31 +11,31 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-extern void vp9_idct16x16_256_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_idct16x16_256_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void vp9_idct16x16_10_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_idct16x16_10_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+                                      int16_t *output,
+                                      int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+                                      int16_t *output,
+                                      int16_t *pass1Output,
+                                      int16_t skip_adding,
+                                      uint8_t *dest,
+                                      int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+                                     int16_t *output,
+                                     int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+                                     int16_t *output,
+                                     int16_t *pass1Output,
+                                     int16_t skip_adding,
+                                     uint8_t *dest,
+                                     int dest_stride);
 
 /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
 extern void vp9_push_neon(int64_t *store);
 extern void vp9_pop_neon(int64_t *store);
 
-void vp9_idct16x16_256_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {
   int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
@@ -109,8 +109,8 @@ void vp9_idct16x16_256_add_neon(int16_t *input,
   return;
 }
 
-void vp9_idct16x16_10_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {
   int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
diff --git a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
index 963ef35da..2f326e24c 100644
--- a/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_iht4x4_add_neon|
+    EXPORT  |vp9_iht4x4_16_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -139,7 +139,7 @@
     MEND
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest,
 ;                               int dest_stride, int tx_type)
 ;
 ; r0  int16_t input
@@ -147,7 +147,7 @@
 ; r2  int dest_stride
 ; r3  int tx_type)
 ; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht4x4_add_neon| PROC
+|vp9_iht4x4_16_add_neon| PROC
 
     ; load the inputs into d16-d19
     vld1.s16    {q8,q9}, [r0]!
@@ -175,7 +175,7 @@ iadst_idct
     ; then transform columns
     IADST4x4_1D
 
-    b end_vp9_short_iht4x4_add_neon
+    b end_vp9_iht4x4_16_add_neon
 
 idct_iadst
     ; generate constants
@@ -191,7 +191,7 @@ idct_iadst
     ; then transform columns
     IDCT4x4_1D
 
-    b end_vp9_short_iht4x4_add_neon
+    b end_vp9_iht4x4_16_add_neon
 
 iadst_iadst
     ; generate constants
@@ -206,7 +206,7 @@ iadst_iadst
     ; then transform columns
     IADST4x4_1D
 
-end_vp9_short_iht4x4_add_neon
+end_vp9_iht4x4_16_add_neon
     ; ROUND_POWER_OF_TWO(temp_out[j], 4)
     vrshr.s16   q8, q8, #4
     vrshr.s16   q9, q9, #4
@@ -232,6 +232,6 @@ end_vp9_short_iht4x4_add_neon
     vst1.32     {d26[1]}, [r1], r2
     vst1.32     {d26[0]}, [r1]  ; no post-increment
     bx          lr
-    ENDP  ; |vp9_short_iht4x4_add_neon|
+    ENDP  ; |vp9_iht4x4_16_add_neon|
 
     END
diff --git a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
index bab9cb4a4..93d3af301 100644
--- a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -8,7 +8,7 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
-    EXPORT  |vp9_short_iht8x8_add_neon|
+    EXPORT  |vp9_iht8x8_64_add_neon|
     ARM
     REQUIRE8
     PRESERVE8
@@ -559,7 +559,7 @@
 
 
     AREA     Block, CODE, READONLY ; name this block of code
-;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;void vp9_iht8x8_64_add_neon(const int16_t *input, uint8_t *dest,
 ;                               int dest_stride, int tx_type)
 ;
 ; r0  int16_t input
@@ -567,7 +567,7 @@
 ; r2  int dest_stride
 ; r3  int tx_type)
 ; This function will only handle tx_type of 1,2,3.
-|vp9_short_iht8x8_add_neon| PROC
+|vp9_iht8x8_64_add_neon| PROC
 
     ; load the inputs into d16-d19
     vld1.s16        {q8,q9}, [r0]!
@@ -602,7 +602,7 @@ iadst_idct
     ; then transform columns
     IADST8X8_1D
 
-    b end_vp9_short_iht8x8_add_neon
+    b end_vp9_iht8x8_64_add_neon
 
 idct_iadst
     ; generate IADST constants
@@ -620,7 +620,7 @@ idct_iadst
     ; then transform columns
     IDCT8x8_1D
 
-    b end_vp9_short_iht8x8_add_neon
+    b end_vp9_iht8x8_64_add_neon
 
 iadst_iadst
     ; generate IADST constants
@@ -635,7 +635,7 @@ iadst_iadst
     ; then transform columns
     IADST8X8_1D
 
-end_vp9_short_iht8x8_add_neon
+end_vp9_iht8x8_64_add_neon
     pop            {r0-r10}
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5)
@@ -691,6 +691,6 @@ end_vp9_short_iht8x8_add_neon
     vst1.64         {d6}, [r0], r2
     vst1.64         {d7}, [r0], r2
     bx          lr
-    ENDP  ; |vp9_short_iht8x8_add_neon|
+    ENDP  ; |vp9_iht8x8_64_add_neon|
 
     END
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b1af13891..0538b37ac 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -221,7 +221,7 @@ typedef struct macroblockd {
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
 
   struct subpix_fn_table  subpix;
 
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 3cf508e05..02178b579 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -18,6 +18,8 @@
 #include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_treecoder.h"
 
+#define DIFF_UPDATE_PROB 252
+
 /* Coefficient token alphabet */
 
 #define ZERO_TOKEN              0       /* 0         Extra Bits 0+0 */
@@ -208,7 +210,4 @@ static void get_scan_and_band(const MACROBLOCKD *xd, TX_SIZE tx_size,
   }
 }
 
-
-enum { VP9_COEF_UPDATE_PROB = 252 };
-
 #endif  // VP9_COMMON_VP9_ENTROPY_H_
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index e17679616..56e644460 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -226,7 +226,7 @@ static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
 };
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
-const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
+const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
   -TM_PRED, 4,                      /* 1 = TM_NODE */
   -V_PRED, 6,                       /* 2 = V_NODE */
@@ -237,22 +237,20 @@ const vp9_tree_index vp9_intra_mode_tree[INTRA_MODES * 2 - 2] = {
   -D63_PRED, 16,                    /* 7 = D63_NODE */
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
+struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
-const vp9_tree_index vp9_inter_mode_tree[6] = {
+const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
   -NEARMV, -NEWMV
 };
+struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
-const vp9_tree_index vp9_partition_tree[6] = {
+const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
-
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
 struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
@@ -338,7 +336,8 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
   vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
 }
 
-const vp9_tree_index vp9_switchable_interp_tree[SWITCHABLE_FILTERS*2-2] = {
+const vp9_tree_index vp9_switchable_interp_tree
+                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ccade2752..ab37b75c6 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,7 +15,6 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define TX_SIZE_CONTEXTS 2
-#define MODE_UPDATE_PROB  252
 #define SWITCHABLE_FILTERS 3   // number of switchable filters
 
 // #define MODE_STATS
@@ -38,19 +37,17 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
 
-extern const vp9_tree_index vp9_intra_mode_tree[];
-extern const vp9_tree_index vp9_inter_mode_tree[];
-
+extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
+
+extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
 extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
-// probability models for partition information
-extern const vp9_tree_index vp9_partition_tree[];
+extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
 extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 extern const vp9_tree_index vp9_switchable_interp_tree
-                 [2 * (SWITCHABLE_FILTERS - 1)];
-
+                                [TREE_SIZE(SWITCHABLE_FILTERS)];
 extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
 
 void vp9_entropy_mode_init();
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index baff637e8..e85118118 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -18,14 +18,14 @@
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
 #define COMPANDED_MVREF_THRESH 8
 
-const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
+const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
   -MV_JOINT_ZERO, 2,
   -MV_JOINT_HNZVZ, 4,
   -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
 };
 struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
 
-const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
+const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
   -MV_CLASS_0, 2,
   -MV_CLASS_1, 4,
   6, 8,
@@ -39,12 +39,12 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = {
 };
 struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 
-const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2] = {
+const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
   -0, -1,
 };
 struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
   -0, 2,
   -1, 4,
   -2, -3
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 3b782ab0a..c42653d42 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -43,9 +43,6 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
   return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
 }
 
-extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
 /* Symbols for coding magnitude class of nonzero components */
 #define MV_CLASSES     11
 typedef enum {
@@ -62,9 +59,6 @@ typedef enum {
   MV_CLASS_10 = 10,    /* (1024,2048] integer pel */
 } MV_CLASS_TYPE;
 
-extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
 #define CLASS0_BITS    1  /* bits at integer precision for class 0 */
 #define CLASS0_SIZE    (1 << CLASS0_BITS)
 #define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
@@ -77,10 +71,16 @@ extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
 #define MV_UPP   ((1 << MV_IN_USE_BITS) - 1)
 #define MV_LOW   (-(1 << MV_IN_USE_BITS))
 
-extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2];
+extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
+extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
+
+extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
+extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
+
+extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
 extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
 
-extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2];
+extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
 extern struct vp9_token vp9_mv_fp_encodings[4];
 
 typedef struct {
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 78d10877a..52b039d99 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,13 +18,13 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
    0.5 shifts per pixel. */
   int i;
   int16_t output[16];
   int a1, b1, c1, d1, e1;
-  int16_t *ip = input;
+  const int16_t *ip = input;
   int16_t *op = output;
 
   for (i = 0; i < 4; i++) {
@@ -60,21 +60,21 @@ void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
     c1 = e1 - c1;
     a1 -= b1;
     d1 += c1;
-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);
-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);
-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);
-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);
+    dest[stride * 0] = clip_pixel(dest[stride * 0] + a1);
+    dest[stride * 1] = clip_pixel(dest[stride * 1] + b1);
+    dest[stride * 2] = clip_pixel(dest[stride * 2] + c1);
+    dest[stride * 3] = clip_pixel(dest[stride * 3] + d1);
 
     ip++;
     dest++;
   }
 }
 
-void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int a1, e1;
   int16_t tmp[4];
-  int16_t *ip = in;
+  const int16_t *ip = in;
   int16_t *op = tmp;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -96,7 +96,7 @@ void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct4_1d(int16_t *input, int16_t *output) {
+static void idct4_1d(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -116,7 +116,7 @@ static void idct4_1d(int16_t *input, int16_t *output) {
   output[3] = step[0] - step[3];
 }
 
-void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
   int i, j;
@@ -135,12 +135,12 @@ void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
       temp_in[j] = out[j * 4 + i];
     idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -156,7 +156,7 @@ void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
-static void idct8_1d(int16_t *input, int16_t *output) {
+static void idct8_1d(const int16_t *input, int16_t *output) {
   int16_t step1[8], step2[8];
   int temp1, temp2;
   // stage 1
@@ -201,7 +201,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
   output[7] = step1[0] - step1[7];
 }
 
-void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8];
   int16_t *outptr = out;
   int i, j;
@@ -220,12 +220,12 @@ void vp9_idct8x8_64_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -234,11 +234,11 @@ void vp9_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
-static void iadst4_1d(int16_t *input, int16_t *output) {
+static void iadst4_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[0];
@@ -280,8 +280,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {
+void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   const transform_2d IHT_4[] = {
     { idct4_1d, idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, idct4_1d  },   // ADST_DCT = 1
@@ -307,11 +307,11 @@ void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * stride + i]);
   }
 }
-static void iadst8_1d(int16_t *input, int16_t *output) {
+static void iadst8_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -395,8 +395,8 @@ static const transform_2d IHT_8[] = {
   { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                            int tx_type) {
+void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+                         int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -416,12 +416,12 @@ void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
       temp_in[j] = out[j * 8 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);  }
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
+  }
 }
 
-void vp9_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
-                                int dest_stride) {
+void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[8 * 8] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -441,12 +441,12 @@ void vp9_idct8x8_10_add_c(int16_t *input, uint8_t *dest,
       temp_in[j] = out[j * 8 + i];
     idct8_1d(temp_in, temp_out);
     for (j = 0; j < 8; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+                                  + dest[j * stride + i]);
   }
 }
 
-static void idct16_1d(int16_t *input, int16_t *output) {
+static void idct16_1d(const int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
 
@@ -611,7 +611,7 @@ static void idct16_1d(int16_t *input, int16_t *output) {
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
   int i, j;
@@ -630,12 +630,12 @@ void vp9_idct16x16_256_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void iadst16_1d(int16_t *input, int16_t *output) {
+static void iadst16_1d(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -813,8 +813,8 @@ static const transform_2d IHT_16[] = {
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
-                              int tx_type) {
+void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -834,12 +834,11 @@ void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);  }
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);  }
 }
 
-void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
+void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[16 * 16] = { 0 };
   int16_t *outptr = out;
   int i, j;
@@ -859,13 +858,12 @@ void vp9_idct16x16_10_add_c(int16_t *input, uint8_t *dest,
       temp_in[j] = out[j*16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
+void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
@@ -874,11 +872,11 @@ void vp9_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
   for (j = 0; j < 16; ++j) {
     for (i = 0; i < 16; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
-static void idct32_1d(int16_t *input, int16_t *output) {
+static void idct32_1d(const int16_t *input, int16_t *output) {
   int16_t step1[32], step2[32];
   int temp1, temp2;
 
@@ -1245,7 +1243,7 @@ static void idct32_1d(int16_t *input, int16_t *output) {
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_idct32x32_1024_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
   int i, j;
@@ -1277,13 +1275,12 @@ void vp9_idct32x32_1024_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
+      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * stride + i]);
   }
 }
 
-void vp9_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
-                                 int dest_stride) {
+void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
   int i, j;
   int a1;
 
@@ -1294,12 +1291,12 @@ void vp9_idct32x32_1_add_c(int16_t *input, uint8_t *dest,
   for (j = 0; j < 32; ++j) {
     for (i = 0; i < 32; ++i)
       dest[i] = clip_pixel(dest[i] + a1);
-    dest += dest_stride;
+    dest += stride;
   }
 }
 
 // idct
-void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob > 1)
     vp9_idct4x4_16_add(input, dest, stride);
   else
@@ -1307,14 +1304,14 @@ void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {
 }
 
 
-void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   if (eob > 1)
     vp9_iwht4x4_16_add(input, dest, stride);
   else
     vp9_iwht4x4_1_add(input, dest, stride);
 }
 
-void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
   // If dc is 1, then input[0] is the reconstructed value, do not need
   // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
 
@@ -1333,7 +1330,8 @@ void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob) {
   }
 }
 
-void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob) {
@@ -1347,7 +1345,8 @@ void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob) {
   }
 }
 
-void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob) {
   if (eob) {
     if (eob == 1)
       vp9_idct32x32_1_add(input, dest, stride);
@@ -1357,32 +1356,32 @@ void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob) {
 }
 
 // iht
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
-                   int eob) {
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
   if (tx_type == DCT_DCT)
     vp9_idct4x4_add(input, dest, stride, eob);
   else
-    vp9_short_iht4x4_add(input, dest, stride, tx_type);
+    vp9_iht4x4_16_add(input, dest, stride, tx_type);
 }
 
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob) {
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct8x8_add(input, dest, stride, eob);
   } else {
     if (eob > 0) {
-      vp9_short_iht8x8_add(input, dest, stride, tx_type);
+      vp9_iht8x8_64_add(input, dest, stride, tx_type);
     }
   }
 }
 
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                         int stride, int eob) {
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob) {
   if (tx_type == DCT_DCT) {
     vp9_idct16x16_add(input, dest, stride, eob);
   } else {
     if (eob > 0) {
-      vp9_short_iht16x16_add(input, dest, stride, tx_type);
+      vp9_iht16x16_256_add(input, dest, stride, tx_type);
     }
   }
 }
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index e85404e7a..2b3f35f0a 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -81,27 +81,27 @@ static INLINE int dct_const_round_shift(int input) {
   return rv;
 }
 
-typedef void (*transform_1d)(int16_t*, int16_t*);
+typedef void (*transform_1d)(const int16_t*, int16_t*);
 
 typedef struct {
   transform_1d cols, rows;  // vertical and horizontal
 } transform_2d;
 
-
-void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct32x32_add(int16_t *input, uint8_t *dest, int stride, int eob);
-
-void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                 int stride, int eob);
-
-void vp9_iht_add_8x8(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                     int stride, int eob);
-
-void vp9_iht_add_16x16(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
-                       int stride, int eob);
+void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+
+void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob);
+void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+                       int eob);
+
+void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                    int stride, int eob);
+void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+                      int stride, int eob);
 
 
 #endif  // VP9_COMMON_VP9_IDCT_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 31227ad54..526be87df 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -267,51 +267,51 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
 #
 # dct
 #
-prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct4x4_1_add sse2 neon
 
-prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct4x4_16_add sse2 neon
 
-prototype void vp9_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct8x8_1_add sse2 neon
 
-prototype void vp9_idct8x8_64_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct8x8_64_add sse2 neon
 
-prototype void vp9_idct8x8_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct8x8_10_add sse2 neon
 
-prototype void vp9_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct16x16_1_add sse2 neon
 
-prototype void vp9_idct16x16_256_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct16x16_256_add sse2 neon
 
-prototype void vp9_idct16x16_10_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct16x16_10_add sse2 neon
 
-prototype void vp9_idct32x32_1024_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct32x32_1024_add sse2 neon
 
-prototype void vp9_idct32x32_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct32x32_1_add sse2
 
-prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht4x4_add sse2 neon
+prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht4x4_16_add sse2 neon
 
-prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2 neon
+prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_iht8x8_64_add sse2 neon
 
-prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16_add sse2
+prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_iht16x16_256_add sse2
 
 # dct and add
 
-prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_iwht4x4_1_add
 
-prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"
+prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_iwht4x4_16_add
 
 #
@@ -701,9 +701,6 @@ specialize vp9_short_fdct8x8 sse2
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4 sse2
 
-prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x4 sse2
-
 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct32x32 sse2
 
@@ -716,9 +713,6 @@ specialize vp9_short_fdct16x16 sse2
 prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4
 
-prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4
-
 #
 # Motion search
 #
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index b8d161d19..254a431a3 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -24,8 +24,8 @@ void vpx_reset_mmx_state(void);
 #define vp9_clear_system_state()
 #endif
 
-#ifdef _MSC_VER
-// round is not defined in MSVC
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// round is not defined in MSVC before VS2013.
 static int round(double x) {
   if (x < 0)
     return (int)ceil(x - 0.5);
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 24e6fa295..4ba171f46 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -21,6 +21,8 @@ typedef uint8_t vp9_prob;
 
 typedef int8_t vp9_tree_index;
 
+#define TREE_SIZE(leaf_count) (2 * (leaf_count) - 2)
+
 #define vp9_complement(x) (255 - x)
 
 /* We build coding trees compactly in arrays.
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index a2b0e8c73..cfec36b42 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,7 +15,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -26,10 +26,10 @@ void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   __m128i input0, input1, input2, input3;
 
   // Rows
-  input0 = _mm_loadl_epi64((__m128i *)input);
-  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
-  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
-  input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+  input0 = _mm_loadl_epi64((const __m128i *)input);
+  input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
+  input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
+  input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
 
   // Construct i3, i1, i3, i1, i2, i0, i2, i0
   input0 = _mm_shufflelo_epi16(input0, 0xd8);
@@ -148,7 +148,7 @@ void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE4X4(dest, input3);
 }
 
-void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -264,16 +264,16 @@ static void iadst4_1d_sse2(__m128i *in) {
   in[3] = _mm_unpackhi_epi64(in[1], in[1]);
 }
 
-void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   __m128i in[4];
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = _mm_loadl_epi64((__m128i *)input);
-  in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
-  in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
-  in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
+  in[0] = _mm_loadl_epi64((const __m128i *)input);
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -494,7 +494,7 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
       dest += stride; \
   }
 
-void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -514,14 +514,14 @@ void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   int i;
 
   // Load input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   // 2-D
   for (i = 0; i < 2; i++) {
@@ -562,7 +562,7 @@ void vp9_idct8x8_64_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
-void vp9_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a;
@@ -883,21 +883,21 @@ static void iadst8_1d_sse2(__m128i *in) {
 }
 
 
-void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                               int tx_type) {
+void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                            int tx_type) {
   __m128i in[8];
   const __m128i zero = _mm_setzero_si128();
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
 
   // load input data
-  in[0] = _mm_load_si128((__m128i *)input);
-  in[1] = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in[2] = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in[3] = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in[4] = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in[5] = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in[6] = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in[7] = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in[0] = _mm_load_si128((const __m128i *)input);
+  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -950,7 +950,7 @@ void vp9_short_iht8x8_add_sse2(int16_t *input, uint8_t *dest, int stride,
   RECON_AND_STORE(dest, in[7]);
 }
 
-void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -970,10 +970,10 @@ void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
   // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
 
   // 8x4 Transpose
   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
@@ -1228,7 +1228,8 @@ void vp9_idct8x8_10_add_sse2(int16_t *input, uint8_t *dest, int stride) {
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
+                                int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1283,22 +1284,22 @@ void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) {
       if (i == 1) input += 128;
 
       // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+      in0 = _mm_load_si128((const __m128i *)input);
+      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
+      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
+      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
+      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
+      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
+      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
+      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
+      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
 
       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                     in4, in5, in6, in7);
@@ -1435,7 +1436,7 @@ void vp9_idct16x16_256_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void vp9_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a, i;
@@ -2310,24 +2311,24 @@ static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
   iadst16_1d_8col(in1);
 }
 
-static INLINE void load_buffer_8x16(int16_t *input, __m128i *in) {
-  in[0]  = _mm_load_si128((__m128i *)(input + 0 * 16));
-  in[1]  = _mm_load_si128((__m128i *)(input + 1 * 16));
-  in[2]  = _mm_load_si128((__m128i *)(input + 2 * 16));
-  in[3]  = _mm_load_si128((__m128i *)(input + 3 * 16));
-  in[4]  = _mm_load_si128((__m128i *)(input + 4 * 16));
-  in[5]  = _mm_load_si128((__m128i *)(input + 5 * 16));
-  in[6]  = _mm_load_si128((__m128i *)(input + 6 * 16));
-  in[7]  = _mm_load_si128((__m128i *)(input + 7 * 16));
-
-  in[8]  = _mm_load_si128((__m128i *)(input + 8 * 16));
-  in[9]  = _mm_load_si128((__m128i *)(input + 9 * 16));
-  in[10]  = _mm_load_si128((__m128i *)(input + 10 * 16));
-  in[11]  = _mm_load_si128((__m128i *)(input + 11 * 16));
-  in[12]  = _mm_load_si128((__m128i *)(input + 12 * 16));
-  in[13]  = _mm_load_si128((__m128i *)(input + 13 * 16));
-  in[14]  = _mm_load_si128((__m128i *)(input + 14 * 16));
-  in[15]  = _mm_load_si128((__m128i *)(input + 15 * 16));
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * 16));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * 16));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * 16));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * 16));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * 16));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * 16));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * 16));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * 16));
+
+  in[8]  = _mm_load_si128((const __m128i *)(input + 8 * 16));
+  in[9]  = _mm_load_si128((const __m128i *)(input + 9 * 16));
+  in[10]  = _mm_load_si128((const __m128i *)(input + 10 * 16));
+  in[11]  = _mm_load_si128((const __m128i *)(input + 11 * 16));
+  in[12]  = _mm_load_si128((const __m128i *)(input + 12 * 16));
+  in[13]  = _mm_load_si128((const __m128i *)(input + 13 * 16));
+  in[14]  = _mm_load_si128((const __m128i *)(input + 14 * 16));
+  in[15]  = _mm_load_si128((const __m128i *)(input + 15 * 16));
 }
 
 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -2386,8 +2387,8 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
   RECON_AND_STORE(dest, in[15]);
 }
 
-void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
-                                 int tx_type) {
+void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
+                               int tx_type) {
   __m128i in0[16], in1[16];
 
   load_buffer_8x16(input, in0);
@@ -2421,8 +2422,8 @@ void vp9_short_iht16x16_add_sse2(int16_t *input, uint8_t *dest, int stride,
   write_buffer_8x16(dest, in1, stride);
 }
 
-void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
-                                     int stride) {
+void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
+                               int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -2468,14 +2469,14 @@ void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
   // 1-D idct. Load input data.
-  in0 = _mm_load_si128((__m128i *)input);
-  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
-  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
-  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
-  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
-  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
-  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+  in0 = _mm_load_si128((const __m128i *)input);
+  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
+  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
+  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
+  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
+  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
+  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
+  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
 
   TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
   TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
@@ -2780,11 +2781,12 @@ void vp9_idct16x16_10_add_sse2(int16_t *input, uint8_t *dest,
 
 #define LOAD_DQCOEFF(reg, input) \
   {  \
-    reg = _mm_load_si128((__m128i *) input); \
+    reg = _mm_load_si128((const __m128i *) input); \
     input += 8; \
   }  \
 
-void vp9_idct32x32_1024_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
+                                 int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -3515,7 +3517,7 @@ void vp9_idct32x32_1024_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 }  //NOLINT
 
-void vp9_idct32x32_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
   __m128i dc_value;
   const __m128i zero = _mm_setzero_si128();
   int a, i;
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 27e5f2cda..8c1399d79 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -363,15 +363,14 @@ static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB,
-                           &fc->switchable_interp_prob[j][i]);
+      vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
   int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &fc->inter_mode_probs[i][j]);
+      vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
 }
 
 static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -505,7 +504,11 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi,
 
   if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
-    assert(bsize >= BLOCK_8X8);
+    if (bsize < BLOCK_8X8) {
+        vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Invalid usage of segment feature on small blocks");
+        return;
+    }
   } else {
     if (bsize >= BLOCK_8X8)
       mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
@@ -606,17 +609,17 @@ static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
 
   if (cm->comp_pred_mode == HYBRID_PREDICTION)
     for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_inter_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
 
   if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++) {
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][0]);
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.single_ref_prob[i][1]);
+      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
+      vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
     }
 
   if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
     for (i = 0; i < REF_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.comp_ref_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
 }
 
 void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
@@ -626,7 +629,7 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
   // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
   // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.mbskip_probs[k]);
+    vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]);
 
   if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
     nmv_context *const nmvc = &pbi->common.fc.nmvc;
@@ -639,18 +642,17 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) {
       read_switchable_interp_probs(&cm->fc, r);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.intra_inter_prob[i]);
+      vp9_diff_update_prob(r, &cm->fc.intra_inter_prob[i]);
 
     read_comp_pred(cm, r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
-        vp9_diff_update_prob(r, MODE_UPDATE_PROB, &cm->fc.y_mode_prob[j][i]);
+        vp9_diff_update_prob(r, &cm->fc.y_mode_prob[j][i]);
 
     for (j = 0; j < NUM_PARTITION_CONTEXTS; ++j)
       for (i = 0; i < PARTITION_TYPES - 1; ++i)
-        vp9_diff_update_prob(r, MODE_UPDATE_PROB,
-                             &cm->fc.partition_prob[INTER_FRAME][j][i]);
+        vp9_diff_update_prob(r, &cm->fc.partition_prob[INTER_FRAME][j][i]);
 
     read_mv_probs(r, nmvc, xd->allow_high_precision_mv);
   }
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 061508b08..acde390f2 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -63,15 +63,15 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 3; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p8x8[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 2; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p16x16[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]);
 
   for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
     for (j = 0; j < TX_SIZES - 1; ++j)
-      vp9_diff_update_prob(r, MODE_UPDATE_PROB, &tx_probs->p32x32[i][j]);
+      vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]);
 }
 
 static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
@@ -101,15 +101,15 @@ static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
         if (tx_type == DCT_DCT)
           xd->itxm_add(qcoeff, dst, stride, eob);
         else
-          vp9_iht_add(tx_type, qcoeff, dst, stride, eob);
+          vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_8X8:
         tx_type = get_tx_type_8x8(pd->plane_type, xd);
-        vp9_iht_add_8x8(tx_type, qcoeff, dst, stride, eob);
+        vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_16X16:
         tx_type = get_tx_type_16x16(pd->plane_type, xd);
-        vp9_iht_add_16x16(tx_type, qcoeff, dst, stride, eob);
+        vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob);
         break;
       case TX_32X32:
         tx_type = DCT_DCT;
@@ -371,8 +371,7 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
           for (l = 0; l < PREV_COEF_CONTEXTS; l++)
             if (k > 0 || l < 3)
               for (m = 0; m < UNCONSTRAINED_NODES; m++)
-                vp9_diff_update_prob(r, VP9_COEF_UPDATE_PROB,
-                                     &coef_probs[i][j][k][l][m]);
+                vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
@@ -956,9 +955,15 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
   YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
 
   if (!first_partition_size) {
-    // showing a frame directly
-    *p_data_end = data + 1;
-    return 0;
+    if (!keyframe) {
+      // showing a frame directly
+      *p_data_end = data + 1;
+      return 0;
+    } else {
+      vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                         "Invalid key frame");
+      return -1;
+    }
   }
   data += vp9_rb_bytes_read(&rb);
   xd->corrupted = 0;
diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c
index 6f01cead6..fcca01729 100644
--- a/vp9/decoder/vp9_dsubexp.c
+++ b/vp9/decoder/vp9_dsubexp.c
@@ -48,8 +48,6 @@ static int merge_index(int v, int n, int modulus) {
 
 static int inv_remap_prob(int v, int m) {
   static int inv_map_table[MAX_PROB - 1] = {
-    // generated by:
-    //   inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM);
       6,  19,  32,  45,  58,  71,  84,  97, 110, 123, 136, 149, 162, 175, 188,
     201, 214, 227, 240, 253,   0,   1,   2,   3,   4,   5,   7,   8,   9,  10,
      11,  12,  13,  14,  15,  16,  17,  18,  20,  21,  22,  23,  24,  25,  26,
@@ -66,9 +64,11 @@ static int inv_remap_prob(int v, int m) {
     190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205,
     206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221,
     222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237,
-    238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
+    238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252
   };
-  // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM);
+  // The clamp is not needed for a conforming VP9 stream; it only guards
+  // against out-of-bounds table access on corrupt input data.
+  v = clamp(v, 0, 253);
   v = inv_map_table[v];
   m--;
   if ((m << 1) <= MAX_PROB) {
@@ -99,8 +99,8 @@ static int decode_term_subexp(vp9_reader *r, int k, int num_syms) {
   return word;
 }
 
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p) {
-  if (vp9_read(r, update_prob)) {
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p) {
+  if (vp9_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r, SUBEXP_PARAM, 255);
     *p = (vp9_prob)inv_remap_prob(delp, *p);
   }
diff --git a/vp9/decoder/vp9_dsubexp.h b/vp9/decoder/vp9_dsubexp.h
index 21ac31393..aeb9399d0 100644
--- a/vp9/decoder/vp9_dsubexp.h
+++ b/vp9/decoder/vp9_dsubexp.h
@@ -14,6 +14,6 @@
 
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-void vp9_diff_update_prob(vp9_reader *r, int update_prob, vp9_prob* p);
+void vp9_diff_update_prob(vp9_reader *r, vp9_prob* p);
 
 #endif  // VP9_DECODER_VP9_DSUBEXP_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index a42c2cf30..d3030746d 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -342,36 +342,33 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
     return retcode;
   }
 
-  {
-    swap_frame_buffers(pbi);
+  swap_frame_buffers(pbi);
 
 #if WRITE_RECON_BUFFER == 2
-    if (cm->show_frame)
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame);
-    else
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame + 1000);
+  if (cm->show_frame)
+    write_dx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame);
+  else
+    write_dx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 1000);
 #endif
 
-    if (!pbi->do_loopfilter_inline) {
-      /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
-    }
+  if (!pbi->do_loopfilter_inline) {
+    vp9_loop_filter_frame(cm, &pbi->mb, pbi->common.lf.filter_level, 0, 0);
+  }
 
 #if WRITE_RECON_BUFFER == 2
-    if (cm->show_frame)
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame + 2000);
-    else
-      write_dx_frame_to_file(cm->frame_to_show,
-                             cm->current_video_frame + 3000);
+  if (cm->show_frame)
+    write_dx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 2000);
+  else
+    write_dx_frame_to_file(cm->frame_to_show,
+                           cm->current_video_frame + 3000);
 #endif
 
-    vp9_extend_frame_inner_borders(cm->frame_to_show,
-                                   cm->subsampling_x,
-                                   cm->subsampling_y);
-  }
+  vp9_extend_frame_inner_borders(cm->frame_to_show,
+                                 cm->subsampling_x,
+                                 cm->subsampling_y);
 
 #if WRITE_RECON_BUFFER == 1
   if (cm->show_frame)
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 2f59d333a..428ca7e2b 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -179,9 +179,8 @@ static void update_mode(
   vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
   n--;
 
-  for (i = 0; i < n; ++i) {
-    vp9_cond_prob_diff_update(w, &Pcur[i], MODE_UPDATE_PROB, bct[i]);
-  }
+  for (i = 0; i < n; ++i)
+    vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
 }
 
 static void update_mbintra_mode_probs(VP9_COMP* const cpi,
@@ -227,8 +226,7 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
   int k;
 
   for (k = 0; k < MBSKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k],
-                              MODE_UPDATE_PROB, cm->counts.mbskip[k]);
+    vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
 }
 
 static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
@@ -251,7 +249,7 @@ static void update_switchable_interp_probs(VP9_COMP *const cpi,
   for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
       vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
-                                MODE_UPDATE_PROB, branch_ct[j][i]);
+                                branch_ct[j][i]);
     }
   }
 #ifdef MODE_STATS
@@ -273,7 +271,7 @@ static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
 
     for (j = 0; j < INTER_MODES - 1; ++j)
       vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
-                                MODE_UPDATE_PROB, branch_ct[j]);
+                                branch_ct[j]);
   }
 }
 
@@ -781,7 +779,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
   vp9_coeff_probs_model *old_frame_coef_probs =
       cpi->common.fc.coef_probs[tx_size];
   vp9_coeff_stats *frame_branch_ct = cpi->frame_branch_ct[tx_size];
-  const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+  const vp9_prob upd = DIFF_UPDATE_PROB;
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
   int i, j, k, l, t;
   switch (cpi->sf.use_fast_coef_updates) {
@@ -836,7 +834,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
                 vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
-                const vp9_prob upd = VP9_COEF_UPDATE_PROB;
+                const vp9_prob upd = DIFF_UPDATE_PROB;
                 int s;
                 int u = 0;
                 if (l >= 3 && k == 0)
@@ -1119,26 +1117,23 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
 
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i],
-                                     ct_8x8p);
+      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; j++)
-        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j],
-                                  MODE_UPDATE_PROB, ct_8x8p[j]);
+        vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i],
-                                       ct_16x16p);
+      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; j++)
         vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
-                                  MODE_UPDATE_PROB, ct_16x16p[j]);
+                                  ct_16x16p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
       tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; j++)
         vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
-                                  MODE_UPDATE_PROB, ct_32x32p[j]);
+                                  ct_32x32p[j]);
     }
 #ifdef MODE_STATS
     if (!cpi->dummy_packing)
@@ -1468,7 +1463,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
-                                MODE_UPDATE_PROB,
                                 cpi->intra_inter_count[i]);
 
     if (cm->allow_comp_inter_inter) {
@@ -1482,7 +1476,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
         if (use_hybrid_pred)
           for (i = 0; i < COMP_INTER_CONTEXTS; i++)
             vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      MODE_UPDATE_PROB,
                                       cpi->comp_inter_count[i]);
       }
     }
@@ -1490,10 +1483,8 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  MODE_UPDATE_PROB,
                                   cpi->single_ref_count[i][0]);
         vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  MODE_UPDATE_PROB,
                                   cpi->single_ref_count[i][1]);
       }
     }
@@ -1501,7 +1492,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
       for (i = 0; i < REF_CONTEXTS; i++)
         vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  MODE_UPDATE_PROB,
                                   cpi->comp_ref_count[i]);
 
     update_mbintra_mode_probs(cpi, &header_bc);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 3a2be56a1..b26ae329f 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -172,7 +172,6 @@ struct macroblock {
   BLOCK_SIZE sb64_partitioning;
 
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
-  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 3008e46dd..b6555bc05 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -17,7 +17,7 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_idct.h"
 
-static void fdct4(int16_t *input, int16_t *output) {
+static void fdct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
 
@@ -102,7 +102,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-static void fadst4(int16_t *input, int16_t *output) {
+static void fadst4(const int16_t *input, int16_t *output) {
   int x0, x1, x2, x3;
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -178,12 +178,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
   }
 }
 
-void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {
-    vp9_short_fdct4x4_c(input, output, pitch);
-    vp9_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-static void fdct8(int16_t *input, int16_t *output) {
+static void fdct8(const int16_t *input, int16_t *output) {
   /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
   /*needs32*/ int t0, t1, t2, t3;
   /*canbe16*/ int x0, x1, x2, x3;
@@ -486,7 +481,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-static void fadst8(int16_t *input, int16_t *output) {
+static void fadst8(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   int x0 = input[7];
@@ -647,14 +642,8 @@ void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_walsh8x4_c(int16_t *input, int16_t *output, int pitch) {
-  vp9_short_walsh4x4_c(input,   output,    pitch);
-  vp9_short_walsh4x4_c(input + 4, output + 16, pitch);
-}
-
-
 // Rewrote to use same algorithm as others.
-static void fdct16(int16_t in[16], int16_t out[16]) {
+static void fdct16(const int16_t in[16], int16_t out[16]) {
   /*canbe16*/ int step1[8];
   /*canbe16*/ int step2[8];
   /*canbe16*/ int step3[8];
@@ -795,7 +784,7 @@ static void fdct16(int16_t in[16], int16_t out[16]) {
   out[15] = dct_const_round_shift(temp2);
 }
 
-void fadst16(int16_t *input, int16_t *output) {
+static void fadst16(const int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 
   int x0 = input[15];
@@ -1003,7 +992,7 @@ static INLINE int half_round_shift(int input) {
   return rv;
 }
 
-static void dct32_1d(int *input, int *output, int round) {
+static void dct32_1d(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index b74609bc2..ac1fd6215 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1853,7 +1853,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
 static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
     // printf("Switching to lossless\n");
-    cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
     cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
     cpi->mb.optimize = 0;
@@ -1862,7 +1861,6 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
     cpi->common.tx_mode = ONLY_4X4;
   } else {
     // printf("Not lossless\n");
-    cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
     cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
   }
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 0fc36d98f..a0a7bab27 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -564,7 +564,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                      p->quant, p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob)
-        vp9_iht_add_16x16(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_8X8:
       tx_type = get_tx_type_8x8(pd->plane_type, xd);
@@ -589,7 +589,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                      p->quant_shift, qcoeff, dqcoeff,
                      pd->dequant, p->zbin_extra, eob, scan, iscan);
       if (!x->skip_encode && *eob)
-        vp9_iht_add_8x8(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
+        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
     case TX_4X4:
       tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
@@ -623,7 +623,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
           // case.
           xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
         else
-          vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
+          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
       }
       break;
     default:
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 0833b4ac8..0afb35f54 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -955,10 +955,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
 
   cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;
   cpi->mb.fwd_txm8x8    = vp9_short_fdct8x8;
-  cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;
   cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;
   if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
-    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4;
   }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 54e60d6e1..eb7ca6b72 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -110,6 +110,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZES] =
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC      1
 #define RD_THRESH_POW      1.25
+#define RD_MULT_EPB_RATIO  64
 
 #define MV_COST_WEIGHT      108
 #define MV_COST_WEIGHT_SUB  120
@@ -162,7 +163,17 @@ void vp9_init_me_luts() {
 
 static int compute_rd_mult(int qindex) {
   const int q = vp9_dc_quant(qindex, 0);
-  return (11 * q * q) >> 2;
+  // TODO(debargha): Adjust the function below
+  return (88 * q * q / 25);
+}
+
+static int compute_rd_thresh_factor(int qindex) {
+  int q;
+  // TODO(debargha): Adjust the function below
+  q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+  if (q < 8)
+    q = 8;
+  return q;
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
@@ -172,9 +183,7 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 
 static void set_block_thresholds(VP9_COMP *cpi, int qindex) {
   int q, i, bsize;
-  q = ((int)pow(vp9_dc_quant(qindex, 0) >> 2, RD_THRESH_POW)) << 2;
-  if (q < 8)
-    q = 8;
+  q = compute_rd_thresh_factor(qindex);
 
   for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
     for (i = 0; i < MAX_MODES; ++i) {
@@ -216,7 +225,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
   //     cpi->common.refresh_alt_ref_frame)
   qindex = clamp(qindex, 0, MAXQ);
 
-  cpi->RDDIV = 100;
+  cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
     if (cpi->twopass.next_iiratio > 31)
@@ -225,7 +234,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
       cpi->RDMULT +=
           (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
   }
-  cpi->mb.errorperbit = cpi->RDMULT >> 6;
+  cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
 
   vp9_set_speed_features(cpi);
@@ -1100,7 +1109,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
           goto next;
 
         if (tx_type != DCT_DCT)
-          vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
+          vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
                                dst, pd->dst.stride, tx_type);
         else
           xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index c86ea2723..aa4068d76 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -12,8 +12,13 @@
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
+#define RDDIV_BITS          7
+
+// Combined cost: R weighted by RM (rounded, then >> 8) plus D scaled by
+// 2^DM. D is widened to int64_t before shifting so that a 32-bit argument
+// cannot overflow, and both arguments are parenthesized for macro hygiene.
 #define RDCOST(RM, DM, R, D) \
-  (((128 + ((int64_t)R) * (RM)) >> 8) + ((int64_t)DM) * (D))
+  (((128 + ((int64_t)R) * (RM)) >> 8) + (((int64_t)(D)) << (DM)))
 #define QIDX_SKIP_THRESH     115
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 667b8012c..eb864d96c 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,8 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
 }
 
 void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               vp9_prob upd, unsigned int *ct) {
+                               unsigned int *ct) {
+  const vp9_prob upd = DIFF_UPDATE_PROB;
   vp9_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
                                                           upd);
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index 7acdaf6f1..521c7778d 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -19,7 +19,7 @@ void vp9_write_prob_diff_update(vp9_writer *w,
                                 vp9_prob newp, vp9_prob oldp);
 
 void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
-                               vp9_prob upd, unsigned int *ct);
+                               unsigned int *ct);
 
 int vp9_prob_diff_update_savings_search(const unsigned int *ct,
                                         vp9_prob oldp, vp9_prob *bestp,
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index ad3d01da9..5e1e5ed4a 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -112,11 +112,6 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) {
-  vp9_short_fdct4x4_sse2(input, output, pitch);
-  vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch);
-}
-
 static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 7a5b78634..6b923162f 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -658,8 +658,10 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
 
   if (corrupted) {
     VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
-    *corrupted = pbi->common.frame_to_show->corrupted;
-
+    if (pbi)
+      *corrupted = pbi->common.frame_to_show->corrupted;
+    else
+      return VPX_CODEC_ERROR;
     return VPX_CODEC_OK;
   } else {
     return VPX_CODEC_INVALID_PARAM;