From fed8a1837fd6b1e4e36495be8210bd63bfb2bb8f Mon Sep 17 00:00:00 2001
From: Daniel Kang <ddkang@google.com>
Date: Thu, 2 Aug 2012 17:03:14 -0700
Subject: 16x16 DCT blocks.

The 16x16 transform is selected for all 16x16 intra and inter prediction modes.

Features:
- Butterfly fDCT/iDCT
- Loop filter does not filter internal edges with 16x16
- Optimize coefficient function
- Update coefficient probability function
- RD
- Entropy stats
- 16x16 is a config option

Have not tested with experiments.

hd:     2.60%
std-hd: 2.43%
yt:     1.32%
derf:   0.60%

Change-Id: I96fb090517c30c5da84bad4fae602c3ec0c58b1c
---
 vp8/encoder/bitstream.c                | 314 +++++++++++++++++++------------
 vp8/encoder/block.h                    |  24 ++-
 vp8/encoder/dct.c                      | 202 ++++++++++++++++++++
 vp8/encoder/dct.h                      |  10 +
 vp8/encoder/encodeframe.c              |  29 ++-
 vp8/encoder/encodeintra.c              |  23 +++
 vp8/encoder/encodemb.c                 | 300 +++++++++++++++++++++++++++++-
 vp8/encoder/encodemb.h                 |   9 +
 vp8/encoder/generic/csystemdependent.c |   3 +
 vp8/encoder/onyx_if.c                  |  12 ++
 vp8/encoder/onyx_int.h                 |  25 ++-
 vp8/encoder/quantize.c                 | 196 ++++++++++++++++++--
 vp8/encoder/quantize.h                 |  14 ++
 vp8/encoder/ratectrl.c                 |   6 +
 vp8/encoder/rdopt.c                    | 186 +++++++++++++++++--
 vp8/encoder/tokenize.c                 | 327 ++++++++++++++++++++++++++-------
 vp8/encoder/tokenize.h                 |   5 +-
 17 files changed, 1457 insertions(+), 228 deletions(-)

(limited to 'vp8/encoder')

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 97e791bc6..c555c0300 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -42,6 +42,12 @@ unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
                                   [COEF_BANDS]
                                   [PREV_COEF_CONTEXTS]
                                   [ENTROPY_NODES] [2];
+#if CONFIG_TX16X16  // tally of 16x16 coefficient-probability update decisions
+unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
+                                    [COEF_BANDS]
+                                    [PREV_COEF_CONTEXTS]
+                                    [ENTROPY_NODES] [2];  // last index: update flag u (0 = kept, 1 = sent)
+#endif  // CONFIG_TX16X16
 
 extern unsigned int active_section;
 #endif
@@ -1283,15 +1289,13 @@ static void print_prob_tree(vp8_prob
 
 
 void build_coeff_contexts(VP8_COMP *cpi) {
-  int i = 0;
-  do {
-    int j = 0;
-    do {
-      int k = 0;
-      do {
+  int i = 0, j, k;
 #ifdef ENTROPY_STATS
-        int t;
+  int t = 0;
 #endif
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
         if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
           continue;
         vp8_tree_probs_from_distribution(
@@ -1302,33 +1306,23 @@ void build_coeff_contexts(VP8_COMP *cpi) {
           256, 1
         );
 #ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing) {
-          t = 0;
-          do {
-            context_counters [i][j][k][t] +=
-              cpi->coef_counts [i][j][k][t];
-          } while (++t < MAX_ENTROPY_TOKENS);
-        }
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
 #endif
-      } while (++k < PREV_COEF_CONTEXTS);
-    } while (++j < COEF_BANDS);
-  } while (++i < BLOCK_TYPES);
+      }
+    }
+  }
 
 
-  i = 0;
   if (cpi->common.txfm_mode == ALLOW_8X8) {
-    do {
-      int j = 0;      /* token/prob index */
-      do {
-        int k = 0;
-        do {
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = 0; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
           /* at every context */
           /* calc probs and branch cts for this frame only */
           // vp8_prob new_p           [ENTROPY_NODES];
           // unsigned int branch_ct   [ENTROPY_NODES] [2];
-#ifdef ENTROPY_STATS
-          int t = 0;      /* token/prob index */
-#endif
           if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
             continue;
           vp8_tree_probs_from_distribution(
@@ -1339,20 +1333,36 @@ void build_coeff_contexts(VP8_COMP *cpi) {
             256, 1
           );
 #ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing) {
-            t = 0;
-            do {
-              context_counters_8x8 [i][j][k][t] +=
-                cpi->coef_counts_8x8 [i][j][k][t];
-            } while (++t < MAX_ENTROPY_TOKENS);
-          }
+          if (!cpi->dummy_packing)
+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
 #endif
-
-        } while (++k < PREV_COEF_CONTEXTS);
-      } while (++j < COEF_BANDS);
-    } while (++i < BLOCK_TYPES_8X8);
+        }
+      }
+    }
   }
 
+#if CONFIG_TX16X16
+  //16x16
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+    for (j = 0; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp8_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+          cpi->frame_coef_probs_16x16[i][j][k],
+          cpi->frame_branch_ct_16x16[i][j][k],
+          cpi->coef_counts_16x16[i][j][k], 256, 1);
+#ifdef ENTROPY_STATS
+        if (!cpi->dummy_packing)
+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+            context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+#endif
+      }
+    }
+  }
+#endif
 }
 
 static void update_coef_probs3(VP8_COMP *cpi) {
@@ -1696,7 +1706,7 @@ static void update_coef_probs2(VP8_COMP *cpi) {
 }
 
 static void update_coef_probs(VP8_COMP *cpi) {
-  int i = 0;
+  int i, j, k, t;
   vp8_writer *const w = & cpi->bc;
   int update[2] = {0, 0};
   int savings;
@@ -1704,21 +1714,17 @@ static void update_coef_probs(VP8_COMP *cpi) {
   vp8_clear_system_state(); // __asm emms;
 
   // Build the cofficient contexts based on counts collected in encode loop
-
   build_coeff_contexts(cpi);
 
   // vp8_prob bestupd = find_coef_update_prob(cpi);
 
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
-  do {
-    int j = !i;
-    do {
-      int k = 0;
+  for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (j = !i; j < COEF_BANDS; ++j) {
       int prev_coef_savings[ENTROPY_NODES] = {0};
-      do {
-        int t = 0;      /* token/prob index */
-        do {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        for (t = 0; t < ENTROPY_NODES; ++t) {
           vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
           vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
           const vp8_prob upd = COEF_UPDATE_PROB;
@@ -1747,29 +1753,23 @@ static void update_coef_probs(VP8_COMP *cpi) {
 #endif
 
           update[u]++;
-        } while (++t < ENTROPY_NODES);
-      } while (++k < PREV_COEF_CONTEXTS);
-    } while (++j < COEF_BANDS);
-  } while (++i < BLOCK_TYPES);
+        }
+      }
+    }
+  }
 
   // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
   /* Is coef updated at all */
   if (update[1] == 0 || savings < 0)
-  {
     vp8_write_bit(w, 0);
-  } else {
+  else {
     vp8_write_bit(w, 1);
-    i = 0;
-    do {
-      int j = !i;
-      do {
-        int k = 0;
+    for (i = 0; i < BLOCK_TYPES; ++i) {
+      for (j = !i; j < COEF_BANDS; ++j) {
         int prev_coef_savings[ENTROPY_NODES] = {0};
-
-        do {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
           // calc probs and branch cts for this frame only
-          int t = 0;      /* token/prob index */
-          do {
+          for (t = 0; t < ENTROPY_NODES; ++t) {
             vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
             vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
             const vp8_prob upd = COEF_UPDATE_PROB;
@@ -1791,8 +1791,6 @@ static void update_coef_probs(VP8_COMP *cpi) {
             if (s > 0)
               u = 1;
 #endif
-
-
             vp8_write(w, u, upd);
 #ifdef ENTROPY_STATS
             if (!cpi->dummy_packing)
@@ -1803,28 +1801,23 @@ static void update_coef_probs(VP8_COMP *cpi) {
               write_prob_diff_update(w, newp, *Pold);
               *Pold = newp;
             }
-          } while (++t < ENTROPY_NODES);
-
-        } while (++k < PREV_COEF_CONTEXTS);
-      } while (++j < COEF_BANDS);
-    } while (++i < BLOCK_TYPES);
+          }
+        }
+      }
+    }
   }
 
 
-  /* do not do this if not evena allowed */
+  /* do not do this if not even allowed */
   if (cpi->common.txfm_mode == ALLOW_8X8) {
     /* dry run to see if update is necessary */
     update[0] = update[1] = 0;
     savings = 0;
-    i = 0;
-    do {
-      int j = !i;
-      do {
-        int k = 0;
-        do {
+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+      for (j = !i; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
           // calc probs and branch cts for this frame only
-          int t = 0;      /* token/prob index */
-          do {
+          for (t = 0; t < ENTROPY_NODES; ++t) {
             const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];
             vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];
             vp8_prob *Pold = cpi->common.fc.coef_probs_8x8 [i][j][k] + t;
@@ -1846,26 +1839,20 @@ static void update_coef_probs(VP8_COMP *cpi) {
             if (u)
               savings += s;
 #endif
-
             update[u]++;
-          } while (++t < MAX_ENTROPY_TOKENS - 1);
-        } while (++k < PREV_COEF_CONTEXTS);
-      } while (++j < COEF_BANDS);
-    } while (++i < BLOCK_TYPES_8X8);
+          }
+        }
+      }
+    }
 
     if (update[1] == 0 || savings < 0)
-    {
       vp8_write_bit(w, 0);
-    } else {
+    else {
       vp8_write_bit(w, 1);
-      i = 0;
-      do {
-        int j = !i;
-        do {
-          int k = 0;
-          do {
-            int t = 0;      /* token/prob index */
-            do {
+      for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
+        for (j = !i; j < COEF_BANDS; ++j) {
+          for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+            for (t = 0; t < ENTROPY_NODES; ++t) {
               const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];
               vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];
               vp8_prob *Pold = cpi->common.fc.coef_probs_8x8 [i][j][k] + t;
@@ -1892,12 +1879,90 @@ static void update_coef_probs(VP8_COMP *cpi) {
                 write_prob_diff_update(w, newp, oldp);
                 *Pold = newp;
               }
-            } while (++t < MAX_ENTROPY_TOKENS - 1);
-          } while (++k < PREV_COEF_CONTEXTS);
-        } while (++j < COEF_BANDS);
-      } while (++i < BLOCK_TYPES_8X8);
+            }
+          }
+        }
+      }
     }
   }
+
+#if CONFIG_TX16X16
+  // 16x16
+  /* dry run to see if update is necessary */
+  update[0] = update[1] = 0;
+  savings = 0;
+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+    for (j = !i; j < COEF_BANDS; ++j) {
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        // calc probs and branch cts for this frame only
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          const unsigned int *ct  = cpi->frame_branch_ct_16x16[i][j][k][t];
+          vp8_prob newp = cpi->frame_coef_probs_16x16[i][j][k][t];
+          vp8_prob *Pold = cpi->common.fc.coef_probs_16x16[i][j][k] + t;
+          const vp8_prob oldp = *Pold;
+          int s, u;
+          const vp8_prob upd = COEF_UPDATE_PROB_16X16;
+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+            continue;
+#if defined(SEARCH_NEWP)
+          s = prob_diff_update_savings_search(ct, oldp, &newp, upd);
+          u = s > 0 && newp != oldp ? 1 : 0;
+          if (u)
+            savings += s - (int)(vp8_cost_zero(upd));
+          else
+            savings -= (int)(vp8_cost_zero(upd));
+#else
+          s = prob_update_savings(ct, oldp, newp, upd);
+          u = s > 0 ? 1 : 0;
+          if (u)
+            savings += s;
+#endif
+          update[u]++;
+        }
+      }
+    }
+  }
+
+  if (update[1] == 0 || savings < 0)
+    vp8_write_bit(w, 0);
+  else {
+    vp8_write_bit(w, 1);
+    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
+      for (j = !i; j < COEF_BANDS; ++j) {
+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+          for (t = 0; t < ENTROPY_NODES; ++t) {
+            const unsigned int *ct  = cpi->frame_branch_ct_16x16[i][j][k][t];
+            vp8_prob newp = cpi->frame_coef_probs_16x16[i][j][k][t];
+            vp8_prob *Pold = cpi->common.fc.coef_probs_16x16[i][j][k] + t;
+            const vp8_prob oldp = *Pold;
+            const vp8_prob upd = COEF_UPDATE_PROB_16X16;
+            int s, u;
+            if (k >= 3 && ((i == 0 && j == 1) ||
+                           (i > 0 && j == 0)))
+              continue;
+#if defined(SEARCH_NEWP)
+            s = prob_diff_update_savings_search(ct, oldp, &newp, upd);
+            u = s > 0 && newp != oldp ? 1 : 0;
+#else
+            s = prob_update_savings(ct, oldp, newp, upd);
+            u = s > 0 ? 1 : 0;
+#endif
+            vp8_write(w, u, upd);
+#ifdef ENTROPY_STATS
+            if (!cpi->dummy_packing)
+              ++tree_update_hist_16x16[i][j][k][t][u];
+#endif
+            if (u) {
+              /* send/use new probability */
+              write_prob_diff_update(w, newp, oldp);
+              *Pold = newp;
+            }
+          }
+        }
+      }
+    }
+  }
+#endif
 }
 
 #ifdef PACKET_TESTING
@@ -2310,18 +2375,19 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
     vp8_write_bit(bc, pc->refresh_last_frame);
 
 #ifdef ENTROPY_STATS
-
   if (pc->frame_type == INTER_FRAME)
     active_section = 0;
   else
     active_section = 7;
-
 #endif
 
   vp8_clear_system_state();  // __asm emms;
 
   vp8_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
   vp8_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
+#if CONFIG_TX16X16
+  vp8_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
+#endif
   vp8_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
   vp8_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp8_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
@@ -2401,24 +2467,20 @@ void print_tree_update_probs() {
   FILE *f = fopen("coefupdprob.h", "w");
   int Sum;
   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+
   fprintf(f, "const vp8_prob\n"
           "vp8_coef_update_probs[BLOCK_TYPES]\n"
           "                     [COEF_BANDS]\n"
           "                     [PREV_COEF_CONTEXTS]\n"
           "                     [ENTROPY_NODES] = {\n");
-
   for (i = 0; i < BLOCK_TYPES; i++) {
     fprintf(f, "  { \n");
-
     for (j = 0; j < COEF_BANDS; j++) {
       fprintf(f, "    {\n");
-
       for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
         fprintf(f, "      {");
-
         for (l = 0; l < ENTROPY_NODES; l++) {
           Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
-
           if (Sum > 0) {
             if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
               fprintf(f, "%3ld, ", (tree_update_hist[i][j][k][l][0] * 255) / Sum);
@@ -2427,16 +2489,12 @@ void print_tree_update_probs() {
           } else
             fprintf(f, "%3ld, ", 128);
         }
-
         fprintf(f, "},\n");
       }
-
       fprintf(f, "    },\n");
     }
-
     fprintf(f, "  },\n");
   }
-
   fprintf(f, "};\n");
 
   fprintf(f, "const vp8_prob\n"
@@ -2444,20 +2502,14 @@ void print_tree_update_probs() {
           "                         [COEF_BANDS]\n"
           "                         [PREV_COEF_CONTEXTS]\n"
           "                         [ENTROPY_NODES] = {\n");
-
-
   for (i = 0; i < BLOCK_TYPES_8X8; i++) {
     fprintf(f, "  { \n");
-
     for (j = 0; j < COEF_BANDS; j++) {
       fprintf(f, "    {\n");
-
       for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
         fprintf(f, "      {");
-
         for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
           Sum = tree_update_hist_8x8[i][j][k][l][0] + tree_update_hist_8x8[i][j][k][l][1];
-
           if (Sum > 0) {
             if (((tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum) > 0)
               fprintf(f, "%3ld, ", (tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum);
@@ -2466,20 +2518,50 @@ void print_tree_update_probs() {
           } else
             fprintf(f, "%3ld, ", 128);
         }
-
         fprintf(f, "},\n");
       }
-
       fprintf(f, "    },\n");
     }
+    fprintf(f, "  },\n");
+  }
 
+#if CONFIG_TX16X16
+  fprintf(f, "const vp8_prob\n"
+          "vp8_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
+          "                           [COEF_BANDS]\n"
+          "                           [PREV_COEF_CONTEXTS]\n"
+          "                           [ENTROPY_NODES] = {\n");
+  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
+    fprintf(f, "  { \n");
+    for (j = 0; j < COEF_BANDS; j++) {
+      fprintf(f, "    {\n");
+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
+        fprintf(f, "      {");
+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
+          Sum = tree_update_hist_16x16[i][j][k][l][0] + tree_update_hist_16x16[i][j][k][l][1];
+          if (Sum > 0) {
+            if (((tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum) > 0)
+              fprintf(f, "%3ld, ", (tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum);
+            else
+              fprintf(f, "%3ld, ", 1);
+          } else
+            fprintf(f, "%3ld, ", 128);
+        }
+        fprintf(f, "},\n");
+      }
+      fprintf(f, "    },\n");
+    }
     fprintf(f, "  },\n");
   }
+#endif
+
   fclose(f);
   f = fopen("treeupdate.bin", "wb");
   fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+#if CONFIG_TX16X16
+  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+#endif
   fclose(f);
-
 }
 #endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 106036a0d..0019d5e8c 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -35,8 +35,14 @@ typedef struct {
   unsigned char *quant_shift;
   short *zbin;
   short *zbin_8x8;
+#if CONFIG_TX16X16
+  short *zbin_16x16;
+#endif
   short *zrun_zbin_boost;
   short *zrun_zbin_boost_8x8;
+#if CONFIG_TX16X16
+  short *zrun_zbin_boost_16x16;
+#endif
   short *round;
 
   // Zbin Over Quant value
@@ -49,7 +55,9 @@ typedef struct {
 
   int eob_max_offset;
   int eob_max_offset_8x8;
-
+#if CONFIG_TX16X16
+  int eob_max_offset_16x16;
+#endif
 } BLOCK;
 
 typedef struct {
@@ -153,9 +161,13 @@ typedef struct {
 #endif
 
   unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS]
-  [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
   unsigned int token_costs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]
-  [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#if CONFIG_TX16X16
+  unsigned int token_costs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]
+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#endif
 
   int optimize;
   int q_index;
@@ -176,7 +188,13 @@ typedef struct {
   void (*quantize_b)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
   void (*vp8_short_fdct8x8)(short *input, short *output, int pitch);
+#if CONFIG_TX16X16
+  void (*vp8_short_fdct16x16)(short *input, short *output, int pitch);
+#endif
   void (*short_fhaar2x2)(short *input, short *output, int pitch);
+#if CONFIG_TX16X16
+  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
+#endif
   void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
 
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 5954a7685..ba2a692d1 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -575,3 +575,205 @@ void vp8_short_walsh8x4_x8_c(short *input, short *output, int pitch) {
   vp8_short_walsh4x4_x8_c(input + 4, output + 16, pitch);
 }
 #endif
+
+#if CONFIG_TX16X16
+static void dct16x16_1d(double input[16], double output[16]) {  // one 1-D 16-point butterfly DCT pass
+  double step[16];
+  double intermediate[16];
+  double temp1, temp2;
+
+  const double PI = 3.1415926535898;  // Ck below is cos(k*PI/32)
+  const double C1 = cos(1*PI/(double)32);
+  const double C2 = cos(2*PI/(double)32);
+  const double C3 = cos(3*PI/(double)32);
+  const double C4 = cos(4*PI/(double)32);
+  const double C5 = cos(5*PI/(double)32);
+  const double C6 = cos(6*PI/(double)32);
+  const double C7 = cos(7*PI/(double)32);
+  const double C8 = cos(8*PI/(double)32);
+  const double C9 = cos(9*PI/(double)32);
+  const double C10 = cos(10*PI/(double)32);
+  const double C11 = cos(11*PI/(double)32);
+  const double C12 = cos(12*PI/(double)32);
+  const double C13 = cos(13*PI/(double)32);
+  const double C14 = cos(14*PI/(double)32);
+  const double C15 = cos(15*PI/(double)32);
+
+  // step 1: fold mirrored input pairs; step[0..7] = sums, step[8..15] = differences
+  step[ 0] = input[0] + input[15];
+  step[ 1] = input[1] + input[14];
+  step[ 2] = input[2] + input[13];
+  step[ 3] = input[3] + input[12];
+  step[ 4] = input[4] + input[11];
+  step[ 5] = input[5] + input[10];
+  step[ 6] = input[6] + input[ 9];
+  step[ 7] = input[7] + input[ 8];
+  step[ 8] = input[7] - input[ 8];
+  step[ 9] = input[6] - input[ 9];
+  step[10] = input[5] - input[10];
+  step[11] = input[4] - input[11];
+  step[12] = input[3] - input[12];
+  step[13] = input[2] - input[13];
+  step[14] = input[1] - input[14];
+  step[15] = input[0] - input[15];
+
+  // step 2: fold step[0..7] again; mix mirrored pairs of step[8..15] using the odd-numbered Ck
+  output[0] = step[0] + step[7];
+  output[1] = step[1] + step[6];
+  output[2] = step[2] + step[5];
+  output[3] = step[3] + step[4];
+  output[4] = step[3] - step[4];
+  output[5] = step[2] - step[5];
+  output[6] = step[1] - step[6];
+  output[7] = step[0] - step[7];
+
+  temp1 = step[ 8]*C7;
+  temp2 = step[15]*C9;
+  output[ 8] = temp1 + temp2;
+
+  temp1 = step[ 9]*C11;
+  temp2 = step[14]*C5;
+  output[ 9] = temp1 - temp2;
+
+  temp1 = step[10]*C3;
+  temp2 = step[13]*C13;
+  output[10] = temp1 + temp2;
+
+  temp1 = step[11]*C15;
+  temp2 = step[12]*C1;
+  output[11] = temp1 - temp2;
+
+  temp1 = step[11]*C1;
+  temp2 = step[12]*C15;
+  output[12] = temp2 + temp1;
+
+  temp1 = step[10]*C13;
+  temp2 = step[13]*C3;
+  output[13] = temp2 - temp1;
+
+  temp1 = step[ 9]*C5;
+  temp2 = step[14]*C11;
+  output[14] = temp2 + temp1;
+
+  temp1 = step[ 8]*C9;
+  temp2 = step[15]*C7;
+  output[15] = temp2 - temp1;
+
+  // step 3: step[] is reused as the destination; output[4..7] are mixed using C2/C6/C10/C14
+  step[ 0] = output[0] + output[3];
+  step[ 1] = output[1] + output[2];
+  step[ 2] = output[1] - output[2];
+  step[ 3] = output[0] - output[3];
+
+  temp1 = output[4]*C14;
+  temp2 = output[7]*C2;
+  step[ 4] = temp1 + temp2;
+
+  temp1 = output[5]*C10;
+  temp2 = output[6]*C6;
+  step[ 5] = temp1 + temp2;
+
+  temp1 = output[5]*C6;
+  temp2 = output[6]*C10;
+  step[ 6] = temp2 - temp1;
+
+  temp1 = output[4]*C2;
+  temp2 = output[7]*C14;
+  step[ 7] = temp2 - temp1;
+
+  step[ 8] = output[ 8] + output[11];
+  step[ 9] = output[ 9] + output[10];
+  step[10] = output[ 9] - output[10];
+  step[11] = output[ 8] - output[11];
+
+  step[12] = output[12] + output[15];
+  step[13] = output[13] + output[14];
+  step[14] = output[13] - output[14];
+  step[15] = output[12] - output[15];
+
+  // step 4: write the 16 results; output[0] ends up as the plain sum of all 16 inputs
+  output[ 0] = (step[ 0] + step[ 1]);
+  output[ 8] = (step[ 0] - step[ 1]);
+
+  temp1 = step[2]*C12;
+  temp2 = step[3]*C4;
+  temp1 = temp1 + temp2;
+  output[ 4] = 2*(temp1*C8);
+
+  temp1 = step[2]*C4;
+  temp2 = step[3]*C12;
+  temp1 = temp2 - temp1;
+  output[12] = 2*(temp1*C8);
+
+  output[ 2] = 2*((step[4] + step[ 5])*C8);
+  output[14] = 2*((step[7] - step[ 6])*C8);
+
+  temp1 = step[4] - step[5];
+  temp2 = step[6] + step[7];
+  output[ 6] = (temp1 + temp2);
+  output[10] = (temp1 - temp2);
+
+  intermediate[8] = step[8] + step[14];
+  intermediate[9] = step[9] + step[15];
+
+  temp1 = intermediate[8]*C12;
+  temp2 = intermediate[9]*C4;
+  temp1 = temp1 - temp2;
+  output[3] = 2*(temp1*C8);
+
+  temp1 = intermediate[8]*C4;
+  temp2 = intermediate[9]*C12;
+  temp1 = temp2 + temp1;
+  output[13] = 2*(temp1*C8);
+
+  output[ 9] = 2*((step[10] + step[11])*C8);
+
+  intermediate[11] = step[10] - step[11];
+  intermediate[12] = step[12] + step[13];
+  intermediate[13] = step[12] - step[13];
+  intermediate[14] = step[ 8] - step[14];
+  intermediate[15] = step[ 9] - step[15];
+
+  output[15] = (intermediate[11] + intermediate[12]);
+  output[ 1] = -(intermediate[11] - intermediate[12]);
+
+  output[ 7] = 2*(intermediate[13]*C8);
+
+  temp1 = intermediate[14]*C12;
+  temp2 = intermediate[15]*C4;
+  temp1 = temp1 - temp2;
+  output[11] = -2*(temp1*C8);
+
+  temp1 = intermediate[14]*C4;
+  temp2 = intermediate[15]*C12;
+  temp1 = temp2 + temp1;
+  output[ 5] = 2*(temp1*C8);
+}
+// 2-D 16x16 forward DCT in double precision: columns, then rows, then halve and round into out[256].
+void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
+    int shortpitch = pitch >> 1;  // row stride of input[] in shorts (pitch presumably in bytes - confirm)
+    int i, j;
+    double output[256];
+    // First transform columns: gather column i with the input stride, store it into the 16-wide buffer
+    for (i = 0; i < 16; i++) {
+        double temp_in[16], temp_out[16];
+        for (j = 0; j < 16; j++)
+            temp_in[j] = input[j*shortpitch + i];
+        dct16x16_1d(temp_in, temp_out);
+        for (j = 0; j < 16; j++)
+            output[j*16 + i] = temp_out[j];
+    }
+    // Then transform rows of the intermediate buffer in place
+    for (i = 0; i < 16; ++i) {
+        double temp_in[16], temp_out[16];
+        for (j = 0; j < 16; ++j)
+            temp_in[j] = output[j + i*16];
+        dct16x16_1d(temp_in, temp_out);
+        for (j = 0; j < 16; ++j)
+            output[j + i*16] = temp_out[j];
+    }
+    // Scale by some magic number (1/2), round to the nearest integer, and narrow to short
+    for (i = 0; i < 256; i++)
+        out[i] = (short)round(output[i]/2);
+}
+#endif
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index ac7769d3d..9936969d5 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -28,6 +28,13 @@ void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
 void vp8_fht8x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);
 #endif
 
+#if CONFIG_TX16X16  // 16x16 forward DCT hook, same shape as the vp8_fdct_short8x8 hook below
+#ifndef vp8_fdct_short16x16
+#define vp8_fdct_short16x16 vp8_short_fdct16x16_c  // C version unless already defined elsewhere
+#endif
+extern prototype_fdct(vp8_fdct_short16x16);
+#endif  // CONFIG_TX16X16
+
 #ifndef vp8_fdct_short8x8
 #define vp8_fdct_short8x8  vp8_short_fdct8x8_c
 #endif
@@ -71,6 +78,9 @@ extern prototype_fdct(vp8_short_walsh4x4_lossless_c);
 
 typedef prototype_fdct(*vp8_fdct_fn_t);
 typedef struct {
+#if CONFIG_TX16X16
+  vp8_fdct_fn_t    short16x16;
+#endif
   vp8_fdct_fn_t    short8x8;
   vp8_fdct_fn_t    haar_short2x2;
   vp8_fdct_fn_t    short4x4;
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 909d56961..23eed6d7a 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1132,6 +1132,9 @@ static void encode_frame_internal(VP8_COMP *cpi) {
 #endif
   vp8_zero(cpi->coef_counts);
   vp8_zero(cpi->coef_counts_8x8);
+#if CONFIG_TX16X16
+  vp8_zero(cpi->coef_counts_16x16);
+#endif
 
   vp8cx_frame_init_quantizer(cpi);
 
@@ -1437,6 +1440,13 @@ void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
   }
 
   /* test code: set transform size based on mode selection */
+#if CONFIG_TX16X16
+  if (x->e_mbd.mode_info_context->mbmi.mode <= TM_PRED) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_16X16;
+    cpi->t16x16_count++;
+  }
+  else
+#endif
   if (cpi->common.txfm_mode == ALLOW_8X8
       && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED
       && x->e_mbd.mode_info_context->mbmi.mode != B_PRED) {
@@ -1470,12 +1480,9 @@ extern int cnt_pm;
 
 extern void vp8_fix_contexts(MACROBLOCKD *x);
 
-void vp8cx_encode_inter_macroblock
-(
-  VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
-  int recon_yoffset, int recon_uvoffset,
-  int output_enabled
-) {
+void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,
+                                    TOKENEXTRA **t, int recon_yoffset,
+                                    int recon_uvoffset, int output_enabled) {
   VP8_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned char *segment_id = &xd->mode_info_context->mbmi.segment_id;
@@ -1523,6 +1530,16 @@ void vp8cx_encode_inter_macroblock
   set_pred_flag(xd, PRED_REF, ref_pred_flag);
 
   /* test code: set transform size based on mode selection */
+#if CONFIG_TX16X16
+  if (x->e_mbd.mode_info_context->mbmi.mode <= TM_PRED ||
+      x->e_mbd.mode_info_context->mbmi.mode == NEWMV ||
+      x->e_mbd.mode_info_context->mbmi.mode == ZEROMV ||
+      x->e_mbd.mode_info_context->mbmi.mode == NEARMV ||
+      x->e_mbd.mode_info_context->mbmi.mode == NEARESTMV) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_16X16;
+    cpi->t16x16_count++;
+  } else
+#endif
   if (cpi->common.txfm_mode == ALLOW_8X8
       && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED
       && x->e_mbd.mode_info_context->mbmi.mode != B_PRED
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 01ae03a23..325efeb6b 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -160,23 +160,43 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
 
   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_transform_intra_mby_16x16(x);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_transform_intra_mby_8x8(x);
   else
     vp8_transform_intra_mby(x);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_quantize_mby_16x16(x);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_quantize_mby_8x8(x);
   else
     vp8_quantize_mby(x);
 
   if (x->optimize) {
+#if CONFIG_TX16X16
+    if (tx_type == TX_16X16)
+      vp8_optimize_mby_16x16(x, rtcd);
+    else
+#endif
     if (tx_type == TX_8X8)
       vp8_optimize_mby_8x8(x, rtcd);
     else
       vp8_optimize_mby(x, rtcd);
   }
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
   else
@@ -220,6 +240,9 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
 
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
   int tx_type = x->e_mbd.mode_info_context->mbmi.txfm_size;
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16) tx_type = TX_8X8; // 16x16 for U and V should default to 8x8 behavior.
+#endif
 #if CONFIG_COMP_INTRA_PRED
   if (x->e_mbd.mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
 #endif
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 653a4cc70..bfab4c647 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -282,6 +282,42 @@ void vp8_transform_mby_8x8(MACROBLOCK *x) {
   }
 }
 
+#if CONFIG_TX16X16
+void vp8_transform_mbuv_16x16(MACROBLOCK *x) {
+  int i;
+  // UV transform used with 16x16-transform macroblocks; it only ever calls the 8x8 fDCT.
+  vp8_clear_system_state();
+  // Default to the 8x8: one 8x8 fDCT starting at block 16 and one at block 20 (presumably U and V)
+  for (i = 16; i < 24; i += 4)
+    x->vp8_short_fdct8x8(&x->block[i].src_diff[0],
+        &x->block[i].coeff[0], 16);
+}
+
+// Intra Y transform: a single 16x16 fDCT from block[0].src_diff into block[0].coeff, pitch 32.
+void vp8_transform_intra_mby_16x16(MACROBLOCK *x) {
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(&x->block[0].src_diff[0],
+      &x->block[0].coeff[0], 32);
+}
+
+// Whole-macroblock transform: one 16x16 fDCT at block 0, then 8x8 fDCTs starting at blocks 16 and 20.
+void vp8_transform_mb_16x16(MACROBLOCK *x) {
+  int i;
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(&x->block[0].src_diff[0],
+      &x->block[0].coeff[0], 32);
+
+  for (i = 16; i < 24; i += 4) {
+      x->vp8_short_fdct8x8(&x->block[i].src_diff[0],
+          &x->block[i].coeff[0], 16);
+  }
+}
+// Y-only 16x16 transform; issues the same fDCT call as vp8_transform_intra_mby_16x16.
+void vp8_transform_mby_16x16(MACROBLOCK *x) {
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(&x->block[0].src_diff[0], &x->block[0].coeff[0], 32);
+}
+#endif
 
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
@@ -290,7 +326,7 @@ typedef struct vp8_token_state vp8_token_state;
 struct vp8_token_state {
   int           rate;
   int           error;
-  signed char   next;
+  int           next;
   signed char   token;
   short         qc;
 };
@@ -1017,29 +1053,280 @@ void vp8_optimize_mbuv_8x8(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) {
 
 }
 
+
+
+#if CONFIG_TX16X16
+#define UPDATE_RD_COST()\
+do { /* recompute both RD costs; on a tie fall back to the RDTRUNC values */\
+    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
+    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
+    if (rd_cost0 == rd_cost1) {\
+        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
+        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
+    }\
+} while (0)
+
+/* Trellis (Viterbi) RD optimization of the quantized 16x16 block mb->block[i]:
+ * each non-zero coefficient is either kept or, when the shortcut test passes,
+ * moved one step toward zero. Rewrites qcoeff, dqcoeff, eob and *a / *l. */
+void optimize_b_16x16(MACROBLOCK *mb, int i, int type,
+                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                      const VP8_ENCODER_RTCD *rtcd) {
+  BLOCK *b = &mb->block[i];
+  BLOCKD *d = &mb->e_mbd.block[i];
+  vp8_token_state tokens[257][2];
+  unsigned best_index[257][2];
+  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  int eob = d->eob, final_eob, sz = 0;
+  int rc, x, next;
+  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt;
+  int err_mult = plane_rd_mult[type];
+
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+      rdmult = (rdmult * 9)>>4;
+  rddiv = mb->rddiv;
+  memset(best_index, 0, sizeof(best_index));
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = 256;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
+  for (i = eob; i-- > 0;) {
+    int base_bits, d2, dx;
+
+    rc = vp8_default_zig_zag1d_16x16[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < 256) {
+        band = vp8_coef_bands_16x16[i + 1];
+        pt = vp8_prev_token_class[t0];
+        rate0 += mb->token_costs_16x16[type][band][pt][tokens[next][0].token];
+        rate1 += mb->token_costs_16x16[type][band][pt][tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp8_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx*dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+
+      shortcut = (abs(x)*dequant_ptr[rc!=0]>abs(coeff_ptr[rc])) &&
+         (abs(x)*dequant_ptr[rc!=0]<abs(coeff_ptr[rc])+dequant_ptr[rc!=0]);
+
+      if (shortcut) {
+        sz = -(x < 0);
+        x -= 2*sz + 1;
+      }
+
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      }
+      else
+        t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+      if (next < 256) {
+        band = vp8_coef_bands_16x16[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+            pt = vp8_prev_token_class[t0];
+            rate0 += mb->token_costs_16x16[type][band][pt]
+                [tokens[next][0].token];
+        }
+        if (t1!=DCT_EOB_TOKEN) {
+            pt = vp8_prev_token_class[t1];
+            rate1 += mb->token_costs_16x16[type][band][pt]
+                [tokens[next][1].token];
+        }
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp8_dct_value_cost_ptr + x);
+
+      if(shortcut) {
+        dx -= (dequant_ptr[rc!=0] + sz) ^ sz;
+        d2 = dx*dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = vp8_coef_bands_16x16[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs_16x16[type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs_16x16[type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+
+  /* Now pick the best path through the whole trellis. */
+  band = vp8_coef_bands_16x16[i + 1];
+  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs_16x16[type][band][pt][t0];
+  rate1 += mb->token_costs_16x16[type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = vp8_default_zig_zag1d_16x16[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc!=0]);
+
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;
+
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}
+
+/* Runs the 16x16 trellis optimization on block 0 with PLANE_TYPE_Y_WITH_DC,
+ * working on local copies of the above/left entropy contexts. Does nothing
+ * if either context pointer is NULL. */
+void vp8_optimize_mby_16x16(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)&above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)&left_copy;
+
+  if (!xd->above_context || !xd->left_context)
+    return;
+
+  vpx_memcpy(&above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(&left_copy, xd->left_context, sizeof(left_copy));
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+  ta[1] = ta[0];
+  tl[1] = tl[0];
+}
+
+/* Trellis-optimizes a macroblock using the 16x16 path: optimize_b_16x16 on
+ * block 0, then optimize_b_8x8 on blocks 16 and 20, on local context copies. */
+void optimize_mb_16x16(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) {
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)&above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)&left_copy;
+  int blk;
+
+  vpx_memcpy(&above_copy, x->e_mbd.above_context, sizeof(above_copy));
+  vpx_memcpy(&left_copy, x->e_mbd.left_context, sizeof(left_copy));
+
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+  ta[1] = ta[0];
+  tl[1] = tl[0];
+
+  for (blk = 16; blk < 24; blk += 4) {
+    ENTROPY_CONTEXT *const a = ta + vp8_block2above_8x8[blk];
+    ENTROPY_CONTEXT *const l = tl + vp8_block2left_8x8[blk];
+    optimize_b_8x8(x, blk, PLANE_TYPE_UV, a, l, rtcd);
+    a[1] = a[0];
+    l[1] = l[0];
+  }
+}
+#endif
+
 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
   int tx_type = x->e_mbd.mode_info_context->mbmi.txfm_size;
   vp8_build_inter_predictors_mb(&x->e_mbd);
 
   vp8_subtract_mb(rtcd, x);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_transform_mb_16x16(x);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_transform_mb_8x8(x);
   else
     transform_mb(x);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_quantize_mb_16x16(x);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_quantize_mb_8x8(x);
   else
     vp8_quantize_mb(x);
 
   if (x->optimize) {
+#if CONFIG_TX16X16
+    if (tx_type == TX_16X16)
+      optimize_mb_16x16(x, rtcd);
+    else
+#endif
     if (tx_type == TX_8X8)
       optimize_mb_8x8(x, rtcd);
     else
       optimize_mb(x, rtcd);
   }
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
   else
@@ -1111,6 +1398,11 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
 
   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_transform_mby_16x16(x);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_transform_mby_8x8(x);
   else
@@ -1118,6 +1410,11 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
 
   vp8_quantize_mby(x);
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16)
+    vp8_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+  else
+#endif
   if (tx_type == TX_8X8)
     vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
   else
@@ -1126,3 +1423,4 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
   RECON_INVOKE(&rtcd->common->recon, recon_mby)
   (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
+
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index bfcd0f92c..228451936 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -121,6 +121,15 @@ void vp8_build_dcblock_8x8(MACROBLOCK *b);
 void vp8_optimize_mby_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 
+#if CONFIG_TX16X16
+void vp8_transform_mb_16x16(MACROBLOCK *mb);
+void vp8_transform_mby_16x16(MACROBLOCK *x);
+void vp8_transform_mbuv_16x16(MACROBLOCK *x);
+void vp8_transform_intra_mby_16x16(MACROBLOCK *x);
+void vp8_build_dcblock_16x16(MACROBLOCK *b);
+void vp8_optimize_mby_16x16(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+#endif
+
 void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
 #endif
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 695e9c69b..6390f3fe4 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -69,6 +69,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) {
   cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
 
   cpi->rtcd.fdct.short8x8                  = vp8_short_fdct8x8_c;
+#if CONFIG_TX16X16
+  cpi->rtcd.fdct.short16x16                = vp8_short_fdct16x16_c;
+#endif
   cpi->rtcd.fdct.haar_short2x2             = vp8_short_fhaar2x2_c;
   cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
   cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 5b6684ac8..e471cab89 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1161,10 +1161,16 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
   }
 
   if (cpi->sf.improved_dct) {
+#if CONFIG_TX16X16
+    cpi->mb.vp8_short_fdct16x16 = FDCT_INVOKE(&cpi->rtcd.fdct, short16x16);
+#endif
     cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);
     cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
     cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
   } else {
+#if CONFIG_TX16X16
+    cpi->mb.vp8_short_fdct16x16 = FDCT_INVOKE(&cpi->rtcd.fdct, short16x16);
+#endif
     cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);
     cpi->mb.vp8_short_fdct8x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
     cpi->mb.vp8_short_fdct4x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
@@ -1177,6 +1183,9 @@ void vp8_set_speed_features(VP8_COMP *cpi) {
   cpi->mb.quantize_b      = vp8_regular_quantize_b;
   cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
   cpi->mb.quantize_b_8x8  = vp8_regular_quantize_b_8x8;
+#if CONFIG_TX16X16
+  cpi->mb.quantize_b_16x16= vp8_regular_quantize_b_16x16;
+#endif
   cpi->mb.quantize_b_2x2  = vp8_regular_quantize_b_2x2;
 
   vp8cx_init_quantizer(cpi);
@@ -3641,6 +3650,9 @@ static void encode_frame_to_data_rate
   update_reference_frames(cm);
   vp8_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
   vp8_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
+#if CONFIG_TX16X16
+  vp8_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
+#endif
   vp8_adapt_coef_probs(&cpi->common);
   if (cpi->common.frame_type != KEY_FRAME) {
     vp8_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 2821aadd0..a1159cc5c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -91,9 +91,13 @@ typedef struct {
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
   vp8_prob coef_probs[BLOCK_TYPES]
-  [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
   vp8_prob coef_probs_8x8[BLOCK_TYPES_8X8]
-  [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+#if CONFIG_TX16X16
+  vp8_prob coef_probs_16x16[BLOCK_TYPES_16X16]
+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+#endif
 
   vp8_prob ymode_prob [VP8_YMODES - 1]; /* interframe intra mode probs */
   vp8_prob uv_mode_prob [VP8_YMODES][VP8_UV_MODES - 1];
@@ -390,6 +394,15 @@ typedef struct VP8_COMP {
   DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);
   DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);
 
+#if CONFIG_TX16X16
+  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
+#endif
+
   MACROBLOCK mb;
   VP8_COMMON common;
   vp8_writer bc, bc2;
@@ -540,6 +553,11 @@ typedef struct VP8_COMP {
   unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
   vp8_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
   unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+#if CONFIG_TX16X16
+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
+  vp8_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+#endif
 
   int gfu_boost;
   int last_boost;
@@ -598,6 +616,9 @@ typedef struct VP8_COMP {
   int skip_false_count[3];
   int t4x4_count;
   int t8x8_count;
+#if CONFIG_TX16X16
+  int t16x16_count;
+#endif
 
   unsigned char *segmentation_map;
 
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 78892fc3f..81b4d12b3 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -302,15 +302,93 @@ void vp8_quantize_mb_8x8(MACROBLOCK *x) {
 void vp8_quantize_mbuv_8x8(MACROBLOCK *x) {
   int i;
 
-  for (i = 16; i < 24; i ++) {
+  for (i = 16; i < 24; i ++)
     x->e_mbd.block[i].eob = 0;
-  }
   for (i = 16; i < 24; i += 4)
     x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);
 }
 
 
 
+
+#if CONFIG_TX16X16
+// Zeroes eob on blocks 0..15 and 24, then runs the 16x16 quantizer on block 0.
+void vp8_quantize_mby_16x16(MACROBLOCK *x) {
+  int blk;
+  for (blk = 15; blk >= 0; --blk) x->e_mbd.block[blk].eob = 0;
+  x->e_mbd.block[24].eob = 0;
+  x->quantize_b_16x16(x->block, x->e_mbd.block);
+}
+
+// Zeroes eob on all 25 blocks, then quantizes block 0 (16x16) and blocks 16, 20 (8x8).
+void vp8_quantize_mb_16x16(MACROBLOCK *x) {
+  BLOCKD *const bd = x->e_mbd.block;
+  int n;
+  for (n = 0; n < 25; ++n) bd[n].eob = 0;
+  x->quantize_b_16x16(x->block, bd);
+  for (n = 16; n < 24; n += 4) x->quantize_b_8x8(x->block + n, bd + n);
+}
+
+// U and V should use 8x8: zero eob on blocks 16..23, then quantize blocks 16
+// and 20 with the 8x8 quantizer.
+void vp8_quantize_mbuv_16x16(MACROBLOCK *x) {
+  BLOCKD *const bd = x->e_mbd.block;
+  int n;
+
+  for (n = 16; n < 24; ++n) bd[n].eob = 0;
+  for (n = 16; n < 24; n += 4) x->quantize_b_8x8(x->block + n, bd + n);
+}
+
+/* Regular quantizer for one 16x16 block: walks b->coeff in zig-zag order for
+ * up to b->eob_max_offset_16x16 positions, writes d->qcoeff and d->dqcoeff,
+ * and sets d->eob to one past the last position with a non-zero result. */
+void vp8_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  short *boost = b->zrun_zbin_boost_16x16;
+  short *src = b->coeff;
+  short *zbins = b->zbin_16x16;
+  short *rounds = b->round;
+  short *quants = b->quant;
+  unsigned char *shifts = b->quant_shift;
+  short *qout = d->qcoeff;
+  short *dqout = d->dqcoeff;
+  short *dequants = d->dequant;
+  short extra = b->zbin_extra;
+  int pos, last_nz = -1;
+
+  vpx_memset(qout, 0, 256*sizeof(short));
+  vpx_memset(dqout, 0, 256*sizeof(short));
+
+  for (pos = 0; pos < b->eob_max_offset_16x16; pos++) {
+    const int rc = vp8_default_zig_zag1d_16x16[pos];
+    const int ac = (rc != 0);            // 0 selects entry [0], 1 entry [1]
+    const int z = src[rc];
+    const int sz = (z >> 31);            // sign of z
+    const int zbin = zbins[ac] + *boost++ + extra;
+    int x = (z ^ sz) - sz;               // x = abs(z)
+    int y;
+
+    if (x < zbin)
+      continue;
+
+    x += rounds[ac];
+    y = ((int)(((int)(x * quants[ac]) >> 16) + x)) >> shifts[ac];
+    x = (y ^ sz) - sz;                   // get the sign back
+    qout[rc] = x;
+    dqout[rc] = x * dequants[ac];
+
+    if (y) {
+      last_nz = pos;
+      // a non-zero result rewinds the boost pointer to the table start
+      boost = b->zrun_zbin_boost_16x16;
+    }
+  }
+
+  d->eob = last_nz + 1;
+}
+#endif
+
+
+
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
@@ -337,20 +415,39 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) {
   int i;
   int quant_val;
   int Q;
-  int zbin_boost[16] = { 0,  0,  8, 10, 12, 14, 16, 20,
-                         24, 28, 32, 36, 40, 44, 44, 44
-                       };
-
-  int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
-                              14, 16, 18, 20, 22, 24, 26, 28,
-                              30, 32, 34, 36, 38, 40, 42, 44,
-                              46, 48, 48, 48, 48, 48, 48, 48,
-                              48, 48, 48, 48, 48, 48, 48, 48,
-                              48, 48, 48, 48, 48, 48, 48, 48,
-                              48, 48, 48, 48, 48, 48, 48, 48,
-                              48, 48, 48, 48, 48, 48, 48, 48
-                           };
-
+  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,
+                                      24, 28, 32, 36, 40, 44, 44, 44
+                                    };
+
+  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,
+                                          14, 16, 18, 20, 22, 24, 26, 28,
+                                          30, 32, 34, 36, 38, 40, 42, 44,
+                                          46, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48,
+                                          48, 48, 48, 48, 48, 48, 48, 48
+                                        };
+#if CONFIG_TX16X16
+  static const int zbin_boost_16x16[256] = {
+     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+  };
+#endif
   int qrounding_factor = 48;
 
 
@@ -372,33 +469,52 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) {
                  cpi->Y1quant_shift[Q] + 0, quant_val);
     cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
     cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#endif
     cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.Y1dequant[Q][0] = quant_val;
     cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y1_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+#endif
+
 
     quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
     invert_quant(cpi->Y2quant[Q] + 0,
                  cpi->Y2quant_shift[Q] + 0, quant_val);
     cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
     cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#endif
     cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.Y2dequant[Q][0] = quant_val;
     cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y2_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+#endif
 
     quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
     invert_quant(cpi->UVquant[Q] + 0,
                  cpi->UVquant_shift[Q] + 0, quant_val);
-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;;
-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;;
+    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+#endif
     cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.UVdequant[Q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_uv_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
+#if CONFIG_TX16X16
+    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+#endif
 
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
@@ -453,6 +569,25 @@ void vp8cx_init_quantizer(VP8_COMP *cpi) {
       cpi->zrun_zbin_boost_uv_8x8[Q][i] =
         ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;
     }
+
+#if CONFIG_TX16X16
+    // 16x16 structures. Same comment above applies.
+    for (i = 1; i < 256; i++) {
+      int rc = vp8_default_zig_zag1d_16x16[i];
+
+      quant_val = vp8_ac_yquant(Q);
+      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+
+      quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+    }
+#endif
   }
 }
 
@@ -491,10 +626,16 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) {
     x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];
     x->block[i].zbin = cpi->Y1zbin[QIndex];
     x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
+#if CONFIG_TX16X16
+    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+#endif
     x->block[i].round = cpi->Y1round[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
+#if CONFIG_TX16X16
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
+#endif
     x->block[i].zbin_extra = (short)zbin_extra;
 
     // Segment max eob offset feature.
@@ -503,9 +644,16 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) {
         get_segdata(xd, segment_id, SEG_LVL_EOB);
       x->block[i].eob_max_offset_8x8 =
         get_segdata(xd, segment_id, SEG_LVL_EOB);
+#if CONFIG_TX16X16
+      x->block[i].eob_max_offset_16x16 =
+        get_segdata(xd, segment_id, SEG_LVL_EOB);
+#endif
     } else {
       x->block[i].eob_max_offset = 16;
       x->block[i].eob_max_offset_8x8 = 64;
+#if CONFIG_TX16X16
+      x->block[i].eob_max_offset_16x16 = 256;
+#endif
     }
   }
 
@@ -520,10 +668,16 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) {
     x->block[i].quant_shift = cpi->UVquant_shift[QIndex];
     x->block[i].zbin = cpi->UVzbin[QIndex];
     x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];
+#if CONFIG_TX16X16
+    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];
+#endif
     x->block[i].round = cpi->UVround[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
+#if CONFIG_TX16X16
+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
+#endif
 
     x->block[i].zbin_extra = (short)zbin_extra;
 
@@ -549,10 +703,16 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x) {
   x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];
   x->block[24].zbin = cpi->Y2zbin[QIndex];
   x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];
+#if CONFIG_TX16X16
+  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];
+#endif
   x->block[24].round = cpi->Y2round[QIndex];
   x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
   x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
   x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
+#if CONFIG_TX16X16
+  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
+#endif
   x->block[24].zbin_extra = (short)zbin_extra;
 
   // TBD perhaps not use for Y2
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index 4106064f5..98fed4c11 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -46,6 +46,13 @@ extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
 #endif
 extern prototype_quantize_block(vp8_quantize_quantb_8x8);
 
+#if CONFIG_TX16X16
+#ifndef vp8_quantize_quantb_16x16
+#define vp8_quantize_quantb_16x16 vp8_regular_quantize_b_16x16
+#endif
+extern prototype_quantize_block(vp8_quantize_quantb_16x16);
+#endif
+
 #ifndef vp8_quantize_quantb_2x2
 #define vp8_quantize_quantb_2x2 vp8_regular_quantize_b_2x2
 #endif
@@ -70,6 +77,13 @@ extern prototype_quantize_mb(vp8_quantize_mby);
 extern prototype_quantize_mb(vp8_quantize_mby_8x8);
 extern prototype_quantize_mb(vp8_quantize_mbuv_8x8);
 
+#if CONFIG_TX16X16
+void vp8_quantize_mb_16x16(MACROBLOCK *x);
+extern prototype_quantize_block(vp8_quantize_quantb_16x16);
+extern prototype_quantize_mb(vp8_quantize_mby_16x16);
+extern prototype_quantize_mb(vp8_quantize_mbuv_16x16);
+#endif
+
 struct VP8_COMP;
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 43f38568a..b0f92c942 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -177,6 +177,9 @@ void vp8_save_coding_context(VP8_COMP *cpi) {
 #if CONFIG_SWITCHABLE_INTERP
   vp8_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
 #endif
+#if CONFIG_TX16X16
+  vp8_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
+#endif
 }
 
 void vp8_restore_coding_context(VP8_COMP *cpi) {
@@ -233,6 +236,9 @@ void vp8_restore_coding_context(VP8_COMP *cpi) {
 #if CONFIG_SWITCHABLE_INTERP
   vp8_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
 #endif
+#if CONFIG_TX16X16
+  vp8_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
+#endif
 }
 
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 66f342302..a2cd2651a 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -366,6 +366,13 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int QIndex) {
     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
     BLOCK_TYPES_8X8);
 
+#if CONFIG_TX16X16
+  fill_token_costs(
+    cpi->mb.token_costs_16x16,
+    (const vp8_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
+    BLOCK_TYPES_16X16);
+#endif
+
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp8_init_mode_costs(cpi);
@@ -809,6 +816,72 @@ static void macro_block_yrd_8x8(MACROBLOCK *mb,
   *Rate = vp8_rdcost_mby_8x8(mb);
 }
 
+#if CONFIG_TX16X16
+/* Returns the token cost of 16x16 block b: each coefficient from scan position
+ * !type up to b->eob, plus an EOB token when the final position is below 256.
+ * On return *a and *l are 1 if the final position moved past !type, else 0. */
+static int cost_coeffs_16x16(MACROBLOCK *mb, BLOCKD *b, int type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  const int first = !type;   /* start at coef 0, unless Y with Y2 */
+  const int eob = b->eob;
+  const short *const qcoeff = b->qcoeff;
+  int pos = first;
+  int ctx;                   /* surrounding block/prev coef predictor */
+  int total = 0;
+
+  VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+  while (pos < eob) {
+    const int val = qcoeff[vp8_default_zig_zag1d_16x16[pos]];
+    const int tok = vp8_dct_value_tokens_ptr[val].Token;
+    total += mb->token_costs_16x16[type][vp8_coef_bands_16x16[pos]][ctx][tok];
+    total += vp8_dct_value_cost_ptr[val];
+    ctx = vp8_prev_token_class[tok];
+    ++pos;
+  }
+
+  if (pos < 256)
+    total += mb->token_costs_16x16[type][vp8_coef_bands_16x16[pos]]
+             [ctx][DCT_EOB_TOKEN];
+  *a = *l = (pos != first);
+  return total;
+}
+
+/* Returns cost_coeffs_16x16() for block 0 with PLANE_TYPE_Y_WITH_DC. The
+ * above/left entropy contexts are copied first, so the cost function only
+ * modifies the local copies. */
+static int vp8_rdcost_mby_16x16(MACROBLOCK *mb) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)&above_copy;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)&left_copy;
+
+  vpx_memcpy(&above_copy, xd->above_context, sizeof(above_copy));
+  vpx_memcpy(&left_copy, xd->left_context, sizeof(left_copy));
+
+  return cost_coeffs_16x16(mb, &xd->block[0], PLANE_TYPE_Y_WITH_DC, ta, tl);
+}
+
+
+/* Rate/distortion of the Y plane under the 16x16 transform: runs submby on
+ * the source and predictor, forward-transforms and quantizes, then stores
+ * the mberr result shifted right by 2 in *Distortion and the coefficient
+ * cost from vp8_rdcost_mby_16x16() in *Rate. */
+static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+                                  const VP8_ENCODER_RTCD *rtcd) {
+  BLOCK *const y0 = &mb->block[0];
+  int err;
+
+  ENCODEMB_INVOKE(&rtcd->encodemb, submby)(mb->src_diff, *(y0->base_src),
+                                           mb->e_mbd.predictor, y0->src_stride);
+  vp8_transform_mby_16x16(mb);
+  vp8_quantize_mby_16x16(mb);
+
+  err = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(mb, 0);
+  *Distortion = err >> 2;
+  *Rate = vp8_rdcost_mby_16x16(mb);
+}
+#endif
+
 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
@@ -1121,7 +1194,12 @@ static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
       }
 #endif
 
-      macro_block_yrd_8x8(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));
+#if CONFIG_TX16X16
+      if (mode <= TM_PRED)
+        macro_block_yrd_16x16(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));
+      else
+#endif
+        macro_block_yrd_8x8(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));
       // FIXME add compoundmode cost
       // FIXME add rate for mode2
       rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
@@ -3081,16 +3159,33 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0);
         }
         break;
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
         case D45_PRED:
         case D135_PRED:
         case D117_PRED:
         case D153_PRED:
         case D27_PRED:
         case D63_PRED:
-        case DC_PRED:
-        case V_PRED:
-        case H_PRED:
-        case TM_PRED:
+#if CONFIG_TX16X16
+          // FIXME: breaks lossless since 4x4 isn't allowed
+          x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+          // FIXME compound intra prediction
+          RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+              (&x->e_mbd);
+          macro_block_yrd_16x16(x, &rate_y, &distortion,
+                                IF_RTCD(&cpi->rtcd));
+          rate2 += rate_y;
+          distortion2 += distortion;
+          rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
+          rate2 += uv_intra_rate_8x8;
+          rate_uv = uv_intra_rate_tokenonly_8x8;
+          distortion2 += uv_intra_distortion_8x8;
+          distortion_uv = uv_intra_distortion_8x8;
+          break;
+#else
           x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
           // FIXME compound intra prediction
           RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
@@ -3116,6 +3211,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             distortion_uv = uv_intra_distortion;
           }
           break;
+#endif
 
         case NEWMV: {
           int thissme;
@@ -3269,7 +3365,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           }
 
         case ZEROMV:
-
           // Trap vectors that reach beyond the UMV borders
           // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point
           // because of the lack of break statements in the previous two cases.
@@ -3348,12 +3443,23 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
 
           // Y cost and distortion
-          if (cpi->common.txfm_mode == ALLOW_8X8)
-            macro_block_yrd_8x8(x, &rate_y, &distortion,
-                                IF_RTCD(&cpi->rtcd));
-          else
-            macro_block_yrd(x, &rate_y, &distortion,
-                            IF_RTCD(&cpi->rtcd));
+#if CONFIG_TX16X16
+          if (this_mode == ZEROMV ||
+              this_mode == NEARESTMV ||
+              this_mode == NEARMV ||
+              this_mode == NEWMV)
+            macro_block_yrd_16x16(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd));
+          else {
+#endif
+            if (cpi->common.txfm_mode == ALLOW_8X8)
+              macro_block_yrd_8x8(x, &rate_y, &distortion,
+                                  IF_RTCD(&cpi->rtcd));
+            else
+              macro_block_yrd(x, &rate_y, &distortion,
+                              IF_RTCD(&cpi->rtcd));
+#if CONFIG_TX16X16
+          }
+#endif
 
           rate2 += rate_y;
           distortion2 += distortion;
@@ -3361,7 +3467,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
           // UV cost and distortion
           vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
 
-          if (cpi->common.txfm_mode == ALLOW_8X8)
+          if (cpi->common.txfm_mode == ALLOW_8X8
+#if CONFIG_TX16X16
+              || this_mode == ZEROMV ||
+              this_mode == NEARESTMV ||
+              this_mode == NEARMV ||
+              this_mode == NEWMV
+#endif
+              )
             rd_inter16x16_uv_8x8(cpi, x, &rate_uv,
                                  &distortion_uv,
                                  cpi->common.full_pixel);
@@ -3487,9 +3600,21 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                                                &x->e_mbd.predictor[320], 16, 8);
 
         /* Y cost and distortion */
-        if (cpi->common.txfm_mode == ALLOW_8X8)
+        if (cpi->common.txfm_mode == ALLOW_8X8
+#if CONFIG_TX16X16
+            || this_mode == ZEROMV ||
+            this_mode == NEARESTMV ||
+            this_mode == NEARMV ||
+            this_mode == NEWMV
+#endif
+            )
+#if CONFIG_TX16X16
+          macro_block_yrd_16x16(x, &rate_y, &distortion,
+                                IF_RTCD(&cpi->rtcd));
+#else
           macro_block_yrd_8x8(x, &rate_y, &distortion,
                               IF_RTCD(&cpi->rtcd));
+#endif
         else
           macro_block_yrd(x, &rate_y, &distortion,
                           IF_RTCD(&cpi->rtcd));
@@ -3498,7 +3623,14 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         distortion2 += distortion;
 
         /* UV cost and distortion */
-        if (cpi->common.txfm_mode == ALLOW_8X8)
+        if (cpi->common.txfm_mode == ALLOW_8X8
+#if CONFIG_TX16X16
+            || this_mode == ZEROMV ||
+            this_mode == NEARESTMV ||
+            this_mode == NEARMV ||
+            this_mode == NEWMV
+#endif
+            )
           rd_inter16x16_uv_8x8(cpi, x, &rate_uv,
                                &distortion_uv,
                                cpi->common.full_pixel);
@@ -3541,6 +3673,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                       && this_mode != B_PRED
                       && this_mode != I8X8_PRED);
 
+#if CONFIG_TX16X16
+        // These modes are costed with the 16x16 luma transform elsewhere in
+        // this function, so the skip decision must use the 16x16 test too.
+        if (this_mode <= TM_PRED || this_mode == NEWMV ||
+            this_mode == ZEROMV || this_mode == NEARESTMV ||
+            this_mode == NEARMV)
+          mb_skippable = mb_is_skippable_16x16(&x->e_mbd);
+        else
+#endif
         if ((cpi->common.txfm_mode == ALLOW_8X8) && has_y2) {
           if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME)
             mb_skippable = mb_is_skippable_8x8(&x->e_mbd);
@@ -4002,10 +4143,25 @@ int vp8cx_pick_mode_inter_macroblock
     }
 
     /* test code: set transform size based on mode selection */
+#if CONFIG_TX16X16
+    if (xd->mode_info_context->mbmi.mode <= TM_PRED ||
+        xd->mode_info_context->mbmi.mode == NEWMV ||
+        xd->mode_info_context->mbmi.mode == ZEROMV ||
+        xd->mode_info_context->mbmi.mode == NEARMV ||
+        xd->mode_info_context->mbmi.mode == NEARESTMV) {
+      xd->mode_info_context->mbmi.txfm_size = TX_16X16;
+      cpi->t16x16_count++;
+    }
+    else if (cpi->common.txfm_mode == ALLOW_8X8
+        && xd->mode_info_context->mbmi.mode != I8X8_PRED
+        && xd->mode_info_context->mbmi.mode != B_PRED
+        && xd->mode_info_context->mbmi.mode != SPLITMV) {
+#else
     if (cpi->common.txfm_mode == ALLOW_8X8
         && xd->mode_info_context->mbmi.mode != I8X8_PRED
         && xd->mode_info_context->mbmi.mode != B_PRED
         && xd->mode_info_context->mbmi.mode != SPLITMV) {
+#endif
       xd->mode_info_context->mbmi.txfm_size = TX_8X8;
       cpi->t8x8_count++;
     } else {
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 81ba6f2be..dac18c6db 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -26,17 +26,23 @@
 #ifdef ENTROPY_STATS
 INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-extern unsigned int tree_update_hist [BLOCK_TYPES]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES][2];
-extern unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES] [2];
+#if CONFIG_TX16X16
+INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
+extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
+extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+#if CONFIG_TX16X16
+extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+#endif
 #endif
 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
 void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+#if CONFIG_TX16X16
+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+#endif
 void vp8_fix_contexts(MACROBLOCKD *x);
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
@@ -103,6 +109,54 @@ static void fill_value_tokens() {
   vp8_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
+#if CONFIG_TX16X16
+/* Writes the tokens of one 16x16 block at *tp and advances *tp past them.
+ * Coefficients are read in vp8_default_zig_zag1d_16x16 order; a DCT_EOB_TOKEN
+ * follows the last one unless the scan stops at the 256-entry (or segment
+ * SEG_LVL_EOB) limit.  Each token is tallied in cpi->coef_counts_16x16, and
+ * *a and *l are both set to (final scan position != !type). */
+static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, const BLOCKD *const b, TOKENEXTRA **tp,
+                                      const int type, const FRAME_TYPE frametype, ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l, VP8_COMP *cpi) {
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const short *const coeffs = b->qcoeff;
+  const int eob = b->eob;      /* one beyond last nonzero coeff */
+  TOKENEXTRA *out = *tp;       /* next free token slot */
+  int limit = 256;             /* scan positions in a 16x16 block */
+  int pos = 0;                 /* scan position; always starts at DC */
+  int ctx;                     /* near block/prev token context index */
+
+  if (segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    limit = get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+  VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+  for (;;) {
+    const int band = vp8_coef_bands_16x16[pos];
+    int token = DCT_EOB_TOKEN;
+
+    if (pos < eob) {
+      const int v = coeffs[vp8_default_zig_zag1d_16x16[pos]];
+      assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
+      out->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+      token      = vp8_dct_value_tokens_ptr[v].Token;
+    }
+
+    out->Token = token;
+    out->context_tree = cpi->common.fc.coef_probs_16x16[type][band][ctx];
+    out->skip_eob_node = ctx == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+    ++cpi->coef_counts_16x16[type][band][ctx][token];
+    ctx = vp8_prev_token_class[token];
+    ++out;
+    /* Stop after the EOB token, or once the coefficient limit is hit. */
+    if (pos >= eob || ++pos >= limit)
+      break;
+  }
+
+  *tp = out;
+  *a = *l = (pos != !type); /* 0 <-> all coeff data is zero */
+}
+#endif
+
 static void tokenize2nd_order_b_8x8
 (
   MACROBLOCKD *xd,
@@ -170,12 +224,8 @@ static void tokenize2nd_order_b_8x8
 
 }
 
-static void tokenize2nd_order_b
-(
-  MACROBLOCKD *xd,
-  TOKENEXTRA **tp,
-  VP8_COMP *cpi
-) {
+static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,
+                                VP8_COMP *cpi) {
   int pt;             /* near block/prev token context index */
   int c;              /* start at DC */
   TOKENEXTRA *t = *tp;/* store tokens starting here */
@@ -188,9 +238,8 @@ static void tokenize2nd_order_b
   int seg_eob = 16;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
 
-  if (segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
+  if (segfeature_active(xd, segment_id, SEG_LVL_EOB))
     seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB);
-  }
 
   b = xd->block + 24;
   qcoeff_ptr = b->qcoeff;
@@ -542,14 +591,10 @@ static void tokenize1st_order_b
   unsigned int block;
   const BLOCKD *b;
   int pt;             /* near block/prev token context index */
-  int c;
-  int token;
+  int band, rc, v, c, token;
   TOKENEXTRA *t = *tp;/* store tokens starting here */
   const short *qcoeff_ptr;
-  ENTROPY_CONTEXT *a;
-  ENTROPY_CONTEXT *l;
-  int band, rc, v;
-  int tmp1, tmp2;
+  ENTROPY_CONTEXT *a, *l;
 
   int seg_eob = 16;
   int segment_id = xd->mode_info_context->mbmi.segment_id;
@@ -561,11 +606,9 @@ static void tokenize1st_order_b
   b = xd->block;
   /* Luma */
   for (block = 0; block < 16; block++, b++) {
-    tmp1 = vp8_block2above[block];
-    tmp2 = vp8_block2left[block];
     qcoeff_ptr = b->qcoeff;
-    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
-    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[block];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[block];
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
     c = type ? 0 : 1;
@@ -609,11 +652,9 @@ static void tokenize1st_order_b
   }
   /* Chroma */
   for (block = 16; block < 24; block++, b++) {
-    tmp1 = vp8_block2above[block];
-    tmp2 = vp8_block2left[block];
     qcoeff_ptr = b->qcoeff;
-    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;
-    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;
+    a = (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[block];
+    l = (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[block];
 
     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
 
@@ -701,6 +742,20 @@ int mb_is_skippable_8x8(MACROBLOCKD *x) {
   return (mby_is_skippable_8x8(x) & mbuv_is_skippable_8x8(x));
 }
 
+#if CONFIG_TX16X16
+/* Luma skip test for a macroblock coded with the 16x16 transform.  Only
+ * block 0 is inspected; with no second-order block to hold the DC, the
+ * plane counts as skippable only when block 0 has eob == 0.
+ * Returns 1 when skippable, 0 otherwise. */
+int mby_is_skippable_16x16(MACROBLOCKD *x) {
+  const int has_coeffs = (x->block[0].eob != 0);
+  return !has_coeffs;
+}
+
+int mb_is_skippable_16x16(MACROBLOCKD *x) {  /* 16x16 luma AND 8x8 chroma */
+  return mby_is_skippable_16x16(x) & mbuv_is_skippable_8x8(x);
+}
+#endif
 
 void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
   int plane_type;
@@ -730,16 +785,32 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
   has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED
                   && x->mode_info_context->mbmi.mode != I8X8_PRED
                   && x->mode_info_context->mbmi.mode != SPLITMV);
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16) has_y2_block = 0; // Because of inter frames
+#endif
 
-  x->mode_info_context->mbmi.mb_skip_coeff =
-    ((tx_type == TX_8X8) ?
-     mb_is_skippable_8x8(x) :
-     mb_is_skippable(x, has_y2_block));
+  switch (tx_type) {
+#if CONFIG_TX16X16
+    case TX_16X16:
+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(x);
+      break;
+#endif
+    case TX_8X8:
+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(x);
+      break;
+    default:
+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);
+      break;
+  }
 
   if (x->mode_info_context->mbmi.mb_skip_coeff) {
     cpi->skip_true_count[mb_skip_context] += skip_inc;
-
     if (!cpi->common.mb_no_coeff_skip) {
+#if CONFIG_TX16X16
+      if (tx_type == TX_16X16)
+        vp8_stuff_mb_16x16(cpi, x, t);
+      else
+#endif
       if (tx_type == TX_8X8)
         vp8_stuff_mb_8x8(cpi, x, t);
       else
@@ -766,9 +837,28 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
       tokenize2nd_order_b(x, t, cpi);
 
     plane_type = 0;
-
   }
 
+#if CONFIG_TX16X16
+  if (tx_type == TX_16X16) {
+    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;
+    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;
+    tokenize1st_order_b_16x16(x, x->block, t, 3, x->frame_type, A, L, cpi);
+    for (b = 1; b < 16; b++) {
+      *(A + vp8_block2above[b]) = *(A);
+      *(L + vp8_block2left[b] ) = *(L);
+    }
+    for (b = 16; b < 24; b += 4) {
+      tokenize1st_order_b_8x8(x, x->block + b, t, 2, x->frame_type,
+          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi);
+      *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
+      *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
+    }
+    vpx_memset(&A[8], 0, sizeof(A[8]));
+    vpx_memset(&L[8], 0, sizeof(L[8]));
+  }
+  else
+#endif
   if (tx_type == TX_8X8) {
     ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
     ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
@@ -827,15 +917,20 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
 
 
 #ifdef ENTROPY_STATS
-
 void init_context_counters(void) {
   FILE *f = fopen("context.bin", "rb");
   if (!f) {
     vpx_memset(context_counters, 0, sizeof(context_counters));
     vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+#if CONFIG_TX16X16
+    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
+#endif
   } else {
     fread(context_counters, sizeof(context_counters), 1, f);
     fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+#if CONFIG_TX16X16
+    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+#endif
     fclose(f);
   }
 
@@ -843,15 +938,20 @@ void init_context_counters(void) {
   if (!f) {
     vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
     vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
+#if CONFIG_TX16X16
+    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
+#endif
   } else {
     fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
     fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+#if CONFIG_TX16X16
+    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+#endif
     fclose(f);
   }
 }
 
 void print_context_counters() {
-
   int type, band, pt, t;
   FILE *f = fopen("context.c", "w");
 
@@ -892,7 +992,6 @@ void print_context_counters() {
   fprintf(f, "static const unsigned int\nvp8_default_coef_counts_8x8"
           "[BLOCK_TYPES_8X8] [COEF_BANDS]"
           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-
   type = 0;
   do {
     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
@@ -921,26 +1020,54 @@ void print_context_counters() {
 
     fprintf(f, "\n  }");
   } while (++type < BLOCK_TYPES_8X8);
+  fprintf(f, "\n};\n");
 
+#if CONFIG_TX16X16
+  fprintf(f, "static const unsigned int\nvp8_default_coef_counts_16x16"
+          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        fprintf(f, "%s\n      {", Comma(pt));
+        t = 0;
+        do {
+          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
+          const int y = (int) x;
+
+          assert(x == (INT64) y);  /* no overflow handling yet */
+          fprintf(f, "%s %d", Comma(t), y);
+
+        } while (++t < MAX_ENTROPY_TOKENS);
+
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+
+      fprintf(f, "\n    }");
+
+    } while (++band < COEF_BANDS);
+
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
   fprintf(f, "\n};\n");
+#endif
 
   fprintf(f, "static const vp8_prob\n"
           "vp8_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
           "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
   type = 0;
-
   do {
     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-
     band = 0;
-
     do {
       fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-
       pt = 0;
-
       do {
-
         unsigned int branch_ct [ENTROPY_NODES] [2];
         unsigned int coef_counts[MAX_ENTROPY_TOKENS];
         vp8_prob coef_probs[ENTROPY_NODES];
@@ -952,7 +1079,6 @@ void print_context_counters() {
         fprintf(f, "%s\n      {", Comma(pt));
 
         t = 0;
-
         do {
           fprintf(f, "%s %d", Comma(t), coef_probs[t]);
 
@@ -960,11 +1086,8 @@ void print_context_counters() {
 
         fprintf(f, "}");
       } while (++pt < PREV_COEF_CONTEXTS);
-
       fprintf(f, "\n    }");
-
     } while (++band < COEF_BANDS);
-
     fprintf(f, "\n  }");
   } while (++type < BLOCK_TYPES);
   fprintf(f, "\n};\n");
@@ -973,19 +1096,13 @@ void print_context_counters() {
           "vp8_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
           "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
   type = 0;
-
   do {
     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-
     band = 0;
-
     do {
       fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-
       pt = 0;
-
       do {
-
         unsigned int branch_ct [ENTROPY_NODES] [2];
         unsigned int coef_counts[MAX_ENTROPY_TOKENS];
         vp8_prob coef_probs[ENTROPY_NODES];
@@ -994,34 +1111,65 @@ void print_context_counters() {
         vp8_tree_probs_from_distribution(
           MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
           coef_probs, branch_ct, coef_counts, 256, 1);
-
         fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
 
+        t = 0;
         do {
           fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-
         } while (++t < ENTROPY_NODES);
-
         fprintf(f, "}");
       } while (++pt < PREV_COEF_CONTEXTS);
-
       fprintf(f, "\n    }");
-
     } while (++band < COEF_BANDS);
-
     fprintf(f, "\n  }");
   } while (++type < BLOCK_TYPES_8X8);
   fprintf(f, "\n};\n");
 
+#if CONFIG_TX16X16
+  fprintf(f, "static const vp8_prob\n"
+          "vp8_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
+  type = 0;
+  do {
+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    band = 0;
+    do {
+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      pt = 0;
+      do {
+        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
+        vp8_prob coef_probs[ENTROPY_NODES];
+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
+        vp8_tree_probs_from_distribution(
+          MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
+          coef_probs, branch_ct, coef_counts, 256, 1);
+        fprintf(f, "%s\n      {", Comma(pt));
+
+        t = 0;
+        do {
+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
+        } while (++t < ENTROPY_NODES);
+        fprintf(f, "}");
+      } while (++pt < PREV_COEF_CONTEXTS);
+      fprintf(f, "\n    }");
+    } while (++band < COEF_BANDS);
+    fprintf(f, "\n  }");
+  } while (++type < BLOCK_TYPES_16X16);
+  fprintf(f, "\n};\n");
+#endif
+
   fclose(f);
 
   f = fopen("context.bin", "wb");
   fwrite(context_counters, sizeof(context_counters), 1, f);
   fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+#if CONFIG_TX16X16
+  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+#endif
   fclose(f);
 }
-
 #endif
 
 
@@ -1151,6 +1299,50 @@ void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
 }
 
 
+#if CONFIG_TX16X16
+/* Emits the lone DCT_EOB_TOKEN for an all-zero 16x16 luma block, counts it,
+ * advances *tp and zeroes *a / *l.  NOTE(review): band 1 is hard-coded; the
+ * tokenizer uses vp8_coef_bands_16x16[0] for token 0 - confirm they agree. */
+static __inline
+void stuff1st_order_b_16x16(const BLOCKD *const b, TOKENEXTRA **tp, const FRAME_TYPE frametype,
+                            ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi) {
+  TOKENEXTRA *const tok = *tp;  /* slot that receives the EOB token */
+  int ctx;                      /* near block/prev token context index */
+
+  (void) frametype;
+  (void) b;
+  VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+  tok->Token = DCT_EOB_TOKEN;
+  tok->context_tree = cpi->common.fc.coef_probs_16x16[3][1][ctx];
+  tok->skip_eob_node = 0;
+  ++cpi->coef_counts_16x16[3][1][ctx][DCT_EOB_TOKEN];
+  *tp = tok + 1;
+  *a = *l = 0;  /* 0 <-> all coeff data is zero */
+}
+
+/* Stuffs EOB-only tokens for a TX_16X16 macroblock being coded as skipped. */
+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)x->above_context;
+  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)x->left_context;
+  int b;
+
+  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi);
+  for (b = 1; b < 16; b++) {  /* copy entry 0 to every luma context slot */
+    A[vp8_block2above[b]] = A[0];
+    L[vp8_block2left[b]] = L[0];
+  }
+  for (b = 16; b < 24; b += 4) {  /* _8x8 index tables, as in the tokenizer */
+    ENTROPY_CONTEXT *const a = A + vp8_block2above_8x8[b];
+    ENTROPY_CONTEXT *const l = L + vp8_block2left_8x8[b];
+    stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type, a, l, cpi);
+    a[1] = a[0];  /* mirror the result into the neighbouring entry */
+    l[1] = l[0];
+  }
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+}
+#endif
+
 static __inline void stuff2nd_order_b
 (
   TOKENEXTRA **tp,
@@ -1215,7 +1407,6 @@ void stuff1st_order_buv
   ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
   pt = 0; /* 0 <-> all coeff data is zero */
   *a = *l = pt;
-
 }
 
 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
@@ -1241,9 +1432,13 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
 }
 void vp8_fix_contexts(MACROBLOCKD *x) {
   /* Clear entropy contexts for Y2 blocks */
-  if (x->mode_info_context->mbmi.mode != B_PRED
+  if ((x->mode_info_context->mbmi.mode != B_PRED
       && x->mode_info_context->mbmi.mode != I8X8_PRED
-      && x->mode_info_context->mbmi.mode != SPLITMV) {
+      && x->mode_info_context->mbmi.mode != SPLITMV)
+#if CONFIG_TX16X16
+      || x->mode_info_context->mbmi.txfm_size == TX_16X16
+#endif
+      ) {
     vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
     vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
   } else {
diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
index 4ee676e7f..4d2c74eb3 100644
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h
@@ -44,8 +44,11 @@ void print_context_counters();
 
 extern INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
 extern INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
+#if CONFIG_TX16X16
+extern INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+#endif
 #endif
+
 extern const int *vp8_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
  *  improve cache locality, since it's needed for costing when the rest of the
-- 
cgit v1.2.3