From fcbff9ee04f5b67ce79fd329333c8b1970d9318d Mon Sep 17 00:00:00 2001
From: Jingning Han <jingning@google.com>
Date: Thu, 2 Aug 2012 09:07:33 -0700
Subject: Replacing the 8x8 DCT with 8x8 ADST/DCT for I8x8

Fixed the code review comments.

Under the htrans8x8 experiment the 8X8 DCT in the
I8X8 mode is replaced with a combination of 8X8 ADST and
DCT.

Overall coding gains with the htrans8x8 experiment are:
derf:   0.486
std-hd: 1.040
hd:     1.063
yt:     0.506

Note that part of the gain comes from bigger transforms
(8x8 instead of 4x4) and part comes from replacing the DCT
with the ADST.

Change-Id: I92ca6bbfce11b4165d612b81d9adfad4d010c775
---
 configure                       |   2 +-
 vp8/common/blockd.h             |  71 ++++++++++++++++++-
 vp8/common/default_coef_probs.h |   2 +-
 vp8/common/entropy.h            |   4 +-
 vp8/common/idctllm.c            | 152 +++++++++++++++++++++++++++++++++++++++-
 vp8/decoder/decodframe.c        |  37 +++-------
 vp8/decoder/dequantize.c        |  45 ++++++++++++
 vp8/decoder/detokenize.c        |   6 +-
 vp8/encoder/dct.c               | 147 ++++++++++++++++++++++++++++++++++++++
 vp8/encoder/encodeintra.c       |  38 ++++------
 vp8/encoder/rdopt.c             |  35 +++------
 vp8/encoder/tokenize.c          |   4 +-
 12 files changed, 452 insertions(+), 91 deletions(-)

diff --git a/configure b/configure
index 75b93f4d1..fc998d05e 100755
--- a/configure
+++ b/configure
@@ -223,8 +223,8 @@ EXPERIMENT_LIST="
     pred_filter
     lossless
     hybridtransform
+    hybridtransform8x8
     switchable_interp
-    htrans8x8
     tx16x16
 "
 CONFIG_LIST="
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 64fc06546..3c43a1e9a 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -54,7 +54,6 @@ typedef struct {
 #define PLANE_TYPE_UV         2
 #define PLANE_TYPE_Y_WITH_DC  3
 
-
 typedef char ENTROPY_CONTEXT;
 typedef struct {
   ENTROPY_CONTEXT y1[4];
@@ -179,6 +178,50 @@ typedef enum {
   B_MODE_COUNT
 } B_PREDICTION_MODE;
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+// Convert MB_PREDICTION_MODE to B_PREDICTION_MODE. Unmapped modes assert; if
+// assert() is compiled out (NDEBUG) they return B_DC_PRED, not garbage.
+static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
+  B_PREDICTION_MODE b_mode = B_DC_PRED;
+  switch (mode) {
+    case DC_PRED:
+      b_mode = B_DC_PRED;
+      break;
+    case V_PRED:
+      b_mode = B_VE_PRED;
+      break;
+    case H_PRED:
+      b_mode = B_HE_PRED;
+      break;
+    case TM_PRED:
+      b_mode = B_TM_PRED;
+      break;
+    case D45_PRED:
+      b_mode = B_LD_PRED;
+      break;
+    case D135_PRED:
+      b_mode = B_RD_PRED;
+      break;
+    case D117_PRED:
+      b_mode = B_VR_PRED;
+      break;
+    case D153_PRED:
+      b_mode = B_HD_PRED;
+      break;
+    case D27_PRED:  // NOTE(review): D27->VL / D63->HU may be swapped; confirm
+      b_mode = B_VL_PRED;
+      break;
+    case D63_PRED:
+      b_mode = B_HU_PRED;
+      break;
+    default :
+      assert(0);  // debug-only trap for modes with no mapping above
+      break;
+  }
+  return b_mode;
+}
+#endif
+
 #define VP8_BINTRAMODES (B_HU_PRED + 1)  /* 10 */
 #define VP8_SUBMVREFS (1 + NEW4X4 - LEFT4X4)
 
@@ -389,6 +432,32 @@ typedef struct MacroBlockD {
 
 } MACROBLOCKD;
 
+#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM
+// Transform mapping: choose the transform type for block b from its intra
+// prediction mode and store it in b->bmi.as_mode.tx_type.
+static void txfm_map(BLOCKD *b, B_PREDICTION_MODE bmode) {
+  // ADST in both directions.
+  if (bmode == B_TM_PRED || bmode == B_RD_PRED) {
+    b->bmi.as_mode.tx_type = ADST_ADST;
+    return;
+  }
+
+  // ADST vertically, DCT horizontally.
+  if (bmode == B_VE_PRED || bmode == B_VR_PRED) {
+    b->bmi.as_mode.tx_type = ADST_DCT;
+    return;
+  }
+
+  // DCT vertically, ADST horizontally.
+  if (bmode == B_HE_PRED || bmode == B_HD_PRED || bmode == B_HU_PRED) {
+    b->bmi.as_mode.tx_type = DCT_ADST;
+    return;
+  }
+
+  // Every remaining mode keeps the DCT in both directions.
+  b->bmi.as_mode.tx_type = DCT_DCT;
+}
+#endif
 
 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h
index dfb0e5ea7..940e971b7 100644
--- a/vp8/common/default_coef_probs.h
+++ b/vp8/common/default_coef_probs.h
@@ -434,7 +434,7 @@ vp8_default_coef_probs_8x8[BLOCK_TYPES_8X8]
       { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
     }
   }
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   ,
   { /* block Type 3 */
     { /* Coeff Band 0 */
diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h
index 4497a3d47..190221c16 100644
--- a/vp8/common/entropy.h
+++ b/vp8/common/entropy.h
@@ -60,9 +60,9 @@ extern vp8_extra_bit_struct vp8_extra_bits[12];    /* indexed by token value */
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
 /* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-
 #define BLOCK_TYPES 4
-#if CONFIG_HTRANS8X8
+
+#if CONFIG_HYBRIDTRANSFORM8X8
 #define BLOCK_TYPES_8X8 4
 #else
 #define BLOCK_TYPES_8X8 3
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index e549fe098..616e4938e 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -35,6 +35,8 @@ static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
 static const int rounding = 0;
 
+// TODO: these transforms can be further converted into integer forms
+//       for complexity optimization
 #if CONFIG_HYBRIDTRANSFORM
 float idct_4[16] = {
   0.500000000000000,   0.653281482438188,   0.500000000000000,   0.270598050073099,
@@ -51,11 +53,52 @@ float iadst_4[16] = {
 };
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+float idct_8[64] = {  // row-major 8x8; DCT basis read by vp8_iht8x8llm_c
+  0.353553390593274,   0.490392640201615,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.277785116509801,   0.191341716182545,   0.097545161008064,
+  0.353553390593274,   0.415734806151273,   0.191341716182545,  -0.097545161008064,
+ -0.353553390593274,  -0.490392640201615,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.277785116509801,  -0.191341716182545,  -0.490392640201615,
+ -0.353553390593274,   0.097545161008064,   0.461939766255643,   0.415734806151273,
+  0.353553390593274,   0.097545161008064,  -0.461939766255643,  -0.277785116509801,
+  0.353553390593274,   0.415734806151273,  -0.191341716182545,  -0.490392640201615,
+  0.353553390593274,  -0.097545161008064,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.415734806151273,  -0.191341716182545,   0.490392640201615,
+  0.353553390593274,  -0.277785116509801,  -0.191341716182545,   0.490392640201615,
+ -0.353553390593274,  -0.097545161008064,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.415734806151273,   0.191341716182545,   0.097545161008064,
+ -0.353553390593274,   0.490392640201615,  -0.461939766255643,   0.277785116509801,
+  0.353553390593274,  -0.490392640201615,   0.461939766255643,  -0.415734806151273,
+  0.353553390593274,  -0.277785116509801,   0.191341716182545,  -0.097545161008064
+};
+
+float iadst_8[64] = {  // row-major 8x8; ADST basis read by vp8_iht8x8llm_c
+  0.089131608307533,   0.255357107325376,   0.387095214016349,   0.466553967085785,
+  0.483002021635509,   0.434217976756762,   0.326790388032145,   0.175227946595735,
+  0.175227946595735,   0.434217976756762,   0.466553967085785,   0.255357107325376,
+ -0.089131608307533,  -0.387095214016348,  -0.483002021635509,  -0.326790388032145,
+  0.255357107325376,   0.483002021635509,   0.175227946595735,  -0.326790388032145,
+ -0.466553967085785,  -0.089131608307533,   0.387095214016349,   0.434217976756762,
+  0.326790388032145,   0.387095214016349,  -0.255357107325376,  -0.434217976756762,
+  0.175227946595735,   0.466553967085786,  -0.089131608307534,  -0.483002021635509,
+  0.387095214016349,   0.175227946595735,  -0.483002021635509,   0.089131608307533,
+  0.434217976756762,  -0.326790388032145,  -0.255357107325377,   0.466553967085785,
+  0.434217976756762,  -0.089131608307533,  -0.326790388032145,   0.483002021635509,
+ -0.255357107325376,  -0.175227946595735,   0.466553967085785,  -0.387095214016348,
+  0.466553967085785,  -0.326790388032145,   0.089131608307533,   0.175227946595735,
+ -0.387095214016348,   0.483002021635509,  -0.434217976756762,   0.255357107325376,
+  0.483002021635509,  -0.466553967085785,   0.434217976756762,  -0.387095214016348,
+  0.326790388032145,  -0.255357107325375,   0.175227946595736,  -0.089131608307532
+};
+#endif
+
 #if CONFIG_HYBRIDTRANSFORM
 void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
   int i, j, k;
   float bufa[16], bufb[16]; // buffers are for floating-point test purpose
-                            // the implementation could be simplified in conjunction with integer transform
+                            // the implementation could be simplified in
+                            // conjunction with integer transform
   short *ip = input;
   short *op = output;
   int shortpitch = pitch >> 1;
@@ -158,6 +201,113 @@ void vp8_iht4x4llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
 }
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+void vp8_iht8x8llm_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[64], bufb[64]; // buffers are for floating-point test purpose
+                            // the implementation could be simplified in
+                            // conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+  int shortpitch = pitch >> 1;  // output row step in shorts (pitch / 2)
+  // Inverse 8x8 hybrid transform; tx_type picks ADST or DCT per direction.
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // basis tables for the vertical (ptv) and horizontal (pth) passes
+  float *ptv, *pth;
+
+  // load 64 contiguous input shorts into bufa as floats
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 8;
+    ip  += 8;
+  }
+
+  // vertical transformation: bufb[j][i] = sum_k ptv[j][k] * bufa[k][i]
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &iadst_8[0];
+      break;
+
+    default :  // every other tx_type uses the DCT vertically
+      ptv = &idct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfb[i] = 0 ;
+      for(k = 0; k < 8; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<3)];
+      }
+      pfa += 1;  // next input column
+    }
+
+    pfb += 8;
+    ptv += 8;
+    pfa = &bufa[0];  // rewind to column 0 for the next output row
+  }
+
+  // horizontal transformation: bufa[j][i] = sum_k bufb[j][k] * pth[i][k]
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &iadst_8[0];
+      break;
+
+    default :  // every other tx_type uses the DCT horizontally
+      pth = &idct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 8;  // next basis row
+     }
+
+    pfa += 8;
+    pfb += 8;
+
+    switch(tx_type) {  // rewind pth to the first basis row for the next row
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &iadst_8[0];
+        break;
+
+      default :
+        pth = &idct_8[0];
+        break;
+    }
+  }
+
+  // divide by 8, round symmetrically about zero with a 0.49 offset, and
+  op  = output;  // store rows shortpitch shorts apart in output
+  pfa = &bufa[0];
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
+                             -(short)( - pfa[i] / 8 + 0.49);
+    }
+    op  += shortpitch;
+    pfa += 8;
+  }
+}
+#endif
 
 void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) {
   int i;
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index d50e1dfb3..0588d002b 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -46,7 +46,6 @@ int dec_debug = 0;
 
 #define COEFCOUNT_TESTING
 
-
 static int merge_index(int v, int n, int modulus) {
   int max1 = (n - 1 - modulus / 2) / modulus + 1;
   if (v < max1) v = v * modulus + modulus / 2;
@@ -260,7 +259,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     }
   }
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
     xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   }
@@ -336,29 +335,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
     for (i = 0; i < 16; i++) {
       BLOCKD *b = &xd->block[i];
       int b_mode = xd->mode_info_context->bmi[i].as_mode.first;
-      if(active_ht) {
-        switch(b_mode) {
-          case B_TM_PRED :
-          case B_RD_PRED :
-            b->bmi.as_mode.tx_type = ADST_ADST;
-            break;
-
-          case B_VE_PRED :
-          case B_VR_PRED :
-            b->bmi.as_mode.tx_type = ADST_DCT;
-            break ;
-
-          case B_HE_PRED :
-          case B_HD_PRED :
-          case B_HU_PRED :
-            b->bmi.as_mode.tx_type = DCT_ADST;
-            break;
-
-          default :
-            b->bmi.as_mode.tx_type = DCT_DCT;
-            break;
-        }
-      }
+      if(active_ht)
+        txfm_map(b, b_mode);
     } // loop over 4x4 blocks
   }
 #endif
@@ -392,7 +370,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
       int i8x8mode;
       BLOCKD *b;
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
       int idx = (ib & 0x02) ? (ib + 2) : ib;
 
       short *q  = xd->block[idx].qcoeff;
@@ -410,8 +388,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
       RECON_INVOKE(RTCD_VTABLE(recon), intra8x8_predict)
       (b, i8x8mode, b->predictor);
 
-#if CONFIG_HTRANS8X8
-      vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+#if CONFIG_HYBRIDTRANSFORM8X8
+      txfm_map(b, pred_mode_conv(i8x8mode));
+      vp8_ht_dequant_idct_add_8x8_c(b->bmi.as_mode.tx_type,
+                                    q, dq, pre, dst, 16, stride);
+      // vp8_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
       q += 64;
 #else
       for (j = 0; j < 4; j++) {
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 655409176..bf44fd61a 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -79,6 +79,51 @@ void vp8_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
 }
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+// Dequantize 64 coefficients in place, run the inverse 8x8 hybrid transform
+// chosen by tx_type, then write clamp(pred + residual) to dest.
+void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
+                                   unsigned char *pred, unsigned char *dest,
+                                   int pitch, int stride) {
+  short residual[64];
+  int quad, row, col, idx;
+
+  // input[0] is scaled by dq[0]; the other 63 entries are scaled by dq[1].
+  input[0] = dq[0] * input[0];
+  for (idx = 1; idx < 64; idx++)
+    input[idx] = dq[1] * input[idx];
+
+  // A pitch of 16 makes the transform write rows 8 shorts apart.
+  vp8_iht8x8llm_c(input, residual, 16, tx_type);
+
+  // Zero the first 128 bytes of the coefficient buffer.
+  vpx_memset(input, 0, 128);
+
+  // Walk the four 4x4 quadrants (top-left, top-right, bottom-left,
+  // bottom-right), each addressed from the start of its buffer.
+  for (quad = 0; quad < 4; quad++) {
+    const int y0 = (quad / 2) * 4;
+    const int x0 = (quad % 2) * 4;
+    const short *diff_row = residual + y0 * 8 + x0;
+    const unsigned char *pred_row = pred + y0 * pitch + x0;
+    unsigned char *dest_row = dest + y0 * stride + x0;
+
+    for (row = 0; row < 4; row++) {
+      for (col = 0; col < 4; col++) {
+        const int sum = diff_row[col] + pred_row[col];
+
+        // Saturate to the unsigned 8-bit range before storing.
+        dest_row[col] = (unsigned char)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+      }
+
+      diff_row += 8;
+      pred_row += pitch;
+      dest_row += stride;
+    }
+  }
+}
+#endif
+
 void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
                             unsigned char *dest, int pitch, int stride) {
   short output[16];
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index c93b8e9c5..5f9768d41 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -473,7 +473,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   const int seg_active = segfeature_active(xd, segment_id, SEG_LVL_EOB);
   INT16 *qcoeff_ptr = &xd->qcoeff[0];
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED) ? 16 : 24;
   if (xd->mode_info_context->mbmi.mode != B_PRED &&
       xd->mode_info_context->mbmi.mode != SPLITMV &&
@@ -506,7 +506,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   else
     seg_eob = 64;
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   for (i = 0; i < bufthred ; i += 4) {
 #else
   for (i = 0; i < 24; i += 4) {
@@ -528,7 +528,7 @@ int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {
     qcoeff_ptr += 64;
   }
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
     type = PLANE_TYPE_UV;
     seg_eob = 16;
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index ba2a692d1..ad5258552 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -17,6 +17,8 @@
 
 #include "vp8/common/blockd.h"
 
+// TODO: these transforms can be converted into integer forms to reduce
+//       the complexity
 float dct_4[16] = {
   0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,
   0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,
@@ -32,6 +34,45 @@ float adst_4[16] = {
 };
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+float dct_8[64] = {  // row-major 8x8; DCT basis read by vp8_fht8x8_c
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,
+  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,
+ -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,
+  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,
+ -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,
+  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,
+  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,
+  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,
+ -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,
+  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,
+ -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,
+  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,
+  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064
+};
+
+float adst_8[64] = {  // row-major 8x8; ADST basis read by vp8_fht8x8_c
+  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,
+  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,
+  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,
+  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,
+  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,
+ -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,
+  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,
+  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,
+  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,
+  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,
+  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,
+ -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,
+  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,
+ -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,
+  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,
+  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532
+};
+#endif
 
 static const int xC1S7 = 16069;
 static const int xC2S6 = 15137;
@@ -394,6 +435,112 @@ void vp8_fht4x4_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
 }
 #endif
 
+#if CONFIG_HYBRIDTRANSFORM8X8
+void vp8_fht8x8_c(short *input, short *output, int pitch, TX_TYPE tx_type) {
+  int i, j, k;
+  float bufa[64], bufb[64]; // buffers are for floating-point test purpose
+                             // the implementation could be simplified in
+                             // conjunction with integer transform
+  short *ip = input;
+  short *op = output;
+  // Forward 8x8 hybrid transform; tx_type picks ADST or DCT per direction.
+  float *pfa = &bufa[0];
+  float *pfb = &bufb[0];
+
+  // basis tables for the vertical (ptv) and horizontal (pth) passes
+  float *ptv, *pth;
+
+  // load 8 rows of 8 shorts (rows pitch / 2 shorts apart) into bufa as floats
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = (float)ip[i];
+    }
+    pfa += 8;
+    ip  += pitch / 2;
+  }
+
+  // vertical transformation: bufb[j][i] = sum_k ptv[j][k] * bufa[k][i]
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case ADST_DCT  :
+      ptv = &adst_8[0];
+      break;
+
+    default :  // every other tx_type uses the DCT vertically
+      ptv = &dct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfb[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfb[i] += ptv[k] * pfa[(k<<3)];
+      }
+      pfa += 1;  // next input column
+    }
+    pfb += 8;
+    ptv += 8;
+    pfa = &bufa[0];  // rewind to column 0 for the next output row
+  }
+
+  // horizontal transformation: bufa[j][i] = sum_k bufb[j][k] * pth[i][k]
+  pfa = &bufa[0];
+  pfb = &bufb[0];
+
+  switch(tx_type) {
+    case ADST_ADST :
+    case  DCT_ADST :
+      pth = &adst_8[0];
+      break;
+
+    default :  // every other tx_type uses the DCT horizontally
+      pth = &dct_8[0];
+      break;
+  }
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      pfa[i] = 0;
+      for(k = 0; k < 8; k++) {
+        pfa[i] += pfb[k] * pth[k];
+      }
+      pth += 8;  // next basis row
+     }
+
+    pfa += 8;
+    pfb += 8;
+
+    switch(tx_type) {  // rewind pth to the first basis row for the next row
+      case ADST_ADST :
+      case  DCT_ADST :
+        pth = &adst_8[0];
+        break;
+
+      default :
+        pth = &dct_8[0];
+        break;
+    }
+  }
+
+  // multiply by 8, round symmetrically about zero with a 0.49 offset, and
+  op  = output ;  // store 64 contiguous shorts in output
+  pfa = &bufa[0] ;
+
+  for(j = 0; j < 8; j++) {
+    for(i = 0; i < 8; i++) {
+      op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
+                                   -(short)(- 8 * pfa[i] + 0.49);
+    }
+    op  += 8;
+    pfa += 8;
+  }
+}
+#endif
+
 void vp8_short_fdct4x4_c(short *input, short *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 325efeb6b..964046d92 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -90,28 +90,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
 #if CONFIG_HYBRIDTRANSFORM
     if(active_ht) {
       b->bmi.as_mode.test = b->bmi.as_mode.first;
-      switch(b->bmi.as_mode.first) {
-        // case B_DC_PRED :
-        case B_TM_PRED :
-        case B_RD_PRED :
-          b->bmi.as_mode.tx_type = ADST_ADST;
-          break;
-
-        case B_VE_PRED :
-        case B_VR_PRED :
-          b->bmi.as_mode.tx_type = ADST_DCT;
-          break;
-
-        case B_HE_PRED :
-        case B_HD_PRED :
-        case B_HU_PRED :
-          b->bmi.as_mode.tx_type = DCT_ADST;
-          break;
-
-        default :
-          b->bmi.as_mode.tx_type = DCT_DCT;
-          break;
-      }
+      txfm_map(b, b->bmi.as_mode.first);
 
       vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
       vp8_ht_quantize_b(be, b);
@@ -329,16 +308,25 @@ void vp8_encode_intra8x8(const VP8_ENCODER_RTCD *rtcd,
   }
 #endif
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   {
     MACROBLOCKD *xd = &x->e_mbd;
     int idx = (ib & 0x02) ? (ib + 2) : ib;
 
     // generate residual blocks
     vp8_subtract_4b_c(be, b, 16);
-    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+
+    txfm_map(b, pred_mode_conv(b->bmi.as_mode.first));
+
+    vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32,
+                 b->bmi.as_mode.tx_type);
     x->quantize_b_8x8(x->block + idx, xd->block + idx);
-    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+    vp8_iht8x8llm_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,
+                    b->bmi.as_mode.tx_type);
+
+//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+//    x->quantize_b_8x8(x->block + idx, xd->block + idx);
+//    vp8_short_idct8x8_c(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
 
     // reconstruct submacroblock
     for (i = 0; i < 4; i++) {
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index a2cd2651a..6eb10f4f1 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -454,7 +454,7 @@ int vp8_block_error_c(short *coeff, short *dqcoeff) {
   return error;
 }
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
 int vp8_submb_error_c(short *coeff, short *dqcoeff) {
   int i;
   int error = 0;
@@ -985,28 +985,7 @@ static int64_t rd_pick_intra4x4block(
 #if CONFIG_HYBRIDTRANSFORM
       if(active_ht) {
         b->bmi.as_mode.test = mode;
-        switch(mode) {
-          // case B_DC_PRED :
-          case B_TM_PRED :
-          case B_RD_PRED :
-            b->bmi.as_mode.tx_type = ADST_ADST;
-            break;
-
-          case B_VE_PRED :
-          case B_VR_PRED :
-            b->bmi.as_mode.tx_type = ADST_DCT;
-            break;
-
-          case B_HE_PRED :
-          case B_HD_PRED :
-          case B_HU_PRED :
-            b->bmi.as_mode.tx_type = DCT_ADST;
-            break;
-
-          default :
-            b->bmi.as_mode.tx_type = DCT_DCT;
-            break;
-        }
+        txfm_map(b, mode);
 
         vp8_fht4x4_c(be->src_diff, be->coeff, 32, b->bmi.as_mode.tx_type);
         vp8_ht_quantize_b(be, b);
@@ -1267,7 +1246,7 @@ static int64_t rd_pick_intra8x8block(
   DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
   DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   // perform transformation of dimension 8x8
   // note the input and output index mapping
   int idx = (ib & 0x02) ? (ib + 2) : ib;
@@ -1298,8 +1277,10 @@ static int64_t rd_pick_intra8x8block(
 
       vp8_subtract_4b_c(be, b, 16);
 
-#if CONFIG_HTRANS8X8
-      x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+#if CONFIG_HYBRIDTRANSFORM8X8
+      txfm_map(b, pred_mode_conv(mode));
+      vp8_fht8x8_c(be->src_diff, (x->block + idx)->coeff, 32, b->bmi.as_mode.tx_type);
+//    x->vp8_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
       // compute quantization mse of 8x8 block
@@ -1376,7 +1357,7 @@ static int64_t rd_pick_intra8x8block(
 #endif
   vp8_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
   *(a + vp8_block2above_8x8[idx])     = besta0;
   *(a + vp8_block2above_8x8[idx] + 1) = besta1;
   *(l + vp8_block2left_8x8 [idx])     = bestl0;
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index dac18c6db..105aa6a7c 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c
@@ -504,7 +504,7 @@ static void tokenize1st_order_ht(   MACROBLOCKD *xd,
 #endif
 
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
 static void tokenize1st_order_chroma
 (
   MACROBLOCKD *xd,
@@ -886,7 +886,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
       tokenize1st_order_ht(x, t, plane_type, cpi);
     } else {
 
-#if CONFIG_HTRANS8X8
+#if CONFIG_HYBRIDTRANSFORM8X8
       if (x->mode_info_context->mbmi.mode == I8X8_PRED) {
         ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
         ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
-- 
cgit v1.2.3