3 files changed, 172 insertions, 3 deletions
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 56a423e6c..fad492de7 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5801,9 +5801,9 @@ void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
                          TX_SIZE tx_size) {
   // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
   switch (tx_size) {
-    case TX_8X8: vpx_hadamard_8x8_c(src_diff, bw, coeff); break;
-    case TX_16X16: vpx_hadamard_16x16_c(src_diff, bw, coeff); break;
-    case TX_32X32: vpx_hadamard_32x32_c(src_diff, bw, coeff); break;
+    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
     default: assert(0);
   }
 }
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index 2b687eed6..7ab98eab2 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -32,6 +32,166 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
   return (sum + 8) >> 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+// src_diff: 13 bit, dynamic range [-4095, 4095]
+// coeff: 16 bit
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+                                            ptrdiff_t src_stride,
+                                            int32_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit
+static void hadamard_highbd_col8_second_pass(const int32_t *src_diff,
+                                             ptrdiff_t src_stride,
+                                             int32_t *coeff) {
+  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int32_t c0 = b0 + b2;
+  int32_t c1 = b1 + b3;
+  int32_t c2 = b0 - b2;
+  int32_t c3 = b1 - b3;
+  int32_t c4 = b4 + b6;
+  int32_t c5 = b5 + b7;
+  int32_t c6 = b4 - b6;
+  int32_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+// The order of the output coeff of the hadamard is not important. For
+// optimization purposes the final transpose may be skipped.
+void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                               tran_low_t *coeff) {
+  int idx;
+  int32_t buffer[64];
+  int32_t buffer2[64];
+  int32_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // src_diff: 13 bit
+    // buffer: 16 bit, dynamic range [-32760, 32760]
+    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    // buffer: 16 bit
+    // buffer2: 19 bit, dynamic range [-262080, 262080]
+    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
+    ++tmp_buf;
+  }
+
+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
+// In place 16x16 2D Hadamard transform
+void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 19 bit, dynamic range [-262080, 262080]
+  for (idx = 0; idx < 64; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[64];
+    tran_low_t a2 = coeff[128];
+    tran_low_t a3 = coeff[192];
+
+    tran_low_t b0 = (a0 + a1) >> 1;
+    tran_low_t b1 = (a0 - a1) >> 1;
+    tran_low_t b2 = (a2 + a3) >> 1;
+    tran_low_t b3 = (a2 - a3) >> 1;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[64] = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+                                 tran_low_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 13 bit, dynamic range [-4095, 4095]
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+    vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+  }
+
+  // coeff: 20 bit
+  for (idx = 0; idx < 256; ++idx) {
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[256];
+    tran_low_t a2 = coeff[512];
+    tran_low_t a3 = coeff[768];
+
+    tran_low_t b0 = (a0 + a1) >> 2;
+    tran_low_t b1 = (a0 - a1) >> 2;
+    tran_low_t b2 = (a2 + a3) >> 2;
+    tran_low_t b3 = (a2 - a3) >> 2;
+
+    // new coeff dynamic range: 20 bit
+    coeff[0] = b0 + b2;
+    coeff[256] = b1 + b3;
+    coeff[512] = b0 - b2;
+    coeff[768] = b1 - b3;
+
+    ++coeff;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
 //           second pass, 12 bit, dynamic range [-2040, 2040]
 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 13d4b3d9b..198a70426 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -785,6 +785,15 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
     specialize qw/vpx_hadamard_32x32 sse2 avx2/;
 
+    add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_8x8/;
+
+    add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_16x16/;
+
+    add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_32x32/;
+
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
   } else {