4 files changed, 254 insertions, 63 deletions
diff --git a/vpx_dsp/arm/idct16x16_add_neon.asm b/vpx_dsp/arm/idct16x16_add_neon.asm
index 7e2161cf8..b80f2ba54 100644
--- a/vpx_dsp/arm/idct16x16_add_neon.asm
+++ b/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -8,8 +8,14 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+    INCLUDE ./vpx_config.asm
+
     EXPORT  |vpx_idct16x16_256_add_neon_pass1|
     EXPORT  |vpx_idct16x16_256_add_neon_pass2|
+    IF CONFIG_VP9_HIGHBITDEPTH
+    EXPORT  |vpx_idct16x16_256_add_neon_pass1_tran_low|
+    EXPORT  |vpx_idct16x16_256_add_neon_pass2_tran_low|
+    ENDIF
     EXPORT  |vpx_idct16x16_10_add_neon_pass1|
     EXPORT  |vpx_idct16x16_10_add_neon_pass2|
     ARM
@@ -60,6 +66,7 @@
     vld2.s16        {q1,q2}, [r0]!
     vmov.s16        q15, q1
 
+idct16x16_256_add_neon_pass1
     ; cospi_28_64 = 3196
     movw            r3, #0x0c7c
 
@@ -255,6 +262,28 @@
     bx              lr
     ENDP  ; |vpx_idct16x16_256_add_neon_pass1|
 
+    IF CONFIG_VP9_HIGHBITDEPTH
+;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input,
+;                                                 int16_t *output)
+; Entry point for tran_low_t input; loads it, then joins the int16_t pass 1.
+; r0  const tran_low_t *input
+; r1  int16_t *output
+
+|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC
+    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+    LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
+    vmov.s16        q15, q1  ; mirrors the vmov in the int16_t entry point
+
+    b               idct16x16_256_add_neon_pass1  ; shared body ends in bx lr
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass1_tran_low|
+    ENDIF  ; CONFIG_VP9_HIGHBITDEPTH
+
 ;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
 ;                                      int16_t *output,
 ;                                      int16_t *pass1_output,
@@ -273,8 +302,6 @@
 ; will be stored back into q8-q15 registers. This function will touch q0-q7
 ; registers and use them as buffer during calculation.
 |vpx_idct16x16_256_add_neon_pass2| PROC
-    push            {r3-r9}
-
     ; TODO(hkuang): Find a better way to load the elements.
     ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
     vld2.s16        {q8,q9}, [r0]!
@@ -287,6 +314,9 @@
     vld2.s16        {q0,q1}, [r0]!
     vmov.s16        q15, q0;
 
+idct16x16_256_add_neon_pass2
+    push            {r3-r9}
+
     ; cospi_30_64 = 1606
     movw            r3, #0x0646
 
@@ -755,6 +785,36 @@ end_idct16x16_pass2
     bx              lr
     ENDP  ; |vpx_idct16x16_256_add_neon_pass2|
 
+    IF CONFIG_VP9_HIGHBITDEPTH
+;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
+;                                               int16_t *output,
+;                                               int16_t *pass1_output,
+;                                               int16_t skip_adding,
+;                                               uint8_t *dest,
+;                                               int dest_stride)
+; Entry point for tran_low_t input; loads it, then joins the int16_t pass 2.
+; r0  const tran_low_t *src
+; r1  int16_t *output
+; r2  int16_t *pass1_output
+; r3  int16_t skip_adding
+; r4  uint8_t *dest
+; r5  int dest_stride
+
+|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC
+    LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
+    LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
+    LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
+    LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
+    LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
+    LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
+    LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
+    LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
+    vmov.s16        q15, q0  ; mirrors the vmov in the int16_t entry point
+
+    b               idct16x16_256_add_neon_pass2  ; shared body does the push
+    ENDP  ; |vpx_idct16x16_256_add_neon_pass2_tran_low|
+    ENDIF  ; CONFIG_VP9_HIGHBITDEPTH
+
 ;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input,
 ;                                       int16_t *output)
 ;
diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c
index 08e82a15c..4e22b5520 100644
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -13,7 +13,11 @@
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
+static void idct16x16_256_add_neon_pass1(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         int16_t *out) {
   int16x4_t d0s16, d1s16, d2s16, d3s16;
   int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
   int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
@@ -22,31 +26,15 @@ void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
   int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
   int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-  int16x8x2_t q0x2s16;
 
-  q0x2s16 = vld2q_s16(in);
-  q8s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q9s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q10s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q11s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q12s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q13s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q14s16 = q0x2s16.val[0];
-  in += 16;
-  q0x2s16 = vld2q_s16(in);
-  q15s16 = q0x2s16.val[0];
+  q8s16 = s0;
+  q9s16 = s1;
+  q10s16 = s2;
+  q11s16 = s3;
+  q12s16 = s4;
+  q13s16 = s5;
+  q14s16 = s6;
+  q15s16 = s7;
 
   transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                     &q15s16);
@@ -211,10 +199,78 @@ void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
   vst1q_s16(out, q15s16);
 }
 
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
-                                      int16_t *pass1_output,
-                                      int16_t skip_adding, uint8_t *dest,
-                                      int dest_stride) {
+// Pass 1 entry point for int16_t coefficients. Eight de-interleaving loads
+// are issued 16 elements apart starting at |in|; the val[0] half of each one
+// is forwarded to the shared pass 1 kernel.
+void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
+  int16x8x2_t v;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Loads at element offsets 0, 16, 32 and 48.
+  v = vld2q_s16(in);
+  s0 = v.val[0];
+  v = vld2q_s16(in + 16);
+  s1 = v.val[0];
+  v = vld2q_s16(in + 32);
+  s2 = v.val[0];
+  v = vld2q_s16(in + 48);
+  s3 = v.val[0];
+
+  // Loads at element offsets 64, 80, 96 and 112.
+  v = vld2q_s16(in + 64);
+  s4 = v.val[0];
+  v = vld2q_s16(in + 80);
+  s5 = v.val[0];
+  v = vld2q_s16(in + 96);
+  s6 = v.val[0];
+  v = vld2q_s16(in + 112);
+  s7 = v.val[0];
+
+  // val[1] of every load is unused here.
+  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Pass 1 entry point for tran_low_t coefficients. Same shape as the int16_t
+// wrapper, but each load goes through load_tran_low_to_s16x2q() (presumably
+// narrowing to int16_t while de-interleaving -- see idct_neon.h).
+void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *in,
+                                               int16_t *out) {
+  int16x8x2_t v;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Loads at element offsets 0, 16, 32 and 48.
+  v = load_tran_low_to_s16x2q(in);
+  s0 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 16);
+  s1 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 32);
+  s2 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 48);
+  s3 = v.val[0];
+
+  // Loads at element offsets 64, 80, 96 and 112.
+  v = load_tran_low_to_s16x2q(in + 64);
+  s4 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 80);
+  s5 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 96);
+  s6 = v.val[0];
+  v = load_tran_low_to_s16x2q(in + 112);
+  s7 = v.val[0];
+
+  // val[1] of every load is unused here.
+  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static void idct16x16_256_add_neon_pass2(const int16x8_t s0, const int16x8_t s1,
+                                         const int16x8_t s2, const int16x8_t s3,
+                                         const int16x8_t s4, const int16x8_t s5,
+                                         const int16x8_t s6, const int16x8_t s7,
+                                         int16_t *out, int16_t *pass1_output,
+                                         int16_t skip_adding, uint8_t *dest,
+                                         int dest_stride) {
   uint8_t *d;
   uint8x8_t d12u8, d13u8;
   int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
@@ -229,31 +285,15 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
   int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
   int32x4_t q10s32, q11s32, q12s32, q13s32;
-  int16x8x2_t q0x2s16;
 
-  q0x2s16 = vld2q_s16(src);
-  q8s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q9s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q10s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q11s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q12s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q13s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q14s16 = q0x2s16.val[0];
-  src += 16;
-  q0x2s16 = vld2q_s16(src);
-  q15s16 = q0x2s16.val[0];
+  q8s16 = s0;
+  q9s16 = s1;
+  q10s16 = s2;
+  q11s16 = s3;
+  q12s16 = s4;
+  q13s16 = s5;
+  q14s16 = s6;
+  q15s16 = s7;
 
   transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
                     &q15s16);
@@ -760,6 +800,81 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
   }
 }
 
+// Pass 2 entry point for int16_t coefficients. Eight de-interleaving loads
+// are issued 16 elements apart starting at |src|; the val[0] half of each
+// one is forwarded, together with the remaining arguments, to the shared
+// pass 2 kernel.
+void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
+                                      int16_t *pass1_output,
+                                      int16_t skip_adding, uint8_t *dest,
+                                      int dest_stride) {
+  int16x8x2_t v;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Loads at element offsets 0, 16, 32 and 48.
+  v = vld2q_s16(src);
+  s0 = v.val[0];
+  v = vld2q_s16(src + 16);
+  s1 = v.val[0];
+  v = vld2q_s16(src + 32);
+  s2 = v.val[0];
+  v = vld2q_s16(src + 48);
+  s3 = v.val[0];
+
+  // Loads at element offsets 64, 80, 96 and 112.
+  v = vld2q_s16(src + 64);
+  s4 = v.val[0];
+  v = vld2q_s16(src + 80);
+  s5 = v.val[0];
+  v = vld2q_s16(src + 96);
+  s6 = v.val[0];
+  v = vld2q_s16(src + 112);
+  s7 = v.val[0];
+
+  // val[1] of every load is unused here.
+  idct16x16_256_add_neon_pass2(s0, s1, s2, s3, s4, s5, s6, s7, out,
+                               pass1_output, skip_adding, dest, dest_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Pass 2 entry point for tran_low_t coefficients. Same shape as the int16_t
+// wrapper, but each load goes through load_tran_low_to_s16x2q() (presumably
+// narrowing to int16_t while de-interleaving -- see idct_neon.h). Only the
+// val[0] half of each load reaches the shared pass 2 kernel.
+void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
+                                               int16_t *out,
+                                               int16_t *pass1_output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest, int dest_stride) {
+  int16x8x2_t v;
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Loads at element offsets 0, 16, 32 and 48.
+  v = load_tran_low_to_s16x2q(src);
+  s0 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 16);
+  s1 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 32);
+  s2 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 48);
+  s3 = v.val[0];
+
+  // Loads at element offsets 64, 80, 96 and 112.
+  v = load_tran_low_to_s16x2q(src + 64);
+  s4 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 80);
+  s5 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 96);
+  s6 = v.val[0];
+  v = load_tran_low_to_s16x2q(src + 112);
+  s7 = v.val[0];
+
+  // val[1] of every load is unused here.
+  idct16x16_256_add_neon_pass2(s0, s1, s2, s3, s4, s5, s6, s7, out,
+                               pass1_output, skip_adding, dest, dest_stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) {
   int16x4_t d4s16;
   int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
diff --git a/vpx_dsp/arm/idct16x16_neon.c b/vpx_dsp/arm/idct16x16_neon.c
index 8eae549bb..bcbbf4b6d 100644
--- a/vpx_dsp/arm/idct16x16_neon.c
+++ b/vpx_dsp/arm/idct16x16_neon.c
@@ -16,6 +16,21 @@ void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
                                       int16_t *pass1_output,
                                       int16_t skip_adding, uint8_t *dest,
                                       int dest_stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input,
+                                               int16_t *output);
+void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
+                                               int16_t *output,
+                                               int16_t *pass1_output,
+                                               int16_t skip_adding,
+                                               uint8_t *dest, int dest_stride);
+#else  // !CONFIG_VP9_HIGHBITDEPTH: alias to the int16_t entry points.
+#define vpx_idct16x16_256_add_neon_pass1_tran_low \
+  vpx_idct16x16_256_add_neon_pass1
+#define vpx_idct16x16_256_add_neon_pass2_tran_low \
+  vpx_idct16x16_256_add_neon_pass2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output);
 void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
                                      int16_t *pass1_output);
@@ -26,7 +41,7 @@ extern void vpx_push_neon(int64_t *store);
 extern void vpx_pop_neon(int64_t *store);
 #endif  // HAVE_NEON_ASM
 
-void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
                                 int dest_stride) {
 #if HAVE_NEON_ASM
   int64_t store_reg[8];
@@ -42,24 +57,25 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest,
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(input, pass1_output);
+  vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0,
-                                   dest, dest_stride);
+  vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output,
+                                            pass1_output, 0, dest, dest_stride);
 
   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output);
+  vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8,
-                                   pass1_output, 0, dest, dest_stride);
+  vpx_idct16x16_256_add_neon_pass2_tran_low(input + 8 * 16 + 1,
+                                            row_idct_output + 8, pass1_output,
+                                            0, dest, dest_stride);
 
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fd5d6b48f..6d116559d 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -687,7 +687,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct8x8_1_add neon sse2/;
 
     add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct16x16_256_add sse2/;
+    specialize qw/vpx_idct16x16_256_add neon sse2/;
 
     add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct16x16_10_add neon sse2/;