diff options
-rw-r--r-- | vpx_dsp/arm/idct32x32_135_add_neon.c | 72 | ||||
-rw-r--r-- | vpx_dsp/arm/idct32x32_34_add_neon.c | 16 | ||||
-rw-r--r-- | vpx_dsp/arm/idct4x4_add_neon.c | 4 | ||||
-rw-r--r-- | vpx_dsp/arm/idct8x8_add_neon.c | 32 | ||||
-rw-r--r-- | vpx_dsp/arm/idct_neon.h | 15 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp.mk | 2 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 |
7 files changed, 91 insertions, 52 deletions
diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c index db9ffef6c..28b946558 100644 --- a/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -16,6 +16,50 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + // Only for the first pass of the _135_ variant. Since it only uses values from // the top left 16x16 it can safely assume all the remaining values are 0 and // skip an awful lot of calculations. In fact, only the first 12 columns make @@ -43,7 +87,7 @@ // 13 84 93 103 110 125 // 14 98 106 115 127 // 15 117 128 -static void idct32_12_neon(const int16_t *input, int16_t *output) { +static void idct32_12_neon(const tran_low_t *input, int16_t *output) { int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t in8, in9, in10, in11; @@ -66,27 +110,11 @@ static void idct32_12_neon(const int16_t *input, int16_t *output) { s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, s7_25, s7_26, s7_27; - load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, - &in6, &in7); - - input += 8; - - tmp0 = vld1_s16(input); - input += 32; - tmp1 = vld1_s16(input); - input += 32; - tmp2 = vld1_s16(input); - input += 32; - tmp3 = vld1_s16(input); - input += 32; - tmp4 = vld1_s16(input); - input += 32; - tmp5 = vld1_s16(input); - input += 32; - tmp6 = vld1_s16(input); - input += 32; - tmp7 = vld1_s16(input); + load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, + &tmp7); transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9, &in10, &in11); @@ -669,7 +697,7 @@ static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) { output + (24 * stride), stride); } -void vpx_idct32x32_135_add_neon(const int16_t *input, uint8_t *dest, +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; int16_t temp[32 * 16]; diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c index a584b1d9e..7b3560a13 100644 --- a/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -46,21 +46,21 @@ static void idct32_6_neon(const tran_low_t *input, int16_t *output) { s2_31; int16x8_t s3_24, s3_25, s3_26, s3_27; - in0 = load_tran_low_to_s16(input); + in0 = load_tran_low_to_s16q(input); input += 32; - in1 = load_tran_low_to_s16(input); + in1 = load_tran_low_to_s16q(input); input += 32; - in2 = load_tran_low_to_s16(input); + in2 = load_tran_low_to_s16q(input); input += 32; - in3 = load_tran_low_to_s16(input); + in3 = load_tran_low_to_s16q(input); input += 32; - in4 = load_tran_low_to_s16(input); + in4 = load_tran_low_to_s16q(input); input += 32; - in5 = load_tran_low_to_s16(input); + in5 = load_tran_low_to_s16q(input); input += 32; - in6 = load_tran_low_to_s16(input); + in6 = load_tran_low_to_s16q(input); input += 32; - in7 = load_tran_low_to_s16(input); + in7 = load_tran_low_to_s16q(input); transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); // stage 1 diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index dd75dc08e..6ac516140 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -62,8 +62,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, assert(!(dest_stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16(input); - a1 = load_tran_low_to_s16(input + 8); + a0 = load_tran_low_to_s16q(input); + a1 = load_tran_low_to_s16q(input + 8); idct4x4_16_kernel(cospis, &a0, &a1); // Columns diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 159a6ec98..5505cc84e 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -174,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); + q8s16 = load_tran_low_to_s16q(input); + q9s16 = load_tran_low_to_s16q(input + 8); + q10s16 = load_tran_low_to_s16q(input + 16); + q11s16 = load_tran_low_to_s16q(input + 24); + q12s16 = load_tran_low_to_s16q(input + 32); + q13s16 = load_tran_low_to_s16q(input + 40); + q14s16 = load_tran_low_to_s16q(input + 48); + q15s16 = load_tran_low_to_s16q(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -280,14 +280,14 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); + q8s16 = load_tran_low_to_s16q(input); + q9s16 = load_tran_low_to_s16q(input + 8); + q10s16 = load_tran_low_to_s16q(input + 16); + q11s16 = load_tran_low_to_s16q(input + 24); + q12s16 = load_tran_low_to_s16q(input + 32); + q13s16 = load_tran_low_to_s16q(input + 40); + q14s16 = load_tran_low_to_s16q(input + 48); + q15s16 = load_tran_low_to_s16q(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index 5c2a53c03..e4493a105 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -18,9 +18,9 @@ #include "vpx_dsp/vpx_dsp_common.h" //------------------------------------------------------------------------------ +// Helper functions used to load tran_low_t into int16, narrowing if necessary. -// Helper function used to load tran_low_t into int16, narrowing if necessary. -static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH const int32x4_t v0 = vld1q_s32(buf); const int32x4_t v1 = vld1q_s32(buf + 4); @@ -32,6 +32,17 @@ static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { #endif } +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +//------------------------------------------------------------------------------ + // Multiply a by a_const. Saturate, shift and narrow by 14. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index bb1143cca..200ef07f1 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -213,7 +213,6 @@ endif # HAVE_NEON endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c @@ -246,6 +245,7 @@ endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c endif # CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index bba6b4f78..d78a35757 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -698,7 +698,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64"; # Need to add 135 eob idct32x32 implementations. $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; |