summary | refs | log | tree | commit | diff
path: root/vpx_dsp
diff options
context:
space:
mode:
author: Linfeng Zhang <linfengz@google.com> 2018-02-27 17:06:01 -0800
committer: Linfeng Zhang <linfengz@google.com> 2018-03-13 17:39:23 -0700
commit: 9351f96069b67b5e4eb67acaea9250ed5b56e874 (patch)
tree: d9ce1fb7ec5a09900d246eac59972dbcb874c609 /vpx_dsp
parent: d8424d2890ddce0d92778e055eee145655cf034e (diff)
download: libvpx-9351f96069b67b5e4eb67acaea9250ed5b56e874.tar
libvpx-9351f96069b67b5e4eb67acaea9250ed5b56e874.tar.gz
libvpx-9351f96069b67b5e4eb67acaea9250ed5b56e874.tar.bz2
libvpx-9351f96069b67b5e4eb67acaea9250ed5b56e874.zip
Add vp9_highbd_iht16x16_256_add_neon()
BUG=webm:1403 Change-Id: I2293c11666786be276909d48ee78dacb40a89e25
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/arm/highbd_idct16x16_add_neon.c 60
-rw-r--r-- vpx_dsp/arm/highbd_idct32x32_135_add_neon.c 1
-rw-r--r-- vpx_dsp/arm/highbd_idct32x32_34_add_neon.c 1
-rw-r--r-- vpx_dsp/arm/highbd_idct_neon.h 112
-rw-r--r-- vpx_dsp/arm/idct_neon.h 56
5 files changed, 118 insertions, 112 deletions
diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
index 3fa2f9e28..69196e174 100644
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -11,6 +11,7 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"
@@ -515,62 +516,9 @@ static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
out[15] = vsubq_s32(step2[0], step2[15]);
}
-static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
- int32_t *output) {
- // Save the result into output
- vst1q_s32(output + 0, out[0].val[0]);
- vst1q_s32(output + 4, out[0].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[1].val[0]);
- vst1q_s32(output + 4, out[1].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[2].val[0]);
- vst1q_s32(output + 4, out[2].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[3].val[0]);
- vst1q_s32(output + 4, out[3].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[4].val[0]);
- vst1q_s32(output + 4, out[4].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[5].val[0]);
- vst1q_s32(output + 4, out[5].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[6].val[0]);
- vst1q_s32(output + 4, out[6].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[7].val[0]);
- vst1q_s32(output + 4, out[7].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[8].val[0]);
- vst1q_s32(output + 4, out[8].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[9].val[0]);
- vst1q_s32(output + 4, out[9].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[10].val[0]);
- vst1q_s32(output + 4, out[10].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[11].val[0]);
- vst1q_s32(output + 4, out[11].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[12].val[0]);
- vst1q_s32(output + 4, out[12].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[13].val[0]);
- vst1q_s32(output + 4, out[13].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[14].val[0]);
- vst1q_s32(output + 4, out[14].val[1]);
- output += 16;
- vst1q_s32(output + 0, out[15].val[0]);
- vst1q_s32(output + 4, out[15].val[1]);
-}
-
-static void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input,
- int32_t *output, uint16_t *dest,
- const int stride,
- const int bd) {
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd) {
const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
diff --git a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
index 3970a5a86..6750c1a42 100644
--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
diff --git a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
index 5d9063b15..f05932cec 100644
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -12,6 +12,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
diff --git a/vpx_dsp/arm/highbd_idct_neon.h b/vpx_dsp/arm/highbd_idct_neon.h
index 612bcf5a5..a17c5a6cc 100644
--- a/vpx_dsp/arm/highbd_idct_neon.h
+++ b/vpx_dsp/arm/highbd_idct_neon.h
@@ -359,4 +359,116 @@ static INLINE void idct8x8_64_half1d_bd12(
*io7 = vsubq_s32(step1[0], step2[7]);
}
+static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
+ int32_t *output) {
+ // Save out[0..15] into output: 8 int32_t per entry, consecutive entries 16 int32_t apart.
+ vst1q_s32(output + 0, out[0].val[0]);  // val[0] -> elements 0..3 of this step
+ vst1q_s32(output + 4, out[0].val[1]);  // val[1] -> elements 4..7 of this step
+ output += 16;  // elements 8..15 of each step are not written by this function
+ vst1q_s32(output + 0, out[1].val[0]);
+ vst1q_s32(output + 4, out[1].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[2].val[0]);
+ vst1q_s32(output + 4, out[2].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[3].val[0]);
+ vst1q_s32(output + 4, out[3].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[4].val[0]);
+ vst1q_s32(output + 4, out[4].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[5].val[0]);
+ vst1q_s32(output + 4, out[5].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[6].val[0]);
+ vst1q_s32(output + 4, out[6].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[7].val[0]);
+ vst1q_s32(output + 4, out[7].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[8].val[0]);
+ vst1q_s32(output + 4, out[8].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[9].val[0]);
+ vst1q_s32(output + 4, out[9].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[10].val[0]);
+ vst1q_s32(output + 4, out[10].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[11].val[0]);
+ vst1q_s32(output + 4, out[11].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[12].val[0]);
+ vst1q_s32(output + 4, out[12].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[13].val[0]);
+ vst1q_s32(output + 4, out[13].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[14].val[0]);
+ vst1q_s32(output + 4, out[14].val[1]);
+ output += 16;
+ vst1q_s32(output + 0, out[15].val[0]);
+ vst1q_s32(output + 4, out[15].val[1]);  // last entry: no pointer advance needed
+}
+
+static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
+ uint16_t *dest, const int stride,
+ const int bd) {
+ // Add the result to dest: round-shift each 32-bit value right by 6, narrow to 16 bits, then add entry by entry.
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);  // (1 << bd) - 1 in every lane, passed to the add helper
+ int16x8_t o[16];  // o[i] = 8 narrowed values of out[i]: val[0] in the low half, val[1] in the high half
+ o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
+ vrshrn_n_s32(out[0].val[1], 6));
+ o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
+ vrshrn_n_s32(out[1].val[1], 6));
+ o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
+ vrshrn_n_s32(out[2].val[1], 6));
+ o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
+ vrshrn_n_s32(out[3].val[1], 6));
+ o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
+ vrshrn_n_s32(out[4].val[1], 6));
+ o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
+ vrshrn_n_s32(out[5].val[1], 6));
+ o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
+ vrshrn_n_s32(out[6].val[1], 6));
+ o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
+ vrshrn_n_s32(out[7].val[1], 6));
+ o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
+ vrshrn_n_s32(out[8].val[1], 6));
+ o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
+ vrshrn_n_s32(out[9].val[1], 6));
+ o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
+ vrshrn_n_s32(out[10].val[1], 6));
+ o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
+ vrshrn_n_s32(out[11].val[1], 6));
+ o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
+ vrshrn_n_s32(out[12].val[1], 6));
+ o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
+ vrshrn_n_s32(out[13].val[1], 6));
+ o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
+ vrshrn_n_s32(out[14].val[1], 6));
+ o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
+ vrshrn_n_s32(out[15].val[1], 6));
+ highbd_idct16x16_add8x1(o[0], max, &dest, stride);  // NOTE(review): helper defined elsewhere; presumably adds to *dest, clamps to max and advances dest by stride - confirm
+ highbd_idct16x16_add8x1(o[1], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[2], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[3], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[4], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[5], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[6], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[7], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[8], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[9], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[10], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[11], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[12], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[13], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[14], max, &dest, stride);
+ highbd_idct16x16_add8x1(o[15], max, &dest, stride);
+}
+
+void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
+ uint16_t *dest, const int stride,
+ const int bd);  // defined (non-static as of this change) in vpx_dsp/arm/highbd_idct16x16_add_neon.c
+
#endif // VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index 73dc2a4fb..628392689 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -890,62 +890,6 @@ static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a,
highbd_idct16x16_add8x1_bd8(a[31], &out, stride);
}
-static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
- uint16_t *dest, const int stride,
- const int bd) {
- // Add the result to dest
- const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
- int16x8_t o[16];
- o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6),
- vrshrn_n_s32(out[0].val[1], 6));
- o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6),
- vrshrn_n_s32(out[1].val[1], 6));
- o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6),
- vrshrn_n_s32(out[2].val[1], 6));
- o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6),
- vrshrn_n_s32(out[3].val[1], 6));
- o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6),
- vrshrn_n_s32(out[4].val[1], 6));
- o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6),
- vrshrn_n_s32(out[5].val[1], 6));
- o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6),
- vrshrn_n_s32(out[6].val[1], 6));
- o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6),
- vrshrn_n_s32(out[7].val[1], 6));
- o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6),
- vrshrn_n_s32(out[8].val[1], 6));
- o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6),
- vrshrn_n_s32(out[9].val[1], 6));
- o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6),
- vrshrn_n_s32(out[10].val[1], 6));
- o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6),
- vrshrn_n_s32(out[11].val[1], 6));
- o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6),
- vrshrn_n_s32(out[12].val[1], 6));
- o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6),
- vrshrn_n_s32(out[13].val[1], 6));
- o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6),
- vrshrn_n_s32(out[14].val[1], 6));
- o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6),
- vrshrn_n_s32(out[15].val[1], 6));
- highbd_idct16x16_add8x1(o[0], max, &dest, stride);
- highbd_idct16x16_add8x1(o[1], max, &dest, stride);
- highbd_idct16x16_add8x1(o[2], max, &dest, stride);
- highbd_idct16x16_add8x1(o[3], max, &dest, stride);
- highbd_idct16x16_add8x1(o[4], max, &dest, stride);
- highbd_idct16x16_add8x1(o[5], max, &dest, stride);
- highbd_idct16x16_add8x1(o[6], max, &dest, stride);
- highbd_idct16x16_add8x1(o[7], max, &dest, stride);
- highbd_idct16x16_add8x1(o[8], max, &dest, stride);
- highbd_idct16x16_add8x1(o[9], max, &dest, stride);
- highbd_idct16x16_add8x1(o[10], max, &dest, stride);
- highbd_idct16x16_add8x1(o[11], max, &dest, stride);
- highbd_idct16x16_add8x1(o[12], max, &dest, stride);
- highbd_idct16x16_add8x1(o[13], max, &dest, stride);
- highbd_idct16x16_add8x1(o[14], max, &dest, stride);
- highbd_idct16x16_add8x1(o[15], max, &dest, stride);
-}
-
void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
void *const dest, const int stride,
const int highbd_flag);