summary | refs | log | tree | commit | diff
path: root/vpx_dsp/arm
diff options
context:
space:
mode:
author: Linfeng Zhang <linfengz@google.com> 2018-01-29 15:17:08 -0800
committer: Linfeng Zhang <linfengz@google.com> 2018-01-29 15:17:08 -0800
commit: b14b616d968b3cb1b955651754b9761d72bd05ac (patch)
tree: d3710d74ee1302550c777a6f3e7e1c4f77081aa7 /vpx_dsp/arm
parent: 77108f50012fc156cd109ed0513e6f661a1d719a (diff)
download: libvpx-b14b616d968b3cb1b955651754b9761d72bd05ac.tar
libvpx-b14b616d968b3cb1b955651754b9761d72bd05ac.tar.gz
libvpx-b14b616d968b3cb1b955651754b9761d72bd05ac.tar.bz2
libvpx-b14b616d968b3cb1b955651754b9761d72bd05ac.zip
Update vp9_iht8x8_64_add_neon()
Change-Id: Ie70ed8b9273df5e1fd06bc93cb469e80630941d2
Diffstat (limited to 'vpx_dsp/arm')
-rw-r--r-- vpx_dsp/arm/idct8x8_add_neon.c 26
-rw-r--r-- vpx_dsp/arm/idct_neon.h 39
2 files changed, 35 insertions, 30 deletions
diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c
index 42e798ea4..7471387e4 100644
--- a/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -17,28 +17,6 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest,
- const int stride) {
- const uint8x8_t s = vld1_u8(*dest);
- const int16x8_t res = vrshrq_n_s16(a, 5);
- const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
- const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
- vst1_u8(*dest, d);
- *dest += stride;
-}
-
-static INLINE void add8x8(int16x8_t *const out, uint8_t *dest,
- const int stride) {
- idct8x8_add8x1(out[0], &dest, stride);
- idct8x8_add8x1(out[1], &dest, stride);
- idct8x8_add8x1(out[2], &dest, stride);
- idct8x8_add8x1(out[3], &dest, stride);
- idct8x8_add8x1(out[4], &dest, stride);
- idct8x8_add8x1(out[5], &dest, stride);
- idct8x8_add8x1(out[6], &dest, stride);
- idct8x8_add8x1(out[7], &dest, stride);
-}
-
void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
const int16x8_t cospis = vld1q_s16(kCospi);
@@ -57,7 +35,7 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
idct8x8_64_1d_bd8(cospis0, cospis1, a);
idct8x8_64_1d_bd8(cospis0, cospis1, a);
- add8x8(a, dest, stride);
+ idct8x8_add8x8_neon(a, dest, stride);
}
void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -77,5 +55,5 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a);
idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b);
- add8x8(b, dest, stride);
+ idct8x8_add8x8_neon(b, dest, stride);
}
diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h
index 4e940bd21..c4d3b4711 100644
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -417,18 +417,15 @@ static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0,
output[7] = vsubq_s16(step1[0], step2[7]);
}
-static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
- const int16x4_t cospis1,
- int16x8_t *const io) {
+static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0,
+ const int16x4_t cospis1,
+ int16x8_t *const io) {
int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l,
input7h;
int16x4_t step1l[4], step1h[4];
int16x8_t step1[8], step2[8];
int32x4_t t32[8];
- transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6],
- &io[7]);
-
// stage 1
input1l = vget_low_s16(io[1]);
input1h = vget_high_s16(io[1]);
@@ -514,6 +511,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
io[7] = vsubq_s16(step1[0], step2[7]);
}
+static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
+ const int16x4_t cospis1,
+ int16x8_t *const io) {
+ transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6],
+ &io[7]);
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io);
+}
+
static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0,
const int16x8_t s1,
const int16x4_t cospi_0_8_16_24,
@@ -736,6 +741,28 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
vst1q_s16(output, out[15]);
}
+static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest,
+ const int stride) {
+ const uint8x8_t s = vld1_u8(*dest);
+ const int16x8_t res = vrshrq_n_s16(a, 5);
+ const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s);
+ const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q));
+ vst1_u8(*dest, d);
+ *dest += stride;
+}
+
+static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest,
+ const int stride) {
+ idct8x8_add8x1(out[0], &dest, stride);
+ idct8x8_add8x1(out[1], &dest, stride);
+ idct8x8_add8x1(out[2], &dest, stride);
+ idct8x8_add8x1(out[3], &dest, stride);
+ idct8x8_add8x1(out[4], &dest, stride);
+ idct8x8_add8x1(out[5], &dest, stride);
+ idct8x8_add8x1(out[6], &dest, stride);
+ idct8x8_add8x1(out[7], &dest, stride);
+}
+
static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest,
const int stride) {
const uint8x8_t s = vld1_u8(*dest);