summaryrefslogtreecommitdiff
path: root/vpx_dsp
diff options
context:
space:
mode:
authorKonstantinos Margaritis <konma@vectorcamp.gr>2022-10-06 16:00:43 +0000
committerKonstantinos Margaritis <konma@vectorcamp.gr>2022-10-12 18:59:52 +0000
commita49f896352671870f38c1374f3d5329e3b60193f (patch)
treeecda309a1b9de45afb06f0bbbce3c534b75dbea8 /vpx_dsp
parent165935a1b6c3dfe2af686545188c3abebc4941d8 (diff)
downloadlibvpx-a49f896352671870f38c1374f3d5329e3b60193f.tar
libvpx-a49f896352671870f38c1374f3d5329e3b60193f.tar.gz
libvpx-a49f896352671870f38c1374f3d5329e3b60193f.tar.bz2
libvpx-a49f896352671870f38c1374f3d5329e3b60193f.zip
[NEON] Add highbd FDCT 8x8 function
50% faster than C version in best/rt profiles Change-Id: I0f9504ed52b5d5f7722407e91108ed4056d66bc2
Diffstat (limited to 'vpx_dsp')
-rw-r--r--vpx_dsp/arm/fdct8x8_neon.c78
-rw-r--r--vpx_dsp/arm/fdct_neon.h144
-rw-r--r--vpx_dsp/vpx_dsp_rtcd_defs.pl2
3 files changed, 223 insertions, 1 deletions
diff --git a/vpx_dsp/arm/fdct8x8_neon.c b/vpx_dsp/arm/fdct8x8_neon.c
index d9161c6d3..3fb15cc17 100644
--- a/vpx_dsp/arm/fdct8x8_neon.c
+++ b/vpx_dsp/arm/fdct8x8_neon.c
@@ -66,3 +66,81 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ int i;
+
+ // input[M * stride] * 16
+ int32x4_t left[8], right[8];
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+
+ left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+ for (i = 0; i < 2; ++i) {
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ }
+ {
+ left[0] = highbd_add_round_shift_s32(left[0]);
+ left[1] = highbd_add_round_shift_s32(left[1]);
+ left[2] = highbd_add_round_shift_s32(left[2]);
+ left[3] = highbd_add_round_shift_s32(left[3]);
+ left[4] = highbd_add_round_shift_s32(left[4]);
+ left[5] = highbd_add_round_shift_s32(left[5]);
+ left[6] = highbd_add_round_shift_s32(left[6]);
+ left[7] = highbd_add_round_shift_s32(left[7]);
+ right[0] = highbd_add_round_shift_s32(right[0]);
+ right[1] = highbd_add_round_shift_s32(right[1]);
+ right[2] = highbd_add_round_shift_s32(right[2]);
+ right[3] = highbd_add_round_shift_s32(right[3]);
+ right[4] = highbd_add_round_shift_s32(right[4]);
+ right[5] = highbd_add_round_shift_s32(right[5]);
+ right[6] = highbd_add_round_shift_s32(right[6]);
+ right[7] = highbd_add_round_shift_s32(right[7]);
+
+ // store results
+ vst1q_s32(final_output, left[0]);
+ vst1q_s32(final_output + 4, right[0]);
+ vst1q_s32(final_output + 8, left[1]);
+ vst1q_s32(final_output + 12, right[1]);
+ vst1q_s32(final_output + 16, left[2]);
+ vst1q_s32(final_output + 20, right[2]);
+ vst1q_s32(final_output + 24, left[3]);
+ vst1q_s32(final_output + 28, right[3]);
+ vst1q_s32(final_output + 32, left[4]);
+ vst1q_s32(final_output + 36, right[4]);
+ vst1q_s32(final_output + 40, left[5]);
+ vst1q_s32(final_output + 44, right[5]);
+ vst1q_s32(final_output + 48, left[6]);
+ vst1q_s32(final_output + 52, right[6]);
+ vst1q_s32(final_output + 56, left[7]);
+ vst1q_s32(final_output + 60, right[7]);
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h
index 68aeab3aa..c100e709d 100644
--- a/vpx_dsp/arm/fdct_neon.h
+++ b/vpx_dsp/arm/fdct_neon.h
@@ -342,6 +342,20 @@ static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
}
#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int32x4_t highbd_add_round_shift_s32(int32x4_t x) {
+ const int32x2_t x_lo = vget_low_s32(x);
+ const int32x2_t x_hi = vget_high_s32(x);
+ const int64x2_t x64_lo = vmovl_s32(x_lo);
+ const int64x2_t x64_hi = vmovl_s32(x_hi);
+
+ const int64x2_t sign_lo = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_lo, 63);
+ const int64x2_t sign_hi = (int64x2_t)vshrq_n_u64((uint64x2_t)x64_hi, 63);
+
+ const int64x2_t sum_lo = vaddq_s64(x64_lo, sign_lo);
+ const int64x2_t sum_hi = vaddq_s64(x64_hi, sign_hi);
+ return vcombine_s32(vshrn_n_s64(sum_lo, 1), vshrn_n_s64(sum_hi, 1));
+}
+
static INLINE void highbd_butterfly_one_coeff_s32(const int32x4_t a,
const int32x4_t b,
const tran_high_t c,
@@ -413,5 +427,135 @@ static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
in[3] = out[3];
}
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // t0 = (x0 + x1) * cospi_16_64;
+ // t1 = (x0 - x1) * cospi_16_64;
+ // out[0] = (tran_low_t)fdct_round_shift(t0);
+ // out[4] = (tran_low_t)fdct_round_shift(t1);
+ highbd_butterfly_one_coeff_s32(xl[0], xl[1], cospi_16_64, &left[0], &left[4]);
+ highbd_butterfly_one_coeff_s32(xr[0], xr[1], cospi_16_64, &right[0],
+ &right[4]);
+ // t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ // t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ // out[2] = (tran_low_t)fdct_round_shift(t2);
+ // out[6] = (tran_low_t)fdct_round_shift(t3);
+ highbd_butterfly_two_coeff_s32(xl[3], xl[2], cospi_8_64, cospi_24_64,
+ &left[2], &left[6]);
+ highbd_butterfly_two_coeff_s32(xr[3], xr[2], cospi_8_64, cospi_24_64,
+ &right[2], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ highbd_butterfly_one_coeff_s32(sl[6], sl[5], cospi_16_64, &tl[1], &tl[0]);
+ highbd_butterfly_one_coeff_s32(sr[6], sr[5], cospi_16_64, &tr[1], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ // out[1] = (tran_low_t)fdct_round_shift(t0);
+ // t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ // out[7] = (tran_low_t)fdct_round_shift(t3);
+ highbd_butterfly_two_coeff_s32(xl[3], xl[0], cospi_4_64, cospi_28_64,
+ &left[1], &left[7]);
+ highbd_butterfly_two_coeff_s32(xr[3], xr[0], cospi_4_64, cospi_28_64,
+ &right[1], &right[7]);
+
+ // t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ // out[5] = (tran_low_t)fdct_round_shift(t1);
+ // t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ // out[3] = (tran_low_t)fdct_round_shift(t2);
+ highbd_butterfly_two_coeff_s32(xl[2], xl[1], cospi_20_64, cospi_12_64,
+ &left[5], &left[3]);
+ highbd_butterfly_two_coeff_s32(xr[2], xr[1], cospi_20_64, cospi_12_64,
+ &right[5], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index c5514b14d..e886c0ae4 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -555,7 +555,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct8x8 sse2/;
+ specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct8x8_1 neon/;