path: root/vpx_dsp
Diffstat (limited to 'vpx_dsp')
-rw-r--r--  vpx_dsp/arm/highbd_intrapred_neon.c    1078
-rw-r--r--  vpx_dsp/arm/idct32x32_135_add_neon.c    686
-rw-r--r--  vpx_dsp/arm/idct32x32_34_add_neon.c      22
-rw-r--r--  vpx_dsp/arm/idct_neon.asm                 1
-rw-r--r--  vpx_dsp/arm/intrapred_neon.c            191
-rw-r--r--  vpx_dsp/arm/transpose_neon.h             65
-rw-r--r--  vpx_dsp/deblock.c                         4
-rw-r--r--  vpx_dsp/inv_txfm.c                      421
-rw-r--r--  vpx_dsp/inv_txfm.h                        5
-rw-r--r--  vpx_dsp/mips/convolve8_avg_dspr2.c        7
-rw-r--r--  vpx_dsp/mips/convolve8_dspr2.c            5
-rw-r--r--  vpx_dsp/mips/deblock_msa.c                3
-rw-r--r--  vpx_dsp/mips/intrapred16_dspr2.c          1
-rw-r--r--  vpx_dsp/mips/intrapred4_dspr2.c           1
-rw-r--r--  vpx_dsp/mips/intrapred8_dspr2.c           1
-rw-r--r--  vpx_dsp/mips/txfm_macros_msa.h           19
-rw-r--r--  vpx_dsp/vpx_dsp.mk                        5
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl             60
-rw-r--r--  vpx_dsp/x86/deblock_sse2.asm             16
-rw-r--r--  vpx_dsp/x86/fdct.h                       57
-rw-r--r--  vpx_dsp/x86/inv_txfm_sse2.c               6
-rw-r--r--  vpx_dsp/x86/quantize_sse2.c              59
22 files changed, 2294 insertions, 419 deletions
diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 000000000..6f7e5da76
--- /dev/null
+++ b/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,1078 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
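+// Each DC predictor fills the block with the rounded average of the 'above'
+// and/or 'left' edge samples; the _128 variants use the mid-range value
+// 1 << (bd - 1). The division is done with a rounding shift (vrshr) by
+// log2 of the number of samples summed.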
+
+static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+ const uint16x4_t ref_u16 = vld1_u16(ref);
+ const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
+ return vpadd_u16(p0, p0);
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ vst1_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t a = vld1_u16(above);
+ const uint16x4_t l = vld1_u16(left);
+ uint16x4_t sum;
+ uint16x4_t dc;
+ (void)bd;
+ sum = vadd_u16(a, l);
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vrshr_n_u16(sum, 3);
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_4(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_4(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+ const uint16x8_t ref_u16 = vld1q_u16(ref);
+ uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ vst1q_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t above_u16 = vld1q_u16(above);
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ uint16x4_t dc;
+ (void)bd;
+ sum = vpadd_u16(sum, sum);
+ sum = vpadd_u16(sum, sum);
+ dc = vrshr_n_u16(sum, 4);
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_8(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ (void)above;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_8(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ (void)left;
+ (void)bd;
+ dc_store_8x8(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_8x8(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
+ const uint16x8x2_t ref_u16 = vld2q_u16(ref);
+ const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ sum = vpadd_u16(sum, sum);
+ return vpadd_u16(sum, sum);
+}
+
+static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ uint16x8x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+ for (i = 0; i < 16; ++i, dst += stride) {
+ vst2q_u16(dst, dc_dup);
+ }
+}
+
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t a = vld2q_u16(above);
+ const uint16x8x2_t l = vld2q_u16(left);
+ const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]);
+ const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+ uint32x2_t sum;
+ uint16x4_t dc;
+ (void)bd;
+ pal1 = vpadd_u16(pal1, pal1);
+ sum = vpaddl_u16(pal1);
+ dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_16(left);
+ const uint16x4_t dc = vrshr_n_u16(sum, 4);
+ (void)above;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t sum = dc_sum_16(above);
+ const uint16x4_t dc = vrshr_n_u16(sum, 4);
+ (void)left;
+ (void)bd;
+ dc_store_16x16(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_16x16(dst, stride, dc);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
+ const uint16x8x4_t r = vld4q_u16(ref);
+ const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]);
+ const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ sum = vpadd_u16(sum, sum);
+ return vpaddl_u16(sum);
+}
+
+static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
+ const uint16x4_t dc) {
+ uint16x8x2_t dc_dup;
+ int i;
+ dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0);
+
+ for (i = 0; i < 32; ++i) {
+ vst2q_u16(dst, dc_dup);
+ dst += 16;
+ vst2q_u16(dst, dc_dup);
+ dst += stride - 16;
+ }
+}
+
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x4_t a = vld4q_u16(above);
+ const uint16x8x4_t l = vld4q_u16(left);
+ const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]);
+ const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]);
+ const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]);
+ const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]);
+ const uint16x8_t pa = vaddq_u16(pa0, pa1);
+ const uint16x8_t pl = vaddq_u16(pl0, pl1);
+ const uint16x8_t pal0 = vaddq_u16(pa, pl);
+ const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0));
+ uint32x2_t sum = vpaddl_u16(pal1);
+ uint16x4_t dc;
+ (void)bd;
+ sum = vpadd_u32(sum, sum);
+ dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6));
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32x2_t sum = dc_sum_32(left);
+ const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ (void)above;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint32x2_t sum = dc_sum_32(above);
+ const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5));
+ (void)left;
+ (void)bd;
+ dc_store_32x32(dst, stride, dc);
+}
+
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+ (void)above;
+ (void)left;
+ dc_store_32x32(dst, stride, dc);
+}
+
+// -----------------------------------------------------------------------------
+
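+// The d45 predictor filters the above row with (a + 2 * b + c + 2) >> 2.
+// vhaddq gives (a + c) >> 1 and vrhaddq folds b back in with rounding, which
+// produces the same result as the scalar 3-tap filter. Each successive row is
+// the previous row shifted by one position toward the above-right sample.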
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t ABCDEFGH = vld1q_u16(above);
+ const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
+ const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
+ const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
+ const uint16x4_t avg2_low = vget_low_u16(avg2);
+ const uint16x4_t avg2_high = vget_high_u16(avg2);
+ const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
+ const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
+ const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
+ (void)left;
+ (void)bd;
+ vst1_u16(dst, avg2_low);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, r3);
+ vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
+}
+
+static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t above_right, uint16x8_t *row) {
+ *row = vextq_u16(*row, above_right, 1);
+ vst1q_u16(*dst, *row);
+ *dst += stride;
+}
+
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0 = vld1q_u16(above);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
+ const uint16x8_t A1 = vld1q_u16(above + 1);
+ const uint16x8_t A2 = vld1q_u16(above + 2);
+ const uint16x8_t avg1 = vhaddq_u16(A0, A2);
+ uint16x8_t row = vrhaddq_u16(avg1, A1);
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row);
+ dst += stride;
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ d45_store_8(&dst, stride, above_right, &row);
+ vst1q_u16(dst, above_right);
+}
+
+static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t above_right, uint16x8_t *row_0,
+ uint16x8_t *row_1) {
+ *row_0 = vextq_u16(*row_0, *row_1, 1);
+ *row_1 = vextq_u16(*row_1, above_right, 1);
+ vst1q_u16(*dst, *row_0);
+ *dst += 8;
+ vst1q_u16(*dst, *row_1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0_0 = vld1q_u16(above);
+ const uint16x8_t A0_1 = vld1q_u16(above + 8);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
+ const uint16x8_t A1_0 = vld1q_u16(above + 1);
+ const uint16x8_t A1_1 = vld1q_u16(above + 9);
+ const uint16x8_t A2_0 = vld1q_u16(above + 2);
+ const uint16x8_t A2_1 = vld1q_u16(above + 10);
+ const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+ const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row_0);
+ vst1q_u16(dst + 8, row_1);
+ dst += stride;
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ d45_store_16(&dst, stride, above_right, &row_0, &row_1);
+ vst1q_u16(dst, above_right);
+ vst1q_u16(dst + 8, above_right);
+}
+
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t A0_0 = vld1q_u16(above);
+ const uint16x8_t A0_1 = vld1q_u16(above + 8);
+ const uint16x8_t A0_2 = vld1q_u16(above + 16);
+ const uint16x8_t A0_3 = vld1q_u16(above + 24);
+ const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
+ const uint16x8_t A1_0 = vld1q_u16(above + 1);
+ const uint16x8_t A1_1 = vld1q_u16(above + 9);
+ const uint16x8_t A1_2 = vld1q_u16(above + 17);
+ const uint16x8_t A1_3 = vld1q_u16(above + 25);
+ const uint16x8_t A2_0 = vld1q_u16(above + 2);
+ const uint16x8_t A2_1 = vld1q_u16(above + 10);
+ const uint16x8_t A2_2 = vld1q_u16(above + 18);
+ const uint16x8_t A2_3 = vld1q_u16(above + 26);
+ const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
+ const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
+ const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
+ const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
+ int i;
+ (void)left;
+ (void)bd;
+
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst += stride - 24;
+
+ for (i = 0; i < 30; ++i) {
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, above_right, 1);
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst += stride - 24;
+ }
+
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+ dst += 8;
+ vst1q_u16(dst, above_right);
+}
+
+// -----------------------------------------------------------------------------
+
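+// d135 applies the same 3-tap filter along the boundary formed by the
+// reversed left column, the top-left sample and the above row; each output
+// row is the previous one shifted by one position (vext), so the filtered
+// boundary only has to be computed once.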
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123___ = vld1q_u16(above - 1);
+ const uint16x4_t L0123 = vld1_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(L0123);
+ const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
+ const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
+ const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
+ const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
+ const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
+ const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
+ const uint16x4_t row_0 = vget_low_u16(avg2);
+ const uint16x4_t row_1 = vget_high_u16(avg2);
+ const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
+ const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
+ const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1_u16(dst, r0);
+ dst += stride;
+ vst1_u16(dst, r1);
+ dst += stride;
+ vst1_u16(dst, r2);
+ dst += stride;
+ vst1_u16(dst, row_0);
+}
+
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A1234567_ = vld1q_u16(above + 1);
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
+ const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+ vst1q_u16(dst, r0);
+ dst += stride;
+ vst1q_u16(dst, r1);
+ dst += stride;
+ vst1q_u16(dst, r2);
+ dst += stride;
+ vst1q_u16(dst, r3);
+ dst += stride;
+ vst1q_u16(dst, r4);
+ dst += stride;
+ vst1q_u16(dst, r5);
+ dst += stride;
+ vst1q_u16(dst, r6);
+ dst += stride;
+ vst1q_u16(dst, row_0);
+}
+
+static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row_0,
+ const uint16x8_t row_1) {
+ vst1q_u16(*dst, row_0);
+ *dst += 8;
+ vst1q_u16(*dst, row_1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t L01234567 = vld1q_u16(left);
+ const uint16x8_t L89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
+ const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
+ const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
+ const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
+ const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
+ const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
+ const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
+ const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
+ const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
+
+ const uint16x8_t XA0123456 = vld1q_u16(above - 1);
+ const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
+ const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
+ const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
+
+ const uint16x8_t A01234567 = vld1q_u16(above);
+ const uint16x8_t A12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
+ const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
+
+ const uint16x8_t A789abcde = vld1q_u16(above + 7);
+ const uint16x8_t A89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
+ const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
+ const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
+
+ const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
+ const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
+ const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
+ const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
+ const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
+ const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
+ const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
+ const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
+ const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
+ const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
+ const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
+ const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
+ const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
+ const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
+ const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
+ const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
+ const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
+ const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
+ const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
+ const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
+ const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
+ (void)bd;
+
+ d135_store_16(&dst, stride, r0_0, r0_1);
+ d135_store_16(&dst, stride, r1_0, r1_1);
+ d135_store_16(&dst, stride, r2_0, r2_1);
+ d135_store_16(&dst, stride, r3_0, r3_1);
+ d135_store_16(&dst, stride, r4_0, r4_1);
+ d135_store_16(&dst, stride, r5_0, r5_1);
+ d135_store_16(&dst, stride, r6_0, r6_1);
+ d135_store_16(&dst, stride, row_1, row_2);
+ d135_store_16(&dst, stride, r8_0, r0_0);
+ d135_store_16(&dst, stride, r9_0, r1_0);
+ d135_store_16(&dst, stride, ra_0, r2_0);
+ d135_store_16(&dst, stride, rb_0, r3_0);
+ d135_store_16(&dst, stride, rc_0, r4_0);
+ d135_store_16(&dst, stride, rd_0, r5_0);
+ d135_store_16(&dst, stride, re_0, r6_0);
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+}
+
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t LL01234567 = vld1q_u16(left + 16);
+ const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
+ const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
+ const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
+ const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
+ const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
+ const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
+ const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
+ const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
+ const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
+ const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
+ uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
+
+ const uint16x8_t LU01234567 = vld1q_u16(left);
+ const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
+ const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
+ const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
+ const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
+ const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
+ const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
+ const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
+ const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
+ const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
+ const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
+ uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
+
+ const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
+ const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
+ const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
+ uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
+
+ const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
+ const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
+ const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
+ const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
+ uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
+
+ const uint16x8_t AL01234567 = vld1q_u16(above);
+ const uint16x8_t AL12345678 = vld1q_u16(above + 1);
+ const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
+ uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
+
+ const uint16x8_t AL789abcde = vld1q_u16(above + 7);
+ const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
+ const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
+ const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
+ uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
+
+ const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
+ const uint16x8_t AR01234567 = vld1q_u16(above + 16);
+ const uint16x8_t AR12345678 = vld1q_u16(above + 17);
+ const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
+ uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
+
+ const uint16x8_t AR789abcde = vld1q_u16(above + 23);
+ const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
+ const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
+ const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
+ uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
+ int i, j;
+ (void)bd;
+
+ dst += 31 * stride;
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 8; ++j) {
+ vst1q_u16(dst, row_0);
+ dst += 8;
+ vst1q_u16(dst, row_1);
+ dst += 8;
+ vst1q_u16(dst, row_2);
+ dst += 8;
+ vst1q_u16(dst, row_3);
+ dst -= stride + 24;
+ row_0 = vextq_u16(row_0, row_1, 1);
+ row_1 = vextq_u16(row_1, row_2, 1);
+ row_2 = vextq_u16(row_2, row_3, 1);
+ row_3 = vextq_u16(row_3, row_4, 1);
+ row_4 = vextq_u16(row_4, row_4, 1);
+ }
+ row_4 = row_5;
+ row_5 = row_6;
+ row_6 = row_7;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t row = vld1_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, dst += stride) {
+ vst1_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t row = vld1q_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 8; i++, dst += stride) {
+ vst1q_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t row = vld2q_u16(above);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 16; i++, dst += stride) {
+ vst2q_u16(dst, row);
+ }
+}
+
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8x2_t row0 = vld2q_u16(above);
+ const uint16x8x2_t row1 = vld2q_u16(above + 16);
+ int i;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < 32; i++) {
+ vst2q_u16(dst, row0);
+ dst += 16;
+ vst2q_u16(dst, row1);
+ dst += stride - 16;
+ }
+}
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x4_t left_u16 = vld1_u16(left);
+ uint16x4_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdup_lane_u16(left_u16, 0);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 1);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 2);
+ vst1_u16(dst, row);
+ dst += stride;
+ row = vdup_lane_u16(left_u16, 3);
+ vst1_u16(dst, row);
+}
+
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const uint16x8_t left_u16 = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16);
+ const uint16x4_t left_high = vget_high_u16(left_u16);
+ uint16x8_t row;
+ (void)above;
+ (void)bd;
+
+ row = vdupq_lane_u16(left_low, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_low, 3);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 0);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 1);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 2);
+ vst1q_u16(dst, row);
+ dst += stride;
+ row = vdupq_lane_u16(left_high, 3);
+ vst1q_u16(dst, row);
+}
+
+static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_16(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_16(&dst, stride, row);
+ }
+}
+
+static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
+ const uint16x8_t row) {
+ // Note: vst1q is faster than vst2q
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += 8;
+ vst1q_u16(*dst, row);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const uint16x8_t left_u16q = vld1q_u16(left);
+ const uint16x4_t left_low = vget_low_u16(left_u16q);
+ const uint16x4_t left_high = vget_high_u16(left_u16q);
+ uint16x8_t row;
+
+ row = vdupq_lane_u16(left_low, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_low, 3);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 0);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 1);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 2);
+ h_store_32(&dst, stride, row);
+ row = vdupq_lane_u16(left_high, 3);
+ h_store_32(&dst, stride, row);
+ }
+}
+
+// -----------------------------------------------------------------------------
+
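+// TM (true motion): each pixel is left[r] + above[c] - above[-1], clamped to
+// [0, (1 << bd) - 1]. vminq applies the upper bound and vqshluq_n_s16(x, 0)
+// saturates negative sums to 0 while converting to unsigned.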
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
+ const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
+ const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x8_t sum;
+ uint16x8_t row;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+ dst += stride;
+
+ sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
+ sum = vaddq_s16(sum, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1_u16(dst, vget_low_u16(row));
+ dst += stride;
+ vst1_u16(dst, vget_high_u16(row));
+}
+
+static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub,
+ const int16x8_t max) {
+ uint16x8_t row;
+ int16x8_t sum = vaddq_s16(left_dup, sub);
+ sum = vminq_s16(sum, max);
+ row = vqshluq_n_s16(sum, 0);
+ vst1q_u16(*dst, row);
+ *dst += stride;
+}
+
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
+ const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
+ const int16x8_t sub = vsubq_s16(above_s16, top_left);
+ int16x4_t left_s16d;
+ int16x8_t left_dup;
+ int i;
+
+ left_s16d = vget_low_s16(left_s16);
+
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_8_kernel(&dst, stride, left_dup, sub, max);
+ }
+}
+
+static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t max) {
+ uint16x8_t row0, row1;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += stride - 8;
+}
+
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (j = 0; j < 2; j++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
+ }
+ }
+}
+
+static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
+ const int16x8_t left_dup, const int16x8_t sub0,
+ const int16x8_t sub1, const int16x8_t sub2,
+ const int16x8_t sub3, const int16x8_t max) {
+ uint16x8_t row0, row1, row2, row3;
+ int16x8_t sum0 = vaddq_s16(left_dup, sub0);
+ int16x8_t sum1 = vaddq_s16(left_dup, sub1);
+ int16x8_t sum2 = vaddq_s16(left_dup, sub2);
+ int16x8_t sum3 = vaddq_s16(left_dup, sub3);
+ sum0 = vminq_s16(sum0, max);
+ sum1 = vminq_s16(sum1, max);
+ sum2 = vminq_s16(sum2, max);
+ sum3 = vminq_s16(sum3, max);
+ row0 = vqshluq_n_s16(sum0, 0);
+ row1 = vqshluq_n_s16(sum1, 0);
+ row2 = vqshluq_n_s16(sum2, 0);
+ row3 = vqshluq_n_s16(sum3, 0);
+ vst1q_u16(*dst, row0);
+ *dst += 8;
+ vst1q_u16(*dst, row1);
+ *dst += 8;
+ vst1q_u16(*dst, row2);
+ *dst += 8;
+ vst1q_u16(*dst, row3);
+ *dst += stride - 24;
+}
+
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
+ const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
+ const int16x8_t above0 = vld1q_s16((const int16_t *)above);
+ const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
+ const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
+ const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
+ const int16x8_t sub0 = vsubq_s16(above0, top_left);
+ const int16x8_t sub1 = vsubq_s16(above1, top_left);
+ const int16x8_t sub2 = vsubq_s16(above2, top_left);
+ const int16x8_t sub3 = vsubq_s16(above3, top_left);
+ int16x8_t left_dup;
+ int i, j;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
+ int16x4_t left_s16d = vget_low_s16(left_s16q);
+ for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
+ left_dup = vdupq_lane_s16(left_s16d, 0);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 1);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 2);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+
+ left_dup = vdupq_lane_s16(left_s16d, 3);
+ tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
+ }
+ }
+}
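For reference when reading the predictors above, here is a rough scalar sketch of the per-pixel arithmetic they vectorize. It is illustrative only: the helper names are made up for this note and this is not the library's C reference implementation.

#include <stdint.h>

/* DC: rounded average of the 2 * bs edge samples (bs = 4, 8, 16 or 32).
   (sum + bs) / (2 * bs) is what a rounding shift by log2(2 * bs) computes. */
static uint16_t dc_value(const uint16_t *above, const uint16_t *left, int bs) {
  int i, sum = 0;
  for (i = 0; i < bs; ++i) sum += above[i] + left[i];
  return (uint16_t)((sum + bs) / (2 * bs));
}

/* d45/d135: 3-tap smoothing filter applied along the boundary. */
static uint16_t avg3(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

/* TM: gradient prediction, clamped to the valid range for bit depth bd. */
static uint16_t tm_pixel(uint16_t left_r, uint16_t above_c, uint16_t top_left,
                         int bd) {
  const int max = (1 << bd) - 1;
  const int v = (int)left_r + (int)above_c - (int)top_left;
  return (uint16_t)(v < 0 ? 0 : v > max ? max : v);
}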
diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c
new file mode 100644
index 000000000..db9ffef6c
--- /dev/null
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+// Only for the first pass of the _135_ variant. Since it only uses values from
+// the top left 16x16 it can safely assume all the remaining values are 0 and
+// skip an awful lot of calculations. In fact, only the first 12 columns make
+// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are
+// used so it skips any calls to input[12|13|14|15] too.
+// In C this does a single row of 32 for each call. Here it transposes the top
+// left 12x8 to allow using SIMD.
+
+// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
+// coefficients as follows:
+//       0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
+//  0    0   2   5  10  17  25  38  47  62  83 101 121
+//  1    1   4   8  15  22  30  45  58  74  92 112 133
+//  2    3   7  12  18  28  36  52  64  82 102 118
+//  3    6  11  16  23  31  43  60  73  90 109 126
+//  4    9  14  19  29  37  50  65  78  95 116 134
+//  5   13  20  26  35  44  54  72  85 105 123
+//  6   21  27  33  42  53  63  80  94 113 132
+//  7   24  32  39  48  57  71  88 104 120
+//  8   34  40  46  56  68  81  96 111 130
+//  9   41  49  55  67  77  91 107 124
+// 10   51  59  66  76  89  99 119 131
+// 11   61  69  75  87 100 114 129
+// 12   70  79  86  97 108 122
+// 13   84  93 103 110 125
+// 14   98 106 115 127
+// 15  117 128
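+//
+// The helpers from vpx_dsp/arm/idct_neon.h used below are, roughly, the usual
+// butterfly rounding with DCT_CONST_BITS == 14:
+//   multiply_shift_and_narrow_s16(a, c)        ~ (a * c + (1 << 13)) >> 14
+//   multiply_accumulate_shift_and_narrow_s16(a, ac, b, bc)
+//                                              ~ (a * ac + b * bc + (1 << 13)) >> 14
+//   add_multiply_shift_and_narrow_s16(a, b, c) ~ ((a + b) * c + (1 << 13)) >> 14
+//   sub_multiply_shift_and_narrow_s16(a, b, c) ~ ((a - b) * c + (1 << 13)) >> 14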
+static void idct32_12_neon(const int16_t *input, int16_t *output) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
+ int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int16x8_t in8, in9, in10, in11;
+ int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27,
+ s1_28, s1_29, s1_31;
+ int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21,
+ s2_26, s2_27, s2_28, s2_29;
+ int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22,
+ s3_25, s3_26, s3_29, s3_30;
+ int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18,
+ s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28,
+ s4_29, s4_30, s4_31;
+ int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+ s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+ s5_29;
+ int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+ s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+ s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+ int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+ s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+ s7_25, s7_26, s7_27;
+
+ load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
+ &in6, &in7);
+
+ input += 8;
+
+ tmp0 = vld1_s16(input);
+ input += 32;
+ tmp1 = vld1_s16(input);
+ input += 32;
+ tmp2 = vld1_s16(input);
+ input += 32;
+ tmp3 = vld1_s16(input);
+ input += 32;
+ tmp4 = vld1_s16(input);
+ input += 32;
+ tmp5 = vld1_s16(input);
+ input += 32;
+ tmp6 = vld1_s16(input);
+ input += 32;
+ tmp7 = vld1_s16(input);
+
+ transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9,
+ &in10, &in11);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+ s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+ s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+ s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ s2_18 = vsubq_s16(s1_19, s1_18);
+ s2_19 = vaddq_s16(s1_18, s1_19);
+ s2_20 = vaddq_s16(s1_20, s1_21);
+ s2_21 = vsubq_s16(s1_20, s1_21);
+ s2_26 = vsubq_s16(s1_27, s1_26);
+ s2_27 = vaddq_s16(s1_26, s1_27);
+ s2_28 = vaddq_s16(s1_28, s1_29);
+ s2_29 = vsubq_s16(s1_28, s1_29);
+
+ // stage 3
+ s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s3_10 = vsubq_s16(s2_11, s2_10);
+ s3_11 = vaddq_s16(s2_10, s2_11);
+ s3_12 = vaddq_s16(s2_12, s2_13);
+ s3_13 = vsubq_s16(s2_12, s2_13);
+
+ s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31,
+ cospi_28_64);
+ s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31,
+ cospi_4_64);
+
+ s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+ -cospi_4_64);
+ s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+ cospi_28_64);
+
+ s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+ cospi_12_64);
+ s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+ cospi_20_64);
+
+ s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24,
+ -cospi_20_64);
+ s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24,
+ cospi_12_64);
+
+ // stage 4
+ s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+ s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+ s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+ s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15,
+ cospi_24_64);
+ s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15,
+ cospi_8_64);
+
+ s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+ -cospi_8_64);
+ s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+ cospi_24_64);
+
+ s4_16 = vaddq_s16(s1_16, s2_19);
+ s4_17 = vaddq_s16(s3_17, s3_18);
+ s4_18 = vsubq_s16(s3_17, s3_18);
+ s4_19 = vsubq_s16(s1_16, s2_19);
+ s4_20 = vsubq_s16(s1_23, s2_20);
+ s4_21 = vsubq_s16(s3_22, s3_21);
+ s4_22 = vaddq_s16(s3_21, s3_22);
+ s4_23 = vaddq_s16(s2_20, s1_23);
+ s4_24 = vaddq_s16(s1_24, s2_27);
+ s4_25 = vaddq_s16(s3_25, s3_26);
+ s4_26 = vsubq_s16(s3_25, s3_26);
+ s4_27 = vsubq_s16(s1_24, s2_27);
+ s4_28 = vsubq_s16(s1_31, s2_28);
+ s4_29 = vsubq_s16(s3_30, s3_29);
+ s4_30 = vaddq_s16(s3_29, s3_30);
+ s4_31 = vaddq_s16(s2_28, s1_31);
+
+ // stage 5
+ s5_0 = vaddq_s16(s4_0, s4_3);
+ s5_1 = vaddq_s16(s4_0, s4_2);
+ s5_2 = vsubq_s16(s4_0, s4_2);
+ s5_3 = vsubq_s16(s4_0, s4_3);
+
+ s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64);
+ s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64);
+
+ s5_8 = vaddq_s16(s2_8, s3_11);
+ s5_9 = vaddq_s16(s4_9, s4_10);
+ s5_10 = vsubq_s16(s4_9, s4_10);
+ s5_11 = vsubq_s16(s2_8, s3_11);
+ s5_12 = vsubq_s16(s2_15, s3_12);
+ s5_13 = vsubq_s16(s4_14, s4_13);
+ s5_14 = vaddq_s16(s4_13, s4_14);
+ s5_15 = vaddq_s16(s2_15, s3_12);
+
+ s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+ cospi_24_64);
+ s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+ cospi_8_64);
+
+ s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+ cospi_24_64);
+ s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+ cospi_8_64);
+
+ s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+ -cospi_8_64);
+ s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+ cospi_24_64);
+
+ s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+ -cospi_8_64);
+ s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+ cospi_24_64);
+
+ // stage 6
+ s6_0 = vaddq_s16(s5_0, s3_7);
+ s6_1 = vaddq_s16(s5_1, s5_6);
+ s6_2 = vaddq_s16(s5_2, s5_5);
+ s6_3 = vaddq_s16(s5_3, s3_4);
+ s6_4 = vsubq_s16(s5_3, s3_4);
+ s6_5 = vsubq_s16(s5_2, s5_5);
+ s6_6 = vsubq_s16(s5_1, s5_6);
+ s6_7 = vsubq_s16(s5_0, s3_7);
+
+ s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+ s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+ s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+ s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+ s6_16 = vaddq_s16(s4_16, s4_23);
+ s6_17 = vaddq_s16(s4_17, s4_22);
+ s6_18 = vaddq_s16(s5_18, s5_21);
+ s6_19 = vaddq_s16(s5_19, s5_20);
+ s6_20 = vsubq_s16(s5_19, s5_20);
+ s6_21 = vsubq_s16(s5_18, s5_21);
+ s6_22 = vsubq_s16(s4_17, s4_22);
+ s6_23 = vsubq_s16(s4_16, s4_23);
+
+ s6_24 = vsubq_s16(s4_31, s4_24);
+ s6_25 = vsubq_s16(s4_30, s4_25);
+ s6_26 = vsubq_s16(s5_29, s5_26);
+ s6_27 = vsubq_s16(s5_28, s5_27);
+ s6_28 = vaddq_s16(s5_27, s5_28);
+ s6_29 = vaddq_s16(s5_26, s5_29);
+ s6_30 = vaddq_s16(s4_25, s4_30);
+ s6_31 = vaddq_s16(s4_24, s4_31);
+
+ // stage 7
+ s7_0 = vaddq_s16(s6_0, s5_15);
+ s7_1 = vaddq_s16(s6_1, s5_14);
+ s7_2 = vaddq_s16(s6_2, s6_13);
+ s7_3 = vaddq_s16(s6_3, s6_12);
+ s7_4 = vaddq_s16(s6_4, s6_11);
+ s7_5 = vaddq_s16(s6_5, s6_10);
+ s7_6 = vaddq_s16(s6_6, s5_9);
+ s7_7 = vaddq_s16(s6_7, s5_8);
+ s7_8 = vsubq_s16(s6_7, s5_8);
+ s7_9 = vsubq_s16(s6_6, s5_9);
+ s7_10 = vsubq_s16(s6_5, s6_10);
+ s7_11 = vsubq_s16(s6_4, s6_11);
+ s7_12 = vsubq_s16(s6_3, s6_12);
+ s7_13 = vsubq_s16(s6_2, s6_13);
+ s7_14 = vsubq_s16(s6_1, s5_14);
+ s7_15 = vsubq_s16(s6_0, s5_15);
+
+ s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+ s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+ s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+ s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+ s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+ s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+ s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+ s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+ // final stage
+ vst1q_s16(output, vaddq_s16(s7_0, s6_31));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_1, s6_30));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_2, s6_29));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_3, s6_28));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_4, s7_27));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_5, s7_26));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_6, s7_25));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_7, s7_24));
+ output += 16;
+
+ vst1q_s16(output, vaddq_s16(s7_8, s7_23));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_9, s7_22));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_10, s7_21));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_11, s7_20));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_12, s6_19));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_13, s6_18));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_14, s6_17));
+ output += 16;
+ vst1q_s16(output, vaddq_s16(s7_15, s6_16));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7_15, s6_16));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_14, s6_17));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_13, s6_18));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_12, s6_19));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_11, s7_20));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_10, s7_21));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_9, s7_22));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_8, s7_23));
+ output += 16;
+
+ vst1q_s16(output, vsubq_s16(s7_7, s7_24));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_6, s7_25));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_5, s7_26));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_4, s7_27));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_3, s6_28));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_2, s6_29));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_1, s6_30));
+ output += 16;
+ vst1q_s16(output, vsubq_s16(s7_0, s6_31));
+}
+
+static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) {
+ int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15;
+ int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24,
+ s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31;
+ int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17,
+ s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27,
+ s2_28, s2_29, s2_30, s2_31;
+ int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13,
+ s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30;
+ int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14,
+ s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25,
+ s4_26, s4_27, s4_28, s4_29, s4_30, s4_31;
+ int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12,
+ s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28,
+ s5_29;
+ int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12,
+ s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24,
+ s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31;
+ int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10,
+ s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24,
+ s7_25, s7_26, s7_27;
+ int16x8_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5,
+ &in6, &in7);
+
+ load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12,
+ &in13, &in14, &in15);
+
+ // stage 1
+ s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64);
+ s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64);
+
+ s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64);
+ s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64);
+
+ s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64);
+ s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64);
+
+ s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64);
+ s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64);
+
+ s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64);
+ s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64);
+
+ s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64);
+ s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64);
+
+ s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64);
+ s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64);
+
+ s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64);
+ s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64);
+
+ // stage 2
+ s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64);
+ s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64);
+
+ s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64);
+ s2_14 = multiply_shift_and_narrow_s16(in14, cospi_14_64);
+
+ s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64);
+ s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64);
+
+ s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64);
+ s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64);
+
+ s2_16 = vaddq_s16(s1_16, s1_17);
+ s2_17 = vsubq_s16(s1_16, s1_17);
+ s2_18 = vsubq_s16(s1_19, s1_18);
+ s2_19 = vaddq_s16(s1_18, s1_19);
+ s2_20 = vaddq_s16(s1_20, s1_21);
+ s2_21 = vsubq_s16(s1_20, s1_21);
+ s2_22 = vsubq_s16(s1_23, s1_22);
+ s2_23 = vaddq_s16(s1_22, s1_23);
+ s2_24 = vaddq_s16(s1_24, s1_25);
+ s2_25 = vsubq_s16(s1_24, s1_25);
+ s2_26 = vsubq_s16(s1_27, s1_26);
+ s2_27 = vaddq_s16(s1_26, s1_27);
+ s2_28 = vaddq_s16(s1_28, s1_29);
+ s2_29 = vsubq_s16(s1_28, s1_29);
+ s2_30 = vsubq_s16(s1_31, s1_30);
+ s2_31 = vaddq_s16(s1_30, s1_31);
+
+ // stage 3
+ s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64);
+ s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64);
+
+ s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64);
+ s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64);
+
+ s3_8 = vaddq_s16(s2_8, s2_9);
+ s3_9 = vsubq_s16(s2_8, s2_9);
+ s3_10 = vsubq_s16(s2_11, s2_10);
+ s3_11 = vaddq_s16(s2_10, s2_11);
+ s3_12 = vaddq_s16(s2_12, s2_13);
+ s3_13 = vsubq_s16(s2_12, s2_13);
+ s3_14 = vsubq_s16(s2_15, s2_14);
+ s3_15 = vaddq_s16(s2_14, s2_15);
+
+ s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30,
+ cospi_28_64);
+ s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30,
+ cospi_4_64);
+
+ s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29,
+ -cospi_4_64);
+ s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29,
+ cospi_28_64);
+
+ s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26,
+ cospi_12_64);
+ s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26,
+ cospi_20_64);
+
+ s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25,
+ -cospi_20_64);
+ s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25,
+ cospi_12_64);
+
+ // stage 4
+ s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64);
+ s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64);
+ s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64);
+
+ s4_4 = vaddq_s16(s3_4, s3_5);
+ s4_5 = vsubq_s16(s3_4, s3_5);
+ s4_6 = vsubq_s16(s3_7, s3_6);
+ s4_7 = vaddq_s16(s3_6, s3_7);
+
+ s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14,
+ cospi_24_64);
+ s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14,
+ cospi_8_64);
+
+ s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13,
+ -cospi_8_64);
+ s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13,
+ cospi_24_64);
+
+ s4_16 = vaddq_s16(s2_16, s2_19);
+ s4_17 = vaddq_s16(s3_17, s3_18);
+ s4_18 = vsubq_s16(s3_17, s3_18);
+ s4_19 = vsubq_s16(s2_16, s2_19);
+ s4_20 = vsubq_s16(s2_23, s2_20);
+ s4_21 = vsubq_s16(s3_22, s3_21);
+ s4_22 = vaddq_s16(s3_21, s3_22);
+ s4_23 = vaddq_s16(s2_20, s2_23);
+ s4_24 = vaddq_s16(s2_24, s2_27);
+ s4_25 = vaddq_s16(s3_25, s3_26);
+ s4_26 = vsubq_s16(s3_25, s3_26);
+ s4_27 = vsubq_s16(s2_24, s2_27);
+ s4_28 = vsubq_s16(s2_31, s2_28);
+ s4_29 = vsubq_s16(s3_30, s3_29);
+ s4_30 = vaddq_s16(s3_29, s3_30);
+ s4_31 = vaddq_s16(s2_28, s2_31);
+
+ // stage 5
+ s5_0 = vaddq_s16(s4_0, s4_3);
+ s5_1 = vaddq_s16(s4_0, s4_2);
+ s5_2 = vsubq_s16(s4_0, s4_2);
+ s5_3 = vsubq_s16(s4_0, s4_3);
+
+ s5_5 = sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64);
+ s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64);
+
+ s5_8 = vaddq_s16(s3_8, s3_11);
+ s5_9 = vaddq_s16(s4_9, s4_10);
+ s5_10 = vsubq_s16(s4_9, s4_10);
+ s5_11 = vsubq_s16(s3_8, s3_11);
+ s5_12 = vsubq_s16(s3_15, s3_12);
+ s5_13 = vsubq_s16(s4_14, s4_13);
+ s5_14 = vaddq_s16(s4_13, s4_14);
+ s5_15 = vaddq_s16(s3_15, s3_12);
+
+ s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29,
+ cospi_24_64);
+ s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29,
+ cospi_8_64);
+
+ s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28,
+ cospi_24_64);
+ s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28,
+ cospi_8_64);
+
+ s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27,
+ -cospi_8_64);
+ s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27,
+ cospi_24_64);
+
+ s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26,
+ -cospi_8_64);
+ s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26,
+ cospi_24_64);
+
+ // stage 6
+ s6_0 = vaddq_s16(s5_0, s4_7);
+ s6_1 = vaddq_s16(s5_1, s5_6);
+ s6_2 = vaddq_s16(s5_2, s5_5);
+ s6_3 = vaddq_s16(s5_3, s4_4);
+ s6_4 = vsubq_s16(s5_3, s4_4);
+ s6_5 = vsubq_s16(s5_2, s5_5);
+ s6_6 = vsubq_s16(s5_1, s5_6);
+ s6_7 = vsubq_s16(s5_0, s4_7);
+
+ s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64);
+ s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64);
+
+ s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64);
+ s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64);
+
+ s6_16 = vaddq_s16(s4_16, s4_23);
+ s6_17 = vaddq_s16(s4_17, s4_22);
+ s6_18 = vaddq_s16(s5_18, s5_21);
+ s6_19 = vaddq_s16(s5_19, s5_20);
+ s6_20 = vsubq_s16(s5_19, s5_20);
+ s6_21 = vsubq_s16(s5_18, s5_21);
+ s6_22 = vsubq_s16(s4_17, s4_22);
+ s6_23 = vsubq_s16(s4_16, s4_23);
+ s6_24 = vsubq_s16(s4_31, s4_24);
+ s6_25 = vsubq_s16(s4_30, s4_25);
+ s6_26 = vsubq_s16(s5_29, s5_26);
+ s6_27 = vsubq_s16(s5_28, s5_27);
+ s6_28 = vaddq_s16(s5_27, s5_28);
+ s6_29 = vaddq_s16(s5_26, s5_29);
+ s6_30 = vaddq_s16(s4_25, s4_30);
+ s6_31 = vaddq_s16(s4_24, s4_31);
+
+ // stage 7
+ s7_0 = vaddq_s16(s6_0, s5_15);
+ s7_1 = vaddq_s16(s6_1, s5_14);
+ s7_2 = vaddq_s16(s6_2, s6_13);
+ s7_3 = vaddq_s16(s6_3, s6_12);
+ s7_4 = vaddq_s16(s6_4, s6_11);
+ s7_5 = vaddq_s16(s6_5, s6_10);
+ s7_6 = vaddq_s16(s6_6, s5_9);
+ s7_7 = vaddq_s16(s6_7, s5_8);
+ s7_8 = vsubq_s16(s6_7, s5_8);
+ s7_9 = vsubq_s16(s6_6, s5_9);
+ s7_10 = vsubq_s16(s6_5, s6_10);
+ s7_11 = vsubq_s16(s6_4, s6_11);
+ s7_12 = vsubq_s16(s6_3, s6_12);
+ s7_13 = vsubq_s16(s6_2, s6_13);
+ s7_14 = vsubq_s16(s6_1, s5_14);
+ s7_15 = vsubq_s16(s6_0, s5_15);
+
+ s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64);
+ s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64);
+
+ s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64);
+ s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64);
+
+ s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64);
+ s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64);
+
+ s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64);
+ s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64);
+
+ // final stage
+ out0 = vaddq_s16(s7_0, s6_31);
+ out1 = vaddq_s16(s7_1, s6_30);
+ out2 = vaddq_s16(s7_2, s6_29);
+ out3 = vaddq_s16(s7_3, s6_28);
+ out4 = vaddq_s16(s7_4, s7_27);
+ out5 = vaddq_s16(s7_5, s7_26);
+ out6 = vaddq_s16(s7_6, s7_25);
+ out7 = vaddq_s16(s7_7, s7_24);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output,
+ stride);
+
+ out0 = vaddq_s16(s7_8, s7_23);
+ out1 = vaddq_s16(s7_9, s7_22);
+ out2 = vaddq_s16(s7_10, s7_21);
+ out3 = vaddq_s16(s7_11, s7_20);
+ out4 = vaddq_s16(s7_12, s6_19);
+ out5 = vaddq_s16(s7_13, s6_18);
+ out6 = vaddq_s16(s7_14, s6_17);
+ out7 = vaddq_s16(s7_15, s6_16);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (8 * stride), stride);
+
+ out0 = vsubq_s16(s7_15, s6_16);
+ out1 = vsubq_s16(s7_14, s6_17);
+ out2 = vsubq_s16(s7_13, s6_18);
+ out3 = vsubq_s16(s7_12, s6_19);
+ out4 = vsubq_s16(s7_11, s7_20);
+ out5 = vsubq_s16(s7_10, s7_21);
+ out6 = vsubq_s16(s7_9, s7_22);
+ out7 = vsubq_s16(s7_8, s7_23);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (16 * stride), stride);
+
+ out0 = vsubq_s16(s7_7, s7_24);
+ out1 = vsubq_s16(s7_6, s7_25);
+ out2 = vsubq_s16(s7_5, s7_26);
+ out3 = vsubq_s16(s7_4, s7_27);
+ out4 = vsubq_s16(s7_3, s6_28);
+ out5 = vsubq_s16(s7_2, s6_29);
+ out6 = vsubq_s16(s7_1, s6_30);
+ out7 = vsubq_s16(s7_0, s6_31);
+
+ add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7,
+ output + (24 * stride), stride);
+}
+
+void vpx_idct32x32_135_add_neon(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int i;
+ int16_t temp[32 * 16];
+ int16_t *t = temp;
+
+ idct32_12_neon(input, temp);
+ idct32_12_neon(input + 32 * 8, temp + 8);
+
+ for (i = 0; i < 32; i += 8) {
+ idct32_16_neon(t, dest, stride);
+ t += (16 * 8);
+ dest += 8;
+ }
+}
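Editor's note: for reference, the NEON function above follows the same row/column split as the plain-C 135-coefficient path — only the first 16 rows of input carry non-zero coefficients, while the column pass still covers all 32 columns. A minimal scalar sketch of that structure, assuming the idct32_c(), clip_pixel_add() and ROUND_POWER_OF_TWO() helpers from vpx_dsp/inv_txfm.c; illustrative only, not part of the patch:

/* Scalar reference for the 135-coefficient pass structure. */
static void idct32x32_135_ref(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t temp_in[32], temp_out[32];
  int i, j;
  // Row pass: only the upper-left 16x16 block holds non-zero coefficients.
  for (i = 0; i < 16; ++i) idct32_c(input + 32 * i, out + 32 * i);
  // Column pass over all 32 columns, rounding by 6 bits before the add.
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6));
  }
}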
diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c
index ebec9df54..a584b1d9e 100644
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -34,7 +34,7 @@
// 5 13 20 26
// 6 21 27 33
// 7 24 32
-static void idct32_6_neon(const int16_t *input, int16_t *output) {
+static void idct32_6_neon(const tran_low_t *input, int16_t *output) {
int16x8_t in0, in1, in2, in3, in4, in5, in6, in7;
int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10,
s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20,
@@ -46,8 +46,22 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) {
s2_31;
int16x8_t s3_24, s3_25, s3_26, s3_27;
- load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5,
- &in6, &in7);
+ in0 = load_tran_low_to_s16(input);
+ input += 32;
+ in1 = load_tran_low_to_s16(input);
+ input += 32;
+ in2 = load_tran_low_to_s16(input);
+ input += 32;
+ in3 = load_tran_low_to_s16(input);
+ input += 32;
+ in4 = load_tran_low_to_s16(input);
+ input += 32;
+ in5 = load_tran_low_to_s16(input);
+ input += 32;
+ in6 = load_tran_low_to_s16(input);
+ input += 32;
+ in7 = load_tran_low_to_s16(input);
+ transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7);
// stage 1
// input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
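Editor's note: the new loads use load_tran_low_to_s16() so the function can take a tran_low_t pointer, which is a 32-bit type when the build enables high bit depth. A sketch of what such a helper amounts to — the real one lives alongside the other idct NEON helpers in idct_neon.h; the body below is illustrative, not the patch:

static INLINE int16x8_t load_tran_low_to_s16_sketch(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  // tran_low_t is int32_t here: load 8 values and narrow them to 16 bits.
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  return vcombine_s16(vmovn_s32(v0), vmovn_s32(v1));
#else
  // tran_low_t is int16_t: a straight 128-bit load suffices.
  return vld1q_s16(buf);
#endif
}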
@@ -503,7 +517,7 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) {
output + (24 * stride), stride);
}
-void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest,
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
int16_t temp[32 * 8];
diff --git a/vpx_dsp/arm/idct_neon.asm b/vpx_dsp/arm/idct_neon.asm
index a223c0b63..f39e8ddd4 100644
--- a/vpx_dsp/arm/idct_neon.asm
+++ b/vpx_dsp/arm/idct_neon.asm
@@ -27,3 +27,4 @@
vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]!
ENDIF
MEND
+ END
diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c
index 0a8607849..fb1fa6b68 100644
--- a/vpx_dsp/arm/intrapred_neon.c
+++ b/vpx_dsp/arm/intrapred_neon.c
@@ -776,133 +776,98 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
vst1_u8(dst, d);
}
+static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0);
+ *dst += stride;
+ vst1q_u8(*dst, row_1);
+ *dst += stride;
+ vst1q_u8(*dst, row_2);
+ *dst += stride;
+ vst1q_u8(*dst, row_3);
+ *dst += stride;
+ vst1q_u8(*dst, row_4);
+ *dst += stride;
+ vst1q_u8(*dst, row_5);
+ *dst += stride;
+ vst1q_u8(*dst, row_6);
+ *dst += stride;
+ vst1q_u8(*dst, row_7);
+ *dst += stride;
+}
+
void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const uint8x16_t left_u8q = vld1q_u8(left);
- uint8x8_t left_u8d = vget_low_u8(left_u8q);
- uint8x16_t d;
- int i;
(void)above;
- for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) {
- d = vdupq_lane_u8(left_u8d, 0);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 1);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 2);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 3);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 4);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 5);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 6);
- vst1q_u8(dst, d);
- dst += stride;
- d = vdupq_lane_u8(left_u8d, 7);
- vst1q_u8(dst, d);
- dst += stride;
- }
+ h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
+ h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
+}
+
+static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
+ const uint8x8_t left) {
+ const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
+ const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
+ const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
+ const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
+ const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
+ const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
+ const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
+ const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
+
+ vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8
+ *dst += 16;
+ vst1q_u8(*dst, row_0);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_1);
+ *dst += 16;
+ vst1q_u8(*dst, row_1);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_2);
+ *dst += 16;
+ vst1q_u8(*dst, row_2);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_3);
+ *dst += 16;
+ vst1q_u8(*dst, row_3);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_4);
+ *dst += 16;
+ vst1q_u8(*dst, row_4);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_5);
+ *dst += 16;
+ vst1q_u8(*dst, row_5);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_6);
+ *dst += 16;
+ vst1q_u8(*dst, row_6);
+ *dst += stride - 16;
+ vst1q_u8(*dst, row_7);
+ *dst += 16;
+ vst1q_u8(*dst, row_7);
+ *dst += stride - 16;
}
void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
- uint8x16_t d;
int i;
(void)above;
for (i = 0; i < 2; i++, left += 16) {
const uint8x16_t left_u8 = vld1q_u8(left);
- const uint8x8_t left_low = vget_low_u8(left_u8);
- const uint8x8_t left_high = vget_high_u8(left_u8);
- d = vdupq_lane_u8(left_low, 0);
- vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 1);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 2);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 3);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 4);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 5);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 6);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_low, 7);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
-
- d = vdupq_lane_u8(left_high, 0);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 1);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 2);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 3);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 4);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 5);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 6);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
- d = vdupq_lane_u8(left_high, 7);
- vst1q_u8(dst, d);
- dst += 16;
- vst1q_u8(dst, d);
- dst += stride - 16;
+ h_store_32x8(&dst, stride, vget_low_u8(left_u8));
+ h_store_32x8(&dst, stride, vget_high_u8(left_u8));
}
}
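Editor's note: the refactor above only changes how the stores are issued; each output row is still a copy of the corresponding left-column pixel. A scalar reference of the horizontal predictor, illustrative only:

/* What the h predictor computes, independent of the NEON store pattern. */
static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride, int size,
                            const uint8_t *left) {
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) dst[c] = left[r];  // whole row = left[r]
    dst += stride;
  }
}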
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 445add296..4fa8ff115 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -141,6 +141,71 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
*a1 = d0.val[1];
}
+static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
+ const int16x4_t a2, const int16x4_t a3,
+ const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7,
+ int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ const int16x4x2_t b0 = vtrn_s16(a0, a1);
+ const int16x4x2_t b1 = vtrn_s16(a2, a3);
+ const int16x4x2_t b2 = vtrn_s16(a4, a5);
+ const int16x4x2_t b3 = vtrn_s16(a6, a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+ const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
+ vreinterpret_s32_s16(b3.val[0]));
+ const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
+ vreinterpret_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
+ vreinterpret_s16_s32(c2.val[0]));
+ *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
+ vreinterpret_s16_s32(c3.val[0]));
+ *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
+ vreinterpret_s16_s32(c2.val[1]));
+ *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
+ vreinterpret_s16_s32(c3.val[1]));
+}
+
static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
uint8x8_t *a3) {
// Swap 8 bit elements. Goes from:
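Editor's note: like the other helpers in this header, the new transpose_s16_4x8() is a pure data-movement primitive: element (i, j) of the 8x4 input block ends up at (j, i) of the 4x8 output. A scalar sketch of the mapping, with the eight 4-lane inputs viewed as rows of an 8x4 matrix (illustrative only):

static void transpose_8x4_ref(const int16_t in[8][4], int16_t out[4][8]) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 4; ++j) out[j][i] = in[i][j];
}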
diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
index 589b124e2..b2d94795d 100644
--- a/vpx_dsp/deblock.c
+++ b/vpx_dsp/deblock.c
@@ -156,14 +156,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
int flimit) {
int r, c, i;
- const int16_t *rv3 = &vpx_rv[63 & rand()];
for (c = 0; c < cols; c++) {
unsigned char *s = &dst[c];
int sumsq = 0;
int sum = 0;
unsigned char d[16];
- const int16_t *rv2 = rv3 + ((c * 17) & 127);
for (i = -8; i < 0; i++) s[i * pitch] = s[0];
@@ -183,7 +181,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
d[r & 15] = s[0];
if (sumsq * 15 - sum * sum < flimit) {
- d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4;
}
if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
s += pitch;
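Editor's note: with the rand()-based offset removed, the dither value now depends only on the row and column. The index stays in a fixed range, since (r & 127) is at most 127 and (c & 7) at most 7, so at most element 134 of vpx_rv is read. A tiny sketch of the index computation (illustrative only):

static INLINE int deblock_rv_index(int r, int c) {
  return (r & 127) + (c & 7);  // always in [0, 134]
}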
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index 46ddd1da0..f3f543ddf 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -96,6 +96,7 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
void idct4_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step[4];
tran_high_t temp1, temp2;
+
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -114,9 +115,9 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[4], temp_out[4];
// Rows
@@ -142,6 +143,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
int i;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -157,6 +159,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
void idct8_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
+
// stage 1
step1[0] = input[0];
step1[2] = input[4];
@@ -209,9 +212,9 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
// First transform rows
@@ -236,6 +239,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
@@ -246,14 +250,13 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
tran_low_t x0 = input[0];
tran_low_t x1 = input[1];
tran_low_t x2 = input[2];
tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
+ memset(output, 0, 4 * sizeof(*output));
return;
}
@@ -283,7 +286,6 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) {
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
-
tran_high_t x0 = input[7];
tran_high_t x1 = input[0];
tran_high_t x2 = input[5];
@@ -294,8 +296,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
+ memset(output, 0, 8 * sizeof(*output));
return;
}
@@ -359,13 +360,13 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
}
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
// First transform rows
- // only first 4 row has non-zero coefs
+ // Only the first 4 rows have non-zero coefs
for (i = 0; i < 4; ++i) {
idct8_c(input, outptr);
input += 8;
@@ -550,9 +551,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
// First transform rows
@@ -576,7 +577,6 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
tran_high_t x0 = input[15];
tran_high_t x1 = input[0];
tran_high_t x2 = input[13];
@@ -596,9 +596,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
+ memset(output, 0, 16 * sizeof(*output));
return;
}
@@ -746,9 +744,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
// First transform rows. Since all non-zero dct coefficients are in
@@ -774,6 +772,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
@@ -1151,9 +1150,9 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) {
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
@@ -1188,13 +1187,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
- // only upper-left 16x16 has non-zero coeff
+ // Only the upper-left 16x16 has non-zero coeffs
for (i = 0; i < 16; ++i) {
idct32_c(input, outptr);
input += 32;
@@ -1214,13 +1213,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
// Rows
- // only upper-left 8x8 has non-zero coeff
+ // Only the upper-left 8x8 has non-zero coeffs
for (i = 0; i < 8; ++i) {
idct32_c(input, outptr);
input += 32;
@@ -1241,8 +1240,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
-
tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
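Editor's note: the remaining hunks switch the high-bitdepth paths from highbd_dct_const_round_shift() to the common dct_const_round_shift(). For reference, that call is a 14-bit round-and-shift; a sketch of the rounding step, assuming DCT_CONST_BITS == 14 as in the libvpx transform headers (illustrative only):

static INLINE tran_high_t dct_const_round_shift_sketch(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, 14);  // (input + (1 << 13)) >> 14
}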
@@ -1373,12 +1372,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2
output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
@@ -1389,9 +1388,9 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[4 * 4];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[4], temp_out[4];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1418,10 +1417,10 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1452,12 +1451,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2 & stage 3 - even half
vpx_highbd_idct4_c(step1, step1, bd);
@@ -1472,8 +1471,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
// stage 4
@@ -1489,20 +1488,20 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
+ // First transform rows
for (i = 0; i < 8; ++i) {
vpx_highbd_idct8_c(input, outptr, bd);
input += 8;
outptr += 8;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1518,9 +1517,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -1567,10 +1567,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+ output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+ output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1608,14 +1608,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
// stage 2
s0 = x0;
@@ -1631,10 +1631,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1642,10 +1642,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
output[0] = HIGHBD_WRAPLOW(x0, bd);
output[1] = HIGHBD_WRAPLOW(-x4, bd);
@@ -1657,22 +1657,23 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[8], temp_out[8];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
- // Only first 4 row has non-zero coefs.
+ // First transform rows
+ // Only the first 4 rows have non-zero coefs
for (i = 0; i < 4; ++i) {
vpx_highbd_idct8_c(input, outptr, bd);
input += 8;
outptr += 8;
}
- // Then transform columns.
+
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
vpx_highbd_idct8_c(temp_in, temp_out, bd);
@@ -1726,23 +1727,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 3
step1[0] = step2[0];
@@ -1752,12 +1753,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -1771,12 +1772,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -1786,12 +1787,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -1803,8 +1804,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -1829,12 +1830,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -1859,20 +1860,20 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // First transform rows.
+ // First transform rows
for (i = 0; i < 16; ++i) {
vpx_highbd_idct16_c(input, outptr, bd);
input += 16;
outptr += 16;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -1936,22 +1937,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
// stage 2
s0 = x0;
@@ -1979,14 +1980,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
// stage 3
s0 = x0;
@@ -2010,18 +2011,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
// stage 4
s2 = (-cospi_16_64) * (x2 + x3);
@@ -2033,14 +2034,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
output[0] = HIGHBD_WRAPLOW(x0, bd);
output[1] = HIGHBD_WRAPLOW(-x8, bd);
@@ -2062,9 +2063,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[16 * 16] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[16], temp_out[16];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -2076,7 +2077,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
outptr += 16;
}
- // Then transform columns.
+ // Then transform columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
vpx_highbd_idct16_c(temp_in, temp_out, bd);
@@ -2092,10 +2093,10 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
tran_high_t a1;
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2137,43 +2138,43 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
// stage 2
step2[0] = step1[0];
@@ -2187,23 +2188,23 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
@@ -2230,12 +2231,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
@@ -2250,22 +2251,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2274,12 +2275,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
@@ -2289,12 +2290,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2324,8 +2325,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
@@ -2341,20 +2342,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2375,12 +2376,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2426,20 +2427,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
- step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2482,9 +2483,9 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[32 * 32];
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -2520,19 +2521,20 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
+ int i, j;
tran_low_t out[32 * 32] = { 0 };
tran_low_t *outptr = out;
- int i, j;
tran_low_t temp_in[32], temp_out[32];
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
// Rows
- // Only upper-left 8x8 has non-zero coeff.
+ // Only upper-left 8x8 has non-zero coeff
for (i = 0; i < 8; ++i) {
highbd_idct32_c(input, outptr, bd);
input += 32;
outptr += 32;
}
+
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
@@ -2549,10 +2551,10 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int i, j;
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
tran_low_t out =
- HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
@@ -2560,4 +2562,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
dest += stride;
}
}
+
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index e530730d5..13137659f 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
(void)bd;
return input;
}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return (tran_high_t)rv;
-}
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_EMULATE_HARDWARE
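
The hunk above drops highbd_dct_const_round_shift() because its body was the same ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) as the existing dct_const_round_shift() in this header, which is why the highbd_idct32_c call sites earlier in the patch simply switch helpers. A minimal standalone sketch of that shared rounding step, assuming DCT_CONST_BITS is 14 as in vpx_dsp/txfm_common.h (the sketch_ names are illustrative, not part of the patch):

#include <stdint.h>

typedef int64_t sketch_tran_high_t; /* high-precision transform type */

/* Round to nearest, then shift out the cospi fixed-point scale. */
static sketch_tran_high_t sketch_dct_const_round_shift(sketch_tran_high_t x) {
  const int bits = 14; /* DCT_CONST_BITS */
  return (x + (1 << (bits - 1))) >> bits;
}

/* e.g. step2[0] = HIGHBD_WRAPLOW(
 *          sketch_dct_const_round_shift((step1[0] + step1[1]) * cospi_16_64),
 *          bd);                                                              */
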
diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c
index 31812299c..b4ed6ee85 100644
--- a/vpx_dsp/mips/convolve8_avg_dspr2.c
+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c
@@ -403,8 +403,11 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride, int w,
int h) {
int x, y;
- uint32_t tp1, tp2, tn1;
- uint32_t tp3, tp4, tn2;
+ uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c
index f6812c7d0..8d35b6394 100644
--- a/vpx_dsp/mips/convolve8_dspr2.c
+++ b/vpx_dsp/mips/convolve8_dspr2.c
@@ -1307,6 +1307,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
assert(y_step_q4 == 16);
assert(((const int32_t *)filter_x)[1] != 0x800000);
assert(((const int32_t *)filter_y)[1] != 0x800000);
+ (void)x_step_q4;
  /* bit position for extract from acc */
__asm__ __volatile__("wrdsp %[pos], 1 \n\t"
@@ -1398,6 +1399,10 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int x, y;
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
/* prefetch data to cache memory */
prefetch_load(src);
diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c
index cc633c669..ba52e8095 100644
--- a/vpx_dsp/mips/deblock_msa.c
+++ b/vpx_dsp/mips/deblock_msa.c
@@ -573,7 +573,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
int32_t cols, int32_t flimit) {
int32_t row, col, cnt, i;
- const int16_t *rv3 = &vpx_rv[63 & rand()];
v4i32 flimit_vec;
v16u8 dst7, dst8, dst_r_b, dst_l_b;
v16i8 mask;
@@ -601,7 +600,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
dst = LD_UB(dst_tmp);
for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
- rv2[i] = rv3 + ((cnt * 17) & 127);
+ rv2[i] = vpx_rv + (i & 7);
++i;
}
for (cnt = -8; cnt < 0; ++cnt) {
diff --git a/vpx_dsp/mips/intrapred16_dspr2.c b/vpx_dsp/mips/intrapred16_dspr2.c
index 3e29d0ac3..835e10e12 100644
--- a/vpx_dsp/mips/intrapred16_dspr2.c
+++ b/vpx_dsp/mips/intrapred16_dspr2.c
@@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/vpx_dsp/mips/intrapred4_dspr2.c b/vpx_dsp/mips/intrapred4_dspr2.c
index 9f51d50c7..dce03a2b2 100644
--- a/vpx_dsp/mips/intrapred4_dspr2.c
+++ b/vpx_dsp/mips/intrapred4_dspr2.c
@@ -14,6 +14,7 @@
void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/vpx_dsp/mips/intrapred8_dspr2.c b/vpx_dsp/mips/intrapred8_dspr2.c
index eac79d510..16e7fc550 100644
--- a/vpx_dsp/mips/intrapred8_dspr2.c
+++ b/vpx_dsp/mips/intrapred8_dspr2.c
@@ -14,6 +14,7 @@
void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ (void)above;
__asm__ __volatile__(
"lb %[tmp1], (%[left]) \n\t"
diff --git a/vpx_dsp/mips/txfm_macros_msa.h b/vpx_dsp/mips/txfm_macros_msa.h
index da100f6a9..f077fa481 100644
--- a/vpx_dsp/mips/txfm_macros_msa.h
+++ b/vpx_dsp/mips/txfm_macros_msa.h
@@ -15,19 +15,24 @@
#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \
{ \
- v8i16 k0_m = __msa_fill_h(cnst0); \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
+ v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \
+ v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \
\
- s0_m = (v4i32)__msa_fill_h(cnst1); \
- k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \
+ k0_m = __msa_fill_h(cnst0); \
+ k1_m = __msa_fill_h(cnst1); \
+ k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \
+ k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \
+ k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \
\
- ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \
+ ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \
ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \
- DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \
+ DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \
+ s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \
+ s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
\
- DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \
+ DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \
SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \
out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \
}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 8c91b141f..bb1143cca 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -48,6 +48,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -212,7 +213,7 @@ endif # HAVE_NEON
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c
@@ -244,6 +245,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
endif # HAVE_NEON_ASM
DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
endif # CONFIG_VP9
@@ -252,6 +254,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
+DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 71015c439..bba6b4f78 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -214,6 +214,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -222,33 +223,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_4x4 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/;
add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -257,33 +264,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_8x8 sse2/;
+ specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_8x8 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;;
+ specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/;
add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -292,33 +305,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_16x16 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/;
add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d45_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
@@ -327,27 +346,32 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_h_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_d135_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_v_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_tm_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vpx_highbd_dc_predictor_32x32 sse2/;
+ specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/;
add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
#
@@ -640,7 +664,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
@@ -679,7 +703,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add neon sse2/;
@@ -690,8 +714,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct8x8_64_add sse2/;
- add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vpx_highbd_idct8x8_10_add sse2/;
+ add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vpx_highbd_idct8x8_12_add sse2/;
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct16x16_256_add sse2/;
@@ -759,9 +783,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
- # Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
- $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
@@ -1728,11 +1750,9 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_down sse2 msa/;
- $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm;
add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
- $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm;
add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
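
Each added "specialize ... neon" line above tells the rtcd generator to route the named prototype to the NEON implementation when the target supports it; the generated vpx_dsp_rtcd.h reduces to a function-pointer dispatch of roughly the following shape. This is a self-contained sketch for one of the new predictors; the selection helper and flag are assumptions for illustration, the real wiring lives in the generated header:

#include <stddef.h>
#include <stdint.h>

typedef void (*sketch_highbd_pred_fn)(uint16_t *dst, ptrdiff_t y_stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd);

/* Prototypes of the two implementations the specialize line ties together. */
void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride,
                                   const uint16_t *above, const uint16_t *left,
                                   int bd);
void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd);

/* Hypothetical stand-in for the choice the generated setup code makes. */
static sketch_highbd_pred_fn sketch_pick_dc_predictor_4x4(int has_neon) {
  return has_neon ? vpx_highbd_dc_predictor_4x4_neon
                  : vpx_highbd_dc_predictor_4x4_c;
}
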
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 6df360df4..ebca50930 100644
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -230,11 +230,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
ret
%undef flimit
-;void vpx_mbpost_proc_down_xmm(unsigned char *dst,
-; int pitch, int rows, int cols,int flimit)
+;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_xmm) PRIVATE
-sym(vpx_mbpost_proc_down_xmm):
+global sym(vpx_mbpost_proc_down_sse2) PRIVATE
+sym(vpx_mbpost_proc_down_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -462,10 +462,10 @@ sym(vpx_mbpost_proc_down_xmm):
%undef flimit4
-;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src,
-; int pitch, int rows, int cols,int flimit)
-global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE
-sym(vpx_mbpost_proc_across_ip_xmm):
+;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
+global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
+sym(vpx_mbpost_proc_across_ip_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
diff --git a/vpx_dsp/x86/fdct.h b/vpx_dsp/x86/fdct.h
new file mode 100644
index 000000000..54a6d81fc
--- /dev/null
+++ b/vpx_dsp/x86/fdct.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_FDCT_H_
+#define VPX_DSP_X86_FDCT_H_
+
+#include <xmmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then cast down.
+// This does not saturate values. It only truncates.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
+ (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
+ (int16_t)a[6], (int16_t)a[7]);
+#else
+ return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+ _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m128i zero = _mm_setzero_si128();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm_store_si128((__m128i *)(a), zero);
+ _mm_store_si128((__m128i *)(a + 4), zero);
+#else
+ _mm_store_si128((__m128i *)(a), zero);
+#endif
+}
+#endif // VPX_DSP_X86_FDCT_H_
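
In the CONFIG_VP9_HIGHBITDEPTH path of store_tran_low() above, multiplying by one with _mm_mulhi_epi16/_mm_mullo_epi16 yields, per lane, the sign-extension bits and the original 16-bit value, so the unpacklo/unpackhi interleave writes out correctly sign-extended 32-bit tran_low_t values. A scalar sketch of that equivalence, two's-complement assumed (sketch_ name is illustrative):

#include <stdint.h>

static void sketch_store_tran_low(const int16_t *a, int32_t *b) {
  int i;
  for (i = 0; i < 8; ++i) {
    const uint32_t hi = a[i] < 0 ? 0xFFFFu : 0u; /* _mm_mulhi_epi16(a, one) lane */
    const uint32_t lo = (uint16_t)a[i];          /* _mm_mullo_epi16(a, one) lane */
    b[i] = (int32_t)((hi << 16) | lo);           /* unpack interleave == sign extension */
  }
}
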
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 00d18f917..d5fc1440c 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3673,7 +3673,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
tran_low_t out[8 * 8] = { 0 };
tran_low_t *outptr = out;
@@ -4017,8 +4017,8 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8,
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out;
- out = highbd_dct_const_round_shift(input[0] * cospi_16_64);
- out = highbd_dct_const_round_shift(out * cospi_16_64);
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
a = ROUND_POWER_OF_TWO(out, 6);
d = _mm_set1_epi32(a);
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c
index 2c7e431c7..0580a7bd7 100644
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -13,32 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
- return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
- tran_low_t *coeff_ptr) {
-#if CONFIG_VP9_HIGHBITDEPTH
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
-}
+#include "vpx_dsp/x86/fdct.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
@@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
// Do DC and first 15 AC
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
dequant = _mm_unpackhi_epi64(dequant, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
__m128i qtmp0, qtmp1;
__m128i cmp_mask0, cmp_mask1;
- coeff0 = load_coefficients(coeff_ptr + n_coeffs);
- coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
+ coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+ coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
// Poor man's sign extract
coeff0_sign = _mm_srai_epi16(coeff0, 15);
@@ -191,14 +166,14 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
- store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
- store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+ store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
- store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
- store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+ store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
}
{
@@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
}
} else {
do {
- store_coefficients(zero, dqcoeff_ptr + n_coeffs);
- store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
- store_coefficients(zero, qcoeff_ptr + n_coeffs);
- store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
+ store_tran_low(zero, dqcoeff_ptr + n_coeffs);
+ store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
+ store_tran_low(zero, qcoeff_ptr + n_coeffs);
+ store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
n_coeffs += 8 * 2;
} while (n_coeffs < 0);
*eob_ptr = 0;
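
With the duplicated load_coefficients/store_coefficients helpers removed, quantize_sse2.c shares load_tran_low/store_tran_low with the other SSE2 transform code via the new x86/fdct.h. The "poor man's sign extract" kept in both passes builds a per-lane sign mask with _mm_srai_epi16 that then feeds the usual xor/subtract absolute-value pattern; a per-lane scalar sketch of that trick (sketch_ name is illustrative, arithmetic right shift assumed):

#include <stdint.h>

/* sign = x >> 15 is 0 or -1, and (x ^ sign) - sign == |x|. */
static int16_t sketch_poor_mans_abs(int16_t x) {
  const int16_t sign = (int16_t)(x >> 15); /* _mm_srai_epi16(coeff, 15) */
  return (int16_t)((x ^ sign) - sign);     /* xor, then subtract the sign */
}
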